Skip to content

Commit 784851b

Browse files
committed
update ci
1 parent 3f62a3b commit 784851b

File tree

2 files changed

+28
-77
lines changed

2 files changed

+28
-77
lines changed

.github/workflows/examples-calc-x.yml

Lines changed: 23 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -161,52 +161,23 @@ jobs:
161161
cd examples/calc_x
162162
../../scripts/restart_ray.sh
163163
164-
cleanup() {
165-
set +e
166-
pkill -f agl && echo "SIGTERM sent to agl" || echo "No agl process found"
167-
while pgrep -f agl >/dev/null; do
168-
echo "Waiting for agl to finish..."
169-
sleep 5
170-
done
171-
pkill -f train_calc_agent.py && echo "SIGTERM sent to train_calc_agent.py" || echo "No train_calc_agent.py process found"
172-
while pgrep -f train_calc_agent.py >/dev/null; do
173-
echo "Waiting for train_calc_agent.py to finish..."
174-
sleep 5
175-
done
176-
set -e
177-
}
178-
trap cleanup EXIT
179-
180164
agl store --port 4747 &
181-
182165
sleep 5
183-
184-
PYTHONUNBUFFERED=1 AGL_MANAGED_STORE=0 AGL_CURRENT_ROLE=runner python train_calc_agent.py --external-store-address http://localhost:4747 --val-file data/test_mini.parquet --ci &
185-
runner_pid=$!
186-
166+
AGL_MANAGED_STORE=0 AGL_CURRENT_ROLE=runner python train_calc_agent.py --external-store-address http://localhost:4747 --val-file data/test_mini.parquet --ci-fast &
187167
sleep 5
188-
189-
set +e
190-
PYTHONUNBUFFERED=1 AGL_MANAGED_STORE=0 AGL_CURRENT_ROLE=algorithm python train_calc_agent.py --external-store-address http://localhost:4747 --val-file data/test_mini.parquet --ci
191-
algorithm_status=$?
192-
wait "$runner_pid"
193-
runner_status=$?
194-
set -e
195-
196-
trap - EXIT
197-
cleanup
198-
199-
if [[ $runner_status -ne 0 ]]; then
200-
echo "Runner process failed with exit code $runner_status" >&2
201-
exit "$runner_status"
202-
fi
203-
204-
if [[ $algorithm_status -ne 0 ]]; then
205-
echo "Algorithm process failed with exit code $algorithm_status" >&2
206-
exit "$algorithm_status"
207-
fi
208-
209-
sleep 10
168+
AGL_MANAGED_STORE=0 AGL_CURRENT_ROLE=algorithm python train_calc_agent.py --external-store-address http://localhost:4747 --val-file data/test_mini.parquet --ci-fast
169+
170+
pkill -f agl && echo "SIGTERM sent to agl" || echo "No agl process found"
171+
while pgrep -f agl; do
172+
echo "Waiting for agl to finish..."
173+
sleep 5
174+
done
175+
pkill -f train_calc_agent.py && echo "SIGTERM sent to train_calc_agent.py" || echo "No train_calc_agent.py process found"
176+
while pgrep -f train_calc_agent.py; do
177+
echo "Waiting for train_calc_agent.py to finish..."
178+
sleep 5
179+
done
180+
echo "train_calc_agent.py has finished."
210181
shell: bash
211182
env:
212183
WANDB_BASE_URL: ${{ secrets.MSR_WANDB_BASE_URL }}
@@ -220,38 +191,16 @@ jobs:
220191
cd examples/calc_x
221192
../../scripts/restart_ray.sh
222193
223-
cleanup() {
224-
set +e
225-
pkill -f train_calc_agent.py && echo "SIGTERM sent to train_calc_agent.py" || echo "No train_calc_agent.py process found"
226-
while pgrep -f train_calc_agent.py >/dev/null; do
227-
echo "Waiting for train_calc_agent.py to finish..."
228-
sleep 5
229-
done
230-
set -e
231-
}
232-
trap cleanup EXIT
233-
234-
PYTHONUNBUFFERED=1 AGL_SERVER_HOST=0.0.0.0 AGL_SERVER_PORT=5858 AGL_CURRENT_ROLE=algorithm python train_calc_agent.py --val-file data/test_mini.parquet --ci --ci-fast &
235-
algorithm_pid=$!
236-
194+
PYTHONUNBUFFERED=1 AGL_SERVER_HOST=127.0.0.1 AGL_SERVER_PORT=5858 AGL_CURRENT_ROLE=runner python train_calc_agent.py --val-file data/test_mini.parquet --ci-fast &
237195
sleep 5
238-
239-
PYTHONUNBUFFERED=1 AGL_SERVER_HOST=127.0.0.1 AGL_SERVER_PORT=5858 AGL_CURRENT_ROLE=runner python train_calc_agent.py --val-file data/test_mini.parquet --ci --ci-fast --n-runners 2
240-
241-
set +e
242-
wait "$algorithm_pid"
243-
algorithm_status=$?
244-
set -e
245-
246-
trap - EXIT
247-
cleanup
248-
249-
if [[ $algorithm_status -ne 0 ]]; then
250-
echo "Algorithm process failed with exit code $algorithm_status" >&2
251-
exit "$algorithm_status"
252-
fi
253-
254-
sleep 10
196+
PYTHONUNBUFFERED=1 AGL_SERVER_HOST=0.0.0.0 AGL_SERVER_PORT=5858 AGL_CURRENT_ROLE=algorithm python train_calc_agent.py --val-file data/test_mini.parquet --ci-fast
197+
198+
pkill -f train_calc_agent.py && echo "SIGTERM sent to train_calc_agent.py" || echo "No train_calc_agent.py process found"
199+
while pgrep -f train_calc_agent.py; do
200+
echo "Waiting for train_calc_agent.py to finish..."
201+
sleep 5
202+
done
203+
echo "train_calc_agent.py has finished."
255204
shell: bash
256205
env:
257206
WANDB_BASE_URL: ${{ secrets.MSR_WANDB_BASE_URL }}

examples/calc_x/train_calc_agent.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ def train(
136136
config["actor_rollout_ref"]["model"]["path"] = model
137137

138138
# CI toggle keeps everything else the same but you can tweak the lightweight bits here if desired
139-
if ci:
139+
if ci or ci_fast:
140140
# Config the experiment name and project name so that they are available to CI
141141
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
142142
EXPERIMENT_NAME = f"calc_x_{timestamp}"
@@ -163,8 +163,10 @@ def train(
163163
config["trainer"]["project_name"] = PROJECT_NAME
164164
config["trainer"].pop("save_freq", None)
165165

166-
if ci_fast:
167-
config["trainer"]["total_training_steps"] = 1
166+
if ci_fast:
167+
# Extra fast CI toggle for testing purposes.
168+
config["trainer"]["total_training_steps"] = 1
169+
config["trainer"]["test_freq"] = 1
168170

169171
algorithm = agl.VERL(config)
170172

0 commit comments

Comments
 (0)