Examples Test #213

Workflow file for this run

name: Examples Test
permissions:
  contents: read
on:
  schedule:
    # Every day at 3 AM UTC+8
    - cron: '0 19 * * *'
  workflow_dispatch:
jobs:
  examples:
    runs-on: [self-hosted, 1ES.Pool=agl-runner-gpu]
    timeout-minutes: 120
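    # The matrix below runs three dependency setups: 'legacy' (Python 3.10, with the
    # torch-gpu-legacy group), 'stable' (3.12) and 'latest' (3.13, with the lockfile
    # upgraded before syncing).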
    strategy:
      matrix:
        include:
          - python-version: '3.10'
            setup-script: 'legacy'
          - python-version: '3.12'
            setup-script: 'stable'
          - python-version: '3.13'
            setup-script: 'latest'
      fail-fast: false
    steps:
      - name: Check GPU status
        run: nvidia-smi
      - name: Check disk space
        run: df -h
      - uses: actions/checkout@v4
      - uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true
          python-version: ${{ matrix.python-version }}
      - name: Upgrade dependencies (latest)
        run: uv lock --upgrade
        if: matrix.setup-script == 'latest'
      - name: Sync dependencies (latest)
        run: |
          uv sync --frozen --no-default-groups --extra apo --extra verl \
            --group dev --group experiment --group trl --group agents --group torch-gpu-stable
        if: matrix.setup-script == 'latest'
      - name: Sync dependencies (stable)
        run: |
          uv sync --frozen --no-default-groups --extra apo --extra verl \
            --group dev --group experiment --group trl --group agents --group torch-gpu-stable
        if: matrix.setup-script == 'stable'
      - name: Sync dependencies (legacy)
        run: |
          uv sync --frozen --no-default-groups --extra apo --extra verl \
            --group dev --group experiment --group agents --group torch-gpu-legacy
        if: matrix.setup-script == 'legacy'
      - name: Freeze dependencies
        run: |
          set -ex
          uv pip freeze | tee requirements-freeze.txt
          echo "UV_LOCKED=1" >> $GITHUB_ENV
          echo "UV_NO_SYNC=1" >> $GITHUB_ENV
      - name: Upload dependencies artifact
        uses: actions/upload-artifact@v4
        with:
          name: dependencies-${{ matrix.python-version }}-${{ matrix.setup-script }}
          path: requirements-freeze.txt
          compression-level: 0
      - name: Launch LiteLLM Proxy
        run: |
          ./scripts/litellm_run.sh
        env:
          AZURE_API_BASE: ${{ secrets.AZURE_API_BASE }}
          AZURE_API_KEY: ${{ secrets.AZURE_API_KEY }}
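      # The proxy presumably fronts the Azure deployment on localhost:12306, which is
      # where the OPENAI_API_BASE / OPENAI_BASE_URL values in the example steps below point.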
      - name: Prepare Unsloth model
        run: |
          set -ex
          cd examples/unsloth
          rm -rf models
          uv run hf download unsloth/Qwen3-4B-Instruct-2507 --local-dir models/version_0
      - name: Prepare Spider dataset
        run: |
          set -ex
          cd examples/spider
          uv run gdown --fuzzy https://drive.google.com/file/d/1oi9J1jZP9TyM35L85CL3qeGWl2jqlnL6/view
          unzip -q spider-data.zip -d data
          rm spider-data.zip
      - name: Prepare Calc-X dataset
        run: |
          set -ex
          cd examples/calc_x
          uv run gdown --fuzzy https://drive.google.com/file/d/1FQMyKLLd6hP9dw9rfZn1EZOWNvKaDsqw/view
          unzip calc-x-data.zip -d data
          rm calc-x-data.zip
      # APO Examples test
      - name: APO example (legacy)
        run: |
          set -ex
          cd examples/apo
          uv run legacy_apo_client.py &
          sleep 3 # Wait for the client to be up
          uv run legacy_apo_server.py
          pkill -f legacy_apo_client.py && echo "SIGTERM sent to legacy_apo_client.py" || echo "No legacy_apo_client.py process found"
          while pgrep -f legacy_apo_client.py; do
            echo "Waiting for legacy_apo_client.py to finish..."
            sleep 5
          done
          echo "legacy_apo_client.py has finished."
          sleep 10
        env:
          OPENAI_API_BASE: http://localhost:12306/
          OPENAI_API_KEY: dummy
      - name: APO example
        run: |
          set -ex
          cd examples/apo
          uv run apo_custom_algorithm_trainer.py | tee _ci_apo.log
          # Check whether the log contains "Best prompt found:"
          grep "Best prompt found:" _ci_apo.log
        env:
          # New versions follow OPENAI_BASE_URL instead of OPENAI_API_BASE
          OPENAI_BASE_URL: http://localhost:12306/
          OPENAI_API_KEY: dummy
      - name: APO example debug sanity check
        run: |
          set -ex
          cd examples/apo
          uv run apo_debug.py --mode runner
          uv run apo_debug.py --mode hook
          uv run apo_debug.py --mode trainer
        env:
          # New versions follow OPENAI_BASE_URL instead of OPENAI_API_BASE
          OPENAI_BASE_URL: http://localhost:12306/
          OPENAI_API_KEY: dummy
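      # `success() || failure()` on the steps below keeps them running even when an
      # earlier example step failed (but not when the job is cancelled), so one broken
      # example does not hide failures in the others.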
      - name: APO built-in algorithm
        run: |
          set -ex
          cd examples/apo
          uv run room_selector_apo.py
        env:
          OPENAI_BASE_URL: http://localhost:12306/
          OPENAI_API_KEY: dummy
        if: ${{ (success() || failure()) && matrix.setup-script != 'legacy' }}
      - name: Spider sanity check
        run: |
          set -ex
          cd examples/spider
          uv run sql_agent.py
        env:
          OPENAI_API_BASE: http://localhost:12306/
          OPENAI_API_KEY: dummy
        if: success() || failure()
      - name: Calc-X MCP sanity check
        run: |
          set -ex
          cd examples/calc_x
          uv run tests/test_mcp_calculator.py
        env:
          OPENAI_API_BASE: http://localhost:12306/
          OPENAI_API_KEY: dummy
      - name: Calc-X sanity check
        run: |
          set -ex
          cd examples/calc_x
          uv run legacy_calc_agent_debug.py
        env:
          OPENAI_BASE_URL: http://localhost:12306/
          OPENAI_API_KEY: dummy
      # Calc-X training only works after the sanity check above has run, and it has to
      # run before Spider training. The client side used to hang in many attempts.
      # Don't ask why. Don't touch this.
      - name: Calc-X training v0.1
        run: |
          set -ex
          source .venv/bin/activate
          cd examples/calc_x
          ../../scripts/restart_ray.sh
          sleep 5
          PYTHONUNBUFFERED=1 python legacy_calc_agent.py &
          bash legacy_train.sh
          pkill -f legacy_calc_agent.py && echo "SIGTERM sent to legacy_calc_agent.py" || echo "No legacy_calc_agent.py process found"
          while pgrep -f legacy_calc_agent.py; do
            echo "Waiting for legacy_calc_agent.py to finish..."
            sleep 5
          done
          echo "legacy_calc_agent.py has finished."
          sleep 10
        shell: bash
        env:
          WANDB_BASE_URL: ${{ secrets.MSR_WANDB_BASE_URL }}
          WANDB_API_KEY: ${{ secrets.MSR_WANDB_API_KEY }}
        id: calc_x_train
        if: success() || failure()
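      # The validation step below reads `project_name` and `run_name` from this step's
      # outputs; they are presumably written to $GITHUB_OUTPUT by the training scripts.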
      - name: Validate Calc-X training
        run: |
          set -ex
          uv run scripts/validate_example_wandb.py ${{ steps.calc_x_train.outputs.project_name }} ${{ steps.calc_x_train.outputs.run_name }}
        env:
          WANDB_BASE_URL: ${{ secrets.MSR_WANDB_BASE_URL }}
          WANDB_API_KEY: ${{ secrets.MSR_WANDB_API_KEY }}
      - name: Calc-X training v0.2
        run: |
          set -ex
          source .venv/bin/activate
          cd examples/calc_x
          ../../scripts/restart_ray.sh
          sleep 5
          PYTHONUNBUFFERED=1 python train_calc_agent.py --val-file data/test_mini.parquet --ci
          sleep 10
        shell: bash
        env:
          WANDB_BASE_URL: ${{ secrets.MSR_WANDB_BASE_URL }}
          WANDB_API_KEY: ${{ secrets.MSR_WANDB_API_KEY }}
        id: calc_x_train_v0_2
        if: success() || failure()
      - name: Calc-X training v0.2 LLM Proxy
        run: |
          set -ex
          source .venv/bin/activate
          cd examples/calc_x
          ../../scripts/restart_ray.sh
          sleep 5
          PYTHONUNBUFFERED=1 python train_calc_agent.py --val-file data/test_mini.parquet --ci --llm-proxy
          sleep 10
        shell: bash
        env:
          WANDB_BASE_URL: ${{ secrets.MSR_WANDB_BASE_URL }}
          WANDB_API_KEY: ${{ secrets.MSR_WANDB_API_KEY }}
        id: calc_x_train_v0_2_llm_proxy
        if: success() || failure()
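      # The external-store variant starts `agl store` separately and launches the same
      # training script twice, once as AGL_CURRENT_ROLE=runner and once as
      # AGL_CURRENT_ROLE=algorithm, both pointed at the shared store via
      # --external-store-address; AGL_MANAGED_STORE=0 presumably stops each process
      # from managing its own store.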
      - name: Calc-X training v0.2 External Store
        run: |
          set -ex
          source .venv/bin/activate
          cd examples/calc_x
          ../../scripts/restart_ray.sh
          agl store --port 4747 &
          sleep 5
          AGL_MANAGED_STORE=0 AGL_CURRENT_ROLE=runner python train_calc_agent.py --external-store-address http://localhost:4747 --val-file data/test_mini.parquet --ci &
          sleep 5
          AGL_MANAGED_STORE=0 AGL_CURRENT_ROLE=algorithm python train_calc_agent.py --external-store-address http://localhost:4747 --val-file data/test_mini.parquet --ci
          pkill -f agl && echo "SIGTERM sent to agl" || echo "No agl process found"
          while pgrep -f agl; do
            echo "Waiting for agl to finish..."
            sleep 5
          done
          pkill -f train_calc_agent.py && echo "SIGTERM sent to train_calc_agent.py" || echo "No train_calc_agent.py process found"
          while pgrep -f train_calc_agent.py; do
            echo "Waiting for train_calc_agent.py to finish..."
            sleep 5
          done
          echo "train_calc_agent.py has finished."
          sleep 10
        shell: bash
        env:
          WANDB_BASE_URL: ${{ secrets.MSR_WANDB_BASE_URL }}
          WANDB_API_KEY: ${{ secrets.MSR_WANDB_API_KEY }}
        id: calc_x_train_v0_2_external_store
        if: success() || failure()
      - name: Spider training
        run: |
          set -ex
          source .venv/bin/activate
          cd examples/spider
          ../../scripts/restart_ray.sh
          sleep 5
          PYTHONUNBUFFERED=1 python train_sql_agent.py fast
          sleep 10
        shell: bash
        env:
          WANDB_BASE_URL: ${{ secrets.MSR_WANDB_BASE_URL }}
          WANDB_API_KEY: ${{ secrets.MSR_WANDB_API_KEY }}
        id: spider_train
        if: success() || failure()
      - name: Validate Spider training
        run: |
          set -ex
          uv run scripts/validate_example_wandb.py ${{ steps.spider_train.outputs.project_name }} ${{ steps.spider_train.outputs.run_name }}
        env:
          WANDB_BASE_URL: ${{ secrets.MSR_WANDB_BASE_URL }}
          WANDB_API_KEY: ${{ secrets.MSR_WANDB_API_KEY }}
      # Unsloth Examples test
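      # The SFT example runs the rollout runners and the algorithm as two processes,
      # apparently coordinating through the shared `agl store`; the all-in-one variant
      # below runs the same pipeline in a single process. Both are expected to leave a
      # checkpoint in models/version_2 (models/version_0 is the base model downloaded
      # earlier).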
      - name: Unsloth SFT example
        run: |
          set -ex
          source .venv/bin/activate
          cd examples/unsloth
          agl store --port 4747 &
          sleep 5
          python sft_rollout_runners.py &
          sleep 5
          python sft_algorithm.py
          pkill -f agl && echo "SIGTERM sent to agl" || echo "No agl process found"
          while pgrep -f agl; do
            echo "Waiting for agl to finish..."
            sleep 5
          done
          pkill -f sft_rollout_runners.py && echo "SIGTERM sent to sft_rollout_runners.py" || echo "No sft_rollout_runners.py process found"
          while pgrep -f sft_rollout_runners.py; do
            echo "Waiting for sft_rollout_runners.py to finish..."
            sleep 5
          done
          echo "sft_rollout_runners.py has finished."
          sleep 10
          # Check that models/version_2 exists
          if [ ! -d "models/version_2" ]; then
            echo "models/version_2 does not exist"
            exit 1
          fi
        env:
          WANDB_BASE_URL: ${{ secrets.MSR_WANDB_BASE_URL }}
          WANDB_API_KEY: ${{ secrets.MSR_WANDB_API_KEY }}
        if: ${{ (success() || failure()) && matrix.setup-script != 'legacy' }}
      - name: Unsloth SFT example all-in-one
        run: |
          set -ex
          source .venv/bin/activate
          cd examples/unsloth
          rm -rf models/version_1 models/version_2
          python sft_allinone.py
          if [ ! -d "models/version_2" ]; then
            echo "models/version_2 does not exist"
            exit 1
          fi
        env:
          WANDB_BASE_URL: ${{ secrets.MSR_WANDB_BASE_URL }}
          WANDB_API_KEY: ${{ secrets.MSR_WANDB_API_KEY }}
        if: matrix.setup-script != 'legacy'
      # Cleanup
      - name: Cleanup
        run: ./scripts/cleanup.sh
        if: success() || failure()