# Examples Test #219 — workflow file for this run:
# .github/workflows/examples.yml at 627b072


	name: Examples Test
	permissions:
	contents: read
	on:
	schedule:
	# Every day at 3 AM UTC+8
	- cron: '0 19 * * *'

	workflow_dispatch:

	jobs:
	calc-x:
	name: Calc-X (Python ${{ matrix.python-version }}, ${{ matrix.setup-script }})
	runs-on: [self-hosted, 1ES.Pool=agl-runner-gpu]
	timeout-minutes: 90
	strategy:
	matrix:
	include:
	- python-version: '3.10'
	setup-script: 'legacy'
	- python-version: '3.12'
	setup-script: 'stable'
	- python-version: '3.13'
	setup-script: 'latest'
	fail-fast: false
	steps:
	- name: Check GPU status
	run: nvidia-smi
	- name: Check disk space
	run: df -h
	- uses: actions/checkout@v4
	- uses: astral-sh/setup-uv@v7
	with:
	enable-cache: true
	python-version: ${{ matrix.python-version }}
	- name: Upgrade dependencies (latest)
	run: uv lock --upgrade
	if: matrix.setup-script == 'latest'
	- name: Sync dependencies (latest)
	run: \|
	uv sync --frozen --no-default-groups --extra verl \
	--group dev --group experiment --group agents --group torch-gpu-stable
	if: matrix.setup-script == 'latest'
	- name: Sync dependencies (stable & legacy)
	run: \|
	uv sync --frozen --no-default-groups --extra verl \
	--group dev --group experiment --group agents --group torch-gpu-${{ matrix.setup-script }}
	if: matrix.setup-script != 'latest'
	- name: Freeze dependencies
	run: \|
	set -ex
	uv pip freeze \| tee requirements-freeze.txt
	echo "UV_LOCKED=1" >> $GITHUB_ENV
	echo "UV_NO_SYNC=1" >> $GITHUB_ENV
	- name: Upload dependencies artifact
	uses: actions/upload-artifact@v4
	with:
	name: dependencies-calc-x-${{ matrix.python-version }}-${{ matrix.setup-script }}
	path: requirements-freeze.txt
	compression-level: 0

	- name: Launch LiteLLM Proxy
	run: \|
	./scripts/litellm_run.sh
	env:
	AZURE_API_BASE: ${{ secrets.AZURE_API_BASE }}
	AZURE_API_KEY: ${{ secrets.AZURE_API_KEY }}

	- name: Prepare Calc-X dataset
	run: \|
	set -ex
	cd examples/calc_x
	uv run gdown --fuzzy https://drive.google.com/file/d/1FQMyKLLd6hP9dw9rfZn1EZOWNvKaDsqw/view
	unzip calc-x-data.zip -d data
	rm calc-x-data.zip

	- name: Calc-X MCP sanity check
	run: \|
	set -ex
	cd examples/calc_x
	uv run tests/test_mcp_calculator.py
	env:
	OPENAI_API_BASE: http://localhost:12306/
	OPENAI_API_KEY: dummy
	- name: Calc-X sanity check
	run: \|
	set -ex
	cd examples/calc_x
	uv run legacy_calc_agent_debug.py
	env:
	OPENAI_BASE_URL: http://localhost:12306/
	OPENAI_API_KEY: dummy

	# Calc-X training suddenly works after running the sanity check.
	# And it has to be run before Spider training.
	# The client side used to hang in many of my attempts.
	# Don't ask why. Don't touch this.
	- name: Calc-X training
	run: \|
	set -ex
	source .venv/bin/activate
	cd examples/calc_x
	../../scripts/restart_ray.sh
	sleep 5
	PYTHONUNBUFFERED=1 python train_calc_agent.py --val-file data/test_mini.parquet --ci
	sleep 10
	shell: bash
	env:
	WANDB_BASE_URL: ${{ secrets.MSR_WANDB_BASE_URL }}
	WANDB_API_KEY: ${{ secrets.MSR_WANDB_API_KEY }}
	id: calc_x_train

	- name: Validate Calc-X training
	run: \|
	set -ex
	uv run scripts/validate_example_wandb.py ${{ steps.calc_x_train.outputs.project_name }} ${{ steps.calc_x_train.outputs.run_name }}
	env:
	WANDB_BASE_URL: ${{ secrets.MSR_WANDB_BASE_URL }}
	WANDB_API_KEY: ${{ secrets.MSR_WANDB_API_KEY }}

	- name: Calc-X training LLM Proxy
	run: \|
	set -ex
	source .venv/bin/activate
	cd examples/calc_x
	../../scripts/restart_ray.sh
	sleep 5
	PYTHONUNBUFFERED=1 python train_calc_agent.py --val-file data/test_mini.parquet --ci --llm-proxy
	sleep 10
	shell: bash
	env:
	WANDB_BASE_URL: ${{ secrets.MSR_WANDB_BASE_URL }}
	WANDB_API_KEY: ${{ secrets.MSR_WANDB_API_KEY }}
	id: calc_x_train_llm_proxy

	- name: Calc-X training with external store
	run: \|
	set -ex
	source .venv/bin/activate
	cd examples/calc_x
	../../scripts/restart_ray.sh

	agl store --port 4747 &
	sleep 5
	AGL_MANAGED_STORE=0 AGL_CURRENT_ROLE=runner python train_calc_agent.py --external-store-address http://localhost:4747 --val-file data/test_mini.parquet --ci &
	sleep 5
	AGL_MANAGED_STORE=0 AGL_CURRENT_ROLE=algorithm python train_calc_agent.py --external-store-address http://localhost:4747 --val-file data/test_mini.parquet --ci

	pkill -f agl && echo "SIGTERM sent to agl" \|\| echo "No agl process found"
	while pgrep -f agl; do
	echo "Waiting for agl to finish..."
	sleep 5
	done
	pkill -f train_calc_agent.py && echo "SIGTERM sent to train_calc_agent.py" \|\| echo "No train_calc_agent.py process found"
	while pgrep -f train_calc_agent.py; do
	echo "Waiting for train_calc_agent.py to finish..."
	sleep 5
	done
	echo "train_calc_agent.py has finished."

	sleep 10
	shell: bash
	env:
	WANDB_BASE_URL: ${{ secrets.MSR_WANDB_BASE_URL }}
	WANDB_API_KEY: ${{ secrets.MSR_WANDB_API_KEY }}
	id: calc_x_train_external_store


	spider:
	name: Spider (Python ${{ matrix.python-version }}, ${{ matrix.setup-script }})
	runs-on: [self-hosted, 1ES.Pool=agl-runner-gpu]
	timeout-minutes: 60
	strategy:
	matrix:
	include:
	- python-version: '3.10'
	setup-script: 'legacy'
	- python-version: '3.12'
	setup-script: 'stable'
	- python-version: '3.13'
	setup-script: 'latest'
	fail-fast: false
	steps:
	- name: Check GPU status
	run: nvidia-smi
	- name: Check disk space
	run: df -h
	- uses: actions/checkout@v4
	- uses: astral-sh/setup-uv@v7
	with:
	enable-cache: true
	python-version: ${{ matrix.python-version }}
	- name: Upgrade dependencies (latest)
	run: uv lock --upgrade
	if: matrix.setup-script == 'latest'
	- name: Sync dependencies (latest)
	run: \|
	uv sync --frozen --no-default-groups --extra verl \
	--group dev --group experiment --group agents --group torch-gpu-stable
	if: matrix.setup-script == 'latest'
	- name: Sync dependencies (stable & legacy)
	run: \|
	uv sync --frozen --no-default-groups --extra verl \
	--group dev --group experiment --group agents --group torch-gpu-${{ matrix.setup-script }}
	if: matrix.setup-script != 'latest'
	- name: Freeze dependencies
	run: \|
	set -ex
	uv pip freeze \| tee requirements-freeze.txt
	echo "UV_LOCKED=1" >> $GITHUB_ENV
	echo "UV_NO_SYNC=1" >> $GITHUB_ENV
	- name: Upload dependencies artifact
	uses: actions/upload-artifact@v4
	with:
	name: dependencies-spider-${{ matrix.python-version }}-${{ matrix.setup-script }}
	path: requirements-freeze.txt
	compression-level: 0

	- name: Launch LiteLLM Proxy
	run: \|
	./scripts/litellm_run.sh
	env:
	AZURE_API_BASE: ${{ secrets.AZURE_API_BASE }}
	AZURE_API_KEY: ${{ secrets.AZURE_API_KEY }}

	- name: Prepare Spider dataset
	run: \|
	set -ex
	cd examples/spider
	uv run gdown --fuzzy https://drive.google.com/file/d/1oi9J1jZP9TyM35L85CL3qeGWl2jqlnL6/view
	unzip -q spider-data.zip -d data
	rm spider-data.zip

	- name: Spider sanity check
	run: \|
	set -ex
	cd examples/spider
	uv run sql_agent.py
	env:
	OPENAI_API_BASE: http://localhost:12306/
	OPENAI_API_KEY: dummy
	if: success() \|\| failure()

	- name: Spider training
	run: \|
	set -ex
	source .venv/bin/activate
	cd examples/spider
	../../scripts/restart_ray.sh
	sleep 5
	PYTHONUNBUFFERED=1 python train_sql_agent.py fast
	sleep 10
	shell: bash
	env:
	WANDB_BASE_URL: ${{ secrets.MSR_WANDB_BASE_URL }}
	WANDB_API_KEY: ${{ secrets.MSR_WANDB_API_KEY }}
	id: spider_train

	- name: Validate Spider training
	run: \|
	set -ex
	uv run scripts/validate_example_wandb.py ${{ steps.spider_train.outputs.project_name }} ${{ steps.spider_train.outputs.run_name }}
	env:
	WANDB_BASE_URL: ${{ secrets.MSR_WANDB_BASE_URL }}
	WANDB_API_KEY: ${{ secrets.MSR_WANDB_API_KEY }}


	apo:
	name: APO (Python ${{ matrix.python-version }}, ${{ matrix.setup-script }})
	# This job is run on GitHub hosted runners rather than self-hosted runners because it needs no GPU.
	runs-on: ubuntu-latest
	timeout-minutes: 30
	strategy:
	matrix:
	include:
	- python-version: '3.10'
	setup-script: 'legacy'
	- python-version: '3.12'
	setup-script: 'stable'
	- python-version: '3.13'
	setup-script: 'latest'
	fail-fast: false
	steps:
	- uses: actions/checkout@v4
	- uses: astral-sh/setup-uv@v7
	with:
	enable-cache: true
	python-version: ${{ matrix.python-version }}
	- name: Upgrade dependencies (latest)
	run: uv lock --upgrade
	if: matrix.setup-script == 'latest'
	- name: Sync dependencies (latest)
	run: \|
	uv sync --frozen --no-default-groups --extra apo \
	--group dev --group experiment --group agents --group core-stable
	if: matrix.setup-script == 'latest'
	- name: Sync dependencies (stable & legacy)
	run: \|
	uv sync --frozen --no-default-groups --extra apo \
	--group dev --group experiment --group agents --group core-${{ matrix.setup-script }}
	if: matrix.setup-script != 'latest'
	- name: Freeze dependencies
	run: \|
	set -ex
	uv pip freeze \| tee requirements-freeze.txt
	echo "UV_LOCKED=1" >> $GITHUB_ENV
	echo "UV_NO_SYNC=1" >> $GITHUB_ENV
	- name: Upload dependencies artifact
	uses: actions/upload-artifact@v4
	with:
	name: dependencies-apo-${{ matrix.python-version }}-${{ matrix.setup-script }}
	path: requirements-freeze.txt
	compression-level: 0

	- name: Launch LiteLLM Proxy
	run: \|
	./scripts/litellm_run.sh
	env:
	AZURE_API_BASE: ${{ secrets.AZURE_API_BASE }}
	AZURE_API_KEY: ${{ secrets.AZURE_API_KEY }}

	- name: APO custom algorithm
	run: \|
	set -ex
	cd examples/apo
	uv run apo_custom_algorithm_trainer.py \| tee _ci_apo.log
	# Check whether the log contains "Best prompt found:"
	grep "Best prompt found:" _ci_apo.log
	env:
	# New versions follow OPENAI_BASE_URL instead of OPENAI_API_BASE
	OPENAI_BASE_URL: http://localhost:12306/
	OPENAI_API_KEY: dummy
	- name: APO custom algorithm debugger
	run: \|
	set -ex
	cd examples/apo
	uv run apo_debug.py --mode runner
	uv run apo_debug.py --mode hook
	uv run apo_debug.py --mode trainer
	env:
	# New versions follow OPENAI_BASE_URL instead of OPENAI_API_BASE
	OPENAI_BASE_URL: http://localhost:12306/
	OPENAI_API_KEY: dummy

	- name: APO built-in algorithm
	run: \|
	set -ex
	cd examples/apo
	uv run room_selector_apo.py
	env:
	OPENAI_BASE_URL: http://localhost:12306/
	OPENAI_API_KEY: dummy
	if: matrix.setup-script != 'legacy'


	unsloth:
	name: Unsloth (Python ${{ matrix.python-version }}, ${{ matrix.setup-script }})
	runs-on: [self-hosted, 1ES.Pool=agl-runner-gpu]
	timeout-minutes: 60
	strategy:
	matrix:
	# Legacy versions are not supported for Unsloth examples.
	include:
	- python-version: '3.12'
	setup-script: 'stable'
	- python-version: '3.13'
	setup-script: 'latest'
	fail-fast: false
	steps:
	- name: Check GPU status
	run: nvidia-smi
	- name: Check disk space
	run: df -h
	- uses: actions/checkout@v4
	- uses: astral-sh/setup-uv@v7
	with:
	enable-cache: true
	python-version: ${{ matrix.python-version }}
	- name: Upgrade dependencies (latest)
	run: uv lock --upgrade
	if: matrix.setup-script == 'latest'
	- name: Sync dependencies
	run: \|
	uv sync --frozen --no-default-groups --extra verl \
	--group dev --group experiment --group trl --group agents --group torch-gpu-stable
	- name: Freeze dependencies
	run: \|
	set -ex
	uv pip freeze \| tee requirements-freeze.txt
	echo "UV_LOCKED=1" >> $GITHUB_ENV
	echo "UV_NO_SYNC=1" >> $GITHUB_ENV
	- name: Upload dependencies artifact
	uses: actions/upload-artifact@v4
	with:
	name: dependencies-unsloth-${{ matrix.python-version }}-${{ matrix.setup-script }}
	path: requirements-freeze.txt
	compression-level: 0

	- name: Prepare Unsloth model
	run: \|
	set -ex
	cd examples/unsloth
	rm -rf models
	uv run hf download unsloth/Qwen3-4B-Instruct-2507 --local-dir models/version_0

	- name: Unsloth SFT example
	run: \|
	set -ex
	source .venv/bin/activate
	cd examples/unsloth

	agl store --port 4747 &
	sleep 5
	python sft_rollout_runners.py &
	sleep 5
	python sft_algorithm.py

	pkill -f agl && echo "SIGTERM sent to agl" \|\| echo "No agl process found"
	while pgrep -f agl; do
	echo "Waiting for agl to finish..."
	sleep 5
	done
	pkill -f sft_rollout_runners.py && echo "SIGTERM sent to sft_rollout_runners.py" \|\| echo "No sft_rollout_runners.py process found"
	while pgrep -f sft_rollout_runners.py; do
	echo "Waiting for sft_rollout_runners.py to finish..."
	sleep 5
	done
	echo "sft_rollout_runners.py has finished."
	sleep 10

	# Check models/version_2 must exist
	if [ ! -d "models/version_2" ]; then
	echo "models/version_2 does not exist"
	exit 1
	fi
	env:
	WANDB_BASE_URL: ${{ secrets.MSR_WANDB_BASE_URL }}
	WANDB_API_KEY: ${{ secrets.MSR_WANDB_API_KEY }}

	- name: Unsloth SFT example all-in-one
	run: \|
	set -ex
	source .venv/bin/activate
	cd examples/unsloth
	rm -rf models/version_1 models/version_2

	python sft_allinone.py
	if [ ! -d "models/version_2" ]; then
	echo "models/version_2 does not exist"
	exit 1
	fi
	env:
	WANDB_BASE_URL: ${{ secrets.MSR_WANDB_BASE_URL }}
	WANDB_API_KEY: ${{ secrets.MSR_WANDB_API_KEY }}


	backward-compatibility:
	name: Backward Compatibility (Python ${{ matrix.python-version }}, ${{ matrix.setup-script }})
	runs-on: [self-hosted, 1ES.Pool=agl-runner-gpu]
	timeout-minutes: 30
	strategy:
	matrix:
	include:
	- python-version: '3.10'
	setup-script: 'legacy'
	- python-version: '3.12'
	setup-script: 'stable'
	fail-fast: false
	steps:
	- name: Check GPU status
	run: nvidia-smi
	- name: Check disk space
	run: df -h
	- uses: actions/checkout@v4
	- uses: astral-sh/setup-uv@v7
	with:
	enable-cache: true
	python-version: ${{ matrix.python-version }}
	- name: Sync dependencies
	run: \|
	uv sync --frozen --no-default-groups --extra apo --extra verl \
	--group dev --group experiment --group agents --group torch-gpu-${{ matrix.setup-script }}
	- name: Freeze dependencies
	run: \|
	set -ex
	uv pip freeze \| tee requirements-freeze.txt
	echo "UV_LOCKED=1" >> $GITHUB_ENV
	echo "UV_NO_SYNC=1" >> $GITHUB_ENV
	- name: Upload dependencies artifact
	uses: actions/upload-artifact@v4
	with:
	name: dependencies-backward-compatibility-${{ matrix.python-version }}-${{ matrix.setup-script }}
	path: requirements-freeze.txt
	compression-level: 0

	- name: Launch LiteLLM Proxy
	run: \|
	./scripts/litellm_run.sh
	env:
	AZURE_API_BASE: ${{ secrets.AZURE_API_BASE }}
	AZURE_API_KEY: ${{ secrets.AZURE_API_KEY }}
	- name: Prepare Calc-X dataset
	run: \|
	set -ex
	cd examples/calc_x
	uv run gdown --fuzzy https://drive.google.com/file/d/1FQMyKLLd6hP9dw9rfZn1EZOWNvKaDsqw/view
	unzip calc-x-data.zip -d data
	rm calc-x-data.zip

	- name: APO example (legacy client-server style)
	run: \|
	set -ex
	cd examples/apo
	uv run legacy_apo_client.py &
	sleep 3 # Wait for the client to be up
	uv run legacy_apo_server.py
	pkill -f legacy_apo_client.py && echo "SIGTERM sent to legacy_apo_client.py" \|\| echo "No legacy_apo_client.py process found"
	while pgrep -f legacy_apo_client.py; do
	echo "Waiting for legacy_apo_client.py to finish..."
	sleep 5
	done
	echo "legacy_apo_client.py has finished."
	sleep 10
	env:
	OPENAI_API_BASE: http://localhost:12306/
	OPENAI_API_KEY: dummy

	- name: Calc-X MCP sanity check
	run: \|
	set -ex
	cd examples/calc_x
	uv run tests/test_mcp_calculator.py
	env:
	OPENAI_API_BASE: http://localhost:12306/
	OPENAI_API_KEY: dummy
	- name: Calc-X sanity check
	run: \|
	set -ex
	cd examples/calc_x
	uv run legacy_calc_agent_debug.py
	env:
	OPENAI_BASE_URL: http://localhost:12306/
	OPENAI_API_KEY: dummy

	- name: Calc-X training (legacy client-server style)
	run: \|
	set -ex
	source .venv/bin/activate
	cd examples/calc_x
	../../scripts/restart_ray.sh
	sleep 5
	PYTHONUNBUFFERED=1 python legacy_calc_agent.py &
	bash legacy_train.sh
	pkill -f legacy_calc_agent.py && echo "SIGTERM sent to legacy_calc_agent.py" \|\| echo "No legacy_calc_agent.py process found"
	while pgrep -f legacy_calc_agent.py; do
	echo "Waiting for legacy_calc_agent.py to finish..."
	sleep 5
	done
	echo "legacy_calc_agent.py has finished."
	sleep 10
	shell: bash
	env:
	WANDB_BASE_URL: ${{ secrets.MSR_WANDB_BASE_URL }}
	WANDB_API_KEY: ${{ secrets.MSR_WANDB_API_KEY }}
	id: calc_x_train

	- name: Validate Calc-X training
	run: \|
	set -ex
	uv run scripts/validate_example_wandb.py ${{ steps.calc_x_train.outputs.project_name }} ${{ steps.calc_x_train.outputs.run_name }}
	env:
	WANDB_BASE_URL: ${{ secrets.MSR_WANDB_BASE_URL }}
	WANDB_API_KEY: ${{ secrets.MSR_WANDB_API_KEY }}

# End of the workflow file shown for run "Examples Test #219".
# The text that followed in the captured page was GitHub web-UI navigation
# ("Provide feedback", "Saved searches", "Jobs", "Run details",
# "Workflow file for this run") and is not part of the workflow.