
Commit 02c5325

Merge branch 'volcengine:main' into main
2 parents ac8f3ad + 7e4eec7 commit 02c5325


55 files changed: +1415 −689 lines

.github/workflows/README.md

Lines changed: 5 additions & 1 deletion
@@ -66,4 +66,8 @@ jobs:
         with:
           mode: "destroy"
           faas-url: "${{ env.DYNAMIC_RUNNER_URL }}"
-          task-id: "${{ needs.setup.outputs.task-id }}"
+          task-id: "${{ needs.setup.outputs.task-id }}"
+```
+
+### Model and Dataset
+To avoid the CI depending on the network, we pre-download datasets onto an NFS mount on the CI machine. Models are stored under \${HOME}/models and datasets under \${HOME}/models/hf_data.
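For context, the workflows below consume these pre-staged paths by exporting HF_HUB_OFFLINE and pointing the scripts at the local copies. A minimal sketch of such a step, assuming a model already staged under ${HOME}/models and the GSM8K data under ${HOME}/models/hf_data/gsm8k (the step name and the directory checks are illustrative, not part of this commit):

```yaml
# Illustrative step: run entirely from the NFS-staged copies.
- name: Use pre-staged model and dataset
  run: |
    # Fail fast if the NFS cache is missing instead of silently re-downloading.
    test -d "${HOME}/models/Qwen/Qwen2.5-0.5B"
    test -d "${HOME}/models/hf_data/gsm8k"
    # Block any accidental Hugging Face Hub traffic.
    export HF_HUB_OFFLINE=1
    python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
```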

.github/workflows/checkpoint_converter.yml

Lines changed: 3 additions & 3 deletions
@@ -92,8 +92,8 @@ jobs:
           pip3 install -e .[test]
       - name: Download Model to Use
         run: |
-          huggingface-cli download Qwen/Qwen2.5-0.5B --local-dir ${HOME}/models/Qwen/Qwen2.5-0.5B
-          huggingface-cli download deepseek-ai/deepseek-coder-1.3b-instruct --local-dir ${HOME}/models/deepseek-ai/deepseek-coder-1.3b-instruct
+          # huggingface-cli download Qwen/Qwen2.5-0.5B --local-dir ${HOME}/models/Qwen/Qwen2.5-0.5B
+          # huggingface-cli download deepseek-ai/deepseek-coder-1.3b-instruct --local-dir ${HOME}/models/deepseek-ai/deepseek-coder-1.3b-instruct
           export HF_HUB_OFFLINE=1
       - name: Running Huggingface to Megatron dist_ckpt converter (Qwen/Qwen2.5-0.5B)
         run: |
@@ -127,7 +127,7 @@ jobs:
           pip3 install -e .[test]
       - name: Download Model to Use
         run: |
-          huggingface-cli download Qwen/Qwen1.5-MoE-A2.7B-Chat --local-dir ${HOME}/models/Qwen/Qwen1.5-MoE-A2.7B-Chat
+          # huggingface-cli download Qwen/Qwen1.5-MoE-A2.7B-Chat --local-dir ${HOME}/models/Qwen/Qwen1.5-MoE-A2.7B-Chat
           export HF_HUB_OFFLINE=1
       - name: Running Huggingface to Megatron dist_ckpt CPU converter (Qwen/Qwen1.5-MoE-A2.7B-Chat)
         run: |
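With the huggingface-cli downloads commented out and HF_HUB_OFFLINE=1 exported, the converter jobs now assume the model weights are already present on the NFS mount. A minimal sketch of a guard step one could place before the converter (the step name and the existence checks are illustrative, not part of this commit):

```yaml
- name: Check pre-staged models
  run: |
    # The converter runs offline, so the weights must already be on the NFS mount.
    for d in "${HOME}/models/Qwen/Qwen2.5-0.5B" \
             "${HOME}/models/deepseek-ai/deepseek-coder-1.3b-instruct"; do
      test -d "$d" || { echo "missing pre-staged model: $d"; exit 1; }
    done
    export HF_HUB_OFFLINE=1
```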

.github/workflows/e2e_dapo.yml

Lines changed: 38 additions & 5 deletions
@@ -83,19 +83,36 @@ concurrency:
 permissions:
   contents: read
 
+env:
+  IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2"
+  DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
+
 jobs:
+  setup:
+    if: github.repository_owner == 'volcengine'
+    runs-on: ubuntu-latest
+    outputs:
+      runner-label: ${{ steps.create-runner.outputs.runner-label }}
+      mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
+    steps:
+      - uses: actions/checkout@v4
+      - id: create-runner
+        uses: volcengine/vemlp-github-runner@v1
+        with:
+          mode: "create"
+          faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
+          mlp-image: "${{ env.IMAGE }}"
+
   e2e_dapo:
-    runs-on: [L20x8]
+    needs: setup
+    runs-on: [ "${{ needs.setup.outputs.runner-label || 'L20x8' }}" ]
     timeout-minutes: 40 # Increase this timeout value as needed
     env:
       HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
       HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
       NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
       HF_ENDPOINT: "https://hf-mirror.com"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
-    container:
-      image: verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2
-      options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
@@ -105,8 +122,24 @@ jobs:
           pip3 install --no-deps -e .[test,gpu]
       - name: Prepare GSM8K dataset
         run: |
-          python3 examples/data_preprocess/gsm8k.py
+          python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
       - name: Running the E2E test with the DAPO algorithm
         run: |
           ray stop --force
           bash tests/special_e2e/run_dapo.sh
+
+  cleanup:
+    runs-on: ubuntu-latest
+    needs:
+      [
+        setup,
+        e2e_dapo
+      ]
+    if: always()
+    steps:
+      - id: destroy-runner
+        uses: volcengine/vemlp-github-runner@v1
+        with:
+          mode: "destroy"
+          faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
+          mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"
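The same three-job shape recurs in the workflows that follow: a setup job creates a dynamic VeMLP runner, the GPU test job targets its label with a static fallback, and a cleanup job always destroys it. A minimal sketch of that lifecycle, assuming the top-level env block (IMAGE, DYNAMIC_RUNNER_ENDPOINT) from this diff; the "test" job name and its echo step are placeholders:

```yaml
# Minimal sketch of the shared dynamic-runner lifecycle used by these workflows.
jobs:
  setup:
    runs-on: ubuntu-latest
    outputs:
      runner-label: ${{ steps.create-runner.outputs.runner-label }}
      mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
    steps:
      - id: create-runner
        uses: volcengine/vemlp-github-runner@v1
        with:
          mode: "create"
          faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
          mlp-image: "${{ env.IMAGE }}"

  test:  # stands in for e2e_dapo, e2e_genrm_remote, etc.
    needs: setup
    # Use the dynamically created runner; fall back to the static L20x8 label
    # when the runner-label output is empty.
    runs-on: [ "${{ needs.setup.outputs.runner-label || 'L20x8' }}" ]
    steps:
      - run: echo "run the actual E2E test here"

  cleanup:
    runs-on: ubuntu-latest
    needs: [ setup, test ]
    if: always()  # tear the runner down even if the test job fails
    steps:
      - uses: volcengine/vemlp-github-runner@v1
        with:
          mode: "destroy"
          faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
          mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"
```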

.github/workflows/e2e_genrm_remote.yml

Lines changed: 38 additions & 5 deletions
@@ -76,19 +76,36 @@ concurrency:
 permissions:
   contents: read
 
+env:
+  IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2"
+  DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
+
 jobs:
+  setup:
+    if: github.repository_owner == 'volcengine'
+    runs-on: ubuntu-latest
+    outputs:
+      runner-label: ${{ steps.create-runner.outputs.runner-label }}
+      mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
+    steps:
+      - uses: actions/checkout@v4
+      - id: create-runner
+        uses: volcengine/vemlp-github-runner@v1
+        with:
+          mode: "create"
+          faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
+          mlp-image: "${{ env.IMAGE }}"
+
   e2e_genrm_remote:
-    runs-on: [L20x8]
+    needs: setup
+    runs-on: [ "${{ needs.setup.outputs.runner-label || 'L20x8' }}" ]
     timeout-minutes: 40 # Increase this timeout value as needed
     env:
       HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
       HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
       NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
       HF_ENDPOINT: "https://hf-mirror.com"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
-    container:
-      image: verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2
-      options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
@@ -98,8 +115,24 @@ jobs:
           pip3 install --no-deps -e .[test,gpu]
       - name: Prepare GSM8K dataset
         run: |
-          python3 examples/data_preprocess/gsm8k.py
+          python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
       - name: Running the E2E test with the Generative Reward Model
         run: |
           ray stop --force
           bash tests/special_e2e/run_genrm_remote.sh
+
+  cleanup:
+    runs-on: ubuntu-latest
+    needs:
+      [
+        setup,
+        e2e_genrm_remote
+      ]
+    if: always()
+    steps:
+      - id: destroy-runner
+        uses: volcengine/vemlp-github-runner@v1
+        with:
+          mode: "destroy"
+          faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
+          mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"

.github/workflows/e2e_one_step_off_policy.yml

Lines changed: 41 additions & 10 deletions
@@ -83,10 +83,30 @@ concurrency:
 permissions:
   contents: read
 
+env:
+  IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2"
+  DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
+
 jobs:
+  setup:
+    if: github.repository_owner == 'volcengine'
+    runs-on: ubuntu-latest
+    outputs:
+      runner-label: ${{ steps.create-runner.outputs.runner-label }}
+      mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
+    steps:
+      - uses: actions/checkout@v4
+      - id: create-runner
+        uses: volcengine/vemlp-github-runner@v1
+        with:
+          mode: "create"
+          faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
+          mlp-image: "${{ env.IMAGE }}"
+
   # Test FSDP2 strategy
   e2e_one_step_off_policy_fsdp2:
-    runs-on: [L20x8]
+    needs: setup
+    runs-on: [ "${{ needs.setup.outputs.runner-label || 'L20x8' }}" ]
     timeout-minutes: 10 # Increase timeout for async training
     env:
       HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
@@ -95,9 +115,6 @@ jobs:
       HF_ENDPOINT: "https://hf-mirror.com"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
       ACTOR_STRATEGY: "fsdp2"
-    container:
-      image: verlai/verl:app-verl0.4-vllm0.8.5-mcore0.12.1
-      options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
@@ -107,15 +124,16 @@ jobs:
           pip3 install --no-deps -e .[test,gpu]
       - name: Prepare GSM8K dataset
         run: |
-          python3 examples/data_preprocess/gsm8k.py
+          python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
       - name: Running the E2E test with one_step_off_policy algorithm (FSDP2)
         run: |
           ray stop --force
           bash tests/special_e2e/run_one_step_off_policy.sh
 
   # Test Megatron strategy
   e2e_one_step_off_policy_megatron:
-    runs-on: [L20x8]
+    needs: setup
+    runs-on: [ "${{ needs.setup.outputs.runner-label || 'L20x8' }}" ]
     timeout-minutes: 10 # Increase timeout for async training
     env:
       HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
@@ -124,9 +142,6 @@ jobs:
       HF_ENDPOINT: "https://hf-mirror.com"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
       ACTOR_STRATEGY: "megatron"
-    container:
-      image: verlai/verl:app-verl0.4-vllm0.8.5-mcore0.12.1
-      options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
@@ -136,9 +151,25 @@ jobs:
           pip3 install --no-deps -e .[test,gpu]
       - name: Prepare GSM8K dataset
         run: |
-          python3 examples/data_preprocess/gsm8k.py
+          python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
       - name: Running the E2E test with one_step_off_policy algorithm (Megatron)
         run: |
           ray stop --force
           bash tests/special_e2e/run_one_step_off_policy.sh
 
+  cleanup:
+    runs-on: ubuntu-latest
+    needs:
+      [
+        setup,
+        e2e_one_step_off_policy_fsdp2,
+        e2e_one_step_off_policy_megatron
+      ]
+    if: always()
+    steps:
+      - id: destroy-runner
+        uses: volcengine/vemlp-github-runner@v1
+        with:
+          mode: "destroy"
+          faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
+          mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"
