
Commit 286466e

ISEEKYAN authored and huangjunyi.0 committed
[megatron] chore: update example 671B script, no offline dist-ckpt needed any more (volcengine#2945)
### What does this PR do?

Update the example 671B script: an offline dist-ckpt conversion is no longer needed.
1 parent 016c710 commit 286466e
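With this change the model setup collapses to a single checkpoint download. A sketch of the before/after workflow (model id taken from the script; everything else per the diff below):

# after this commit: one download, no offline conversion
huggingface-cli download deepseek-ai/DeepSeek-V3-0324
# before: a separately converted Megatron dist-ckpt was required,
# with DIST_CKPT_PATH pointing at it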

File tree

1 file changed (+31 −25): examples/grpo_trainer/run_deepseek671b_math_megatron.sh
@@ -1,21 +1,18 @@
 set -x
 
-# 0. download the config
-# only need to download the `configuration_deepseek.py`, `config.json`, `tokenizer_config.json`, `tokenizer.json` and `generation_config.json`
-# remove the `quantization_config` in the `config.json`
-# set `num_nextn_predict_layers=0` to disable MTP, which is not currently supported
+# # 0. download HF checkpoint
+# # remove the `quantization_config` in the `config.json`
+# # set `num_nextn_predict_layers=0` to disable MTP, which is not currently supported
+# huggingface-cli download deepseek-ai/DeepSeek-V3-0324
 
-huggingface-cli download deepseek-ai/DeepSeek-V3-0324 configuration_deepseek.py config.json
-
-# 1. download the dist_ckpt format model from https://huggingface.co/BearBiscuit05/dpsk-v3-671B-BF16-dist_ckpt/tree/main
-# change the HF_MODEL_PATH and DIST_CKPT_PATH to your own path
-DIST_CKPT_PATH="<path_to_dist_ckpt>"
+# no offline dist checkpoint needed, now with mbridge>=0.13.0, we can directly init model from huggingface downloaded fp8 weights
+# tested on docker://verlai/verl:app-verl0.5-vllm0.10.0-mcore0.13.0-te2.2
 LLM="<path_to_dsv3_config>"
 
 
 # 2. run the script
-gsm8k_train_path=/data/gsm8k/train.parquet
-gsm8k_test_path=/data/gsm8k/test.parquet
+gsm8k_train_path=/root/data/gsm8k/train.parquet
+gsm8k_test_path=/root/data/gsm8k/test.parquet
 train_files=$gsm8k_train_path
 test_files=$gsm8k_test_path
 

@@ -33,30 +30,32 @@ CRITIC_GRAD_OFFLOAD=${CRITIC_GRAD_OFFLOAD:-$COMMON_GRAD_OFFLOAD}
 CRITIC_OPTIMIZER_OFFLOAD=${CRITIC_OPTIMIZER_OFFLOAD:-$COMMON_OPTIMIZER_OFFLOAD}
 RM_PARAM_OFFLOAD=${RM_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
 
-# 512 H20(96GB)
-NODES=64
+# 256 H100(80GB)
+NODES=32
 PP=16
 TP=1
-EP=32
+EP=16
 ETP=1
 INFER_TP=32
 # consider TP/ETP, and enable recompute if short of memory
 
 # full recompute
-# +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
-# +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
-# +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
 
 n_resp_per_prompt=4
+max_prompt_length=2048
+max_response_length=4096
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 1))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
 
 # RAY_ADDRESS='auto' ray job submit --working-dir . --
 python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer'\
 algorithm.adv_estimator=grpo \
 data.train_files="$train_files" \
 data.val_files="$test_files" \
 data.train_batch_size=512 \
-data.max_prompt_length=2048 \
-data.max_response_length=4096 \
+data.max_prompt_length=$max_prompt_length \
+data.max_response_length=$max_response_length \
 data.filter_overlong_prompts=True \
 data.truncation='error' \
 actor_rollout_ref.model.path=$LLM \
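For orientation, the new numbers line up as follows; a back-of-the-envelope check, assuming 8 GPUs per node as the "256 H100" comment implies:

NODES=32; GPUS_PER_NODE=8; PP=16; TP=1
WORLD=$((NODES * GPUS_PER_NODE))   # 256 GPUs total
DP=$((WORLD / (PP * TP)))          # 16 data-parallel replicas
echo "world=$WORLD dp=$DP"
echo "actor budget: $(( (2048 + 4096) * 1 )) tokens/GPU"   # 6144 for training passes
echo "infer budget: $(( (2048 + 4096) * 3 )) tokens/GPU"   # 18432 for log-prob passes

The 3x larger inference budget is plausible because log-prob computation is forward-only and keeps no activations for a backward pass.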
@@ -81,8 +80,15 @@ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megat
 trainer.nnodes=$NODES \
 trainer.save_freq=-1 \
 trainer.test_freq=5 \
-+actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_first_pipeline_stage=3 \
-+actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=2 \
+actor_rollout_ref.model.use_fused_kernels=True \
+actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
++actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_first_pipeline_stage=4 \
++actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=1 \
 actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP \
 actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP \
 actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP \
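The new uneven pipeline split checks out against DeepSeek-V3's 61 transformer layers: the commit moves one layer from the last stage to the first, so the 14 middle stages keep 4 layers each, presumably leaving the last stage headroom for the LM head and logits. A quick verification:

LAYERS=61; PP=16; FIRST=4; LAST=1
MIDDLE=$(( (LAYERS - FIRST - LAST) / (PP - 2) ))   # 56 / 14 = 4 layers per middle stage
echo "$FIRST + $((PP - 2))*$MIDDLE + $LAST = $(( FIRST + (PP - 2) * MIDDLE + LAST ))"   # prints 4 + 14*4 + 1 = 61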
@@ -95,10 +101,10 @@ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megat
 actor_rollout_ref.actor.megatron.optimizer_offload=${ACTOR_OPTIMIZER_OFFLOAD} \
 actor_rollout_ref.actor.megatron.grad_offload=${ACTOR_GRAD_OFFLOAD} \
 actor_rollout_ref.ref.megatron.param_offload=${REF_PARAM_OFFLOAD} \
-actor_rollout_ref.actor.megatron.use_dist_checkpointing=True \
-actor_rollout_ref.ref.megatron.use_dist_checkpointing=True \
-actor_rollout_ref.actor.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \
-actor_rollout_ref.ref.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \
++actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
++actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
++actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
+actor_rollout_ref.actor.megatron.use_mbridge=True \
 trainer.default_local_dir=$CKPT_DIR \
 trainer.val_before_train=False \
 trainer.total_epochs=100 $@
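Because the script ends with `$@`, extra Hydra overrides pass straight through on the command line. A hypothetical launch on the tested image (overrides illustrative only; both keys appear in the script):

bash examples/grpo_trainer/run_deepseek671b_math_megatron.sh \
    trainer.total_epochs=1 \
    trainer.save_freq=50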
