volcengine
diff --git a/tests/special_e2e/sft/test_sft_engine_all.sh b/tests/special_e2e/sft/test_sft_engine_all.sh
Lines changed: 6 additions & 2 deletions
diff --git a/verl/utils/seqlen_balancing.py b/verl/utils/seqlen_balancing.py
Lines changed: 2 additions & 1 deletion
@@ -18,12 +18,16 @@ BACKEND=fsdp SP_SIZE=2 FSDP_SIZE=-1 NUM_GPUS=8 FSDP_STRATEGY=fsdp bash tests/spe
 echo "run with sp4 fsdp_size4 num_gpus8 fsdp_strategy fsdp"
 BACKEND=fsdp SP_SIZE=4 FSDP_SIZE=4 NUM_GPUS=8 FSDP_STRATEGY=fsdp bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh
 
+# test with fsdp2
+echo "run with sp1 fsdp_size1 num_gpus1 fsdp_strategy fsdp2"
+BACKEND=fsdp SP_SIZE=1 FSDP_SIZE=1 NUM_GPUS=1 FSDP_STRATEGY=fsdp2 bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh
 
 # TODO: toggle the following tests when the grad norm of fsdp is fixed
-# BACKEND=fsdp SP_SIZE=1 FSDP_SIZE=1 NUM_GPUS=1 FSDP_STRATEGY=fsdp2 bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh
-# BACKEND=fsdp SP_SIZE=1 FSDP_SIZE=2 NUM_GPUS=8 FSDP_STRATEGY=fsdp2 bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh
+# echo "run with sp1 fsdp_size-1 num_gpus8 fsdp_strategy fsdp2"
 # BACKEND=fsdp SP_SIZE=1 FSDP_SIZE=-1 NUM_GPUS=8 FSDP_STRATEGY=fsdp2 bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh
+# echo "run with sp2 fsdp_size-1 num_gpus8 fsdp_strategy fsdp2"
 # BACKEND=fsdp SP_SIZE=2 FSDP_SIZE=-1 NUM_GPUS=8 FSDP_STRATEGY=fsdp2 bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh
+# BACKEND=fsdp SP_SIZE=1 FSDP_SIZE=2 NUM_GPUS=8 FSDP_STRATEGY=fsdp2 bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh
 # BACKEND=fsdp SP_SIZE=4 FSDP_SIZE=4 NUM_GPUS=8 FSDP_STRATEGY=fsdp2 bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh
 
 # test with megatron
 
@@ -354,7 +354,8 @@ def prepare_dynamic_batch(data: DataProto, max_token_len: int) -> tuple[list[Dat
     for i, batch_idx in enumerate(batch_idx_list):
         tensors = dict(batch[i])
         non_tensors = {key: value[batch_idx] for key, value in data.non_tensor_batch.items()}
-        micro_batches.append(DataProto.from_dict(tensors, non_tensors, meta_info=data.meta_info))
+        meta_info = copy.deepcopy(data.meta_info)
+        micro_batches.append(DataProto.from_dict(tensors, non_tensors, meta_info=meta_info))
 
     return micro_batches, batch_idx_list