2 changes: 1 addition & 1 deletion llm/auto_parallel/gpt-3/gpt_with_intermediate.sh
@@ -98,7 +98,7 @@ python -u -m paddle.distributed.launch \
--amp_master_grad true \
--attention_probs_dropout_prob 0.1 \
--hidden_dropout_prob 0.1 \
--sharding_parallel_config "enable_stage1_tensor_fusion enable_stage1_overlap" \
--sharding_parallel_config "enable_tensor_fusion enable_overlap" \
--tensor_parallel_config "enable_mp_async_allreduce" \
--data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" \
--pipeline_parallel_config "enable_send_recv_overlap enable_split_backward" \
2 changes: 1 addition & 1 deletion llm/auto_parallel/llama/llama_with_api.sh
@@ -78,7 +78,7 @@ python -u -m paddle.distributed.launch \
--max_seq_length 4096 \
--sequence_parallel true \
--sharding "stage1" \
--sharding_parallel_config "enable_stage1_tensor_fusion enable_stage1_overlap" \
--sharding_parallel_config "enable_tensor_fusion enable_overlap" \
--tensor_parallel_config "enable_mp_async_allreduce" \
--model_type "llama_network" \
--ignore_load_lr_and_optim true \
2 changes: 1 addition & 1 deletion llm/auto_parallel/llama/run_llama3.sh
@@ -92,7 +92,7 @@ python -u -m paddle.distributed.launch \
--sharding "stage2" \
--pipeline_parallel_config "enable_send_recv_overlap" \
--data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" \
--sharding_parallel_config "enable_stage2_overlap" \
--sharding_parallel_config "enable_overlap" \
--tensor_parallel_config "enable_mp_async_allreduce" \
--to_static 1 \
--amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
38 changes: 25 additions & 13 deletions paddlenlp/trainer/training_args.py
@@ -285,7 +285,9 @@ class TrainingArguments:
Some additional configs highly affect the usage of sharding parallel; we provide options to configure them.
The following configs are supported:
enable_stage1_tensor_fusion, fuse small tensors into big tensor chunks to accelerate communications, may increase memory occupation
enable_tensor_fusion, fuse small tensors into big tensor chunks to accelerate communications; may increase memory occupation. Only used in semi-auto mode.
enable_stage1_overlap, fuse small tensors into big tensor chunks to accelerate communications and do communication overlap with backward computation, may harm the backward speed
enable_overlap, fuse small tensors into big tensor chunks to accelerate communications and overlap the communication with backward computation; may harm the backward speed. Only used in semi-auto mode.
enable_stage2_overlap, overlap stage2 NCCL communication with computation. There are some constraints for the overlap, such as the logging_step should be bigger than 1 for broadcast overlap and no other sync could be called during the training for broadcast overlap.

Review comment (Contributor): This is a bit messy. What is the intended usage relationship among enable_stage1_overlap, enable_stage2_overlap, and enable_overlap?

enable_stage1_broadcast_overlap, overlap stage1 V1 broadcast with next step forward computation. There are some constraints for the overlap, such as the logging_step should be bigger than 1 for broadcast overlap forward compute and no other sync could be called during the training for broadcast overlap.
enable_stage1_allgather_overlap, overlap stage1 V2 allgather with next step forward computation. There are some constraints for the overlap, such as the logging_step should be bigger than 1 for allgather overlap forward compute and no other sync could be called during the training for allgather overlap.
@@ -723,12 +725,14 @@ class TrainingArguments:
"Some additional config it highly affect the useage of sharding parallel, we provide some option to config it."
"following config is support: \n"
"enable_stage1_tensor_fusion, fuse small tensors into big tensor chunks to accelerate communications, may increase memory occupation\n"
"enable_tensor_fusion, fuse small tensors into big tensor chunks to accelerate communications, may increase memory occupation only used for semi auto mode.\n"
"enable_stage1_overlap, fuse small tensors into big tensor chunks to accelerate communications and do communication overlap with backward computation, may harm the backward speed\n"
"enable_overlap, fuse small tensors into big tensor chunks to accelerate communications and do communication overlap with backward computation, may harm the backward speed only used for semi auto mode.\n"
"disable_stage1_reduce_avg, replace reduce_avg with original reduce_sum+scale in stage1, which can be used for accuracy verification.\n"
"enable_stage2_overlap, overlap stage2 NCCL communication with computation. There are some constraints for the overlap, such as the logging_step should be bigger than 1 for broadcast overlap and no other sync could be called during the training for broadcast overlap\n"
"enable_stage1_broadcast_overlap, overlap stage1 V1 broadcast with next step forward computation. There are some constraints for the overlap, such as the logging_step should be bigger than 1 for broadcast overlap forward compute and no other sync could be called during the training for broadcast overlap.\n"
"enable_stage1_allgather_overlap, overlap stage1 V2 allgather with next step forward computation. There are some constraints for the overlap, such as the logging_step should be bigger than 1 for allgather overlap forward compute and no other sync could be called during the training for allgather overlap.\n"
"enable_stage1_tensor_fusion_blanced_save_load, convert unbalanced optimizer state to balanced state when using tensor fusion strategy, which may increase the memory occupation."
"enable_tensor_fusion_blanced_save_load, convert unbalanced optimizer state to balanced state when using tensor fusion strategy, which may increase the memory occupation."
)
},
)
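
For reference, here is a minimal sketch of how a training script might select the renamed semi-auto options through TrainingArguments. The flag values come from the help text above; output_dir and the other surrounding arguments are placeholders, and the exact constructor signature may differ across PaddleNLP versions.

from paddlenlp.trainer import TrainingArguments

# Hedged example: unified sharding options for auto (semi-auto) parallel mode.
# enable_tensor_fusion / enable_overlap replace the old stage-specific names.
args = TrainingArguments(
    output_dir="./checkpoints",  # placeholder path
    sharding="stage1",
    sharding_parallel_config="enable_tensor_fusion enable_overlap",
)
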
@@ -1645,27 +1649,35 @@ def is_segment_parallel_supported():
for x in sharding_parallel_config:
if len(x) > 0:
if x not in [
"enable_stage1_tensor_fusion",
"enable_stage1_overlap",
"enable_stage2_overlap",
"enable_tensor_fusion",
"enable_overlap",
"enable_release_grads",
"enable_stage1_tensor_fusion_blanced_save_load",
"enable_tensor_fusion_blanced_save_load",
]:
if x in ["enable_stage1_overlap", "enable_stage2_overlap"]:
raise ValueError(
"enable_stage1_overlap and enable_stage2_overlap are not supported in "
"auto_parallel mode. Please use enable_overlap instead."
)
elif x == "enable_stage1_tensor_fusion":
raise ValueError(
"enable_stage1_tensor_fusion is not supported in auto_parallel mode. "
"Please use enable_tensor_fusion instead."
)
raise ValueError(
f"Found unknown pipeline mode config {x}, " f"accpet config is reduce_overlap."
f"Found unknown sharding mode config {x}, "
f"accpet config is enable_tensor_fusion, "
"enable_overlap, enable_release_grads, enable_tensor_fusion_blanced_save_load."
)

if (
"enable_stage1_overlap" in sharding_parallel_config
or "enable_stage2_overlap" in sharding_parallel_config
):
if "enable_overlap" in sharding_parallel_config:
sharding.enable_overlap = True

if "enable_stage1_tensor_fusion" in sharding_parallel_config:
if "enable_tensor_fusion" in sharding_parallel_config:
sharding.grad_bucket_size_numel = 210355872
sharding.enable_stage1_tensor_fusion = True
sharding.enable_tensor_fusion = True

if "enable_stage1_tensor_fusion_blanced_save_load" in sharding_parallel_config:
if "enable_tensor_fusion_blanced_save_load" in sharding_parallel_config:
sharding.save_unbalanced_param = False

if "enable_release_grads" in sharding_parallel_config:
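
For existing launch scripts and JSON configs, the rename is mechanical. Below is a hypothetical migration helper (not part of this PR) that mirrors the old-to-new mapping enforced by the validation above; the function and variable names are illustrative only.

# Hypothetical helper: rewrite legacy sharding option names to the unified
# names accepted in auto_parallel mode.
_RENAMES = {
    "enable_stage1_tensor_fusion": "enable_tensor_fusion",
    "enable_stage1_overlap": "enable_overlap",
    "enable_stage2_overlap": "enable_overlap",
    "enable_stage1_tensor_fusion_blanced_save_load": "enable_tensor_fusion_blanced_save_load",
}

def migrate_sharding_parallel_config(config: str) -> str:
    # Replace old names and drop duplicates (stage1/stage2 overlap collapse to one flag).
    options = [_RENAMES.get(opt, opt) for opt in config.split()]
    return " ".join(dict.fromkeys(options))

print(migrate_sharding_parallel_config("enable_stage1_tensor_fusion enable_stage1_overlap"))
# -> enable_tensor_fusion enable_overlap
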
12 changes: 6 additions & 6 deletions scripts/distribute/ci_case_auto.sh
@@ -612,7 +612,7 @@ function llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2() {
--sharding "stage2" \
--pipeline_parallel_config "enable_send_recv_overlap" \
--data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" \
--sharding_parallel_config "enable_stage2_overlap" \
--sharding_parallel_config "enable_overlap" \
--tensor_parallel_config "enable_mp_async_allreduce" \
--to_static 1 \
--amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
@@ -712,7 +712,7 @@ function llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2-VPP3_split_bw() {
--sharding "stage2" \
--pipeline_parallel_config "enable_send_recv_overlap enable_split_backward" \
--data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" \
--sharding_parallel_config "enable_stage2_overlap" \
--sharding_parallel_config "enable_overlap" \
--tensor_parallel_config "enable_mp_async_allreduce" \
--to_static 1 \
--amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
@@ -1798,7 +1798,7 @@ function llama_baichuan_pir_auto_fuse_ffn_attention_qkv_DP2_MP2_PP2(){
--sequence_parallel false \
--sharding "stage1" \
--data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate " \
--sharding_parallel_config "enable_stage1_overlap" \
--sharding_parallel_config "enable_overlap" \
--tensor_parallel_config "enable_mp_async_allreduce" \
--pipeline_parallel_config "enable_send_recv_overlap" \
--auto_parallel_resume_form_hybrid_parallel true \
@@ -1870,7 +1870,7 @@ function llama_baichuan_pir_auto_fuse_ffn_attention_qkv_DP2_MP2_PP2_intermediate
--sequence_parallel false \
--sharding "stage1" \
--data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate " \
--sharding_parallel_config "enable_stage1_overlap" \
--sharding_parallel_config "enable_overlap" \
--tensor_parallel_config "enable_mp_async_allreduce" \
--pipeline_parallel_config "enable_send_recv_overlap" \
--auto_parallel_resume_form_hybrid_parallel true \
@@ -2422,7 +2422,7 @@ function llm_gpt_pir_auto_bs8_DP2_TP2_PP2(){
--fp16_opt_level "O2" \
--num_hidden_layers 2 \
--intermediate_size 1024 \
--sharding_parallel_config "enable_stage1_tensor_fusion enable_stage1_overlap" \
--sharding_parallel_config "enable_tensor_fusion enable_overlap" \
--tensor_parallel_config "enable_mp_async_allreduce" \
--data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" \
--pipeline_parallel_config "enable_send_recv_overlap enable_split_backward" \
@@ -2489,7 +2489,7 @@ function llm_gpt_pir_auto_bs8_DP2_TP2_PP2_intermediate(){
--fp16_opt_level "O2" \
--num_hidden_layers 2 \
--intermediate_size 1024 \
--sharding_parallel_config "enable_stage1_tensor_fusion enable_stage1_overlap" \
--sharding_parallel_config "enable_tensor_fusion enable_overlap" \
--tensor_parallel_config "enable_mp_async_allreduce" \
--data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" \
--pipeline_parallel_config "enable_send_recv_overlap enable_split_backward" \
@@ -48,7 +48,7 @@
"max_seq_length": 4096,
"sequence_parallel": false,
"sharding": "stage1",
"sharding_parallel_config": "enable_stage1_tensor_fusion enable_stage1_overlap",
"sharding_parallel_config": "enable_tensor_fusion enable_overlap",
"tensor_parallel_config": "enable_mp_async_allreduce replace_with_parallel_cross_entropy",
"data_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate",
"pipeline_parallel_config": "enable_send_recv_overlap enable_split_backward"
@@ -51,7 +51,7 @@
"amp_master_grad": true,
"attention_probs_dropout_prob": 0.1,
"hidden_dropout_prob": 0.1,
"sharding_parallel_config": "enable_stage1_tensor_fusion enable_stage1_overlap",
"sharding_parallel_config": "enable_tensor_fusion enable_overlap",
"tensor_parallel_config": "enable_mp_async_allreduce replace_with_parallel_cross_entropy",
"data_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate",
"pipeline_parallel_config": "enable_send_recv_overlap enable_split_backward"
@@ -10,7 +10,7 @@
"pipeline_parallel_degree": 4,
"sharding": "stage1",
"data_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate",
"sharding_parallel_config": "enable_stage1_overlap",
"sharding_parallel_config": "enable_overlap enable_tensor_fusion",
"tensor_parallel_config": "enable_mp_async_allreduce",
"pipeline_parallel_config": "enable_send_recv_overlap enable_split_backward",
"pipeline_schedule_mode": "VPP",
@@ -10,7 +10,7 @@
"pipeline_parallel_degree": 4,
"sharding": "stage1",
"data_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate",
"sharding_parallel_config": "enable_stage1_overlap",
"sharding_parallel_config": "enable_overlap enable_tensor_fusion",
"tensor_parallel_config": "enable_mp_async_allreduce",
"pipeline_parallel_config": "enable_send_recv_overlap enable_split_backward",
"pipeline_schedule_mode": "VPP",
@@ -52,7 +52,7 @@
"virtual_pp_degree": 5,
"pipeline_schedule_mode": "VPP",
"data_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate",
"sharding_parallel_config": "enable_stage1_overlap",
"sharding_parallel_config": "enable_overlap enable_tensor_fusion",
"tensor_parallel_config": "enable_mp_async_allreduce replace_with_parallel_cross_entropy",
"max_seq_length": 4096,
"to_static": true,
@@ -10,7 +10,7 @@
"pipeline_parallel_degree": 1,
"sharding": "stage1",
"data_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate",
"sharding_parallel_config": "enable_stage1_overlap",
"sharding_parallel_config": "enable_overlap enable_tensor_fusion",
"tensor_parallel_config": "enable_mp_async_allreduce",
"pipeline_parallel_config": "",
"virtual_pp_degree": 1,
@@ -10,7 +10,7 @@
"pipeline_parallel_degree": 1,
"sharding": "stage1",
"data_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate",
"sharding_parallel_config": "enable_stage1_overlap",
"sharding_parallel_config": "enable_overlap enable_tensor_fusion",
"tensor_parallel_config": "enable_mp_async_allreduce",
"pipeline_parallel_config": "",
"virtual_pp_degree": 1,
@@ -10,7 +10,7 @@
"pipeline_parallel_degree": 1,
"sharding": "stage1",
"data_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate",
"sharding_parallel_config": "enable_stage1_overlap",
"sharding_parallel_config": "enable_overlap enable_tensor_fusion",
"tensor_parallel_config": "enable_mp_async_allreduce",
"pipeline_parallel_config": "",
"virtual_pp_degree": 1,
@@ -49,7 +49,7 @@
"to_static": 1,
"auto_parallel_resume_form_hybrid_parallel": true,
"data_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate",
"sharding_parallel_config": "enable_stage1_overlap",
"sharding_parallel_config": "enable_overlap enable_tensor_fusion",
"tensor_parallel_config": "enable_mp_async_allreduce replace_with_parallel_cross_entropy",
"pipeline_parallel_config": "enable_send_recv_overlap enable_split_backward"
}