Skip to content

Commit 5462101

Browse files
committed
Add interface to set nccl communicator configs
Signed-off-by: Sangkug Lym <[email protected]>
1 parent 1a16383 commit 5462101

16 files changed: +24 −0 lines changed

scripts/performance/argument_parser.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,13 @@ def parse_cli_args():
294294
required=False,
295295
default=None,
296296
)
297+
parser.add_argument(
298+
"--nccl_communicator_config_path",
299+
type=str,
300+
help="Path to NCCL communicator config yaml file",
301+
required=False,
302+
default=None,
303+
)
297304

298305
def list_of_strings(arg):
299306
return arg.split(',')

scripts/performance/llm/finetune_deepseek_v3.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ def override_recipe_configs(
101101
enable_cuda_graphs=enable_cuda_graphs,
102102
compute_dtype=args.compute_dtype,
103103
fp8_recipe=args.fp8_recipe,
104+
nccl_communicator_config_path=args.nccl_communicator_config_path,
104105
)
105106

106107
# disable HF ckpt loading

scripts/performance/llm/finetune_llama31_405b.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ def override_recipe_configs(
8989
activation_offload_layers=activation_offload_layers,
9090
compute_dtype=args.compute_dtype,
9191
fp8_recipe=args.fp8_recipe,
92+
nccl_communicator_config_path=args.nccl_communicator_config_path,
9293
)
9394
recipe = set_exp_logging_configs(
9495
recipe,

scripts/performance/llm/finetune_llama3_70b.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ def override_recipe_configs(
9696
activation_offload_layers=activation_offload_layers,
9797
compute_dtype=args.compute_dtype,
9898
fp8_recipe=args.fp8_recipe,
99+
nccl_communicator_config_path=args.nccl_communicator_config_path,
99100
)
100101
recipe = set_exp_logging_configs(
101102
recipe,

scripts/performance/llm/finetune_llama3_8b.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ def override_recipe_configs(
8282
enable_cuda_graphs=enable_cuda_graphs,
8383
compute_dtype=args.compute_dtype,
8484
fp8_recipe=args.fp8_recipe,
85+
nccl_communicator_config_path=args.nccl_communicator_config_path,
8586
)
8687
recipe = set_exp_logging_configs(
8788
recipe,

scripts/performance/llm/pretrain_gpt3_175b.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ def override_recipe_configs(
7979
activation_offload_layers=activation_offload_layers,
8080
compute_dtype=args.compute_dtype,
8181
fp8_recipe=args.fp8_recipe,
82+
nccl_communicator_config_path=args.nccl_communicator_config_path,
8283
)
8384
recipe = set_exp_logging_configs(
8485
recipe, "pre_train", "llm", "gpt3", args.tensorboard, args.wandb, args.wandb_prj_name, args.wandb_job_name

scripts/performance/llm/pretrain_llama31_405b.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ def override_recipe_configs(
7979
activation_offload_layers=activation_offload_layers,
8080
compute_dtype=args.compute_dtype,
8181
fp8_recipe=args.fp8_recipe,
82+
nccl_communicator_config_path=args.nccl_communicator_config_path,
8283
)
8384
recipe = set_exp_logging_configs(
8485
recipe, "pre_train", "llm", "llama3", args.tensorboard, args.wandb, args.wandb_prj_name, args.wandb_job_name

scripts/performance/llm/pretrain_llama3_70b.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ def override_recipe_configs(
7979
activation_offload_layers=activation_offload_layers,
8080
compute_dtype=args.compute_dtype,
8181
fp8_recipe=args.fp8_recipe,
82+
nccl_communicator_config_path=args.nccl_communicator_config_path,
8283
)
8384
recipe = set_exp_logging_configs(
8485
recipe, "pre_train", "llm", "llama3", args.tensorboard, args.wandb, args.wandb_prj_name, args.wandb_job_name

scripts/performance/llm/pretrain_llama3_8b.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ def override_recipe_configs(
6565
enable_cuda_graphs=enable_cuda_graphs,
6666
compute_dtype=args.compute_dtype,
6767
fp8_recipe=args.fp8_recipe,
68+
nccl_communicator_config_path=args.nccl_communicator_config_path,
6869
)
6970
recipe = set_exp_logging_configs(
7071
recipe, "pre_train", "llm", "llama3", args.tensorboard, args.wandb, args.wandb_prj_name, args.wandb_job_name

scripts/performance/llm/pretrain_mixtral_8x22b.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ def override_recipe_configs(
7373
activation_offload_layers,
7474
compute_dtype=args.compute_dtype,
7575
fp8_recipe=args.fp8_recipe,
76+
nccl_communicator_config_path=args.nccl_communicator_config_path,
7677
)
7778
recipe = set_exp_logging_configs(
7879
recipe, "pre_train", "llm", "mixtral", args.tensorboard, args.wandb, args.wandb_prj_name, args.wandb_job_name

0 commit comments

Comments (0)