diff --git a/scripts/performance/argument_parser.py b/scripts/performance/argument_parser.py
index f07e1e93afe2..ce8fef685a59 100644
--- a/scripts/performance/argument_parser.py
+++ b/scripts/performance/argument_parser.py
@@ -294,6 +294,13 @@ def parse_cli_args():
         required=False,
         default=None,
     )
+    parser.add_argument(
+        "--nccl_communicator_config_path",
+        type=str,
+        help="Path to NCCL communicator config yaml file",
+        required=False,
+        default=None,
+    )
 
     def list_of_strings(arg):
         return arg.split(',')
diff --git a/scripts/performance/llm/finetune_deepseek_v3.py b/scripts/performance/llm/finetune_deepseek_v3.py
index a9df11fbd8c1..84d4224130c9 100644
--- a/scripts/performance/llm/finetune_deepseek_v3.py
+++ b/scripts/performance/llm/finetune_deepseek_v3.py
@@ -101,6 +101,7 @@ def override_recipe_configs(
         enable_cuda_graphs=enable_cuda_graphs,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
 
     # disable HF ckpt loading
diff --git a/scripts/performance/llm/finetune_llama31_405b.py b/scripts/performance/llm/finetune_llama31_405b.py
index aff7e9c81ae4..3211c230a2be 100644
--- a/scripts/performance/llm/finetune_llama31_405b.py
+++ b/scripts/performance/llm/finetune_llama31_405b.py
@@ -89,6 +89,7 @@ def override_recipe_configs(
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
     recipe = set_exp_logging_configs(
         recipe,
diff --git a/scripts/performance/llm/finetune_llama3_70b.py b/scripts/performance/llm/finetune_llama3_70b.py
index 68f9b1ee3b80..aa000afb51ab 100644
--- a/scripts/performance/llm/finetune_llama3_70b.py
+++ b/scripts/performance/llm/finetune_llama3_70b.py
@@ -96,6 +96,7 @@ def override_recipe_configs(
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
     recipe = set_exp_logging_configs(
         recipe,
diff --git a/scripts/performance/llm/finetune_llama3_8b.py b/scripts/performance/llm/finetune_llama3_8b.py
index 28a83851ef92..f89fd818a7d7 100644
--- a/scripts/performance/llm/finetune_llama3_8b.py
+++ b/scripts/performance/llm/finetune_llama3_8b.py
@@ -82,6 +82,7 @@ def override_recipe_configs(
         enable_cuda_graphs=enable_cuda_graphs,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
     recipe = set_exp_logging_configs(
         recipe,
diff --git a/scripts/performance/llm/pretrain_gpt3_175b.py b/scripts/performance/llm/pretrain_gpt3_175b.py
index f0faf3470cc2..dfb70fd7b48d 100644
--- a/scripts/performance/llm/pretrain_gpt3_175b.py
+++ b/scripts/performance/llm/pretrain_gpt3_175b.py
@@ -79,6 +79,7 @@ def override_recipe_configs(
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
     recipe = set_exp_logging_configs(
         recipe, "pre_train", "llm", "gpt3", args.tensorboard, args.wandb, args.wandb_prj_name, args.wandb_job_name
diff --git a/scripts/performance/llm/pretrain_llama31_405b.py b/scripts/performance/llm/pretrain_llama31_405b.py
index 8e458163a33f..00f612267d8c 100644
--- a/scripts/performance/llm/pretrain_llama31_405b.py
+++ b/scripts/performance/llm/pretrain_llama31_405b.py
@@ -79,6 +79,7 @@ def override_recipe_configs(
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
     recipe = set_exp_logging_configs(
         recipe, "pre_train", "llm", "llama3", args.tensorboard, args.wandb, args.wandb_prj_name, args.wandb_job_name
diff --git a/scripts/performance/llm/pretrain_llama3_70b.py b/scripts/performance/llm/pretrain_llama3_70b.py
index 72f4b9be0d45..01227f52fcc5 100644
--- a/scripts/performance/llm/pretrain_llama3_70b.py
+++ b/scripts/performance/llm/pretrain_llama3_70b.py
@@ -79,6 +79,7 @@ def override_recipe_configs(
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
     recipe = set_exp_logging_configs(
         recipe, "pre_train", "llm", "llama3", args.tensorboard, args.wandb, args.wandb_prj_name, args.wandb_job_name
diff --git a/scripts/performance/llm/pretrain_llama3_8b.py b/scripts/performance/llm/pretrain_llama3_8b.py
index 781df30da53b..b775cfe562bd 100644
--- a/scripts/performance/llm/pretrain_llama3_8b.py
+++ b/scripts/performance/llm/pretrain_llama3_8b.py
@@ -65,6 +65,7 @@ def override_recipe_configs(
         enable_cuda_graphs=enable_cuda_graphs,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
     recipe = set_exp_logging_configs(
         recipe, "pre_train", "llm", "llama3", args.tensorboard, args.wandb, args.wandb_prj_name, args.wandb_job_name
diff --git a/scripts/performance/llm/pretrain_mixtral_8x22b.py b/scripts/performance/llm/pretrain_mixtral_8x22b.py
index 54335a8b5773..88c75d739d3c 100644
--- a/scripts/performance/llm/pretrain_mixtral_8x22b.py
+++ b/scripts/performance/llm/pretrain_mixtral_8x22b.py
@@ -73,6 +73,7 @@ def override_recipe_configs(
         activation_offload_layers,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
     recipe = set_exp_logging_configs(
         recipe, "pre_train", "llm", "mixtral", args.tensorboard, args.wandb, args.wandb_prj_name, args.wandb_job_name
diff --git a/scripts/performance/llm/pretrain_mixtral_8x7b.py b/scripts/performance/llm/pretrain_mixtral_8x7b.py
index 4a3ffe6c81d9..4d1412c4a076 100644
--- a/scripts/performance/llm/pretrain_mixtral_8x7b.py
+++ b/scripts/performance/llm/pretrain_mixtral_8x7b.py
@@ -67,6 +67,7 @@ def override_recipe_configs(
         enable_cuda_graphs,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
     recipe = set_exp_logging_configs(
         recipe, "pre_train", "llm", "mixtral", args.tensorboard, args.wandb, args.wandb_prj_name, args.wandb_job_name
diff --git a/scripts/performance/llm/pretrain_nemotron3_22b.py b/scripts/performance/llm/pretrain_nemotron3_22b.py
index 32f0669edc04..e85d230923b8 100644
--- a/scripts/performance/llm/pretrain_nemotron3_22b.py
+++ b/scripts/performance/llm/pretrain_nemotron3_22b.py
@@ -70,6 +70,7 @@ def override_recipe_configs(
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
     recipe = set_exp_logging_configs(
         recipe, "pre_train", "llm", "nemotron", args.tensorboard, args.wandb, args.wandb_prj_name, args.wandb_job_name
diff --git a/scripts/performance/llm/pretrain_nemotron3_8b.py b/scripts/performance/llm/pretrain_nemotron3_8b.py
index 4ba1fdd7828f..ccd9ac86dcad 100644
--- a/scripts/performance/llm/pretrain_nemotron3_8b.py
+++ b/scripts/performance/llm/pretrain_nemotron3_8b.py
@@ -64,6 +64,7 @@ def override_recipe_configs(
         enable_cuda_graphs=enable_cuda_graphs,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
     recipe = set_exp_logging_configs(
         recipe, "pre_train", "llm", "nemotron", args.tensorboard, args.wandb, args.wandb_prj_name, args.wandb_job_name
diff --git a/scripts/performance/llm/pretrain_nemotron4_15b.py b/scripts/performance/llm/pretrain_nemotron4_15b.py
index d27926d71035..c762ec604427 100644
--- a/scripts/performance/llm/pretrain_nemotron4_15b.py
+++ b/scripts/performance/llm/pretrain_nemotron4_15b.py
@@ -68,6 +68,7 @@ def override_recipe_configs(
         enable_cuda_graphs=enable_cuda_graphs,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
     recipe = set_exp_logging_configs(
         recipe, "pre_train", "llm", "nemotron", args.tensorboard, args.wandb, args.wandb_prj_name, args.wandb_job_name
diff --git a/scripts/performance/llm/pretrain_nemotron4_340b.py b/scripts/performance/llm/pretrain_nemotron4_340b.py
index 0f422c5a1ac8..f4434502a7b0 100644
--- a/scripts/performance/llm/pretrain_nemotron4_340b.py
+++ b/scripts/performance/llm/pretrain_nemotron4_340b.py
@@ -77,6 +77,7 @@ def override_recipe_configs(
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
     recipe = set_exp_logging_configs(
         recipe, "pre_train", "llm", "nemotron", args.tensorboard, args.wandb, args.wandb_prj_name, args.wandb_job_name
diff --git a/scripts/performance/utils.py b/scripts/performance/utils.py
index 4e85b53693ab..237dd9316ad7 100644
--- a/scripts/performance/utils.py
+++ b/scripts/performance/utils.py
@@ -252,6 +252,7 @@ def set_primary_perf_configs(
     compute_dtype: str = None,
     fp8_recipe: str = None,
     recompute_modules: Optional[List[str]] = None,
+    nccl_communicator_config_path: str = None,
 ):
     """Set experiment configs we usually tune for performance of all models."""
     # nemo.lightning.Trainer configs
@@ -277,6 +278,8 @@ def set_primary_perf_configs(
     recipe.trainer.strategy.expert_tensor_parallel_size = etp_size
 
     recipe.trainer.strategy.sequence_parallel = bool(tp_size > 1)
+    if nccl_communicator_config_path is not None:
+        recipe.trainer.strategy.nccl_communicator_config_path = nccl_communicator_config_path
 
     # callback configs
     comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks)
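
Usage note (not part of the patch): the sketch below is a minimal, self-contained check of the new CLI option and a reminder of where its value ends up. The add_argument call mirrors the one added in scripts/performance/argument_parser.py; the sample path is a placeholder. The YAML file itself is the per-communicator NCCL config consumed downstream by Megatron-Core (typically keys such as min_ctas, max_ctas, and cga_cluster_size per process group); consult the Megatron-Core documentation for the authoritative schema.

```python
# Standalone sketch of the new --nccl_communicator_config_path option.
# The add_argument call is copied from the diff above; the argv value is a
# placeholder path used only for illustration.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--nccl_communicator_config_path",
    type=str,
    help="Path to NCCL communicator config yaml file",
    required=False,
    default=None,
)

args = parser.parse_args(["--nccl_communicator_config_path", "/path/to/nccl_comm_cfg.yaml"])
assert args.nccl_communicator_config_path == "/path/to/nccl_comm_cfg.yaml"

# Downstream (see the utils.py hunk), set_primary_perf_configs only touches the
# recipe when a path was actually given:
#     if nccl_communicator_config_path is not None:
#         recipe.trainer.strategy.nccl_communicator_config_path = nccl_communicator_config_path
```

When the flag is omitted, the default of None leaves recipe.trainer.strategy untouched, so the existing performance scripts behave exactly as before.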