diff --git a/scripts/performance/argument_parser.py b/scripts/performance/argument_parser.py
index f07e1e93afe2..ce8fef685a59 100644
--- a/scripts/performance/argument_parser.py
+++ b/scripts/performance/argument_parser.py
@@ -294,6 +294,13 @@ def parse_cli_args():
         required=False,
         default=None,
     )
+    parser.add_argument(
+        "--nccl_communicator_config_path",
+        type=str,
+        help="Path to NCCL communicator config yaml file",
+        required=False,
+        default=None,
+    )
 
     def list_of_strings(arg):
         return arg.split(',')
diff --git a/scripts/performance/llm/finetune_deepseek_v3.py b/scripts/performance/llm/finetune_deepseek_v3.py
index a9df11fbd8c1..84d4224130c9 100644
--- a/scripts/performance/llm/finetune_deepseek_v3.py
+++ b/scripts/performance/llm/finetune_deepseek_v3.py
@@ -101,6 +101,7 @@ def override_recipe_configs(
         enable_cuda_graphs=enable_cuda_graphs,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
 
     # disable HF ckpt loading
diff --git a/scripts/performance/llm/finetune_llama31_405b.py b/scripts/performance/llm/finetune_llama31_405b.py
index aff7e9c81ae4..3211c230a2be 100644
--- a/scripts/performance/llm/finetune_llama31_405b.py
+++ b/scripts/performance/llm/finetune_llama31_405b.py
@@ -89,6 +89,7 @@ def override_recipe_configs(
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
     recipe = set_exp_logging_configs(
         recipe,
diff --git a/scripts/performance/llm/finetune_llama3_70b.py b/scripts/performance/llm/finetune_llama3_70b.py
index 68f9b1ee3b80..aa000afb51ab 100644
--- a/scripts/performance/llm/finetune_llama3_70b.py
+++ b/scripts/performance/llm/finetune_llama3_70b.py
@@ -96,6 +96,7 @@ def override_recipe_configs(
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
     recipe = set_exp_logging_configs(
         recipe,
diff --git a/scripts/performance/llm/finetune_llama3_8b.py b/scripts/performance/llm/finetune_llama3_8b.py
index 28a83851ef92..f89fd818a7d7 100644
--- a/scripts/performance/llm/finetune_llama3_8b.py
+++ b/scripts/performance/llm/finetune_llama3_8b.py
@@ -82,6 +82,7 @@ def override_recipe_configs(
         enable_cuda_graphs=enable_cuda_graphs,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
     recipe = set_exp_logging_configs(
         recipe,
diff --git a/scripts/performance/llm/pretrain_gpt3_175b.py b/scripts/performance/llm/pretrain_gpt3_175b.py
index f0faf3470cc2..dfb70fd7b48d 100644
--- a/scripts/performance/llm/pretrain_gpt3_175b.py
+++ b/scripts/performance/llm/pretrain_gpt3_175b.py
@@ -79,6 +79,7 @@ def override_recipe_configs(
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
     recipe = set_exp_logging_configs(
         recipe, "pre_train", "llm", "gpt3", args.tensorboard, args.wandb, args.wandb_prj_name, args.wandb_job_name
diff --git a/scripts/performance/llm/pretrain_llama31_405b.py b/scripts/performance/llm/pretrain_llama31_405b.py
index 8e458163a33f..00f612267d8c 100644
--- a/scripts/performance/llm/pretrain_llama31_405b.py
+++ b/scripts/performance/llm/pretrain_llama31_405b.py
@@ -79,6 +79,7 @@ def override_recipe_configs(
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
     recipe = set_exp_logging_configs(
         recipe, "pre_train", "llm", "llama3", args.tensorboard, args.wandb, args.wandb_prj_name, args.wandb_job_name
diff --git a/scripts/performance/llm/pretrain_llama3_70b.py b/scripts/performance/llm/pretrain_llama3_70b.py
index 72f4b9be0d45..01227f52fcc5 100644
--- a/scripts/performance/llm/pretrain_llama3_70b.py
+++ b/scripts/performance/llm/pretrain_llama3_70b.py
@@ -79,6 +79,7 @@ def override_recipe_configs(
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
     recipe = set_exp_logging_configs(
         recipe, "pre_train", "llm", "llama3", args.tensorboard, args.wandb, args.wandb_prj_name, args.wandb_job_name
diff --git a/scripts/performance/llm/pretrain_llama3_8b.py b/scripts/performance/llm/pretrain_llama3_8b.py
index 781df30da53b..b775cfe562bd 100644
--- a/scripts/performance/llm/pretrain_llama3_8b.py
+++ b/scripts/performance/llm/pretrain_llama3_8b.py
@@ -65,6 +65,7 @@ def override_recipe_configs(
         enable_cuda_graphs=enable_cuda_graphs,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
     recipe = set_exp_logging_configs(
         recipe, "pre_train", "llm", "llama3", args.tensorboard, args.wandb, args.wandb_prj_name, args.wandb_job_name
diff --git a/scripts/performance/llm/pretrain_mixtral_8x22b.py b/scripts/performance/llm/pretrain_mixtral_8x22b.py
index 54335a8b5773..88c75d739d3c 100644
--- a/scripts/performance/llm/pretrain_mixtral_8x22b.py
+++ b/scripts/performance/llm/pretrain_mixtral_8x22b.py
@@ -73,6 +73,7 @@ def override_recipe_configs(
         activation_offload_layers,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
     recipe = set_exp_logging_configs(
         recipe, "pre_train", "llm", "mixtral", args.tensorboard, args.wandb, args.wandb_prj_name, args.wandb_job_name
diff --git a/scripts/performance/llm/pretrain_mixtral_8x7b.py b/scripts/performance/llm/pretrain_mixtral_8x7b.py
index 4a3ffe6c81d9..4d1412c4a076 100644
--- a/scripts/performance/llm/pretrain_mixtral_8x7b.py
+++ b/scripts/performance/llm/pretrain_mixtral_8x7b.py
@@ -67,6 +67,7 @@ def override_recipe_configs(
         enable_cuda_graphs,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
     recipe = set_exp_logging_configs(
         recipe, "pre_train", "llm", "mixtral", args.tensorboard, args.wandb, args.wandb_prj_name, args.wandb_job_name
diff --git a/scripts/performance/llm/pretrain_nemotron3_22b.py b/scripts/performance/llm/pretrain_nemotron3_22b.py
index 32f0669edc04..e85d230923b8 100644
--- a/scripts/performance/llm/pretrain_nemotron3_22b.py
+++ b/scripts/performance/llm/pretrain_nemotron3_22b.py
@@ -70,6 +70,7 @@ def override_recipe_configs(
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
     recipe = set_exp_logging_configs(
         recipe, "pre_train", "llm", "nemotron", args.tensorboard, args.wandb, args.wandb_prj_name, args.wandb_job_name
diff --git a/scripts/performance/llm/pretrain_nemotron3_8b.py b/scripts/performance/llm/pretrain_nemotron3_8b.py
index 4ba1fdd7828f..ccd9ac86dcad 100644
--- a/scripts/performance/llm/pretrain_nemotron3_8b.py
+++ b/scripts/performance/llm/pretrain_nemotron3_8b.py
@@ -64,6 +64,7 @@ def override_recipe_configs(
         enable_cuda_graphs=enable_cuda_graphs,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
     recipe = set_exp_logging_configs(
         recipe, "pre_train", "llm", "nemotron", args.tensorboard, args.wandb, args.wandb_prj_name, args.wandb_job_name
diff --git a/scripts/performance/llm/pretrain_nemotron4_15b.py b/scripts/performance/llm/pretrain_nemotron4_15b.py
index d27926d71035..c762ec604427 100644
--- a/scripts/performance/llm/pretrain_nemotron4_15b.py
+++ b/scripts/performance/llm/pretrain_nemotron4_15b.py
@@ -68,6 +68,7 @@ def override_recipe_configs(
         enable_cuda_graphs=enable_cuda_graphs,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
     recipe = set_exp_logging_configs(
         recipe, "pre_train", "llm", "nemotron", args.tensorboard, args.wandb, args.wandb_prj_name, args.wandb_job_name
diff --git a/scripts/performance/llm/pretrain_nemotron4_340b.py b/scripts/performance/llm/pretrain_nemotron4_340b.py
index 0f422c5a1ac8..f4434502a7b0 100644
--- a/scripts/performance/llm/pretrain_nemotron4_340b.py
+++ b/scripts/performance/llm/pretrain_nemotron4_340b.py
@@ -77,6 +77,7 @@ def override_recipe_configs(
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
     recipe = set_exp_logging_configs(
         recipe, "pre_train", "llm", "nemotron", args.tensorboard, args.wandb, args.wandb_prj_name, args.wandb_job_name
diff --git a/scripts/performance/utils.py b/scripts/performance/utils.py
index 4e85b53693ab..237dd9316ad7 100644
--- a/scripts/performance/utils.py
+++ b/scripts/performance/utils.py
@@ -252,6 +252,7 @@ def set_primary_perf_configs(
     compute_dtype: str = None,
     fp8_recipe: str = None,
     recompute_modules: Optional[List[str]] = None,
+    nccl_communicator_config_path: str = None,
 ):
     """Set experiment configs we usually tune for performance of all models."""
     # nemo.lightning.Trainer configs
@@ -277,6 +278,8 @@ def set_primary_perf_configs(
     recipe.trainer.strategy.expert_tensor_parallel_size = etp_size
 
     recipe.trainer.strategy.sequence_parallel = bool(tp_size > 1)
+    if nccl_communicator_config_path is not None:
+        recipe.trainer.strategy.nccl_communicator_config_path = nccl_communicator_config_path
 
     # callback configs
     comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks)
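
Usage note (not part of the patch): the sketch below is a minimal, self-contained check of the new CLI option and a reminder of where its value ends up. The add_argument call mirrors the one added in scripts/performance/argument_parser.py; the sample path is a placeholder. The YAML file itself is the per-communicator NCCL config consumed downstream by Megatron-Core (typically keys such as min_ctas, max_ctas, and cga_cluster_size per process group); consult the Megatron-Core documentation for the authoritative schema.

```python
# Standalone sketch of the new --nccl_communicator_config_path option.
# The add_argument call is copied from the diff above; the argv value is a
# placeholder path used only for illustration.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--nccl_communicator_config_path",
    type=str,
    help="Path to NCCL communicator config yaml file",
    required=False,
    default=None,
)

args = parser.parse_args(["--nccl_communicator_config_path", "/path/to/nccl_comm_cfg.yaml"])
assert args.nccl_communicator_config_path == "/path/to/nccl_comm_cfg.yaml"

# Downstream (see the utils.py hunk), set_primary_perf_configs only touches the
# recipe when a path was actually given:
#     if nccl_communicator_config_path is not None:
#         recipe.trainer.strategy.nccl_communicator_config_path = nccl_communicator_config_path
```

When the flag is omitted, the default of None leaves recipe.trainer.strategy untouched, so the existing performance scripts behave exactly as before.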