
Commit 99e5dac

authored by Guyue Huang
Fix callbacks in DSV3 script (#14350)
* Fix h100 and gb200 dsv3 script
  Signed-off-by: Guyue Huang <[email protected]>

* Apply isort and black reformatting
  Signed-off-by: guyueh1 <[email protected]>

* Add moe to recompute module for deepseek v3 h100
  Signed-off-by: Guyue Huang <[email protected]>

* Revert a previous change to remove DeepEP callback
  Signed-off-by: Guyue Huang <[email protected]>

---------

Signed-off-by: Guyue Huang <[email protected]>
Signed-off-by: guyueh1 <[email protected]>
Co-authored-by: guyueh1 <[email protected]>
1 parent 29c230b commit 99e5dac

File tree

3 files changed: +6 −3 lines

- scripts/performance/executors.py
- scripts/performance/llm/pretrain_deepseek_v3.py
- scripts/performance/recommended_model_configs/model_configs_h100.csv

scripts/performance/executors.py

Lines changed: 1 addition & 1 deletion

@@ -71,7 +71,7 @@ def slurm_executor(
     custom_bash_cmds = [] if custom_bash_cmds is None else custom_bash_cmds
     err_msgs = []
     mounts = []
-    srun_args = custom_srun_args.copy() + ["--mpi=pmix"]
+    srun_args = custom_srun_args.copy() + ["--mpi=pmix", "--no-container-mount-home"]
 
     if log_dir != get_nemorun_home():
         err_msgs.append(f"\nRun `export NEMORUN_HOME={log_dir}` in your shell environment and rerun this script.")
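For context, --no-container-mount-home is the Slurm pyxis flag that stops the user's home directory from being bind-mounted into the job container, so host dotfiles and caches cannot leak into benchmark runs. A minimal sketch of how the argument list is assembled (the assignment mirrors the diff above; the example value of custom_srun_args is an assumption):

# Sketch: assembling srun arguments as in slurm_executor above.
# The example value of custom_srun_args is hypothetical.
custom_srun_args = ["--overlap"]

# Copy first so the caller's list is not mutated in place.
# "--mpi=pmix" selects the PMIx MPI integration; "--no-container-mount-home"
# (a pyxis/enroot flag) disables bind-mounting $HOME into the container.
srun_args = custom_srun_args.copy() + ["--mpi=pmix", "--no-container-mount-home"]
print(srun_args)  # ['--overlap', '--mpi=pmix', '--no-container-mount-home']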

scripts/performance/llm/pretrain_deepseek_v3.py

Lines changed: 3 additions & 0 deletions

@@ -19,6 +19,7 @@
 from nemo.collections.llm.recipes.deepseek_v3 import pretrain_recipe
 from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
+from nemo.lightning.pytorch.callbacks.deepep import DeepEPCallback
 from nemo.lightning.pytorch.callbacks.megatron_enable_experimental_callback import MegatronEnableExperimentalCallback
 from nemo.lightning.pytorch.callbacks.moe_token_drop import MegatronTokenDropCallback
 from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin

@@ -73,6 +74,8 @@ def override_recipe_configs(
         recipe.model.config.moe_token_dispatcher_type = "flex"
         recipe.model.config.moe_enable_deepep = True
         recipe.model.config.moe_shared_expert_overlap = False  # not supported for deepEP
+        # use force load balance for reducing variance in benchmarking
+        recipe.model.config.moe_router_force_load_balancing = True
     else:
         recipe.model.config.moe_token_dispatcher_type = "alltoall"
         recipe.model.config.moe_enable_deepep = False
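Taken together, the two hunks re-import DeepEPCallback and extend the DeepEP branch of the config override. A hedged reconstruction of that branch as a standalone helper (the function name and the use_deepep parameter are assumptions; only the assignments are from the diff):

def apply_moe_dispatcher_overrides(recipe, use_deepep: bool) -> None:
    """Hypothetical helper mirroring the if/else block changed above."""
    if use_deepep:
        # DeepEP routing requires the "flex" token dispatcher.
        recipe.model.config.moe_token_dispatcher_type = "flex"
        recipe.model.config.moe_enable_deepep = True
        # Shared-expert overlap is not supported together with DeepEP.
        recipe.model.config.moe_shared_expert_overlap = False
        # Force router load balancing to reduce run-to-run variance
        # when benchmarking (the line added by this commit).
        recipe.model.config.moe_router_force_load_balancing = True
    else:
        # Fall back to the standard all-to-all dispatcher.
        recipe.model.config.moe_token_dispatcher_type = "alltoall"
        recipe.model.config.moe_enable_deepep = False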

scripts/performance/recommended_model_configs/model_configs_h100.csv

Lines changed: 2 additions & 2 deletions

@@ -30,8 +30,8 @@ pre_train,nemotronh,8b,h100,fp8,8,8192,1,1,1,1,1,1,128,,1,0,0,0,,,,
 pre_train,nemotronh,47b,h100,fp8,64,8192,8,1,1,1,1,1,192,,1,0,0,0,,,,
 pre_train,nemotronh,56b,h100,fp8,64,8192,8,1,1,1,1,1,192,,1,0,0,0,,,,
 pre_train,nemotronh,56b,h100,fp8,256,8192,8,1,1,1,1,1,768,,1,0,0,0,,,,
-pre_train,deepseek,v3,h100,bf16,1024.0,4096.0,2.0,16.0,1.0,64.0,1.0,1.0,8192.0,1.0,0,0,0,0,mla_up_proj/mlp,,,
-pre_train,deepseek,v3,h100,fp8,1024.0,4096.0,2.0,16.0,1.0,64.0,1.0,1.0,8192.0,1.0,0,0,0,0,mla_up_proj/mlp,,,
+pre_train,deepseek,v3,h100,bf16,1024.0,4096.0,2.0,16.0,1.0,64.0,1.0,1.0,8192.0,1.0,0,0,0,0,mla_up_proj/mlp/moe,,,
+pre_train,deepseek,v3,h100,fp8,1024.0,4096.0,2.0,16.0,1.0,64.0,1.0,1.0,8192.0,1.0,0,0,0,0,mla_up_proj/mlp/moe,,,
 pre_train,llama4,e16,h100,bf16,256.0,8192.0,4.0,1.0,1.0,16.0,1.0,1.0,1024.0,4.0,1,0,0,0,,,,
 pre_train,llama4,e128,h100,bf16,512.0,8192.0,4.0,1.0,1.0,128.0,1.0,1.0,1024.0,4.0,1,0,0,0,,,,
 pre_train,vlm_llama4,e16,h100,bf16,256.0,8192.0,4.0,1.0,1.0,16.0,1.0,1.0,1024.0,4.0,0,0,0,0,,,,
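The mla_up_proj/mlp/moe field appears to be a slash-separated list of modules selected for activation recomputation; adding moe means MoE layers are also recomputed on H100, trading extra compute for activation memory. A minimal sketch of parsing that field (the column index and the consuming code are assumptions, not taken from the repository):

import csv
from io import StringIO

# One deepseek v3 row from the diff above, verbatim.
row_text = (
    "pre_train,deepseek,v3,h100,bf16,1024.0,4096.0,2.0,16.0,1.0,64.0,"
    "1.0,1.0,8192.0,1.0,0,0,0,0,mla_up_proj/mlp/moe,,,"
)

row = next(csv.reader(StringIO(row_text)))
recompute_field = row[19]  # position of the recompute-modules column (assumption)

# Slash-separated module names; the commit appends "moe" for H100 runs.
recompute_modules = recompute_field.split("/") if recompute_field else []
print(recompute_modules)  # ['mla_up_proj', 'mlp', 'moe']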
