NVIDIA-NeMo · chtruong814 · Aug 4, 2025 · Jul 31, 2025
diff --git a/scripts/performance/executors.py b/scripts/performance/executors.py
@@ -71,7 +71,7 @@ def slurm_executor(
     custom_bash_cmds = [] if custom_bash_cmds is None else custom_bash_cmds
     err_msgs = []
     mounts = []
-    srun_args = custom_srun_args.copy() + ["--mpi=pmix"]
+    srun_args = custom_srun_args.copy() + ["--mpi=pmix", "--no-container-mount-home"]
 
     if log_dir != get_nemorun_home():
         err_msgs.append(f"\nRun `export NEMORUN_HOME={log_dir}` in your shell environment and rerun this script.")

diff --git a/scripts/performance/llm/pretrain_deepseek_v3.py b/scripts/performance/llm/pretrain_deepseek_v3.py
@@ -19,6 +19,7 @@
 
 from nemo.collections.llm.recipes.deepseek_v3 import pretrain_recipe
 from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
+from nemo.lightning.pytorch.callbacks.deepep import DeepEPCallback
 from nemo.lightning.pytorch.callbacks.megatron_enable_experimental_callback import MegatronEnableExperimentalCallback
 from nemo.lightning.pytorch.callbacks.moe_token_drop import MegatronTokenDropCallback
 from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin
@@ -73,6 +74,8 @@ def override_recipe_configs(
         recipe.model.config.moe_token_dispatcher_type = "flex"
         recipe.model.config.moe_enable_deepep = True
         recipe.model.config.moe_shared_expert_overlap = False  # not supported for deepEP
+        # force load balancing to reduce variance in benchmarking
+        recipe.model.config.moe_router_force_load_balancing = True
     else:
         recipe.model.config.moe_token_dispatcher_type = "alltoall"
         recipe.model.config.moe_enable_deepep = False

diff --git a/scripts/performance/recommended_model_configs/model_configs_h100.csv b/scripts/performance/recommended_model_configs/model_configs_h100.csv
@@ -30,8 +30,8 @@ pre_train,nemotronh,8b,h100,fp8,8,8192,1,1,1,1,1,1,128,,1,0,0,0,,,,
 pre_train,nemotronh,47b,h100,fp8,64,8192,8,1,1,1,1,1,192,,1,0,0,0,,,,
 pre_train,nemotronh,56b,h100,fp8,64,8192,8,1,1,1,1,1,192,,1,0,0,0,,,,
 pre_train,nemotronh,56b,h100,fp8,256,8192,8,1,1,1,1,1,768,,1,0,0,0,,,,
-pre_train,deepseek,v3,h100,bf16,1024.0,4096.0,2.0,16.0,1.0,64.0,1.0,1.0,8192.0,1.0,0,0,0,0,mla_up_proj/mlp,,,
-pre_train,deepseek,v3,h100,fp8,1024.0,4096.0,2.0,16.0,1.0,64.0,1.0,1.0,8192.0,1.0,0,0,0,0,mla_up_proj/mlp,,,
+pre_train,deepseek,v3,h100,bf16,1024.0,4096.0,2.0,16.0,1.0,64.0,1.0,1.0,8192.0,1.0,0,0,0,0,mla_up_proj/mlp/moe,,,
+pre_train,deepseek,v3,h100,fp8,1024.0,4096.0,2.0,16.0,1.0,64.0,1.0,1.0,8192.0,1.0,0,0,0,0,mla_up_proj/mlp/moe,,,
 pre_train,llama4,e16,h100,bf16,256.0,8192.0,4.0,1.0,1.0,16.0,1.0,1.0,1024.0,4.0,1,0,0,0,,,,
 pre_train,llama4,e128,h100,bf16,512.0,8192.0,4.0,1.0,1.0,128.0,1.0,1.0,1024.0,4.0,1,0,0,0,,,,
 pre_train,vlm_llama4,e16,h100,bf16,256.0,8192.0,4.0,1.0,1.0,16.0,1.0,1.0,1024.0,4.0,0,0,0,0,,,,