
Commit 99e5dac

authored by Guyue Huang
Fix callbacks in DSV3 script (#14350)
* Fix h100 and gb200 dsv3 script
  Signed-off-by: Guyue Huang <[email protected]>

* Apply isort and black reformatting
  Signed-off-by: guyueh1 <[email protected]>

* Add moe to recompute module for deepseek v3 h100
  Signed-off-by: Guyue Huang <[email protected]>

* Revert a previous change to remove DeepEP callback
  Signed-off-by: Guyue Huang <[email protected]>

---------

Signed-off-by: Guyue Huang <[email protected]>
Signed-off-by: guyueh1 <[email protected]>
Co-authored-by: guyueh1 <[email protected]>
1 parent 29c230b commit 99e5dac

File tree

3 files changed: +6 −3 lines

- scripts/performance/executors.py
- scripts/performance/llm/pretrain_deepseek_v3.py
- scripts/performance/recommended_model_configs/model_configs_h100.csv

scripts/performance/executors.py

Lines changed: 1 addition & 1 deletion

@@ -71,7 +71,7 @@ def slurm_executor(
     custom_bash_cmds = [] if custom_bash_cmds is None else custom_bash_cmds
     err_msgs = []
     mounts = []
-    srun_args = custom_srun_args.copy() + ["--mpi=pmix"]
+    srun_args = custom_srun_args.copy() + ["--mpi=pmix", "--no-container-mount-home"]
 
     if log_dir != get_nemorun_home():
         err_msgs.append(f"\nRun `export NEMORUN_HOME={log_dir}` in your shell environment and rerun this script.")
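For context, --no-container-mount-home is the Slurm pyxis flag that stops the user's home directory from being bind-mounted into the job container, so host dotfiles and caches cannot leak into benchmark runs. A minimal sketch of how the argument list is assembled (the assignment mirrors the diff above; the example value of custom_srun_args is an assumption):

# Sketch: assembling srun arguments as in slurm_executor above.
# The example value of custom_srun_args is hypothetical.
custom_srun_args = ["--overlap"]

# Copy first so the caller's list is not mutated in place.
# "--mpi=pmix" selects the PMIx MPI integration; "--no-container-mount-home"
# (a pyxis/enroot flag) disables bind-mounting $HOME into the container.
srun_args = custom_srun_args.copy() + ["--mpi=pmix", "--no-container-mount-home"]
print(srun_args)  # ['--overlap', '--mpi=pmix', '--no-container-mount-home']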

scripts/performance/llm/pretrain_deepseek_v3.py

Lines changed: 3 additions & 0 deletions

@@ -19,6 +19,7 @@
 from nemo.collections.llm.recipes.deepseek_v3 import pretrain_recipe
 from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
+from nemo.lightning.pytorch.callbacks.deepep import DeepEPCallback
 from nemo.lightning.pytorch.callbacks.megatron_enable_experimental_callback import MegatronEnableExperimentalCallback
 from nemo.lightning.pytorch.callbacks.moe_token_drop import MegatronTokenDropCallback
 from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin

@@ -73,6 +74,8 @@ def override_recipe_configs(
         recipe.model.config.moe_token_dispatcher_type = "flex"
         recipe.model.config.moe_enable_deepep = True
         recipe.model.config.moe_shared_expert_overlap = False  # not supported for deepEP
+        # use force load balance for reducing variance in benchmarking
+        recipe.model.config.moe_router_force_load_balancing = True
     else:
         recipe.model.config.moe_token_dispatcher_type = "alltoall"
         recipe.model.config.moe_enable_deepep = False
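Taken together, the two hunks re-import DeepEPCallback and extend the DeepEP branch of the config override. A hedged reconstruction of that branch as a standalone helper (the function name and the use_deepep parameter are assumptions; only the assignments are from the diff):

def apply_moe_dispatcher_overrides(recipe, use_deepep: bool) -> None:
    """Hypothetical helper mirroring the if/else block changed above."""
    if use_deepep:
        # DeepEP routing requires the "flex" token dispatcher.
        recipe.model.config.moe_token_dispatcher_type = "flex"
        recipe.model.config.moe_enable_deepep = True
        # Shared-expert overlap is not supported together with DeepEP.
        recipe.model.config.moe_shared_expert_overlap = False
        # Force router load balancing to reduce run-to-run variance
        # when benchmarking (the line added by this commit).
        recipe.model.config.moe_router_force_load_balancing = True
    else:
        # Fall back to the standard all-to-all dispatcher.
        recipe.model.config.moe_token_dispatcher_type = "alltoall"
        recipe.model.config.moe_enable_deepep = False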

scripts/performance/recommended_model_configs/model_configs_h100.csv

Lines changed: 2 additions & 2 deletions

@@ -30,8 +30,8 @@ pre_train,nemotronh,8b,h100,fp8,8,8192,1,1,1,1,1,1,128,,1,0,0,0,,,,
 pre_train,nemotronh,47b,h100,fp8,64,8192,8,1,1,1,1,1,192,,1,0,0,0,,,,
 pre_train,nemotronh,56b,h100,fp8,64,8192,8,1,1,1,1,1,192,,1,0,0,0,,,,
 pre_train,nemotronh,56b,h100,fp8,256,8192,8,1,1,1,1,1,768,,1,0,0,0,,,,
-pre_train,deepseek,v3,h100,bf16,1024.0,4096.0,2.0,16.0,1.0,64.0,1.0,1.0,8192.0,1.0,0,0,0,0,mla_up_proj/mlp,,,
-pre_train,deepseek,v3,h100,fp8,1024.0,4096.0,2.0,16.0,1.0,64.0,1.0,1.0,8192.0,1.0,0,0,0,0,mla_up_proj/mlp,,,
+pre_train,deepseek,v3,h100,bf16,1024.0,4096.0,2.0,16.0,1.0,64.0,1.0,1.0,8192.0,1.0,0,0,0,0,mla_up_proj/mlp/moe,,,
+pre_train,deepseek,v3,h100,fp8,1024.0,4096.0,2.0,16.0,1.0,64.0,1.0,1.0,8192.0,1.0,0,0,0,0,mla_up_proj/mlp/moe,,,
 pre_train,llama4,e16,h100,bf16,256.0,8192.0,4.0,1.0,1.0,16.0,1.0,1.0,1024.0,4.0,1,0,0,0,,,,
 pre_train,llama4,e128,h100,bf16,512.0,8192.0,4.0,1.0,1.0,128.0,1.0,1.0,1024.0,4.0,1,0,0,0,,,,
 pre_train,vlm_llama4,e16,h100,bf16,256.0,8192.0,4.0,1.0,1.0,16.0,1.0,1.0,1024.0,4.0,0,0,0,0,,,,
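The mla_up_proj/mlp/moe field appears to be a slash-separated list of modules selected for activation recomputation; adding moe means MoE layers are also recomputed on H100, trading extra compute for activation memory. A minimal sketch of parsing that field (the column index and the consuming code are assumptions, not taken from the repository):

import csv
from io import StringIO

# One deepseek v3 row from the diff above, verbatim.
row_text = (
    "pre_train,deepseek,v3,h100,bf16,1024.0,4096.0,2.0,16.0,1.0,64.0,"
    "1.0,1.0,8192.0,1.0,0,0,0,0,mla_up_proj/mlp/moe,,,"
)

row = next(csv.reader(StringIO(row_text)))
recompute_field = row[19]  # position of the recompute-modules column (assumption)

# Slash-separated module names; the commit appends "moe" for H100 runs.
recompute_modules = recompute_field.split("/") if recompute_field else []
print(recompute_modules)  # ['mla_up_proj', 'mlp', 'moe']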
