@@ -54,7 +54,7 @@ def override_recipe_configs(
5454 """
5555 DeepSeek V3 pre-train recipe aimed at achieving best possible performance.
5656 """
57- recipe = pretrain_recipe (performance_mode = True , use_mtp = False )
57+ recipe = pretrain_recipe (performance_mode = True )
5858
5959 # reset recompute args in the default recipe
6060 if args .recompute_modules is None :
@@ -69,16 +69,16 @@ def override_recipe_configs(
6969 # Token dispatcher configs. On H100 we use the deepEP ("flex") dispatcher.
7070 # On Blackwell, deepEP is not supported yet, so we use the all-to-all
7171 # dispatcher with token drop; once deepEP is supported there, we can use it too.
72- # if args.gpu.lower() in ['h100']:
73- # recipe.model.config.moe_token_dispatcher_type = "flex"
74- # recipe.model.config.moe_enable_deepep = True
75- # recipe.model.config.moe_shared_expert_overlap = False # not supported for deepEP
76- # else:
77- recipe .model .config .moe_token_dispatcher_type = "alltoall"
78- recipe .model .config .moe_enable_deepep = False
79- recipe .model .config .moe_shared_expert_overlap = True
80- if USE_TOKEN_DROP :
81- recipe .trainer .callbacks .append (run .Config (MegatronTokenDropCallback ))
72+ if args .gpu .lower () in ['h100' ]:
73+ recipe .model .config .moe_token_dispatcher_type = "flex"
74+ recipe .model .config .moe_enable_deepep = True
75+ recipe .model .config .moe_shared_expert_overlap = False # not supported for deepEP
76+ else :
77+ recipe .model .config .moe_token_dispatcher_type = "alltoall"
78+ recipe .model .config .moe_enable_deepep = False
79+ recipe .model .config .moe_shared_expert_overlap = True
80+ if USE_TOKEN_DROP :
81+ recipe .trainer .callbacks .append (run .Config (MegatronTokenDropCallback ))
8282
8383 # Performance optimization knobs
8484 recipe .model .config .moe_permute_fusion = True
@@ -159,52 +159,16 @@ def override_recipe_configs(
159159 )
160160 recipe .model .tokenizer = recipe .data .tokenizer
161161
162-
163- if args .run_local :
164- recipe .model .config .num_moe_experts = 16
165- recipe .model .config .num_layers = 3
166- recipe .model .config .moe_layer_freq = [0 ,1 ,1 ]
167-
168-
169- # add the partial cg support
170- USE_PARTIAL_CG = args .partial_cg
171- if USE_PARTIAL_CG :
172- recipe .model .config .external_cuda_graph = True
173- recipe .model .config .cuda_graph_scope = "attn"
174- recipe .trainer .strategy .use_te_rng_tracker = True
175- recipe .model .config .enable_cuda_graph = False
176-
177-
178162 return recipe
179163
180164
181165if __name__ == "__main__" :
182166 args = parse_cli_args ().parse_args ()
183167 args_sanity_check (args )
184168
185- # kwargs = get_user_configs(args.gpu.lower(), "pre_train", "deepseek", "v3", args)
186- # (
187- # num_nodes,
188- # mbs,
189- # gbs,
190- # tp_size,
191- # pp_size,
192- # cp_size,
193- # vp_size,
194- # ep_size,
195- # etp_size,
196- # enable_cuda_graphs,
197- # use_mcore_fsdp,
198- # recompute_layers,
199- # activation_offload_layers,
200- # recompute_modules,
201- # _, # keep_fsdp_fp8_transpose_cache
202- # use_user_buffer_registration,
203- # use_sharp,
204- # ) = kwargs[:17]
205-
206- if args .run_local :
207- ( num_nodes ,
169+ kwargs = get_user_configs (args .gpu .lower (), "pre_train" , "deepseek" , "v3" , args )
170+ (
171+ num_nodes ,
208172 mbs ,
209173 gbs ,
210174 tp_size ,
@@ -220,25 +184,9 @@ def override_recipe_configs(
220184 recompute_modules ,
221185 _ , # keep_fsdp_fp8_transpose_cache
222186 use_user_buffer_registration ,
223- use_sharp ,) = 1 , 1 , 32 , 1 , 1 , 1 , 1 , 8 , 1 , (not args .partial_cg ) , False , 0 , 0 , None , False , False , False
224- else :
225- ( num_nodes ,
226- mbs ,
227- gbs ,
228- tp_size ,
229- pp_size ,
230- cp_size ,
231- vp_size ,
232- ep_size ,
233- etp_size ,
234- enable_cuda_graphs ,
235- use_mcore_fsdp ,
236- recompute_layers ,
237- activation_offload_layers ,
238- recompute_modules ,
239- use_user_buffer_registration ,
240- use_sharp ,) = 1 , 1 , 32 , 1 , 1 , 1 , 1 , 8 , 1 , False , False , 0 , 0 , None , False , False , False
241-
187+ use_sharp ,
188+ ) = kwargs [:17 ]
189+
242190 recipe = override_recipe_configs (
243191 args ,
244192 num_nodes ,
@@ -262,25 +210,22 @@ def override_recipe_configs(
262210 exp_config = f"{ num_nodes } nodes_tp{ tp_size } _pp{ pp_size } _cp{ cp_size } _vp{ vp_size } _ep{ ep_size } _{ mbs } mbs_{ gbs } gbs"
263211 exp_name = f"{ splitext (basename (__file__ ))[0 ]} _{ args .compute_dtype } _{ exp_config } "
264212
265- if not args .run_local :
266- executor = slurm_executor (
267- args .gpu .lower (),
268- args .account ,
269- args .partition ,
270- args .log_dir ,
271- num_nodes ,
213+ executor = slurm_executor (
214+ args .gpu .lower (),
215+ args .account ,
216+ args .partition ,
217+ args .log_dir ,
218+ num_nodes ,
272219 args .gpus_per_node ,
273220 args .time_limit ,
274221 args .container_image ,
275222 custom_mounts = args .custom_mounts ,
276223 custom_env_vars = {},
277224 hf_token = args .hf_token ,
278225 nemo_home = args .nemo_home ,
279- wandb_key = args .wandb_key ,
280- network = 'sharp' if use_sharp else None ,
281- )
282- else :
283- executor = run .LocalExecutor (ntasks_per_node = 8 , launcher = "torchrun" , env_vars = {})
226+ wandb_key = args .wandb_key ,
227+ network = 'sharp' if use_sharp else None ,
228+ )
284229
285230 plugins = [
286231 PerfEnvPlugin (
@@ -295,11 +240,6 @@ def override_recipe_configs(
295240 assert args .memory_profile_out_path is not None
296241 plugins .append (MemoryProfilePlugin (dir = args .memory_profile_out_path ))
297242
298-
299- if args .run_local :
300- run .run (recipe , executor = executor , name = exp_name , plugins = plugins )
301- exit ()
302-
303243 with run .Experiment (exp_name ) as exp :
304244 exp .add (
305245 recipe ,
0 commit comments