Skip to content

Commit 4ac8fa5

Browse files
committed
cleanup
1 parent e08eb78 commit 4ac8fa5

File tree

4 files changed

+32
-102
lines changed

4 files changed

+32
-102
lines changed

nemo/collections/llm/recipes/deepseek_v3.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -122,8 +122,8 @@ def pretrain_recipe(
122122
recipe.log.ckpt.train_time_interval = run.Config(timedelta, minutes=60)
123123

124124
# recompute
125-
recipe.model.config.recompute_granularity = None #"selective"
126-
recipe.model.config.recompute_modules = None #["mla_up_proj", "layernorm"]
125+
recipe.model.config.recompute_granularity = "selective"
126+
recipe.model.config.recompute_modules = ["mla_up_proj", "layernorm"]
127127

128128
# DeepEP
129129
deepep_callback = run.Config(DeepEPCallback)
@@ -135,11 +135,10 @@ def pretrain_recipe(
135135
)
136136
comm_overlap_callback = run.Config(
137137
MegatronCommOverlapCallback,
138-
tp_comm_overlap=True,
139-
tp_comm_bootstrap_backend='nccl',
138+
tp_comm_overlap=False,
140139
)
141140

142-
# recipe.trainer.callbacks.append(deepep_callback)
141+
recipe.trainer.callbacks.append(deepep_callback)
143142
recipe.trainer.callbacks.append(garbage_collection_callback)
144143
recipe.trainer.callbacks.append(comm_overlap_callback)
145144

nemo/lightning/pytorch/strategies/megatron_strategy.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -574,6 +574,7 @@ def setup_distributed(self) -> None:
574574
"""Sets up the distributed environment"""
575575
setup_parallel_ranks(self)
576576

577+
# Capture Cudagraph on a side stream
577578
if self.model.config.external_cuda_graph:
578579
torch.cuda.set_stream(torch.cuda.Stream())
579580

@@ -725,7 +726,7 @@ def training_step(self, dataloader_iter, *args: Any, **kwargs: Any) -> STEP_OUTP
725726
assert self.lightning_module is not None
726727
assert isinstance(self.model, MegatronParallel)
727728

728-
# (TODO) Capture the cuda graph for the first step for now
729+
# (TODO:) Capture the cuda graph for the first step
729730
if self.trainer.global_step == 0 and self.model.config.external_cuda_graph:
730731
# disable prehook
731732
if self.ddp_config.use_distributed_optimizer and self.ddp_config.overlap_param_gather:

scripts/performance/argument_parser.py

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -104,16 +104,6 @@ def parse_cli_args():
104104
help="Enable Nsys profiling. Disabled by default",
105105
action="store_true",
106106
)
107-
parser.add_argument(
108-
"--run_local",
109-
help="Run local. Diabled by default",
110-
action="store_true",
111-
)
112-
parser.add_argument(
113-
"--partial_cg",
114-
help="Run local. Diabled by default",
115-
action="store_true",
116-
)
117107
parser.add_argument(
118108
"-em",
119109
"--enable_memory_profile",

scripts/performance/llm/pretrain_deepseek_v3.py

Lines changed: 26 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ def override_recipe_configs(
5454
"""
5555
DeepSeek V3 pre-train recipe aimed at achieving best possible performance.
5656
"""
57-
recipe = pretrain_recipe(performance_mode=True, use_mtp=False)
57+
recipe = pretrain_recipe(performance_mode=True)
5858

5959
# reset recompute args in the default recipe
6060
if args.recompute_modules is None:
@@ -69,16 +69,16 @@ def override_recipe_configs(
6969
# Token dispatcher configs. For H100 we use deepEP and for Blackwell,
7070
# because deepEP is not supported yet, we use all-to-all dispatcher with
7171
# token drop. After deepEP is supported, we can use deepEP dispatcher.
72-
# if args.gpu.lower() in ['h100']:
73-
# recipe.model.config.moe_token_dispatcher_type = "flex"
74-
# recipe.model.config.moe_enable_deepep = True
75-
# recipe.model.config.moe_shared_expert_overlap = False # not supported for deepEP
76-
# else:
77-
recipe.model.config.moe_token_dispatcher_type = "alltoall"
78-
recipe.model.config.moe_enable_deepep = False
79-
recipe.model.config.moe_shared_expert_overlap = True
80-
if USE_TOKEN_DROP:
81-
recipe.trainer.callbacks.append(run.Config(MegatronTokenDropCallback))
72+
if args.gpu.lower() in ['h100']:
73+
recipe.model.config.moe_token_dispatcher_type = "flex"
74+
recipe.model.config.moe_enable_deepep = True
75+
recipe.model.config.moe_shared_expert_overlap = False # not supported for deepEP
76+
else:
77+
recipe.model.config.moe_token_dispatcher_type = "alltoall"
78+
recipe.model.config.moe_enable_deepep = False
79+
recipe.model.config.moe_shared_expert_overlap = True
80+
if USE_TOKEN_DROP:
81+
recipe.trainer.callbacks.append(run.Config(MegatronTokenDropCallback))
8282

8383
# Performance optimization knobs
8484
recipe.model.config.moe_permute_fusion = True
@@ -159,52 +159,16 @@ def override_recipe_configs(
159159
)
160160
recipe.model.tokenizer = recipe.data.tokenizer
161161

162-
163-
if args.run_local:
164-
recipe.model.config.num_moe_experts = 16
165-
recipe.model.config.num_layers=3
166-
recipe.model.config.moe_layer_freq=[0,1,1]
167-
168-
169-
# add the partial cg support
170-
USE_PARTIAL_CG = args.partial_cg
171-
if USE_PARTIAL_CG:
172-
recipe.model.config.external_cuda_graph = True
173-
recipe.model.config.cuda_graph_scope = "attn"
174-
recipe.trainer.strategy.use_te_rng_tracker = True
175-
recipe.model.config.enable_cuda_graph = False
176-
177-
178162
return recipe
179163

180164

181165
if __name__ == "__main__":
182166
args = parse_cli_args().parse_args()
183167
args_sanity_check(args)
184168

185-
# kwargs = get_user_configs(args.gpu.lower(), "pre_train", "deepseek", "v3", args)
186-
# (
187-
# num_nodes,
188-
# mbs,
189-
# gbs,
190-
# tp_size,
191-
# pp_size,
192-
# cp_size,
193-
# vp_size,
194-
# ep_size,
195-
# etp_size,
196-
# enable_cuda_graphs,
197-
# use_mcore_fsdp,
198-
# recompute_layers,
199-
# activation_offload_layers,
200-
# recompute_modules,
201-
# _, # keep_fsdp_fp8_transpose_cache
202-
# use_user_buffer_registration,
203-
# use_sharp,
204-
# ) = kwargs[:17]
205-
206-
if args.run_local:
207-
( num_nodes,
169+
kwargs = get_user_configs(args.gpu.lower(), "pre_train", "deepseek", "v3", args)
170+
(
171+
num_nodes,
208172
mbs,
209173
gbs,
210174
tp_size,
@@ -220,25 +184,9 @@ def override_recipe_configs(
220184
recompute_modules,
221185
_, # keep_fsdp_fp8_transpose_cache
222186
use_user_buffer_registration,
223-
use_sharp,) = 1, 1, 32, 1, 1, 1, 1, 8, 1, (not args.partial_cg) , False, 0, 0, None, False, False, False
224-
else:
225-
( num_nodes,
226-
mbs,
227-
gbs,
228-
tp_size,
229-
pp_size,
230-
cp_size,
231-
vp_size,
232-
ep_size,
233-
etp_size,
234-
enable_cuda_graphs,
235-
use_mcore_fsdp,
236-
recompute_layers,
237-
activation_offload_layers,
238-
recompute_modules,
239-
use_user_buffer_registration,
240-
use_sharp,) = 1, 1, 32, 1, 1, 1, 1, 8, 1, False, False, 0, 0, None, False, False, False
241-
187+
use_sharp,
188+
) = kwargs[:17]
189+
242190
recipe = override_recipe_configs(
243191
args,
244192
num_nodes,
@@ -262,25 +210,22 @@ def override_recipe_configs(
262210
exp_config = f"{num_nodes}nodes_tp{tp_size}_pp{pp_size}_cp{cp_size}_vp{vp_size}_ep{ep_size}_{mbs}mbs_{gbs}gbs"
263211
exp_name = f"{splitext(basename(__file__))[0]}_{args.compute_dtype}_{exp_config}"
264212

265-
if not args.run_local:
266-
executor = slurm_executor(
267-
args.gpu.lower(),
268-
args.account,
269-
args.partition,
270-
args.log_dir,
271-
num_nodes,
213+
executor = slurm_executor(
214+
args.gpu.lower(),
215+
args.account,
216+
args.partition,
217+
args.log_dir,
218+
num_nodes,
272219
args.gpus_per_node,
273220
args.time_limit,
274221
args.container_image,
275222
custom_mounts=args.custom_mounts,
276223
custom_env_vars={},
277224
hf_token=args.hf_token,
278225
nemo_home=args.nemo_home,
279-
wandb_key=args.wandb_key,
280-
network='sharp' if use_sharp else None,
281-
)
282-
else:
283-
executor = run.LocalExecutor(ntasks_per_node=8, launcher="torchrun", env_vars={})
226+
wandb_key=args.wandb_key,
227+
network='sharp' if use_sharp else None,
228+
)
284229

285230
plugins = [
286231
PerfEnvPlugin(
@@ -295,11 +240,6 @@ def override_recipe_configs(
295240
assert args.memory_profile_out_path is not None
296241
plugins.append(MemoryProfilePlugin(dir=args.memory_profile_out_path))
297242

298-
299-
if args.run_local:
300-
run.run(recipe, executor=executor, name=exp_name, plugins=plugins)
301-
exit()
302-
303243
with run.Experiment(exp_name) as exp:
304244
exp.add(
305245
recipe,

0 commit comments

Comments
 (0)