
Commit 5f17628

scsudhakaran authored and LLMB CI Formatter committed
Update performance scripts to align with llmb requirements
Signed-off-by: Sanju C Sudhakaran <[email protected]>
1 parent 1cf025c · commit 5f17628

File tree (4 files changed: +137 −7 lines)

scripts/performance/argument_parser.py
scripts/performance/perf_plugins.py
scripts/performance/setup_experiment.py
scripts/performance/utils/executors.py


scripts/performance/argument_parser.py

Lines changed: 77 additions & 1 deletion
@@ -261,6 +261,14 @@ def parse_cli_args():
         required=False,
         default=None,
     )
+    parser.add_argument(
+        "--moe_flex_dispatcher_backend",
+        type=str,
+        choices=["deepep", "hybridep"],
+        help="MoE flex dispatcher backend to use. Defaults to None",
+        required=False,
+        default=None,
+    )
     parser.add_argument(
         "--use_megatron_fsdp",
         help="Use Megatron FSDP. Disabled by default.",
@@ -398,6 +406,74 @@ def parse_cli_args():
         action="store_false",
         dest="detach",
     )
-
+    parser.add_argument(
+        "--profiling_start_step", type=int, help="Defines start step for profiling", required=False, default=10
+    )
+    parser.add_argument(
+        "--profiling_stop_step", type=int, help="Defines stop step for profiling", required=False, default=11
+    )
+    parser.add_argument(
+        "-pgm",
+        "--profiling_gpu_metrics",
+        help="Enable nsys gpu metrics. Disabled by default.",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--additional_slurm_params",
+        type=str,
+        help="Additional SLURM parameters as key=value pairs. "
+        "Use semicolons (;) to separate parameters when values contain commas. "
+        "Examples: 'nodelist=node001,node002;constraint=gpu' or 'reservation=my_res;exclusive'",
+        required=False,
+        default=None,
+    )
     args, cli_dotlist_overrides = parser.parse_known_args()
     return args, cli_dotlist_overrides
+
+
+def parse_additional_slurm_params(params_str):
+    """
+    Parse additional SLURM parameters from a string of key=value pairs.
+    This function handles different separator formats:
+    1. Semicolon-separated: "key1=value1;key2=value2" (recommended for multiple parameters)
+    2. Space-separated: "key1=value1 key2=value2"
+    3. Single parameter: "key1=value1,value2" (no separators = single parameter)
+    Args:
+        params_str (str): String with parameters
+    Returns:
+        dict: Dictionary of parameters, or None if params_str is None/empty
+    Example:
+        parse_additional_slurm_params("nodelist=node001,node002")
+        returns {"nodelist": "node001,node002"}
+        parse_additional_slurm_params("nodelist=node001,node002;constraint=gpu")
+        returns {"nodelist": "node001,node002", "constraint": "gpu"}
+        parse_additional_slurm_params("reservation=my_res;constraint=gpu")
+        returns {"reservation": "my_res", "constraint": "gpu"}
+    """
+    if not params_str:
+        return None
+
+    params = {}
+
+    # Try semicolon separation first (most reliable for complex values)
+    if ';' in params_str:
+        parts = params_str.split(';')
+    # Try space separation next
+    elif ' ' in params_str:
+        parts = params_str.split()
+    # No separators found - treat as single parameter
+    else:
+        parts = [params_str]
+
+    for part in parts:
+        part = part.strip()
+        if not part:
+            continue
+
+        if '=' in part:
+            key, value = part.split('=', 1)
+            params[key.strip()] = value.strip()
+        else:
+            # Boolean flag (no value)
+            params[part] = True
+
+    return params if params else None
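For reference, the snippet below is a minimal sketch (not part of the commit) that exercises the new parser with the inputs documented in the docstring and help text above; it assumes it is run from scripts/performance so that argument_parser imports directly:

# Illustration only: checks parse_additional_slurm_params against the documented examples.
from argument_parser import parse_additional_slurm_params

# Commas inside a value stay part of that single parameter.
assert parse_additional_slurm_params("nodelist=node001,node002") == {
    "nodelist": "node001,node002"
}

# Semicolons split multiple parameters, even when a value contains commas.
assert parse_additional_slurm_params("nodelist=node001,node002;constraint=gpu") == {
    "nodelist": "node001,node002",
    "constraint": "gpu",
}

# A bare key (no '=') becomes a boolean flag; empty input yields None.
assert parse_additional_slurm_params("reservation=my_res;exclusive") == {
    "reservation": "my_res",
    "exclusive": True,
}
assert parse_additional_slurm_params("") is None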

scripts/performance/perf_plugins.py

Lines changed: 3 additions & 0 deletions
@@ -107,6 +107,7 @@ class NsysPlugin(Plugin):
     profile_step_end: int
     profile_ranks: Optional[list[int]] = None
     nsys_trace: Optional[list[str]] = None
+    nsys_extra_args: Optional[list[str]] = None
     record_shapes: bool = False
     nsys_gpu_metrics: bool = False
     script_args_converter_fn: Optional[Callable[[NsysPluginScriptArgs], List[str]]] = None
@@ -116,6 +117,7 @@ def setup(self, task: Union["run.Partial", "run.Script"], executor: "run.Executo
         launcher = executor.get_launcher()
         launcher.nsys_profile = True
         launcher.nsys_trace = self.nsys_trace or ["nvtx", "cuda"]
+        launcher.nsys_extra_args = self.nsys_extra_args or launcher.nsys_extra_args

         if isinstance(executor, SlurmExecutor):
             # NOTE: DO NOT change to f-string, `%q{}` is Slurm placeholder
@@ -195,6 +197,7 @@ class PerfEnvPlugin(Plugin):
     pp_size: int = 1
     script_args_converter_fn: Optional[Callable[[PerfEnvPluginScriptArgs], List[str]]] = None
     moe_a2a_overlap: bool = False
+    moe_flex_dispatcher_backend: str
     model_name: str
     model_size: str
     gpu: str
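The new nsys_extra_args field only replaces the launcher's existing extra arguments when the plugin actually supplies a list; otherwise the launcher's value is kept. A standalone sketch of that one-line `or` fallback (the FakeLauncher class and values are hypothetical, used purely for illustration):

# Hypothetical stand-in for the launcher returned by executor.get_launcher().
class FakeLauncher:
    def __init__(self):
        self.nsys_extra_args = ["--force-overwrite=true"]  # pre-existing launcher value

launcher = FakeLauncher()

plugin_extra_args = None  # NsysPlugin.nsys_extra_args left at its default
launcher.nsys_extra_args = plugin_extra_args or launcher.nsys_extra_args
print(launcher.nsys_extra_args)  # ['--force-overwrite=true'] -> launcher value kept

plugin_extra_args = ["--capture-range=cudaProfilerApi"]  # plugin supplies its own list
launcher.nsys_extra_args = plugin_extra_args or launcher.nsys_extra_args
print(launcher.nsys_extra_args)  # ['--capture-range=cudaProfilerApi'] -> plugin value wins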

scripts/performance/setup_experiment.py

Lines changed: 46 additions & 5 deletions
@@ -18,10 +18,10 @@


 try:
-    from argument_parser import parse_cli_args
+    from argument_parser import parse_additional_slurm_params, parse_cli_args
     from utils.executors import slurm_executor
 except (ImportError, ModuleNotFoundError):
-    from .argument_parser import parse_cli_args
+    from .argument_parser import parse_additional_slurm_params, parse_cli_args
     from .utils.executors import slurm_executor

 import nemo_run as run
@@ -50,6 +50,7 @@ def main(
     task: str,
     compute_dtype: str,
     gpu: str,
+    num_gpus: int,
     hf_token: str,
     custom_mounts: List[str],
     detach: bool,
@@ -58,12 +59,20 @@ def main(
     enable_nsys: bool,
     use_tokendrop: bool,
     moe_a2a_overlap: bool,
+    moe_flex_dispatcher_backend: str,
     tp_size: Optional[int],
     pp_size: Optional[int],
     cp_size: Optional[int],
+    vp_size: Optional[int],
+    ep_size: Optional[int],
+    mbs: Optional[int],
+    gbs: Optional[int],
     wandb_key: str,
     wandb_prj_name: str,
     wandb_exp_name: str,
+    profiling_start_step: int,
+    profiling_stop_step: int,
+    profiling_gpu_metrics: bool,
     executor: run.Executor,
 ):
     """Sets up the experiment and runs it."""
@@ -87,6 +96,7 @@ def main(
         PerfEnvPlugin(
             enable_vboost=enable_vboost,
             moe_a2a_overlap=moe_a2a_overlap,
+            moe_flex_dispatcher_backend=moe_flex_dispatcher_backend,
             tp_size=tp_size,
             pp_size=pp_size,
             cp_size=cp_size,
@@ -98,7 +108,20 @@ def main(
         )
     )
     if enable_nsys:
-        plugins.append(NsysPlugin(profile_step_start=10, profile_step_end=11))
+        plugins.append(NsysPlugin(
+            profile_step_start=profiling_start_step,
+            profile_step_end=profiling_stop_step,
+            profile_ranks=list(range(num_gpus)),
+            nsys_gpu_metrics=profiling_gpu_metrics,
+            nsys_trace=['cuda'],
+            nsys_extra_args=[
+                "--force-overwrite=true",
+                "--capture-range=cudaProfilerApi",
+                "--capture-range-end=stop",
+                "--cuda-graph-trace=node",
+                "--cuda-event-trace=false",
+                "--nvtx-domain-include=NCCL",
+            ]))

     executor.container_mounts.extend(
         custom_mounts
@@ -109,9 +132,12 @@ def main(
     )
     logger.info(f"Custom mounts: {executor.container_mounts}")

-    exp_name = f"{model_name}_{model_size}_{domain}_{task}" + (
-        "_bf16" if compute_dtype == "bf16" else f"_{compute_dtype}"
+    exp_name = (
+        f"{task}_{model_name}_{model_size}_{compute_dtype}"
+        f"_gpus{num_gpus}_tp{tp_size}_pp{pp_size}_cp{cp_size}"
+        f"_vp{vp_size}_ep{ep_size}_mbs{mbs}_gbs{gbs}"
     )
+
     logger.debug(
         run.Script(
             path=str(RUN_SCRIPT_PATH),
@@ -146,6 +172,11 @@ def main(
 if __name__ == "__main__":
     args, _ = parse_cli_args()

+    # Parse additional SLURM parameters if provided
+    additional_slurm_params = None
+    if hasattr(args, 'additional_slurm_params') and args.additional_slurm_params:
+        additional_slurm_params = parse_additional_slurm_params(args.additional_slurm_params)
+
     main(
         script_name=SCRIPT_NAME,
         model_name=args.model_name,
@@ -154,6 +185,7 @@ def main(
         task=args.task,
         compute_dtype=args.compute_dtype,
         gpu=args.gpu,
+        num_gpus=args.num_gpus,
         hf_token=args.hf_token,
         custom_mounts=args.custom_mounts,
         detach=args.detach,
@@ -162,12 +194,20 @@ def main(
         enable_nsys=args.enable_nsys,
         use_tokendrop=args.use_tokendrop,
         moe_a2a_overlap=args.moe_a2a_overlap,
+        moe_flex_dispatcher_backend=args.moe_flex_dispatcher_backend,
         tp_size=args.tensor_model_parallel_size,
         pp_size=args.pipeline_model_parallel_size,
         cp_size=args.context_parallel_size,
+        vp_size=args.virtual_pipeline_model_parallel_size,
+        ep_size=args.expert_model_parallel_size,
+        mbs=args.micro_batch_size,
+        gbs=args.global_batch_size,
         wandb_key=args.wandb_key,
         wandb_prj_name=args.wandb_prj_name,
         wandb_exp_name=args.wandb_exp_name,
+        profiling_start_step=args.profiling_start_step,
+        profiling_stop_step=args.profiling_stop_step,
+        profiling_gpu_metrics=args.profiling_gpu_metrics,
         executor=slurm_executor(
             args.gpu,
             args.account,
@@ -181,5 +221,6 @@ def main(
             hf_token=args.hf_token,
             nemo_home=args.nemo_home,
             wandb_key=args.wandb_key,
+            additional_slurm_params=additional_slurm_params,
         ),
     )
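With the reworked exp_name, the run name now encodes GPU count, parallelism sizes, and batch sizes. A small sketch of how the new f-string expands, using made-up values (these are illustrative and not defaults from the scripts):

# Hypothetical values, only to show the layout of the new experiment name.
task, model_name, model_size, compute_dtype = "pretrain", "llama3", "8b", "bf16"
num_gpus, tp_size, pp_size, cp_size = 8, 2, 1, 1
vp_size, ep_size, mbs, gbs = None, 1, 1, 128

exp_name = (
    f"{task}_{model_name}_{model_size}_{compute_dtype}"
    f"_gpus{num_gpus}_tp{tp_size}_pp{pp_size}_cp{cp_size}"
    f"_vp{vp_size}_ep{ep_size}_mbs{mbs}_gbs{gbs}"
)
print(exp_name)
# pretrain_llama3_8b_bf16_gpus8_tp2_pp1_cp1_vpNone_ep1_mbs1_gbs128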

scripts/performance/utils/executors.py

Lines changed: 11 additions & 1 deletion
@@ -15,7 +15,7 @@
 import os
 import sys
 from pathlib import Path
-from typing import Dict, List
+from typing import Any, Dict, List

 import nemo_run as run
 from nemo_run.config import get_nemorun_home
@@ -43,6 +43,7 @@
     "NVTE_NORM_FWD_USE_CUDNN": "1",
     "NVTE_NORM_BWD_USE_CUDNN": "1",
     "TORCH_NCCL_HIGH_PRIORITY": "1",
+    "HF_HUB_OFFLINE": "0",
 }


@@ -63,10 +64,18 @@ def slurm_executor(
     wandb_key: str = None,
     network: str = None,
     custom_bash_cmds: List[str] = None,
+    additional_slurm_params: Dict[str, Any] = None,
 ) -> run.SlurmExecutor:
     """
     Slurm cluster definition with appropriate cluster params and NeMo container params needed for pre-training
     and fine-tuning experiments
+
+    Args:
+        additional_slurm_params: Dict[str, Any], optional
+            Additional SLURM parameters to pass to sbatch. These will be converted to #SBATCH directives.
+            Example: {"nodelist": "node001,node002", "constraint": "gpu"} will generate:
+                #SBATCH --nodelist=node001,node002
+                #SBATCH --constraint=gpu
     """
     custom_bash_cmds = [] if custom_bash_cmds is None else custom_bash_cmds
     err_msgs = []
@@ -135,6 +144,7 @@ def slurm_executor(
         segment=segment,
         network=network,
         launcher=launcher,
+        additional_parameters=additional_slurm_params,
     )

     return executor
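The docstring above describes how the dict passed through additional_parameters is expected to surface as #SBATCH directives. The helper below is a standalone sketch of that mapping for illustration only; to_sbatch_directives does not exist in the repo, and the actual rendering is handled by nemo_run's SlurmExecutor:

# Illustration only: reproduce the dict -> "#SBATCH" mapping described in the docstring.
from typing import Any, Dict, List


def to_sbatch_directives(additional_slurm_params: Dict[str, Any]) -> List[str]:
    """Hypothetical helper: render the parsed dict as #SBATCH lines."""
    directives = []
    for key, value in additional_slurm_params.items():
        if value is True:  # bare flag such as "exclusive"
            directives.append(f"#SBATCH --{key}")
        else:
            directives.append(f"#SBATCH --{key}={value}")
    return directives


print(to_sbatch_directives({"nodelist": "node001,node002", "constraint": "gpu"}))
# ['#SBATCH --nodelist=node001,node002', '#SBATCH --constraint=gpu']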
