8 changes: 8 additions & 0 deletions scripts/performance/argument_parser.py
@@ -300,6 +300,14 @@ def bool_arg(arg):
         required=False,
         default=None,
     )
+    parser.add_argument(
+        "-fsdp_db",
+        "--use_fsdp_double_buffer",
+        help="Enable FSDP double buffer. Disabled by default",
+        type=bool_arg,
+        required=False,
+        default=None,
+    )
     parser.add_argument(
         "-ubr",
         "--use_user_buffer_registration",
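Note: the new option reuses this file's existing bool_arg converter (named in the hunk header above), so the flag takes an explicit true/false value rather than acting as a store-true switch, and default=None lets downstream code tell "flag not given" apart from an explicit false. The converter's body is outside this diff; a plausible sketch, for orientation only (the real implementation may differ):

import argparse

def bool_arg(arg):
    # Hypothetical reconstruction of the converter referenced above.
    # Maps common truthy/falsy spellings to a bool and rejects anything else.
    value = str(arg).strip().lower()
    if value in ("1", "true", "t", "yes", "y"):
        return True
    if value in ("0", "false", "f", "no", "n"):
        return False
    raise argparse.ArgumentTypeError(f"invalid boolean value: {arg!r}")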
3 changes: 3 additions & 0 deletions scripts/performance/diffusion/pretrain_flux_12b.py
@@ -59,6 +59,9 @@ def override_recipe_configs(
         enable_cuda_graphs=enable_cuda_graphs,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
     )
     recipe = set_exp_logging_configs(
         recipe,
43 changes: 29 additions & 14 deletions scripts/performance/helpers.py
@@ -95,6 +95,13 @@ def get_user_configs(gpu: str, task: str, model_name: str, model_size: str, args
     use_mcore_fsdp = config.get("use_mcore_fsdp") if args.use_mcore_fsdp is None else args.use_mcore_fsdp
     use_mcore_fsdp = False if use_mcore_fsdp is None else bool(int(use_mcore_fsdp))
 
+    use_fsdp_double_buffer = (
+        config.get("use_fsdp_double_buffer") if args.use_fsdp_double_buffer is None else args.use_fsdp_double_buffer
+    )
+    use_fsdp_double_buffer = False if use_fsdp_double_buffer is None else bool(int(use_fsdp_double_buffer))
+    if use_fsdp_double_buffer:
+        assert use_mcore_fsdp, "use_fsdp_double_buffer requires use_mcore_fsdp to be True"
+
     recompute_layers = config.get("recompute_layers") if args.recompute_layers is None else args.recompute_layers
     recompute_layers = 0 if recompute_layers is None else int(recompute_layers)
     activation_offload_layers = (
@@ -130,6 +137,7 @@ def get_user_configs(gpu: str, task: str, model_name: str, model_size: str, args
     logging.info(f"{etp_size=}")
     logging.info(f"{enable_cuda_graphs=}")
     logging.info(f"{use_mcore_fsdp=}")
+    logging.info(f"{use_fsdp_double_buffer=}")
     logging.info(f"{recompute_layers=}")
     logging.info(f"{activation_offload_layers=}")
     logging.info(f"{recompute_modules=}")
@@ -240,12 +248,16 @@ def set_perf_optimization_configs(
     recompute_layers: int,
     activation_offload_layers: int,
     recompute_modules: Optional[List[str]],
-    use_sharp: bool,
-    use_user_buffer_registration: bool,
+    use_fsdp_double_buffer: Optional[bool],
+    use_user_buffer_registration: Optional[bool],
+    use_sharp: Optional[bool],
 ):
     # enable cross entropy fusion with TE kernel
     recipe.model.config.cross_entropy_fusion_impl = "te"
 
+    if use_fsdp_double_buffer:
+        assert use_mcore_fsdp, "use_fsdp_double_buffer requires use_mcore_fsdp to be True"
+
     if use_mcore_fsdp and enable_cuda_graphs:
         logging.warning("Currently, cuda graphs are not supported with FSDP. Disabling cuda graphs.")
         enable_cuda_graphs = False
@@ -269,6 +281,7 @@ def set_perf_optimization_configs(
     recipe.trainer.strategy.ddp.check_for_nan_in_grad = False
     recipe.trainer.strategy.ddp.check_for_large_grads = False
     recipe.trainer.strategy.ddp.nccl_ub = bool(use_user_buffer_registration)
+    recipe.trainer.strategy.ddp.fsdp_double_buffer = bool(use_fsdp_double_buffer)
 
     return recipe
 
@@ -289,6 +302,7 @@ def set_primary_perf_configs(
     etp_size: Optional[int] = None,
     enable_cuda_graphs: bool = False,
     use_mcore_fsdp: bool = False,
+    use_fsdp_double_buffer: bool = False,
     use_user_buffer_registration: bool = False,
     use_sharp: bool = False,
     recompute_layers: int = 0,
@@ -334,18 +348,19 @@
     )
 
     recipe = set_perf_optimization_configs(
-        recipe,
-        use_mcore_fsdp,
-        enable_cuda_graphs,
-        task,
-        tp_size,
-        compute_dtype,
-        fp8_recipe,
-        recompute_layers,
-        activation_offload_layers,
-        recompute_modules,
-        use_sharp,
-        use_user_buffer_registration,
+        recipe=recipe,
+        use_mcore_fsdp=use_mcore_fsdp,
+        enable_cuda_graphs=enable_cuda_graphs,
+        task=task,
+        tp_size=tp_size,
+        compute_dtype=compute_dtype,
+        fp8_recipe=fp8_recipe,
+        recompute_layers=recompute_layers,
+        activation_offload_layers=activation_offload_layers,
+        recompute_modules=recompute_modules,
+        use_fsdp_double_buffer=use_fsdp_double_buffer,
+        use_user_buffer_registration=use_user_buffer_registration,
+        use_sharp=use_sharp,
     )
 
     return recipe
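Note: the two new resolution blocks in get_user_configs follow the file's existing precedence rule: an explicit command-line value wins, otherwise the per-model config entry is used, and a value that is still unset resolves to False, so the double buffer stays off unless asked for. A minimal sketch of that rule (resolve_flag is a name introduced here for illustration):

from typing import Optional

def resolve_flag(cli_value: Optional[bool], config_value) -> bool:
    # CLI beats config; unset everywhere means the feature is off.
    value = config_value if cli_value is None else cli_value
    return False if value is None else bool(int(value))

assert resolve_flag(None, None) is False   # flag never mentioned anywhere
assert resolve_flag(True, None) is True    # enabled on the command line
assert resolve_flag(None, "1") is True     # enabled in the model's config
assert resolve_flag(False, "1") is False   # explicit CLI false overrides config

The same None-is-off convention carries into set_perf_optimization_configs: bool(use_fsdp_double_buffer) coerces a None left at the parameter default to False before it reaches recipe.trainer.strategy.ddp.fsdp_double_buffer.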
2 changes: 2 additions & 0 deletions scripts/performance/llm/finetune_deepseek_v3.py
@@ -83,6 +83,8 @@ def override_recipe_configs(
         enable_cuda_graphs=enable_cuda_graphs,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         nccl_communicator_config_path=args.nccl_communicator_config_path,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
3 changes: 2 additions & 1 deletion scripts/performance/llm/finetune_llama31_405b.py
@@ -85,12 +85,13 @@ def override_recipe_configs(
         ep_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
         recompute_layers=recompute_layers,
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
         nccl_communicator_config_path=args.nccl_communicator_config_path,
-        use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
     )
     recipe = set_exp_logging_configs(
3 changes: 2 additions & 1 deletion scripts/performance/llm/finetune_llama3_70b.py
@@ -92,12 +92,13 @@ def override_recipe_configs(
         ep_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
         recompute_layers=recompute_layers,
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
         nccl_communicator_config_path=args.nccl_communicator_config_path,
-        use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
     )
     recipe = set_exp_logging_configs(
4 changes: 3 additions & 1 deletion scripts/performance/llm/finetune_llama3_8b.py
@@ -78,8 +78,10 @@ def override_recipe_configs(
         enable_cuda_graphs=enable_cuda_graphs,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
-        nccl_communicator_config_path=args.nccl_communicator_config_path,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
         use_sharp=args.use_sharp,
     )
     recipe = set_exp_logging_configs(
2 changes: 2 additions & 0 deletions scripts/performance/llm/finetune_llama4_e128.py
@@ -86,6 +86,8 @@ def override_recipe_configs(
         etp_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
         recompute_layers=recompute_layers,
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
1 change: 1 addition & 0 deletions scripts/performance/llm/pretrain_deepseek_v3.py
@@ -86,6 +86,7 @@ def override_recipe_configs(
         etp_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         recompute_layers=recompute_layers,
1 change: 1 addition & 0 deletions scripts/performance/llm/pretrain_gpt3_175b.py
@@ -70,6 +70,7 @@ def override_recipe_configs(
         ep_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         recompute_layers=recompute_layers,
1 change: 1 addition & 0 deletions scripts/performance/llm/pretrain_llama31_405b.py
@@ -70,6 +70,7 @@ def override_recipe_configs(
         ep_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         recompute_layers=recompute_layers,
1 change: 1 addition & 0 deletions scripts/performance/llm/pretrain_llama3_70b.py
@@ -71,6 +71,7 @@ def override_recipe_configs(
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
         use_user_buffer_registration=args.use_user_buffer_registration,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_sharp=args.use_sharp,
         recompute_layers=recompute_layers,
         activation_offload_layers=activation_offload_layers,
2 changes: 2 additions & 0 deletions scripts/performance/llm/pretrain_llama3_8b.py
@@ -62,6 +62,8 @@ def override_recipe_configs(
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
         nccl_communicator_config_path=args.nccl_communicator_config_path,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
     )
2 changes: 2 additions & 0 deletions scripts/performance/llm/pretrain_llama4_e128.py
@@ -62,6 +62,8 @@ def override_recipe_configs(
         ep_size,
         etp_size,
         enable_cuda_graphs=enable_cuda_graphs,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         compute_dtype=args.compute_dtype,
2 changes: 2 additions & 0 deletions scripts/performance/llm/pretrain_llama4_e16.py
@@ -62,6 +62,8 @@ def override_recipe_configs(
         ep_size,
         etp_size,
         enable_cuda_graphs=enable_cuda_graphs,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         compute_dtype=args.compute_dtype,
1 change: 1 addition & 0 deletions scripts/performance/llm/pretrain_mixtral_8x22b.py
@@ -65,6 +65,7 @@ def override_recipe_configs(
         etp_size,
         enable_cuda_graphs,
         use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         recompute_layers=recompute_layers,
2 changes: 2 additions & 0 deletions scripts/performance/llm/pretrain_mixtral_8x7b.py
@@ -61,6 +61,8 @@ def override_recipe_configs(
         ep_size,
         etp_size,
         enable_cuda_graphs,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         compute_dtype=args.compute_dtype,
1 change: 1 addition & 0 deletions scripts/performance/llm/pretrain_nemotron3_22b.py
@@ -61,6 +61,7 @@ def override_recipe_configs(
         ep_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         recompute_layers=recompute_layers,
2 changes: 2 additions & 0 deletions scripts/performance/llm/pretrain_nemotron3_8b.py
@@ -60,6 +60,8 @@ def override_recipe_configs(
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
         nccl_communicator_config_path=args.nccl_communicator_config_path,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
     )
2 changes: 2 additions & 0 deletions scripts/performance/llm/pretrain_nemotron4_15b.py
@@ -61,6 +61,8 @@ def override_recipe_configs(
         vp_size,
         ep_size,
         enable_cuda_graphs=enable_cuda_graphs,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         compute_dtype=args.compute_dtype,
3 changes: 2 additions & 1 deletion scripts/performance/llm/pretrain_nemotron4_340b.py
@@ -68,11 +68,12 @@ def override_recipe_configs(
         ep_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
         recompute_layers=recompute_layers,
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
-        use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
2 changes: 2 additions & 0 deletions scripts/performance/llm/pretrain_nemotronh_47b.py
@@ -62,6 +62,8 @@ def override_recipe_configs(
         ep_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
         recompute_layers=recompute_layers,
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
2 changes: 2 additions & 0 deletions scripts/performance/llm/pretrain_nemotronh_56b.py
@@ -62,6 +62,8 @@ def override_recipe_configs(
         ep_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
         recompute_layers=recompute_layers,
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
2 changes: 2 additions & 0 deletions scripts/performance/llm/pretrain_nemotronh_8b.py
@@ -62,6 +62,8 @@ def override_recipe_configs(
         ep_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
         recompute_layers=recompute_layers,
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
2 changes: 2 additions & 0 deletions scripts/performance/llm/pretrain_qwen3_235b_a22b.py
@@ -62,6 +62,8 @@ def override_recipe_configs(
         ep_size,
         etp_size,
         enable_cuda_graphs=enable_cuda_graphs,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         compute_dtype=args.compute_dtype,
2 changes: 2 additions & 0 deletions scripts/performance/llm/pretrain_qwen3_30b_a3b.py
@@ -62,6 +62,8 @@ def override_recipe_configs(
         ep_size,
         etp_size,
         enable_cuda_graphs=enable_cuda_graphs,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         compute_dtype=args.compute_dtype,
3 changes: 3 additions & 0 deletions scripts/performance/vlm/finetune_neva_8b.py
@@ -60,6 +60,9 @@ def override_recipe_configs(
         enable_cuda_graphs=enable_cuda_graphs,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
     )
     recipe = set_exp_logging_configs(
         recipe,
3 changes: 3 additions & 0 deletions scripts/performance/vlm/pretrain_vlm_llama4_e128.py
@@ -65,6 +65,9 @@ def override_recipe_configs(
         etp_size,
         enable_cuda_graphs=enable_cuda_graphs,
         compute_dtype=args.compute_dtype,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
     )
     recipe = set_exp_logging_configs(
         recipe,
3 changes: 3 additions & 0 deletions scripts/performance/vlm/pretrain_vlm_llama4_e16.py
@@ -65,6 +65,9 @@ def override_recipe_configs(
         etp_size,
         enable_cuda_graphs=enable_cuda_graphs,
         compute_dtype=args.compute_dtype,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
     )
     recipe = set_exp_logging_configs(
         recipe,
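Note: every launcher above forwards the same pair of values, so the feature behaves identically across models. A hedged end-to-end sketch of the expected flag handling (only the -fsdp_db/--use_fsdp_double_buffer spelling and the guard come from this diff; the --use_mcore_fsdp long option is inferred from args.use_mcore_fsdp, and the parsed values are illustrative):

import argparse

def bool_arg(arg):
    # Simplified stand-in for the real converter in argument_parser.py.
    return str(arg).strip().lower() in ("1", "true", "t", "yes", "y")

parser = argparse.ArgumentParser()
parser.add_argument("--use_mcore_fsdp", type=bool_arg, required=False, default=None)
parser.add_argument("-fsdp_db", "--use_fsdp_double_buffer", type=bool_arg, required=False, default=None)

# Mimics e.g.: python scripts/performance/llm/pretrain_llama3_8b.py \
#     --use_mcore_fsdp true --use_fsdp_double_buffer true ...
args = parser.parse_args(["--use_mcore_fsdp", "true", "--use_fsdp_double_buffer", "true"])

if args.use_fsdp_double_buffer:
    # Same invariant get_user_configs enforces before building the recipe.
    assert args.use_mcore_fsdp, "use_fsdp_double_buffer requires use_mcore_fsdp to be True"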