8 changes: 8 additions & 0 deletions scripts/performance/argument_parser.py
@@ -300,6 +300,14 @@ def bool_arg(arg):
         required=False,
         default=None,
     )
+    parser.add_argument(
+        "-fsdp_db",
+        "--use_fsdp_double_buffer",
+        help="Enable FSDP double buffer. Disabled by default",
+        type=bool_arg,
+        required=False,
+        default=None,
+    )
     parser.add_argument(
         "-ubr",
         "--use_user_buffer_registration",
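The new flag follows the same tri-state convention as the surrounding performance toggles: leaving it off keeps args.use_fsdp_double_buffer at None so the per-model config can decide, while passing it explicitly overrides the config. A minimal sketch of the parsing behavior, assuming a conventional bool_arg (its body sits outside this diff):

import argparse

def bool_arg(arg):
    # Assumed string-to-bool coercion; the real bool_arg in
    # argument_parser.py is defined above the lines shown in this hunk.
    return str(arg).strip().lower() in ("1", "true", "t", "yes", "y")

parser = argparse.ArgumentParser()
parser.add_argument("-fsdp_db", "--use_fsdp_double_buffer", type=bool_arg, default=None)

print(parser.parse_args([]).use_fsdp_double_buffer)                    # None -> defer to config
print(parser.parse_args(["-fsdp_db", "true"]).use_fsdp_double_buffer)  # True
print(parser.parse_args(["-fsdp_db", "0"]).use_fsdp_double_buffer)     # False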
3 changes: 3 additions & 0 deletions scripts/performance/diffusion/pretrain_flux_12b.py
@@ -59,6 +59,9 @@ def override_recipe_configs(
         enable_cuda_graphs=enable_cuda_graphs,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
     )
     recipe = set_exp_logging_configs(
         recipe,
53 changes: 37 additions & 16 deletions scripts/performance/helpers.py
@@ -95,6 +95,13 @@ def get_user_configs(gpu: str, task: str, model_name: str, model_size: str, args
     use_mcore_fsdp = config.get("use_mcore_fsdp") if args.use_mcore_fsdp is None else args.use_mcore_fsdp
     use_mcore_fsdp = False if use_mcore_fsdp is None else bool(int(use_mcore_fsdp))
 
+    use_fsdp_double_buffer = (
+        config.get("use_fsdp_double_buffer") if args.use_fsdp_double_buffer is None else args.use_fsdp_double_buffer
+    )
+    use_fsdp_double_buffer = False if use_fsdp_double_buffer is None else bool(int(use_fsdp_double_buffer))
+    if use_fsdp_double_buffer:
+        assert use_mcore_fsdp == True, "use_fsdp_double_buffer requires use_mcore_fsdp to be True"
+
     recompute_layers = config.get("recompute_layers") if args.recompute_layers is None else args.recompute_layers
     recompute_layers = 0 if recompute_layers is None else int(recompute_layers)
     activation_offload_layers = (
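This repeats the resolve-then-coerce pattern used for use_mcore_fsdp just above it: the CLI value wins over the config value, and an everywhere-unset None falls back to disabled. As a standalone sketch (the helper name is illustrative, not part of the PR):

def resolve_tri_state(cli_value, config_value):
    # CLI takes precedence; None means "not specified at this layer".
    value = config_value if cli_value is None else cli_value
    # Unset anywhere -> disabled; config values may arrive as 0/1 or "0"/"1".
    return False if value is None else bool(int(value))

assert resolve_tri_state(None, None) is False  # nothing set -> off
assert resolve_tri_state(None, 1) is True      # config turns it on
assert resolve_tri_state(0, 1) is False        # explicit CLI override wins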
@@ -130,6 +137,7 @@ def get_user_configs(gpu: str, task: str, model_name: str, model_size: str, args
     logging.info(f"{etp_size=}")
     logging.info(f"{enable_cuda_graphs=}")
     logging.info(f"{use_mcore_fsdp=}")
+    logging.info(f"{use_fsdp_double_buffer=}")
     logging.info(f"{recompute_layers=}")
     logging.info(f"{activation_offload_layers=}")
     logging.info(f"{recompute_modules=}")
@@ -240,12 +248,22 @@ def set_perf_optimization_configs(
     recompute_layers: int,
     activation_offload_layers: int,
     recompute_modules: Optional[List[str]],
-    use_sharp: bool,
-    use_user_buffer_registration: bool,
+    use_fsdp_double_buffer: Optional[bool] = None,
+    use_user_buffer_registration: Optional[bool] = None,
+    use_sharp: Optional[bool] = None,
 ):
     # enable cross entropy fusion with TE kernel
     recipe.model.config.cross_entropy_fusion_impl = "te"
 
+    if use_fsdp_double_buffer:
+        assert use_mcore_fsdp == True, "use_fsdp_double_buffer requires use_mcore_fsdp to be True"
+
+    if use_user_buffer_registration:
+        assert use_mcore_fsdp == True, "use_user_buffer_registration requires use_mcore_fsdp to be True"
+        assert (
+            use_fsdp_double_buffer is not False
+        ), "use_fsdp_double_buffer cannot be False when use_user_buffer_registration is True"
+
     if use_mcore_fsdp and enable_cuda_graphs:
         logging.warning("Currently, cuda graphs are not supported with FSDP. Disabling cuda graphs.")
         enable_cuda_graphs = False
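Taken together, these asserts encode a small dependency graph among the three flags: double buffering and user-buffer registration each require Megatron-core FSDP, and registration additionally rejects only an explicit use_fsdp_double_buffer=False, while None still passes (presumably so it can be resolved downstream). The same rules as a compact standalone check (function name is illustrative):

def check_fsdp_flag_deps(use_mcore_fsdp, use_fsdp_double_buffer, use_user_buffer_registration):
    if use_fsdp_double_buffer:
        assert use_mcore_fsdp, "use_fsdp_double_buffer requires use_mcore_fsdp"
    if use_user_buffer_registration:
        assert use_mcore_fsdp, "use_user_buffer_registration requires use_mcore_fsdp"
        assert use_fsdp_double_buffer is not False  # only an explicit False is rejected

check_fsdp_flag_deps(True, True, True)   # ok
check_fsdp_flag_deps(True, None, True)   # ok: None is not False
# check_fsdp_flag_deps(True, False, True) would raise AssertionError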
@@ -269,6 +287,7 @@ def set_perf_optimization_configs(
         recipe.trainer.strategy.ddp.check_for_nan_in_grad = False
         recipe.trainer.strategy.ddp.check_for_large_grads = False
         recipe.trainer.strategy.ddp.nccl_ub = bool(use_user_buffer_registration)
+        recipe.trainer.strategy.ddp.fsdp_double_buffer = bool(use_fsdp_double_buffer)
 
     return recipe
 
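One subtlety in the added line: bool(None) is False in Python, so a flag that was never resolved to an explicit value lands in the DDP config as disabled, mirroring how nccl_ub is set from use_user_buffer_registration one line above.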
@@ -289,8 +308,9 @@ def set_primary_perf_configs(
     etp_size: Optional[int] = None,
     enable_cuda_graphs: bool = False,
     use_mcore_fsdp: bool = False,
-    use_user_buffer_registration: bool = False,
-    use_sharp: bool = False,
+    use_fsdp_double_buffer: Optional[bool] = None,
+    use_user_buffer_registration: Optional[bool] = None,
+    use_sharp: Optional[bool] = None,
     recompute_layers: int = 0,
     activation_offload_layers: int = 0,
     compute_dtype: str = None,
@@ -334,18 +354,19 @@ def set_primary_perf_configs(
     )
 
     recipe = set_perf_optimization_configs(
-        recipe,
-        use_mcore_fsdp,
-        enable_cuda_graphs,
-        task,
-        tp_size,
-        compute_dtype,
-        fp8_recipe,
-        recompute_layers,
-        activation_offload_layers,
-        recompute_modules,
-        use_sharp,
-        use_user_buffer_registration,
+        recipe=recipe,
+        use_mcore_fsdp=use_mcore_fsdp,
+        enable_cuda_graphs=enable_cuda_graphs,
+        task=task,
+        tp_size=tp_size,
+        compute_dtype=compute_dtype,
+        fp8_recipe=fp8_recipe,
+        recompute_layers=recompute_layers,
+        activation_offload_layers=activation_offload_layers,
+        recompute_modules=recompute_modules,
+        use_fsdp_double_buffer=use_fsdp_double_buffer,
+        use_user_buffer_registration=use_user_buffer_registration,
+        use_sharp=use_sharp,
     )
 
     return recipe
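Converting this call from positional to keyword arguments is what makes the signature reshuffle above safe: under the old positional form, inserting use_fsdp_double_buffer ahead of use_user_buffer_registration would have silently bound use_sharp to the wrong parameter, whereas the keyword form either binds correctly or fails loudly with a TypeError.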
2 changes: 2 additions & 0 deletions scripts/performance/llm/finetune_deepseek_v3.py
@@ -83,6 +83,8 @@ def override_recipe_configs(
         enable_cuda_graphs=enable_cuda_graphs,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         nccl_communicator_config_path=args.nccl_communicator_config_path,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
3 changes: 2 additions & 1 deletion scripts/performance/llm/finetune_llama31_405b.py
@@ -85,12 +85,13 @@ def override_recipe_configs(
         ep_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
         recompute_layers=recompute_layers,
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
         nccl_communicator_config_path=args.nccl_communicator_config_path,
-        use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
     )
     recipe = set_exp_logging_configs(
3 changes: 2 additions & 1 deletion scripts/performance/llm/finetune_llama3_70b.py
@@ -92,12 +92,13 @@ def override_recipe_configs(
         ep_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
         recompute_layers=recompute_layers,
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
         nccl_communicator_config_path=args.nccl_communicator_config_path,
-        use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
     )
     recipe = set_exp_logging_configs(
4 changes: 3 additions & 1 deletion scripts/performance/llm/finetune_llama3_8b.py
@@ -78,8 +78,10 @@ def override_recipe_configs(
         enable_cuda_graphs=enable_cuda_graphs,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
-        nccl_communicator_config_path=args.nccl_communicator_config_path,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
         use_sharp=args.use_sharp,
     )
     recipe = set_exp_logging_configs(
2 changes: 2 additions & 0 deletions scripts/performance/llm/finetune_llama4_e128.py
@@ -86,6 +86,8 @@ def override_recipe_configs(
         etp_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
         recompute_layers=recompute_layers,
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
1 change: 1 addition & 0 deletions scripts/performance/llm/pretrain_deepseek_v3.py
@@ -86,6 +86,7 @@ def override_recipe_configs(
         etp_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         recompute_layers=recompute_layers,
1 change: 1 addition & 0 deletions scripts/performance/llm/pretrain_gpt3_175b.py
@@ -70,6 +70,7 @@ def override_recipe_configs(
         ep_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         recompute_layers=recompute_layers,
1 change: 1 addition & 0 deletions scripts/performance/llm/pretrain_llama31_405b.py
@@ -70,6 +70,7 @@ def override_recipe_configs(
         ep_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         recompute_layers=recompute_layers,
1 change: 1 addition & 0 deletions scripts/performance/llm/pretrain_llama3_70b.py
@@ -71,6 +71,7 @@ def override_recipe_configs(
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
         use_user_buffer_registration=args.use_user_buffer_registration,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_sharp=args.use_sharp,
         recompute_layers=recompute_layers,
         activation_offload_layers=activation_offload_layers,
2 changes: 2 additions & 0 deletions scripts/performance/llm/pretrain_llama3_8b.py
@@ -62,6 +62,8 @@ def override_recipe_configs(
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
         nccl_communicator_config_path=args.nccl_communicator_config_path,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
     )
2 changes: 2 additions & 0 deletions scripts/performance/llm/pretrain_llama4_e128.py
@@ -62,6 +62,8 @@ def override_recipe_configs(
         ep_size,
         etp_size,
         enable_cuda_graphs=enable_cuda_graphs,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         compute_dtype=args.compute_dtype,
2 changes: 2 additions & 0 deletions scripts/performance/llm/pretrain_llama4_e16.py
@@ -62,6 +62,8 @@ def override_recipe_configs(
         ep_size,
         etp_size,
         enable_cuda_graphs=enable_cuda_graphs,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         compute_dtype=args.compute_dtype,
1 change: 1 addition & 0 deletions scripts/performance/llm/pretrain_mixtral_8x22b.py
@@ -65,6 +65,7 @@ def override_recipe_configs(
         etp_size,
         enable_cuda_graphs,
         use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         recompute_layers=recompute_layers,
2 changes: 2 additions & 0 deletions scripts/performance/llm/pretrain_mixtral_8x7b.py
@@ -61,6 +61,8 @@ def override_recipe_configs(
         ep_size,
         etp_size,
         enable_cuda_graphs,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         compute_dtype=args.compute_dtype,
1 change: 1 addition & 0 deletions scripts/performance/llm/pretrain_nemotron3_22b.py
@@ -61,6 +61,7 @@ def override_recipe_configs(
         ep_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         recompute_layers=recompute_layers,
2 changes: 2 additions & 0 deletions scripts/performance/llm/pretrain_nemotron3_8b.py
@@ -60,6 +60,8 @@ def override_recipe_configs(
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
         nccl_communicator_config_path=args.nccl_communicator_config_path,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
     )
2 changes: 2 additions & 0 deletions scripts/performance/llm/pretrain_nemotron4_15b.py
@@ -61,6 +61,8 @@ def override_recipe_configs(
         vp_size,
         ep_size,
         enable_cuda_graphs=enable_cuda_graphs,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         compute_dtype=args.compute_dtype,
3 changes: 2 additions & 1 deletion scripts/performance/llm/pretrain_nemotron4_340b.py
@@ -68,11 +68,12 @@ def override_recipe_configs(
         ep_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
         recompute_layers=recompute_layers,
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
-        use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
2 changes: 2 additions & 0 deletions scripts/performance/llm/pretrain_nemotronh_47b.py
@@ -62,6 +62,8 @@ def override_recipe_configs(
         ep_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
         recompute_layers=recompute_layers,
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
2 changes: 2 additions & 0 deletions scripts/performance/llm/pretrain_nemotronh_56b.py
@@ -62,6 +62,8 @@ def override_recipe_configs(
         ep_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
         recompute_layers=recompute_layers,
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
2 changes: 2 additions & 0 deletions scripts/performance/llm/pretrain_nemotronh_8b.py
@@ -62,6 +62,8 @@ def override_recipe_configs(
         ep_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
         recompute_layers=recompute_layers,
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
2 changes: 2 additions & 0 deletions scripts/performance/llm/pretrain_qwen3_235b_a22b.py
@@ -62,6 +62,8 @@ def override_recipe_configs(
         ep_size,
         etp_size,
         enable_cuda_graphs=enable_cuda_graphs,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         compute_dtype=args.compute_dtype,
2 changes: 2 additions & 0 deletions scripts/performance/llm/pretrain_qwen3_30b_a3b.py
@@ -62,6 +62,8 @@ def override_recipe_configs(
         ep_size,
         etp_size,
         enable_cuda_graphs=enable_cuda_graphs,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         compute_dtype=args.compute_dtype,
3 changes: 3 additions & 0 deletions scripts/performance/vlm/finetune_neva_8b.py
@@ -60,6 +60,9 @@ def override_recipe_configs(
         enable_cuda_graphs=enable_cuda_graphs,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
     )
     recipe = set_exp_logging_configs(
         recipe,
3 changes: 3 additions & 0 deletions scripts/performance/vlm/pretrain_vlm_llama4_e128.py
@@ -65,6 +65,9 @@ def override_recipe_configs(
         etp_size,
         enable_cuda_graphs=enable_cuda_graphs,
         compute_dtype=args.compute_dtype,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
     )
     recipe = set_exp_logging_configs(
         recipe,
3 changes: 3 additions & 0 deletions scripts/performance/vlm/pretrain_vlm_llama4_e16.py
@@ -65,6 +65,9 @@ def override_recipe_configs(
         etp_size,
         enable_cuda_graphs=enable_cuda_graphs,
         compute_dtype=args.compute_dtype,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
     )
     recipe = set_exp_logging_configs(
         recipe,