diff --git a/scripts/performance/argument_parser.py b/scripts/performance/argument_parser.py
index 138443693517..f9465ee4ba94 100644
--- a/scripts/performance/argument_parser.py
+++ b/scripts/performance/argument_parser.py
@@ -300,6 +300,14 @@ def bool_arg(arg):
         required=False,
         default=None,
     )
+    parser.add_argument(
+        "-fsdp_db",
+        "--use_fsdp_double_buffer",
+        help="Enable FSDP double buffer. Disabled by default",
+        type=bool_arg,
+        required=False,
+        default=None,
+    )
     parser.add_argument(
         "-ubr",
         "--use_user_buffer_registration",
diff --git a/scripts/performance/diffusion/pretrain_flux_12b.py b/scripts/performance/diffusion/pretrain_flux_12b.py
index 5df1c6cf713b..775abe6abe38 100644
--- a/scripts/performance/diffusion/pretrain_flux_12b.py
+++ b/scripts/performance/diffusion/pretrain_flux_12b.py
@@ -59,6 +59,9 @@ def override_recipe_configs(
         enable_cuda_graphs=enable_cuda_graphs,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
     )
     recipe = set_exp_logging_configs(
         recipe,
diff --git a/scripts/performance/helpers.py b/scripts/performance/helpers.py
index 430383dcd6b3..573b58ccbc01 100644
--- a/scripts/performance/helpers.py
+++ b/scripts/performance/helpers.py
@@ -95,6 +95,13 @@ def get_user_configs(gpu: str, task: str, model_name: str, model_size: str, args

     use_mcore_fsdp = config.get("use_mcore_fsdp") if args.use_mcore_fsdp is None else args.use_mcore_fsdp
     use_mcore_fsdp = False if use_mcore_fsdp is None else bool(int(use_mcore_fsdp))
+    use_fsdp_double_buffer = (
+        config.get("use_fsdp_double_buffer") if args.use_fsdp_double_buffer is None else args.use_fsdp_double_buffer
+    )
+    use_fsdp_double_buffer = False if use_fsdp_double_buffer is None else bool(int(use_fsdp_double_buffer))
+    if use_fsdp_double_buffer:
+        assert use_mcore_fsdp == True, "use_fsdp_double_buffer requires use_mcore_fsdp to be True"
+
     recompute_layers = config.get("recompute_layers") if args.recompute_layers is None else args.recompute_layers
     recompute_layers = 0 if recompute_layers is None else int(recompute_layers)
     activation_offload_layers = (
@@ -130,6 +137,7 @@
     logging.info(f"{etp_size=}")
     logging.info(f"{enable_cuda_graphs=}")
     logging.info(f"{use_mcore_fsdp=}")
+    logging.info(f"{use_fsdp_double_buffer=}")
     logging.info(f"{recompute_layers=}")
     logging.info(f"{activation_offload_layers=}")
     logging.info(f"{recompute_modules=}")
@@ -240,12 +248,22 @@ def set_perf_optimization_configs(
     recompute_layers: int,
     activation_offload_layers: int,
     recompute_modules: Optional[List[str]],
-    use_sharp: bool,
-    use_user_buffer_registration: bool,
+    use_fsdp_double_buffer: Optional[bool] = None,
+    use_user_buffer_registration: Optional[bool] = None,
+    use_sharp: Optional[bool] = None,
 ):
     # enable cross entropy fusion with TE kernel
     recipe.model.config.cross_entropy_fusion_impl = "te"

+    if use_fsdp_double_buffer:
+        assert use_mcore_fsdp == True, "use_fsdp_double_buffer requires use_mcore_fsdp to be True"
+
+    if use_user_buffer_registration:
+        assert use_mcore_fsdp == True, "use_user_buffer_registration requires use_mcore_fsdp to be True"
+        assert (
+            use_fsdp_double_buffer is not False
+        ), "use_fsdp_double_buffer cannot be False when use_user_buffer_registration is True"
+
     if use_mcore_fsdp and enable_cuda_graphs:
         logging.warning("Currently, cuda graphs are not supported with FSDP. Disabling cuda graphs.")
         enable_cuda_graphs = False
@@ -269,6 +287,7 @@ def set_perf_optimization_configs(
         recipe.trainer.strategy.ddp.check_for_nan_in_grad = False
         recipe.trainer.strategy.ddp.check_for_large_grads = False
         recipe.trainer.strategy.ddp.nccl_ub = bool(use_user_buffer_registration)
+        recipe.trainer.strategy.ddp.fsdp_double_buffer = bool(use_fsdp_double_buffer)

     return recipe

@@ -289,8 +308,9 @@ def set_primary_perf_configs(
     etp_size: Optional[int] = None,
     enable_cuda_graphs: bool = False,
     use_mcore_fsdp: bool = False,
-    use_user_buffer_registration: bool = False,
-    use_sharp: bool = False,
+    use_fsdp_double_buffer: Optional[bool] = None,
+    use_user_buffer_registration: Optional[bool] = None,
+    use_sharp: Optional[bool] = None,
     recompute_layers: int = 0,
     activation_offload_layers: int = 0,
     compute_dtype: str = None,
@@ -334,18 +354,19 @@ def set_primary_perf_configs(
     )

     recipe = set_perf_optimization_configs(
-        recipe,
-        use_mcore_fsdp,
-        enable_cuda_graphs,
-        task,
-        tp_size,
-        compute_dtype,
-        fp8_recipe,
-        recompute_layers,
-        activation_offload_layers,
-        recompute_modules,
-        use_sharp,
-        use_user_buffer_registration,
+        recipe=recipe,
+        use_mcore_fsdp=use_mcore_fsdp,
+        enable_cuda_graphs=enable_cuda_graphs,
+        task=task,
+        tp_size=tp_size,
+        compute_dtype=compute_dtype,
+        fp8_recipe=fp8_recipe,
+        recompute_layers=recompute_layers,
+        activation_offload_layers=activation_offload_layers,
+        recompute_modules=recompute_modules,
+        use_fsdp_double_buffer=use_fsdp_double_buffer,
+        use_user_buffer_registration=use_user_buffer_registration,
+        use_sharp=use_sharp,
     )

     return recipe
diff --git a/scripts/performance/llm/finetune_deepseek_v3.py b/scripts/performance/llm/finetune_deepseek_v3.py
index 2b283c0a637c..6be2bf92a701 100644
--- a/scripts/performance/llm/finetune_deepseek_v3.py
+++ b/scripts/performance/llm/finetune_deepseek_v3.py
@@ -83,6 +83,8 @@ def override_recipe_configs(
         enable_cuda_graphs=enable_cuda_graphs,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         nccl_communicator_config_path=args.nccl_communicator_config_path,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
diff --git a/scripts/performance/llm/finetune_llama31_405b.py b/scripts/performance/llm/finetune_llama31_405b.py
index 6f17934ed9dd..968b22c49dc7 100644
--- a/scripts/performance/llm/finetune_llama31_405b.py
+++ b/scripts/performance/llm/finetune_llama31_405b.py
@@ -85,12 +85,13 @@ def override_recipe_configs(
         ep_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
         recompute_layers=recompute_layers,
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
         nccl_communicator_config_path=args.nccl_communicator_config_path,
-        use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
     )
     recipe = set_exp_logging_configs(
diff --git a/scripts/performance/llm/finetune_llama3_70b.py b/scripts/performance/llm/finetune_llama3_70b.py
index 99cce2044ac8..75fc866da594 100644
--- a/scripts/performance/llm/finetune_llama3_70b.py
+++ b/scripts/performance/llm/finetune_llama3_70b.py
@@ -92,12 +92,13 @@ def override_recipe_configs(
         ep_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
         recompute_layers=recompute_layers,
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
         nccl_communicator_config_path=args.nccl_communicator_config_path,
-        use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
     )
     recipe = set_exp_logging_configs(
diff --git a/scripts/performance/llm/finetune_llama3_8b.py b/scripts/performance/llm/finetune_llama3_8b.py
index ab78f1840d7f..6cef719bbcc0 100644
--- a/scripts/performance/llm/finetune_llama3_8b.py
+++ b/scripts/performance/llm/finetune_llama3_8b.py
@@ -78,8 +78,10 @@ def override_recipe_configs(
         enable_cuda_graphs=enable_cuda_graphs,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
-        nccl_communicator_config_path=args.nccl_communicator_config_path,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
+        nccl_communicator_config_path=args.nccl_communicator_config_path,
         use_sharp=args.use_sharp,
     )
     recipe = set_exp_logging_configs(
diff --git a/scripts/performance/llm/finetune_llama4_e128.py b/scripts/performance/llm/finetune_llama4_e128.py
index 235d990f40da..3ea36a464625 100644
--- a/scripts/performance/llm/finetune_llama4_e128.py
+++ b/scripts/performance/llm/finetune_llama4_e128.py
@@ -86,6 +86,8 @@ def override_recipe_configs(
         etp_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
         recompute_layers=recompute_layers,
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
diff --git a/scripts/performance/llm/pretrain_deepseek_v3.py b/scripts/performance/llm/pretrain_deepseek_v3.py
index e854f76ee434..d62c308d4130 100644
--- a/scripts/performance/llm/pretrain_deepseek_v3.py
+++ b/scripts/performance/llm/pretrain_deepseek_v3.py
@@ -86,6 +86,7 @@ def override_recipe_configs(
         etp_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         recompute_layers=recompute_layers,
diff --git a/scripts/performance/llm/pretrain_gpt3_175b.py b/scripts/performance/llm/pretrain_gpt3_175b.py
index ec711a21e0ae..c89f5e4458a4 100644
--- a/scripts/performance/llm/pretrain_gpt3_175b.py
+++ b/scripts/performance/llm/pretrain_gpt3_175b.py
@@ -70,6 +70,7 @@ def override_recipe_configs(
         ep_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         recompute_layers=recompute_layers,
diff --git a/scripts/performance/llm/pretrain_llama31_405b.py b/scripts/performance/llm/pretrain_llama31_405b.py
index 87c73a6e7be9..33b9cb476639 100644
--- a/scripts/performance/llm/pretrain_llama31_405b.py
+++ b/scripts/performance/llm/pretrain_llama31_405b.py
@@ -70,6 +70,7 @@ def override_recipe_configs(
         ep_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         recompute_layers=recompute_layers,
diff --git a/scripts/performance/llm/pretrain_llama3_70b.py b/scripts/performance/llm/pretrain_llama3_70b.py
index c8cc478a6e02..c6d66fd72799 100644
--- a/scripts/performance/llm/pretrain_llama3_70b.py
+++ b/scripts/performance/llm/pretrain_llama3_70b.py
@@ -71,6 +71,7 @@ def override_recipe_configs(
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
         use_user_buffer_registration=args.use_user_buffer_registration,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_sharp=args.use_sharp,
         recompute_layers=recompute_layers,
         activation_offload_layers=activation_offload_layers,
diff --git a/scripts/performance/llm/pretrain_llama3_8b.py b/scripts/performance/llm/pretrain_llama3_8b.py
index f66b43f403c8..35f6740798b6 100644
--- a/scripts/performance/llm/pretrain_llama3_8b.py
+++ b/scripts/performance/llm/pretrain_llama3_8b.py
@@ -62,6 +62,8 @@ def override_recipe_configs(
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
         nccl_communicator_config_path=args.nccl_communicator_config_path,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
     )
diff --git a/scripts/performance/llm/pretrain_llama4_e128.py b/scripts/performance/llm/pretrain_llama4_e128.py
index 94df5df7570b..fa14831dc544 100644
--- a/scripts/performance/llm/pretrain_llama4_e128.py
+++ b/scripts/performance/llm/pretrain_llama4_e128.py
@@ -62,6 +62,8 @@ def override_recipe_configs(
         ep_size,
         etp_size,
         enable_cuda_graphs=enable_cuda_graphs,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         compute_dtype=args.compute_dtype,
diff --git a/scripts/performance/llm/pretrain_llama4_e16.py b/scripts/performance/llm/pretrain_llama4_e16.py
index f1d286fafb86..018751f35117 100644
--- a/scripts/performance/llm/pretrain_llama4_e16.py
+++ b/scripts/performance/llm/pretrain_llama4_e16.py
@@ -62,6 +62,8 @@ def override_recipe_configs(
         ep_size,
         etp_size,
         enable_cuda_graphs=enable_cuda_graphs,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         compute_dtype=args.compute_dtype,
diff --git a/scripts/performance/llm/pretrain_mixtral_8x22b.py b/scripts/performance/llm/pretrain_mixtral_8x22b.py
index 54adc7895906..78755f7db586 100644
--- a/scripts/performance/llm/pretrain_mixtral_8x22b.py
+++ b/scripts/performance/llm/pretrain_mixtral_8x22b.py
@@ -65,6 +65,7 @@ def override_recipe_configs(
         etp_size,
         enable_cuda_graphs,
         use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         recompute_layers=recompute_layers,
diff --git a/scripts/performance/llm/pretrain_mixtral_8x7b.py b/scripts/performance/llm/pretrain_mixtral_8x7b.py
index eb206d6dfbcc..5e32d84aec5d 100644
--- a/scripts/performance/llm/pretrain_mixtral_8x7b.py
+++ b/scripts/performance/llm/pretrain_mixtral_8x7b.py
@@ -61,6 +61,8 @@ def override_recipe_configs(
         ep_size,
         etp_size,
         enable_cuda_graphs,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         compute_dtype=args.compute_dtype,
diff --git a/scripts/performance/llm/pretrain_nemotron3_22b.py b/scripts/performance/llm/pretrain_nemotron3_22b.py
index c8f108b2dbd7..9da8f8f7202c 100644
--- a/scripts/performance/llm/pretrain_nemotron3_22b.py
+++ b/scripts/performance/llm/pretrain_nemotron3_22b.py
@@ -61,6 +61,7 @@ def override_recipe_configs(
         ep_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         recompute_layers=recompute_layers,
diff --git a/scripts/performance/llm/pretrain_nemotron3_8b.py b/scripts/performance/llm/pretrain_nemotron3_8b.py
index 9e69adc2c0b2..b8eefd120799 100644
--- a/scripts/performance/llm/pretrain_nemotron3_8b.py
+++ b/scripts/performance/llm/pretrain_nemotron3_8b.py
@@ -60,6 +60,8 @@ def override_recipe_configs(
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
         nccl_communicator_config_path=args.nccl_communicator_config_path,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
     )
diff --git a/scripts/performance/llm/pretrain_nemotron4_15b.py b/scripts/performance/llm/pretrain_nemotron4_15b.py
index 3477438b1ef4..d7eea5ec415d 100644
--- a/scripts/performance/llm/pretrain_nemotron4_15b.py
+++ b/scripts/performance/llm/pretrain_nemotron4_15b.py
@@ -61,6 +61,8 @@ def override_recipe_configs(
         vp_size,
         ep_size,
         enable_cuda_graphs=enable_cuda_graphs,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         compute_dtype=args.compute_dtype,
diff --git a/scripts/performance/llm/pretrain_nemotron4_340b.py b/scripts/performance/llm/pretrain_nemotron4_340b.py
index 06f7c5e0f28f..5442981fceb7 100644
--- a/scripts/performance/llm/pretrain_nemotron4_340b.py
+++ b/scripts/performance/llm/pretrain_nemotron4_340b.py
@@ -68,11 +68,12 @@ def override_recipe_configs(
         ep_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
         recompute_layers=recompute_layers,
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
-        use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         nccl_communicator_config_path=args.nccl_communicator_config_path,
     )
diff --git a/scripts/performance/llm/pretrain_nemotronh_47b.py b/scripts/performance/llm/pretrain_nemotronh_47b.py
index 493e4d307fae..d2783b5a3c96 100644
--- a/scripts/performance/llm/pretrain_nemotronh_47b.py
+++ b/scripts/performance/llm/pretrain_nemotronh_47b.py
@@ -62,6 +62,8 @@ def override_recipe_configs(
         ep_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
         recompute_layers=recompute_layers,
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
diff --git a/scripts/performance/llm/pretrain_nemotronh_56b.py b/scripts/performance/llm/pretrain_nemotronh_56b.py
index 47246aaf6ae3..903e7d89052f 100644
--- a/scripts/performance/llm/pretrain_nemotronh_56b.py
+++ b/scripts/performance/llm/pretrain_nemotronh_56b.py
@@ -62,6 +62,8 @@ def override_recipe_configs(
         ep_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
         recompute_layers=recompute_layers,
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
diff --git a/scripts/performance/llm/pretrain_nemotronh_8b.py b/scripts/performance/llm/pretrain_nemotronh_8b.py
index cd2c743613d4..7ad2cf8d9499 100644
--- a/scripts/performance/llm/pretrain_nemotronh_8b.py
+++ b/scripts/performance/llm/pretrain_nemotronh_8b.py
@@ -62,6 +62,8 @@ def override_recipe_configs(
         ep_size,
         enable_cuda_graphs=enable_cuda_graphs,
         use_mcore_fsdp=use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
         recompute_layers=recompute_layers,
         activation_offload_layers=activation_offload_layers,
         compute_dtype=args.compute_dtype,
diff --git a/scripts/performance/llm/pretrain_qwen3_235b_a22b.py b/scripts/performance/llm/pretrain_qwen3_235b_a22b.py
index fcf4d013b0a9..68f54466bec9 100644
--- a/scripts/performance/llm/pretrain_qwen3_235b_a22b.py
+++ b/scripts/performance/llm/pretrain_qwen3_235b_a22b.py
@@ -62,6 +62,8 @@ def override_recipe_configs(
         ep_size,
         etp_size,
         enable_cuda_graphs=enable_cuda_graphs,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         compute_dtype=args.compute_dtype,
diff --git a/scripts/performance/llm/pretrain_qwen3_30b_a3b.py b/scripts/performance/llm/pretrain_qwen3_30b_a3b.py
index ea7f002016a7..9bb1a26edb63 100644
--- a/scripts/performance/llm/pretrain_qwen3_30b_a3b.py
+++ b/scripts/performance/llm/pretrain_qwen3_30b_a3b.py
@@ -62,6 +62,8 @@ def override_recipe_configs(
         ep_size,
         etp_size,
         enable_cuda_graphs=enable_cuda_graphs,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
         use_user_buffer_registration=args.use_user_buffer_registration,
         use_sharp=args.use_sharp,
         compute_dtype=args.compute_dtype,
diff --git a/scripts/performance/vlm/finetune_neva_8b.py b/scripts/performance/vlm/finetune_neva_8b.py
index 099051c523ab..c2e20b613509 100644
--- a/scripts/performance/vlm/finetune_neva_8b.py
+++ b/scripts/performance/vlm/finetune_neva_8b.py
@@ -60,6 +60,9 @@ def override_recipe_configs(
         enable_cuda_graphs=enable_cuda_graphs,
         compute_dtype=args.compute_dtype,
         fp8_recipe=args.fp8_recipe,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
     )
     recipe = set_exp_logging_configs(
         recipe,
diff --git a/scripts/performance/vlm/pretrain_vlm_llama4_e128.py b/scripts/performance/vlm/pretrain_vlm_llama4_e128.py
index 29c852e0a21b..0a7251152e97 100644
--- a/scripts/performance/vlm/pretrain_vlm_llama4_e128.py
+++ b/scripts/performance/vlm/pretrain_vlm_llama4_e128.py
@@ -65,6 +65,9 @@ def override_recipe_configs(
         etp_size,
         enable_cuda_graphs=enable_cuda_graphs,
         compute_dtype=args.compute_dtype,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
     )
     recipe = set_exp_logging_configs(
         recipe,
diff --git a/scripts/performance/vlm/pretrain_vlm_llama4_e16.py b/scripts/performance/vlm/pretrain_vlm_llama4_e16.py
index 3f158bad5d42..48e8ac420675 100644
--- a/scripts/performance/vlm/pretrain_vlm_llama4_e16.py
+++ b/scripts/performance/vlm/pretrain_vlm_llama4_e16.py
@@ -65,6 +65,9 @@ def override_recipe_configs(
         etp_size,
         enable_cuda_graphs=enable_cuda_graphs,
         compute_dtype=args.compute_dtype,
+        use_mcore_fsdp=args.use_mcore_fsdp,
+        use_fsdp_double_buffer=args.use_fsdp_double_buffer,
+        use_user_buffer_registration=args.use_user_buffer_registration,
     )
     recipe = set_exp_logging_configs(
         recipe,
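
Reviewer note (not part of the patch): the helpers.py hunks couple the three FSDP-related flags. Below is a minimal standalone sketch of that constraint logic, using the same flag names as the patch; the wrapper function check_fsdp_flags and the __main__ examples are illustrative only.

# Illustrative sketch -- mirrors the assertions the patch adds to set_perf_optimization_configs().
from typing import Optional


def check_fsdp_flags(
    use_mcore_fsdp: bool,
    use_fsdp_double_buffer: Optional[bool] = None,
    use_user_buffer_registration: Optional[bool] = None,
) -> None:
    """Validate the allowed flag combinations (hypothetical helper name)."""
    if use_fsdp_double_buffer:
        # Double buffering is an MCore-FSDP feature, so FSDP must be enabled.
        assert use_mcore_fsdp, "use_fsdp_double_buffer requires use_mcore_fsdp to be True"
    if use_user_buffer_registration:
        # User-buffer registration also needs MCore FSDP, and the double buffer
        # may be unset (None falls back to the default) but not explicitly disabled.
        assert use_mcore_fsdp, "use_user_buffer_registration requires use_mcore_fsdp to be True"
        assert (
            use_fsdp_double_buffer is not False
        ), "use_fsdp_double_buffer cannot be False when use_user_buffer_registration is True"


if __name__ == "__main__":
    check_fsdp_flags(use_mcore_fsdp=True, use_fsdp_double_buffer=True)  # ok
    check_fsdp_flags(use_mcore_fsdp=True, use_user_buffer_registration=True)  # ok: double buffer left as None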