diff --git a/README.md b/README.md
index abbb7be78..012a649af 100644
--- a/README.md
+++ b/README.md
@@ -163,34 +163,13 @@ For more details on supported models, see our documentation:
 
 #### Launching Recipes
 
-All recipes are ready to train out of the box, using mock data by default. For an example of how to override the default configuration through YAML or Hydra-style CLI overrides, please have a look at this [script](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/recipes/llama/pretrain_llama3_8b.py). The script can then be launched with `torchrun`. For example, with the aforementioned script:
+For a conceptual overview of how recipes are structured, overridden, and launched with either `torchrun` or NeMo-Run, read the [Using Recipes guide](https://docs.nvidia.com/nemo/megatron-bridge/latest/recipe-usage.html).
 
-```sh
-torchrun --nproc-per-node=2 pretrain_llama3_8b.py model.tensor_model_parallel_size=1
-```
-
-Optionally, Megatron Bridge also supports launching with [NeMo-Run](https://github.com/NVIDIA-NeMo/Run). See the following examples for reference on launching with NeMo-Run:
-
-- [pretrain_llama3_8b_nemo_run_script.py](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/recipes/llama/pretrain_llama3_8b_nemo_run_script.py)
-- [pretrain_llama3_8b_nemo_run_partial.py](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/recipes/llama/pretrain_llama3_8b_nemo_run_partial.py)
-
-These examples can also be run as-is with the Llama 3 8B recipe (with NeMo-Run installed).
-
-Launch Llama 3 8B pretraining with NeMo-Run's `run.Script`:
+Runnable tutorials live in `tutorials/recipes/llama`, covering:
 
-```sh
-uv run python pretrain_llama3_8b_nemo_run_script.py \
-    --nproc-per-node=2 \
-    model.pipeline_model_parallel_size=1 \
-    train.train_iters=10  # this script passes Hydra-style overrides to the target script
-```
-
-Launch Llama 3 8B pretraining with NeMo-Run's `run.Partial`:
-
-```sh
-uv run python pretrain_llama3_8b_nemo_run_partial.py \
-    --nproc-per-node=2
-```
+- `00_quickstart_pretrain.py` for mock-data pretraining
+- `01_quickstart_finetune.py` for LoRA finetuning
+- YAML-driven flows and launch helpers
diff --git a/examples/recipes/llama/conf/llama3_8b_pretrain_override_example.yaml b/examples/recipes/llama/conf/llama3_8b_pretrain_override_example.yaml
deleted file mode 100644
index 5f55d9988..000000000
--- a/examples/recipes/llama/conf/llama3_8b_pretrain_override_example.yaml
+++ /dev/null
@@ -1,65 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Example override file
-
-# To override a parameter, ensure the structure matches the ConfigContainer
-# and its sub-configurations (e.g., model, train, etc.)
-# Top-level ConfigContainer fields are dataclasses themselves - -model: - seq_length: 4096 - -train: - train_iters: 20 - global_batch_size: 8 - micro_batch_size: 1 - eval_iters: 0 - -optimizer: - lr: 0.00025 - min_lr: 0.000025 - -scheduler: - lr_warmup_iters: 10 - -checkpoint: - # Directory to save to. If null, no checkpoint will be saved. - save: null - -dist: - use_megatron_fsdp: false - use_torch_fsdp2: false - -logger: - log_interval: 1 - -dataset: - seq_length: 4096 - -rng: - seed: 42 - -ddp: - grad_reduce_in_fp32: true - -profiling: - # For optional fields in the config, specify the target to instantiate the object. - _target_: megatron.bridge.training.config.ProfilingConfig - use_nsys_profiler: false - profile_step_start: 5 - profile_step_end: 10 - use_pytorch_profiler: true - profile_ranks: [0, 1] - record_shapes: true diff --git a/examples/recipes/llama/conf/llama3_8b_pretrain_override_example_megatron_fsdp.yaml b/examples/recipes/llama/conf/llama3_8b_pretrain_override_example_megatron_fsdp.yaml deleted file mode 100644 index 0e239e284..000000000 --- a/examples/recipes/llama/conf/llama3_8b_pretrain_override_example_megatron_fsdp.yaml +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Example override file - -# To override a parameter, ensure the structure matches the ConfigContainer -# and its sub-configurations (e.g., model, train, etc.) -# Top-level ConfigContainer fields are dataclasses themselves - -model: - seq_length: 4096 - init_model_with_meta_device: true - -train: - train_iters: 20 - global_batch_size: 8 - micro_batch_size: 1 - eval_iters: 0 - -optimizer: - lr: 0.00025 - min_lr: 0.000025 - -scheduler: - lr_warmup_iters: 10 - -checkpoint: - # Directory to save to. If null, no checkpoint will be saved. - save: null - ckpt_format: "fsdp_dtensor" - -dist: - use_megatron_fsdp: true - use_torch_fsdp2: false - -logger: - log_interval: 1 - -dataset: - seq_length: 4096 - -rng: - seed: 42 - -ddp: - grad_reduce_in_fp32: true - data_parallel_sharding_strategy: "optim_grads_params" # for Megatron FSDP ZeRO-3 like sharding - -profiling: - # For optional fields in the config, specify the target to instantiate the object. - _target_: megatron.bridge.training.config.ProfilingConfig - use_nsys_profiler: false - profile_step_start: 5 - profile_step_end: 10 - use_pytorch_profiler: true - profile_ranks: [0, 1] - record_shapes: true diff --git a/examples/recipes/llama/pretrain_llama3_8b.py b/examples/recipes/llama/pretrain_llama3_8b.py deleted file mode 100644 index 76ebde762..000000000 --- a/examples/recipes/llama/pretrain_llama3_8b.py +++ /dev/null @@ -1,184 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Llama3 8B Pretraining Script with YAML and CLI Configuration Overrides. - -This script provides a flexible way to pretrain Llama3 8B models using Megatron-Bridge with support for -both YAML configuration files and command-line overrides using Hydra-style syntax. - -Examples: - Basic usage with default configuration: - $ torchrun --nproc_per_node=8 examples/recipes/llama/pretrain_llama3_8b.py - - Using a custom YAML config file: - $ torchrun --nproc_per_node=8 pretrain_llama3_8b.py --config-file my_custom_config.yaml - - Using CLI overrides only: - $ torchrun --nproc_per_node=8 pretrain_llama3_8b.py model.tensor_model_parallel_size=4 train.train_iters=100000 - - Combining YAML and CLI overrides (CLI takes precedence): - $ torchrun --nproc_per_node=8 pretrain_llama3_8b.py --config-file conf/my_config.yaml \ - model.pipeline_dtype=torch.float16 \ - train.global_batch_size=512 - -Configuration Precedence: - 1. Base configuration from pretrain_config() recipe - 2. YAML overrides from --config-file (if provided) - 3. CLI overrides (highest precedence) - -Supported Override Syntax: - - Standard assignment: key=value - - Nested assignment: section.subsection.key=value - - Addition: +new_key=value - - Deletion: ~key_to_remove - - Type conversion: Automatic for basic types (int, float, bool, str) - - Complex types: torch.dtype, enums, etc. are supported -""" - -import argparse -import logging -import os -import sys -from pathlib import Path -from typing import Tuple - -import torch -from omegaconf import OmegaConf - -from megatron.bridge.recipes.llama import llama3_8b_pretrain_config as pretrain_config -from megatron.bridge.training.config import ConfigContainer -from megatron.bridge.training.gpt_step import forward_step -from megatron.bridge.training.pretrain import pretrain -from megatron.bridge.training.utils.omegaconf_utils import ( - apply_overrides, - create_omegaconf_dict_config, - parse_hydra_overrides, -) -from megatron.bridge.utils.common_utils import get_rank_safe - - -logger: logging.Logger = logging.getLogger(__name__) - - -# Define paths relative to this script's location -# Assumes this script (pretrain_llama3_8b.py) is in Megatron-Bridge/examples/recipes/llama/ -# and the config is in a 'conf' subdirectory. -SCRIPT_DIR: Path = Path(__file__).parent.resolve() -DEFAULT_CONFIG_FILENAME: str = "llama3_8b_pretrain_override_example.yaml" -DEFAULT_CONFIG_FILE_PATH: Path = SCRIPT_DIR / "conf" / DEFAULT_CONFIG_FILENAME - - -def parse_cli_args() -> Tuple[argparse.Namespace, list[str]]: - """Parse command line arguments, separating known script args from OmegaConf overrides.""" - parser = argparse.ArgumentParser( - description="Pretrain Llama3 8B model using Megatron-Bridge with YAML and CLI overrides", - formatter_class=argparse.RawTextHelpFormatter, - ) - parser.add_argument( - "--config-file", - type=str, - default=str(DEFAULT_CONFIG_FILE_PATH), - help="Path to the YAML OmegaConf override file. 
Default: conf/llama3_8b_pretrain_override_example.yaml", - ) - parser.add_argument("--debug", action="store_true", help="Enable debug logging") - - # Parse known args for the script, remaining will be treated as overrides - args, cli_dotlist_overrides = parser.parse_known_args() - return args, cli_dotlist_overrides - - -def main() -> None: - """ - Entry point for the Llama3 8B pretraining script. - - This function orchestrates the complete configuration workflow: - 1. Loads the base configuration from pretrain_config() recipe - 2. Applies YAML overrides from --config-file (if exists) - 3. Applies CLI overrides using Hydra-style syntax - 4. Starts Megatron pretraining with the final merged configuration - - Configuration merging preserves callable fields (like activation functions) - and handles type conversions automatically. - - Examples of CLI usage: - # Use default config with custom learning rate - torchrun --nproc_per_node=8 pretrain_llama3_8b.py optimizer.lr=0.0002 - - # Custom config file with additional overrides - torchrun --nproc_per_node=8 pretrain_llama3_8b.py --config-file my_config.yaml train.train_iters=50000 - - # Multiple overrides for distributed training - torchrun --nproc_per_node=8 pretrain_llama3_8b.py \ - model.tensor_model_parallel_size=4 \ - model.pipeline_model_parallel_size=2 \ - train.global_batch_size=512 - """ - args, cli_overrides = parse_cli_args() - - logger.info("Megatron-Bridge Llama3 8B Pretraining Script with YAML & CLI Overrides") - logger.info("------------------------------------------------------------------") - - # Load base configuration from the recipe as a Python dataclass - cfg: ConfigContainer = pretrain_config() - logger.info("Loaded base configuration") - - # Print configuration on rank 0 - if get_rank_safe() == 0: - cfg.print_yaml() - - # Convert the initial Python dataclass to an OmegaConf DictConfig for merging - merged_omega_conf, excluded_fields = create_omegaconf_dict_config(cfg) - - # Load and merge YAML overrides if a config file is provided - if args.config_file: - logger.debug(f"Loading YAML overrides from: {args.config_file}") - if not os.path.exists(args.config_file): - logger.error(f"Override YAML file not found: {args.config_file}") - sys.exit(1) - yaml_overrides_omega = OmegaConf.load(args.config_file) - merged_omega_conf = OmegaConf.merge(merged_omega_conf, yaml_overrides_omega) - logger.debug("YAML overrides merged successfully.") - - # Apply command-line overrides using Hydra-style parsing - if cli_overrides: - logger.debug(f"Applying Hydra-style command-line overrides: {cli_overrides}") - merged_omega_conf = parse_hydra_overrides(merged_omega_conf, cli_overrides) - logger.debug("Hydra-style command-line overrides applied successfully.") - - # Apply the final merged OmegaConf configuration back to the original ConfigContainer - logger.debug("Applying final merged configuration back to Python ConfigContainer...") - final_overrides_as_dict = OmegaConf.to_container(merged_omega_conf, resolve=True) - # Apply overrides while preserving excluded fields - apply_overrides(cfg, final_overrides_as_dict, excluded_fields) - - # Display final configuration - if get_rank_safe() == 0: - logger.info("--- Final Merged Configuration ---") - cfg.print_yaml() - logger.info("----------------------------------") - - # Start training - logger.debug("Starting pretraining...") - pretrain(config=cfg, forward_step_func=forward_step) - - # Cleanup process group - if torch.distributed.is_initialized(): - torch.distributed.barrier() - 
torch.distributed.destroy_process_group() - - -if __name__ == "__main__": - main() diff --git a/examples/recipes/llama/pretrain_llama3_8b_nemo_run_script.py b/examples/recipes/llama/pretrain_llama3_8b_nemo_run_script.py deleted file mode 100644 index 6b8c6c68d..000000000 --- a/examples/recipes/llama/pretrain_llama3_8b_nemo_run_script.py +++ /dev/null @@ -1,150 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -NeMo Run Launcher for Llama3 8B Pretraining. - -This script launches the pretrain_llama3_8b.py script using NeMo Run with TorchRun, -while forwarding any additional command line arguments to the target script. - -Examples: - Basic usage with default config: - $ python pretrain_llama3_8b_nemo_run_script.py --nproc-per-node=8 - - Using a custom config file: - $ python pretrain_llama3_8b_nemo_run_script.py --nproc-per-node=8 --config-file=my_config.yaml - - Passing additional overrides to the target script: - $ python pretrain_llama3_8b_nemo_run_script.py --nproc-per-node=8 \ - model.tensor_model_parallel_size=4 \ - train.train_iters=100000 - - Using both custom config and CLI overrides: - $ python pretrain_llama3_8b_nemo_run_script.py --nproc-per-node=8 \ - --config-file=conf/my_custom_config.yaml \ - optimizerg.lr=0.0002 \ - train.global_batch_size=512 - - Dry run to see what would be executed: - $ python pretrain_llama3_8b_nemo_run_script.py --nproc-per-node=8 --dryrun \ - model.pipeline_dtype=torch.float16 - -Argument Forwarding: - Any arguments not recognized by this launcher script will be forwarded - to the target pretrain_llama3_8b.py script as Hydra-style overrides. -""" - -import argparse -import logging -import sys -from pathlib import Path -from typing import Tuple - -import nemo_run as run - - -logger: logging.Logger = logging.getLogger(__name__) - -# Define paths relative to this script's location -# Assumes this script (pretrain_llama3_8b_nemo_run_script.py) is in Megatron-Bridge/examples/recipes/llama/ -# and pretrain_llama3_8b.py is in the same directory, -# and the config is in a 'conf' subdirectory. -SCRIPT_DIR: Path = Path(__file__).parent.resolve() -PRETRAIN_SCRIPT_FILENAME: str = "pretrain_llama3_8b.py" -PRETRAIN_SCRIPT_PATH: Path = SCRIPT_DIR / PRETRAIN_SCRIPT_FILENAME -DEFAULT_CONFIG_FILENAME: str = "llama3_8b_pretrain_override_example.yaml" -DEFAULT_CONFIG_FILE_PATH: Path = SCRIPT_DIR / "conf" / DEFAULT_CONFIG_FILENAME - - -def parse_cli_args() -> Tuple[argparse.Namespace, list[str]]: - """Parse command line arguments, separating launcher args from target script args.""" - parser = argparse.ArgumentParser( - description="Launcher for Llama3 8B pretraining using nemo_run and TorchRun. 
" - "Additional arguments will be forwarded to pretrain_llama3_8b.py", - formatter_class=argparse.RawTextHelpFormatter, - ) - parser.add_argument( - "--nproc-per-node", - type=int, - default=2, - help="Number of processes per node for TorchRun (typically number of GPUs).", - ) - parser.add_argument( - "--config-file", - type=str, - default=str(DEFAULT_CONFIG_FILE_PATH), - help="Path to the YAML override config file for the pretrain_llama3_8b.py script.", - ) - parser.add_argument( - "--dryrun", - action="store_true", - help="Dry run the script without actually running it.", - ) - - # Parse known args for the launcher, remaining will be forwarded to target script - args, forwarded_args = parser.parse_known_args() - return args, forwarded_args - - -def main() -> None: - """ - Main function for script demonstrating how to use the NeMo Run executor. - """ - args, forwarded_args = parse_cli_args() - - logger.info("Nemo Run Launcher for Llama3 8B Pretraining") - logger.info("===========================================") - - if not PRETRAIN_SCRIPT_PATH.is_file(): - logger.error(f"Target pretraining script not found: {PRETRAIN_SCRIPT_PATH}") - logger.error(f"Please ensure '{PRETRAIN_SCRIPT_FILENAME}' exists in the same directory as this launcher.") - sys.exit(1) - - config_file_to_use = Path(args.config_file).resolve() - if not config_file_to_use.is_file(): - logger.error(f"Specified YAML config file not found: {config_file_to_use}") - logger.error("Ensure the path passed to --config_file is correct.") - sys.exit(1) - - # Build the arguments list for the target script - target_script_args = [ - "--config-file", - str(config_file_to_use), - ] - - # Add any forwarded arguments (Hydra-style overrides and other target script args) - if forwarded_args: - target_script_args.extend(forwarded_args) - logger.info(f"Forwarding additional arguments to target script: {forwarded_args}") - - logger.info(f"Target script: {PRETRAIN_SCRIPT_PATH}") - logger.info(f"Target script arguments: {target_script_args}") - - train_script = run.Script( - path=str(PRETRAIN_SCRIPT_PATH), - entrypoint="python", - args=target_script_args, - ) - - # Define the executor - logger.info(f"Launching locally with TorchRun with nproc_per_node={args.nproc_per_node}") - executor = run.LocalExecutor(ntasks_per_node=args.nproc_per_node, launcher="torchrun") - - # Execute the run - run.run(train_script, executor=executor, dryrun=args.dryrun) - - -if __name__ == "__main__": - main() diff --git a/src/megatron/bridge/recipes/gemma/gemma2.py b/src/megatron/bridge/recipes/gemma/gemma2.py index 8987202ee..98c80e3e5 100644 --- a/src/megatron/bridge/recipes/gemma/gemma2.py +++ b/src/megatron/bridge/recipes/gemma/gemma2.py @@ -247,7 +247,7 @@ def _gemma2_common( reset_attention_mask=False, reset_position_ids=False, eod_mask_loss=False, - sequence_length=seq_length, + seq_length=seq_length, num_dataset_builder_threads=1, blend=blend, blend_per_split=blend_per_split, diff --git a/tutorials/recipes/llama/00_quickstart_pretrain.py b/tutorials/recipes/llama/00_quickstart_pretrain.py new file mode 100644 index 000000000..245cca0be --- /dev/null +++ b/tutorials/recipes/llama/00_quickstart_pretrain.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Quickstart: Pretrain Llama 3.2 1B with Megatron Bridge
+
+Usage:
+    Single GPU:
+        torchrun --nproc_per_node=1 00_quickstart_pretrain.py
+
+    Multiple GPUs (automatic data parallelism):
+        torchrun --nproc_per_node=8 00_quickstart_pretrain.py
+
+The script uses sensible defaults and mock data for quick testing.
+For custom configurations through YAML and Hydra-style overrides, see 02_pretrain_with_yaml.py.
+For multi-node training, see launch_with_sbatch.sh or 04_launch_slurm_with_nemo_run.py.
+"""
+
+from megatron.bridge.recipes.llama import llama32_1b_pretrain_config
+from megatron.bridge.training.gpt_step import forward_step
+from megatron.bridge.training.pretrain import pretrain
+
+
+def main() -> None:
+    """Run Llama 3.2 1B pretraining with default configuration."""
+
+    # Load the base recipe configuration
+    # Llama 3.2 1B works on a single GPU (TP=1, PP=1, CP=1)
+    config = llama32_1b_pretrain_config()
+
+    # The quick-test settings below are active so the script finishes fast;
+    # the remaining settings are optional: uncomment and modify as needed.
+
+    # For a quick test run:
+    config.train.train_iters = 10
+    config.scheduler.lr_warmup_iters = 2
+
+    # Use your own data:
+    # config.dataset.data_path = "/path/to/your/dataset"
+
+    # Adjust batch sizes for your GPU memory:
+    # config.train.global_batch_size = 256
+    # config.train.micro_batch_size = 2
+
+    # Change checkpoint save frequency:
+    # config.checkpoint.save_interval = 500
+
+    # Start pretraining
+    pretrain(config=config, forward_step_func=forward_step)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tutorials/recipes/llama/01_quickstart_finetune.py b/tutorials/recipes/llama/01_quickstart_finetune.py
new file mode 100644
index 000000000..6a8af8e80
--- /dev/null
+++ b/tutorials/recipes/llama/01_quickstart_finetune.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Quickstart: Finetune Llama 3.2 1B with Megatron Bridge
+
+Usage:
+    Single GPU with LoRA:
+        torchrun --nproc_per_node=1 01_quickstart_finetune.py \
+            --pretrained-checkpoint /path/to/megatron/checkpoint
+
+    Multiple GPUs (automatic data parallelism):
+        torchrun --nproc_per_node=8 01_quickstart_finetune.py \
+            --pretrained-checkpoint /path/to/megatron/checkpoint
+
+Prerequisites:
+    You need a checkpoint in Megatron format. You can either:
+    1. Convert HF checkpoint to Megatron format:
+        python examples/conversion/convert_checkpoints.py import \
+            --hf-model meta-llama/Llama-3.2-1B \
+            --megatron-path ./checkpoints/llama32_1b
+    2. Use a checkpoint from pretraining (see 00_quickstart_pretrain.py)
+
+The script uses the SQuAD dataset by default. See inline comments for:
+- Using your own dataset
+- Adjusting LoRA hyperparameters
+- Switching to full supervised finetuning
+
+For YAML configuration, see 03_finetune_with_yaml.py.
+For multi-node training, see launch_with_sbatch.sh or 04_launch_slurm_with_nemo_run.py.
+"""
+
+import argparse
+
+from megatron.bridge.recipes.llama import llama32_1b_finetune_config
+from megatron.bridge.training.finetune import finetune
+from megatron.bridge.training.gpt_step import forward_step
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse command-line arguments."""
+    parser = argparse.ArgumentParser(
+        description="Finetune Llama 3.2 1B with LoRA",
+        formatter_class=argparse.RawTextHelpFormatter,
+    )
+    parser.add_argument(
+        "--pretrained-checkpoint",
+        type=str,
+        required=True,
+        help="Path to pretrained checkpoint in Megatron format",
+    )
+    return parser.parse_args()
+
+
+def main() -> None:
+    """Run Llama 3.2 1B finetuning with LoRA."""
+    args = parse_args()
+
+    # Load the base finetune configuration
+    # Uses LoRA for efficient finetuning on a single GPU
+    config = llama32_1b_finetune_config()
+
+    # Load from the pretrained checkpoint
+    config.checkpoint.pretrained_checkpoint = args.pretrained_checkpoint
+
+    # === Quick test run ===
+    config.train.train_iters = 10
+    config.scheduler.lr_warmup_iters = 2
+
+    # ===== OPTIONAL CUSTOMIZATIONS =====
+    # Uncomment and modify as needed:
+
+    # === Use your own dataset ===
+    # Replace SQuAD with your custom dataset
+    # Option 1: Simple path override
+    # config.dataset.dataset_root = "/path/to/your/dataset"
+
+    # Option 2: Replace the dataset with FinetuningDatasetConfig for JSONL data
+    # from megatron.bridge.training.config import FinetuningDatasetConfig
+    # config.dataset = FinetuningDatasetConfig(
+    #     dataset_root="/path/to/your/dataset_dir",  # expects training/validation/test jsonl files
+    #     seq_length=config.model.seq_length,
+    # )
+
+    # === Adjust learning rate ===
+    # config.optimizer.lr = 5e-5
+
+    # === Change checkpoint save frequency ===
+    # config.checkpoint.save_interval = 100
+
+    # === Adjust LoRA hyperparameters ===
+    # Higher rank = more trainable parameters, potentially better quality but slower
+    # config.peft.dim = 16    # LoRA rank
+    # config.peft.alpha = 32  # LoRA alpha scaling
+
+    # === Full supervised finetuning (no LoRA) ===
+    # For full finetuning, reload config with peft=None:
+    # config = llama32_1b_finetune_config(peft=None)
+    # config.checkpoint.pretrained_checkpoint = args.pretrained_checkpoint
+    # Note: Full finetuning uses more memory than LoRA
+    # The recipe automatically adjusts parallelism for full SFT
+
+    # Start finetuning
+    finetune(config=config, forward_step_func=forward_step)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tutorials/recipes/llama/02_pretrain_with_yaml.py b/tutorials/recipes/llama/02_pretrain_with_yaml.py
new file mode 100644
index 000000000..999922324
--- /dev/null
+++ b/tutorials/recipes/llama/02_pretrain_with_yaml.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Pretrain with YAML Configuration and CLI Overrides
+
+This script demonstrates how to use YAML configuration files and command-line
+overrides for more complex configuration management.
+
+Usage:
+    With base recipe defaults (no YAML file):
+        torchrun --nproc_per_node=8 02_pretrain_with_yaml.py
+
+    With custom config file:
+        torchrun --nproc_per_node=2 02_pretrain_with_yaml.py \
+            --config-file conf/my_custom_config.yaml
+
+    With command-line overrides:
+        torchrun --nproc_per_node=2 02_pretrain_with_yaml.py \
+            train.train_iters=5000 \
+            train.global_batch_size=256
+
+    Combining YAML and CLI (CLI takes precedence):
+        torchrun --nproc_per_node=2 02_pretrain_with_yaml.py \
+            --config-file conf/llama32_1b_pretrain.yaml \
+            train.train_iters=10000
+
+Configuration Priority (highest to lowest):
+    1. Command-line overrides (highest)
+    2. YAML config file
+    3. Base recipe defaults (lowest)
+
+See conf/ directory for example YAML configurations.
+For pure Python usage, see 00_quickstart_pretrain.py.
+"""
+
+import argparse
+import logging
+import sys
+from pathlib import Path
+from typing import Tuple
+
+from omegaconf import OmegaConf
+
+from megatron.bridge.recipes.llama import llama32_1b_pretrain_config
+from megatron.bridge.training.config import ConfigContainer
+from megatron.bridge.training.gpt_step import forward_step
+from megatron.bridge.training.pretrain import pretrain
+from megatron.bridge.training.utils.omegaconf_utils import (
+    apply_overrides,
+    create_omegaconf_dict_config,
+    parse_hydra_overrides,
+)
+
+
+logger = logging.getLogger(__name__)
+
+# Example config file location
+SCRIPT_DIR = Path(__file__).parent.resolve()
+DEFAULT_CONFIG_FILE = SCRIPT_DIR / "conf" / "llama32_1b_pretrain.yaml"
+
+
+def parse_args() -> Tuple[argparse.Namespace, list[str]]:
+    """Parse command-line arguments."""
+    parser = argparse.ArgumentParser(
+        description="Pretrain with YAML configuration and CLI overrides",
+        formatter_class=argparse.RawTextHelpFormatter,
+    )
+    parser.add_argument(
+        "--config-file",
+        type=str,
+        default=None,
+        help=f"Path to YAML config file (optional), e.g. {DEFAULT_CONFIG_FILE}",
+    )
+    parser.add_argument("--debug", action="store_true", help="Enable debug logging")
+
+    # Separate known args from CLI overrides
+    args, cli_overrides = parser.parse_known_args()
+    return args, cli_overrides
+
+
+def main() -> None:
+    """Run pretraining with YAML configuration and CLI overrides."""
+    args, cli_overrides = parse_args()
+    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
+
+    # Load base configuration from recipe
+    config: ConfigContainer = llama32_1b_pretrain_config()
+
+    # Convert to OmegaConf for merging
+    omega_conf, excluded_fields = create_omegaconf_dict_config(config)
+
+    # Apply YAML overrides if provided
+    if args.config_file:
+        config_file_path = Path(args.config_file)
+        if not config_file_path.exists():
+            logger.error(f"Config file not found: {config_file_path}")
+            sys.exit(1)
+
+        yaml_conf = OmegaConf.load(config_file_path)
+        omega_conf = OmegaConf.merge(omega_conf, yaml_conf)
+
+    # Apply command-line overrides
+    if cli_overrides:
+        omega_conf = parse_hydra_overrides(omega_conf, cli_overrides)
+
+    # Convert back to ConfigContainer
+    final_config_dict = OmegaConf.to_container(omega_conf, resolve=True)
+    apply_overrides(config, final_config_dict, excluded_fields)
+
+    # Start pretraining
+    pretrain(config=config, forward_step_func=forward_step)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tutorials/recipes/llama/03_finetune_with_yaml.py b/tutorials/recipes/llama/03_finetune_with_yaml.py
new file mode 100644
index 000000000..ead33540f
--- /dev/null
+++ b/tutorials/recipes/llama/03_finetune_with_yaml.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Finetune with YAML Configuration and CLI Overrides
+
+This script demonstrates how to use YAML configuration files and command-line
+overrides for finetuning with LoRA or full supervised finetuning (SFT).
+
+Usage:
+    With base recipe defaults (no YAML file):
+        torchrun --nproc_per_node=1 03_finetune_with_yaml.py
+
+    With custom config file:
+        torchrun --nproc_per_node=2 03_finetune_with_yaml.py \
+            --config-file conf/my_finetune_config.yaml
+
+    With command-line overrides:
+        torchrun --nproc_per_node=2 03_finetune_with_yaml.py \
+            train.train_iters=1000 \
+            optimizer.lr=5e-5
+
+    Full finetuning instead of LoRA:
+        torchrun --nproc_per_node=2 03_finetune_with_yaml.py \
+            --peft none \
+            train.train_iters=1000
+
+    Combining YAML and CLI (CLI takes precedence):
+        torchrun --nproc_per_node=2 03_finetune_with_yaml.py \
+            --config-file conf/llama32_1b_finetune.yaml \
+            peft.dim=16 \
+            train.train_iters=2000
+
+Configuration Priority (highest to lowest):
+    1. Command-line overrides (highest)
+    2. YAML config file
+    3. Base recipe defaults (lowest)
+
+See conf/ directory for example YAML configurations.
+For pure Python usage, see 01_quickstart_finetune.py.
+""" + +import argparse +import logging +import sys +from pathlib import Path +from typing import Tuple + +from omegaconf import OmegaConf + +from megatron.bridge.recipes.llama import llama32_1b_finetune_config +from megatron.bridge.training.config import ConfigContainer +from megatron.bridge.training.finetune import finetune +from megatron.bridge.training.gpt_step import forward_step +from megatron.bridge.training.utils.omegaconf_utils import ( + apply_overrides, + create_omegaconf_dict_config, + parse_hydra_overrides, +) + + +logger = logging.getLogger(__name__) + +# Default config file location +SCRIPT_DIR = Path(__file__).parent.resolve() +DEFAULT_CONFIG_FILE = SCRIPT_DIR / "conf" / "llama32_1b_finetune.yaml" + + +def parse_args() -> Tuple[argparse.Namespace, list[str]]: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Finetune with YAML configuration and CLI overrides", + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.add_argument( + "--config-file", + type=str, + default=None, + help=f"Path to YAML config file (optional). Default: {DEFAULT_CONFIG_FILE}", + ) + parser.add_argument( + "--peft", + type=str, + default="lora", + choices=["lora", "dora", "none"], + help="PEFT method to use. Use 'none' for full finetuning.", + ) + parser.add_argument("--debug", action="store_true", help="Enable debug logging") + + # Separate known args from CLI overrides + args, cli_overrides = parser.parse_known_args() + return args, cli_overrides + + +def main() -> None: + """Run finetuning with YAML configuration and CLI overrides.""" + args, cli_overrides = parse_args() + + # Load base configuration from recipe + peft_method = None if args.peft == "none" else args.peft + config: ConfigContainer = llama32_1b_finetune_config(peft=peft_method) + + # Convert to OmegaConf for merging + omega_conf, excluded_fields = create_omegaconf_dict_config(config) + + # Apply YAML overrides if provided + if args.config_file: + config_file_path = Path(args.config_file) + if not config_file_path.exists(): + logger.error(f"Config file not found: {config_file_path}") + sys.exit(1) + + yaml_conf = OmegaConf.load(config_file_path) + omega_conf = OmegaConf.merge(omega_conf, yaml_conf) + + # Apply command-line overrides + if cli_overrides: + omega_conf = parse_hydra_overrides(omega_conf, cli_overrides) + + # Convert back to ConfigContainer + final_config_dict = OmegaConf.to_container(omega_conf, resolve=True) + apply_overrides(config, final_config_dict, excluded_fields) + + # Start finetuning + finetune(config=config, forward_step_func=forward_step) + + +if __name__ == "__main__": + main() diff --git a/tutorials/recipes/llama/04_launch_slurm_with_nemo_run.py b/tutorials/recipes/llama/04_launch_slurm_with_nemo_run.py new file mode 100644 index 000000000..fef063e7a --- /dev/null +++ b/tutorials/recipes/llama/04_launch_slurm_with_nemo_run.py @@ -0,0 +1,298 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Launch Training on Slurm with NeMo-Run + +This script demonstrates how to launch training scripts (pretrain or finetune) +on a Slurm cluster using NeMo-Run. This enables easy multi-node training with +proper job management. + +Prerequisites: Install nemo-run + +Usage: + # From the Slurm cluster (uses LocalTunnel) + python 04_launch_slurm_with_nemo_run.py \ + --script 00_quickstart_pretrain.py \ + --nodes 2 \ + --partition gpu \ + --account my_account + + # From your local machine (uses SSHTunnel) + python 04_launch_slurm_with_nemo_run.py \ + --script 00_quickstart_pretrain.py \ + --nodes 2 \ + --partition gpu \ + --account my_account \ + --ssh-tunnel \ + --host my-cluster.example.com \ + --user myusername \ + --remote-job-dir /home/myusername/nemo-runs + + # With custom SSH key + python 04_launch_slurm_with_nemo_run.py \ + --script 00_quickstart_pretrain.py \ + --nodes 2 \ + --partition gpu \ + --account my_account \ + --ssh-tunnel \ + --host my-cluster.example.com \ + --user myusername \ + --remote-job-dir /home/myusername/nemo-runs \ + --identity ~/.ssh/id_rsa + + # Launch with custom config (pass arguments to training script) + python 04_launch_slurm_with_nemo_run.py \ + --script 03_finetune_with_yaml.py \ + --nodes 1 \ + --partition gpu \ + --account my_account \ + --config-file conf/llama32_1b_finetune.yaml + + # Pass CLI overrides to training script + python 04_launch_slurm_with_nemo_run.py \ + --script 02_pretrain_with_yaml.py \ + --nodes 2 \ + --partition gpu \ + --account my_account \ + train.train_iters=5000 \ + optimizer.lr=0.0002 + + # With container and custom mounts + python 04_launch_slurm_with_nemo_run.py \ + --script 00_quickstart_pretrain.py \ + --nodes 2 \ + --partition gpu \ + --account my_account \ + --container-image /path/to/container.sqsh \ + --mount /data:/data + + # Wait for job completion and tail logs + python 04_launch_slurm_with_nemo_run.py \ + --script 00_quickstart_pretrain.py \ + --nodes 2 \ + --partition gpu \ + --account my_account \ + --no-detach \ + --tail-logs + +Note: +- Use --ssh-tunnel when launching from your local machine +- Omit --ssh-tunnel when already on the Slurm cluster (uses LocalTunnel) +- By default, jobs are submitted and detached (--detach) +- Use --no-detach --tail-logs to wait and monitor job output +- Any unknown arguments are forwarded to the training script +- Adjust cluster-specific settings (account, partition, container paths) +""" + +import argparse +import logging +from pathlib import Path + +import nemo_run as run + + +logger = logging.getLogger(__name__) + +SCRIPT_DIR = Path(__file__).parent.resolve() + + +def parse_args() -> tuple[argparse.Namespace, list[str]]: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Launch training (pretrain/finetune) on Slurm using NeMo-Run", + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.add_argument( + "--script", + type=str, + required=True, + help="Training script to run (e.g., 00_quickstart_pretrain.py, 01_quickstart_finetune.py)", + ) + parser.add_argument( + "--nodes", + type=int, + default=1, + help="Number of nodes to use", + ) + parser.add_argument( + "--devices", + type=int, + default=8, + help="GPUs per node", + ) + parser.add_argument( + "--partition", + type=str, + required=True, + help="Slurm partition name", + ) + parser.add_argument( + "--account", + type=str, + required=True, + help="Slurm account name", + ) + 
parser.add_argument(
+        "--time",
+        type=str,
+        default="04:00:00",
+        help="Job time limit",
+    )
+    parser.add_argument(
+        "--ssh-tunnel",
+        action="store_true",
+        help="Use SSH tunnel (for launching from local machine). Requires --host, --user, --remote-job-dir",
+    )
+    parser.add_argument(
+        "--host",
+        type=str,
+        help="SSH host for tunnel (required if --ssh-tunnel is set)",
+    )
+    parser.add_argument(
+        "--user",
+        type=str,
+        help="SSH user for tunnel (required if --ssh-tunnel is set)",
+    )
+    parser.add_argument(
+        "--remote-job-dir",
+        type=str,
+        help="Remote directory to store job files (required if --ssh-tunnel is set)",
+    )
+    parser.add_argument(
+        "--identity",
+        type=str,
+        default=None,
+        help="Path to SSH private key for authentication",
+    )
+    parser.add_argument(
+        "--container-image",
+        type=str,
+        default=None,
+        help="Container image path",
+    )
+    parser.add_argument(
+        "--mount",
+        type=str,
+        action="append",
+        default=[],
+        help="Container mounts in format host:container (can be specified multiple times)",
+    )
+    parser.add_argument(
+        "--experiment-name",
+        type=str,
+        default="megatron_bridge_training",
+        help="Name for the experiment",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Print what would be executed without submitting the job",
+    )
+    parser.add_argument(
+        "--detach",
+        action=argparse.BooleanOptionalAction,
+        default=True,
+        help="Detach from the experiment after submission (pass --no-detach to wait)",
+    )
+    parser.add_argument(
+        "--tail-logs",
+        action="store_true",
+        help="Tail logs after submission (only works with --no-detach)",
+    )
+
+    # Use parse_known_args to capture forwarded arguments for the training script
+    args, forwarded_args = parser.parse_known_args()
+    return args, forwarded_args
+
+
+def main() -> None:
+    """Launch training (pretrain/finetune) using NeMo-Run SlurmExecutor."""
+    args, forwarded_args = parse_args()
+
+    # Validate SSH tunnel arguments
+    if args.ssh_tunnel:
+        if not all([args.host, args.user, args.remote_job_dir]):
+            raise ValueError("--ssh-tunnel requires --host, --user, and --remote-job-dir to be specified")
+
+    # Resolve script path
+    script_path = SCRIPT_DIR / args.script
+    if not script_path.exists():
+        raise FileNotFoundError(f"Training script not found: {script_path}")
+
+    # Build arguments for the training script from forwarded args
+    script_args = forwarded_args if forwarded_args else []
+
+    # Create the training task
+    task = run.Script(
+        path=str(script_path),
+        entrypoint="python",
+        args=script_args,
+    )
+
+    # Configure tunnel (SSH for remote, Local if already on cluster)
+    if args.ssh_tunnel:
+        tunnel = run.SSHTunnel(
+            host=args.host,
+            user=args.user,
+            job_dir=args.remote_job_dir,
+            identity=args.identity,
+        )
+        logger.info(f"Using SSH tunnel to {args.user}@{args.host}")
+    else:
+        tunnel = run.LocalTunnel()
+        logger.info("Using LocalTunnel (running on cluster)")
+
+    # Create the Slurm executor
+    executor = run.SlurmExecutor(
+        account=args.account,
+        partition=args.partition,
+        nodes=args.nodes,
+        ntasks_per_node=args.devices,
+        gpus_per_node=args.devices,
+        mem="0",
+        exclusive=True,
+        time=args.time,
+        tunnel=tunnel,
+    )
+
+    # Configure container if specified
+    if args.container_image:
+        executor.container_image = args.container_image
+
+    # Configure mounts if specified
+    if args.mount:
+        executor.container_mounts = args.mount
+
+    # Run the experiment
+    with run.Experiment(args.experiment_name) as exp:
+        exp.add(task, executor=executor, name="training")
+
+        if args.dry_run:
+            exp.dryrun()
+        else:
+            exp.run(detach=args.detach, tail_logs=args.tail_logs)
+
+    if args.dry_run:
+        logger.info("Dry run complete; no job was submitted.")
+    elif args.detach:
+        logger.info("Job submitted to Slurm!")
+        logger.info("Use 'squeue' to check job status")
+    else:
+        logger.info("Job completed!")
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO, format="%(message)s")
+    main()
diff --git a/tutorials/recipes/llama/README.md b/tutorials/recipes/llama/README.md
new file mode 100644
index 000000000..b5306707d
--- /dev/null
+++ b/tutorials/recipes/llama/README.md
@@ -0,0 +1,219 @@
+# Recipes with Megatron Bridge
+
+This guide shows you how to pretrain and finetune Llama models using Megatron Bridge.
+
+## Quickstart
+
+The fastest way to get started with Megatron Bridge pretraining:
+
+```bash
+torchrun --nproc_per_node=1 00_quickstart_pretrain.py
+```
+
+This runs Llama 3.2 1B pretraining on a single GPU with mock data.
+
+For finetuning, you first need a checkpoint in Megatron format. Convert from HuggingFace using the `AutoBridge`-based conversion script:
+
+> **Note:** You must be authenticated with Hugging Face to download the model. Run `hf auth login --token $HF_TOKEN` if needed.
+
+```bash
+python ../../conversion/convert_checkpoints.py import \
+    --hf-model meta-llama/Llama-3.2-1B \
+    --megatron-path ./checkpoints/llama32_1b
+```
+
+Then run finetuning:
+
+```bash
+torchrun --nproc_per_node=1 01_quickstart_finetune.py \
+    --pretrained-checkpoint ./checkpoints/llama32_1b
+```
+
+The [01_quickstart_finetune.py](01_quickstart_finetune.py) recipe finetunes Llama 3.2 1B using LoRA on the SQuAD dataset by default.
+
+To plug in your own JSONL dataset, swap the dataset config in that script:
+
+```python
+from megatron.bridge.training.config import FinetuningDatasetConfig
+
+config.dataset = FinetuningDatasetConfig(
+    dataset_root="/path/to/dataset_dir",  # contains training/validation/test jsonl files
+    seq_length=config.model.seq_length,
+)
+```
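+
+The exact JSONL schema is defined by the finetuning dataset class; a minimal `training.jsonl` sketch, assuming the common `input`/`output` field convention used by NeMo-style SFT datasets (treat the field names as an assumption and verify them against your Megatron Bridge version):
+
+```json
+{"input": "Question: What is the capital of France?", "output": "Paris"}
+{"input": "Question: Who wrote Hamlet?", "output": "William Shakespeare"}
+```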
+
+## Configuration
+
+Megatron Bridge recipes are standard Python scripts, giving you full flexibility in how you configure your training. You can:
+1. Modify the Python scripts directly
+2. Use the framework's YAML-based configuration system
+3. Implement your own configuration management (ArgParse, Hydra, etc.)
+
+### Using Framework YAML Configs
+
+The recipes include optional support for YAML configuration and dot-notation overrides via `ConfigContainer`. This is just one way to manage config; you are free to use other methods.
+
+To use the provided YAML system:
+
+```bash
+torchrun --nproc_per_node=2 02_pretrain_with_yaml.py \
+    --config-file conf/llama32_1b_pretrain.yaml
+```
+
+Understanding the YAML Structure:
+
+YAML files mirror the `ConfigContainer` structure. Each top-level key corresponds to a configuration section (e.g., `dataset`, `train`, `model`, `optimizer`).
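+
+Because each section maps onto a field of the `ConfigContainer` dataclass, the same overrides can also be written in plain Python. A minimal sketch mirroring `00_quickstart_pretrain.py` (field names are taken from the YAML examples in this guide; double-check them against your `ConfigContainer` version):
+
+```python
+from megatron.bridge.recipes.llama import llama32_1b_pretrain_config
+
+config = llama32_1b_pretrain_config()   # base recipe defaults
+config.train.train_iters = 100          # "train:" section
+config.train.global_batch_size = 256
+config.model.seq_length = 4096          # "model:" section
+config.dataset.seq_length = 4096        # must match model.seq_length
+config.optimizer.lr = 0.0003            # "optimizer:" section
+config.checkpoint.save = "./checkpoints/llama32_1b"
+```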
+
+Example YAML (`conf/llama32_1b_pretrain.yaml`):
+
+```yaml
+# Each section maps to a ConfigContainer field
+dataset:  # GPTDatasetConfig
+  data_path: /path/to/training/data
+  seq_length: 4096
+
+train:  # TrainingConfig
+  train_iters: 100
+  global_batch_size: 256
+
+checkpoint:  # CheckpointConfig
+  save: ./checkpoints/llama32_1b
+  save_interval: 50
+
+model:  # Model Provider
+  seq_length: 4096  # Must match dataset.seq_length
+  tensor_model_parallel_size: 1
+
+optimizer:  # OptimizerConfig
+  lr: 0.0003
+```
+
+Command-Line Overrides:
+
+You can override values using dot notation (`section.field=value`):
+
+```bash
+torchrun --nproc_per_node=2 02_pretrain_with_yaml.py \
+    --config-file conf/llama32_1b_pretrain.yaml \
+    train.train_iters=5000 \
+    train.global_batch_size=512 \
+    optimizer.lr=0.0002
+```
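+
+The override parser in these scripts is Hydra-style (`parse_hydra_overrides`). The original example script additionally documented addition and deletion syntax; a sketch, assuming those semantics are unchanged in your version:
+
+```bash
+# key=value        standard assignment (typed values such as torch.float16 are converted)
+# +new_key=value   add a key not present in the base config
+# ~key_to_remove   delete a key from the config
+torchrun --nproc_per_node=2 02_pretrain_with_yaml.py \
+    model.pipeline_dtype=torch.float16 \
+    ~checkpoint.save
+```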
+
+Priority order (highest to lowest):
+1. Command-line overrides
+2. YAML config file
+3. Base recipe defaults
+
+### Finetuning Configuration
+
+For more complex finetuning configurations:
+
+```bash
+torchrun --nproc_per_node=2 03_finetune_with_yaml.py \
+    --config-file conf/llama32_1b_finetune.yaml
+```
+
+Example YAML (`conf/llama32_1b_finetune.yaml`):
+
+```yaml
+# Each section maps to a ConfigContainer field
+dataset:  # FinetuningDatasetConfig
+  dataset_root: /path/to/finetuning_dataset_dir
+  seq_length: 4096
+
+train:  # TrainingConfig
+  train_iters: 100
+  global_batch_size: 128
+
+checkpoint:  # CheckpointConfig
+  pretrained_checkpoint: /path/to/pretrained/checkpoint
+  save: ./checkpoints/llama32_1b_finetuned
+  save_interval: 50
+
+peft:  # PEFT (LoRA config)
+  dim: 8     # LoRA rank
+  alpha: 16  # LoRA alpha
+
+model:  # Model Provider
+  seq_length: 4096  # Must match dataset.seq_length
+
+optimizer:  # OptimizerConfig
+  lr: 0.0001
+```
+
+Full Finetuning (No LoRA):
+
+```bash
+torchrun --nproc_per_node=2 03_finetune_with_yaml.py \
+    --peft none \
+    train.train_iters=1000
+```
+
+## Multi-Node Training
+
+### Direct Slurm with sbatch
+
+For traditional HPC workflows without NeMo-Run:
+
+```bash
+# 1. Configure launch_with_sbatch.sh
+#    Edit SBATCH directives and script variables at the top
+
+# 2. Submit job
+sbatch launch_with_sbatch.sh
+```
+
+The `launch_with_sbatch.sh` script shows how to:
+- Configure Slurm job parameters
+- Set up multi-node torchrun
+- Use containers (optional)
+- Pass arguments to training scripts
+
+### NeMo-Run
+
+For job management and remote launching capabilities:
+
+Prerequisites:
+
+```bash
+pip install nemo-run
+```
+
+From the Slurm cluster login node:
+
+```bash
+python 04_launch_slurm_with_nemo_run.py \
+    --script 00_quickstart_pretrain.py \
+    --nodes 2 \
+    --devices 8 \
+    --partition gpu \
+    --account my_account
+```
+
+From your local machine (SSHTunnel):
+
+```bash
+python 04_launch_slurm_with_nemo_run.py \
+    --script 00_quickstart_pretrain.py \
+    --nodes 2 \
+    --devices 8 \
+    --partition gpu \
+    --account my_account \
+    --ssh-tunnel \
+    --host my-cluster.example.com \
+    --user myusername \
+    --remote-job-dir /home/myusername/nemo-runs
+```
+
+With custom config:
+
+```bash
+python 04_launch_slurm_with_nemo_run.py \
+    --script 03_finetune_with_yaml.py \
+    --nodes 1 \
+    --devices 8 \
+    --partition gpu \
+    --account my_account \
+    --config-file conf/llama32_1b_finetune.yaml
+```
diff --git a/tutorials/recipes/llama/conf/llama32_1b_finetune.yaml b/tutorials/recipes/llama/conf/llama32_1b_finetune.yaml
new file mode 100644
index 000000000..02c3897a6
--- /dev/null
+++ b/tutorials/recipes/llama/conf/llama32_1b_finetune.yaml
@@ -0,0 +1,78 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Example YAML configuration for Llama 3.2 1B finetuning with LoRA
+# This file demonstrates commonly customized settings.
+# Modify values as needed for your use case.
+
+# Data configuration
+dataset:
+  # Point dataset_root at a directory containing training/validation/test JSONL files
+  # dataset_root: /path/to/your/dataset_dir
+  seq_length: 4096
+
+# Training configuration
+train:
+  train_iters: 100
+  global_batch_size: 128
+  micro_batch_size: 2
+  eval_iters: 10
+  eval_interval: 50
+
+# Optimizer configuration
+optimizer:
+  lr: 0.0001
+  min_lr: 0.00001
+  weight_decay: 0.0
+
+# Learning rate scheduler
+scheduler:
+  lr_warmup_iters: 10
+  lr_decay_style: cosine
+
+# Checkpoint configuration
+checkpoint:
+  # Directory to save finetuned checkpoints
+  save: ./checkpoints/llama32_1b_finetuned
+  # Directory to resume from during training
+  load: ./checkpoints/llama32_1b_finetuned
+  # Directory for pretrained weights in Megatron format
+  pretrained_checkpoint: ./path/to/pretrained/checkpoint/
+  save_interval: 50
+
+# LoRA configuration
+peft:
+  dim: 8     # LoRA rank
+  alpha: 16  # LoRA alpha scaling
+
+# Model configuration
+# Note: seq_length must match dataset.seq_length
+model:
+  seq_length: 4096
+  tensor_model_parallel_size: 1
+  pipeline_model_parallel_size: 1
+  context_parallel_size: 1
+
+# Logging
+logger:
+  log_interval: 10
+  tensorboard_dir: ./logs/llama32_1b_finetuned
+  # wandb_project: my_finetune_project
+  # wandb_entity: my_team
+
+# Random seed
+rng:
+  seed: 1234
diff --git a/tutorials/recipes/llama/conf/llama32_1b_pretrain.yaml b/tutorials/recipes/llama/conf/llama32_1b_pretrain.yaml
new file mode 100644
index 000000000..9b9e8bf1d
--- /dev/null
+++ b/tutorials/recipes/llama/conf/llama32_1b_pretrain.yaml
@@ -0,0 +1,68 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Example YAML configuration for Llama 3.2 1B pretraining
+# This file demonstrates commonly customized settings.
+# Modify values as needed for your use case.
+
+# Data configuration
+dataset:
+  # Replace with your dataset path
+  # data_path: /path/to/your/dataset
+  seq_length: 4096
+
+# Training configuration
+train:
+  train_iters: 100
+  global_batch_size: 256
+  micro_batch_size: 2
+  eval_iters: 10
+  eval_interval: 50
+
+# Optimizer configuration
+optimizer:
+  lr: 0.0003
+  min_lr: 0.00003
+  weight_decay: 0.1
+
+# Learning rate scheduler
+scheduler:
+  lr_warmup_iters: 20
+  lr_decay_style: cosine
+
+# Checkpoint configuration
+checkpoint:
+  # Directory to save checkpoints
+  save: ./checkpoints/llama32_1b
+  load: ./checkpoints/llama32_1b
+  save_interval: 50
+
+# Model configuration
+# Note: seq_length must match dataset.seq_length
+model:
+  seq_length: 4096
+  tensor_model_parallel_size: 1
+  pipeline_model_parallel_size: 1
+  context_parallel_size: 1
+
+# Logging
+logger:
+  log_interval: 10
+  tensorboard_dir: ./logs/llama32_1b
+  # wandb_project: my_project  # Uncomment to enable W&B logging
+  # wandb_entity: my_team
+
+# Random seed
+rng:
+  seed: 1234
diff --git a/tutorials/recipes/llama/launch_with_sbatch.sh b/tutorials/recipes/llama/launch_with_sbatch.sh
new file mode 100644
index 000000000..e75690d27
--- /dev/null
+++ b/tutorials/recipes/llama/launch_with_sbatch.sh
@@ -0,0 +1,147 @@
+#!/bin/bash
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#SBATCH --job-name=megatron-bridge-train
+#SBATCH --nodes=2
+# One launcher task per node; torchrun spawns the per-GPU worker processes
+#SBATCH --ntasks-per-node=1
+#SBATCH --gpus-per-node=8
+#SBATCH --time=04:00:00
+#SBATCH --partition=gpu
+#SBATCH --account=my_account
+#SBATCH --output=logs/train_%j.out
+#SBATCH --error=logs/train_%j.err
+#SBATCH --exclusive
+
+# ==============================================================================
+# Direct Slurm Launch with sbatch
+#
+# This script demonstrates how to launch training directly using sbatch without
+# NeMo-Run. This is useful if you prefer traditional HPC workflows or don't want
+# to install additional dependencies.
+#
+# Usage:
+#   1. Modify the #SBATCH directives above for your cluster
+#   2. Set the TRAINING_SCRIPT and other variables below
+#   3.
Submit: sbatch launch_with_sbatch.sh +# +# For NeMo-Run based launching (recommended), see 04_launch_slurm_with_nemo_run.py +# ============================================================================== + +# ============================================================================== +# CONFIGURATION - Modify these for your setup +# ============================================================================== + +# Training script to run (choose one) +TRAINING_SCRIPT="00_quickstart_pretrain.py" +# TRAINING_SCRIPT="01_quickstart_finetune.py" +# TRAINING_SCRIPT="02_pretrain_with_yaml.py" +# TRAINING_SCRIPT="03_finetune_with_yaml.py" + +# Optional: YAML config file (for *_with_yaml.py scripts) +CONFIG_FILE="" +# CONFIG_FILE="conf/llama32_1b_pretrain.yaml" +# CONFIG_FILE="conf/llama32_1b_finetune.yaml" + +# Optional: Additional CLI overrides (for *_with_yaml.py scripts) +CLI_OVERRIDES="" +# CLI_OVERRIDES="train.train_iters=1000 train.global_batch_size=512" + +# Optional: For finetuning scripts, specify checkpoint path +PRETRAINED_CHECKPOINT="" +# PRETRAINED_CHECKPOINT="./checkpoints/llama32_1b" + +# Container image (optional, only if using containers) +CONTAINER_IMAGE="" +# CONTAINER_IMAGE="/path/to/container.sqsh" + +# Container mounts (optional, space-separated) +CONTAINER_MOUNTS="" +# CONTAINER_MOUNTS="/data:/data /model:/model" + +# ============================================================================== +# Environment Setup +# ============================================================================== + +# Set common environment variables +# Optional: Set these if needed +# export CUDA_DEVICE_MAX_CONNECTIONS=1 +# export NCCL_DEBUG=INFO + +# ============================================================================== +# Job Execution +# ============================================================================== + +echo "======================================" +echo "Megatron Bridge Training Job" +echo "======================================" +echo "Job ID: $SLURM_JOB_ID" +echo "Nodes: $SLURM_JOB_NUM_NODES" +echo "GPUs per node: $SLURM_GPUS_PER_NODE" +echo "Script: $TRAINING_SCRIPT" +echo "======================================" + +# Build the command +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SCRIPT_PATH="${SCRIPT_DIR}/${TRAINING_SCRIPT}" + +# Build torchrun command +CMD="torchrun" +CMD="$CMD --nproc_per_node=$SLURM_GPUS_PER_NODE" +CMD="$CMD --nnodes=$SLURM_JOB_NUM_NODES" +CMD="$CMD --node_rank=\$SLURM_PROCID" +CMD="$CMD --master_addr=\$(scontrol show hostname \$SLURM_NODELIST | head -n1)" +CMD="$CMD --master_port=29500" +CMD="$CMD $SCRIPT_PATH" + +# Add config file if specified +if [ -n "$CONFIG_FILE" ]; then + CMD="$CMD --config-file $CONFIG_FILE" +fi + +# Add pretrained checkpoint if specified (for finetuning) +if [ -n "$PRETRAINED_CHECKPOINT" ]; then + CMD="$CMD --pretrained-checkpoint $PRETRAINED_CHECKPOINT" +fi + +# Add CLI overrides if specified +if [ -n "$CLI_OVERRIDES" ]; then + CMD="$CMD $CLI_OVERRIDES" +fi + +echo "Executing: $CMD" +echo "======================================" + +# Execute with or without container +if [ -n "$CONTAINER_IMAGE" ]; then + # With container + SRUN_CMD="srun --container-image=$CONTAINER_IMAGE" + + # Add container mounts + if [ -n "$CONTAINER_MOUNTS" ]; then + for mount in $CONTAINER_MOUNTS; do + SRUN_CMD="$SRUN_CMD --container-mounts=$mount" + done + fi + + $SRUN_CMD bash -c "$CMD" +else + # Without container + srun bash -c "$CMD" +fi + +echo "======================================" +echo "Job completed" +echo 
"======================================" +