From 4c0a02c26d7b857c323db4a47f3dcc4cae94f4c5 Mon Sep 17 00:00:00 2001 From: Ananth Subramaniam Date: Wed, 12 Nov 2025 03:12:45 -0800 Subject: [PATCH 1/7] redo llama recipe examples Signed-off-by: Ananth Subramaniam --- .../recipes/llama/00_quickstart_finetune.py | 121 +++++++ .../recipes/llama/00_quickstart_pretrain.py | 68 ++++ .../recipes/llama/01_finetune_with_yaml.py | 141 ++++++++ .../recipes/llama/01_pretrain_with_yaml.py | 127 ++++++++ .../recipes/llama/02_launch_pretrain_local.py | 156 +++++++++ .../recipes/llama/03_launch_pretrain_slurm.py | 283 ++++++++++++++++ examples/recipes/llama/README.md | 308 ++++++++++++++++++ .../llama/conf/llama32_1b_finetune.yaml | 82 +++++ .../llama/conf/llama32_1b_pretrain.yaml | 71 ++++ .../llama3_8b_pretrain_override_example.yaml | 65 ---- ...etrain_override_example_megatron_fsdp.yaml | 68 ---- examples/recipes/llama/pretrain_llama3_8b.py | 184 ----------- .../pretrain_llama3_8b_nemo_run_script.py | 150 --------- 13 files changed, 1357 insertions(+), 467 deletions(-) create mode 100644 examples/recipes/llama/00_quickstart_finetune.py create mode 100644 examples/recipes/llama/00_quickstart_pretrain.py create mode 100644 examples/recipes/llama/01_finetune_with_yaml.py create mode 100644 examples/recipes/llama/01_pretrain_with_yaml.py create mode 100644 examples/recipes/llama/02_launch_pretrain_local.py create mode 100644 examples/recipes/llama/03_launch_pretrain_slurm.py create mode 100644 examples/recipes/llama/README.md create mode 100644 examples/recipes/llama/conf/llama32_1b_finetune.yaml create mode 100644 examples/recipes/llama/conf/llama32_1b_pretrain.yaml delete mode 100644 examples/recipes/llama/conf/llama3_8b_pretrain_override_example.yaml delete mode 100644 examples/recipes/llama/conf/llama3_8b_pretrain_override_example_megatron_fsdp.yaml delete mode 100644 examples/recipes/llama/pretrain_llama3_8b.py delete mode 100644 examples/recipes/llama/pretrain_llama3_8b_nemo_run_script.py diff --git a/examples/recipes/llama/00_quickstart_finetune.py b/examples/recipes/llama/00_quickstart_finetune.py new file mode 100644 index 000000000..5b8e4ee83 --- /dev/null +++ b/examples/recipes/llama/00_quickstart_finetune.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Quickstart: Finetune Llama 3.2 1B with Megatron-Bridge + +This is the simplest way to start finetuning with Megatron-Bridge. +By default, this uses LoRA (Low-Rank Adaptation) for efficient finetuning. + +Usage: + Single GPU with LoRA: + torchrun --nproc_per_node=1 00_quickstart_finetune.py \ + --pretrained-checkpoint /path/to/megatron/checkpoint + + Multiple GPUs (automatic data parallelism): + torchrun --nproc_per_node=8 00_quickstart_finetune.py \ + --pretrained-checkpoint /path/to/megatron/checkpoint + +Prerequisites: + You need a checkpoint in Megatron format. You can either: + 1. 
Convert HF checkpoint to Megatron format: + python examples/conversion/convert_checkpoints.py import \ + --hf-model meta-llama/Llama-3.2-1B \ + --megatron-path ./checkpoints/llama32_1b + 2. Use a checkpoint from pretraining (see 00_quickstart_pretrain.py) + +The script uses SQuAD dataset by default. See inline comments for: +- Using your own dataset +- Adjusting LoRA hyperparameters +- Switching to full supervised finetuning +""" + +import argparse + +from megatron.bridge.recipes.llama import llama32_1b_finetune_config +from megatron.bridge.training.finetune import finetune +from megatron.bridge.training.gpt_step import forward_step + + +def parse_args() -> argparse.Namespace: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Finetune Llama 3.2 1B with LoRA", + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.add_argument( + "--pretrained-checkpoint", + type=str, + required=True, + help="Path to pretrained checkpoint in Megatron format", + ) + return parser.parse_args() + + +def main() -> None: + """Run Llama 3.2 1B finetuning with LoRA.""" + args = parse_args() + + # Load the base finetune configuration + # By default: LoRA with rank=8, alpha=16, works on single GPU + config = llama32_1b_finetune_config() + + # Load from the pretrained checkpoint + config.checkpoint.pretrained_checkpoint = args.pretrained_checkpoint + + # === Quick test run (10 iterations) === + config.train.train_iters = 10 + config.scheduler.lr_warmup_iters = 2 + + # ===== OPTIONAL CUSTOMIZATIONS ===== + # Uncomment and modify as needed: + + # === Use your own dataset === + # Replace SQuAD with your custom dataset + # Option 1: Simple path override (uses default FinetuningDatasetConfig) + # config.data.data_path = "/path/to/your/dataset.jsonl" + + # Option 2: Use FinetuningDatasetConfig for custom JSONL datasets + # from megatron.bridge.training.data import FinetuningDatasetConfig + # config.data = FinetuningDatasetConfig(data_path="/path/to/your/dataset.jsonl") + + # Option 3: Use HFDatasetConfig for HuggingFace datasets + # from megatron.bridge.training.data import HFDatasetConfig + # config.data = HFDatasetConfig(hf_dataset="squad", split="train") + + # === Change learning rate === + # config.optimizer.lr = 5e-5 # Default for LoRA: 1e-4 + + # === Modify checkpoint frequency === + # config.train.save_interval = 100 + + # === Adjust LoRA hyperparameters === + # Higher rank = more parameters = potentially better quality but slower + # config.peft.dim = 16 # LoRA rank (default: 8) + # config.peft.alpha = 32 # LoRA alpha scaling (default: 16) + + # === Full supervised finetuning (no LoRA) === + # For full finetuning, reload config with peft=None: + # config = llama32_1b_finetune_config(peft=None) + # config.checkpoint.pretrained_checkpoint = args.pretrained_checkpoint + # Note: Full finetuning uses more memory than LoRA + # The recipe automatically adjusts parallelism for full SFT + + # Start finetuning + finetune(config=config, forward_step_func=forward_step) + + +if __name__ == "__main__": + main() diff --git a/examples/recipes/llama/00_quickstart_pretrain.py b/examples/recipes/llama/00_quickstart_pretrain.py new file mode 100644 index 000000000..5f11d2382 --- /dev/null +++ b/examples/recipes/llama/00_quickstart_pretrain.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Quickstart: Pretrain Llama 3.2 1B with Megatron Bridge + +This is the simplest way to start pretraining with Megatron-Bridge. +We use Llama 3.2 1B because it fits on a single GPU, making it easy to test. + +Usage: + Single GPU: + torchrun --nproc_per_node=1 00_quickstart_pretrain.py + + Multiple GPUs (automatic data parallelism): + torchrun --nproc_per_node=8 00_quickstart_pretrain.py + +The script uses sensible defaults and mock data for quick testing. +For custom configurations, see 01_pretrain_with_yaml.py +For multi-node training, see 02_launch_pretrain_slurm.py +For larger models (8B, 70B), see 01_pretrain_with_yaml.py +""" + +from megatron.bridge.recipes.llama import llama32_1b_pretrain_config +from megatron.bridge.training.gpt_step import forward_step +from megatron.bridge.training.pretrain import pretrain + + +def main() -> None: + """Run Llama 3.2 1B pretraining with default configuration.""" + + # Load the base recipe configuration + # Llama 3.2 1B works on a single GPU (TP=1, PP=1, CP=1) + config = llama32_1b_pretrain_config() + + # OPTIONAL: Customize key settings here + # Uncomment and modify as needed: + + # For a quick test run (10 iterations): + # config.train.train_iters = 10 + + # Use your own data (replace mock data): + # config.data.data_path = "/path/to/your/dataset" + + # Change batch sizes: + # config.train.global_batch_size = 256 + # config.train.micro_batch_size = 2 + + # Modify checkpoint frequency: + # config.train.save_interval = 500 + + # Start pretraining + pretrain(config=config, forward_step_func=forward_step) + + +if __name__ == "__main__": + main() diff --git a/examples/recipes/llama/01_finetune_with_yaml.py b/examples/recipes/llama/01_finetune_with_yaml.py new file mode 100644 index 000000000..c510c1b20 --- /dev/null +++ b/examples/recipes/llama/01_finetune_with_yaml.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Finetune with YAML Configuration and CLI Overrides + +This script demonstrates how to use YAML configuration files and command-line +overrides for finetuning with LoRA or full supervised finetuning (SFT). 
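+
+A YAML override file mirrors the ConfigContainer sections, e.g. (illustrative
+values; see conf/llama32_1b_finetune.yaml for a fuller example):
+
+    checkpoint:
+      pretrained_checkpoint: /path/to/megatron/checkpoint
+    peft:
+      dim: 16
+      alpha: 32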
+ +Usage: + With default config file: + torchrun --nproc_per_node=1 01_finetune_with_yaml.py + + With custom config file: + torchrun --nproc_per_node=2 01_finetune_with_yaml.py \ + --config-file conf/my_finetune_config.yaml + + With command-line overrides: + torchrun --nproc_per_node=2 01_finetune_with_yaml.py \ + train.train_iters=1000 \ + optimizer.lr=5e-5 + + Full finetuning instead of LoRA: + torchrun --nproc_per_node=2 01_finetune_with_yaml.py \ + --peft none \ + train.train_iters=1000 + + Combining YAML and CLI (CLI takes precedence): + torchrun --nproc_per_node=2 01_finetune_with_yaml.py \ + --config-file conf/llama32_1b_finetune.yaml \ + peft.dim=16 \ + train.train_iters=2000 + +Configuration Priority (highest to lowest): + 1. Command-line overrides (highest) + 2. YAML config file + 3. Base recipe defaults (lowest) + +See conf/ directory for example YAML configurations. +For a pure Python usage see 00_quickstart_finetune.py. +""" + +import argparse +import logging +import sys +from pathlib import Path +from typing import Tuple + +from omegaconf import OmegaConf + +from megatron.bridge.recipes.llama import llama32_1b_finetune_config +from megatron.bridge.training.config import ConfigContainer +from megatron.bridge.training.finetune import finetune +from megatron.bridge.training.gpt_step import forward_step +from megatron.bridge.training.utils.omegaconf_utils import ( + apply_overrides, + create_omegaconf_dict_config, + parse_hydra_overrides, +) + + +logger = logging.getLogger(__name__) + +# Default config file location +SCRIPT_DIR = Path(__file__).parent.resolve() +DEFAULT_CONFIG_FILE = SCRIPT_DIR / "conf" / "llama32_1b_finetune.yaml" + + +def parse_args() -> Tuple[argparse.Namespace, list[str]]: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Finetune with YAML configuration and CLI overrides", + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.add_argument( + "--config-file", + type=str, + default=None, + help=f"Path to YAML config file (optional). Default: {DEFAULT_CONFIG_FILE}", + ) + parser.add_argument( + "--peft", + type=str, + default="lora", + choices=["lora", "dora", "none"], + help="PEFT method to use (default: lora). 
Use 'none' for full finetuning.", + ) + parser.add_argument("--debug", action="store_true", help="Enable debug logging") + + # Separate known args from CLI overrides + args, cli_overrides = parser.parse_known_args() + return args, cli_overrides + + +def main() -> None: + """Run finetuning with YAML configuration and CLI overrides.""" + args, cli_overrides = parse_args() + + # Load base configuration from recipe + peft_method = None if args.peft == "none" else args.peft + config: ConfigContainer = llama32_1b_finetune_config(peft=peft_method) + + # Convert to OmegaConf for merging + omega_conf, excluded_fields = create_omegaconf_dict_config(config) + + # Apply YAML overrides if provided + if args.config_file: + config_file_path = Path(args.config_file) + if not config_file_path.exists(): + logger.error(f"Config file not found: {config_file_path}") + sys.exit(1) + + yaml_conf = OmegaConf.load(config_file_path) + omega_conf = OmegaConf.merge(omega_conf, yaml_conf) + + # Apply command-line overrides + if cli_overrides: + omega_conf = parse_hydra_overrides(omega_conf, cli_overrides) + + # Convert back to ConfigContainer + final_config_dict = OmegaConf.to_container(omega_conf, resolve=True) + apply_overrides(config, final_config_dict, excluded_fields) + + # Start finetuning + finetune(config=config, forward_step_func=forward_step) + + +if __name__ == "__main__": + main() diff --git a/examples/recipes/llama/01_pretrain_with_yaml.py b/examples/recipes/llama/01_pretrain_with_yaml.py new file mode 100644 index 000000000..4de837d25 --- /dev/null +++ b/examples/recipes/llama/01_pretrain_with_yaml.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Pretrain with YAML Configuration and CLI Overrides + +This script demonstrates how to use YAML configuration files and command-line +overrides for more complex configuration overrides. + +Usage: + With default config file: + torchrun --nproc_per_node=8 01_pretrain_with_yaml.py + + With custom config file: + torchrun --nproc_per_node=2 01_pretrain_with_yaml.py \ + --config-file conf/my_custom_config.yaml + + With command-line overrides: + torchrun --nproc_per_node=2 01_pretrain_with_yaml.py \ + train.train_iters=5000 \ + train.global_batch_size=256 + + Combining YAML and CLI (CLI takes precedence): + torchrun --nproc_per_node=2 01_pretrain_with_yaml.py \ + --config-file conf/llama32_1b_pretrain.yaml \ + train.train_iters=10000 + +Configuration Priority (highest to lowest): + 1. Command-line overrides (highest) + 2. YAML config file + 3. Base recipe defaults (lowest) + +See conf/ directory for example YAML configurations. +For a pure Python usage see 00_quickstart_pretrain.py. 
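+
+The YAML file uses the same nested sections as the ConfigContainer. A minimal
+sketch with illustrative values (conf/llama32_1b_pretrain.yaml shows the commonly
+tuned fields):
+
+    train:
+      train_iters: 2000
+      global_batch_size: 128
+    optimizer:
+      lr: 0.0002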
+""" + +import argparse +import logging +import sys +from pathlib import Path +from typing import Tuple + +from omegaconf import OmegaConf + +from megatron.bridge.recipes.llama import llama32_1b_pretrain_config +from megatron.bridge.training.config import ConfigContainer +from megatron.bridge.training.gpt_step import forward_step +from megatron.bridge.training.pretrain import pretrain +from megatron.bridge.training.utils.omegaconf_utils import ( + apply_overrides, + create_omegaconf_dict_config, + parse_hydra_overrides, +) + + +logger = logging.getLogger(__name__) + +# Default config file location +SCRIPT_DIR = Path(__file__).parent.resolve() +DEFAULT_CONFIG_FILE = SCRIPT_DIR / "conf" / "llama32_1b_pretrain.yaml" + + +def parse_args() -> Tuple[argparse.Namespace, list[str]]: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Pretrain with YAML configuration and CLI overrides", + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.add_argument( + "--config-file", + type=str, + default=None, + help=f"Path to YAML config file (optional). Default: {DEFAULT_CONFIG_FILE}", + ) + parser.add_argument("--debug", action="store_true", help="Enable debug logging") + + # Separate known args from CLI overrides + args, cli_overrides = parser.parse_known_args() + return args, cli_overrides + + +def main() -> None: + """Run pretraining with YAML configuration and CLI overrides.""" + args, cli_overrides = parse_args() + + # Load base configuration from recipe + config: ConfigContainer = llama32_1b_pretrain_config() + + # Convert to OmegaConf for merging + omega_conf, excluded_fields = create_omegaconf_dict_config(config) + + # Apply YAML overrides if provided + if args.config_file: + config_file_path = Path(args.config_file) + if not config_file_path.exists(): + logger.error(f"Config file not found: {config_file_path}") + sys.exit(1) + + yaml_conf = OmegaConf.load(config_file_path) + omega_conf = OmegaConf.merge(omega_conf, yaml_conf) + + # Apply command-line overrides + if cli_overrides: + omega_conf = parse_hydra_overrides(omega_conf, cli_overrides) + + # Convert back to ConfigContainer + final_config_dict = OmegaConf.to_container(omega_conf, resolve=True) + apply_overrides(config, final_config_dict, excluded_fields) + + # Start pretraining + pretrain(config=config, forward_step_func=forward_step) + + +if __name__ == "__main__": + main() diff --git a/examples/recipes/llama/02_launch_pretrain_local.py b/examples/recipes/llama/02_launch_pretrain_local.py new file mode 100644 index 000000000..afcbfc431 --- /dev/null +++ b/examples/recipes/llama/02_launch_pretrain_local.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Launch Training Locally with NeMo-Run + +This script demonstrates how to launch training scripts (pretrain or finetune) +using NeMo-Run's LocalExecutor with torchrun. 
This provides better job management +and logging compared to running torchrun directly. + +Prerequisites: Install nemo-run + +Usage: + # Launch pretrain script + python 02_launch_pretrain_local.py --script 00_quickstart_pretrain.py --devices 2 + + # Launch finetune script + python 02_launch_pretrain_local.py --script 00_quickstart_finetune.py --devices 1 + + # Launch with YAML config + python 02_launch_pretrain_local.py \ + --script 01_pretrain_with_yaml.py \ + --devices 2 \ + --config-file conf/llama32_1b_pretrain.yaml + + # Pass CLI overrides to the training script + python 02_launch_pretrain_local.py \ + --script 01_finetune_with_yaml.py \ + --devices 2 \ + --script-args "train.train_iters=500 peft.dim=16" + + # Dry run (see what would be executed) + python 02_launch_pretrain_local.py --script 00_quickstart_pretrain.py --dry-run +""" + +import argparse +import logging +from pathlib import Path + +import nemo_run as run + + +logger = logging.getLogger(__name__) + +SCRIPT_DIR = Path(__file__).parent.resolve() + + +def parse_args() -> argparse.Namespace: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Launch training (pretrain/finetune) locally using NeMo-Run", + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.add_argument( + "--script", + type=str, + required=True, + help="Training script to run (e.g., 00_quickstart_pretrain.py, 00_quickstart_finetune.py)", + ) + parser.add_argument( + "--devices", + type=int, + default=1, + help="Number of GPUs to use (default: 1)", + ) + parser.add_argument( + "--config-file", + type=str, + default=None, + help="YAML config file to pass to the training script (optional)", + ) + parser.add_argument( + "--script-args", + type=str, + default="", + help='Additional arguments to pass to the training script (space-separated, e.g., "train.train_iters=100")', + ) + parser.add_argument( + "--experiment-name", + type=str, + default="megatron_bridge_training", + help="Name for the experiment (default: megatron_bridge_training)", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print what would be executed without running", + ) + + return parser.parse_args() + + +def main() -> None: + """Launch training (pretrain/finetune) using NeMo-Run LocalExecutor.""" + args = parse_args() + + # Resolve script path + script_path = SCRIPT_DIR / args.script + if not script_path.exists(): + raise FileNotFoundError(f"Training script not found: {script_path}") + + # Build arguments for the training script + script_args = [] + if args.config_file: + script_args.extend(["--config-file", args.config_file]) + + if args.script_args: + # Split the script args string and add each arg + script_args.extend(args.script_args.split()) + + logger.info("Launching training with NeMo-Run LocalExecutor") + logger.info(f"Script: {script_path.name}") + logger.info(f"GPUs: {args.devices}") + if args.config_file: + logger.info(f"Config: {args.config_file}") + if script_args: + logger.info(f"Script args: {' '.join(script_args)}") + logger.info("") + + # Create the training task + task = run.Script( + path=str(script_path), + entrypoint="python", + args=script_args, + ) + + # Create the local executor with torchrun + executor = run.LocalExecutor( + ntasks_per_node=args.devices, + launcher="torchrun", + ) + + # Run the experiment + with run.Experiment(args.experiment_name) as exp: + exp.add(task, executor=executor, name="training") + exp.run(detach=False, dryrun=args.dry_run) + + if not args.dry_run: + logger.info("Training 
completed!") + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format="%(message)s") + main() diff --git a/examples/recipes/llama/03_launch_pretrain_slurm.py b/examples/recipes/llama/03_launch_pretrain_slurm.py new file mode 100644 index 000000000..5d23effdc --- /dev/null +++ b/examples/recipes/llama/03_launch_pretrain_slurm.py @@ -0,0 +1,283 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Launch Training on Slurm with NeMo-Run + +This script demonstrates how to launch training scripts (pretrain or finetune) +on a Slurm cluster using NeMo-Run. This enables easy multi-node training with +proper job management. + +Prerequisites: Install nemo-run + +Usage: + # From the Slurm cluster (uses LocalTunnel) + python 03_launch_pretrain_slurm.py \ + --script 00_quickstart_pretrain.py \ + --nodes 2 \ + --partition gpu \ + --account my_account + + # From your local machine (uses SSHTunnel) + python 03_launch_pretrain_slurm.py \ + --script 00_quickstart_pretrain.py \ + --nodes 2 \ + --partition gpu \ + --account my_account \ + --ssh-tunnel \ + --host my-cluster.example.com \ + --user myusername \ + --remote-job-dir /home/myusername/nemo-runs + + # With custom SSH key + python 03_launch_pretrain_slurm.py \ + --script 00_quickstart_pretrain.py \ + --nodes 2 \ + --partition gpu \ + --account my_account \ + --ssh-tunnel \ + --host my-cluster.example.com \ + --user myusername \ + --remote-job-dir /home/myusername/nemo-runs \ + --identity ~/.ssh/id_rsa + + # Launch with custom config + python 03_launch_pretrain_slurm.py \ + --script 01_finetune_with_yaml.py \ + --nodes 1 \ + --partition gpu \ + --account my_account \ + --config-file conf/llama32_1b_finetune.yaml + + # With container and custom mounts + python 03_launch_pretrain_slurm.py \ + --script 00_quickstart_pretrain.py \ + --nodes 2 \ + --partition gpu \ + --account my_account \ + --container-image /path/to/container.sqsh \ + --mount /data:/data + +Note: +- Use --ssh-tunnel when launching from your local machine +- Omit --ssh-tunnel when already on the Slurm cluster (uses LocalTunnel) +- Adjust cluster-specific settings (account, partition, container paths) +""" + +import argparse +import logging +from pathlib import Path + +import nemo_run as run + + +logger = logging.getLogger(__name__) + +SCRIPT_DIR = Path(__file__).parent.resolve() + + +def parse_args() -> argparse.Namespace: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Launch training (pretrain/finetune) on Slurm using NeMo-Run", + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.add_argument( + "--script", + type=str, + required=True, + help="Training script to run (e.g., 00_quickstart_pretrain.py, 00_quickstart_finetune.py)", + ) + parser.add_argument( + "--nodes", + type=int, + default=1, + help="Number of nodes to use (default: 1)", + ) + parser.add_argument( + "--devices", + type=int, + default=8, + help="GPUs per node 
(default: 8)", + ) + parser.add_argument( + "--partition", + type=str, + required=True, + help="Slurm partition name", + ) + parser.add_argument( + "--account", + type=str, + required=True, + help="Slurm account name", + ) + parser.add_argument( + "--time", + type=str, + default="04:00:00", + help="Job time limit (default: 04:00:00)", + ) + parser.add_argument( + "--ssh-tunnel", + action="store_true", + help="Use SSH tunnel (for launching from local machine). Requires --host, --user, --remote-job-dir", + ) + parser.add_argument( + "--host", + type=str, + help="SSH host for tunnel (required if --ssh-tunnel is set)", + ) + parser.add_argument( + "--user", + type=str, + help="SSH user for tunnel (required if --ssh-tunnel is set)", + ) + parser.add_argument( + "--remote-job-dir", + type=str, + help="Remote directory to store job files (required if --ssh-tunnel is set)", + ) + parser.add_argument( + "--identity", + type=str, + default=None, + help="Path to SSH private key for authentication (optional)", + ) + parser.add_argument( + "--config-file", + type=str, + default=None, + help="YAML config file to pass to the training script (optional)", + ) + parser.add_argument( + "--script-args", + type=str, + default="", + help="Additional arguments for the training script (space-separated)", + ) + parser.add_argument( + "--container-image", + type=str, + default=None, + help="Container image path (optional)", + ) + parser.add_argument( + "--mount", + type=str, + action="append", + default=[], + help="Container mounts in format host:container (can be specified multiple times)", + ) + parser.add_argument( + "--experiment-name", + type=str, + default="megatron_bridge_training", + help="Name for the experiment", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print what would be executed without submitting the job", + ) + + return parser.parse_args() + + +def main() -> None: + """Launch training (pretrain/finetune) using NeMo-Run SlurmExecutor.""" + args = parse_args() + + # Validate SSH tunnel arguments + if args.ssh_tunnel: + if not all([args.host, args.user, args.remote_job_dir]): + raise ValueError("--ssh-tunnel requires --host, --user, and --remote-job-dir to be specified") + + # Resolve script path + script_path = SCRIPT_DIR / args.script + if not script_path.exists(): + raise FileNotFoundError(f"Training script not found: {script_path}") + + # Build arguments for the training script + script_args = [] + if args.config_file: + script_args.extend(["--config-file", args.config_file]) + + if args.script_args: + script_args.extend(args.script_args.split()) + + # Create the training task + task = run.Script( + path=str(script_path), + entrypoint="python", + args=script_args, + ) + + # Configure tunnel (SSH for remote, Local if already on cluster) + tunnel = None + if args.ssh_tunnel: + tunnel = run.SSHTunnel( + host=args.host, + user=args.user, + job_dir=args.remote_job_dir, + identity=args.identity, + ) + logger.info(f"Using SSH tunnel to {args.user}@{args.host}") + else: + tunnel = run.LocalTunnel() + logger.info("Using LocalTunnel (running on cluster)") + + # Create the Slurm executor + executor = run.SlurmExecutor( + account=args.account, + partition=args.partition, + nodes=args.nodes, + ntasks_per_node=args.devices, + gpus_per_node=args.devices, + mem="0", + exclusive=True, + time=args.time, + tunnel=tunnel, + ) + + # Configure container if specified + if args.container_image: + executor.container_image = args.container_image + + # Configure mounts if specified + if 
args.mount: + executor.container_mounts = args.mount + + # Set common environment variables + executor.env_vars = { + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", + "NCCL_NVLS_ENABLE": "0", + } + + # Run the experiment + with run.Experiment(args.experiment_name) as exp: + exp.add(task, executor=executor, name="training") + exp.run(detach=True, dryrun=args.dry_run) + + if args.dry_run: + logger.info("Dry run completed - no job was submitted") + else: + logger.info("Job submitted to Slurm!") + logger.info("Use 'squeue' to check job status") + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format="%(message)s") + main() diff --git a/examples/recipes/llama/README.md b/examples/recipes/llama/README.md new file mode 100644 index 000000000..1d00e237e --- /dev/null +++ b/examples/recipes/llama/README.md @@ -0,0 +1,308 @@ +# Llama Recipes with Megatron Bridge + +This guide shows you how to pretrain and finetune Llama models using Megatron Bridge. + +## Quickstart + +The fastest way to get started with Megatron Bridge pretraining: + +```bash +torchrun --nproc_per_node=1 00_quickstart_pretrain.py +``` + +This runs Llama 3.2 1B pretraining on a single GPU with mock data. + +For finetuning, you need a checkpoint in Megatron format. Convert from HuggingFace: + +```bash +python ../../conversion/convert_checkpoints.py import \ + --hf-model meta-llama/Llama-3.2-1B \ + --megatron-path ./checkpoints/llama32_1b +``` + +Then run finetuning: + +```bash +torchrun --nproc_per_node=1 00_quickstart_finetune.py \ + --pretrained-checkpoint ./checkpoints/llama32_1b +``` + +This finetunes Llama 3.2 1B using LoRA on the SQuAD dataset. + +To use real data, uncomment and modify in the script: + +```python +config.data.data_path = "/path/to/your/dataset" +``` + +## Configuration with YAML + +For more complex configurations, use YAML files and command-line overrides: + +```bash +torchrun --nproc_per_node=2 01_pretrain_with_yaml.py \ + --config-file conf/llama32_1b_pretrain.yaml +``` + +Understanding YAML Configuration: + +YAML files should be organized into sections that mirror the `ConfigContainer` structure. Each top-level key corresponds to a configuration section (e.g., `data`, `train`, `model`, `optimizer`). Overrides are applied in a nested manner according to the ConfigContainer fields. + +Example YAML (`conf/llama32_1b_pretrain.yaml`): + +```yaml +# Each section maps to a ConfigContainer field +data: # GPTDatasetConfig + data_path: /path/to/training/data + seq_length: 4096 + +train: # TrainingConfig + train_iters: 10000 + global_batch_size: 256 + +checkpoint: # CheckpointConfig + save: ./checkpoints/llama32_1b + save_interval: 1000 + +model: # Model Provider + seq_length: 4096 # Must match data.seq_length + tensor_model_parallel_size: 1 + +optimizer: # OptimizerConfig + lr: 0.0003 +``` + +Override from command line using dot notation: + +Command-line overrides follow the same pattern as YAML structure. The first part before the dot indicates which subconfig of ConfigContainer to override (e.g., `train`, `model`, `optimizer`), and the part after the dot specifies the field within that subconfig. 
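+
+For instance, the override `optimizer.lr=0.0002` sets the same field as this YAML
+snippet:
+
+```yaml
+optimizer:
+  lr: 0.0002
+```
+
+Multiple overrides can be combined in a single command, as shown below.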
+ +```bash +torchrun --nproc_per_node=2 01_pretrain_with_yaml.py \ + --config-file conf/llama32_1b_pretrain.yaml \ + train.train_iters=5000 \ + train.global_batch_size=512 \ + optimizer.lr=0.0002 +``` + +In this example: +- `train.train_iters=5000` → overrides `ConfigContainer.train.train_iters` +- `optimizer.lr=0.0002` → overrides `ConfigContainer.optimizer.lr` + +These example scripts are configured to accept overrides in the priority order (highest to lowest): +1. Command-line overrides (dot notation: `section.field=value`) +2. YAML config file (nested structure) +3. Base recipe defaults (from `llama32_1b_pretrain_config()`) + +## Multi-Node Training with NeMo-Run + +### Prerequisites + +```bash +pip install nemo-run +``` + +### Launch Locally + +Test your setup before going to a cluster. Works with both pretrain and finetune scripts: + +```bash +# Pretrain +python 02_launch_pretrain_local.py \ + --script 00_quickstart_pretrain.py \ + --devices 2 + +# Finetune +python 02_launch_pretrain_local.py \ + --script 00_quickstart_finetune.py \ + --devices 1 +``` + +### Launch on Slurm + +For multi-node training on Slurm clusters: + +From the cluster (LocalTunnel): + +```bash +python 03_launch_pretrain_slurm.py \ + --script 00_quickstart_pretrain.py \ + --nodes 2 \ + --devices 8 \ + --partition gpu \ + --account my_account +``` + +From your local machine (SSHTunnel): + +```bash +python 03_launch_pretrain_slurm.py \ + --script 00_quickstart_pretrain.py \ + --nodes 2 \ + --devices 8 \ + --partition gpu \ + --account my_account \ + --ssh-tunnel \ + --host my-cluster.example.com \ + --user myusername \ + --remote-job-dir /home/myusername/nemo-runs +``` + +With custom config: + +```bash +python 03_launch_pretrain_slurm.py \ + --script 01_finetune_with_yaml.py \ + --nodes 1 \ + --devices 8 \ + --partition gpu \ + --account my_account \ + --config-file conf/llama32_1b_finetune.yaml +``` + +## Finetuning + +### Quickstart: Finetune with LoRA + +Prerequisites: You need a checkpoint in Megatron format. 
Convert from HuggingFace: + +```bash +python ../../conversion/convert_checkpoints.py import \ + --hf-model meta-llama/Llama-3.2-1B \ + --megatron-path ./checkpoints/llama32_1b +``` + +Run finetuning: + +```bash +torchrun --nproc_per_node=1 00_quickstart_finetune.py \ + --pretrained-checkpoint ./checkpoints/llama32_1b +``` + +By default, this: +- Uses LoRA (Low-Rank Adaptation) for efficient finetuning +- Trains on the SQuAD dataset +- Works on a single GPU +- Llama 3.2 1B model + +Customize in the script: + +```python +# Use your own dataset (JSONL format) +config.data.data_path = "/path/to/your/dataset.jsonl" + +# Adjust LoRA hyperparameters +config.peft.dim = 16 # LoRA rank (default: 8) +config.peft.alpha = 32 # LoRA alpha (default: 16) +``` + +### Configuration with YAML + +For more complex finetuning configurations: + +```bash +torchrun --nproc_per_node=2 01_finetune_with_yaml.py \ + --config-file conf/llama32_1b_finetune.yaml +``` + +Example YAML (`conf/llama32_1b_finetune.yaml`): + +```yaml +# Each section maps to a ConfigContainer field +data: # FinetuningDatasetConfig + data_path: /path/to/finetuning_dataset.jsonl + seq_length: 4096 + +train: # TrainingConfig + train_iters: 1000 + global_batch_size: 128 + +checkpoint: # CheckpointConfig + pretrained_checkpoint: /path/to/pretrained/checkpoint + save: ./checkpoints/llama32_1b_finetuned + save_interval: 500 + +peft: # PEFT (LoRA config) + dim: 8 # LoRA rank + alpha: 16 # LoRA alpha + +model: # Model Provider + seq_length: 4096 # Must match data.seq_length + +optimizer: # OptimizerConfig + lr: 0.0001 # Higher LR for LoRA +``` + +Override from command line using dot notation: + +The first part before the dot indicates which ConfigContainer subconfig to override, and the part after specifies the field. + +```bash +torchrun --nproc_per_node=2 01_finetune_with_yaml.py \ + --config-file conf/llama32_1b_finetune.yaml \ + peft.dim=16 \ + train.train_iters=2000 +``` + +Here, `peft.dim=16` overrides `ConfigContainer.peft.dim`. + +Full finetuning (no LoRA): + +```bash +torchrun --nproc_per_node=2 01_finetune_with_yaml.py \ + --peft none \ + train.train_iters=1000 +``` + +### Multi-Node Finetuning + +Use the same launchers for finetuning. + +Local: + +```bash +python 02_launch_pretrain_local.py \ + --script 00_quickstart_finetune.py \ + --devices 1 \ + --script-args "--pretrained-checkpoint ./checkpoints/llama32_1b" +``` + +Slurm: + +```bash +python 03_launch_pretrain_slurm.py \ + --script 01_finetune_with_yaml.py \ + --nodes 1 \ + --partition gpu \ + --account my_account \ + --config-file conf/llama32_1b_finetune.yaml +``` + +### Working with Checkpoints + +**Important:** Finetuning requires checkpoints in Megatron format. You cannot use HuggingFace checkpoints directly. + +You can obtain Megatron checkpoints by: + +1. Converting from HuggingFace (recommended for starting from public models) +2. 
Using Megatron checkpoints from your own pretraining runs + +Convert HuggingFace checkpoint to Megatron format: + +```bash +python ../../conversion/convert_checkpoints.py import \ + --hf-model meta-llama/Llama-3.2-1B \ + --megatron-path ./checkpoints/llama32_1b +``` + +Use the checkpoint: + +```bash +# Command line (quickstart scripts) +torchrun --nproc_per_node=1 00_quickstart_finetune.py \ + --pretrained-checkpoint ./checkpoints/llama32_1b + +# YAML config (01_finetune_with_yaml.py) +# In conf/llama32_1b_finetune.yaml: +# checkpoint: +# pretrained_checkpoint: ./checkpoints/llama32_1b +``` diff --git a/examples/recipes/llama/conf/llama32_1b_finetune.yaml b/examples/recipes/llama/conf/llama32_1b_finetune.yaml new file mode 100644 index 000000000..a7919529d --- /dev/null +++ b/examples/recipes/llama/conf/llama32_1b_finetune.yaml @@ -0,0 +1,82 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Example YAML configuration for Llama 3.2 1B finetuning with LoRA +# This file demonstrates the most commonly customized settings. +# Uncomment and modify values as needed for your use case. + +# Data configuration +data: + # Replace with your dataset path (JSONL format recommended) + # data_path: /path/to/your/finetuning_dataset.jsonl + seq_length: 4096 + +# Training configuration +train: + train_iters: 100 + global_batch_size: 128 + micro_batch_size: 2 + eval_iters: 10 + eval_interval: 50 + + # Load from pretrained checkpoint + # load: /path/to/pretrained/checkpoint + +# Optimizer configuration (higher LR for LoRA) +optimizer: + lr: 0.0001 + min_lr: 0.00001 + weight_decay: 0.0 + +# Learning rate scheduler +scheduler: + lr_warmup_iters: 10 + lr_decay_style: cosine + +# Checkpoint configuration +checkpoint: + # Directory to save finetuned checkpoints + save: ./checkpoints/llama32_1b_finetuned + save_interval: 50 + +# LoRA configuration (default: rank=8, alpha=16) +peft: + dim: 8 # LoRA rank + alpha: 16 # LoRA alpha scaling + +# For full finetuning instead of LoRA: +# peft: null +# And adjust parallelism if needed: +# model: +# tensor_model_parallel_size: 2 + +# Model configuration (LoRA default: single GPU) +# Note: seq_length must match data.seq_length +model: + seq_length: 4096 + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + context_parallel_size: 1 + +# Logging +logger: + log_interval: 10 + tensorboard_dir: ./logs/llama32_1b_finetuned + # wandb_project: my_finetune_project + # wandb_entity: my_team + +# Random seed +rng: + seed: 1234 + diff --git a/examples/recipes/llama/conf/llama32_1b_pretrain.yaml b/examples/recipes/llama/conf/llama32_1b_pretrain.yaml new file mode 100644 index 000000000..8454e7aa1 --- /dev/null +++ b/examples/recipes/llama/conf/llama32_1b_pretrain.yaml @@ -0,0 +1,71 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Example YAML configuration for Llama 3.2 1B pretraining +# This file demonstrates the most commonly customized settings. +# Uncomment and modify values as needed for your use case. + +# Data configuration +data: + # Replace with your dataset path + # data_path: /path/to/your/dataset + sequence_length: 4096 + +# Training configuration +train: + train_iters: 100 + global_batch_size: 256 + micro_batch_size: 2 + eval_iters: 10 + eval_interval: 50 + +# Optimizer configuration +optimizer: + lr: 0.0003 + min_lr: 0.00003 + weight_decay: 0.1 + +# Learning rate scheduler +scheduler: + lr_warmup_iters: 20 + lr_decay_style: cosine + +# Checkpoint configuration +checkpoint: + # Directory to save checkpoints + save: ./checkpoints/llama32_1b + save_interval: 50 + # Resume from checkpoint (optional) + # load: ./checkpoints/llama32_1b/iter_0000050 + +# Model configuration +# Llama 3.2 1B defaults: TP=1, PP=1, CP=1 (works on single GPU) +# Note: seq_length must match data.seq_length +model: + seq_length: 4096 + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + context_parallel_size: 1 + +# Logging +logger: + log_interval: 10 + tensorboard_dir: ./logs/llama32_1b + # wandb_project: my_project # Uncomment to enable W&B logging + # wandb_entity: my_team + +# Random seed +rng: + seed: 1234 + diff --git a/examples/recipes/llama/conf/llama3_8b_pretrain_override_example.yaml b/examples/recipes/llama/conf/llama3_8b_pretrain_override_example.yaml deleted file mode 100644 index 5f55d9988..000000000 --- a/examples/recipes/llama/conf/llama3_8b_pretrain_override_example.yaml +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Example override file - -# To override a parameter, ensure the structure matches the ConfigContainer -# and its sub-configurations (e.g., model, train, etc.) -# Top-level ConfigContainer fields are dataclasses themselves - -model: - seq_length: 4096 - -train: - train_iters: 20 - global_batch_size: 8 - micro_batch_size: 1 - eval_iters: 0 - -optimizer: - lr: 0.00025 - min_lr: 0.000025 - -scheduler: - lr_warmup_iters: 10 - -checkpoint: - # Directory to save to. If null, no checkpoint will be saved. - save: null - -dist: - use_megatron_fsdp: false - use_torch_fsdp2: false - -logger: - log_interval: 1 - -dataset: - seq_length: 4096 - -rng: - seed: 42 - -ddp: - grad_reduce_in_fp32: true - -profiling: - # For optional fields in the config, specify the target to instantiate the object. 
- _target_: megatron.bridge.training.config.ProfilingConfig - use_nsys_profiler: false - profile_step_start: 5 - profile_step_end: 10 - use_pytorch_profiler: true - profile_ranks: [0, 1] - record_shapes: true diff --git a/examples/recipes/llama/conf/llama3_8b_pretrain_override_example_megatron_fsdp.yaml b/examples/recipes/llama/conf/llama3_8b_pretrain_override_example_megatron_fsdp.yaml deleted file mode 100644 index 0e239e284..000000000 --- a/examples/recipes/llama/conf/llama3_8b_pretrain_override_example_megatron_fsdp.yaml +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Example override file - -# To override a parameter, ensure the structure matches the ConfigContainer -# and its sub-configurations (e.g., model, train, etc.) -# Top-level ConfigContainer fields are dataclasses themselves - -model: - seq_length: 4096 - init_model_with_meta_device: true - -train: - train_iters: 20 - global_batch_size: 8 - micro_batch_size: 1 - eval_iters: 0 - -optimizer: - lr: 0.00025 - min_lr: 0.000025 - -scheduler: - lr_warmup_iters: 10 - -checkpoint: - # Directory to save to. If null, no checkpoint will be saved. - save: null - ckpt_format: "fsdp_dtensor" - -dist: - use_megatron_fsdp: true - use_torch_fsdp2: false - -logger: - log_interval: 1 - -dataset: - seq_length: 4096 - -rng: - seed: 42 - -ddp: - grad_reduce_in_fp32: true - data_parallel_sharding_strategy: "optim_grads_params" # for Megatron FSDP ZeRO-3 like sharding - -profiling: - # For optional fields in the config, specify the target to instantiate the object. - _target_: megatron.bridge.training.config.ProfilingConfig - use_nsys_profiler: false - profile_step_start: 5 - profile_step_end: 10 - use_pytorch_profiler: true - profile_ranks: [0, 1] - record_shapes: true diff --git a/examples/recipes/llama/pretrain_llama3_8b.py b/examples/recipes/llama/pretrain_llama3_8b.py deleted file mode 100644 index 76ebde762..000000000 --- a/examples/recipes/llama/pretrain_llama3_8b.py +++ /dev/null @@ -1,184 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Llama3 8B Pretraining Script with YAML and CLI Configuration Overrides. - -This script provides a flexible way to pretrain Llama3 8B models using Megatron-Bridge with support for -both YAML configuration files and command-line overrides using Hydra-style syntax. 
- -Examples: - Basic usage with default configuration: - $ torchrun --nproc_per_node=8 examples/recipes/llama/pretrain_llama3_8b.py - - Using a custom YAML config file: - $ torchrun --nproc_per_node=8 pretrain_llama3_8b.py --config-file my_custom_config.yaml - - Using CLI overrides only: - $ torchrun --nproc_per_node=8 pretrain_llama3_8b.py model.tensor_model_parallel_size=4 train.train_iters=100000 - - Combining YAML and CLI overrides (CLI takes precedence): - $ torchrun --nproc_per_node=8 pretrain_llama3_8b.py --config-file conf/my_config.yaml \ - model.pipeline_dtype=torch.float16 \ - train.global_batch_size=512 - -Configuration Precedence: - 1. Base configuration from pretrain_config() recipe - 2. YAML overrides from --config-file (if provided) - 3. CLI overrides (highest precedence) - -Supported Override Syntax: - - Standard assignment: key=value - - Nested assignment: section.subsection.key=value - - Addition: +new_key=value - - Deletion: ~key_to_remove - - Type conversion: Automatic for basic types (int, float, bool, str) - - Complex types: torch.dtype, enums, etc. are supported -""" - -import argparse -import logging -import os -import sys -from pathlib import Path -from typing import Tuple - -import torch -from omegaconf import OmegaConf - -from megatron.bridge.recipes.llama import llama3_8b_pretrain_config as pretrain_config -from megatron.bridge.training.config import ConfigContainer -from megatron.bridge.training.gpt_step import forward_step -from megatron.bridge.training.pretrain import pretrain -from megatron.bridge.training.utils.omegaconf_utils import ( - apply_overrides, - create_omegaconf_dict_config, - parse_hydra_overrides, -) -from megatron.bridge.utils.common_utils import get_rank_safe - - -logger: logging.Logger = logging.getLogger(__name__) - - -# Define paths relative to this script's location -# Assumes this script (pretrain_llama3_8b.py) is in Megatron-Bridge/examples/recipes/llama/ -# and the config is in a 'conf' subdirectory. -SCRIPT_DIR: Path = Path(__file__).parent.resolve() -DEFAULT_CONFIG_FILENAME: str = "llama3_8b_pretrain_override_example.yaml" -DEFAULT_CONFIG_FILE_PATH: Path = SCRIPT_DIR / "conf" / DEFAULT_CONFIG_FILENAME - - -def parse_cli_args() -> Tuple[argparse.Namespace, list[str]]: - """Parse command line arguments, separating known script args from OmegaConf overrides.""" - parser = argparse.ArgumentParser( - description="Pretrain Llama3 8B model using Megatron-Bridge with YAML and CLI overrides", - formatter_class=argparse.RawTextHelpFormatter, - ) - parser.add_argument( - "--config-file", - type=str, - default=str(DEFAULT_CONFIG_FILE_PATH), - help="Path to the YAML OmegaConf override file. Default: conf/llama3_8b_pretrain_override_example.yaml", - ) - parser.add_argument("--debug", action="store_true", help="Enable debug logging") - - # Parse known args for the script, remaining will be treated as overrides - args, cli_dotlist_overrides = parser.parse_known_args() - return args, cli_dotlist_overrides - - -def main() -> None: - """ - Entry point for the Llama3 8B pretraining script. - - This function orchestrates the complete configuration workflow: - 1. Loads the base configuration from pretrain_config() recipe - 2. Applies YAML overrides from --config-file (if exists) - 3. Applies CLI overrides using Hydra-style syntax - 4. Starts Megatron pretraining with the final merged configuration - - Configuration merging preserves callable fields (like activation functions) - and handles type conversions automatically. 
- - Examples of CLI usage: - # Use default config with custom learning rate - torchrun --nproc_per_node=8 pretrain_llama3_8b.py optimizer.lr=0.0002 - - # Custom config file with additional overrides - torchrun --nproc_per_node=8 pretrain_llama3_8b.py --config-file my_config.yaml train.train_iters=50000 - - # Multiple overrides for distributed training - torchrun --nproc_per_node=8 pretrain_llama3_8b.py \ - model.tensor_model_parallel_size=4 \ - model.pipeline_model_parallel_size=2 \ - train.global_batch_size=512 - """ - args, cli_overrides = parse_cli_args() - - logger.info("Megatron-Bridge Llama3 8B Pretraining Script with YAML & CLI Overrides") - logger.info("------------------------------------------------------------------") - - # Load base configuration from the recipe as a Python dataclass - cfg: ConfigContainer = pretrain_config() - logger.info("Loaded base configuration") - - # Print configuration on rank 0 - if get_rank_safe() == 0: - cfg.print_yaml() - - # Convert the initial Python dataclass to an OmegaConf DictConfig for merging - merged_omega_conf, excluded_fields = create_omegaconf_dict_config(cfg) - - # Load and merge YAML overrides if a config file is provided - if args.config_file: - logger.debug(f"Loading YAML overrides from: {args.config_file}") - if not os.path.exists(args.config_file): - logger.error(f"Override YAML file not found: {args.config_file}") - sys.exit(1) - yaml_overrides_omega = OmegaConf.load(args.config_file) - merged_omega_conf = OmegaConf.merge(merged_omega_conf, yaml_overrides_omega) - logger.debug("YAML overrides merged successfully.") - - # Apply command-line overrides using Hydra-style parsing - if cli_overrides: - logger.debug(f"Applying Hydra-style command-line overrides: {cli_overrides}") - merged_omega_conf = parse_hydra_overrides(merged_omega_conf, cli_overrides) - logger.debug("Hydra-style command-line overrides applied successfully.") - - # Apply the final merged OmegaConf configuration back to the original ConfigContainer - logger.debug("Applying final merged configuration back to Python ConfigContainer...") - final_overrides_as_dict = OmegaConf.to_container(merged_omega_conf, resolve=True) - # Apply overrides while preserving excluded fields - apply_overrides(cfg, final_overrides_as_dict, excluded_fields) - - # Display final configuration - if get_rank_safe() == 0: - logger.info("--- Final Merged Configuration ---") - cfg.print_yaml() - logger.info("----------------------------------") - - # Start training - logger.debug("Starting pretraining...") - pretrain(config=cfg, forward_step_func=forward_step) - - # Cleanup process group - if torch.distributed.is_initialized(): - torch.distributed.barrier() - torch.distributed.destroy_process_group() - - -if __name__ == "__main__": - main() diff --git a/examples/recipes/llama/pretrain_llama3_8b_nemo_run_script.py b/examples/recipes/llama/pretrain_llama3_8b_nemo_run_script.py deleted file mode 100644 index 6b8c6c68d..000000000 --- a/examples/recipes/llama/pretrain_llama3_8b_nemo_run_script.py +++ /dev/null @@ -1,150 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -NeMo Run Launcher for Llama3 8B Pretraining. - -This script launches the pretrain_llama3_8b.py script using NeMo Run with TorchRun, -while forwarding any additional command line arguments to the target script. - -Examples: - Basic usage with default config: - $ python pretrain_llama3_8b_nemo_run_script.py --nproc-per-node=8 - - Using a custom config file: - $ python pretrain_llama3_8b_nemo_run_script.py --nproc-per-node=8 --config-file=my_config.yaml - - Passing additional overrides to the target script: - $ python pretrain_llama3_8b_nemo_run_script.py --nproc-per-node=8 \ - model.tensor_model_parallel_size=4 \ - train.train_iters=100000 - - Using both custom config and CLI overrides: - $ python pretrain_llama3_8b_nemo_run_script.py --nproc-per-node=8 \ - --config-file=conf/my_custom_config.yaml \ - optimizerg.lr=0.0002 \ - train.global_batch_size=512 - - Dry run to see what would be executed: - $ python pretrain_llama3_8b_nemo_run_script.py --nproc-per-node=8 --dryrun \ - model.pipeline_dtype=torch.float16 - -Argument Forwarding: - Any arguments not recognized by this launcher script will be forwarded - to the target pretrain_llama3_8b.py script as Hydra-style overrides. -""" - -import argparse -import logging -import sys -from pathlib import Path -from typing import Tuple - -import nemo_run as run - - -logger: logging.Logger = logging.getLogger(__name__) - -# Define paths relative to this script's location -# Assumes this script (pretrain_llama3_8b_nemo_run_script.py) is in Megatron-Bridge/examples/recipes/llama/ -# and pretrain_llama3_8b.py is in the same directory, -# and the config is in a 'conf' subdirectory. -SCRIPT_DIR: Path = Path(__file__).parent.resolve() -PRETRAIN_SCRIPT_FILENAME: str = "pretrain_llama3_8b.py" -PRETRAIN_SCRIPT_PATH: Path = SCRIPT_DIR / PRETRAIN_SCRIPT_FILENAME -DEFAULT_CONFIG_FILENAME: str = "llama3_8b_pretrain_override_example.yaml" -DEFAULT_CONFIG_FILE_PATH: Path = SCRIPT_DIR / "conf" / DEFAULT_CONFIG_FILENAME - - -def parse_cli_args() -> Tuple[argparse.Namespace, list[str]]: - """Parse command line arguments, separating launcher args from target script args.""" - parser = argparse.ArgumentParser( - description="Launcher for Llama3 8B pretraining using nemo_run and TorchRun. " - "Additional arguments will be forwarded to pretrain_llama3_8b.py", - formatter_class=argparse.RawTextHelpFormatter, - ) - parser.add_argument( - "--nproc-per-node", - type=int, - default=2, - help="Number of processes per node for TorchRun (typically number of GPUs).", - ) - parser.add_argument( - "--config-file", - type=str, - default=str(DEFAULT_CONFIG_FILE_PATH), - help="Path to the YAML override config file for the pretrain_llama3_8b.py script.", - ) - parser.add_argument( - "--dryrun", - action="store_true", - help="Dry run the script without actually running it.", - ) - - # Parse known args for the launcher, remaining will be forwarded to target script - args, forwarded_args = parser.parse_known_args() - return args, forwarded_args - - -def main() -> None: - """ - Main function for script demonstrating how to use the NeMo Run executor. 
- """ - args, forwarded_args = parse_cli_args() - - logger.info("Nemo Run Launcher for Llama3 8B Pretraining") - logger.info("===========================================") - - if not PRETRAIN_SCRIPT_PATH.is_file(): - logger.error(f"Target pretraining script not found: {PRETRAIN_SCRIPT_PATH}") - logger.error(f"Please ensure '{PRETRAIN_SCRIPT_FILENAME}' exists in the same directory as this launcher.") - sys.exit(1) - - config_file_to_use = Path(args.config_file).resolve() - if not config_file_to_use.is_file(): - logger.error(f"Specified YAML config file not found: {config_file_to_use}") - logger.error("Ensure the path passed to --config_file is correct.") - sys.exit(1) - - # Build the arguments list for the target script - target_script_args = [ - "--config-file", - str(config_file_to_use), - ] - - # Add any forwarded arguments (Hydra-style overrides and other target script args) - if forwarded_args: - target_script_args.extend(forwarded_args) - logger.info(f"Forwarding additional arguments to target script: {forwarded_args}") - - logger.info(f"Target script: {PRETRAIN_SCRIPT_PATH}") - logger.info(f"Target script arguments: {target_script_args}") - - train_script = run.Script( - path=str(PRETRAIN_SCRIPT_PATH), - entrypoint="python", - args=target_script_args, - ) - - # Define the executor - logger.info(f"Launching locally with TorchRun with nproc_per_node={args.nproc_per_node}") - executor = run.LocalExecutor(ntasks_per_node=args.nproc_per_node, launcher="torchrun") - - # Execute the run - run.run(train_script, executor=executor, dryrun=args.dryrun) - - -if __name__ == "__main__": - main() From 6ed441c667e997e3d9350aa29deb7d1745e45288 Mon Sep 17 00:00:00 2001 From: Ananth Subramaniam Date: Wed, 12 Nov 2025 03:28:28 -0800 Subject: [PATCH 2/7] keep simplifying Signed-off-by: Ananth Subramaniam --- examples/recipes/llama/00_quickstart_finetune.py | 16 ++++++++-------- examples/recipes/llama/00_quickstart_pretrain.py | 14 +++++--------- examples/recipes/llama/README.md | 4 ++-- .../recipes/llama/conf/llama32_1b_finetune.yaml | 16 +++++----------- .../recipes/llama/conf/llama32_1b_pretrain.yaml | 7 +++---- 5 files changed, 23 insertions(+), 34 deletions(-) diff --git a/examples/recipes/llama/00_quickstart_finetune.py b/examples/recipes/llama/00_quickstart_finetune.py index 5b8e4ee83..f604cbbbe 100644 --- a/examples/recipes/llama/00_quickstart_finetune.py +++ b/examples/recipes/llama/00_quickstart_finetune.py @@ -69,7 +69,7 @@ def main() -> None: args = parse_args() # Load the base finetune configuration - # By default: LoRA with rank=8, alpha=16, works on single GPU + # Uses LoRA for efficient finetuning on a single GPU config = llama32_1b_finetune_config() # Load from the pretrained checkpoint @@ -84,7 +84,7 @@ def main() -> None: # === Use your own dataset === # Replace SQuAD with your custom dataset - # Option 1: Simple path override (uses default FinetuningDatasetConfig) + # Option 1: Simple path override # config.data.data_path = "/path/to/your/dataset.jsonl" # Option 2: Use FinetuningDatasetConfig for custom JSONL datasets @@ -95,16 +95,16 @@ def main() -> None: # from megatron.bridge.training.data import HFDatasetConfig # config.data = HFDatasetConfig(hf_dataset="squad", split="train") - # === Change learning rate === - # config.optimizer.lr = 5e-5 # Default for LoRA: 1e-4 + # === Adjust learning rate === + # config.optimizer.lr = 5e-5 - # === Modify checkpoint frequency === + # === Change checkpoint save frequency === # config.train.save_interval = 100 # === Adjust LoRA hyperparameters 
=== - # Higher rank = more parameters = potentially better quality but slower - # config.peft.dim = 16 # LoRA rank (default: 8) - # config.peft.alpha = 32 # LoRA alpha scaling (default: 16) + # Higher rank = more trainable parameters, potentially better quality but slower + # config.peft.dim = 16 # LoRA rank + # config.peft.alpha = 32 # LoRA alpha scaling # === Full supervised finetuning (no LoRA) === # For full finetuning, reload config with peft=None: diff --git a/examples/recipes/llama/00_quickstart_pretrain.py b/examples/recipes/llama/00_quickstart_pretrain.py index 5f11d2382..4ec50cba7 100644 --- a/examples/recipes/llama/00_quickstart_pretrain.py +++ b/examples/recipes/llama/00_quickstart_pretrain.py @@ -16,9 +16,6 @@ """ Quickstart: Pretrain Llama 3.2 1B with Megatron Bridge -This is the simplest way to start pretraining with Megatron-Bridge. -We use Llama 3.2 1B because it fits on a single GPU, making it easy to test. - Usage: Single GPU: torchrun --nproc_per_node=1 00_quickstart_pretrain.py @@ -27,9 +24,8 @@ torchrun --nproc_per_node=8 00_quickstart_pretrain.py The script uses sensible defaults and mock data for quick testing. -For custom configurations, see 01_pretrain_with_yaml.py +For custom configurations through YAML and Hydra-style overrides, see 01_pretrain_with_yaml.py For multi-node training, see 02_launch_pretrain_slurm.py -For larger models (8B, 70B), see 01_pretrain_with_yaml.py """ from megatron.bridge.recipes.llama import llama32_1b_pretrain_config @@ -47,17 +43,17 @@ def main() -> None: # OPTIONAL: Customize key settings here # Uncomment and modify as needed: - # For a quick test run (10 iterations): + # For a quick test run: # config.train.train_iters = 10 - # Use your own data (replace mock data): + # Use your own data: # config.data.data_path = "/path/to/your/dataset" - # Change batch sizes: + # Adjust batch sizes for your GPU memory: # config.train.global_batch_size = 256 # config.train.micro_batch_size = 2 - # Modify checkpoint frequency: + # Change checkpoint save frequency: # config.train.save_interval = 500 # Start pretraining diff --git a/examples/recipes/llama/README.md b/examples/recipes/llama/README.md index 1d00e237e..82b04d396 100644 --- a/examples/recipes/llama/README.md +++ b/examples/recipes/llama/README.md @@ -191,8 +191,8 @@ Customize in the script: config.data.data_path = "/path/to/your/dataset.jsonl" # Adjust LoRA hyperparameters -config.peft.dim = 16 # LoRA rank (default: 8) -config.peft.alpha = 32 # LoRA alpha (default: 16) +config.peft.dim = 16 # LoRA rank +config.peft.alpha = 32 # LoRA alpha scaling ``` ### Configuration with YAML diff --git a/examples/recipes/llama/conf/llama32_1b_finetune.yaml b/examples/recipes/llama/conf/llama32_1b_finetune.yaml index a7919529d..fd6bc1a73 100644 --- a/examples/recipes/llama/conf/llama32_1b_finetune.yaml +++ b/examples/recipes/llama/conf/llama32_1b_finetune.yaml @@ -13,8 +13,8 @@ # limitations under the License. # Example YAML configuration for Llama 3.2 1B finetuning with LoRA -# This file demonstrates the most commonly customized settings. -# Uncomment and modify values as needed for your use case. +# This file demonstrates commonly customized settings. +# Modify values as needed for your use case. 
# Data configuration data: @@ -33,7 +33,7 @@ train: # Load from pretrained checkpoint # load: /path/to/pretrained/checkpoint -# Optimizer configuration (higher LR for LoRA) +# Optimizer configuration optimizer: lr: 0.0001 min_lr: 0.00001 @@ -50,18 +50,12 @@ checkpoint: save: ./checkpoints/llama32_1b_finetuned save_interval: 50 -# LoRA configuration (default: rank=8, alpha=16) +# LoRA configuration peft: dim: 8 # LoRA rank alpha: 16 # LoRA alpha scaling -# For full finetuning instead of LoRA: -# peft: null -# And adjust parallelism if needed: -# model: -# tensor_model_parallel_size: 2 - -# Model configuration (LoRA default: single GPU) +# Model configuration # Note: seq_length must match data.seq_length model: seq_length: 4096 diff --git a/examples/recipes/llama/conf/llama32_1b_pretrain.yaml b/examples/recipes/llama/conf/llama32_1b_pretrain.yaml index 8454e7aa1..a46d0c689 100644 --- a/examples/recipes/llama/conf/llama32_1b_pretrain.yaml +++ b/examples/recipes/llama/conf/llama32_1b_pretrain.yaml @@ -13,8 +13,8 @@ # limitations under the License. # Example YAML configuration for Llama 3.2 1B pretraining -# This file demonstrates the most commonly customized settings. -# Uncomment and modify values as needed for your use case. +# This file demonstrates commonly customized settings. +# Modify values as needed for your use case. # Data configuration data: @@ -50,8 +50,7 @@ checkpoint: # load: ./checkpoints/llama32_1b/iter_0000050 # Model configuration -# Llama 3.2 1B defaults: TP=1, PP=1, CP=1 (works on single GPU) -# Note: seq_length must match data.seq_length +# Note: seq_length must match data.sequence_length model: seq_length: 4096 tensor_model_parallel_size: 1 From 390cb072c74db2bed2f4634feefde8227bbcbf5d Mon Sep 17 00:00:00 2001 From: Ananth Subramaniam Date: Mon, 17 Nov 2025 00:42:14 -0800 Subject: [PATCH 3/7] update examples, include slurm sbatch script Signed-off-by: Ananth Subramaniam --- .../recipes/llama/00_quickstart_pretrain.py | 7 +- ..._finetune.py => 01_quickstart_finetune.py} | 14 +- .../recipes/llama/02_launch_pretrain_local.py | 156 ------------------ ..._with_yaml.py => 02_pretrain_with_yaml.py} | 8 +- ..._with_yaml.py => 03_finetune_with_yaml.py} | 14 +- ...rm.py => 04_launch_slurm_with_nemo_run.py} | 109 ++++++------ examples/recipes/llama/README.md | 106 +++++------- examples/recipes/llama/launch_with_sbatch.sh | 147 +++++++++++++++++ 8 files changed, 273 insertions(+), 288 deletions(-) rename examples/recipes/llama/{00_quickstart_finetune.py => 01_quickstart_finetune.py} (91%) delete mode 100644 examples/recipes/llama/02_launch_pretrain_local.py rename examples/recipes/llama/{01_pretrain_with_yaml.py => 02_pretrain_with_yaml.py} (94%) rename examples/recipes/llama/{01_finetune_with_yaml.py => 03_finetune_with_yaml.py} (90%) rename examples/recipes/llama/{03_launch_pretrain_slurm.py => 04_launch_slurm_with_nemo_run.py} (74%) create mode 100644 examples/recipes/llama/launch_with_sbatch.sh diff --git a/examples/recipes/llama/00_quickstart_pretrain.py b/examples/recipes/llama/00_quickstart_pretrain.py index 4ec50cba7..245cca0be 100644 --- a/examples/recipes/llama/00_quickstart_pretrain.py +++ b/examples/recipes/llama/00_quickstart_pretrain.py @@ -24,8 +24,8 @@ torchrun --nproc_per_node=8 00_quickstart_pretrain.py The script uses sensible defaults and mock data for quick testing. 
-For custom configurations through YAML and Hydra-style overrides, see 01_pretrain_with_yaml.py -For multi-node training, see 02_launch_pretrain_slurm.py +For custom configurations through YAML and Hydra-style overrides, see 02_pretrain_with_yaml.py +For multi-node training, see launch_with_sbatch.sh or 04_launch_slurm_with_nemo_run.py """ from megatron.bridge.recipes.llama import llama32_1b_pretrain_config @@ -44,7 +44,8 @@ def main() -> None: # Uncomment and modify as needed: # For a quick test run: - # config.train.train_iters = 10 + config.train.train_iters = 10 + config.scheduler.lr_warmup_iters = 2 # Use your own data: # config.data.data_path = "/path/to/your/dataset" diff --git a/examples/recipes/llama/00_quickstart_finetune.py b/examples/recipes/llama/01_quickstart_finetune.py similarity index 91% rename from examples/recipes/llama/00_quickstart_finetune.py rename to examples/recipes/llama/01_quickstart_finetune.py index f604cbbbe..a6f8060d6 100644 --- a/examples/recipes/llama/00_quickstart_finetune.py +++ b/examples/recipes/llama/01_quickstart_finetune.py @@ -14,18 +14,15 @@ # limitations under the License. """ -Quickstart: Finetune Llama 3.2 1B with Megatron-Bridge - -This is the simplest way to start finetuning with Megatron-Bridge. -By default, this uses LoRA (Low-Rank Adaptation) for efficient finetuning. +Quickstart: Finetune Llama 3.2 1B with Megatron Bridge Usage: Single GPU with LoRA: - torchrun --nproc_per_node=1 00_quickstart_finetune.py \ + torchrun --nproc_per_node=1 01_quickstart_finetune.py \ --pretrained-checkpoint /path/to/megatron/checkpoint Multiple GPUs (automatic data parallelism): - torchrun --nproc_per_node=8 00_quickstart_finetune.py \ + torchrun --nproc_per_node=8 01_quickstart_finetune.py \ --pretrained-checkpoint /path/to/megatron/checkpoint Prerequisites: @@ -40,6 +37,9 @@ - Using your own dataset - Adjusting LoRA hyperparameters - Switching to full supervised finetuning + +For YAML configuration, see 03_finetune_with_yaml.py +For multi-node training, see launch_with_sbatch.sh or 04_launch_slurm_with_nemo_run.py """ import argparse @@ -75,7 +75,7 @@ def main() -> None: # Load from the pretrained checkpoint config.checkpoint.pretrained_checkpoint = args.pretrained_checkpoint - # === Quick test run (10 iterations) === + # === Quick test run === config.train.train_iters = 10 config.scheduler.lr_warmup_iters = 2 diff --git a/examples/recipes/llama/02_launch_pretrain_local.py b/examples/recipes/llama/02_launch_pretrain_local.py deleted file mode 100644 index afcbfc431..000000000 --- a/examples/recipes/llama/02_launch_pretrain_local.py +++ /dev/null @@ -1,156 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Launch Training Locally with NeMo-Run - -This script demonstrates how to launch training scripts (pretrain or finetune) -using NeMo-Run's LocalExecutor with torchrun. This provides better job management -and logging compared to running torchrun directly. 
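For reference, since this local launcher file is removed by the patch, a minimal sketch of the same LocalExecutor-based launch it describes, using the `nemo_run` `run.Script` / `run.LocalExecutor` calls that appear in this file — illustrative only, not part of the diff or a supported entry point:

```python
# Minimal local launch with NeMo-Run and torchrun (illustrative sketch).
import nemo_run as run

# Wrap an existing training script so NeMo-Run can launch it.
task = run.Script(
    path="00_quickstart_pretrain.py",
    entrypoint="python",
    args=[],  # e.g. ["--config-file", "conf/llama32_1b_pretrain.yaml"]
)

# LocalExecutor with the torchrun launcher spawns one process per GPU.
executor = run.LocalExecutor(ntasks_per_node=2, launcher="torchrun")

with run.Experiment("megatron_bridge_training") as exp:
    exp.add(task, executor=executor, name="training")
    exp.run(detach=False)
```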
- -Prerequisites: Install nemo-run - -Usage: - # Launch pretrain script - python 02_launch_pretrain_local.py --script 00_quickstart_pretrain.py --devices 2 - - # Launch finetune script - python 02_launch_pretrain_local.py --script 00_quickstart_finetune.py --devices 1 - - # Launch with YAML config - python 02_launch_pretrain_local.py \ - --script 01_pretrain_with_yaml.py \ - --devices 2 \ - --config-file conf/llama32_1b_pretrain.yaml - - # Pass CLI overrides to the training script - python 02_launch_pretrain_local.py \ - --script 01_finetune_with_yaml.py \ - --devices 2 \ - --script-args "train.train_iters=500 peft.dim=16" - - # Dry run (see what would be executed) - python 02_launch_pretrain_local.py --script 00_quickstart_pretrain.py --dry-run -""" - -import argparse -import logging -from pathlib import Path - -import nemo_run as run - - -logger = logging.getLogger(__name__) - -SCRIPT_DIR = Path(__file__).parent.resolve() - - -def parse_args() -> argparse.Namespace: - """Parse command-line arguments.""" - parser = argparse.ArgumentParser( - description="Launch training (pretrain/finetune) locally using NeMo-Run", - formatter_class=argparse.RawTextHelpFormatter, - ) - parser.add_argument( - "--script", - type=str, - required=True, - help="Training script to run (e.g., 00_quickstart_pretrain.py, 00_quickstart_finetune.py)", - ) - parser.add_argument( - "--devices", - type=int, - default=1, - help="Number of GPUs to use (default: 1)", - ) - parser.add_argument( - "--config-file", - type=str, - default=None, - help="YAML config file to pass to the training script (optional)", - ) - parser.add_argument( - "--script-args", - type=str, - default="", - help='Additional arguments to pass to the training script (space-separated, e.g., "train.train_iters=100")', - ) - parser.add_argument( - "--experiment-name", - type=str, - default="megatron_bridge_training", - help="Name for the experiment (default: megatron_bridge_training)", - ) - parser.add_argument( - "--dry-run", - action="store_true", - help="Print what would be executed without running", - ) - - return parser.parse_args() - - -def main() -> None: - """Launch training (pretrain/finetune) using NeMo-Run LocalExecutor.""" - args = parse_args() - - # Resolve script path - script_path = SCRIPT_DIR / args.script - if not script_path.exists(): - raise FileNotFoundError(f"Training script not found: {script_path}") - - # Build arguments for the training script - script_args = [] - if args.config_file: - script_args.extend(["--config-file", args.config_file]) - - if args.script_args: - # Split the script args string and add each arg - script_args.extend(args.script_args.split()) - - logger.info("Launching training with NeMo-Run LocalExecutor") - logger.info(f"Script: {script_path.name}") - logger.info(f"GPUs: {args.devices}") - if args.config_file: - logger.info(f"Config: {args.config_file}") - if script_args: - logger.info(f"Script args: {' '.join(script_args)}") - logger.info("") - - # Create the training task - task = run.Script( - path=str(script_path), - entrypoint="python", - args=script_args, - ) - - # Create the local executor with torchrun - executor = run.LocalExecutor( - ntasks_per_node=args.devices, - launcher="torchrun", - ) - - # Run the experiment - with run.Experiment(args.experiment_name) as exp: - exp.add(task, executor=executor, name="training") - exp.run(detach=False, dryrun=args.dry_run) - - if not args.dry_run: - logger.info("Training completed!") - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO, 
format="%(message)s") - main() diff --git a/examples/recipes/llama/01_pretrain_with_yaml.py b/examples/recipes/llama/02_pretrain_with_yaml.py similarity index 94% rename from examples/recipes/llama/01_pretrain_with_yaml.py rename to examples/recipes/llama/02_pretrain_with_yaml.py index 4de837d25..999922324 100644 --- a/examples/recipes/llama/01_pretrain_with_yaml.py +++ b/examples/recipes/llama/02_pretrain_with_yaml.py @@ -21,19 +21,19 @@ Usage: With default config file: - torchrun --nproc_per_node=8 01_pretrain_with_yaml.py + torchrun --nproc_per_node=8 02_pretrain_with_yaml.py With custom config file: - torchrun --nproc_per_node=2 01_pretrain_with_yaml.py \ + torchrun --nproc_per_node=2 02_pretrain_with_yaml.py \ --config-file conf/my_custom_config.yaml With command-line overrides: - torchrun --nproc_per_node=2 01_pretrain_with_yaml.py \ + torchrun --nproc_per_node=2 02_pretrain_with_yaml.py \ train.train_iters=5000 \ train.global_batch_size=256 Combining YAML and CLI (CLI takes precedence): - torchrun --nproc_per_node=2 01_pretrain_with_yaml.py \ + torchrun --nproc_per_node=2 02_pretrain_with_yaml.py \ --config-file conf/llama32_1b_pretrain.yaml \ train.train_iters=10000 diff --git a/examples/recipes/llama/01_finetune_with_yaml.py b/examples/recipes/llama/03_finetune_with_yaml.py similarity index 90% rename from examples/recipes/llama/01_finetune_with_yaml.py rename to examples/recipes/llama/03_finetune_with_yaml.py index c510c1b20..ead33540f 100644 --- a/examples/recipes/llama/01_finetune_with_yaml.py +++ b/examples/recipes/llama/03_finetune_with_yaml.py @@ -21,24 +21,24 @@ Usage: With default config file: - torchrun --nproc_per_node=1 01_finetune_with_yaml.py + torchrun --nproc_per_node=1 03_finetune_with_yaml.py With custom config file: - torchrun --nproc_per_node=2 01_finetune_with_yaml.py \ + torchrun --nproc_per_node=2 03_finetune_with_yaml.py \ --config-file conf/my_finetune_config.yaml With command-line overrides: - torchrun --nproc_per_node=2 01_finetune_with_yaml.py \ + torchrun --nproc_per_node=2 03_finetune_with_yaml.py \ train.train_iters=1000 \ optimizer.lr=5e-5 Full finetuning instead of LoRA: - torchrun --nproc_per_node=2 01_finetune_with_yaml.py \ + torchrun --nproc_per_node=2 03_finetune_with_yaml.py \ --peft none \ train.train_iters=1000 Combining YAML and CLI (CLI takes precedence): - torchrun --nproc_per_node=2 01_finetune_with_yaml.py \ + torchrun --nproc_per_node=2 03_finetune_with_yaml.py \ --config-file conf/llama32_1b_finetune.yaml \ peft.dim=16 \ train.train_iters=2000 @@ -49,7 +49,7 @@ 3. Base recipe defaults (lowest) See conf/ directory for example YAML configurations. -For a pure Python usage see 00_quickstart_finetune.py. +For a pure Python usage see 01_quickstart_finetune.py. """ import argparse @@ -95,7 +95,7 @@ def parse_args() -> Tuple[argparse.Namespace, list[str]]: type=str, default="lora", choices=["lora", "dora", "none"], - help="PEFT method to use (default: lora). Use 'none' for full finetuning.", + help="PEFT method to use. 
Use 'none' for full finetuning.", ) parser.add_argument("--debug", action="store_true", help="Enable debug logging") diff --git a/examples/recipes/llama/03_launch_pretrain_slurm.py b/examples/recipes/llama/04_launch_slurm_with_nemo_run.py similarity index 74% rename from examples/recipes/llama/03_launch_pretrain_slurm.py rename to examples/recipes/llama/04_launch_slurm_with_nemo_run.py index 5d23effdc..fef063e7a 100644 --- a/examples/recipes/llama/03_launch_pretrain_slurm.py +++ b/examples/recipes/llama/04_launch_slurm_with_nemo_run.py @@ -24,14 +24,14 @@ Usage: # From the Slurm cluster (uses LocalTunnel) - python 03_launch_pretrain_slurm.py \ + python 04_launch_slurm_with_nemo_run.py \ --script 00_quickstart_pretrain.py \ --nodes 2 \ --partition gpu \ --account my_account # From your local machine (uses SSHTunnel) - python 03_launch_pretrain_slurm.py \ + python 04_launch_slurm_with_nemo_run.py \ --script 00_quickstart_pretrain.py \ --nodes 2 \ --partition gpu \ @@ -42,7 +42,7 @@ --remote-job-dir /home/myusername/nemo-runs # With custom SSH key - python 03_launch_pretrain_slurm.py \ + python 04_launch_slurm_with_nemo_run.py \ --script 00_quickstart_pretrain.py \ --nodes 2 \ --partition gpu \ @@ -53,16 +53,25 @@ --remote-job-dir /home/myusername/nemo-runs \ --identity ~/.ssh/id_rsa - # Launch with custom config - python 03_launch_pretrain_slurm.py \ - --script 01_finetune_with_yaml.py \ + # Launch with custom config (pass arguments to training script) + python 04_launch_slurm_with_nemo_run.py \ + --script 03_finetune_with_yaml.py \ --nodes 1 \ --partition gpu \ --account my_account \ --config-file conf/llama32_1b_finetune.yaml + # Pass CLI overrides to training script + python 04_launch_slurm_with_nemo_run.py \ + --script 02_pretrain_with_yaml.py \ + --nodes 2 \ + --partition gpu \ + --account my_account \ + train.train_iters=5000 \ + optimizer.lr=0.0002 + # With container and custom mounts - python 03_launch_pretrain_slurm.py \ + python 04_launch_slurm_with_nemo_run.py \ --script 00_quickstart_pretrain.py \ --nodes 2 \ --partition gpu \ @@ -70,9 +79,21 @@ --container-image /path/to/container.sqsh \ --mount /data:/data + # Wait for job completion and tail logs + python 04_launch_slurm_with_nemo_run.py \ + --script 00_quickstart_pretrain.py \ + --nodes 2 \ + --partition gpu \ + --account my_account \ + --no-detach \ + --tail-logs + Note: - Use --ssh-tunnel when launching from your local machine - Omit --ssh-tunnel when already on the Slurm cluster (uses LocalTunnel) +- By default, jobs are submitted and detached (--detach) +- Use --no-detach --tail-logs to wait and monitor job output +- Any unknown arguments are forwarded to the training script - Adjust cluster-specific settings (account, partition, container paths) """ @@ -88,7 +109,7 @@ SCRIPT_DIR = Path(__file__).parent.resolve() -def parse_args() -> argparse.Namespace: +def parse_args() -> tuple[argparse.Namespace, list[str]]: """Parse command-line arguments.""" parser = argparse.ArgumentParser( description="Launch training (pretrain/finetune) on Slurm using NeMo-Run", @@ -98,19 +119,19 @@ def parse_args() -> argparse.Namespace: "--script", type=str, required=True, - help="Training script to run (e.g., 00_quickstart_pretrain.py, 00_quickstart_finetune.py)", + help="Training script to run (e.g., 00_quickstart_pretrain.py, 01_quickstart_finetune.py)", ) parser.add_argument( "--nodes", type=int, default=1, - help="Number of nodes to use (default: 1)", + help="Number of nodes to use", ) parser.add_argument( "--devices", type=int, default=8, 
- help="GPUs per node (default: 8)", + help="GPUs per node", ) parser.add_argument( "--partition", @@ -128,7 +149,7 @@ def parse_args() -> argparse.Namespace: "--time", type=str, default="04:00:00", - help="Job time limit (default: 04:00:00)", + help="Job time limit", ) parser.add_argument( "--ssh-tunnel", @@ -154,25 +175,13 @@ def parse_args() -> argparse.Namespace: "--identity", type=str, default=None, - help="Path to SSH private key for authentication (optional)", - ) - parser.add_argument( - "--config-file", - type=str, - default=None, - help="YAML config file to pass to the training script (optional)", - ) - parser.add_argument( - "--script-args", - type=str, - default="", - help="Additional arguments for the training script (space-separated)", + help="Path to SSH private key for authentication", ) parser.add_argument( "--container-image", type=str, default=None, - help="Container image path (optional)", + help="Container image path", ) parser.add_argument( "--mount", @@ -192,13 +201,26 @@ def parse_args() -> argparse.Namespace: action="store_true", help="Print what would be executed without submitting the job", ) + parser.add_argument( + "--detach", + action="store_true", + default=True, + help="Detach from the experiment after submission", + ) + parser.add_argument( + "--tail-logs", + action="store_true", + help="Tail logs after submission (only works with --no-detach)", + ) - return parser.parse_args() + # Use parse_known_args to capture forwarded arguments for the training script + args, forwarded_args = parser.parse_known_args() + return args, forwarded_args def main() -> None: """Launch training (pretrain/finetune) using NeMo-Run SlurmExecutor.""" - args = parse_args() + args, forwarded_args = parse_args() # Validate SSH tunnel arguments if args.ssh_tunnel: @@ -210,13 +232,8 @@ def main() -> None: if not script_path.exists(): raise FileNotFoundError(f"Training script not found: {script_path}") - # Build arguments for the training script - script_args = [] - if args.config_file: - script_args.extend(["--config-file", args.config_file]) - - if args.script_args: - script_args.extend(args.script_args.split()) + # Build arguments for the training script from forwarded args + script_args = forwarded_args if forwarded_args else [] # Create the training task task = run.Script( @@ -260,22 +277,20 @@ def main() -> None: if args.mount: executor.container_mounts = args.mount - # Set common environment variables - executor.env_vars = { - "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", - "NCCL_NVLS_ENABLE": "0", - } - # Run the experiment with run.Experiment(args.experiment_name) as exp: exp.add(task, executor=executor, name="training") - exp.run(detach=True, dryrun=args.dry_run) - if args.dry_run: - logger.info("Dry run completed - no job was submitted") - else: - logger.info("Job submitted to Slurm!") - logger.info("Use 'squeue' to check job status") + if args.dry_run: + exp.dryrun() + else: + exp.run(detach=args.detach, tail_logs=args.tail_logs) + + if args.detach: + logger.info("Job submitted to Slurm!") + logger.info("Use 'squeue' to check job status") + else: + logger.info("Job completed!") if __name__ == "__main__": diff --git a/examples/recipes/llama/README.md b/examples/recipes/llama/README.md index 82b04d396..bed7d357d 100644 --- a/examples/recipes/llama/README.md +++ b/examples/recipes/llama/README.md @@ -23,7 +23,7 @@ python ../../conversion/convert_checkpoints.py import \ Then run finetuning: ```bash -torchrun --nproc_per_node=1 00_quickstart_finetune.py \ +torchrun --nproc_per_node=1 
01_quickstart_finetune.py \ --pretrained-checkpoint ./checkpoints/llama32_1b ``` @@ -40,7 +40,7 @@ config.data.data_path = "/path/to/your/dataset" For more complex configurations, use YAML files and command-line overrides: ```bash -torchrun --nproc_per_node=2 01_pretrain_with_yaml.py \ +torchrun --nproc_per_node=2 02_pretrain_with_yaml.py \ --config-file conf/llama32_1b_pretrain.yaml ``` @@ -54,18 +54,18 @@ Example YAML (`conf/llama32_1b_pretrain.yaml`): # Each section maps to a ConfigContainer field data: # GPTDatasetConfig data_path: /path/to/training/data - seq_length: 4096 + sequence_length: 4096 train: # TrainingConfig - train_iters: 10000 + train_iters: 100 global_batch_size: 256 checkpoint: # CheckpointConfig save: ./checkpoints/llama32_1b - save_interval: 1000 + save_interval: 50 model: # Model Provider - seq_length: 4096 # Must match data.seq_length + seq_length: 4096 # Must match data.sequence_length tensor_model_parallel_size: 1 optimizer: # OptimizerConfig @@ -77,7 +77,7 @@ Override from command line using dot notation: Command-line overrides follow the same pattern as YAML structure. The first part before the dot indicates which subconfig of ConfigContainer to override (e.g., `train`, `model`, `optimizer`), and the part after the dot specifies the field within that subconfig. ```bash -torchrun --nproc_per_node=2 01_pretrain_with_yaml.py \ +torchrun --nproc_per_node=2 02_pretrain_with_yaml.py \ --config-file conf/llama32_1b_pretrain.yaml \ train.train_iters=5000 \ train.global_batch_size=512 \ @@ -93,38 +93,40 @@ These example scripts are configured to accept overrides in the priority order ( 2. YAML config file (nested structure) 3. Base recipe defaults (from `llama32_1b_pretrain_config()`) -## Multi-Node Training with NeMo-Run +## Multi-Node Training + +### Direct Slurm with sbatch -### Prerequisites +For traditional HPC workflows without NeMo-Run: ```bash -pip install nemo-run +# 1. Configure launch_with_sbatch.sh +# Edit SBATCH directives and script variables at the top + +# 2. Submit job +sbatch launch_with_sbatch.sh ``` -### Launch Locally +The `launch_with_sbatch.sh` script shows how to: +- Configure Slurm job parameters +- Set up multi-node torchrun +- Use containers (optional) +- Pass arguments to training scripts -Test your setup before going to a cluster. 
Works with both pretrain and finetune scripts: +### NeMo-Run -```bash -# Pretrain -python 02_launch_pretrain_local.py \ - --script 00_quickstart_pretrain.py \ - --devices 2 +For better job management and remote launching capabilities: -# Finetune -python 02_launch_pretrain_local.py \ - --script 00_quickstart_finetune.py \ - --devices 1 -``` +Prerequisites: -### Launch on Slurm - -For multi-node training on Slurm clusters: +```bash +pip install nemo-run +``` -From the cluster (LocalTunnel): +From the Slurm cluster (LocalTunnel): ```bash -python 03_launch_pretrain_slurm.py \ +python 04_launch_slurm_with_nemo_run.py \ --script 00_quickstart_pretrain.py \ --nodes 2 \ --devices 8 \ @@ -135,7 +137,7 @@ python 03_launch_pretrain_slurm.py \ From your local machine (SSHTunnel): ```bash -python 03_launch_pretrain_slurm.py \ +python 04_launch_slurm_with_nemo_run.py \ --script 00_quickstart_pretrain.py \ --nodes 2 \ --devices 8 \ @@ -150,8 +152,8 @@ python 03_launch_pretrain_slurm.py \ With custom config: ```bash -python 03_launch_pretrain_slurm.py \ - --script 01_finetune_with_yaml.py \ +python 04_launch_slurm_with_nemo_run.py \ + --script 03_finetune_with_yaml.py \ --nodes 1 \ --devices 8 \ --partition gpu \ @@ -174,7 +176,7 @@ python ../../conversion/convert_checkpoints.py import \ Run finetuning: ```bash -torchrun --nproc_per_node=1 00_quickstart_finetune.py \ +torchrun --nproc_per_node=1 01_quickstart_finetune.py \ --pretrained-checkpoint ./checkpoints/llama32_1b ``` @@ -200,7 +202,7 @@ config.peft.alpha = 32 # LoRA alpha scaling For more complex finetuning configurations: ```bash -torchrun --nproc_per_node=2 01_finetune_with_yaml.py \ +torchrun --nproc_per_node=2 03_finetune_with_yaml.py \ --config-file conf/llama32_1b_finetune.yaml ``` @@ -213,13 +215,13 @@ data: # FinetuningDatasetConfig seq_length: 4096 train: # TrainingConfig - train_iters: 1000 + train_iters: 100 global_batch_size: 128 checkpoint: # CheckpointConfig pretrained_checkpoint: /path/to/pretrained/checkpoint save: ./checkpoints/llama32_1b_finetuned - save_interval: 500 + save_interval: 50 peft: # PEFT (LoRA config) dim: 8 # LoRA rank @@ -229,7 +231,7 @@ model: # Model Provider seq_length: 4096 # Must match data.seq_length optimizer: # OptimizerConfig - lr: 0.0001 # Higher LR for LoRA + lr: 0.0001 ``` Override from command line using dot notation: @@ -237,7 +239,7 @@ Override from command line using dot notation: The first part before the dot indicates which ConfigContainer subconfig to override, and the part after specifies the field. ```bash -torchrun --nproc_per_node=2 01_finetune_with_yaml.py \ +torchrun --nproc_per_node=2 03_finetune_with_yaml.py \ --config-file conf/llama32_1b_finetune.yaml \ peft.dim=16 \ train.train_iters=2000 @@ -248,42 +250,18 @@ Here, `peft.dim=16` overrides `ConfigContainer.peft.dim`. Full finetuning (no LoRA): ```bash -torchrun --nproc_per_node=2 01_finetune_with_yaml.py \ +torchrun --nproc_per_node=2 03_finetune_with_yaml.py \ --peft none \ train.train_iters=1000 ``` -### Multi-Node Finetuning - -Use the same launchers for finetuning. 
- -Local: - -```bash -python 02_launch_pretrain_local.py \ - --script 00_quickstart_finetune.py \ - --devices 1 \ - --script-args "--pretrained-checkpoint ./checkpoints/llama32_1b" -``` - -Slurm: - -```bash -python 03_launch_pretrain_slurm.py \ - --script 01_finetune_with_yaml.py \ - --nodes 1 \ - --partition gpu \ - --account my_account \ - --config-file conf/llama32_1b_finetune.yaml -``` - ### Working with Checkpoints **Important:** Finetuning requires checkpoints in Megatron format. You cannot use HuggingFace checkpoints directly. You can obtain Megatron checkpoints by: -1. Converting from HuggingFace (recommended for starting from public models) +1. Converting from HuggingFace 2. Using Megatron checkpoints from your own pretraining runs Convert HuggingFace checkpoint to Megatron format: @@ -298,10 +276,10 @@ Use the checkpoint: ```bash # Command line (quickstart scripts) -torchrun --nproc_per_node=1 00_quickstart_finetune.py \ +torchrun --nproc_per_node=1 01_quickstart_finetune.py \ --pretrained-checkpoint ./checkpoints/llama32_1b -# YAML config (01_finetune_with_yaml.py) +# YAML config (03_finetune_with_yaml.py) # In conf/llama32_1b_finetune.yaml: # checkpoint: # pretrained_checkpoint: ./checkpoints/llama32_1b diff --git a/examples/recipes/llama/launch_with_sbatch.sh b/examples/recipes/llama/launch_with_sbatch.sh new file mode 100644 index 000000000..e75690d27 --- /dev/null +++ b/examples/recipes/llama/launch_with_sbatch.sh @@ -0,0 +1,147 @@ +#!/bin/bash +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#SBATCH --job-name=megatron-bridge-train +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node=8 +#SBATCH --gpus-per-node=8 +#SBATCH --time=04:00:00 +#SBATCH --partition=gpu +#SBATCH --account=my_account +#SBATCH --output=logs/train_%j.out +#SBATCH --error=logs/train_%j.err +#SBATCH --exclusive + +# ============================================================================== +# Direct Slurm Launch with sbatch +# +# This script demonstrates how to launch training directly using sbatch without +# NeMo-Run. This is useful if you prefer traditional HPC workflows or don't want +# to install additional dependencies. +# +# Usage: +# 1. Modify the #SBATCH directives above for your cluster +# 2. Set the TRAINING_SCRIPT and other variables below +# 3. 
Submit: sbatch launch_with_sbatch.sh +# +# For NeMo-Run based launching (recommended), see 04_launch_slurm_with_nemo_run.py +# ============================================================================== + +# ============================================================================== +# CONFIGURATION - Modify these for your setup +# ============================================================================== + +# Training script to run (choose one) +TRAINING_SCRIPT="00_quickstart_pretrain.py" +# TRAINING_SCRIPT="01_quickstart_finetune.py" +# TRAINING_SCRIPT="02_pretrain_with_yaml.py" +# TRAINING_SCRIPT="03_finetune_with_yaml.py" + +# Optional: YAML config file (for *_with_yaml.py scripts) +CONFIG_FILE="" +# CONFIG_FILE="conf/llama32_1b_pretrain.yaml" +# CONFIG_FILE="conf/llama32_1b_finetune.yaml" + +# Optional: Additional CLI overrides (for *_with_yaml.py scripts) +CLI_OVERRIDES="" +# CLI_OVERRIDES="train.train_iters=1000 train.global_batch_size=512" + +# Optional: For finetuning scripts, specify checkpoint path +PRETRAINED_CHECKPOINT="" +# PRETRAINED_CHECKPOINT="./checkpoints/llama32_1b" + +# Container image (optional, only if using containers) +CONTAINER_IMAGE="" +# CONTAINER_IMAGE="/path/to/container.sqsh" + +# Container mounts (optional, space-separated) +CONTAINER_MOUNTS="" +# CONTAINER_MOUNTS="/data:/data /model:/model" + +# ============================================================================== +# Environment Setup +# ============================================================================== + +# Set common environment variables +# Optional: Set these if needed +# export CUDA_DEVICE_MAX_CONNECTIONS=1 +# export NCCL_DEBUG=INFO + +# ============================================================================== +# Job Execution +# ============================================================================== + +echo "======================================" +echo "Megatron Bridge Training Job" +echo "======================================" +echo "Job ID: $SLURM_JOB_ID" +echo "Nodes: $SLURM_JOB_NUM_NODES" +echo "GPUs per node: $SLURM_GPUS_PER_NODE" +echo "Script: $TRAINING_SCRIPT" +echo "======================================" + +# Build the command +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SCRIPT_PATH="${SCRIPT_DIR}/${TRAINING_SCRIPT}" + +# Build torchrun command +CMD="torchrun" +CMD="$CMD --nproc_per_node=$SLURM_GPUS_PER_NODE" +CMD="$CMD --nnodes=$SLURM_JOB_NUM_NODES" +CMD="$CMD --node_rank=\$SLURM_PROCID" +CMD="$CMD --master_addr=\$(scontrol show hostname \$SLURM_NODELIST | head -n1)" +CMD="$CMD --master_port=29500" +CMD="$CMD $SCRIPT_PATH" + +# Add config file if specified +if [ -n "$CONFIG_FILE" ]; then + CMD="$CMD --config-file $CONFIG_FILE" +fi + +# Add pretrained checkpoint if specified (for finetuning) +if [ -n "$PRETRAINED_CHECKPOINT" ]; then + CMD="$CMD --pretrained-checkpoint $PRETRAINED_CHECKPOINT" +fi + +# Add CLI overrides if specified +if [ -n "$CLI_OVERRIDES" ]; then + CMD="$CMD $CLI_OVERRIDES" +fi + +echo "Executing: $CMD" +echo "======================================" + +# Execute with or without container +if [ -n "$CONTAINER_IMAGE" ]; then + # With container + SRUN_CMD="srun --container-image=$CONTAINER_IMAGE" + + # Add container mounts + if [ -n "$CONTAINER_MOUNTS" ]; then + for mount in $CONTAINER_MOUNTS; do + SRUN_CMD="$SRUN_CMD --container-mounts=$mount" + done + fi + + $SRUN_CMD bash -c "$CMD" +else + # Without container + srun bash -c "$CMD" +fi + +echo "======================================" +echo "Job completed" +echo 
"======================================" + From 5a921bf45292d7218ee62a01cf65fc6d97462d68 Mon Sep 17 00:00:00 2001 From: Ananth Subramaniam Date: Mon, 24 Nov 2025 04:07:55 -0800 Subject: [PATCH 4/7] move to tutorials Signed-off-by: Ananth Subramaniam --- {examples => tutorials}/recipes/llama/00_quickstart_pretrain.py | 0 {examples => tutorials}/recipes/llama/01_quickstart_finetune.py | 0 {examples => tutorials}/recipes/llama/02_pretrain_with_yaml.py | 0 {examples => tutorials}/recipes/llama/03_finetune_with_yaml.py | 0 .../recipes/llama/04_launch_slurm_with_nemo_run.py | 0 {examples => tutorials}/recipes/llama/README.md | 0 .../recipes/llama/conf/llama32_1b_finetune.yaml | 0 .../recipes/llama/conf/llama32_1b_pretrain.yaml | 0 {examples => tutorials}/recipes/llama/launch_with_sbatch.sh | 0 9 files changed, 0 insertions(+), 0 deletions(-) rename {examples => tutorials}/recipes/llama/00_quickstart_pretrain.py (100%) rename {examples => tutorials}/recipes/llama/01_quickstart_finetune.py (100%) rename {examples => tutorials}/recipes/llama/02_pretrain_with_yaml.py (100%) rename {examples => tutorials}/recipes/llama/03_finetune_with_yaml.py (100%) rename {examples => tutorials}/recipes/llama/04_launch_slurm_with_nemo_run.py (100%) rename {examples => tutorials}/recipes/llama/README.md (100%) rename {examples => tutorials}/recipes/llama/conf/llama32_1b_finetune.yaml (100%) rename {examples => tutorials}/recipes/llama/conf/llama32_1b_pretrain.yaml (100%) rename {examples => tutorials}/recipes/llama/launch_with_sbatch.sh (100%) diff --git a/examples/recipes/llama/00_quickstart_pretrain.py b/tutorials/recipes/llama/00_quickstart_pretrain.py similarity index 100% rename from examples/recipes/llama/00_quickstart_pretrain.py rename to tutorials/recipes/llama/00_quickstart_pretrain.py diff --git a/examples/recipes/llama/01_quickstart_finetune.py b/tutorials/recipes/llama/01_quickstart_finetune.py similarity index 100% rename from examples/recipes/llama/01_quickstart_finetune.py rename to tutorials/recipes/llama/01_quickstart_finetune.py diff --git a/examples/recipes/llama/02_pretrain_with_yaml.py b/tutorials/recipes/llama/02_pretrain_with_yaml.py similarity index 100% rename from examples/recipes/llama/02_pretrain_with_yaml.py rename to tutorials/recipes/llama/02_pretrain_with_yaml.py diff --git a/examples/recipes/llama/03_finetune_with_yaml.py b/tutorials/recipes/llama/03_finetune_with_yaml.py similarity index 100% rename from examples/recipes/llama/03_finetune_with_yaml.py rename to tutorials/recipes/llama/03_finetune_with_yaml.py diff --git a/examples/recipes/llama/04_launch_slurm_with_nemo_run.py b/tutorials/recipes/llama/04_launch_slurm_with_nemo_run.py similarity index 100% rename from examples/recipes/llama/04_launch_slurm_with_nemo_run.py rename to tutorials/recipes/llama/04_launch_slurm_with_nemo_run.py diff --git a/examples/recipes/llama/README.md b/tutorials/recipes/llama/README.md similarity index 100% rename from examples/recipes/llama/README.md rename to tutorials/recipes/llama/README.md diff --git a/examples/recipes/llama/conf/llama32_1b_finetune.yaml b/tutorials/recipes/llama/conf/llama32_1b_finetune.yaml similarity index 100% rename from examples/recipes/llama/conf/llama32_1b_finetune.yaml rename to tutorials/recipes/llama/conf/llama32_1b_finetune.yaml diff --git a/examples/recipes/llama/conf/llama32_1b_pretrain.yaml b/tutorials/recipes/llama/conf/llama32_1b_pretrain.yaml similarity index 100% rename from examples/recipes/llama/conf/llama32_1b_pretrain.yaml rename to 
tutorials/recipes/llama/conf/llama32_1b_pretrain.yaml diff --git a/examples/recipes/llama/launch_with_sbatch.sh b/tutorials/recipes/llama/launch_with_sbatch.sh similarity index 100% rename from examples/recipes/llama/launch_with_sbatch.sh rename to tutorials/recipes/llama/launch_with_sbatch.sh From 68f45cddc24ed6d6cbadd9eb17b1b40eb44ae536 Mon Sep 17 00:00:00 2001 From: Ananth Subramaniam Date: Mon, 24 Nov 2025 08:08:10 -0800 Subject: [PATCH 5/7] move to tutorials Signed-off-by: Ananth Subramaniam --- .../recipes/llama/01_quickstart_finetune.py | 17 +- tutorials/recipes/llama/README.md | 223 ++++++------------ .../llama/conf/llama32_1b_finetune.yaml | 5 +- .../llama/conf/llama32_1b_pretrain.yaml | 3 +- 4 files changed, 89 insertions(+), 159 deletions(-) diff --git a/tutorials/recipes/llama/01_quickstart_finetune.py b/tutorials/recipes/llama/01_quickstart_finetune.py index a6f8060d6..6a8af8e80 100644 --- a/tutorials/recipes/llama/01_quickstart_finetune.py +++ b/tutorials/recipes/llama/01_quickstart_finetune.py @@ -85,15 +85,14 @@ def main() -> None: # === Use your own dataset === # Replace SQuAD with your custom dataset # Option 1: Simple path override - # config.data.data_path = "/path/to/your/dataset.jsonl" - - # Option 2: Use FinetuningDatasetConfig for custom JSONL datasets - # from megatron.bridge.training.data import FinetuningDatasetConfig - # config.data = FinetuningDatasetConfig(data_path="/path/to/your/dataset.jsonl") - - # Option 3: Use HFDatasetConfig for HuggingFace datasets - # from megatron.bridge.training.data import HFDatasetConfig - # config.data = HFDatasetConfig(hf_dataset="squad", split="train") + # config.dataset.dataset_root = "/path/to/your/dataset" + + # Or replace the dataset with FinetuningDatasetConfig for JSONL data + # from megatron.bridge.training.config import FinetuningDatasetConfig + # config.dataset = FinetuningDatasetConfig( + # dataset_root="/path/to/your/dataset_dir", # expects training/validation/test jsonl files + # seq_length=config.model.seq_length, + # ) # === Adjust learning rate === # config.optimizer.lr = 5e-5 diff --git a/tutorials/recipes/llama/README.md b/tutorials/recipes/llama/README.md index bed7d357d..b5306707d 100644 --- a/tutorials/recipes/llama/README.md +++ b/tutorials/recipes/llama/README.md @@ -1,4 +1,4 @@ -# Llama Recipes with Megatron Bridge +# Recipes with Megatron Bridge This guide shows you how to pretrain and finetune Llama models using Megatron Bridge. @@ -12,7 +12,9 @@ torchrun --nproc_per_node=1 00_quickstart_pretrain.py This runs Llama 3.2 1B pretraining on a single GPU with mock data. -For finetuning, you need a checkpoint in Megatron format. Convert from HuggingFace: +For finetuning, you first need a checkpoint in Megatron format. Convert from HuggingFace using the `AutoBridge`: + +> **Note:** You must be authenticated with Hugging Face to download the model. Run `hf auth login --token $HF_TOKEN` if needed. ```bash python ../../conversion/convert_checkpoints.py import \ @@ -27,32 +29,46 @@ torchrun --nproc_per_node=1 01_quickstart_finetune.py \ --pretrained-checkpoint ./checkpoints/llama32_1b ``` -This finetunes Llama 3.2 1B using LoRA on the SQuAD dataset. +The [01_quickstart_finetune.py](01_quickstart_finetune.py) recipe finetunes Llama 3.2 1B using LoRA on the SQuAD dataset by default. 
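If you later substitute your own data for SQuAD (as described just below), the recipe expects JSONL files on disk. A small, hypothetical record-writing sketch — the `input`/`output` field names are assumed from the SQuAD-style convention and should be checked against your Megatron-Bridge version:

```python
# Hypothetical sketch: writing a tiny JSONL finetuning split.
import json

records = [
    {"input": "Context: ... Question: ...", "output": "..."},  # assumed field names
]
with open("training.jsonl", "w") as f:
    for rec in records:
        f.write(json.dumps(rec) + "\n")
```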
-To use real data, uncomment and modify in the script: +To plug in your own JSONL dataset, swap the dataset config in that script: ```python -config.data.data_path = "/path/to/your/dataset" +from megatron.bridge.training.config import FinetuningDatasetConfig + +config.dataset = FinetuningDatasetConfig( + dataset_root="/path/to/dataset_dir", # contains training/validation/test jsonl files + seq_length=config.model.seq_length, +) ``` -## Configuration with YAML +## Configuration + +Megatron Bridge recipes are standard Python scripts, giving you full flexibility in how you configure your training. You can: +1. Modify the Python scripts directly +2. Use the framework's YAML-based configuration system +3. Implement your own configuration management (ArgParse, Hydra, etc.) + +### Using Framework YAML Configs + +The recipes include optional support for YAML configuration and dot-notation overrides via `ConfigContainer`. This is just one way to manage config; you are free to use other methods. -For more complex configurations, use YAML files and command-line overrides: +To use the provided YAML system: ```bash torchrun --nproc_per_node=2 02_pretrain_with_yaml.py \ --config-file conf/llama32_1b_pretrain.yaml ``` -Understanding YAML Configuration: +Understanding the YAML Structure: -YAML files should be organized into sections that mirror the `ConfigContainer` structure. Each top-level key corresponds to a configuration section (e.g., `data`, `train`, `model`, `optimizer`). Overrides are applied in a nested manner according to the ConfigContainer fields. +YAML files mirror the `ConfigContainer` structure. Each top-level key corresponds to a configuration section (e.g., `dataset`, `train`, `model`, `optimizer`). Example YAML (`conf/llama32_1b_pretrain.yaml`): ```yaml # Each section maps to a ConfigContainer field -data: # GPTDatasetConfig +dataset: # GPTDatasetConfig data_path: /path/to/training/data sequence_length: 4096 @@ -72,9 +88,9 @@ optimizer: # OptimizerConfig lr: 0.0003 ``` -Override from command line using dot notation: +Command-Line Overrides: -Command-line overrides follow the same pattern as YAML structure. The first part before the dot indicates which subconfig of ConfigContainer to override (e.g., `train`, `model`, `optimizer`), and the part after the dot specifies the field within that subconfig. +You can override values using dot notation (`section.field=value`): ```bash torchrun --nproc_per_node=2 02_pretrain_with_yaml.py \ @@ -84,14 +100,55 @@ torchrun --nproc_per_node=2 02_pretrain_with_yaml.py \ optimizer.lr=0.0002 ``` -In this example: -- `train.train_iters=5000` → overrides `ConfigContainer.train.train_iters` -- `optimizer.lr=0.0002` → overrides `ConfigContainer.optimizer.lr` +Priority order (highest to lowest): +1. Command-line overrides +2. YAML config file +3. Base recipe defaults + +### Finetuning Configuration + +For more complex finetuning configurations: + +```bash +torchrun --nproc_per_node=2 03_finetune_with_yaml.py \ + --config-file conf/llama32_1b_finetune.yaml +``` + +Example YAML (`conf/llama32_1b_finetune.yaml`): + +```yaml +# Each section maps to a ConfigContainer field +dataset: # FinetuningDatasetConfig + data_path: /path/to/finetuning_dataset.jsonl + seq_length: 4096 + +train: # TrainingConfig + train_iters: 100 + global_batch_size: 128 -These example scripts are configured to accept overrides in the priority order (highest to lowest): -1. Command-line overrides (dot notation: `section.field=value`) -2. YAML config file (nested structure) -3. 
Base recipe defaults (from `llama32_1b_pretrain_config()`) +checkpoint: # CheckpointConfig + pretrained_checkpoint: /path/to/pretrained/checkpoint + save: ./checkpoints/llama32_1b_finetuned + save_interval: 50 + +peft: # PEFT (LoRA config) + dim: 8 # LoRA rank + alpha: 16 # LoRA alpha + +model: # Model Provider + seq_length: 4096 # Must match data.seq_length + +optimizer: # OptimizerConfig + lr: 0.0001 +``` + +Full Finetuning (No LoRA) + +```bash +torchrun --nproc_per_node=2 03_finetune_with_yaml.py \ + --peft none \ + train.train_iters=1000 +``` ## Multi-Node Training @@ -115,7 +172,7 @@ The `launch_with_sbatch.sh` script shows how to: ### NeMo-Run -For better job management and remote launching capabilities: +For job management and remote launching capabilities: Prerequisites: @@ -123,7 +180,7 @@ Prerequisites: pip install nemo-run ``` -From the Slurm cluster (LocalTunnel): +From the Slurm cluster login node: ```bash python 04_launch_slurm_with_nemo_run.py \ @@ -160,127 +217,3 @@ python 04_launch_slurm_with_nemo_run.py \ --account my_account \ --config-file conf/llama32_1b_finetune.yaml ``` - -## Finetuning - -### Quickstart: Finetune with LoRA - -Prerequisites: You need a checkpoint in Megatron format. Convert from HuggingFace: - -```bash -python ../../conversion/convert_checkpoints.py import \ - --hf-model meta-llama/Llama-3.2-1B \ - --megatron-path ./checkpoints/llama32_1b -``` - -Run finetuning: - -```bash -torchrun --nproc_per_node=1 01_quickstart_finetune.py \ - --pretrained-checkpoint ./checkpoints/llama32_1b -``` - -By default, this: -- Uses LoRA (Low-Rank Adaptation) for efficient finetuning -- Trains on the SQuAD dataset -- Works on a single GPU -- Llama 3.2 1B model - -Customize in the script: - -```python -# Use your own dataset (JSONL format) -config.data.data_path = "/path/to/your/dataset.jsonl" - -# Adjust LoRA hyperparameters -config.peft.dim = 16 # LoRA rank -config.peft.alpha = 32 # LoRA alpha scaling -``` - -### Configuration with YAML - -For more complex finetuning configurations: - -```bash -torchrun --nproc_per_node=2 03_finetune_with_yaml.py \ - --config-file conf/llama32_1b_finetune.yaml -``` - -Example YAML (`conf/llama32_1b_finetune.yaml`): - -```yaml -# Each section maps to a ConfigContainer field -data: # FinetuningDatasetConfig - data_path: /path/to/finetuning_dataset.jsonl - seq_length: 4096 - -train: # TrainingConfig - train_iters: 100 - global_batch_size: 128 - -checkpoint: # CheckpointConfig - pretrained_checkpoint: /path/to/pretrained/checkpoint - save: ./checkpoints/llama32_1b_finetuned - save_interval: 50 - -peft: # PEFT (LoRA config) - dim: 8 # LoRA rank - alpha: 16 # LoRA alpha - -model: # Model Provider - seq_length: 4096 # Must match data.seq_length - -optimizer: # OptimizerConfig - lr: 0.0001 -``` - -Override from command line using dot notation: - -The first part before the dot indicates which ConfigContainer subconfig to override, and the part after specifies the field. - -```bash -torchrun --nproc_per_node=2 03_finetune_with_yaml.py \ - --config-file conf/llama32_1b_finetune.yaml \ - peft.dim=16 \ - train.train_iters=2000 -``` - -Here, `peft.dim=16` overrides `ConfigContainer.peft.dim`. - -Full finetuning (no LoRA): - -```bash -torchrun --nproc_per_node=2 03_finetune_with_yaml.py \ - --peft none \ - train.train_iters=1000 -``` - -### Working with Checkpoints - -**Important:** Finetuning requires checkpoints in Megatron format. You cannot use HuggingFace checkpoints directly. - -You can obtain Megatron checkpoints by: - -1. 
Converting from HuggingFace -2. Using Megatron checkpoints from your own pretraining runs - -Convert HuggingFace checkpoint to Megatron format: - -```bash -python ../../conversion/convert_checkpoints.py import \ - --hf-model meta-llama/Llama-3.2-1B \ - --megatron-path ./checkpoints/llama32_1b -``` - -Use the checkpoint: - -```bash -# Command line (quickstart scripts) -torchrun --nproc_per_node=1 01_quickstart_finetune.py \ - --pretrained-checkpoint ./checkpoints/llama32_1b - -# YAML config (03_finetune_with_yaml.py) -# In conf/llama32_1b_finetune.yaml: -# checkpoint: -# pretrained_checkpoint: ./checkpoints/llama32_1b -``` diff --git a/tutorials/recipes/llama/conf/llama32_1b_finetune.yaml b/tutorials/recipes/llama/conf/llama32_1b_finetune.yaml index fd6bc1a73..5be4051bd 100644 --- a/tutorials/recipes/llama/conf/llama32_1b_finetune.yaml +++ b/tutorials/recipes/llama/conf/llama32_1b_finetune.yaml @@ -17,7 +17,7 @@ # Modify values as needed for your use case. # Data configuration -data: +dataset: # Replace with your dataset path (JSONL format recommended) # data_path: /path/to/your/finetuning_dataset.jsonl seq_length: 4096 @@ -56,7 +56,7 @@ peft: alpha: 16 # LoRA alpha scaling # Model configuration -# Note: seq_length must match data.seq_length +# Note: seq_length must match dataset.seq_length model: seq_length: 4096 tensor_model_parallel_size: 1 @@ -73,4 +73,3 @@ logger: # Random seed rng: seed: 1234 - diff --git a/tutorials/recipes/llama/conf/llama32_1b_pretrain.yaml b/tutorials/recipes/llama/conf/llama32_1b_pretrain.yaml index a46d0c689..378503aad 100644 --- a/tutorials/recipes/llama/conf/llama32_1b_pretrain.yaml +++ b/tutorials/recipes/llama/conf/llama32_1b_pretrain.yaml @@ -17,7 +17,7 @@ # Modify values as needed for your use case. # Data configuration -data: +dataset: # Replace with your dataset path # data_path: /path/to/your/dataset sequence_length: 4096 @@ -67,4 +67,3 @@ logger: # Random seed rng: seed: 1234 - From e41e906827fbe48cde6e5b608aa9c03f4607c653 Mon Sep 17 00:00:00 2001 From: Ananth Subramaniam Date: Tue, 25 Nov 2025 07:40:19 -0800 Subject: [PATCH 6/7] updates Signed-off-by: Ananth Subramaniam --- README.md | 31 +++---------------- src/megatron/bridge/recipes/gemma/gemma2.py | 2 +- .../llama/conf/llama32_1b_finetune.yaml | 7 +++-- .../llama/conf/llama32_1b_pretrain.yaml | 3 +- 4 files changed, 12 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index abbb7be78..7bda6076b 100644 --- a/README.md +++ b/README.md @@ -163,34 +163,13 @@ For more details on supported models, see our documentation: #### Launching Recipes -All recipes are ready to train out of the box, using mock data by default. For an example of how to override the default configuration through YAML or Hydra-style CLI overrides, please have a look at this [script](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/recipes/llama/pretrain_llama3_8b.py). The script can then be launched with `torchrun`. For example, with the aforementioned script: +For a conceptual overview of how recipes are structured, overridden, and launched with either `torchrun` or NeMo-Run, read the [Using Recipes guide](https://docs.nvidia.com/nemo/megatron-bridge/latest/recipe-usage.html). -```sh -torchrun --nproc-per-node=2 pretrain_llama3_8b.py model.tensor_model_parallel_size=1 -``` - -Optionally, Megatron Bridge also supports launching with [NeMo-Run](https://github.com/NVIDIA-NeMo/Run). 
See the following examples for reference on launching with NeMo-Run: - -- [pretrain_llama3_8b_nemo_run_script.py](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/recipes/llama/pretrain_llama3_8b_nemo_run_script.py) -- [pretrain_llama3_8b_nemo_run_partial.py](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/recipes/llama/pretrain_llama3_8b_nemo_run_partial.py) - -These examples can also be run as-is with the Llama 3 8B recipe (with NeMo-Run installed). - -Launch Llama 3 8B pretraining with NeMo-Run's `run.Script`: +Runnable tutorials live in [`tutorials/recipes/llama`](tutorials/recipes/llama) and ship with a detailed README that covers: -```sh -uv run python pretrain_llama3_8b_nemo_run_script.py \ - --nproc-per-node=2 \ - model.pipeline_model_parallel_size=1 \ - train.train_iters=10 # this script passes Hydra-style overrides to the target script -``` - -Launch Llama 3 8B pretraining with NeMo-Run's `run.Partial`: - -```sh -uv run python pretrain_llama3_8b_nemo_run_partial.py \ - --nproc-per-node=2 -``` +- `00_quickstart_pretrain.py` for mock-data pretraining +- `01_quickstart_finetune.py` + LoRA configs +- YAML-driven flows and launch helpers diff --git a/src/megatron/bridge/recipes/gemma/gemma2.py b/src/megatron/bridge/recipes/gemma/gemma2.py index 8987202ee..98c80e3e5 100644 --- a/src/megatron/bridge/recipes/gemma/gemma2.py +++ b/src/megatron/bridge/recipes/gemma/gemma2.py @@ -247,7 +247,7 @@ def _gemma2_common( reset_attention_mask=False, reset_position_ids=False, eod_mask_loss=False, - sequence_length=seq_length, + seq_length=seq_length, num_dataset_builder_threads=1, blend=blend, blend_per_split=blend_per_split, diff --git a/tutorials/recipes/llama/conf/llama32_1b_finetune.yaml b/tutorials/recipes/llama/conf/llama32_1b_finetune.yaml index 5be4051bd..02c3897a6 100644 --- a/tutorials/recipes/llama/conf/llama32_1b_finetune.yaml +++ b/tutorials/recipes/llama/conf/llama32_1b_finetune.yaml @@ -30,8 +30,6 @@ train: eval_iters: 10 eval_interval: 50 - # Load from pretrained checkpoint - # load: /path/to/pretrained/checkpoint # Optimizer configuration optimizer: @@ -48,6 +46,11 @@ scheduler: checkpoint: # Directory to save finetuned checkpoints save: ./checkpoints/llama32_1b_finetuned + # Directory to resume from during training + load: ./checkpoints/llama32_1b + # Directory for pretrained weights in Megatron format + pretrained_checkpoint: ./path/to/pretrained/checkpoint/ + save_interval: 50 # LoRA configuration diff --git a/tutorials/recipes/llama/conf/llama32_1b_pretrain.yaml b/tutorials/recipes/llama/conf/llama32_1b_pretrain.yaml index 378503aad..9b9e8bf1d 100644 --- a/tutorials/recipes/llama/conf/llama32_1b_pretrain.yaml +++ b/tutorials/recipes/llama/conf/llama32_1b_pretrain.yaml @@ -45,9 +45,8 @@ scheduler: checkpoint: # Directory to save checkpoints save: ./checkpoints/llama32_1b + load: ./checkpoints/llama32_1b save_interval: 50 - # Resume from checkpoint (optional) - # load: ./checkpoints/llama32_1b/iter_0000050 # Model configuration # Note: seq_length must match data.sequence_length From eec49d39fdba1bdc3f1817285f42be41c761e105 Mon Sep 17 00:00:00 2001 From: Ananth Subramaniam Date: Tue, 25 Nov 2025 09:50:01 -0800 Subject: [PATCH 7/7] fix docs Signed-off-by: Ananth Subramaniam --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7bda6076b..012a649af 100644 --- a/README.md +++ b/README.md @@ -165,7 +165,7 @@ For more details on supported models, see our 
 documentation: For a conceptual overview of how recipes are structured, overridden, and launched with either `torchrun` or NeMo-Run, read the [Using Recipes guide](https://docs.nvidia.com/nemo/megatron-bridge/latest/recipe-usage.html). -Runnable tutorials live in [`tutorials/recipes/llama`](tutorials/recipes/llama) and ship with a detailed README that covers: +Runnable tutorials live in `tutorials/recipes/llama` and cover: - `00_quickstart_pretrain.py` for mock-data pretraining - `01_quickstart_finetune.py` + LoRA configs