From 4c0a02c26d7b857c323db4a47f3dcc4cae94f4c5 Mon Sep 17 00:00:00 2001 From: Ananth Subramaniam Date: Wed, 12 Nov 2025 03:12:45 -0800 Subject: [PATCH 1/7] redo llama recipe examples Signed-off-by: Ananth Subramaniam --- .../recipes/llama/00_quickstart_finetune.py | 121 +++++++ .../recipes/llama/00_quickstart_pretrain.py | 68 ++++ .../recipes/llama/01_finetune_with_yaml.py | 141 ++++++++ .../recipes/llama/01_pretrain_with_yaml.py | 127 ++++++++ .../recipes/llama/02_launch_pretrain_local.py | 156 +++++++++ .../recipes/llama/03_launch_pretrain_slurm.py | 283 ++++++++++++++++ examples/recipes/llama/README.md | 308 ++++++++++++++++++ .../llama/conf/llama32_1b_finetune.yaml | 82 +++++ .../llama/conf/llama32_1b_pretrain.yaml | 71 ++++ .../llama3_8b_pretrain_override_example.yaml | 65 ---- ...etrain_override_example_megatron_fsdp.yaml | 68 ---- examples/recipes/llama/pretrain_llama3_8b.py | 184 ----------- .../pretrain_llama3_8b_nemo_run_script.py | 150 --------- 13 files changed, 1357 insertions(+), 467 deletions(-) create mode 100644 examples/recipes/llama/00_quickstart_finetune.py create mode 100644 examples/recipes/llama/00_quickstart_pretrain.py create mode 100644 examples/recipes/llama/01_finetune_with_yaml.py create mode 100644 examples/recipes/llama/01_pretrain_with_yaml.py create mode 100644 examples/recipes/llama/02_launch_pretrain_local.py create mode 100644 examples/recipes/llama/03_launch_pretrain_slurm.py create mode 100644 examples/recipes/llama/README.md create mode 100644 examples/recipes/llama/conf/llama32_1b_finetune.yaml create mode 100644 examples/recipes/llama/conf/llama32_1b_pretrain.yaml delete mode 100644 examples/recipes/llama/conf/llama3_8b_pretrain_override_example.yaml delete mode 100644 examples/recipes/llama/conf/llama3_8b_pretrain_override_example_megatron_fsdp.yaml delete mode 100644 examples/recipes/llama/pretrain_llama3_8b.py delete mode 100644 examples/recipes/llama/pretrain_llama3_8b_nemo_run_script.py diff --git a/examples/recipes/llama/00_quickstart_finetune.py b/examples/recipes/llama/00_quickstart_finetune.py new file mode 100644 index 000000000..5b8e4ee83 --- /dev/null +++ b/examples/recipes/llama/00_quickstart_finetune.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Quickstart: Finetune Llama 3.2 1B with Megatron-Bridge + +This is the simplest way to start finetuning with Megatron-Bridge. +By default, this uses LoRA (Low-Rank Adaptation) for efficient finetuning. + +Usage: + Single GPU with LoRA: + torchrun --nproc_per_node=1 00_quickstart_finetune.py \ + --pretrained-checkpoint /path/to/megatron/checkpoint + + Multiple GPUs (automatic data parallelism): + torchrun --nproc_per_node=8 00_quickstart_finetune.py \ + --pretrained-checkpoint /path/to/megatron/checkpoint + +Prerequisites: + You need a checkpoint in Megatron format. You can either: + 1. 
Convert HF checkpoint to Megatron format: + python examples/conversion/convert_checkpoints.py import \ + --hf-model meta-llama/Llama-3.2-1B \ + --megatron-path ./checkpoints/llama32_1b + 2. Use a checkpoint from pretraining (see 00_quickstart_pretrain.py) + +The script uses SQuAD dataset by default. See inline comments for: +- Using your own dataset +- Adjusting LoRA hyperparameters +- Switching to full supervised finetuning +""" + +import argparse + +from megatron.bridge.recipes.llama import llama32_1b_finetune_config +from megatron.bridge.training.finetune import finetune +from megatron.bridge.training.gpt_step import forward_step + + +def parse_args() -> argparse.Namespace: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Finetune Llama 3.2 1B with LoRA", + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.add_argument( + "--pretrained-checkpoint", + type=str, + required=True, + help="Path to pretrained checkpoint in Megatron format", + ) + return parser.parse_args() + + +def main() -> None: + """Run Llama 3.2 1B finetuning with LoRA.""" + args = parse_args() + + # Load the base finetune configuration + # By default: LoRA with rank=8, alpha=16, works on single GPU + config = llama32_1b_finetune_config() + + # Load from the pretrained checkpoint + config.checkpoint.pretrained_checkpoint = args.pretrained_checkpoint + + # === Quick test run (10 iterations) === + config.train.train_iters = 10 + config.scheduler.lr_warmup_iters = 2 + + # ===== OPTIONAL CUSTOMIZATIONS ===== + # Uncomment and modify as needed: + + # === Use your own dataset === + # Replace SQuAD with your custom dataset + # Option 1: Simple path override (uses default FinetuningDatasetConfig) + # config.data.data_path = "/path/to/your/dataset.jsonl" + + # Option 2: Use FinetuningDatasetConfig for custom JSONL datasets + # from megatron.bridge.training.data import FinetuningDatasetConfig + # config.data = FinetuningDatasetConfig(data_path="/path/to/your/dataset.jsonl") + + # Option 3: Use HFDatasetConfig for HuggingFace datasets + # from megatron.bridge.training.data import HFDatasetConfig + # config.data = HFDatasetConfig(hf_dataset="squad", split="train") + + # === Change learning rate === + # config.optimizer.lr = 5e-5 # Default for LoRA: 1e-4 + + # === Modify checkpoint frequency === + # config.train.save_interval = 100 + + # === Adjust LoRA hyperparameters === + # Higher rank = more parameters = potentially better quality but slower + # config.peft.dim = 16 # LoRA rank (default: 8) + # config.peft.alpha = 32 # LoRA alpha scaling (default: 16) + + # === Full supervised finetuning (no LoRA) === + # For full finetuning, reload config with peft=None: + # config = llama32_1b_finetune_config(peft=None) + # config.checkpoint.pretrained_checkpoint = args.pretrained_checkpoint + # Note: Full finetuning uses more memory than LoRA + # The recipe automatically adjusts parallelism for full SFT + + # Start finetuning + finetune(config=config, forward_step_func=forward_step) + + +if __name__ == "__main__": + main() diff --git a/examples/recipes/llama/00_quickstart_pretrain.py b/examples/recipes/llama/00_quickstart_pretrain.py new file mode 100644 index 000000000..5f11d2382 --- /dev/null +++ b/examples/recipes/llama/00_quickstart_pretrain.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Quickstart: Pretrain Llama 3.2 1B with Megatron Bridge + +This is the simplest way to start pretraining with Megatron-Bridge. +We use Llama 3.2 1B because it fits on a single GPU, making it easy to test. + +Usage: + Single GPU: + torchrun --nproc_per_node=1 00_quickstart_pretrain.py + + Multiple GPUs (automatic data parallelism): + torchrun --nproc_per_node=8 00_quickstart_pretrain.py + +The script uses sensible defaults and mock data for quick testing. +For custom configurations, see 01_pretrain_with_yaml.py +For multi-node training, see 02_launch_pretrain_slurm.py +For larger models (8B, 70B), see 01_pretrain_with_yaml.py +""" + +from megatron.bridge.recipes.llama import llama32_1b_pretrain_config +from megatron.bridge.training.gpt_step import forward_step +from megatron.bridge.training.pretrain import pretrain + + +def main() -> None: + """Run Llama 3.2 1B pretraining with default configuration.""" + + # Load the base recipe configuration + # Llama 3.2 1B works on a single GPU (TP=1, PP=1, CP=1) + config = llama32_1b_pretrain_config() + + # OPTIONAL: Customize key settings here + # Uncomment and modify as needed: + + # For a quick test run (10 iterations): + # config.train.train_iters = 10 + + # Use your own data (replace mock data): + # config.data.data_path = "/path/to/your/dataset" + + # Change batch sizes: + # config.train.global_batch_size = 256 + # config.train.micro_batch_size = 2 + + # Modify checkpoint frequency: + # config.train.save_interval = 500 + + # Start pretraining + pretrain(config=config, forward_step_func=forward_step) + + +if __name__ == "__main__": + main() diff --git a/examples/recipes/llama/01_finetune_with_yaml.py b/examples/recipes/llama/01_finetune_with_yaml.py new file mode 100644 index 000000000..c510c1b20 --- /dev/null +++ b/examples/recipes/llama/01_finetune_with_yaml.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Finetune with YAML Configuration and CLI Overrides + +This script demonstrates how to use YAML configuration files and command-line +overrides for finetuning with LoRA or full supervised finetuning (SFT). 
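+
+A YAML override file mirrors the ConfigContainer sections, e.g. (illustrative
+values; see conf/llama32_1b_finetune.yaml for a fuller example):
+
+    checkpoint:
+      pretrained_checkpoint: /path/to/megatron/checkpoint
+    peft:
+      dim: 16
+      alpha: 32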
+ +Usage: + With default config file: + torchrun --nproc_per_node=1 01_finetune_with_yaml.py + + With custom config file: + torchrun --nproc_per_node=2 01_finetune_with_yaml.py \ + --config-file conf/my_finetune_config.yaml + + With command-line overrides: + torchrun --nproc_per_node=2 01_finetune_with_yaml.py \ + train.train_iters=1000 \ + optimizer.lr=5e-5 + + Full finetuning instead of LoRA: + torchrun --nproc_per_node=2 01_finetune_with_yaml.py \ + --peft none \ + train.train_iters=1000 + + Combining YAML and CLI (CLI takes precedence): + torchrun --nproc_per_node=2 01_finetune_with_yaml.py \ + --config-file conf/llama32_1b_finetune.yaml \ + peft.dim=16 \ + train.train_iters=2000 + +Configuration Priority (highest to lowest): + 1. Command-line overrides (highest) + 2. YAML config file + 3. Base recipe defaults (lowest) + +See conf/ directory for example YAML configurations. +For a pure Python usage see 00_quickstart_finetune.py. +""" + +import argparse +import logging +import sys +from pathlib import Path +from typing import Tuple + +from omegaconf import OmegaConf + +from megatron.bridge.recipes.llama import llama32_1b_finetune_config +from megatron.bridge.training.config import ConfigContainer +from megatron.bridge.training.finetune import finetune +from megatron.bridge.training.gpt_step import forward_step +from megatron.bridge.training.utils.omegaconf_utils import ( + apply_overrides, + create_omegaconf_dict_config, + parse_hydra_overrides, +) + + +logger = logging.getLogger(__name__) + +# Default config file location +SCRIPT_DIR = Path(__file__).parent.resolve() +DEFAULT_CONFIG_FILE = SCRIPT_DIR / "conf" / "llama32_1b_finetune.yaml" + + +def parse_args() -> Tuple[argparse.Namespace, list[str]]: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Finetune with YAML configuration and CLI overrides", + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.add_argument( + "--config-file", + type=str, + default=None, + help=f"Path to YAML config file (optional). Default: {DEFAULT_CONFIG_FILE}", + ) + parser.add_argument( + "--peft", + type=str, + default="lora", + choices=["lora", "dora", "none"], + help="PEFT method to use (default: lora). 
Use 'none' for full finetuning.", + ) + parser.add_argument("--debug", action="store_true", help="Enable debug logging") + + # Separate known args from CLI overrides + args, cli_overrides = parser.parse_known_args() + return args, cli_overrides + + +def main() -> None: + """Run finetuning with YAML configuration and CLI overrides.""" + args, cli_overrides = parse_args() + + # Load base configuration from recipe + peft_method = None if args.peft == "none" else args.peft + config: ConfigContainer = llama32_1b_finetune_config(peft=peft_method) + + # Convert to OmegaConf for merging + omega_conf, excluded_fields = create_omegaconf_dict_config(config) + + # Apply YAML overrides if provided + if args.config_file: + config_file_path = Path(args.config_file) + if not config_file_path.exists(): + logger.error(f"Config file not found: {config_file_path}") + sys.exit(1) + + yaml_conf = OmegaConf.load(config_file_path) + omega_conf = OmegaConf.merge(omega_conf, yaml_conf) + + # Apply command-line overrides + if cli_overrides: + omega_conf = parse_hydra_overrides(omega_conf, cli_overrides) + + # Convert back to ConfigContainer + final_config_dict = OmegaConf.to_container(omega_conf, resolve=True) + apply_overrides(config, final_config_dict, excluded_fields) + + # Start finetuning + finetune(config=config, forward_step_func=forward_step) + + +if __name__ == "__main__": + main() diff --git a/examples/recipes/llama/01_pretrain_with_yaml.py b/examples/recipes/llama/01_pretrain_with_yaml.py new file mode 100644 index 000000000..4de837d25 --- /dev/null +++ b/examples/recipes/llama/01_pretrain_with_yaml.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Pretrain with YAML Configuration and CLI Overrides + +This script demonstrates how to use YAML configuration files and command-line +overrides for more complex configuration overrides. + +Usage: + With default config file: + torchrun --nproc_per_node=8 01_pretrain_with_yaml.py + + With custom config file: + torchrun --nproc_per_node=2 01_pretrain_with_yaml.py \ + --config-file conf/my_custom_config.yaml + + With command-line overrides: + torchrun --nproc_per_node=2 01_pretrain_with_yaml.py \ + train.train_iters=5000 \ + train.global_batch_size=256 + + Combining YAML and CLI (CLI takes precedence): + torchrun --nproc_per_node=2 01_pretrain_with_yaml.py \ + --config-file conf/llama32_1b_pretrain.yaml \ + train.train_iters=10000 + +Configuration Priority (highest to lowest): + 1. Command-line overrides (highest) + 2. YAML config file + 3. Base recipe defaults (lowest) + +See conf/ directory for example YAML configurations. +For a pure Python usage see 00_quickstart_pretrain.py. 
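+
+The YAML file uses the same nested sections as the ConfigContainer. A minimal
+sketch with illustrative values (conf/llama32_1b_pretrain.yaml shows the commonly
+tuned fields):
+
+    train:
+      train_iters: 2000
+      global_batch_size: 128
+    optimizer:
+      lr: 0.0002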
+""" + +import argparse +import logging +import sys +from pathlib import Path +from typing import Tuple + +from omegaconf import OmegaConf + +from megatron.bridge.recipes.llama import llama32_1b_pretrain_config +from megatron.bridge.training.config import ConfigContainer +from megatron.bridge.training.gpt_step import forward_step +from megatron.bridge.training.pretrain import pretrain +from megatron.bridge.training.utils.omegaconf_utils import ( + apply_overrides, + create_omegaconf_dict_config, + parse_hydra_overrides, +) + + +logger = logging.getLogger(__name__) + +# Default config file location +SCRIPT_DIR = Path(__file__).parent.resolve() +DEFAULT_CONFIG_FILE = SCRIPT_DIR / "conf" / "llama32_1b_pretrain.yaml" + + +def parse_args() -> Tuple[argparse.Namespace, list[str]]: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Pretrain with YAML configuration and CLI overrides", + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.add_argument( + "--config-file", + type=str, + default=None, + help=f"Path to YAML config file (optional). Default: {DEFAULT_CONFIG_FILE}", + ) + parser.add_argument("--debug", action="store_true", help="Enable debug logging") + + # Separate known args from CLI overrides + args, cli_overrides = parser.parse_known_args() + return args, cli_overrides + + +def main() -> None: + """Run pretraining with YAML configuration and CLI overrides.""" + args, cli_overrides = parse_args() + + # Load base configuration from recipe + config: ConfigContainer = llama32_1b_pretrain_config() + + # Convert to OmegaConf for merging + omega_conf, excluded_fields = create_omegaconf_dict_config(config) + + # Apply YAML overrides if provided + if args.config_file: + config_file_path = Path(args.config_file) + if not config_file_path.exists(): + logger.error(f"Config file not found: {config_file_path}") + sys.exit(1) + + yaml_conf = OmegaConf.load(config_file_path) + omega_conf = OmegaConf.merge(omega_conf, yaml_conf) + + # Apply command-line overrides + if cli_overrides: + omega_conf = parse_hydra_overrides(omega_conf, cli_overrides) + + # Convert back to ConfigContainer + final_config_dict = OmegaConf.to_container(omega_conf, resolve=True) + apply_overrides(config, final_config_dict, excluded_fields) + + # Start pretraining + pretrain(config=config, forward_step_func=forward_step) + + +if __name__ == "__main__": + main() diff --git a/examples/recipes/llama/02_launch_pretrain_local.py b/examples/recipes/llama/02_launch_pretrain_local.py new file mode 100644 index 000000000..afcbfc431 --- /dev/null +++ b/examples/recipes/llama/02_launch_pretrain_local.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Launch Training Locally with NeMo-Run + +This script demonstrates how to launch training scripts (pretrain or finetune) +using NeMo-Run's LocalExecutor with torchrun. 
This provides better job management +and logging compared to running torchrun directly. + +Prerequisites: Install nemo-run + +Usage: + # Launch pretrain script + python 02_launch_pretrain_local.py --script 00_quickstart_pretrain.py --devices 2 + + # Launch finetune script + python 02_launch_pretrain_local.py --script 00_quickstart_finetune.py --devices 1 + + # Launch with YAML config + python 02_launch_pretrain_local.py \ + --script 01_pretrain_with_yaml.py \ + --devices 2 \ + --config-file conf/llama32_1b_pretrain.yaml + + # Pass CLI overrides to the training script + python 02_launch_pretrain_local.py \ + --script 01_finetune_with_yaml.py \ + --devices 2 \ + --script-args "train.train_iters=500 peft.dim=16" + + # Dry run (see what would be executed) + python 02_launch_pretrain_local.py --script 00_quickstart_pretrain.py --dry-run +""" + +import argparse +import logging +from pathlib import Path + +import nemo_run as run + + +logger = logging.getLogger(__name__) + +SCRIPT_DIR = Path(__file__).parent.resolve() + + +def parse_args() -> argparse.Namespace: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Launch training (pretrain/finetune) locally using NeMo-Run", + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.add_argument( + "--script", + type=str, + required=True, + help="Training script to run (e.g., 00_quickstart_pretrain.py, 00_quickstart_finetune.py)", + ) + parser.add_argument( + "--devices", + type=int, + default=1, + help="Number of GPUs to use (default: 1)", + ) + parser.add_argument( + "--config-file", + type=str, + default=None, + help="YAML config file to pass to the training script (optional)", + ) + parser.add_argument( + "--script-args", + type=str, + default="", + help='Additional arguments to pass to the training script (space-separated, e.g., "train.train_iters=100")', + ) + parser.add_argument( + "--experiment-name", + type=str, + default="megatron_bridge_training", + help="Name for the experiment (default: megatron_bridge_training)", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print what would be executed without running", + ) + + return parser.parse_args() + + +def main() -> None: + """Launch training (pretrain/finetune) using NeMo-Run LocalExecutor.""" + args = parse_args() + + # Resolve script path + script_path = SCRIPT_DIR / args.script + if not script_path.exists(): + raise FileNotFoundError(f"Training script not found: {script_path}") + + # Build arguments for the training script + script_args = [] + if args.config_file: + script_args.extend(["--config-file", args.config_file]) + + if args.script_args: + # Split the script args string and add each arg + script_args.extend(args.script_args.split()) + + logger.info("Launching training with NeMo-Run LocalExecutor") + logger.info(f"Script: {script_path.name}") + logger.info(f"GPUs: {args.devices}") + if args.config_file: + logger.info(f"Config: {args.config_file}") + if script_args: + logger.info(f"Script args: {' '.join(script_args)}") + logger.info("") + + # Create the training task + task = run.Script( + path=str(script_path), + entrypoint="python", + args=script_args, + ) + + # Create the local executor with torchrun + executor = run.LocalExecutor( + ntasks_per_node=args.devices, + launcher="torchrun", + ) + + # Run the experiment + with run.Experiment(args.experiment_name) as exp: + exp.add(task, executor=executor, name="training") + exp.run(detach=False, dryrun=args.dry_run) + + if not args.dry_run: + logger.info("Training 
completed!") + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format="%(message)s") + main() diff --git a/examples/recipes/llama/03_launch_pretrain_slurm.py b/examples/recipes/llama/03_launch_pretrain_slurm.py new file mode 100644 index 000000000..5d23effdc --- /dev/null +++ b/examples/recipes/llama/03_launch_pretrain_slurm.py @@ -0,0 +1,283 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Launch Training on Slurm with NeMo-Run + +This script demonstrates how to launch training scripts (pretrain or finetune) +on a Slurm cluster using NeMo-Run. This enables easy multi-node training with +proper job management. + +Prerequisites: Install nemo-run + +Usage: + # From the Slurm cluster (uses LocalTunnel) + python 03_launch_pretrain_slurm.py \ + --script 00_quickstart_pretrain.py \ + --nodes 2 \ + --partition gpu \ + --account my_account + + # From your local machine (uses SSHTunnel) + python 03_launch_pretrain_slurm.py \ + --script 00_quickstart_pretrain.py \ + --nodes 2 \ + --partition gpu \ + --account my_account \ + --ssh-tunnel \ + --host my-cluster.example.com \ + --user myusername \ + --remote-job-dir /home/myusername/nemo-runs + + # With custom SSH key + python 03_launch_pretrain_slurm.py \ + --script 00_quickstart_pretrain.py \ + --nodes 2 \ + --partition gpu \ + --account my_account \ + --ssh-tunnel \ + --host my-cluster.example.com \ + --user myusername \ + --remote-job-dir /home/myusername/nemo-runs \ + --identity ~/.ssh/id_rsa + + # Launch with custom config + python 03_launch_pretrain_slurm.py \ + --script 01_finetune_with_yaml.py \ + --nodes 1 \ + --partition gpu \ + --account my_account \ + --config-file conf/llama32_1b_finetune.yaml + + # With container and custom mounts + python 03_launch_pretrain_slurm.py \ + --script 00_quickstart_pretrain.py \ + --nodes 2 \ + --partition gpu \ + --account my_account \ + --container-image /path/to/container.sqsh \ + --mount /data:/data + +Note: +- Use --ssh-tunnel when launching from your local machine +- Omit --ssh-tunnel when already on the Slurm cluster (uses LocalTunnel) +- Adjust cluster-specific settings (account, partition, container paths) +""" + +import argparse +import logging +from pathlib import Path + +import nemo_run as run + + +logger = logging.getLogger(__name__) + +SCRIPT_DIR = Path(__file__).parent.resolve() + + +def parse_args() -> argparse.Namespace: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Launch training (pretrain/finetune) on Slurm using NeMo-Run", + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.add_argument( + "--script", + type=str, + required=True, + help="Training script to run (e.g., 00_quickstart_pretrain.py, 00_quickstart_finetune.py)", + ) + parser.add_argument( + "--nodes", + type=int, + default=1, + help="Number of nodes to use (default: 1)", + ) + parser.add_argument( + "--devices", + type=int, + default=8, + help="GPUs per node 
(default: 8)", + ) + parser.add_argument( + "--partition", + type=str, + required=True, + help="Slurm partition name", + ) + parser.add_argument( + "--account", + type=str, + required=True, + help="Slurm account name", + ) + parser.add_argument( + "--time", + type=str, + default="04:00:00", + help="Job time limit (default: 04:00:00)", + ) + parser.add_argument( + "--ssh-tunnel", + action="store_true", + help="Use SSH tunnel (for launching from local machine). Requires --host, --user, --remote-job-dir", + ) + parser.add_argument( + "--host", + type=str, + help="SSH host for tunnel (required if --ssh-tunnel is set)", + ) + parser.add_argument( + "--user", + type=str, + help="SSH user for tunnel (required if --ssh-tunnel is set)", + ) + parser.add_argument( + "--remote-job-dir", + type=str, + help="Remote directory to store job files (required if --ssh-tunnel is set)", + ) + parser.add_argument( + "--identity", + type=str, + default=None, + help="Path to SSH private key for authentication (optional)", + ) + parser.add_argument( + "--config-file", + type=str, + default=None, + help="YAML config file to pass to the training script (optional)", + ) + parser.add_argument( + "--script-args", + type=str, + default="", + help="Additional arguments for the training script (space-separated)", + ) + parser.add_argument( + "--container-image", + type=str, + default=None, + help="Container image path (optional)", + ) + parser.add_argument( + "--mount", + type=str, + action="append", + default=[], + help="Container mounts in format host:container (can be specified multiple times)", + ) + parser.add_argument( + "--experiment-name", + type=str, + default="megatron_bridge_training", + help="Name for the experiment", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print what would be executed without submitting the job", + ) + + return parser.parse_args() + + +def main() -> None: + """Launch training (pretrain/finetune) using NeMo-Run SlurmExecutor.""" + args = parse_args() + + # Validate SSH tunnel arguments + if args.ssh_tunnel: + if not all([args.host, args.user, args.remote_job_dir]): + raise ValueError("--ssh-tunnel requires --host, --user, and --remote-job-dir to be specified") + + # Resolve script path + script_path = SCRIPT_DIR / args.script + if not script_path.exists(): + raise FileNotFoundError(f"Training script not found: {script_path}") + + # Build arguments for the training script + script_args = [] + if args.config_file: + script_args.extend(["--config-file", args.config_file]) + + if args.script_args: + script_args.extend(args.script_args.split()) + + # Create the training task + task = run.Script( + path=str(script_path), + entrypoint="python", + args=script_args, + ) + + # Configure tunnel (SSH for remote, Local if already on cluster) + tunnel = None + if args.ssh_tunnel: + tunnel = run.SSHTunnel( + host=args.host, + user=args.user, + job_dir=args.remote_job_dir, + identity=args.identity, + ) + logger.info(f"Using SSH tunnel to {args.user}@{args.host}") + else: + tunnel = run.LocalTunnel() + logger.info("Using LocalTunnel (running on cluster)") + + # Create the Slurm executor + executor = run.SlurmExecutor( + account=args.account, + partition=args.partition, + nodes=args.nodes, + ntasks_per_node=args.devices, + gpus_per_node=args.devices, + mem="0", + exclusive=True, + time=args.time, + tunnel=tunnel, + ) + + # Configure container if specified + if args.container_image: + executor.container_image = args.container_image + + # Configure mounts if specified + if 
args.mount: + executor.container_mounts = args.mount + + # Set common environment variables + executor.env_vars = { + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", + "NCCL_NVLS_ENABLE": "0", + } + + # Run the experiment + with run.Experiment(args.experiment_name) as exp: + exp.add(task, executor=executor, name="training") + exp.run(detach=True, dryrun=args.dry_run) + + if args.dry_run: + logger.info("Dry run completed - no job was submitted") + else: + logger.info("Job submitted to Slurm!") + logger.info("Use 'squeue' to check job status") + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format="%(message)s") + main() diff --git a/examples/recipes/llama/README.md b/examples/recipes/llama/README.md new file mode 100644 index 000000000..1d00e237e --- /dev/null +++ b/examples/recipes/llama/README.md @@ -0,0 +1,308 @@ +# Llama Recipes with Megatron Bridge + +This guide shows you how to pretrain and finetune Llama models using Megatron Bridge. + +## Quickstart + +The fastest way to get started with Megatron Bridge pretraining: + +```bash +torchrun --nproc_per_node=1 00_quickstart_pretrain.py +``` + +This runs Llama 3.2 1B pretraining on a single GPU with mock data. + +For finetuning, you need a checkpoint in Megatron format. Convert from HuggingFace: + +```bash +python ../../conversion/convert_checkpoints.py import \ + --hf-model meta-llama/Llama-3.2-1B \ + --megatron-path ./checkpoints/llama32_1b +``` + +Then run finetuning: + +```bash +torchrun --nproc_per_node=1 00_quickstart_finetune.py \ + --pretrained-checkpoint ./checkpoints/llama32_1b +``` + +This finetunes Llama 3.2 1B using LoRA on the SQuAD dataset. + +To use real data, uncomment and modify in the script: + +```python +config.data.data_path = "/path/to/your/dataset" +``` + +## Configuration with YAML + +For more complex configurations, use YAML files and command-line overrides: + +```bash +torchrun --nproc_per_node=2 01_pretrain_with_yaml.py \ + --config-file conf/llama32_1b_pretrain.yaml +``` + +Understanding YAML Configuration: + +YAML files should be organized into sections that mirror the `ConfigContainer` structure. Each top-level key corresponds to a configuration section (e.g., `data`, `train`, `model`, `optimizer`). Overrides are applied in a nested manner according to the ConfigContainer fields. + +Example YAML (`conf/llama32_1b_pretrain.yaml`): + +```yaml +# Each section maps to a ConfigContainer field +data: # GPTDatasetConfig + data_path: /path/to/training/data + seq_length: 4096 + +train: # TrainingConfig + train_iters: 10000 + global_batch_size: 256 + +checkpoint: # CheckpointConfig + save: ./checkpoints/llama32_1b + save_interval: 1000 + +model: # Model Provider + seq_length: 4096 # Must match data.seq_length + tensor_model_parallel_size: 1 + +optimizer: # OptimizerConfig + lr: 0.0003 +``` + +Override from command line using dot notation: + +Command-line overrides follow the same pattern as YAML structure. The first part before the dot indicates which subconfig of ConfigContainer to override (e.g., `train`, `model`, `optimizer`), and the part after the dot specifies the field within that subconfig. 
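+
+For instance, the override `optimizer.lr=0.0002` sets the same field as this YAML
+snippet:
+
+```yaml
+optimizer:
+  lr: 0.0002
+```
+
+Multiple overrides can be combined in a single command, as shown below.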
+ +```bash +torchrun --nproc_per_node=2 01_pretrain_with_yaml.py \ + --config-file conf/llama32_1b_pretrain.yaml \ + train.train_iters=5000 \ + train.global_batch_size=512 \ + optimizer.lr=0.0002 +``` + +In this example: +- `train.train_iters=5000` → overrides `ConfigContainer.train.train_iters` +- `optimizer.lr=0.0002` → overrides `ConfigContainer.optimizer.lr` + +These example scripts are configured to accept overrides in the priority order (highest to lowest): +1. Command-line overrides (dot notation: `section.field=value`) +2. YAML config file (nested structure) +3. Base recipe defaults (from `llama32_1b_pretrain_config()`) + +## Multi-Node Training with NeMo-Run + +### Prerequisites + +```bash +pip install nemo-run +``` + +### Launch Locally + +Test your setup before going to a cluster. Works with both pretrain and finetune scripts: + +```bash +# Pretrain +python 02_launch_pretrain_local.py \ + --script 00_quickstart_pretrain.py \ + --devices 2 + +# Finetune +python 02_launch_pretrain_local.py \ + --script 00_quickstart_finetune.py \ + --devices 1 +``` + +### Launch on Slurm + +For multi-node training on Slurm clusters: + +From the cluster (LocalTunnel): + +```bash +python 03_launch_pretrain_slurm.py \ + --script 00_quickstart_pretrain.py \ + --nodes 2 \ + --devices 8 \ + --partition gpu \ + --account my_account +``` + +From your local machine (SSHTunnel): + +```bash +python 03_launch_pretrain_slurm.py \ + --script 00_quickstart_pretrain.py \ + --nodes 2 \ + --devices 8 \ + --partition gpu \ + --account my_account \ + --ssh-tunnel \ + --host my-cluster.example.com \ + --user myusername \ + --remote-job-dir /home/myusername/nemo-runs +``` + +With custom config: + +```bash +python 03_launch_pretrain_slurm.py \ + --script 01_finetune_with_yaml.py \ + --nodes 1 \ + --devices 8 \ + --partition gpu \ + --account my_account \ + --config-file conf/llama32_1b_finetune.yaml +``` + +## Finetuning + +### Quickstart: Finetune with LoRA + +Prerequisites: You need a checkpoint in Megatron format. 
Convert from HuggingFace: + +```bash +python ../../conversion/convert_checkpoints.py import \ + --hf-model meta-llama/Llama-3.2-1B \ + --megatron-path ./checkpoints/llama32_1b +``` + +Run finetuning: + +```bash +torchrun --nproc_per_node=1 00_quickstart_finetune.py \ + --pretrained-checkpoint ./checkpoints/llama32_1b +``` + +By default, this: +- Uses LoRA (Low-Rank Adaptation) for efficient finetuning +- Trains on the SQuAD dataset +- Works on a single GPU +- Llama 3.2 1B model + +Customize in the script: + +```python +# Use your own dataset (JSONL format) +config.data.data_path = "/path/to/your/dataset.jsonl" + +# Adjust LoRA hyperparameters +config.peft.dim = 16 # LoRA rank (default: 8) +config.peft.alpha = 32 # LoRA alpha (default: 16) +``` + +### Configuration with YAML + +For more complex finetuning configurations: + +```bash +torchrun --nproc_per_node=2 01_finetune_with_yaml.py \ + --config-file conf/llama32_1b_finetune.yaml +``` + +Example YAML (`conf/llama32_1b_finetune.yaml`): + +```yaml +# Each section maps to a ConfigContainer field +data: # FinetuningDatasetConfig + data_path: /path/to/finetuning_dataset.jsonl + seq_length: 4096 + +train: # TrainingConfig + train_iters: 1000 + global_batch_size: 128 + +checkpoint: # CheckpointConfig + pretrained_checkpoint: /path/to/pretrained/checkpoint + save: ./checkpoints/llama32_1b_finetuned + save_interval: 500 + +peft: # PEFT (LoRA config) + dim: 8 # LoRA rank + alpha: 16 # LoRA alpha + +model: # Model Provider + seq_length: 4096 # Must match data.seq_length + +optimizer: # OptimizerConfig + lr: 0.0001 # Higher LR for LoRA +``` + +Override from command line using dot notation: + +The first part before the dot indicates which ConfigContainer subconfig to override, and the part after specifies the field. + +```bash +torchrun --nproc_per_node=2 01_finetune_with_yaml.py \ + --config-file conf/llama32_1b_finetune.yaml \ + peft.dim=16 \ + train.train_iters=2000 +``` + +Here, `peft.dim=16` overrides `ConfigContainer.peft.dim`. + +Full finetuning (no LoRA): + +```bash +torchrun --nproc_per_node=2 01_finetune_with_yaml.py \ + --peft none \ + train.train_iters=1000 +``` + +### Multi-Node Finetuning + +Use the same launchers for finetuning. + +Local: + +```bash +python 02_launch_pretrain_local.py \ + --script 00_quickstart_finetune.py \ + --devices 1 \ + --script-args "--pretrained-checkpoint ./checkpoints/llama32_1b" +``` + +Slurm: + +```bash +python 03_launch_pretrain_slurm.py \ + --script 01_finetune_with_yaml.py \ + --nodes 1 \ + --partition gpu \ + --account my_account \ + --config-file conf/llama32_1b_finetune.yaml +``` + +### Working with Checkpoints + +**Important:** Finetuning requires checkpoints in Megatron format. You cannot use HuggingFace checkpoints directly. + +You can obtain Megatron checkpoints by: + +1. Converting from HuggingFace (recommended for starting from public models) +2. 
Using Megatron checkpoints from your own pretraining runs + +Convert HuggingFace checkpoint to Megatron format: + +```bash +python ../../conversion/convert_checkpoints.py import \ + --hf-model meta-llama/Llama-3.2-1B \ + --megatron-path ./checkpoints/llama32_1b +``` + +Use the checkpoint: + +```bash +# Command line (quickstart scripts) +torchrun --nproc_per_node=1 00_quickstart_finetune.py \ + --pretrained-checkpoint ./checkpoints/llama32_1b + +# YAML config (01_finetune_with_yaml.py) +# In conf/llama32_1b_finetune.yaml: +# checkpoint: +# pretrained_checkpoint: ./checkpoints/llama32_1b +``` diff --git a/examples/recipes/llama/conf/llama32_1b_finetune.yaml b/examples/recipes/llama/conf/llama32_1b_finetune.yaml new file mode 100644 index 000000000..a7919529d --- /dev/null +++ b/examples/recipes/llama/conf/llama32_1b_finetune.yaml @@ -0,0 +1,82 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Example YAML configuration for Llama 3.2 1B finetuning with LoRA +# This file demonstrates the most commonly customized settings. +# Uncomment and modify values as needed for your use case. + +# Data configuration +data: + # Replace with your dataset path (JSONL format recommended) + # data_path: /path/to/your/finetuning_dataset.jsonl + seq_length: 4096 + +# Training configuration +train: + train_iters: 100 + global_batch_size: 128 + micro_batch_size: 2 + eval_iters: 10 + eval_interval: 50 + + # Load from pretrained checkpoint + # load: /path/to/pretrained/checkpoint + +# Optimizer configuration (higher LR for LoRA) +optimizer: + lr: 0.0001 + min_lr: 0.00001 + weight_decay: 0.0 + +# Learning rate scheduler +scheduler: + lr_warmup_iters: 10 + lr_decay_style: cosine + +# Checkpoint configuration +checkpoint: + # Directory to save finetuned checkpoints + save: ./checkpoints/llama32_1b_finetuned + save_interval: 50 + +# LoRA configuration (default: rank=8, alpha=16) +peft: + dim: 8 # LoRA rank + alpha: 16 # LoRA alpha scaling + +# For full finetuning instead of LoRA: +# peft: null +# And adjust parallelism if needed: +# model: +# tensor_model_parallel_size: 2 + +# Model configuration (LoRA default: single GPU) +# Note: seq_length must match data.seq_length +model: + seq_length: 4096 + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + context_parallel_size: 1 + +# Logging +logger: + log_interval: 10 + tensorboard_dir: ./logs/llama32_1b_finetuned + # wandb_project: my_finetune_project + # wandb_entity: my_team + +# Random seed +rng: + seed: 1234 + diff --git a/examples/recipes/llama/conf/llama32_1b_pretrain.yaml b/examples/recipes/llama/conf/llama32_1b_pretrain.yaml new file mode 100644 index 000000000..8454e7aa1 --- /dev/null +++ b/examples/recipes/llama/conf/llama32_1b_pretrain.yaml @@ -0,0 +1,71 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Example YAML configuration for Llama 3.2 1B pretraining +# This file demonstrates the most commonly customized settings. +# Uncomment and modify values as needed for your use case. + +# Data configuration +data: + # Replace with your dataset path + # data_path: /path/to/your/dataset + sequence_length: 4096 + +# Training configuration +train: + train_iters: 100 + global_batch_size: 256 + micro_batch_size: 2 + eval_iters: 10 + eval_interval: 50 + +# Optimizer configuration +optimizer: + lr: 0.0003 + min_lr: 0.00003 + weight_decay: 0.1 + +# Learning rate scheduler +scheduler: + lr_warmup_iters: 20 + lr_decay_style: cosine + +# Checkpoint configuration +checkpoint: + # Directory to save checkpoints + save: ./checkpoints/llama32_1b + save_interval: 50 + # Resume from checkpoint (optional) + # load: ./checkpoints/llama32_1b/iter_0000050 + +# Model configuration +# Llama 3.2 1B defaults: TP=1, PP=1, CP=1 (works on single GPU) +# Note: seq_length must match data.seq_length +model: + seq_length: 4096 + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + context_parallel_size: 1 + +# Logging +logger: + log_interval: 10 + tensorboard_dir: ./logs/llama32_1b + # wandb_project: my_project # Uncomment to enable W&B logging + # wandb_entity: my_team + +# Random seed +rng: + seed: 1234 + diff --git a/examples/recipes/llama/conf/llama3_8b_pretrain_override_example.yaml b/examples/recipes/llama/conf/llama3_8b_pretrain_override_example.yaml deleted file mode 100644 index 5f55d9988..000000000 --- a/examples/recipes/llama/conf/llama3_8b_pretrain_override_example.yaml +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Example override file - -# To override a parameter, ensure the structure matches the ConfigContainer -# and its sub-configurations (e.g., model, train, etc.) -# Top-level ConfigContainer fields are dataclasses themselves - -model: - seq_length: 4096 - -train: - train_iters: 20 - global_batch_size: 8 - micro_batch_size: 1 - eval_iters: 0 - -optimizer: - lr: 0.00025 - min_lr: 0.000025 - -scheduler: - lr_warmup_iters: 10 - -checkpoint: - # Directory to save to. If null, no checkpoint will be saved. - save: null - -dist: - use_megatron_fsdp: false - use_torch_fsdp2: false - -logger: - log_interval: 1 - -dataset: - seq_length: 4096 - -rng: - seed: 42 - -ddp: - grad_reduce_in_fp32: true - -profiling: - # For optional fields in the config, specify the target to instantiate the object. 
- _target_: megatron.bridge.training.config.ProfilingConfig - use_nsys_profiler: false - profile_step_start: 5 - profile_step_end: 10 - use_pytorch_profiler: true - profile_ranks: [0, 1] - record_shapes: true diff --git a/examples/recipes/llama/conf/llama3_8b_pretrain_override_example_megatron_fsdp.yaml b/examples/recipes/llama/conf/llama3_8b_pretrain_override_example_megatron_fsdp.yaml deleted file mode 100644 index 0e239e284..000000000 --- a/examples/recipes/llama/conf/llama3_8b_pretrain_override_example_megatron_fsdp.yaml +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Example override file - -# To override a parameter, ensure the structure matches the ConfigContainer -# and its sub-configurations (e.g., model, train, etc.) -# Top-level ConfigContainer fields are dataclasses themselves - -model: - seq_length: 4096 - init_model_with_meta_device: true - -train: - train_iters: 20 - global_batch_size: 8 - micro_batch_size: 1 - eval_iters: 0 - -optimizer: - lr: 0.00025 - min_lr: 0.000025 - -scheduler: - lr_warmup_iters: 10 - -checkpoint: - # Directory to save to. If null, no checkpoint will be saved. - save: null - ckpt_format: "fsdp_dtensor" - -dist: - use_megatron_fsdp: true - use_torch_fsdp2: false - -logger: - log_interval: 1 - -dataset: - seq_length: 4096 - -rng: - seed: 42 - -ddp: - grad_reduce_in_fp32: true - data_parallel_sharding_strategy: "optim_grads_params" # for Megatron FSDP ZeRO-3 like sharding - -profiling: - # For optional fields in the config, specify the target to instantiate the object. - _target_: megatron.bridge.training.config.ProfilingConfig - use_nsys_profiler: false - profile_step_start: 5 - profile_step_end: 10 - use_pytorch_profiler: true - profile_ranks: [0, 1] - record_shapes: true diff --git a/examples/recipes/llama/pretrain_llama3_8b.py b/examples/recipes/llama/pretrain_llama3_8b.py deleted file mode 100644 index 76ebde762..000000000 --- a/examples/recipes/llama/pretrain_llama3_8b.py +++ /dev/null @@ -1,184 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Llama3 8B Pretraining Script with YAML and CLI Configuration Overrides. - -This script provides a flexible way to pretrain Llama3 8B models using Megatron-Bridge with support for -both YAML configuration files and command-line overrides using Hydra-style syntax. 
- -Examples: - Basic usage with default configuration: - $ torchrun --nproc_per_node=8 examples/recipes/llama/pretrain_llama3_8b.py - - Using a custom YAML config file: - $ torchrun --nproc_per_node=8 pretrain_llama3_8b.py --config-file my_custom_config.yaml - - Using CLI overrides only: - $ torchrun --nproc_per_node=8 pretrain_llama3_8b.py model.tensor_model_parallel_size=4 train.train_iters=100000 - - Combining YAML and CLI overrides (CLI takes precedence): - $ torchrun --nproc_per_node=8 pretrain_llama3_8b.py --config-file conf/my_config.yaml \ - model.pipeline_dtype=torch.float16 \ - train.global_batch_size=512 - -Configuration Precedence: - 1. Base configuration from pretrain_config() recipe - 2. YAML overrides from --config-file (if provided) - 3. CLI overrides (highest precedence) - -Supported Override Syntax: - - Standard assignment: key=value - - Nested assignment: section.subsection.key=value - - Addition: +new_key=value - - Deletion: ~key_to_remove - - Type conversion: Automatic for basic types (int, float, bool, str) - - Complex types: torch.dtype, enums, etc. are supported -""" - -import argparse -import logging -import os -import sys -from pathlib import Path -from typing import Tuple - -import torch -from omegaconf import OmegaConf - -from megatron.bridge.recipes.llama import llama3_8b_pretrain_config as pretrain_config -from megatron.bridge.training.config import ConfigContainer -from megatron.bridge.training.gpt_step import forward_step -from megatron.bridge.training.pretrain import pretrain -from megatron.bridge.training.utils.omegaconf_utils import ( - apply_overrides, - create_omegaconf_dict_config, - parse_hydra_overrides, -) -from megatron.bridge.utils.common_utils import get_rank_safe - - -logger: logging.Logger = logging.getLogger(__name__) - - -# Define paths relative to this script's location -# Assumes this script (pretrain_llama3_8b.py) is in Megatron-Bridge/examples/recipes/llama/ -# and the config is in a 'conf' subdirectory. -SCRIPT_DIR: Path = Path(__file__).parent.resolve() -DEFAULT_CONFIG_FILENAME: str = "llama3_8b_pretrain_override_example.yaml" -DEFAULT_CONFIG_FILE_PATH: Path = SCRIPT_DIR / "conf" / DEFAULT_CONFIG_FILENAME - - -def parse_cli_args() -> Tuple[argparse.Namespace, list[str]]: - """Parse command line arguments, separating known script args from OmegaConf overrides.""" - parser = argparse.ArgumentParser( - description="Pretrain Llama3 8B model using Megatron-Bridge with YAML and CLI overrides", - formatter_class=argparse.RawTextHelpFormatter, - ) - parser.add_argument( - "--config-file", - type=str, - default=str(DEFAULT_CONFIG_FILE_PATH), - help="Path to the YAML OmegaConf override file. Default: conf/llama3_8b_pretrain_override_example.yaml", - ) - parser.add_argument("--debug", action="store_true", help="Enable debug logging") - - # Parse known args for the script, remaining will be treated as overrides - args, cli_dotlist_overrides = parser.parse_known_args() - return args, cli_dotlist_overrides - - -def main() -> None: - """ - Entry point for the Llama3 8B pretraining script. - - This function orchestrates the complete configuration workflow: - 1. Loads the base configuration from pretrain_config() recipe - 2. Applies YAML overrides from --config-file (if exists) - 3. Applies CLI overrides using Hydra-style syntax - 4. Starts Megatron pretraining with the final merged configuration - - Configuration merging preserves callable fields (like activation functions) - and handles type conversions automatically. 
- - Examples of CLI usage: - # Use default config with custom learning rate - torchrun --nproc_per_node=8 pretrain_llama3_8b.py optimizer.lr=0.0002 - - # Custom config file with additional overrides - torchrun --nproc_per_node=8 pretrain_llama3_8b.py --config-file my_config.yaml train.train_iters=50000 - - # Multiple overrides for distributed training - torchrun --nproc_per_node=8 pretrain_llama3_8b.py \ - model.tensor_model_parallel_size=4 \ - model.pipeline_model_parallel_size=2 \ - train.global_batch_size=512 - """ - args, cli_overrides = parse_cli_args() - - logger.info("Megatron-Bridge Llama3 8B Pretraining Script with YAML & CLI Overrides") - logger.info("------------------------------------------------------------------") - - # Load base configuration from the recipe as a Python dataclass - cfg: ConfigContainer = pretrain_config() - logger.info("Loaded base configuration") - - # Print configuration on rank 0 - if get_rank_safe() == 0: - cfg.print_yaml() - - # Convert the initial Python dataclass to an OmegaConf DictConfig for merging - merged_omega_conf, excluded_fields = create_omegaconf_dict_config(cfg) - - # Load and merge YAML overrides if a config file is provided - if args.config_file: - logger.debug(f"Loading YAML overrides from: {args.config_file}") - if not os.path.exists(args.config_file): - logger.error(f"Override YAML file not found: {args.config_file}") - sys.exit(1) - yaml_overrides_omega = OmegaConf.load(args.config_file) - merged_omega_conf = OmegaConf.merge(merged_omega_conf, yaml_overrides_omega) - logger.debug("YAML overrides merged successfully.") - - # Apply command-line overrides using Hydra-style parsing - if cli_overrides: - logger.debug(f"Applying Hydra-style command-line overrides: {cli_overrides}") - merged_omega_conf = parse_hydra_overrides(merged_omega_conf, cli_overrides) - logger.debug("Hydra-style command-line overrides applied successfully.") - - # Apply the final merged OmegaConf configuration back to the original ConfigContainer - logger.debug("Applying final merged configuration back to Python ConfigContainer...") - final_overrides_as_dict = OmegaConf.to_container(merged_omega_conf, resolve=True) - # Apply overrides while preserving excluded fields - apply_overrides(cfg, final_overrides_as_dict, excluded_fields) - - # Display final configuration - if get_rank_safe() == 0: - logger.info("--- Final Merged Configuration ---") - cfg.print_yaml() - logger.info("----------------------------------") - - # Start training - logger.debug("Starting pretraining...") - pretrain(config=cfg, forward_step_func=forward_step) - - # Cleanup process group - if torch.distributed.is_initialized(): - torch.distributed.barrier() - torch.distributed.destroy_process_group() - - -if __name__ == "__main__": - main() diff --git a/examples/recipes/llama/pretrain_llama3_8b_nemo_run_script.py b/examples/recipes/llama/pretrain_llama3_8b_nemo_run_script.py deleted file mode 100644 index 6b8c6c68d..000000000 --- a/examples/recipes/llama/pretrain_llama3_8b_nemo_run_script.py +++ /dev/null @@ -1,150 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -NeMo Run Launcher for Llama3 8B Pretraining. - -This script launches the pretrain_llama3_8b.py script using NeMo Run with TorchRun, -while forwarding any additional command line arguments to the target script. - -Examples: - Basic usage with default config: - $ python pretrain_llama3_8b_nemo_run_script.py --nproc-per-node=8 - - Using a custom config file: - $ python pretrain_llama3_8b_nemo_run_script.py --nproc-per-node=8 --config-file=my_config.yaml - - Passing additional overrides to the target script: - $ python pretrain_llama3_8b_nemo_run_script.py --nproc-per-node=8 \ - model.tensor_model_parallel_size=4 \ - train.train_iters=100000 - - Using both custom config and CLI overrides: - $ python pretrain_llama3_8b_nemo_run_script.py --nproc-per-node=8 \ - --config-file=conf/my_custom_config.yaml \ - optimizerg.lr=0.0002 \ - train.global_batch_size=512 - - Dry run to see what would be executed: - $ python pretrain_llama3_8b_nemo_run_script.py --nproc-per-node=8 --dryrun \ - model.pipeline_dtype=torch.float16 - -Argument Forwarding: - Any arguments not recognized by this launcher script will be forwarded - to the target pretrain_llama3_8b.py script as Hydra-style overrides. -""" - -import argparse -import logging -import sys -from pathlib import Path -from typing import Tuple - -import nemo_run as run - - -logger: logging.Logger = logging.getLogger(__name__) - -# Define paths relative to this script's location -# Assumes this script (pretrain_llama3_8b_nemo_run_script.py) is in Megatron-Bridge/examples/recipes/llama/ -# and pretrain_llama3_8b.py is in the same directory, -# and the config is in a 'conf' subdirectory. -SCRIPT_DIR: Path = Path(__file__).parent.resolve() -PRETRAIN_SCRIPT_FILENAME: str = "pretrain_llama3_8b.py" -PRETRAIN_SCRIPT_PATH: Path = SCRIPT_DIR / PRETRAIN_SCRIPT_FILENAME -DEFAULT_CONFIG_FILENAME: str = "llama3_8b_pretrain_override_example.yaml" -DEFAULT_CONFIG_FILE_PATH: Path = SCRIPT_DIR / "conf" / DEFAULT_CONFIG_FILENAME - - -def parse_cli_args() -> Tuple[argparse.Namespace, list[str]]: - """Parse command line arguments, separating launcher args from target script args.""" - parser = argparse.ArgumentParser( - description="Launcher for Llama3 8B pretraining using nemo_run and TorchRun. " - "Additional arguments will be forwarded to pretrain_llama3_8b.py", - formatter_class=argparse.RawTextHelpFormatter, - ) - parser.add_argument( - "--nproc-per-node", - type=int, - default=2, - help="Number of processes per node for TorchRun (typically number of GPUs).", - ) - parser.add_argument( - "--config-file", - type=str, - default=str(DEFAULT_CONFIG_FILE_PATH), - help="Path to the YAML override config file for the pretrain_llama3_8b.py script.", - ) - parser.add_argument( - "--dryrun", - action="store_true", - help="Dry run the script without actually running it.", - ) - - # Parse known args for the launcher, remaining will be forwarded to target script - args, forwarded_args = parser.parse_known_args() - return args, forwarded_args - - -def main() -> None: - """ - Main function for script demonstrating how to use the NeMo Run executor. 
- """ - args, forwarded_args = parse_cli_args() - - logger.info("Nemo Run Launcher for Llama3 8B Pretraining") - logger.info("===========================================") - - if not PRETRAIN_SCRIPT_PATH.is_file(): - logger.error(f"Target pretraining script not found: {PRETRAIN_SCRIPT_PATH}") - logger.error(f"Please ensure '{PRETRAIN_SCRIPT_FILENAME}' exists in the same directory as this launcher.") - sys.exit(1) - - config_file_to_use = Path(args.config_file).resolve() - if not config_file_to_use.is_file(): - logger.error(f"Specified YAML config file not found: {config_file_to_use}") - logger.error("Ensure the path passed to --config_file is correct.") - sys.exit(1) - - # Build the arguments list for the target script - target_script_args = [ - "--config-file", - str(config_file_to_use), - ] - - # Add any forwarded arguments (Hydra-style overrides and other target script args) - if forwarded_args: - target_script_args.extend(forwarded_args) - logger.info(f"Forwarding additional arguments to target script: {forwarded_args}") - - logger.info(f"Target script: {PRETRAIN_SCRIPT_PATH}") - logger.info(f"Target script arguments: {target_script_args}") - - train_script = run.Script( - path=str(PRETRAIN_SCRIPT_PATH), - entrypoint="python", - args=target_script_args, - ) - - # Define the executor - logger.info(f"Launching locally with TorchRun with nproc_per_node={args.nproc_per_node}") - executor = run.LocalExecutor(ntasks_per_node=args.nproc_per_node, launcher="torchrun") - - # Execute the run - run.run(train_script, executor=executor, dryrun=args.dryrun) - - -if __name__ == "__main__": - main() From 6ed441c667e997e3d9350aa29deb7d1745e45288 Mon Sep 17 00:00:00 2001 From: Ananth Subramaniam Date: Wed, 12 Nov 2025 03:28:28 -0800 Subject: [PATCH 2/7] keep simplifying Signed-off-by: Ananth Subramaniam --- examples/recipes/llama/00_quickstart_finetune.py | 16 ++++++++-------- examples/recipes/llama/00_quickstart_pretrain.py | 14 +++++--------- examples/recipes/llama/README.md | 4 ++-- .../recipes/llama/conf/llama32_1b_finetune.yaml | 16 +++++----------- .../recipes/llama/conf/llama32_1b_pretrain.yaml | 7 +++---- 5 files changed, 23 insertions(+), 34 deletions(-) diff --git a/examples/recipes/llama/00_quickstart_finetune.py b/examples/recipes/llama/00_quickstart_finetune.py index 5b8e4ee83..f604cbbbe 100644 --- a/examples/recipes/llama/00_quickstart_finetune.py +++ b/examples/recipes/llama/00_quickstart_finetune.py @@ -69,7 +69,7 @@ def main() -> None: args = parse_args() # Load the base finetune configuration - # By default: LoRA with rank=8, alpha=16, works on single GPU + # Uses LoRA for efficient finetuning on a single GPU config = llama32_1b_finetune_config() # Load from the pretrained checkpoint @@ -84,7 +84,7 @@ def main() -> None: # === Use your own dataset === # Replace SQuAD with your custom dataset - # Option 1: Simple path override (uses default FinetuningDatasetConfig) + # Option 1: Simple path override # config.data.data_path = "/path/to/your/dataset.jsonl" # Option 2: Use FinetuningDatasetConfig for custom JSONL datasets @@ -95,16 +95,16 @@ def main() -> None: # from megatron.bridge.training.data import HFDatasetConfig # config.data = HFDatasetConfig(hf_dataset="squad", split="train") - # === Change learning rate === - # config.optimizer.lr = 5e-5 # Default for LoRA: 1e-4 + # === Adjust learning rate === + # config.optimizer.lr = 5e-5 - # === Modify checkpoint frequency === + # === Change checkpoint save frequency === # config.train.save_interval = 100 # === Adjust LoRA hyperparameters 
=== - # Higher rank = more parameters = potentially better quality but slower - # config.peft.dim = 16 # LoRA rank (default: 8) - # config.peft.alpha = 32 # LoRA alpha scaling (default: 16) + # Higher rank = more trainable parameters, potentially better quality but slower + # config.peft.dim = 16 # LoRA rank + # config.peft.alpha = 32 # LoRA alpha scaling # === Full supervised finetuning (no LoRA) === # For full finetuning, reload config with peft=None: diff --git a/examples/recipes/llama/00_quickstart_pretrain.py b/examples/recipes/llama/00_quickstart_pretrain.py index 5f11d2382..4ec50cba7 100644 --- a/examples/recipes/llama/00_quickstart_pretrain.py +++ b/examples/recipes/llama/00_quickstart_pretrain.py @@ -16,9 +16,6 @@ """ Quickstart: Pretrain Llama 3.2 1B with Megatron Bridge -This is the simplest way to start pretraining with Megatron-Bridge. -We use Llama 3.2 1B because it fits on a single GPU, making it easy to test. - Usage: Single GPU: torchrun --nproc_per_node=1 00_quickstart_pretrain.py @@ -27,9 +24,8 @@ torchrun --nproc_per_node=8 00_quickstart_pretrain.py The script uses sensible defaults and mock data for quick testing. -For custom configurations, see 01_pretrain_with_yaml.py +For custom configurations through YAML and Hydra-style overrides, see 01_pretrain_with_yaml.py For multi-node training, see 02_launch_pretrain_slurm.py -For larger models (8B, 70B), see 01_pretrain_with_yaml.py """ from megatron.bridge.recipes.llama import llama32_1b_pretrain_config @@ -47,17 +43,17 @@ def main() -> None: # OPTIONAL: Customize key settings here # Uncomment and modify as needed: - # For a quick test run (10 iterations): + # For a quick test run: # config.train.train_iters = 10 - # Use your own data (replace mock data): + # Use your own data: # config.data.data_path = "/path/to/your/dataset" - # Change batch sizes: + # Adjust batch sizes for your GPU memory: # config.train.global_batch_size = 256 # config.train.micro_batch_size = 2 - # Modify checkpoint frequency: + # Change checkpoint save frequency: # config.train.save_interval = 500 # Start pretraining diff --git a/examples/recipes/llama/README.md b/examples/recipes/llama/README.md index 1d00e237e..82b04d396 100644 --- a/examples/recipes/llama/README.md +++ b/examples/recipes/llama/README.md @@ -191,8 +191,8 @@ Customize in the script: config.data.data_path = "/path/to/your/dataset.jsonl" # Adjust LoRA hyperparameters -config.peft.dim = 16 # LoRA rank (default: 8) -config.peft.alpha = 32 # LoRA alpha (default: 16) +config.peft.dim = 16 # LoRA rank +config.peft.alpha = 32 # LoRA alpha scaling ``` ### Configuration with YAML diff --git a/examples/recipes/llama/conf/llama32_1b_finetune.yaml b/examples/recipes/llama/conf/llama32_1b_finetune.yaml index a7919529d..fd6bc1a73 100644 --- a/examples/recipes/llama/conf/llama32_1b_finetune.yaml +++ b/examples/recipes/llama/conf/llama32_1b_finetune.yaml @@ -13,8 +13,8 @@ # limitations under the License. # Example YAML configuration for Llama 3.2 1B finetuning with LoRA -# This file demonstrates the most commonly customized settings. -# Uncomment and modify values as needed for your use case. +# This file demonstrates commonly customized settings. +# Modify values as needed for your use case. 
# Data configuration data: @@ -33,7 +33,7 @@ train: # Load from pretrained checkpoint # load: /path/to/pretrained/checkpoint -# Optimizer configuration (higher LR for LoRA) +# Optimizer configuration optimizer: lr: 0.0001 min_lr: 0.00001 @@ -50,18 +50,12 @@ checkpoint: save: ./checkpoints/llama32_1b_finetuned save_interval: 50 -# LoRA configuration (default: rank=8, alpha=16) +# LoRA configuration peft: dim: 8 # LoRA rank alpha: 16 # LoRA alpha scaling -# For full finetuning instead of LoRA: -# peft: null -# And adjust parallelism if needed: -# model: -# tensor_model_parallel_size: 2 - -# Model configuration (LoRA default: single GPU) +# Model configuration # Note: seq_length must match data.seq_length model: seq_length: 4096 diff --git a/examples/recipes/llama/conf/llama32_1b_pretrain.yaml b/examples/recipes/llama/conf/llama32_1b_pretrain.yaml index 8454e7aa1..a46d0c689 100644 --- a/examples/recipes/llama/conf/llama32_1b_pretrain.yaml +++ b/examples/recipes/llama/conf/llama32_1b_pretrain.yaml @@ -13,8 +13,8 @@ # limitations under the License. # Example YAML configuration for Llama 3.2 1B pretraining -# This file demonstrates the most commonly customized settings. -# Uncomment and modify values as needed for your use case. +# This file demonstrates commonly customized settings. +# Modify values as needed for your use case. # Data configuration data: @@ -50,8 +50,7 @@ checkpoint: # load: ./checkpoints/llama32_1b/iter_0000050 # Model configuration -# Llama 3.2 1B defaults: TP=1, PP=1, CP=1 (works on single GPU) -# Note: seq_length must match data.seq_length +# Note: seq_length must match data.sequence_length model: seq_length: 4096 tensor_model_parallel_size: 1 From 390cb072c74db2bed2f4634feefde8227bbcbf5d Mon Sep 17 00:00:00 2001 From: Ananth Subramaniam Date: Mon, 17 Nov 2025 00:42:14 -0800 Subject: [PATCH 3/7] update examples, include slurm sbatch script Signed-off-by: Ananth Subramaniam --- .../recipes/llama/00_quickstart_pretrain.py | 7 +- ..._finetune.py => 01_quickstart_finetune.py} | 14 +- .../recipes/llama/02_launch_pretrain_local.py | 156 ------------------ ..._with_yaml.py => 02_pretrain_with_yaml.py} | 8 +- ..._with_yaml.py => 03_finetune_with_yaml.py} | 14 +- ...rm.py => 04_launch_slurm_with_nemo_run.py} | 109 ++++++------ examples/recipes/llama/README.md | 106 +++++------- examples/recipes/llama/launch_with_sbatch.sh | 147 +++++++++++++++++ 8 files changed, 273 insertions(+), 288 deletions(-) rename examples/recipes/llama/{00_quickstart_finetune.py => 01_quickstart_finetune.py} (91%) delete mode 100644 examples/recipes/llama/02_launch_pretrain_local.py rename examples/recipes/llama/{01_pretrain_with_yaml.py => 02_pretrain_with_yaml.py} (94%) rename examples/recipes/llama/{01_finetune_with_yaml.py => 03_finetune_with_yaml.py} (90%) rename examples/recipes/llama/{03_launch_pretrain_slurm.py => 04_launch_slurm_with_nemo_run.py} (74%) create mode 100644 examples/recipes/llama/launch_with_sbatch.sh diff --git a/examples/recipes/llama/00_quickstart_pretrain.py b/examples/recipes/llama/00_quickstart_pretrain.py index 4ec50cba7..245cca0be 100644 --- a/examples/recipes/llama/00_quickstart_pretrain.py +++ b/examples/recipes/llama/00_quickstart_pretrain.py @@ -24,8 +24,8 @@ torchrun --nproc_per_node=8 00_quickstart_pretrain.py The script uses sensible defaults and mock data for quick testing. 
-For custom configurations through YAML and Hydra-style overrides, see 01_pretrain_with_yaml.py -For multi-node training, see 02_launch_pretrain_slurm.py +For custom configurations through YAML and Hydra-style overrides, see 02_pretrain_with_yaml.py +For multi-node training, see launch_with_sbatch.sh or 04_launch_slurm_with_nemo_run.py """ from megatron.bridge.recipes.llama import llama32_1b_pretrain_config @@ -44,7 +44,8 @@ def main() -> None: # Uncomment and modify as needed: # For a quick test run: - # config.train.train_iters = 10 + config.train.train_iters = 10 + config.scheduler.lr_warmup_iters = 2 # Use your own data: # config.data.data_path = "/path/to/your/dataset" diff --git a/examples/recipes/llama/00_quickstart_finetune.py b/examples/recipes/llama/01_quickstart_finetune.py similarity index 91% rename from examples/recipes/llama/00_quickstart_finetune.py rename to examples/recipes/llama/01_quickstart_finetune.py index f604cbbbe..a6f8060d6 100644 --- a/examples/recipes/llama/00_quickstart_finetune.py +++ b/examples/recipes/llama/01_quickstart_finetune.py @@ -14,18 +14,15 @@ # limitations under the License. """ -Quickstart: Finetune Llama 3.2 1B with Megatron-Bridge - -This is the simplest way to start finetuning with Megatron-Bridge. -By default, this uses LoRA (Low-Rank Adaptation) for efficient finetuning. +Quickstart: Finetune Llama 3.2 1B with Megatron Bridge Usage: Single GPU with LoRA: - torchrun --nproc_per_node=1 00_quickstart_finetune.py \ + torchrun --nproc_per_node=1 01_quickstart_finetune.py \ --pretrained-checkpoint /path/to/megatron/checkpoint Multiple GPUs (automatic data parallelism): - torchrun --nproc_per_node=8 00_quickstart_finetune.py \ + torchrun --nproc_per_node=8 01_quickstart_finetune.py \ --pretrained-checkpoint /path/to/megatron/checkpoint Prerequisites: @@ -40,6 +37,9 @@ - Using your own dataset - Adjusting LoRA hyperparameters - Switching to full supervised finetuning + +For YAML configuration, see 03_finetune_with_yaml.py +For multi-node training, see launch_with_sbatch.sh or 04_launch_slurm_with_nemo_run.py """ import argparse @@ -75,7 +75,7 @@ def main() -> None: # Load from the pretrained checkpoint config.checkpoint.pretrained_checkpoint = args.pretrained_checkpoint - # === Quick test run (10 iterations) === + # === Quick test run === config.train.train_iters = 10 config.scheduler.lr_warmup_iters = 2 diff --git a/examples/recipes/llama/02_launch_pretrain_local.py b/examples/recipes/llama/02_launch_pretrain_local.py deleted file mode 100644 index afcbfc431..000000000 --- a/examples/recipes/llama/02_launch_pretrain_local.py +++ /dev/null @@ -1,156 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Launch Training Locally with NeMo-Run - -This script demonstrates how to launch training scripts (pretrain or finetune) -using NeMo-Run's LocalExecutor with torchrun. This provides better job management -and logging compared to running torchrun directly. 
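For reference, since this local launcher file is removed by the patch, a minimal sketch of the same LocalExecutor-based launch it describes, using the `nemo_run` `run.Script` / `run.LocalExecutor` calls that appear in this file — illustrative only, not part of the diff or a supported entry point:

```python
# Minimal local launch with NeMo-Run and torchrun (illustrative sketch).
import nemo_run as run

# Wrap an existing training script so NeMo-Run can launch it.
task = run.Script(
    path="00_quickstart_pretrain.py",
    entrypoint="python",
    args=[],  # e.g. ["--config-file", "conf/llama32_1b_pretrain.yaml"]
)

# LocalExecutor with the torchrun launcher spawns one process per GPU.
executor = run.LocalExecutor(ntasks_per_node=2, launcher="torchrun")

with run.Experiment("megatron_bridge_training") as exp:
    exp.add(task, executor=executor, name="training")
    exp.run(detach=False)
```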
- -Prerequisites: Install nemo-run - -Usage: - # Launch pretrain script - python 02_launch_pretrain_local.py --script 00_quickstart_pretrain.py --devices 2 - - # Launch finetune script - python 02_launch_pretrain_local.py --script 00_quickstart_finetune.py --devices 1 - - # Launch with YAML config - python 02_launch_pretrain_local.py \ - --script 01_pretrain_with_yaml.py \ - --devices 2 \ - --config-file conf/llama32_1b_pretrain.yaml - - # Pass CLI overrides to the training script - python 02_launch_pretrain_local.py \ - --script 01_finetune_with_yaml.py \ - --devices 2 \ - --script-args "train.train_iters=500 peft.dim=16" - - # Dry run (see what would be executed) - python 02_launch_pretrain_local.py --script 00_quickstart_pretrain.py --dry-run -""" - -import argparse -import logging -from pathlib import Path - -import nemo_run as run - - -logger = logging.getLogger(__name__) - -SCRIPT_DIR = Path(__file__).parent.resolve() - - -def parse_args() -> argparse.Namespace: - """Parse command-line arguments.""" - parser = argparse.ArgumentParser( - description="Launch training (pretrain/finetune) locally using NeMo-Run", - formatter_class=argparse.RawTextHelpFormatter, - ) - parser.add_argument( - "--script", - type=str, - required=True, - help="Training script to run (e.g., 00_quickstart_pretrain.py, 00_quickstart_finetune.py)", - ) - parser.add_argument( - "--devices", - type=int, - default=1, - help="Number of GPUs to use (default: 1)", - ) - parser.add_argument( - "--config-file", - type=str, - default=None, - help="YAML config file to pass to the training script (optional)", - ) - parser.add_argument( - "--script-args", - type=str, - default="", - help='Additional arguments to pass to the training script (space-separated, e.g., "train.train_iters=100")', - ) - parser.add_argument( - "--experiment-name", - type=str, - default="megatron_bridge_training", - help="Name for the experiment (default: megatron_bridge_training)", - ) - parser.add_argument( - "--dry-run", - action="store_true", - help="Print what would be executed without running", - ) - - return parser.parse_args() - - -def main() -> None: - """Launch training (pretrain/finetune) using NeMo-Run LocalExecutor.""" - args = parse_args() - - # Resolve script path - script_path = SCRIPT_DIR / args.script - if not script_path.exists(): - raise FileNotFoundError(f"Training script not found: {script_path}") - - # Build arguments for the training script - script_args = [] - if args.config_file: - script_args.extend(["--config-file", args.config_file]) - - if args.script_args: - # Split the script args string and add each arg - script_args.extend(args.script_args.split()) - - logger.info("Launching training with NeMo-Run LocalExecutor") - logger.info(f"Script: {script_path.name}") - logger.info(f"GPUs: {args.devices}") - if args.config_file: - logger.info(f"Config: {args.config_file}") - if script_args: - logger.info(f"Script args: {' '.join(script_args)}") - logger.info("") - - # Create the training task - task = run.Script( - path=str(script_path), - entrypoint="python", - args=script_args, - ) - - # Create the local executor with torchrun - executor = run.LocalExecutor( - ntasks_per_node=args.devices, - launcher="torchrun", - ) - - # Run the experiment - with run.Experiment(args.experiment_name) as exp: - exp.add(task, executor=executor, name="training") - exp.run(detach=False, dryrun=args.dry_run) - - if not args.dry_run: - logger.info("Training completed!") - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO, 
format="%(message)s") - main() diff --git a/examples/recipes/llama/01_pretrain_with_yaml.py b/examples/recipes/llama/02_pretrain_with_yaml.py similarity index 94% rename from examples/recipes/llama/01_pretrain_with_yaml.py rename to examples/recipes/llama/02_pretrain_with_yaml.py index 4de837d25..999922324 100644 --- a/examples/recipes/llama/01_pretrain_with_yaml.py +++ b/examples/recipes/llama/02_pretrain_with_yaml.py @@ -21,19 +21,19 @@ Usage: With default config file: - torchrun --nproc_per_node=8 01_pretrain_with_yaml.py + torchrun --nproc_per_node=8 02_pretrain_with_yaml.py With custom config file: - torchrun --nproc_per_node=2 01_pretrain_with_yaml.py \ + torchrun --nproc_per_node=2 02_pretrain_with_yaml.py \ --config-file conf/my_custom_config.yaml With command-line overrides: - torchrun --nproc_per_node=2 01_pretrain_with_yaml.py \ + torchrun --nproc_per_node=2 02_pretrain_with_yaml.py \ train.train_iters=5000 \ train.global_batch_size=256 Combining YAML and CLI (CLI takes precedence): - torchrun --nproc_per_node=2 01_pretrain_with_yaml.py \ + torchrun --nproc_per_node=2 02_pretrain_with_yaml.py \ --config-file conf/llama32_1b_pretrain.yaml \ train.train_iters=10000 diff --git a/examples/recipes/llama/01_finetune_with_yaml.py b/examples/recipes/llama/03_finetune_with_yaml.py similarity index 90% rename from examples/recipes/llama/01_finetune_with_yaml.py rename to examples/recipes/llama/03_finetune_with_yaml.py index c510c1b20..ead33540f 100644 --- a/examples/recipes/llama/01_finetune_with_yaml.py +++ b/examples/recipes/llama/03_finetune_with_yaml.py @@ -21,24 +21,24 @@ Usage: With default config file: - torchrun --nproc_per_node=1 01_finetune_with_yaml.py + torchrun --nproc_per_node=1 03_finetune_with_yaml.py With custom config file: - torchrun --nproc_per_node=2 01_finetune_with_yaml.py \ + torchrun --nproc_per_node=2 03_finetune_with_yaml.py \ --config-file conf/my_finetune_config.yaml With command-line overrides: - torchrun --nproc_per_node=2 01_finetune_with_yaml.py \ + torchrun --nproc_per_node=2 03_finetune_with_yaml.py \ train.train_iters=1000 \ optimizer.lr=5e-5 Full finetuning instead of LoRA: - torchrun --nproc_per_node=2 01_finetune_with_yaml.py \ + torchrun --nproc_per_node=2 03_finetune_with_yaml.py \ --peft none \ train.train_iters=1000 Combining YAML and CLI (CLI takes precedence): - torchrun --nproc_per_node=2 01_finetune_with_yaml.py \ + torchrun --nproc_per_node=2 03_finetune_with_yaml.py \ --config-file conf/llama32_1b_finetune.yaml \ peft.dim=16 \ train.train_iters=2000 @@ -49,7 +49,7 @@ 3. Base recipe defaults (lowest) See conf/ directory for example YAML configurations. -For a pure Python usage see 00_quickstart_finetune.py. +For a pure Python usage see 01_quickstart_finetune.py. """ import argparse @@ -95,7 +95,7 @@ def parse_args() -> Tuple[argparse.Namespace, list[str]]: type=str, default="lora", choices=["lora", "dora", "none"], - help="PEFT method to use (default: lora). Use 'none' for full finetuning.", + help="PEFT method to use. 
Use 'none' for full finetuning.", ) parser.add_argument("--debug", action="store_true", help="Enable debug logging") diff --git a/examples/recipes/llama/03_launch_pretrain_slurm.py b/examples/recipes/llama/04_launch_slurm_with_nemo_run.py similarity index 74% rename from examples/recipes/llama/03_launch_pretrain_slurm.py rename to examples/recipes/llama/04_launch_slurm_with_nemo_run.py index 5d23effdc..fef063e7a 100644 --- a/examples/recipes/llama/03_launch_pretrain_slurm.py +++ b/examples/recipes/llama/04_launch_slurm_with_nemo_run.py @@ -24,14 +24,14 @@ Usage: # From the Slurm cluster (uses LocalTunnel) - python 03_launch_pretrain_slurm.py \ + python 04_launch_slurm_with_nemo_run.py \ --script 00_quickstart_pretrain.py \ --nodes 2 \ --partition gpu \ --account my_account # From your local machine (uses SSHTunnel) - python 03_launch_pretrain_slurm.py \ + python 04_launch_slurm_with_nemo_run.py \ --script 00_quickstart_pretrain.py \ --nodes 2 \ --partition gpu \ @@ -42,7 +42,7 @@ --remote-job-dir /home/myusername/nemo-runs # With custom SSH key - python 03_launch_pretrain_slurm.py \ + python 04_launch_slurm_with_nemo_run.py \ --script 00_quickstart_pretrain.py \ --nodes 2 \ --partition gpu \ @@ -53,16 +53,25 @@ --remote-job-dir /home/myusername/nemo-runs \ --identity ~/.ssh/id_rsa - # Launch with custom config - python 03_launch_pretrain_slurm.py \ - --script 01_finetune_with_yaml.py \ + # Launch with custom config (pass arguments to training script) + python 04_launch_slurm_with_nemo_run.py \ + --script 03_finetune_with_yaml.py \ --nodes 1 \ --partition gpu \ --account my_account \ --config-file conf/llama32_1b_finetune.yaml + # Pass CLI overrides to training script + python 04_launch_slurm_with_nemo_run.py \ + --script 02_pretrain_with_yaml.py \ + --nodes 2 \ + --partition gpu \ + --account my_account \ + train.train_iters=5000 \ + optimizer.lr=0.0002 + # With container and custom mounts - python 03_launch_pretrain_slurm.py \ + python 04_launch_slurm_with_nemo_run.py \ --script 00_quickstart_pretrain.py \ --nodes 2 \ --partition gpu \ @@ -70,9 +79,21 @@ --container-image /path/to/container.sqsh \ --mount /data:/data + # Wait for job completion and tail logs + python 04_launch_slurm_with_nemo_run.py \ + --script 00_quickstart_pretrain.py \ + --nodes 2 \ + --partition gpu \ + --account my_account \ + --no-detach \ + --tail-logs + Note: - Use --ssh-tunnel when launching from your local machine - Omit --ssh-tunnel when already on the Slurm cluster (uses LocalTunnel) +- By default, jobs are submitted and detached (--detach) +- Use --no-detach --tail-logs to wait and monitor job output +- Any unknown arguments are forwarded to the training script - Adjust cluster-specific settings (account, partition, container paths) """ @@ -88,7 +109,7 @@ SCRIPT_DIR = Path(__file__).parent.resolve() -def parse_args() -> argparse.Namespace: +def parse_args() -> tuple[argparse.Namespace, list[str]]: """Parse command-line arguments.""" parser = argparse.ArgumentParser( description="Launch training (pretrain/finetune) on Slurm using NeMo-Run", @@ -98,19 +119,19 @@ def parse_args() -> argparse.Namespace: "--script", type=str, required=True, - help="Training script to run (e.g., 00_quickstart_pretrain.py, 00_quickstart_finetune.py)", + help="Training script to run (e.g., 00_quickstart_pretrain.py, 01_quickstart_finetune.py)", ) parser.add_argument( "--nodes", type=int, default=1, - help="Number of nodes to use (default: 1)", + help="Number of nodes to use", ) parser.add_argument( "--devices", type=int, default=8, 
- help="GPUs per node (default: 8)", + help="GPUs per node", ) parser.add_argument( "--partition", @@ -128,7 +149,7 @@ def parse_args() -> argparse.Namespace: "--time", type=str, default="04:00:00", - help="Job time limit (default: 04:00:00)", + help="Job time limit", ) parser.add_argument( "--ssh-tunnel", @@ -154,25 +175,13 @@ def parse_args() -> argparse.Namespace: "--identity", type=str, default=None, - help="Path to SSH private key for authentication (optional)", - ) - parser.add_argument( - "--config-file", - type=str, - default=None, - help="YAML config file to pass to the training script (optional)", - ) - parser.add_argument( - "--script-args", - type=str, - default="", - help="Additional arguments for the training script (space-separated)", + help="Path to SSH private key for authentication", ) parser.add_argument( "--container-image", type=str, default=None, - help="Container image path (optional)", + help="Container image path", ) parser.add_argument( "--mount", @@ -192,13 +201,26 @@ def parse_args() -> argparse.Namespace: action="store_true", help="Print what would be executed without submitting the job", ) + parser.add_argument( + "--detach", + action="store_true", + default=True, + help="Detach from the experiment after submission", + ) + parser.add_argument( + "--tail-logs", + action="store_true", + help="Tail logs after submission (only works with --no-detach)", + ) - return parser.parse_args() + # Use parse_known_args to capture forwarded arguments for the training script + args, forwarded_args = parser.parse_known_args() + return args, forwarded_args def main() -> None: """Launch training (pretrain/finetune) using NeMo-Run SlurmExecutor.""" - args = parse_args() + args, forwarded_args = parse_args() # Validate SSH tunnel arguments if args.ssh_tunnel: @@ -210,13 +232,8 @@ def main() -> None: if not script_path.exists(): raise FileNotFoundError(f"Training script not found: {script_path}") - # Build arguments for the training script - script_args = [] - if args.config_file: - script_args.extend(["--config-file", args.config_file]) - - if args.script_args: - script_args.extend(args.script_args.split()) + # Build arguments for the training script from forwarded args + script_args = forwarded_args if forwarded_args else [] # Create the training task task = run.Script( @@ -260,22 +277,20 @@ def main() -> None: if args.mount: executor.container_mounts = args.mount - # Set common environment variables - executor.env_vars = { - "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", - "NCCL_NVLS_ENABLE": "0", - } - # Run the experiment with run.Experiment(args.experiment_name) as exp: exp.add(task, executor=executor, name="training") - exp.run(detach=True, dryrun=args.dry_run) - if args.dry_run: - logger.info("Dry run completed - no job was submitted") - else: - logger.info("Job submitted to Slurm!") - logger.info("Use 'squeue' to check job status") + if args.dry_run: + exp.dryrun() + else: + exp.run(detach=args.detach, tail_logs=args.tail_logs) + + if args.detach: + logger.info("Job submitted to Slurm!") + logger.info("Use 'squeue' to check job status") + else: + logger.info("Job completed!") if __name__ == "__main__": diff --git a/examples/recipes/llama/README.md b/examples/recipes/llama/README.md index 82b04d396..bed7d357d 100644 --- a/examples/recipes/llama/README.md +++ b/examples/recipes/llama/README.md @@ -23,7 +23,7 @@ python ../../conversion/convert_checkpoints.py import \ Then run finetuning: ```bash -torchrun --nproc_per_node=1 00_quickstart_finetune.py \ +torchrun --nproc_per_node=1 
01_quickstart_finetune.py \ --pretrained-checkpoint ./checkpoints/llama32_1b ``` @@ -40,7 +40,7 @@ config.data.data_path = "/path/to/your/dataset" For more complex configurations, use YAML files and command-line overrides: ```bash -torchrun --nproc_per_node=2 01_pretrain_with_yaml.py \ +torchrun --nproc_per_node=2 02_pretrain_with_yaml.py \ --config-file conf/llama32_1b_pretrain.yaml ``` @@ -54,18 +54,18 @@ Example YAML (`conf/llama32_1b_pretrain.yaml`): # Each section maps to a ConfigContainer field data: # GPTDatasetConfig data_path: /path/to/training/data - seq_length: 4096 + sequence_length: 4096 train: # TrainingConfig - train_iters: 10000 + train_iters: 100 global_batch_size: 256 checkpoint: # CheckpointConfig save: ./checkpoints/llama32_1b - save_interval: 1000 + save_interval: 50 model: # Model Provider - seq_length: 4096 # Must match data.seq_length + seq_length: 4096 # Must match data.sequence_length tensor_model_parallel_size: 1 optimizer: # OptimizerConfig @@ -77,7 +77,7 @@ Override from command line using dot notation: Command-line overrides follow the same pattern as YAML structure. The first part before the dot indicates which subconfig of ConfigContainer to override (e.g., `train`, `model`, `optimizer`), and the part after the dot specifies the field within that subconfig. ```bash -torchrun --nproc_per_node=2 01_pretrain_with_yaml.py \ +torchrun --nproc_per_node=2 02_pretrain_with_yaml.py \ --config-file conf/llama32_1b_pretrain.yaml \ train.train_iters=5000 \ train.global_batch_size=512 \ @@ -93,38 +93,40 @@ These example scripts are configured to accept overrides in the priority order ( 2. YAML config file (nested structure) 3. Base recipe defaults (from `llama32_1b_pretrain_config()`) -## Multi-Node Training with NeMo-Run +## Multi-Node Training + +### Direct Slurm with sbatch -### Prerequisites +For traditional HPC workflows without NeMo-Run: ```bash -pip install nemo-run +# 1. Configure launch_with_sbatch.sh +# Edit SBATCH directives and script variables at the top + +# 2. Submit job +sbatch launch_with_sbatch.sh ``` -### Launch Locally +The `launch_with_sbatch.sh` script shows how to: +- Configure Slurm job parameters +- Set up multi-node torchrun +- Use containers (optional) +- Pass arguments to training scripts -Test your setup before going to a cluster. 
Works with both pretrain and finetune scripts: +### NeMo-Run -```bash -# Pretrain -python 02_launch_pretrain_local.py \ - --script 00_quickstart_pretrain.py \ - --devices 2 +For better job management and remote launching capabilities: -# Finetune -python 02_launch_pretrain_local.py \ - --script 00_quickstart_finetune.py \ - --devices 1 -``` +Prerequisites: -### Launch on Slurm - -For multi-node training on Slurm clusters: +```bash +pip install nemo-run +``` -From the cluster (LocalTunnel): +From the Slurm cluster (LocalTunnel): ```bash -python 03_launch_pretrain_slurm.py \ +python 04_launch_slurm_with_nemo_run.py \ --script 00_quickstart_pretrain.py \ --nodes 2 \ --devices 8 \ @@ -135,7 +137,7 @@ python 03_launch_pretrain_slurm.py \ From your local machine (SSHTunnel): ```bash -python 03_launch_pretrain_slurm.py \ +python 04_launch_slurm_with_nemo_run.py \ --script 00_quickstart_pretrain.py \ --nodes 2 \ --devices 8 \ @@ -150,8 +152,8 @@ python 03_launch_pretrain_slurm.py \ With custom config: ```bash -python 03_launch_pretrain_slurm.py \ - --script 01_finetune_with_yaml.py \ +python 04_launch_slurm_with_nemo_run.py \ + --script 03_finetune_with_yaml.py \ --nodes 1 \ --devices 8 \ --partition gpu \ @@ -174,7 +176,7 @@ python ../../conversion/convert_checkpoints.py import \ Run finetuning: ```bash -torchrun --nproc_per_node=1 00_quickstart_finetune.py \ +torchrun --nproc_per_node=1 01_quickstart_finetune.py \ --pretrained-checkpoint ./checkpoints/llama32_1b ``` @@ -200,7 +202,7 @@ config.peft.alpha = 32 # LoRA alpha scaling For more complex finetuning configurations: ```bash -torchrun --nproc_per_node=2 01_finetune_with_yaml.py \ +torchrun --nproc_per_node=2 03_finetune_with_yaml.py \ --config-file conf/llama32_1b_finetune.yaml ``` @@ -213,13 +215,13 @@ data: # FinetuningDatasetConfig seq_length: 4096 train: # TrainingConfig - train_iters: 1000 + train_iters: 100 global_batch_size: 128 checkpoint: # CheckpointConfig pretrained_checkpoint: /path/to/pretrained/checkpoint save: ./checkpoints/llama32_1b_finetuned - save_interval: 500 + save_interval: 50 peft: # PEFT (LoRA config) dim: 8 # LoRA rank @@ -229,7 +231,7 @@ model: # Model Provider seq_length: 4096 # Must match data.seq_length optimizer: # OptimizerConfig - lr: 0.0001 # Higher LR for LoRA + lr: 0.0001 ``` Override from command line using dot notation: @@ -237,7 +239,7 @@ Override from command line using dot notation: The first part before the dot indicates which ConfigContainer subconfig to override, and the part after specifies the field. ```bash -torchrun --nproc_per_node=2 01_finetune_with_yaml.py \ +torchrun --nproc_per_node=2 03_finetune_with_yaml.py \ --config-file conf/llama32_1b_finetune.yaml \ peft.dim=16 \ train.train_iters=2000 @@ -248,42 +250,18 @@ Here, `peft.dim=16` overrides `ConfigContainer.peft.dim`. Full finetuning (no LoRA): ```bash -torchrun --nproc_per_node=2 01_finetune_with_yaml.py \ +torchrun --nproc_per_node=2 03_finetune_with_yaml.py \ --peft none \ train.train_iters=1000 ``` -### Multi-Node Finetuning - -Use the same launchers for finetuning. 
- -Local: - -```bash -python 02_launch_pretrain_local.py \ - --script 00_quickstart_finetune.py \ - --devices 1 \ - --script-args "--pretrained-checkpoint ./checkpoints/llama32_1b" -``` - -Slurm: - -```bash -python 03_launch_pretrain_slurm.py \ - --script 01_finetune_with_yaml.py \ - --nodes 1 \ - --partition gpu \ - --account my_account \ - --config-file conf/llama32_1b_finetune.yaml -``` - ### Working with Checkpoints **Important:** Finetuning requires checkpoints in Megatron format. You cannot use HuggingFace checkpoints directly. You can obtain Megatron checkpoints by: -1. Converting from HuggingFace (recommended for starting from public models) +1. Converting from HuggingFace 2. Using Megatron checkpoints from your own pretraining runs Convert HuggingFace checkpoint to Megatron format: @@ -298,10 +276,10 @@ Use the checkpoint: ```bash # Command line (quickstart scripts) -torchrun --nproc_per_node=1 00_quickstart_finetune.py \ +torchrun --nproc_per_node=1 01_quickstart_finetune.py \ --pretrained-checkpoint ./checkpoints/llama32_1b -# YAML config (01_finetune_with_yaml.py) +# YAML config (03_finetune_with_yaml.py) # In conf/llama32_1b_finetune.yaml: # checkpoint: # pretrained_checkpoint: ./checkpoints/llama32_1b diff --git a/examples/recipes/llama/launch_with_sbatch.sh b/examples/recipes/llama/launch_with_sbatch.sh new file mode 100644 index 000000000..e75690d27 --- /dev/null +++ b/examples/recipes/llama/launch_with_sbatch.sh @@ -0,0 +1,147 @@ +#!/bin/bash +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#SBATCH --job-name=megatron-bridge-train +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node=8 +#SBATCH --gpus-per-node=8 +#SBATCH --time=04:00:00 +#SBATCH --partition=gpu +#SBATCH --account=my_account +#SBATCH --output=logs/train_%j.out +#SBATCH --error=logs/train_%j.err +#SBATCH --exclusive + +# ============================================================================== +# Direct Slurm Launch with sbatch +# +# This script demonstrates how to launch training directly using sbatch without +# NeMo-Run. This is useful if you prefer traditional HPC workflows or don't want +# to install additional dependencies. +# +# Usage: +# 1. Modify the #SBATCH directives above for your cluster +# 2. Set the TRAINING_SCRIPT and other variables below +# 3. 
Submit: sbatch launch_with_sbatch.sh +# +# For NeMo-Run based launching (recommended), see 04_launch_slurm_with_nemo_run.py +# ============================================================================== + +# ============================================================================== +# CONFIGURATION - Modify these for your setup +# ============================================================================== + +# Training script to run (choose one) +TRAINING_SCRIPT="00_quickstart_pretrain.py" +# TRAINING_SCRIPT="01_quickstart_finetune.py" +# TRAINING_SCRIPT="02_pretrain_with_yaml.py" +# TRAINING_SCRIPT="03_finetune_with_yaml.py" + +# Optional: YAML config file (for *_with_yaml.py scripts) +CONFIG_FILE="" +# CONFIG_FILE="conf/llama32_1b_pretrain.yaml" +# CONFIG_FILE="conf/llama32_1b_finetune.yaml" + +# Optional: Additional CLI overrides (for *_with_yaml.py scripts) +CLI_OVERRIDES="" +# CLI_OVERRIDES="train.train_iters=1000 train.global_batch_size=512" + +# Optional: For finetuning scripts, specify checkpoint path +PRETRAINED_CHECKPOINT="" +# PRETRAINED_CHECKPOINT="./checkpoints/llama32_1b" + +# Container image (optional, only if using containers) +CONTAINER_IMAGE="" +# CONTAINER_IMAGE="/path/to/container.sqsh" + +# Container mounts (optional, space-separated) +CONTAINER_MOUNTS="" +# CONTAINER_MOUNTS="/data:/data /model:/model" + +# ============================================================================== +# Environment Setup +# ============================================================================== + +# Set common environment variables +# Optional: Set these if needed +# export CUDA_DEVICE_MAX_CONNECTIONS=1 +# export NCCL_DEBUG=INFO + +# ============================================================================== +# Job Execution +# ============================================================================== + +echo "======================================" +echo "Megatron Bridge Training Job" +echo "======================================" +echo "Job ID: $SLURM_JOB_ID" +echo "Nodes: $SLURM_JOB_NUM_NODES" +echo "GPUs per node: $SLURM_GPUS_PER_NODE" +echo "Script: $TRAINING_SCRIPT" +echo "======================================" + +# Build the command +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SCRIPT_PATH="${SCRIPT_DIR}/${TRAINING_SCRIPT}" + +# Build torchrun command +CMD="torchrun" +CMD="$CMD --nproc_per_node=$SLURM_GPUS_PER_NODE" +CMD="$CMD --nnodes=$SLURM_JOB_NUM_NODES" +CMD="$CMD --node_rank=\$SLURM_PROCID" +CMD="$CMD --master_addr=\$(scontrol show hostname \$SLURM_NODELIST | head -n1)" +CMD="$CMD --master_port=29500" +CMD="$CMD $SCRIPT_PATH" + +# Add config file if specified +if [ -n "$CONFIG_FILE" ]; then + CMD="$CMD --config-file $CONFIG_FILE" +fi + +# Add pretrained checkpoint if specified (for finetuning) +if [ -n "$PRETRAINED_CHECKPOINT" ]; then + CMD="$CMD --pretrained-checkpoint $PRETRAINED_CHECKPOINT" +fi + +# Add CLI overrides if specified +if [ -n "$CLI_OVERRIDES" ]; then + CMD="$CMD $CLI_OVERRIDES" +fi + +echo "Executing: $CMD" +echo "======================================" + +# Execute with or without container +if [ -n "$CONTAINER_IMAGE" ]; then + # With container + SRUN_CMD="srun --container-image=$CONTAINER_IMAGE" + + # Add container mounts + if [ -n "$CONTAINER_MOUNTS" ]; then + for mount in $CONTAINER_MOUNTS; do + SRUN_CMD="$SRUN_CMD --container-mounts=$mount" + done + fi + + $SRUN_CMD bash -c "$CMD" +else + # Without container + srun bash -c "$CMD" +fi + +echo "======================================" +echo "Job completed" +echo 
"======================================" + From 5a921bf45292d7218ee62a01cf65fc6d97462d68 Mon Sep 17 00:00:00 2001 From: Ananth Subramaniam Date: Mon, 24 Nov 2025 04:07:55 -0800 Subject: [PATCH 4/7] move to tutorials Signed-off-by: Ananth Subramaniam --- {examples => tutorials}/recipes/llama/00_quickstart_pretrain.py | 0 {examples => tutorials}/recipes/llama/01_quickstart_finetune.py | 0 {examples => tutorials}/recipes/llama/02_pretrain_with_yaml.py | 0 {examples => tutorials}/recipes/llama/03_finetune_with_yaml.py | 0 .../recipes/llama/04_launch_slurm_with_nemo_run.py | 0 {examples => tutorials}/recipes/llama/README.md | 0 .../recipes/llama/conf/llama32_1b_finetune.yaml | 0 .../recipes/llama/conf/llama32_1b_pretrain.yaml | 0 {examples => tutorials}/recipes/llama/launch_with_sbatch.sh | 0 9 files changed, 0 insertions(+), 0 deletions(-) rename {examples => tutorials}/recipes/llama/00_quickstart_pretrain.py (100%) rename {examples => tutorials}/recipes/llama/01_quickstart_finetune.py (100%) rename {examples => tutorials}/recipes/llama/02_pretrain_with_yaml.py (100%) rename {examples => tutorials}/recipes/llama/03_finetune_with_yaml.py (100%) rename {examples => tutorials}/recipes/llama/04_launch_slurm_with_nemo_run.py (100%) rename {examples => tutorials}/recipes/llama/README.md (100%) rename {examples => tutorials}/recipes/llama/conf/llama32_1b_finetune.yaml (100%) rename {examples => tutorials}/recipes/llama/conf/llama32_1b_pretrain.yaml (100%) rename {examples => tutorials}/recipes/llama/launch_with_sbatch.sh (100%) diff --git a/examples/recipes/llama/00_quickstart_pretrain.py b/tutorials/recipes/llama/00_quickstart_pretrain.py similarity index 100% rename from examples/recipes/llama/00_quickstart_pretrain.py rename to tutorials/recipes/llama/00_quickstart_pretrain.py diff --git a/examples/recipes/llama/01_quickstart_finetune.py b/tutorials/recipes/llama/01_quickstart_finetune.py similarity index 100% rename from examples/recipes/llama/01_quickstart_finetune.py rename to tutorials/recipes/llama/01_quickstart_finetune.py diff --git a/examples/recipes/llama/02_pretrain_with_yaml.py b/tutorials/recipes/llama/02_pretrain_with_yaml.py similarity index 100% rename from examples/recipes/llama/02_pretrain_with_yaml.py rename to tutorials/recipes/llama/02_pretrain_with_yaml.py diff --git a/examples/recipes/llama/03_finetune_with_yaml.py b/tutorials/recipes/llama/03_finetune_with_yaml.py similarity index 100% rename from examples/recipes/llama/03_finetune_with_yaml.py rename to tutorials/recipes/llama/03_finetune_with_yaml.py diff --git a/examples/recipes/llama/04_launch_slurm_with_nemo_run.py b/tutorials/recipes/llama/04_launch_slurm_with_nemo_run.py similarity index 100% rename from examples/recipes/llama/04_launch_slurm_with_nemo_run.py rename to tutorials/recipes/llama/04_launch_slurm_with_nemo_run.py diff --git a/examples/recipes/llama/README.md b/tutorials/recipes/llama/README.md similarity index 100% rename from examples/recipes/llama/README.md rename to tutorials/recipes/llama/README.md diff --git a/examples/recipes/llama/conf/llama32_1b_finetune.yaml b/tutorials/recipes/llama/conf/llama32_1b_finetune.yaml similarity index 100% rename from examples/recipes/llama/conf/llama32_1b_finetune.yaml rename to tutorials/recipes/llama/conf/llama32_1b_finetune.yaml diff --git a/examples/recipes/llama/conf/llama32_1b_pretrain.yaml b/tutorials/recipes/llama/conf/llama32_1b_pretrain.yaml similarity index 100% rename from examples/recipes/llama/conf/llama32_1b_pretrain.yaml rename to 
tutorials/recipes/llama/conf/llama32_1b_pretrain.yaml diff --git a/examples/recipes/llama/launch_with_sbatch.sh b/tutorials/recipes/llama/launch_with_sbatch.sh similarity index 100% rename from examples/recipes/llama/launch_with_sbatch.sh rename to tutorials/recipes/llama/launch_with_sbatch.sh From 68f45cddc24ed6d6cbadd9eb17b1b40eb44ae536 Mon Sep 17 00:00:00 2001 From: Ananth Subramaniam Date: Mon, 24 Nov 2025 08:08:10 -0800 Subject: [PATCH 5/7] move to tutorials Signed-off-by: Ananth Subramaniam --- .../recipes/llama/01_quickstart_finetune.py | 17 +- tutorials/recipes/llama/README.md | 223 ++++++------------ .../llama/conf/llama32_1b_finetune.yaml | 5 +- .../llama/conf/llama32_1b_pretrain.yaml | 3 +- 4 files changed, 89 insertions(+), 159 deletions(-) diff --git a/tutorials/recipes/llama/01_quickstart_finetune.py b/tutorials/recipes/llama/01_quickstart_finetune.py index a6f8060d6..6a8af8e80 100644 --- a/tutorials/recipes/llama/01_quickstart_finetune.py +++ b/tutorials/recipes/llama/01_quickstart_finetune.py @@ -85,15 +85,14 @@ def main() -> None: # === Use your own dataset === # Replace SQuAD with your custom dataset # Option 1: Simple path override - # config.data.data_path = "/path/to/your/dataset.jsonl" - - # Option 2: Use FinetuningDatasetConfig for custom JSONL datasets - # from megatron.bridge.training.data import FinetuningDatasetConfig - # config.data = FinetuningDatasetConfig(data_path="/path/to/your/dataset.jsonl") - - # Option 3: Use HFDatasetConfig for HuggingFace datasets - # from megatron.bridge.training.data import HFDatasetConfig - # config.data = HFDatasetConfig(hf_dataset="squad", split="train") + # config.dataset.dataset_root = "/path/to/your/dataset" + + # Or replace the dataset with FinetuningDatasetConfig for JSONL data + # from megatron.bridge.training.config import FinetuningDatasetConfig + # config.dataset = FinetuningDatasetConfig( + # dataset_root="/path/to/your/dataset_dir", # expects training/validation/test jsonl files + # seq_length=config.model.seq_length, + # ) # === Adjust learning rate === # config.optimizer.lr = 5e-5 diff --git a/tutorials/recipes/llama/README.md b/tutorials/recipes/llama/README.md index bed7d357d..b5306707d 100644 --- a/tutorials/recipes/llama/README.md +++ b/tutorials/recipes/llama/README.md @@ -1,4 +1,4 @@ -# Llama Recipes with Megatron Bridge +# Recipes with Megatron Bridge This guide shows you how to pretrain and finetune Llama models using Megatron Bridge. @@ -12,7 +12,9 @@ torchrun --nproc_per_node=1 00_quickstart_pretrain.py This runs Llama 3.2 1B pretraining on a single GPU with mock data. -For finetuning, you need a checkpoint in Megatron format. Convert from HuggingFace: +For finetuning, you first need a checkpoint in Megatron format. Convert from HuggingFace using the `AutoBridge`: + +> **Note:** You must be authenticated with Hugging Face to download the model. Run `hf auth login --token $HF_TOKEN` if needed. ```bash python ../../conversion/convert_checkpoints.py import \ @@ -27,32 +29,46 @@ torchrun --nproc_per_node=1 01_quickstart_finetune.py \ --pretrained-checkpoint ./checkpoints/llama32_1b ``` -This finetunes Llama 3.2 1B using LoRA on the SQuAD dataset. +The [01_quickstart_finetune.py](01_quickstart_finetune.py) recipe finetunes Llama 3.2 1B using LoRA on the SQuAD dataset by default. 
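If you later substitute your own data for SQuAD (as described just below), the recipe expects JSONL files on disk. A small, hypothetical record-writing sketch — the `input`/`output` field names are assumed from the SQuAD-style convention and should be checked against your Megatron-Bridge version:

```python
# Hypothetical sketch: writing a tiny JSONL finetuning split.
import json

records = [
    {"input": "Context: ... Question: ...", "output": "..."},  # assumed field names
]
with open("training.jsonl", "w") as f:
    for rec in records:
        f.write(json.dumps(rec) + "\n")
```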
-To use real data, uncomment and modify in the script: +To plug in your own JSONL dataset, swap the dataset config in that script: ```python -config.data.data_path = "/path/to/your/dataset" +from megatron.bridge.training.config import FinetuningDatasetConfig + +config.dataset = FinetuningDatasetConfig( + dataset_root="/path/to/dataset_dir", # contains training/validation/test jsonl files + seq_length=config.model.seq_length, +) ``` -## Configuration with YAML +## Configuration + +Megatron Bridge recipes are standard Python scripts, giving you full flexibility in how you configure your training. You can: +1. Modify the Python scripts directly +2. Use the framework's YAML-based configuration system +3. Implement your own configuration management (ArgParse, Hydra, etc.) + +### Using Framework YAML Configs + +The recipes include optional support for YAML configuration and dot-notation overrides via `ConfigContainer`. This is just one way to manage config; you are free to use other methods. -For more complex configurations, use YAML files and command-line overrides: +To use the provided YAML system: ```bash torchrun --nproc_per_node=2 02_pretrain_with_yaml.py \ --config-file conf/llama32_1b_pretrain.yaml ``` -Understanding YAML Configuration: +Understanding the YAML Structure: -YAML files should be organized into sections that mirror the `ConfigContainer` structure. Each top-level key corresponds to a configuration section (e.g., `data`, `train`, `model`, `optimizer`). Overrides are applied in a nested manner according to the ConfigContainer fields. +YAML files mirror the `ConfigContainer` structure. Each top-level key corresponds to a configuration section (e.g., `dataset`, `train`, `model`, `optimizer`). Example YAML (`conf/llama32_1b_pretrain.yaml`): ```yaml # Each section maps to a ConfigContainer field -data: # GPTDatasetConfig +dataset: # GPTDatasetConfig data_path: /path/to/training/data sequence_length: 4096 @@ -72,9 +88,9 @@ optimizer: # OptimizerConfig lr: 0.0003 ``` -Override from command line using dot notation: +Command-Line Overrides: -Command-line overrides follow the same pattern as YAML structure. The first part before the dot indicates which subconfig of ConfigContainer to override (e.g., `train`, `model`, `optimizer`), and the part after the dot specifies the field within that subconfig. +You can override values using dot notation (`section.field=value`): ```bash torchrun --nproc_per_node=2 02_pretrain_with_yaml.py \ @@ -84,14 +100,55 @@ torchrun --nproc_per_node=2 02_pretrain_with_yaml.py \ optimizer.lr=0.0002 ``` -In this example: -- `train.train_iters=5000` → overrides `ConfigContainer.train.train_iters` -- `optimizer.lr=0.0002` → overrides `ConfigContainer.optimizer.lr` +Priority order (highest to lowest): +1. Command-line overrides +2. YAML config file +3. Base recipe defaults + +### Finetuning Configuration + +For more complex finetuning configurations: + +```bash +torchrun --nproc_per_node=2 03_finetune_with_yaml.py \ + --config-file conf/llama32_1b_finetune.yaml +``` + +Example YAML (`conf/llama32_1b_finetune.yaml`): + +```yaml +# Each section maps to a ConfigContainer field +dataset: # FinetuningDatasetConfig + data_path: /path/to/finetuning_dataset.jsonl + seq_length: 4096 + +train: # TrainingConfig + train_iters: 100 + global_batch_size: 128 -These example scripts are configured to accept overrides in the priority order (highest to lowest): -1. Command-line overrides (dot notation: `section.field=value`) -2. YAML config file (nested structure) -3. 
Base recipe defaults (from `llama32_1b_pretrain_config()`) +checkpoint: # CheckpointConfig + pretrained_checkpoint: /path/to/pretrained/checkpoint + save: ./checkpoints/llama32_1b_finetuned + save_interval: 50 + +peft: # PEFT (LoRA config) + dim: 8 # LoRA rank + alpha: 16 # LoRA alpha + +model: # Model Provider + seq_length: 4096 # Must match data.seq_length + +optimizer: # OptimizerConfig + lr: 0.0001 +``` + +Full Finetuning (No LoRA) + +```bash +torchrun --nproc_per_node=2 03_finetune_with_yaml.py \ + --peft none \ + train.train_iters=1000 +``` ## Multi-Node Training @@ -115,7 +172,7 @@ The `launch_with_sbatch.sh` script shows how to: ### NeMo-Run -For better job management and remote launching capabilities: +For job management and remote launching capabilities: Prerequisites: @@ -123,7 +180,7 @@ Prerequisites: pip install nemo-run ``` -From the Slurm cluster (LocalTunnel): +From the Slurm cluster login node: ```bash python 04_launch_slurm_with_nemo_run.py \ @@ -160,127 +217,3 @@ python 04_launch_slurm_with_nemo_run.py \ --account my_account \ --config-file conf/llama32_1b_finetune.yaml ``` - -## Finetuning - -### Quickstart: Finetune with LoRA - -Prerequisites: You need a checkpoint in Megatron format. Convert from HuggingFace: - -```bash -python ../../conversion/convert_checkpoints.py import \ - --hf-model meta-llama/Llama-3.2-1B \ - --megatron-path ./checkpoints/llama32_1b -``` - -Run finetuning: - -```bash -torchrun --nproc_per_node=1 01_quickstart_finetune.py \ - --pretrained-checkpoint ./checkpoints/llama32_1b -``` - -By default, this: -- Uses LoRA (Low-Rank Adaptation) for efficient finetuning -- Trains on the SQuAD dataset -- Works on a single GPU -- Llama 3.2 1B model - -Customize in the script: - -```python -# Use your own dataset (JSONL format) -config.data.data_path = "/path/to/your/dataset.jsonl" - -# Adjust LoRA hyperparameters -config.peft.dim = 16 # LoRA rank -config.peft.alpha = 32 # LoRA alpha scaling -``` - -### Configuration with YAML - -For more complex finetuning configurations: - -```bash -torchrun --nproc_per_node=2 03_finetune_with_yaml.py \ - --config-file conf/llama32_1b_finetune.yaml -``` - -Example YAML (`conf/llama32_1b_finetune.yaml`): - -```yaml -# Each section maps to a ConfigContainer field -data: # FinetuningDatasetConfig - data_path: /path/to/finetuning_dataset.jsonl - seq_length: 4096 - -train: # TrainingConfig - train_iters: 100 - global_batch_size: 128 - -checkpoint: # CheckpointConfig - pretrained_checkpoint: /path/to/pretrained/checkpoint - save: ./checkpoints/llama32_1b_finetuned - save_interval: 50 - -peft: # PEFT (LoRA config) - dim: 8 # LoRA rank - alpha: 16 # LoRA alpha - -model: # Model Provider - seq_length: 4096 # Must match data.seq_length - -optimizer: # OptimizerConfig - lr: 0.0001 -``` - -Override from command line using dot notation: - -The first part before the dot indicates which ConfigContainer subconfig to override, and the part after specifies the field. - -```bash -torchrun --nproc_per_node=2 03_finetune_with_yaml.py \ - --config-file conf/llama32_1b_finetune.yaml \ - peft.dim=16 \ - train.train_iters=2000 -``` - -Here, `peft.dim=16` overrides `ConfigContainer.peft.dim`. - -Full finetuning (no LoRA): - -```bash -torchrun --nproc_per_node=2 03_finetune_with_yaml.py \ - --peft none \ - train.train_iters=1000 -``` - -### Working with Checkpoints - -**Important:** Finetuning requires checkpoints in Megatron format. You cannot use HuggingFace checkpoints directly. - -You can obtain Megatron checkpoints by: - -1. 
Converting from HuggingFace -2. Using Megatron checkpoints from your own pretraining runs - -Convert HuggingFace checkpoint to Megatron format: - -```bash -python ../../conversion/convert_checkpoints.py import \ - --hf-model meta-llama/Llama-3.2-1B \ - --megatron-path ./checkpoints/llama32_1b -``` - -Use the checkpoint: - -```bash -# Command line (quickstart scripts) -torchrun --nproc_per_node=1 01_quickstart_finetune.py \ - --pretrained-checkpoint ./checkpoints/llama32_1b - -# YAML config (03_finetune_with_yaml.py) -# In conf/llama32_1b_finetune.yaml: -# checkpoint: -# pretrained_checkpoint: ./checkpoints/llama32_1b -``` diff --git a/tutorials/recipes/llama/conf/llama32_1b_finetune.yaml b/tutorials/recipes/llama/conf/llama32_1b_finetune.yaml index fd6bc1a73..5be4051bd 100644 --- a/tutorials/recipes/llama/conf/llama32_1b_finetune.yaml +++ b/tutorials/recipes/llama/conf/llama32_1b_finetune.yaml @@ -17,7 +17,7 @@ # Modify values as needed for your use case. # Data configuration -data: +dataset: # Replace with your dataset path (JSONL format recommended) # data_path: /path/to/your/finetuning_dataset.jsonl seq_length: 4096 @@ -56,7 +56,7 @@ peft: alpha: 16 # LoRA alpha scaling # Model configuration -# Note: seq_length must match data.seq_length +# Note: seq_length must match dataset.seq_length model: seq_length: 4096 tensor_model_parallel_size: 1 @@ -73,4 +73,3 @@ logger: # Random seed rng: seed: 1234 - diff --git a/tutorials/recipes/llama/conf/llama32_1b_pretrain.yaml b/tutorials/recipes/llama/conf/llama32_1b_pretrain.yaml index a46d0c689..378503aad 100644 --- a/tutorials/recipes/llama/conf/llama32_1b_pretrain.yaml +++ b/tutorials/recipes/llama/conf/llama32_1b_pretrain.yaml @@ -17,7 +17,7 @@ # Modify values as needed for your use case. # Data configuration -data: +dataset: # Replace with your dataset path # data_path: /path/to/your/dataset sequence_length: 4096 @@ -67,4 +67,3 @@ logger: # Random seed rng: seed: 1234 - From e41e906827fbe48cde6e5b608aa9c03f4607c653 Mon Sep 17 00:00:00 2001 From: Ananth Subramaniam Date: Tue, 25 Nov 2025 07:40:19 -0800 Subject: [PATCH 6/7] updates Signed-off-by: Ananth Subramaniam --- README.md | 31 +++---------------- src/megatron/bridge/recipes/gemma/gemma2.py | 2 +- .../llama/conf/llama32_1b_finetune.yaml | 7 +++-- .../llama/conf/llama32_1b_pretrain.yaml | 3 +- 4 files changed, 12 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index abbb7be78..7bda6076b 100644 --- a/README.md +++ b/README.md @@ -163,34 +163,13 @@ For more details on supported models, see our documentation: #### Launching Recipes -All recipes are ready to train out of the box, using mock data by default. For an example of how to override the default configuration through YAML or Hydra-style CLI overrides, please have a look at this [script](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/recipes/llama/pretrain_llama3_8b.py). The script can then be launched with `torchrun`. For example, with the aforementioned script: +For a conceptual overview of how recipes are structured, overridden, and launched with either `torchrun` or NeMo-Run, read the [Using Recipes guide](https://docs.nvidia.com/nemo/megatron-bridge/latest/recipe-usage.html). -```sh -torchrun --nproc-per-node=2 pretrain_llama3_8b.py model.tensor_model_parallel_size=1 -``` - -Optionally, Megatron Bridge also supports launching with [NeMo-Run](https://github.com/NVIDIA-NeMo/Run). 
See the following examples for reference on launching with NeMo-Run: - -- [pretrain_llama3_8b_nemo_run_script.py](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/recipes/llama/pretrain_llama3_8b_nemo_run_script.py) -- [pretrain_llama3_8b_nemo_run_partial.py](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/recipes/llama/pretrain_llama3_8b_nemo_run_partial.py) - -These examples can also be run as-is with the Llama 3 8B recipe (with NeMo-Run installed). - -Launch Llama 3 8B pretraining with NeMo-Run's `run.Script`: +Runnable tutorials live in [`tutorials/recipes/llama`](tutorials/recipes/llama) and ship with a detailed README that covers: -```sh -uv run python pretrain_llama3_8b_nemo_run_script.py \ - --nproc-per-node=2 \ - model.pipeline_model_parallel_size=1 \ - train.train_iters=10 # this script passes Hydra-style overrides to the target script -``` - -Launch Llama 3 8B pretraining with NeMo-Run's `run.Partial`: - -```sh -uv run python pretrain_llama3_8b_nemo_run_partial.py \ - --nproc-per-node=2 -``` +- `00_quickstart_pretrain.py` for mock-data pretraining +- `01_quickstart_finetune.py` + LoRA configs +- YAML-driven flows and launch helpers diff --git a/src/megatron/bridge/recipes/gemma/gemma2.py b/src/megatron/bridge/recipes/gemma/gemma2.py index 8987202ee..98c80e3e5 100644 --- a/src/megatron/bridge/recipes/gemma/gemma2.py +++ b/src/megatron/bridge/recipes/gemma/gemma2.py @@ -247,7 +247,7 @@ def _gemma2_common( reset_attention_mask=False, reset_position_ids=False, eod_mask_loss=False, - sequence_length=seq_length, + seq_length=seq_length, num_dataset_builder_threads=1, blend=blend, blend_per_split=blend_per_split, diff --git a/tutorials/recipes/llama/conf/llama32_1b_finetune.yaml b/tutorials/recipes/llama/conf/llama32_1b_finetune.yaml index 5be4051bd..02c3897a6 100644 --- a/tutorials/recipes/llama/conf/llama32_1b_finetune.yaml +++ b/tutorials/recipes/llama/conf/llama32_1b_finetune.yaml @@ -30,8 +30,6 @@ train: eval_iters: 10 eval_interval: 50 - # Load from pretrained checkpoint - # load: /path/to/pretrained/checkpoint # Optimizer configuration optimizer: @@ -48,6 +46,11 @@ scheduler: checkpoint: # Directory to save finetuned checkpoints save: ./checkpoints/llama32_1b_finetuned + # Directory to resume from during training + load: ./checkpoints/llama32_1b + # Directory for pretrained weights in Megatron format + pretrained_checkpoint: ./path/to/pretrained/checkpoint/ + save_interval: 50 # LoRA configuration diff --git a/tutorials/recipes/llama/conf/llama32_1b_pretrain.yaml b/tutorials/recipes/llama/conf/llama32_1b_pretrain.yaml index 378503aad..9b9e8bf1d 100644 --- a/tutorials/recipes/llama/conf/llama32_1b_pretrain.yaml +++ b/tutorials/recipes/llama/conf/llama32_1b_pretrain.yaml @@ -45,9 +45,8 @@ scheduler: checkpoint: # Directory to save checkpoints save: ./checkpoints/llama32_1b + load: ./checkpoints/llama32_1b save_interval: 50 - # Resume from checkpoint (optional) - # load: ./checkpoints/llama32_1b/iter_0000050 # Model configuration # Note: seq_length must match data.sequence_length From eec49d39fdba1bdc3f1817285f42be41c761e105 Mon Sep 17 00:00:00 2001 From: Ananth Subramaniam Date: Tue, 25 Nov 2025 09:50:01 -0800 Subject: [PATCH 7/7] fix docs Signed-off-by: Ananth Subramaniam --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7bda6076b..012a649af 100644 --- a/README.md +++ b/README.md @@ -165,7 +165,7 @@ For more details on supported models, see our 
 documentation: For a conceptual overview of how recipes are structured, overridden, and launched with either `torchrun` or NeMo-Run, read the [Using Recipes guide](https://docs.nvidia.com/nemo/megatron-bridge/latest/recipe-usage.html). -Runnable tutorials live in [`tutorials/recipes/llama`](tutorials/recipes/llama) and ship with a detailed README that covers: +Runnable tutorials live in `tutorials/recipes/llama` and cover: - `00_quickstart_pretrain.py` for mock-data pretraining - `01_quickstart_finetune.py` + LoRA configs