diff --git a/README.md b/README.md
index abbb7be78..012a649af 100644
--- a/README.md
+++ b/README.md
@@ -163,34 +163,13 @@ For more details on supported models, see our documentation:
 
 #### Launching Recipes
 
-All recipes are ready to train out of the box, using mock data by default. For an example of how to override the default configuration through YAML or Hydra-style CLI overrides, please have a look at this [script](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/recipes/llama/pretrain_llama3_8b.py). The script can then be launched with `torchrun`. For example, with the aforementioned script:
+For a conceptual overview of how recipes are structured, overridden, and launched with either `torchrun` or NeMo-Run, read the [Using Recipes guide](https://docs.nvidia.com/nemo/megatron-bridge/latest/recipe-usage.html).
 
-```sh
-torchrun --nproc-per-node=2 pretrain_llama3_8b.py model.tensor_model_parallel_size=1
-```
-
-Optionally, Megatron Bridge also supports launching with [NeMo-Run](https://github.com/NVIDIA-NeMo/Run). See the following examples for reference on launching with NeMo-Run:
-
-- [pretrain_llama3_8b_nemo_run_script.py](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/recipes/llama/pretrain_llama3_8b_nemo_run_script.py)
-- [pretrain_llama3_8b_nemo_run_partial.py](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/recipes/llama/pretrain_llama3_8b_nemo_run_partial.py)
-
-These examples can also be run as-is with the Llama 3 8B recipe (with NeMo-Run installed).
-
-Launch Llama 3 8B pretraining with NeMo-Run's `run.Script`:
+Runnable tutorials live in `tutorials/recipes/llama`, covering:
 
-```sh
-uv run python pretrain_llama3_8b_nemo_run_script.py \
-    --nproc-per-node=2 \
-    model.pipeline_model_parallel_size=1 \
-    train.train_iters=10  # this script passes Hydra-style overrides to the target script
-```
-
-Launch Llama 3 8B pretraining with NeMo-Run's `run.Partial`:
-
-```sh
-uv run python pretrain_llama3_8b_nemo_run_partial.py \
-    --nproc-per-node=2
-```
+- `00_quickstart_pretrain.py` for mock-data pretraining
+- `01_quickstart_finetune.py` for LoRA finetuning
+- YAML-driven flows and launch helpers
diff --git a/examples/recipes/llama/conf/llama3_8b_pretrain_override_example.yaml b/examples/recipes/llama/conf/llama3_8b_pretrain_override_example.yaml
deleted file mode 100644
index 5f55d9988..000000000
--- a/examples/recipes/llama/conf/llama3_8b_pretrain_override_example.yaml
+++ /dev/null
@@ -1,65 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Example override file
-
-# To override a parameter, ensure the structure matches the ConfigContainer
-# and its sub-configurations (e.g., model, train, etc.)
-# Top-level ConfigContainer fields are dataclasses themselves - -model: - seq_length: 4096 - -train: - train_iters: 20 - global_batch_size: 8 - micro_batch_size: 1 - eval_iters: 0 - -optimizer: - lr: 0.00025 - min_lr: 0.000025 - -scheduler: - lr_warmup_iters: 10 - -checkpoint: - # Directory to save to. If null, no checkpoint will be saved. - save: null - -dist: - use_megatron_fsdp: false - use_torch_fsdp2: false - -logger: - log_interval: 1 - -dataset: - seq_length: 4096 - -rng: - seed: 42 - -ddp: - grad_reduce_in_fp32: true - -profiling: - # For optional fields in the config, specify the target to instantiate the object. - _target_: megatron.bridge.training.config.ProfilingConfig - use_nsys_profiler: false - profile_step_start: 5 - profile_step_end: 10 - use_pytorch_profiler: true - profile_ranks: [0, 1] - record_shapes: true diff --git a/examples/recipes/llama/conf/llama3_8b_pretrain_override_example_megatron_fsdp.yaml b/examples/recipes/llama/conf/llama3_8b_pretrain_override_example_megatron_fsdp.yaml deleted file mode 100644 index 0e239e284..000000000 --- a/examples/recipes/llama/conf/llama3_8b_pretrain_override_example_megatron_fsdp.yaml +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Example override file - -# To override a parameter, ensure the structure matches the ConfigContainer -# and its sub-configurations (e.g., model, train, etc.) -# Top-level ConfigContainer fields are dataclasses themselves - -model: - seq_length: 4096 - init_model_with_meta_device: true - -train: - train_iters: 20 - global_batch_size: 8 - micro_batch_size: 1 - eval_iters: 0 - -optimizer: - lr: 0.00025 - min_lr: 0.000025 - -scheduler: - lr_warmup_iters: 10 - -checkpoint: - # Directory to save to. If null, no checkpoint will be saved. - save: null - ckpt_format: "fsdp_dtensor" - -dist: - use_megatron_fsdp: true - use_torch_fsdp2: false - -logger: - log_interval: 1 - -dataset: - seq_length: 4096 - -rng: - seed: 42 - -ddp: - grad_reduce_in_fp32: true - data_parallel_sharding_strategy: "optim_grads_params" # for Megatron FSDP ZeRO-3 like sharding - -profiling: - # For optional fields in the config, specify the target to instantiate the object. - _target_: megatron.bridge.training.config.ProfilingConfig - use_nsys_profiler: false - profile_step_start: 5 - profile_step_end: 10 - use_pytorch_profiler: true - profile_ranks: [0, 1] - record_shapes: true diff --git a/examples/recipes/llama/pretrain_llama3_8b.py b/examples/recipes/llama/pretrain_llama3_8b.py deleted file mode 100644 index 76ebde762..000000000 --- a/examples/recipes/llama/pretrain_llama3_8b.py +++ /dev/null @@ -1,184 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Llama3 8B Pretraining Script with YAML and CLI Configuration Overrides. - -This script provides a flexible way to pretrain Llama3 8B models using Megatron-Bridge with support for -both YAML configuration files and command-line overrides using Hydra-style syntax. - -Examples: - Basic usage with default configuration: - $ torchrun --nproc_per_node=8 examples/recipes/llama/pretrain_llama3_8b.py - - Using a custom YAML config file: - $ torchrun --nproc_per_node=8 pretrain_llama3_8b.py --config-file my_custom_config.yaml - - Using CLI overrides only: - $ torchrun --nproc_per_node=8 pretrain_llama3_8b.py model.tensor_model_parallel_size=4 train.train_iters=100000 - - Combining YAML and CLI overrides (CLI takes precedence): - $ torchrun --nproc_per_node=8 pretrain_llama3_8b.py --config-file conf/my_config.yaml \ - model.pipeline_dtype=torch.float16 \ - train.global_batch_size=512 - -Configuration Precedence: - 1. Base configuration from pretrain_config() recipe - 2. YAML overrides from --config-file (if provided) - 3. CLI overrides (highest precedence) - -Supported Override Syntax: - - Standard assignment: key=value - - Nested assignment: section.subsection.key=value - - Addition: +new_key=value - - Deletion: ~key_to_remove - - Type conversion: Automatic for basic types (int, float, bool, str) - - Complex types: torch.dtype, enums, etc. are supported -""" - -import argparse -import logging -import os -import sys -from pathlib import Path -from typing import Tuple - -import torch -from omegaconf import OmegaConf - -from megatron.bridge.recipes.llama import llama3_8b_pretrain_config as pretrain_config -from megatron.bridge.training.config import ConfigContainer -from megatron.bridge.training.gpt_step import forward_step -from megatron.bridge.training.pretrain import pretrain -from megatron.bridge.training.utils.omegaconf_utils import ( - apply_overrides, - create_omegaconf_dict_config, - parse_hydra_overrides, -) -from megatron.bridge.utils.common_utils import get_rank_safe - - -logger: logging.Logger = logging.getLogger(__name__) - - -# Define paths relative to this script's location -# Assumes this script (pretrain_llama3_8b.py) is in Megatron-Bridge/examples/recipes/llama/ -# and the config is in a 'conf' subdirectory. -SCRIPT_DIR: Path = Path(__file__).parent.resolve() -DEFAULT_CONFIG_FILENAME: str = "llama3_8b_pretrain_override_example.yaml" -DEFAULT_CONFIG_FILE_PATH: Path = SCRIPT_DIR / "conf" / DEFAULT_CONFIG_FILENAME - - -def parse_cli_args() -> Tuple[argparse.Namespace, list[str]]: - """Parse command line arguments, separating known script args from OmegaConf overrides.""" - parser = argparse.ArgumentParser( - description="Pretrain Llama3 8B model using Megatron-Bridge with YAML and CLI overrides", - formatter_class=argparse.RawTextHelpFormatter, - ) - parser.add_argument( - "--config-file", - type=str, - default=str(DEFAULT_CONFIG_FILE_PATH), - help="Path to the YAML OmegaConf override file. 
Default: conf/llama3_8b_pretrain_override_example.yaml", - ) - parser.add_argument("--debug", action="store_true", help="Enable debug logging") - - # Parse known args for the script, remaining will be treated as overrides - args, cli_dotlist_overrides = parser.parse_known_args() - return args, cli_dotlist_overrides - - -def main() -> None: - """ - Entry point for the Llama3 8B pretraining script. - - This function orchestrates the complete configuration workflow: - 1. Loads the base configuration from pretrain_config() recipe - 2. Applies YAML overrides from --config-file (if exists) - 3. Applies CLI overrides using Hydra-style syntax - 4. Starts Megatron pretraining with the final merged configuration - - Configuration merging preserves callable fields (like activation functions) - and handles type conversions automatically. - - Examples of CLI usage: - # Use default config with custom learning rate - torchrun --nproc_per_node=8 pretrain_llama3_8b.py optimizer.lr=0.0002 - - # Custom config file with additional overrides - torchrun --nproc_per_node=8 pretrain_llama3_8b.py --config-file my_config.yaml train.train_iters=50000 - - # Multiple overrides for distributed training - torchrun --nproc_per_node=8 pretrain_llama3_8b.py \ - model.tensor_model_parallel_size=4 \ - model.pipeline_model_parallel_size=2 \ - train.global_batch_size=512 - """ - args, cli_overrides = parse_cli_args() - - logger.info("Megatron-Bridge Llama3 8B Pretraining Script with YAML & CLI Overrides") - logger.info("------------------------------------------------------------------") - - # Load base configuration from the recipe as a Python dataclass - cfg: ConfigContainer = pretrain_config() - logger.info("Loaded base configuration") - - # Print configuration on rank 0 - if get_rank_safe() == 0: - cfg.print_yaml() - - # Convert the initial Python dataclass to an OmegaConf DictConfig for merging - merged_omega_conf, excluded_fields = create_omegaconf_dict_config(cfg) - - # Load and merge YAML overrides if a config file is provided - if args.config_file: - logger.debug(f"Loading YAML overrides from: {args.config_file}") - if not os.path.exists(args.config_file): - logger.error(f"Override YAML file not found: {args.config_file}") - sys.exit(1) - yaml_overrides_omega = OmegaConf.load(args.config_file) - merged_omega_conf = OmegaConf.merge(merged_omega_conf, yaml_overrides_omega) - logger.debug("YAML overrides merged successfully.") - - # Apply command-line overrides using Hydra-style parsing - if cli_overrides: - logger.debug(f"Applying Hydra-style command-line overrides: {cli_overrides}") - merged_omega_conf = parse_hydra_overrides(merged_omega_conf, cli_overrides) - logger.debug("Hydra-style command-line overrides applied successfully.") - - # Apply the final merged OmegaConf configuration back to the original ConfigContainer - logger.debug("Applying final merged configuration back to Python ConfigContainer...") - final_overrides_as_dict = OmegaConf.to_container(merged_omega_conf, resolve=True) - # Apply overrides while preserving excluded fields - apply_overrides(cfg, final_overrides_as_dict, excluded_fields) - - # Display final configuration - if get_rank_safe() == 0: - logger.info("--- Final Merged Configuration ---") - cfg.print_yaml() - logger.info("----------------------------------") - - # Start training - logger.debug("Starting pretraining...") - pretrain(config=cfg, forward_step_func=forward_step) - - # Cleanup process group - if torch.distributed.is_initialized(): - torch.distributed.barrier() - 
torch.distributed.destroy_process_group() - - -if __name__ == "__main__": - main() diff --git a/examples/recipes/llama/pretrain_llama3_8b_nemo_run_script.py b/examples/recipes/llama/pretrain_llama3_8b_nemo_run_script.py deleted file mode 100644 index 6b8c6c68d..000000000 --- a/examples/recipes/llama/pretrain_llama3_8b_nemo_run_script.py +++ /dev/null @@ -1,150 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -NeMo Run Launcher for Llama3 8B Pretraining. - -This script launches the pretrain_llama3_8b.py script using NeMo Run with TorchRun, -while forwarding any additional command line arguments to the target script. - -Examples: - Basic usage with default config: - $ python pretrain_llama3_8b_nemo_run_script.py --nproc-per-node=8 - - Using a custom config file: - $ python pretrain_llama3_8b_nemo_run_script.py --nproc-per-node=8 --config-file=my_config.yaml - - Passing additional overrides to the target script: - $ python pretrain_llama3_8b_nemo_run_script.py --nproc-per-node=8 \ - model.tensor_model_parallel_size=4 \ - train.train_iters=100000 - - Using both custom config and CLI overrides: - $ python pretrain_llama3_8b_nemo_run_script.py --nproc-per-node=8 \ - --config-file=conf/my_custom_config.yaml \ - optimizerg.lr=0.0002 \ - train.global_batch_size=512 - - Dry run to see what would be executed: - $ python pretrain_llama3_8b_nemo_run_script.py --nproc-per-node=8 --dryrun \ - model.pipeline_dtype=torch.float16 - -Argument Forwarding: - Any arguments not recognized by this launcher script will be forwarded - to the target pretrain_llama3_8b.py script as Hydra-style overrides. -""" - -import argparse -import logging -import sys -from pathlib import Path -from typing import Tuple - -import nemo_run as run - - -logger: logging.Logger = logging.getLogger(__name__) - -# Define paths relative to this script's location -# Assumes this script (pretrain_llama3_8b_nemo_run_script.py) is in Megatron-Bridge/examples/recipes/llama/ -# and pretrain_llama3_8b.py is in the same directory, -# and the config is in a 'conf' subdirectory. -SCRIPT_DIR: Path = Path(__file__).parent.resolve() -PRETRAIN_SCRIPT_FILENAME: str = "pretrain_llama3_8b.py" -PRETRAIN_SCRIPT_PATH: Path = SCRIPT_DIR / PRETRAIN_SCRIPT_FILENAME -DEFAULT_CONFIG_FILENAME: str = "llama3_8b_pretrain_override_example.yaml" -DEFAULT_CONFIG_FILE_PATH: Path = SCRIPT_DIR / "conf" / DEFAULT_CONFIG_FILENAME - - -def parse_cli_args() -> Tuple[argparse.Namespace, list[str]]: - """Parse command line arguments, separating launcher args from target script args.""" - parser = argparse.ArgumentParser( - description="Launcher for Llama3 8B pretraining using nemo_run and TorchRun. 
" - "Additional arguments will be forwarded to pretrain_llama3_8b.py", - formatter_class=argparse.RawTextHelpFormatter, - ) - parser.add_argument( - "--nproc-per-node", - type=int, - default=2, - help="Number of processes per node for TorchRun (typically number of GPUs).", - ) - parser.add_argument( - "--config-file", - type=str, - default=str(DEFAULT_CONFIG_FILE_PATH), - help="Path to the YAML override config file for the pretrain_llama3_8b.py script.", - ) - parser.add_argument( - "--dryrun", - action="store_true", - help="Dry run the script without actually running it.", - ) - - # Parse known args for the launcher, remaining will be forwarded to target script - args, forwarded_args = parser.parse_known_args() - return args, forwarded_args - - -def main() -> None: - """ - Main function for script demonstrating how to use the NeMo Run executor. - """ - args, forwarded_args = parse_cli_args() - - logger.info("Nemo Run Launcher for Llama3 8B Pretraining") - logger.info("===========================================") - - if not PRETRAIN_SCRIPT_PATH.is_file(): - logger.error(f"Target pretraining script not found: {PRETRAIN_SCRIPT_PATH}") - logger.error(f"Please ensure '{PRETRAIN_SCRIPT_FILENAME}' exists in the same directory as this launcher.") - sys.exit(1) - - config_file_to_use = Path(args.config_file).resolve() - if not config_file_to_use.is_file(): - logger.error(f"Specified YAML config file not found: {config_file_to_use}") - logger.error("Ensure the path passed to --config_file is correct.") - sys.exit(1) - - # Build the arguments list for the target script - target_script_args = [ - "--config-file", - str(config_file_to_use), - ] - - # Add any forwarded arguments (Hydra-style overrides and other target script args) - if forwarded_args: - target_script_args.extend(forwarded_args) - logger.info(f"Forwarding additional arguments to target script: {forwarded_args}") - - logger.info(f"Target script: {PRETRAIN_SCRIPT_PATH}") - logger.info(f"Target script arguments: {target_script_args}") - - train_script = run.Script( - path=str(PRETRAIN_SCRIPT_PATH), - entrypoint="python", - args=target_script_args, - ) - - # Define the executor - logger.info(f"Launching locally with TorchRun with nproc_per_node={args.nproc_per_node}") - executor = run.LocalExecutor(ntasks_per_node=args.nproc_per_node, launcher="torchrun") - - # Execute the run - run.run(train_script, executor=executor, dryrun=args.dryrun) - - -if __name__ == "__main__": - main() diff --git a/src/megatron/bridge/recipes/gemma/gemma2.py b/src/megatron/bridge/recipes/gemma/gemma2.py index 8987202ee..98c80e3e5 100644 --- a/src/megatron/bridge/recipes/gemma/gemma2.py +++ b/src/megatron/bridge/recipes/gemma/gemma2.py @@ -247,7 +247,7 @@ def _gemma2_common( reset_attention_mask=False, reset_position_ids=False, eod_mask_loss=False, - sequence_length=seq_length, + seq_length=seq_length, num_dataset_builder_threads=1, blend=blend, blend_per_split=blend_per_split, diff --git a/tutorials/recipes/llama/00_quickstart_pretrain.py b/tutorials/recipes/llama/00_quickstart_pretrain.py new file mode 100644 index 000000000..245cca0be --- /dev/null +++ b/tutorials/recipes/llama/00_quickstart_pretrain.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Quickstart: Pretrain Llama 3.2 1B with Megatron Bridge
+
+Usage:
+    Single GPU:
+        torchrun --nproc_per_node=1 00_quickstart_pretrain.py
+
+    Multiple GPUs (automatic data parallelism):
+        torchrun --nproc_per_node=8 00_quickstart_pretrain.py
+
+The script uses sensible defaults and mock data for quick testing.
+For custom configurations through YAML and Hydra-style overrides, see 02_pretrain_with_yaml.py.
+For multi-node training, see launch_with_sbatch.sh or 04_launch_slurm_with_nemo_run.py.
+"""
+
+from megatron.bridge.recipes.llama import llama32_1b_pretrain_config
+from megatron.bridge.training.gpt_step import forward_step
+from megatron.bridge.training.pretrain import pretrain
+
+
+def main() -> None:
+    """Run Llama 3.2 1B pretraining with default configuration."""
+
+    # Load the base recipe configuration
+    # Llama 3.2 1B works on a single GPU (TP=1, PP=1, CP=1)
+    config = llama32_1b_pretrain_config()
+
+    # The quick-test settings below are active so the script finishes fast;
+    # the remaining settings are optional: uncomment and modify as needed.
+
+    # For a quick test run:
+    config.train.train_iters = 10
+    config.scheduler.lr_warmup_iters = 2
+
+    # Use your own data:
+    # config.dataset.data_path = "/path/to/your/dataset"
+
+    # Adjust batch sizes for your GPU memory:
+    # config.train.global_batch_size = 256
+    # config.train.micro_batch_size = 2
+
+    # Change checkpoint save frequency:
+    # config.checkpoint.save_interval = 500
+
+    # Start pretraining
+    pretrain(config=config, forward_step_func=forward_step)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tutorials/recipes/llama/01_quickstart_finetune.py b/tutorials/recipes/llama/01_quickstart_finetune.py
new file mode 100644
index 000000000..6a8af8e80
--- /dev/null
+++ b/tutorials/recipes/llama/01_quickstart_finetune.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Quickstart: Finetune Llama 3.2 1B with Megatron Bridge
+
+Usage:
+    Single GPU with LoRA:
+        torchrun --nproc_per_node=1 01_quickstart_finetune.py \
+            --pretrained-checkpoint /path/to/megatron/checkpoint
+
+    Multiple GPUs (automatic data parallelism):
+        torchrun --nproc_per_node=8 01_quickstart_finetune.py \
+            --pretrained-checkpoint /path/to/megatron/checkpoint
+
+Prerequisites:
+    You need a checkpoint in Megatron format. You can either:
+    1. Convert HF checkpoint to Megatron format:
+        python examples/conversion/convert_checkpoints.py import \
+            --hf-model meta-llama/Llama-3.2-1B \
+            --megatron-path ./checkpoints/llama32_1b
+    2. Use a checkpoint from pretraining (see 00_quickstart_pretrain.py)
+
+The script uses the SQuAD dataset by default. See inline comments for:
+- Using your own dataset
+- Adjusting LoRA hyperparameters
+- Switching to full supervised finetuning
+
+For YAML configuration, see 03_finetune_with_yaml.py.
+For multi-node training, see launch_with_sbatch.sh or 04_launch_slurm_with_nemo_run.py.
+"""
+
+import argparse
+
+from megatron.bridge.recipes.llama import llama32_1b_finetune_config
+from megatron.bridge.training.finetune import finetune
+from megatron.bridge.training.gpt_step import forward_step
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse command-line arguments."""
+    parser = argparse.ArgumentParser(
+        description="Finetune Llama 3.2 1B with LoRA",
+        formatter_class=argparse.RawTextHelpFormatter,
+    )
+    parser.add_argument(
+        "--pretrained-checkpoint",
+        type=str,
+        required=True,
+        help="Path to pretrained checkpoint in Megatron format",
+    )
+    return parser.parse_args()
+
+
+def main() -> None:
+    """Run Llama 3.2 1B finetuning with LoRA."""
+    args = parse_args()
+
+    # Load the base finetune configuration
+    # Uses LoRA for efficient finetuning on a single GPU
+    config = llama32_1b_finetune_config()
+
+    # Load from the pretrained checkpoint
+    config.checkpoint.pretrained_checkpoint = args.pretrained_checkpoint
+
+    # === Quick test run ===
+    config.train.train_iters = 10
+    config.scheduler.lr_warmup_iters = 2
+
+    # ===== OPTIONAL CUSTOMIZATIONS =====
+    # Uncomment and modify as needed:
+
+    # === Use your own dataset ===
+    # Replace SQuAD with your custom dataset
+    # Option 1: Simple path override
+    # config.dataset.dataset_root = "/path/to/your/dataset"
+
+    # Option 2: Replace the dataset with FinetuningDatasetConfig for JSONL data
+    # from megatron.bridge.training.config import FinetuningDatasetConfig
+    # config.dataset = FinetuningDatasetConfig(
+    #     dataset_root="/path/to/your/dataset_dir",  # expects training/validation/test jsonl files
+    #     seq_length=config.model.seq_length,
+    # )
+
+    # === Adjust learning rate ===
+    # config.optimizer.lr = 5e-5
+
+    # === Change checkpoint save frequency ===
+    # config.checkpoint.save_interval = 100
+
+    # === Adjust LoRA hyperparameters ===
+    # Higher rank = more trainable parameters, potentially better quality but slower
+    # config.peft.dim = 16    # LoRA rank
+    # config.peft.alpha = 32  # LoRA alpha scaling
+
+    # === Full supervised finetuning (no LoRA) ===
+    # For full finetuning, reload config with peft=None:
+    # config = llama32_1b_finetune_config(peft=None)
+    # config.checkpoint.pretrained_checkpoint = args.pretrained_checkpoint
+    # Note: Full finetuning uses more memory than LoRA
+    # The recipe automatically adjusts parallelism for full SFT
+
+    # Start finetuning
+    finetune(config=config, forward_step_func=forward_step)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tutorials/recipes/llama/02_pretrain_with_yaml.py b/tutorials/recipes/llama/02_pretrain_with_yaml.py
new file mode 100644
index 000000000..999922324
--- /dev/null
+++ b/tutorials/recipes/llama/02_pretrain_with_yaml.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Pretrain with YAML Configuration and CLI Overrides
+
+This script demonstrates how to use YAML configuration files and command-line
+overrides for more complex configuration management.
+
+Usage:
+    With base recipe defaults (no YAML file):
+        torchrun --nproc_per_node=8 02_pretrain_with_yaml.py
+
+    With custom config file:
+        torchrun --nproc_per_node=2 02_pretrain_with_yaml.py \
+            --config-file conf/my_custom_config.yaml
+
+    With command-line overrides:
+        torchrun --nproc_per_node=2 02_pretrain_with_yaml.py \
+            train.train_iters=5000 \
+            train.global_batch_size=256
+
+    Combining YAML and CLI (CLI takes precedence):
+        torchrun --nproc_per_node=2 02_pretrain_with_yaml.py \
+            --config-file conf/llama32_1b_pretrain.yaml \
+            train.train_iters=10000
+
+Configuration Priority (highest to lowest):
+    1. Command-line overrides (highest)
+    2. YAML config file
+    3. Base recipe defaults (lowest)
+
+See conf/ directory for example YAML configurations.
+For pure Python usage, see 00_quickstart_pretrain.py.
+"""
+
+import argparse
+import logging
+import sys
+from pathlib import Path
+from typing import Tuple
+
+from omegaconf import OmegaConf
+
+from megatron.bridge.recipes.llama import llama32_1b_pretrain_config
+from megatron.bridge.training.config import ConfigContainer
+from megatron.bridge.training.gpt_step import forward_step
+from megatron.bridge.training.pretrain import pretrain
+from megatron.bridge.training.utils.omegaconf_utils import (
+    apply_overrides,
+    create_omegaconf_dict_config,
+    parse_hydra_overrides,
+)
+
+
+logger = logging.getLogger(__name__)
+
+# Example config file location
+SCRIPT_DIR = Path(__file__).parent.resolve()
+DEFAULT_CONFIG_FILE = SCRIPT_DIR / "conf" / "llama32_1b_pretrain.yaml"
+
+
+def parse_args() -> Tuple[argparse.Namespace, list[str]]:
+    """Parse command-line arguments."""
+    parser = argparse.ArgumentParser(
+        description="Pretrain with YAML configuration and CLI overrides",
+        formatter_class=argparse.RawTextHelpFormatter,
+    )
+    parser.add_argument(
+        "--config-file",
+        type=str,
+        default=None,
+        help=f"Path to YAML config file (optional), e.g. {DEFAULT_CONFIG_FILE}",
+    )
+    parser.add_argument("--debug", action="store_true", help="Enable debug logging")
+
+    # Separate known args from CLI overrides
+    args, cli_overrides = parser.parse_known_args()
+    return args, cli_overrides
+
+
+def main() -> None:
+    """Run pretraining with YAML configuration and CLI overrides."""
+    args, cli_overrides = parse_args()
+    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
+
+    # Load base configuration from recipe
+    config: ConfigContainer = llama32_1b_pretrain_config()
+
+    # Convert to OmegaConf for merging
+    omega_conf, excluded_fields = create_omegaconf_dict_config(config)
+
+    # Apply YAML overrides if provided
+    if args.config_file:
+        config_file_path = Path(args.config_file)
+        if not config_file_path.exists():
+            logger.error(f"Config file not found: {config_file_path}")
+            sys.exit(1)
+
+        yaml_conf = OmegaConf.load(config_file_path)
+        omega_conf = OmegaConf.merge(omega_conf, yaml_conf)
+
+    # Apply command-line overrides
+    if cli_overrides:
+        omega_conf = parse_hydra_overrides(omega_conf, cli_overrides)
+
+    # Convert back to ConfigContainer
+    final_config_dict = OmegaConf.to_container(omega_conf, resolve=True)
+    apply_overrides(config, final_config_dict, excluded_fields)
+
+    # Start pretraining
+    pretrain(config=config, forward_step_func=forward_step)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tutorials/recipes/llama/03_finetune_with_yaml.py b/tutorials/recipes/llama/03_finetune_with_yaml.py
new file mode 100644
index 000000000..ead33540f
--- /dev/null
+++ b/tutorials/recipes/llama/03_finetune_with_yaml.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Finetune with YAML Configuration and CLI Overrides
+
+This script demonstrates how to use YAML configuration files and command-line
+overrides for finetuning with LoRA or full supervised finetuning (SFT).
+
+Usage:
+    With base recipe defaults (no YAML file):
+        torchrun --nproc_per_node=1 03_finetune_with_yaml.py
+
+    With custom config file:
+        torchrun --nproc_per_node=2 03_finetune_with_yaml.py \
+            --config-file conf/my_finetune_config.yaml
+
+    With command-line overrides:
+        torchrun --nproc_per_node=2 03_finetune_with_yaml.py \
+            train.train_iters=1000 \
+            optimizer.lr=5e-5
+
+    Full finetuning instead of LoRA:
+        torchrun --nproc_per_node=2 03_finetune_with_yaml.py \
+            --peft none \
+            train.train_iters=1000
+
+    Combining YAML and CLI (CLI takes precedence):
+        torchrun --nproc_per_node=2 03_finetune_with_yaml.py \
+            --config-file conf/llama32_1b_finetune.yaml \
+            peft.dim=16 \
+            train.train_iters=2000
+
+Configuration Priority (highest to lowest):
+    1. Command-line overrides (highest)
+    2. YAML config file
+    3. Base recipe defaults (lowest)
+
+See conf/ directory for example YAML configurations.
+For pure Python usage, see 01_quickstart_finetune.py.
+""" + +import argparse +import logging +import sys +from pathlib import Path +from typing import Tuple + +from omegaconf import OmegaConf + +from megatron.bridge.recipes.llama import llama32_1b_finetune_config +from megatron.bridge.training.config import ConfigContainer +from megatron.bridge.training.finetune import finetune +from megatron.bridge.training.gpt_step import forward_step +from megatron.bridge.training.utils.omegaconf_utils import ( + apply_overrides, + create_omegaconf_dict_config, + parse_hydra_overrides, +) + + +logger = logging.getLogger(__name__) + +# Default config file location +SCRIPT_DIR = Path(__file__).parent.resolve() +DEFAULT_CONFIG_FILE = SCRIPT_DIR / "conf" / "llama32_1b_finetune.yaml" + + +def parse_args() -> Tuple[argparse.Namespace, list[str]]: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Finetune with YAML configuration and CLI overrides", + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.add_argument( + "--config-file", + type=str, + default=None, + help=f"Path to YAML config file (optional). Default: {DEFAULT_CONFIG_FILE}", + ) + parser.add_argument( + "--peft", + type=str, + default="lora", + choices=["lora", "dora", "none"], + help="PEFT method to use. Use 'none' for full finetuning.", + ) + parser.add_argument("--debug", action="store_true", help="Enable debug logging") + + # Separate known args from CLI overrides + args, cli_overrides = parser.parse_known_args() + return args, cli_overrides + + +def main() -> None: + """Run finetuning with YAML configuration and CLI overrides.""" + args, cli_overrides = parse_args() + + # Load base configuration from recipe + peft_method = None if args.peft == "none" else args.peft + config: ConfigContainer = llama32_1b_finetune_config(peft=peft_method) + + # Convert to OmegaConf for merging + omega_conf, excluded_fields = create_omegaconf_dict_config(config) + + # Apply YAML overrides if provided + if args.config_file: + config_file_path = Path(args.config_file) + if not config_file_path.exists(): + logger.error(f"Config file not found: {config_file_path}") + sys.exit(1) + + yaml_conf = OmegaConf.load(config_file_path) + omega_conf = OmegaConf.merge(omega_conf, yaml_conf) + + # Apply command-line overrides + if cli_overrides: + omega_conf = parse_hydra_overrides(omega_conf, cli_overrides) + + # Convert back to ConfigContainer + final_config_dict = OmegaConf.to_container(omega_conf, resolve=True) + apply_overrides(config, final_config_dict, excluded_fields) + + # Start finetuning + finetune(config=config, forward_step_func=forward_step) + + +if __name__ == "__main__": + main() diff --git a/tutorials/recipes/llama/04_launch_slurm_with_nemo_run.py b/tutorials/recipes/llama/04_launch_slurm_with_nemo_run.py new file mode 100644 index 000000000..fef063e7a --- /dev/null +++ b/tutorials/recipes/llama/04_launch_slurm_with_nemo_run.py @@ -0,0 +1,298 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Launch Training on Slurm with NeMo-Run + +This script demonstrates how to launch training scripts (pretrain or finetune) +on a Slurm cluster using NeMo-Run. This enables easy multi-node training with +proper job management. + +Prerequisites: Install nemo-run + +Usage: + # From the Slurm cluster (uses LocalTunnel) + python 04_launch_slurm_with_nemo_run.py \ + --script 00_quickstart_pretrain.py \ + --nodes 2 \ + --partition gpu \ + --account my_account + + # From your local machine (uses SSHTunnel) + python 04_launch_slurm_with_nemo_run.py \ + --script 00_quickstart_pretrain.py \ + --nodes 2 \ + --partition gpu \ + --account my_account \ + --ssh-tunnel \ + --host my-cluster.example.com \ + --user myusername \ + --remote-job-dir /home/myusername/nemo-runs + + # With custom SSH key + python 04_launch_slurm_with_nemo_run.py \ + --script 00_quickstart_pretrain.py \ + --nodes 2 \ + --partition gpu \ + --account my_account \ + --ssh-tunnel \ + --host my-cluster.example.com \ + --user myusername \ + --remote-job-dir /home/myusername/nemo-runs \ + --identity ~/.ssh/id_rsa + + # Launch with custom config (pass arguments to training script) + python 04_launch_slurm_with_nemo_run.py \ + --script 03_finetune_with_yaml.py \ + --nodes 1 \ + --partition gpu \ + --account my_account \ + --config-file conf/llama32_1b_finetune.yaml + + # Pass CLI overrides to training script + python 04_launch_slurm_with_nemo_run.py \ + --script 02_pretrain_with_yaml.py \ + --nodes 2 \ + --partition gpu \ + --account my_account \ + train.train_iters=5000 \ + optimizer.lr=0.0002 + + # With container and custom mounts + python 04_launch_slurm_with_nemo_run.py \ + --script 00_quickstart_pretrain.py \ + --nodes 2 \ + --partition gpu \ + --account my_account \ + --container-image /path/to/container.sqsh \ + --mount /data:/data + + # Wait for job completion and tail logs + python 04_launch_slurm_with_nemo_run.py \ + --script 00_quickstart_pretrain.py \ + --nodes 2 \ + --partition gpu \ + --account my_account \ + --no-detach \ + --tail-logs + +Note: +- Use --ssh-tunnel when launching from your local machine +- Omit --ssh-tunnel when already on the Slurm cluster (uses LocalTunnel) +- By default, jobs are submitted and detached (--detach) +- Use --no-detach --tail-logs to wait and monitor job output +- Any unknown arguments are forwarded to the training script +- Adjust cluster-specific settings (account, partition, container paths) +""" + +import argparse +import logging +from pathlib import Path + +import nemo_run as run + + +logger = logging.getLogger(__name__) + +SCRIPT_DIR = Path(__file__).parent.resolve() + + +def parse_args() -> tuple[argparse.Namespace, list[str]]: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Launch training (pretrain/finetune) on Slurm using NeMo-Run", + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.add_argument( + "--script", + type=str, + required=True, + help="Training script to run (e.g., 00_quickstart_pretrain.py, 01_quickstart_finetune.py)", + ) + parser.add_argument( + "--nodes", + type=int, + default=1, + help="Number of nodes to use", + ) + parser.add_argument( + "--devices", + type=int, + default=8, + help="GPUs per node", + ) + parser.add_argument( + "--partition", + type=str, + required=True, + help="Slurm partition name", + ) + parser.add_argument( + "--account", + type=str, + required=True, + help="Slurm account name", + ) + 
parser.add_argument(
+        "--time",
+        type=str,
+        default="04:00:00",
+        help="Job time limit",
+    )
+    parser.add_argument(
+        "--ssh-tunnel",
+        action="store_true",
+        help="Use SSH tunnel (for launching from local machine). Requires --host, --user, --remote-job-dir",
+    )
+    parser.add_argument(
+        "--host",
+        type=str,
+        help="SSH host for tunnel (required if --ssh-tunnel is set)",
+    )
+    parser.add_argument(
+        "--user",
+        type=str,
+        help="SSH user for tunnel (required if --ssh-tunnel is set)",
+    )
+    parser.add_argument(
+        "--remote-job-dir",
+        type=str,
+        help="Remote directory to store job files (required if --ssh-tunnel is set)",
+    )
+    parser.add_argument(
+        "--identity",
+        type=str,
+        default=None,
+        help="Path to SSH private key for authentication",
+    )
+    parser.add_argument(
+        "--container-image",
+        type=str,
+        default=None,
+        help="Container image path",
+    )
+    parser.add_argument(
+        "--mount",
+        type=str,
+        action="append",
+        default=[],
+        help="Container mounts in format host:container (can be specified multiple times)",
+    )
+    parser.add_argument(
+        "--experiment-name",
+        type=str,
+        default="megatron_bridge_training",
+        help="Name for the experiment",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Print what would be executed without submitting the job",
+    )
+    parser.add_argument(
+        "--detach",
+        action=argparse.BooleanOptionalAction,
+        default=True,
+        help="Detach from the experiment after submission (pass --no-detach to wait)",
+    )
+    parser.add_argument(
+        "--tail-logs",
+        action="store_true",
+        help="Tail logs after submission (only works with --no-detach)",
+    )
+
+    # Use parse_known_args to capture forwarded arguments for the training script
+    args, forwarded_args = parser.parse_known_args()
+    return args, forwarded_args
+
+
+def main() -> None:
+    """Launch training (pretrain/finetune) using NeMo-Run SlurmExecutor."""
+    args, forwarded_args = parse_args()
+
+    # Validate SSH tunnel arguments
+    if args.ssh_tunnel:
+        if not all([args.host, args.user, args.remote_job_dir]):
+            raise ValueError("--ssh-tunnel requires --host, --user, and --remote-job-dir to be specified")
+
+    # Resolve script path
+    script_path = SCRIPT_DIR / args.script
+    if not script_path.exists():
+        raise FileNotFoundError(f"Training script not found: {script_path}")
+
+    # Build arguments for the training script from forwarded args
+    script_args = forwarded_args if forwarded_args else []
+
+    # Create the training task
+    task = run.Script(
+        path=str(script_path),
+        entrypoint="python",
+        args=script_args,
+    )
+
+    # Configure tunnel (SSH for remote, Local if already on cluster)
+    if args.ssh_tunnel:
+        tunnel = run.SSHTunnel(
+            host=args.host,
+            user=args.user,
+            job_dir=args.remote_job_dir,
+            identity=args.identity,
+        )
+        logger.info(f"Using SSH tunnel to {args.user}@{args.host}")
+    else:
+        tunnel = run.LocalTunnel()
+        logger.info("Using LocalTunnel (running on cluster)")
+
+    # Create the Slurm executor
+    executor = run.SlurmExecutor(
+        account=args.account,
+        partition=args.partition,
+        nodes=args.nodes,
+        ntasks_per_node=args.devices,
+        gpus_per_node=args.devices,
+        mem="0",
+        exclusive=True,
+        time=args.time,
+        tunnel=tunnel,
+    )
+
+    # Configure container if specified
+    if args.container_image:
+        executor.container_image = args.container_image
+
+    # Configure mounts if specified
+    if args.mount:
+        executor.container_mounts = args.mount
+
+    # Run the experiment
+    with run.Experiment(args.experiment_name) as exp:
+        exp.add(task, executor=executor, name="training")
+
+        if args.dry_run:
+            exp.dryrun()
+        else:
+            exp.run(detach=args.detach, tail_logs=args.tail_logs)
+
+    if args.dry_run:
+        logger.info("Dry run complete; no job was submitted.")
+    elif args.detach:
+        logger.info("Job submitted to Slurm!")
+        logger.info("Use 'squeue' to check job status")
+    else:
+        logger.info("Job completed!")
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO, format="%(message)s")
+    main()
diff --git a/tutorials/recipes/llama/README.md b/tutorials/recipes/llama/README.md
new file mode 100644
index 000000000..b5306707d
--- /dev/null
+++ b/tutorials/recipes/llama/README.md
@@ -0,0 +1,219 @@
+# Recipes with Megatron Bridge
+
+This guide shows you how to pretrain and finetune Llama models using Megatron Bridge.
+
+## Quickstart
+
+The fastest way to get started with Megatron Bridge pretraining:
+
+```bash
+torchrun --nproc_per_node=1 00_quickstart_pretrain.py
+```
+
+This runs Llama 3.2 1B pretraining on a single GPU with mock data.
+
+For finetuning, you first need a checkpoint in Megatron format. Convert from HuggingFace using the `AutoBridge`-based conversion script:
+
+> **Note:** You must be authenticated with Hugging Face to download the model. Run `hf auth login --token $HF_TOKEN` if needed.
+
+```bash
+python ../../conversion/convert_checkpoints.py import \
+    --hf-model meta-llama/Llama-3.2-1B \
+    --megatron-path ./checkpoints/llama32_1b
+```
+
+Then run finetuning:
+
+```bash
+torchrun --nproc_per_node=1 01_quickstart_finetune.py \
+    --pretrained-checkpoint ./checkpoints/llama32_1b
+```
+
+The [01_quickstart_finetune.py](01_quickstart_finetune.py) recipe finetunes Llama 3.2 1B using LoRA on the SQuAD dataset by default.
+
+To plug in your own JSONL dataset, swap the dataset config in that script:
+
+```python
+from megatron.bridge.training.config import FinetuningDatasetConfig
+
+config.dataset = FinetuningDatasetConfig(
+    dataset_root="/path/to/dataset_dir",  # contains training/validation/test jsonl files
+    seq_length=config.model.seq_length,
+)
+```
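+
+The exact JSONL schema is defined by the finetuning dataset class; a minimal `training.jsonl` sketch, assuming the common `input`/`output` field convention used by NeMo-style SFT datasets (treat the field names as an assumption and verify them against your Megatron Bridge version):
+
+```json
+{"input": "Question: What is the capital of France?", "output": "Paris"}
+{"input": "Question: Who wrote Hamlet?", "output": "William Shakespeare"}
+```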
+
+## Configuration
+
+Megatron Bridge recipes are standard Python scripts, giving you full flexibility in how you configure your training. You can:
+1. Modify the Python scripts directly
+2. Use the framework's YAML-based configuration system
+3. Implement your own configuration management (ArgParse, Hydra, etc.)
+
+### Using Framework YAML Configs
+
+The recipes include optional support for YAML configuration and dot-notation overrides via `ConfigContainer`. This is just one way to manage config; you are free to use other methods.
+
+To use the provided YAML system:
+
+```bash
+torchrun --nproc_per_node=2 02_pretrain_with_yaml.py \
+    --config-file conf/llama32_1b_pretrain.yaml
+```
+
+Understanding the YAML Structure:
+
+YAML files mirror the `ConfigContainer` structure. Each top-level key corresponds to a configuration section (e.g., `dataset`, `train`, `model`, `optimizer`).
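+
+Because each section maps onto a field of the `ConfigContainer` dataclass, the same overrides can also be written in plain Python. A minimal sketch mirroring `00_quickstart_pretrain.py` (field names are taken from the YAML examples in this guide; double-check them against your `ConfigContainer` version):
+
+```python
+from megatron.bridge.recipes.llama import llama32_1b_pretrain_config
+
+config = llama32_1b_pretrain_config()   # base recipe defaults
+config.train.train_iters = 100          # "train:" section
+config.train.global_batch_size = 256
+config.model.seq_length = 4096          # "model:" section
+config.dataset.seq_length = 4096        # must match model.seq_length
+config.optimizer.lr = 0.0003            # "optimizer:" section
+config.checkpoint.save = "./checkpoints/llama32_1b"
+```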
+
+Example YAML (`conf/llama32_1b_pretrain.yaml`):
+
+```yaml
+# Each section maps to a ConfigContainer field
+dataset:  # GPTDatasetConfig
+  data_path: /path/to/training/data
+  seq_length: 4096
+
+train:  # TrainingConfig
+  train_iters: 100
+  global_batch_size: 256
+
+checkpoint:  # CheckpointConfig
+  save: ./checkpoints/llama32_1b
+  save_interval: 50
+
+model:  # Model Provider
+  seq_length: 4096  # Must match dataset.seq_length
+  tensor_model_parallel_size: 1
+
+optimizer:  # OptimizerConfig
+  lr: 0.0003
+```
+
+Command-Line Overrides:
+
+You can override values using dot notation (`section.field=value`):
+
+```bash
+torchrun --nproc_per_node=2 02_pretrain_with_yaml.py \
+    --config-file conf/llama32_1b_pretrain.yaml \
+    train.train_iters=5000 \
+    train.global_batch_size=512 \
+    optimizer.lr=0.0002
+```
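+
+The override parser in these scripts is Hydra-style (`parse_hydra_overrides`). The original example script additionally documented addition and deletion syntax; a sketch, assuming those semantics are unchanged in your version:
+
+```bash
+# key=value        standard assignment (typed values such as torch.float16 are converted)
+# +new_key=value   add a key not present in the base config
+# ~key_to_remove   delete a key from the config
+torchrun --nproc_per_node=2 02_pretrain_with_yaml.py \
+    model.pipeline_dtype=torch.float16 \
+    ~checkpoint.save
+```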
+
+Priority order (highest to lowest):
+1. Command-line overrides
+2. YAML config file
+3. Base recipe defaults
+
+### Finetuning Configuration
+
+For more complex finetuning configurations:
+
+```bash
+torchrun --nproc_per_node=2 03_finetune_with_yaml.py \
+    --config-file conf/llama32_1b_finetune.yaml
+```
+
+Example YAML (`conf/llama32_1b_finetune.yaml`):
+
+```yaml
+# Each section maps to a ConfigContainer field
+dataset:  # FinetuningDatasetConfig
+  dataset_root: /path/to/finetuning_dataset_dir
+  seq_length: 4096
+
+train:  # TrainingConfig
+  train_iters: 100
+  global_batch_size: 128
+
+checkpoint:  # CheckpointConfig
+  pretrained_checkpoint: /path/to/pretrained/checkpoint
+  save: ./checkpoints/llama32_1b_finetuned
+  save_interval: 50
+
+peft:  # PEFT (LoRA config)
+  dim: 8     # LoRA rank
+  alpha: 16  # LoRA alpha
+
+model:  # Model Provider
+  seq_length: 4096  # Must match dataset.seq_length
+
+optimizer:  # OptimizerConfig
+  lr: 0.0001
+```
+
+Full Finetuning (No LoRA):
+
+```bash
+torchrun --nproc_per_node=2 03_finetune_with_yaml.py \
+    --peft none \
+    train.train_iters=1000
+```
+
+## Multi-Node Training
+
+### Direct Slurm with sbatch
+
+For traditional HPC workflows without NeMo-Run:
+
+```bash
+# 1. Configure launch_with_sbatch.sh
+#    Edit SBATCH directives and script variables at the top
+
+# 2. Submit job
+sbatch launch_with_sbatch.sh
+```
+
+The `launch_with_sbatch.sh` script shows how to:
+- Configure Slurm job parameters
+- Set up multi-node torchrun
+- Use containers (optional)
+- Pass arguments to training scripts
+
+### NeMo-Run
+
+For job management and remote launching capabilities:
+
+Prerequisites:
+
+```bash
+pip install nemo-run
+```
+
+From the Slurm cluster login node:
+
+```bash
+python 04_launch_slurm_with_nemo_run.py \
+    --script 00_quickstart_pretrain.py \
+    --nodes 2 \
+    --devices 8 \
+    --partition gpu \
+    --account my_account
+```
+
+From your local machine (SSHTunnel):
+
+```bash
+python 04_launch_slurm_with_nemo_run.py \
+    --script 00_quickstart_pretrain.py \
+    --nodes 2 \
+    --devices 8 \
+    --partition gpu \
+    --account my_account \
+    --ssh-tunnel \
+    --host my-cluster.example.com \
+    --user myusername \
+    --remote-job-dir /home/myusername/nemo-runs
+```
+
+With custom config:
+
+```bash
+python 04_launch_slurm_with_nemo_run.py \
+    --script 03_finetune_with_yaml.py \
+    --nodes 1 \
+    --devices 8 \
+    --partition gpu \
+    --account my_account \
+    --config-file conf/llama32_1b_finetune.yaml
+```
diff --git a/tutorials/recipes/llama/conf/llama32_1b_finetune.yaml b/tutorials/recipes/llama/conf/llama32_1b_finetune.yaml
new file mode 100644
index 000000000..02c3897a6
--- /dev/null
+++ b/tutorials/recipes/llama/conf/llama32_1b_finetune.yaml
@@ -0,0 +1,78 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Example YAML configuration for Llama 3.2 1B finetuning with LoRA
+# This file demonstrates commonly customized settings.
+# Modify values as needed for your use case.
+
+# Data configuration
+dataset:
+  # Point dataset_root at a directory containing training/validation/test JSONL files
+  # dataset_root: /path/to/your/dataset_dir
+  seq_length: 4096
+
+# Training configuration
+train:
+  train_iters: 100
+  global_batch_size: 128
+  micro_batch_size: 2
+  eval_iters: 10
+  eval_interval: 50
+
+# Optimizer configuration
+optimizer:
+  lr: 0.0001
+  min_lr: 0.00001
+  weight_decay: 0.0
+
+# Learning rate scheduler
+scheduler:
+  lr_warmup_iters: 10
+  lr_decay_style: cosine
+
+# Checkpoint configuration
+checkpoint:
+  # Directory to save finetuned checkpoints
+  save: ./checkpoints/llama32_1b_finetuned
+  # Directory to resume from during training
+  load: ./checkpoints/llama32_1b_finetuned
+  # Directory for pretrained weights in Megatron format
+  pretrained_checkpoint: ./path/to/pretrained/checkpoint/
+  save_interval: 50
+
+# LoRA configuration
+peft:
+  dim: 8     # LoRA rank
+  alpha: 16  # LoRA alpha scaling
+
+# Model configuration
+# Note: seq_length must match dataset.seq_length
+model:
+  seq_length: 4096
+  tensor_model_parallel_size: 1
+  pipeline_model_parallel_size: 1
+  context_parallel_size: 1
+
+# Logging
+logger:
+  log_interval: 10
+  tensorboard_dir: ./logs/llama32_1b_finetuned
+  # wandb_project: my_finetune_project
+  # wandb_entity: my_team
+
+# Random seed
+rng:
+  seed: 1234
diff --git a/tutorials/recipes/llama/conf/llama32_1b_pretrain.yaml b/tutorials/recipes/llama/conf/llama32_1b_pretrain.yaml
new file mode 100644
index 000000000..9b9e8bf1d
--- /dev/null
+++ b/tutorials/recipes/llama/conf/llama32_1b_pretrain.yaml
@@ -0,0 +1,68 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Example YAML configuration for Llama 3.2 1B pretraining
+# This file demonstrates commonly customized settings.
+# Modify values as needed for your use case.
+
+# Data configuration
+dataset:
+  # Replace with your dataset path
+  # data_path: /path/to/your/dataset
+  seq_length: 4096
+
+# Training configuration
+train:
+  train_iters: 100
+  global_batch_size: 256
+  micro_batch_size: 2
+  eval_iters: 10
+  eval_interval: 50
+
+# Optimizer configuration
+optimizer:
+  lr: 0.0003
+  min_lr: 0.00003
+  weight_decay: 0.1
+
+# Learning rate scheduler
+scheduler:
+  lr_warmup_iters: 20
+  lr_decay_style: cosine
+
+# Checkpoint configuration
+checkpoint:
+  # Directory to save checkpoints
+  save: ./checkpoints/llama32_1b
+  load: ./checkpoints/llama32_1b
+  save_interval: 50
+
+# Model configuration
+# Note: seq_length must match dataset.seq_length
+model:
+  seq_length: 4096
+  tensor_model_parallel_size: 1
+  pipeline_model_parallel_size: 1
+  context_parallel_size: 1
+
+# Logging
+logger:
+  log_interval: 10
+  tensorboard_dir: ./logs/llama32_1b
+  # wandb_project: my_project  # Uncomment to enable W&B logging
+  # wandb_entity: my_team
+
+# Random seed
+rng:
+  seed: 1234
diff --git a/tutorials/recipes/llama/launch_with_sbatch.sh b/tutorials/recipes/llama/launch_with_sbatch.sh
new file mode 100644
index 000000000..e75690d27
--- /dev/null
+++ b/tutorials/recipes/llama/launch_with_sbatch.sh
@@ -0,0 +1,147 @@
+#!/bin/bash
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#SBATCH --job-name=megatron-bridge-train
+#SBATCH --nodes=2
+# One launcher task per node; torchrun spawns the per-GPU worker processes
+#SBATCH --ntasks-per-node=1
+#SBATCH --gpus-per-node=8
+#SBATCH --time=04:00:00
+#SBATCH --partition=gpu
+#SBATCH --account=my_account
+#SBATCH --output=logs/train_%j.out
+#SBATCH --error=logs/train_%j.err
+#SBATCH --exclusive
+
+# ==============================================================================
+# Direct Slurm Launch with sbatch
+#
+# This script demonstrates how to launch training directly using sbatch without
+# NeMo-Run. This is useful if you prefer traditional HPC workflows or don't want
+# to install additional dependencies.
+#
+# Usage:
+#   1. Modify the #SBATCH directives above for your cluster
+#   2. Set the TRAINING_SCRIPT and other variables below
+#   3.
Submit: sbatch launch_with_sbatch.sh +# +# For NeMo-Run based launching (recommended), see 04_launch_slurm_with_nemo_run.py +# ============================================================================== + +# ============================================================================== +# CONFIGURATION - Modify these for your setup +# ============================================================================== + +# Training script to run (choose one) +TRAINING_SCRIPT="00_quickstart_pretrain.py" +# TRAINING_SCRIPT="01_quickstart_finetune.py" +# TRAINING_SCRIPT="02_pretrain_with_yaml.py" +# TRAINING_SCRIPT="03_finetune_with_yaml.py" + +# Optional: YAML config file (for *_with_yaml.py scripts) +CONFIG_FILE="" +# CONFIG_FILE="conf/llama32_1b_pretrain.yaml" +# CONFIG_FILE="conf/llama32_1b_finetune.yaml" + +# Optional: Additional CLI overrides (for *_with_yaml.py scripts) +CLI_OVERRIDES="" +# CLI_OVERRIDES="train.train_iters=1000 train.global_batch_size=512" + +# Optional: For finetuning scripts, specify checkpoint path +PRETRAINED_CHECKPOINT="" +# PRETRAINED_CHECKPOINT="./checkpoints/llama32_1b" + +# Container image (optional, only if using containers) +CONTAINER_IMAGE="" +# CONTAINER_IMAGE="/path/to/container.sqsh" + +# Container mounts (optional, space-separated) +CONTAINER_MOUNTS="" +# CONTAINER_MOUNTS="/data:/data /model:/model" + +# ============================================================================== +# Environment Setup +# ============================================================================== + +# Set common environment variables +# Optional: Set these if needed +# export CUDA_DEVICE_MAX_CONNECTIONS=1 +# export NCCL_DEBUG=INFO + +# ============================================================================== +# Job Execution +# ============================================================================== + +echo "======================================" +echo "Megatron Bridge Training Job" +echo "======================================" +echo "Job ID: $SLURM_JOB_ID" +echo "Nodes: $SLURM_JOB_NUM_NODES" +echo "GPUs per node: $SLURM_GPUS_PER_NODE" +echo "Script: $TRAINING_SCRIPT" +echo "======================================" + +# Build the command +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SCRIPT_PATH="${SCRIPT_DIR}/${TRAINING_SCRIPT}" + +# Build torchrun command +CMD="torchrun" +CMD="$CMD --nproc_per_node=$SLURM_GPUS_PER_NODE" +CMD="$CMD --nnodes=$SLURM_JOB_NUM_NODES" +CMD="$CMD --node_rank=\$SLURM_PROCID" +CMD="$CMD --master_addr=\$(scontrol show hostname \$SLURM_NODELIST | head -n1)" +CMD="$CMD --master_port=29500" +CMD="$CMD $SCRIPT_PATH" + +# Add config file if specified +if [ -n "$CONFIG_FILE" ]; then + CMD="$CMD --config-file $CONFIG_FILE" +fi + +# Add pretrained checkpoint if specified (for finetuning) +if [ -n "$PRETRAINED_CHECKPOINT" ]; then + CMD="$CMD --pretrained-checkpoint $PRETRAINED_CHECKPOINT" +fi + +# Add CLI overrides if specified +if [ -n "$CLI_OVERRIDES" ]; then + CMD="$CMD $CLI_OVERRIDES" +fi + +echo "Executing: $CMD" +echo "======================================" + +# Execute with or without container +if [ -n "$CONTAINER_IMAGE" ]; then + # With container + SRUN_CMD="srun --container-image=$CONTAINER_IMAGE" + + # Add container mounts + if [ -n "$CONTAINER_MOUNTS" ]; then + for mount in $CONTAINER_MOUNTS; do + SRUN_CMD="$SRUN_CMD --container-mounts=$mount" + done + fi + + $SRUN_CMD bash -c "$CMD" +else + # Without container + srun bash -c "$CMD" +fi + +echo "======================================" +echo "Job completed" +echo 
"======================================" +