Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions .github/workflows/integration_test_8gpu_auto_parallel.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# NOTE(review): named "8 GPU" to match the other integration-test workflows,
# even though the test script below currently runs with --ngpu 4 — the runner
# (linux.g5.48xlarge) provides up to 8 GPUs.
name: Auto Parallel 8 GPU Integration Tests

on:
  push:
    branches: [ main ]
    paths:
      - 'torchtitan/experiments/auto_parallel/**'
      - '.github/workflows/integration_test_8gpu_auto_parallel.yaml'
  pull_request:
    paths:
      - 'torchtitan/experiments/auto_parallel/**'
      - '.github/workflows/integration_test_8gpu_auto_parallel.yaml'
  schedule:
    # Runs every 12 hours
    - cron: '0 */12 * * *'

concurrency:
  # Cancel in-flight runs for the same ref; main-branch pushes are keyed by
  # run_number so they never cancel each other.
  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

defaults:
  run:
    shell: bash -l -eo pipefail {0}

jobs:
  build-test:
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    with:
      runner: linux.g5.48xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: "12.6"
      # This image is faster to clone than the default, but it lacks CC needed by triton
      # (1m25s vs 2m37s).
      docker-image: torchtitan-ubuntu-20.04-clang12
      repository: pytorch/torchtitan
      upload-artifact: outputs
      script: |
        set -eux

        # The generic Linux job chooses to use base env, not the one setup by the image
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
        conda activate "${CONDA_ENV}"

        # Log CUDA driver version for debugging.
        DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
        echo "CUDA driver version: ${DRIVER_VERSION}"

        pip config --user set global.progress_bar off

        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126

        # Install autoparallel - required dependency for auto_parallel experiment
        python -m pip install git+https://github.com/meta-pytorch/autoparallel.git

        mkdir artifacts-to-be-uploaded
        python -m torchtitan.experiments.auto_parallel.tests.integration_tests artifacts-to-be-uploaded --ngpu 4
2 changes: 1 addition & 1 deletion torchtitan/experiments/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,4 @@ We provide this `experiments/` folder to host experiments that add significant v
| [gpt_oss](./gpt_oss/) | TBA | [@jianiw](https://github.com/jianiw) |
| [compiler_toolkit](./compiler_toolkit/) | [![Compiler Toolkit 8 GPU Integration Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_compiler_toolkit.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_compiler_toolkit.yaml?query=branch%3Amain) | [@SherlockNoMad](https://github.com/SherlockNoMad) [@yiming0416](https://github.com/yiming0416) |
| [transformers_modeling_backend](./transformers_modeling_backend/) | [![Transformers modeling backend 8 GPU Integration Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_transformers_modeling_backend.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_transformers_modeling_backend.yaml?query=branch%3Amain) | [@3outeille](https://github.com/3outeille) |
| [auto_parallel](./auto_parallel/) | TBA | [@wconstab](https://github.com/wconstab) | [@xmfan](https://github.com/xmfan) |
| [auto_parallel](./auto_parallel/) | [![Auto Parallel 8 GPU Integration Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_auto_parallel.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_auto_parallel.yaml?query=branch%3Amain) | [@wconstab](https://github.com/wconstab) [@xmfan](https://github.com/xmfan) |
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def input_fn():
autop.add_input_constraints([x_sharding])
autop.add_output_constraints([out_sharding])
t0 = time.time()
sharding_placement = autop.optimize_placement()
sharding_placement = autop.optimize_placement(verbose=False)
t1 = time.time()
logger.info(f"AutoParallel took {t1 - t0} seconds")
parallel_mod = autop.apply_placement(sharding_placement)
Expand Down
5 changes: 5 additions & 0 deletions torchtitan/experiments/auto_parallel/tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
85 changes: 85 additions & 0 deletions torchtitan/experiments/auto_parallel/tests/integration_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import os

from tests.integration_tests import OverrideDefinitions
from tests.integration_tests.run_tests import run_tests


def build_auto_parallel_test_list() -> list[OverrideDefinitions]:
    """Build the auto_parallel integration test suite.

    Returns:
        A list of ``OverrideDefinitions``, each describing one variation of
        the integration tests generated from the same root config file.
        Every entry carries the CLI override args, a human-readable test
        description, an output-folder name, and the GPU count it needs.
    """
    integration_tests_flavors = [
        # llama3 tests
        OverrideDefinitions(
            [
                [
                    "--model.name auto_parallel.llama3",
                    "--parallelism.data_parallel_shard_degree 2",
                    "--parallelism.tensor_parallel_degree 2",
                    "--job.custom_config_module=torchtitan.experiments.auto_parallel.job_config",
                ],
            ],
            "llama3 AutoParallel FSDP+TP",
            "llama3_autoparallel_fsdp_tp",
            ngpu=4,
        ),
        # TODO: Re-enable this once we fix the test
        # deepseek_v3 tests
        # OverrideDefinitions(
        #     [
        #         [
        #             "--model.name auto_parallel.deepseek_v3",
        #             "--parallelism.data_parallel_shard_degree 2",
        #             "--parallelism.expert_parallel_degree 2",
        #             "--job.custom_config_module=torchtitan.experiments.auto_parallel.job_config",
        #             "--activation_checkpoint.mode none",
        #         ],
        #     ],
        #     "deepseek_v3 AutoParallel FSDP+TP+EP",
        #     "deepseekv3_autoparallel_fsdp_tp_ep",
        #     ngpu=4,
        # ),
    ]
    return integration_tests_flavors


# Dispatch table mapping a suite name to the function that builds its test
# list; currently only the "auto_parallel" suite exists.
_TEST_SUITES_FUNCTION = {
    "auto_parallel": build_auto_parallel_test_list,
}


def main():
    """CLI entry point: parse args and run the auto_parallel test suite.

    Per-test artifacts are written under ``output_dir``, which must be empty
    (it is created on demand if it does not exist).

    Raises:
        RuntimeError: if ``output_dir`` already exists and is non-empty.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("output_dir")
    parser.add_argument(
        "--config_path",
        default="./tests/integration_tests/base_config.toml",
        help="Base config path for integration tests. This is the config that will be used as a base for all tests.",
    )
    parser.add_argument(
        "--test_name",
        default="all",
        help="test to run, acceptable values: `test_name` in `build_test_list` (default: all)",
    )
    parser.add_argument("--ngpu", default=8, type=int)
    args = parser.parse_args()

    # exist_ok avoids the check-then-create race of a separate
    # os.path.exists() test followed by os.makedirs().
    os.makedirs(args.output_dir, exist_ok=True)
    if os.listdir(args.output_dir):
        raise RuntimeError("Please provide an empty output directory.")

    # args (including --test_name / --ngpu / --config_path) is forwarded to
    # the shared test runner, which handles filtering and execution.
    test_list = _TEST_SUITES_FUNCTION["auto_parallel"]()
    run_tests(args, test_list)


if __name__ == "__main__":
    main()
Loading