diff --git a/.github/workflows/integration_test_8gpu_auto_parallel.yaml b/.github/workflows/integration_test_8gpu_auto_parallel.yaml
new file mode 100644
index 0000000000..85618aeeef
--- /dev/null
+++ b/.github/workflows/integration_test_8gpu_auto_parallel.yaml
@@ -0,0 +1,56 @@
+name: Auto Parallel 8 GPU Integration Tests
+
+on:
+  push:
+    branches: [ main ]
+    paths:
+      - 'torchtitan/experiments/auto_parallel/**'
+      - '.github/workflows/integration_test_8gpu_auto_parallel.yaml'
+  pull_request:
+    paths:
+      - 'torchtitan/experiments/auto_parallel/**'
+      - '.github/workflows/integration_test_8gpu_auto_parallel.yaml'
+  schedule:
+    # Runs every 12 hours
+    - cron: '0 */12 * * *'
+
+concurrency:
+  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+defaults:
+  run:
+    shell: bash -l -eo pipefail {0}
+
+jobs:
+  build-test:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux.g5.48xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.6"
+      # This image is faster to clone than the default, but it lacks CC needed by triton
+      # (1m25s vs 2m37s).
+      docker-image: torchtitan-ubuntu-20.04-clang12
+      repository: pytorch/torchtitan
+      upload-artifact: outputs
+      script: |
+        set -eux
+
+        # The generic Linux job chooses to use base env, not the one set up by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        # Log CUDA driver version for debugging.
+        DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
+        echo "CUDA driver version: ${DRIVER_VERSION}"
+
+        pip config --user set global.progress_bar off
+
+        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
+
+        # Install autoparallel - required dependency for the auto_parallel experiment
+        python -m pip install git+https://github.com/meta-pytorch/autoparallel.git
+
+        mkdir artifacts-to-be-uploaded
+        python -m torchtitan.experiments.auto_parallel.tests.integration_tests artifacts-to-be-uploaded --ngpu 4
diff --git a/torchtitan/experiments/README.md b/torchtitan/experiments/README.md
index aa93628656..5c1b20898d 100644
--- a/torchtitan/experiments/README.md
+++ b/torchtitan/experiments/README.md
@@ -32,4 +32,4 @@ We provide this `experiments/` folder to host experiments that add significant v
 | [gpt_oss](./gpt_oss/) | TBA | [@jianiw](https://github.com/jianiw) |
 | [compiler_toolkit](./compiler_toolkit/) | [![Compiler Toolkit 8 GPU Integration Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_compiler_toolkit.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_compiler_toolkit.yaml?query=branch%3Amain) | [@SherlockNoMad](https://github.com/SherlockNoMad) [@yiming0416](https://github.com/yiming0416) |
 | [transformers_modeling_backend](./transformers_modeling_backend/) | [![Transformers modeling backend 8 GPU Integration Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_transformers_modeling_backend.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_transformers_modeling_backend.yaml?query=branch%3Amain) | [@3outeille](https://github.com/3outeille) |
-| [auto_parallel](./auto_parallel/) | TBA | [@wconstab](https://github.com/wconstab) | [@xmfan](https://github.com/xmfan) |
+| [auto_parallel](./auto_parallel/) | [![Auto Parallel 8 GPU Integration Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_auto_parallel.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_auto_parallel.yaml?query=branch%3Amain) | [@wconstab](https://github.com/wconstab) [@xmfan](https://github.com/xmfan) |
diff --git a/torchtitan/experiments/auto_parallel/llama3/parallelize_llama.py b/torchtitan/experiments/auto_parallel/llama3/parallelize_llama.py
index 1d2bee4351..d7fbae2622 100644
--- a/torchtitan/experiments/auto_parallel/llama3/parallelize_llama.py
+++ b/torchtitan/experiments/auto_parallel/llama3/parallelize_llama.py
@@ -126,7 +126,7 @@ def input_fn():
         autop.add_input_constraints([x_sharding])
         autop.add_output_constraints([out_sharding])
         t0 = time.time()
-        sharding_placement = autop.optimize_placement()
+        sharding_placement = autop.optimize_placement(verbose=False)
         t1 = time.time()
         logger.info(f"AutoParallel took {t1 - t0} seconds")
         parallel_mod = autop.apply_placement(sharding_placement)
diff --git a/torchtitan/experiments/auto_parallel/tests/__init__.py b/torchtitan/experiments/auto_parallel/tests/__init__.py
new file mode 100644
index 0000000000..2e41cd717f
--- /dev/null
+++ b/torchtitan/experiments/auto_parallel/tests/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/torchtitan/experiments/auto_parallel/tests/integration_tests.py b/torchtitan/experiments/auto_parallel/tests/integration_tests.py
new file mode 100644
index 0000000000..334aed86dd
--- /dev/null
+++ b/torchtitan/experiments/auto_parallel/tests/integration_tests.py
@@ -0,0 +1,85 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import os
+
+from tests.integration_tests import OverrideDefinitions
+from tests.integration_tests.run_tests import run_tests
+
+
+def build_auto_parallel_test_list() -> list[OverrideDefinitions]:
+    """
+    Return a list of OverrideDefinitions used to generate variations of
+    integration tests based on the same root config file.
+ """ + integration_tests_flavors = [ + # llama3 tests + OverrideDefinitions( + [ + [ + "--model.name auto_parallel.llama3", + "--parallelism.data_parallel_shard_degree 2", + "--parallelism.tensor_parallel_degree 2", + "--job.custom_config_module=torchtitan.experiments.auto_parallel.job_config", + ], + ], + "llama3 AutoParallel FSDP+TP", + "llama3_autoparallel_fsdp_tp", + ngpu=4, + ), + # TODO: Re-enable this once we fix the test + # deepseek_v3 tests + # OverrideDefinitions( + # [ + # [ + # "--model.name auto_parallel.deepseek_v3", + # "--parallelism.data_parallel_shard_degree 2", + # "--parallelism.expert_parallel_degree 2", + # "--job.custom_config_module=torchtitan.experiments.auto_parallel.job_config", + # "--activation_checkpoint.mode none", + # ], + # ], + # "deepseek_v3 AutoParallel FSDP+TP+EP", + # "deepseekv3_autoparallel_fsdp_tp_ep", + # ngpu=4, + # ), + ] + return integration_tests_flavors + + +_TEST_SUITES_FUNCTION = { + "auto_parallel": build_auto_parallel_test_list, +} + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("output_dir") + parser.add_argument( + "--config_path", + default="./tests/integration_tests/base_config.toml", + help="Base config path for integration tests. This is the config that will be used as a base for all tests.", + ) + parser.add_argument( + "--test_name", + default="all", + help="test to run, acceptable values: `test_name` in `build_test_list` (default: all)", + ) + parser.add_argument("--ngpu", default=8, type=int) + args = parser.parse_args() + + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + if os.listdir(args.output_dir): + raise RuntimeError("Please provide an empty output directory.") + + test_list = _TEST_SUITES_FUNCTION["auto_parallel"]() + run_tests(args, test_list) + + +if __name__ == "__main__": + main()