CI patch: add ciflow/8gpu labeling and move the 8-GPU matrix selection into a
reusable workflow.

  * .github/labeler.yml maps the "ciflow/8gpu" label to a set of path globs.
  * .github/pytorch-probot.yml lists ciflow/8gpu under ciflow_push_tags and
    names labeler.yml as the labeler config.
  * integration_test_8gpu_features.yaml additionally triggers on
    ciflow/8gpu/* tags and replaces its inline set-matrix job with a call to
    set-matrix.yaml.
  * set-matrix.yaml writes the matrix JSON: ROCm only for a ciflow/8gpu tag or
    label event, CUDA + ROCm for a push to main or a schedule, CUDA otherwise.

NOTE(review): this patch was rebuilt from a whitespace-collapsed copy in which
the shell heredocs had been stripped. Indentation, heredoc bodies and the
output-export block are reconstructed -- confirm against the upstream commit
before applying.

diff --git a/.github/labeler.yml b/.github/labeler.yml
new file mode 100644
index 0000000000..ed5a23bf4e
--- /dev/null
+++ b/.github/labeler.yml
@@ -0,0 +1,6 @@
+"ciflow/8gpu":
+  - .ci/docker/**
+  - .github/workflows/**
+  - scripts/**
+  - tests/**
+  - torchtitan/**
diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml
new file mode 100644
index 0000000000..9eae404af4
--- /dev/null
+++ b/.github/pytorch-probot.yml
@@ -0,0 +1,3 @@
+ciflow_push_tags:
+  - ciflow/8gpu
+labeler_config: labeler.yml
diff --git a/.github/workflows/integration_test_8gpu_features.yaml b/.github/workflows/integration_test_8gpu_features.yaml
index a20cd22545..e8b2fe63ea 100644
--- a/.github/workflows/integration_test_8gpu_features.yaml
+++ b/.github/workflows/integration_test_8gpu_features.yaml
@@ -3,6 +3,8 @@ name: 8 GPU Feature Tests
 on:
   push:
     branches: [ main ]
+    tags:
+      - ciflow/8gpu/*
     paths-ignore:
       - 'torchtitan/experiments/**'
   pull_request:
@@ -27,33 +29,7 @@ permissions:
 jobs:
   # Step 1: Dynamically compute the matrix based on conditions
   set-matrix:
-    runs-on: ubuntu-latest
-    outputs:
-      matrix: ${{ steps.set.outputs.matrix }}
-    steps:
-      - id: set
-        run: |
-          # Decide which matrix entries to include based on event type
-          if [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/main" ]] || [[ "${{ github.event_name }}" == "schedule" ]]; then
-            # Include both CUDA and ROCm
-            echo '{"include":[
-              {"name":"cuda","runner":"linux.g5.48xlarge.nvidia.gpu","gpu-arch-type":"cuda","gpu-arch-version":"12.6","docker-image":"torchtitan-ubuntu-20.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/cu126"},
-              {"name":"rocm","runner":"linux.rocm.gpu.gfx942.8","gpu-arch-type":"rocm","gpu-arch-version":"7.0","docker-image":"torchtitan-rocm-ubuntu-22.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/rocm7.0"}
-            ]}' > matrix.json
-          else
-            # Include only CUDA
-            echo '{"include":[
-              {"name":"cuda","runner":"linux.g5.48xlarge.nvidia.gpu","gpu-arch-type":"cuda","gpu-arch-version":"12.6","docker-image":"torchtitan-ubuntu-20.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/cu126"}
-            ]}' > matrix.json
-          fi
-
-          # Export matrix to job outputs
-          {
-            echo 'matrix<<EOF'
-            cat matrix.json
-            echo 'EOF'
-          } >> $GITHUB_OUTPUT
-
+    uses: ./.github/workflows/set-matrix.yaml
 
   # Step 2: Use the dynamic matrix in the build-test job
   build-test:
diff --git a/.github/workflows/set-matrix.yaml b/.github/workflows/set-matrix.yaml
new file mode 100644
index 0000000000..5564d8d70b
--- /dev/null
+++ b/.github/workflows/set-matrix.yaml
@@ -0,0 +1,77 @@
+name: Set Matrix
+
+on:
+  workflow_call:
+    outputs:
+      matrix:
+        description: dynamically set matrix
+        value: ${{ jobs.set.outputs.matrix }}
+
+jobs:
+  set:
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set.outputs.matrix }}
+    env:
+      # Event flags evaluated by github actions before the step runs:
+      IS_MAIN_PUSH: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
+      IS_SCHEDULE: ${{ github.event_name == 'schedule' }}
+      IS_8GPU_TAG: ${{ startsWith(github.ref, 'refs/tags/ciflow/8gpu/') }}
+      # Match the ciflow/8gpu label specifically; a bare 'labeled' check would fire for any label added to the PR.
+      TRIGGERED_8GPU_LABEL: ${{ github.event_name == 'pull_request' && github.event.action == 'labeled' && github.event.label.name == 'ciflow/8gpu' }}
+
+    steps:
+      - id: set
+        run: |
+          # Define ROCm matrix
+          ROCM_MATRIX='{
+            "name": "rocm",
+            "runner": "linux.rocm.gpu.gfx942.8",
+            "gpu-arch-type": "rocm",
+            "gpu-arch-version": "7.0",
+            "docker-image": "torchtitan-rocm-ubuntu-22.04-clang12",
+            "index-url": "https://download.pytorch.org/whl/nightly/rocm7.0"
+          }'
+
+          # Define CUDA matrix
+          CUDA_MATRIX='{
+            "name": "cuda",
+            "runner": "linux.g5.48xlarge.nvidia.gpu",
+            "gpu-arch-type": "cuda",
+            "gpu-arch-version": "12.6",
+            "docker-image": "torchtitan-ubuntu-20.04-clang12",
+            "index-url": "https://download.pytorch.org/whl/nightly/cu126"
+          }'
+
+          # Use default value as 'false' for unset environment variables
+          IS_MAIN_PUSH="${IS_MAIN_PUSH:-false}"
+          IS_SCHEDULE="${IS_SCHEDULE:-false}"
+          IS_8GPU_TAG="${IS_8GPU_TAG:-false}"
+          TRIGGERED_8GPU_LABEL="${TRIGGERED_8GPU_LABEL:-false}"
+
+          # Decide which matrix entries to include based on event type
+          # Runs ROCm only for push tag OR when PR label gets triggered
+          if [[ "$IS_8GPU_TAG" == "true" || "$TRIGGERED_8GPU_LABEL" == "true" ]]; then
+            cat > matrix.json <<JSON
+          {"include":[$ROCM_MATRIX]}
+          JSON
+
+          # Runs CUDA and ROCm for push to main or for schedule
+          elif [[ "$IS_MAIN_PUSH" == "true" || "$IS_SCHEDULE" == "true" ]]; then
+            cat > matrix.json <<JSON
+          {"include":[$CUDA_MATRIX,$ROCM_MATRIX]}
+          JSON
+
+          # Runs CUDA only for every other event
+          else
+            cat > matrix.json <<JSON
+          {"include":[$CUDA_MATRIX]}
+          JSON
+          fi
+
+          # Export matrix to job outputs
+          {
+            echo 'matrix<<EOF'
+            cat matrix.json
+            echo 'EOF'
+          } >> $GITHUB_OUTPUT