From 0539dbe9e7c0487dc8f52e41e4c139ecc60e4add Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Thu, 4 Dec 2025 20:58:08 -0600 Subject: [PATCH 01/29] Implement ciflow/rocm. --- .github/pytorch-probot.yml | 5 +++++ .github/workflows/integration_test_8gpu_features.yaml | 10 +++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) create mode 100644 .github/pytorch-probot.yml diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml new file mode 100644 index 0000000000..45eb1ad19f --- /dev/null +++ b/.github/pytorch-probot.yml @@ -0,0 +1,5 @@ +tracking_issue: 24422 +ciflow_tracking_issue: 64124 +ciflow_push_tags: +- ciflow/8gpu +- ciflow/rocm diff --git a/.github/workflows/integration_test_8gpu_features.yaml b/.github/workflows/integration_test_8gpu_features.yaml index a20cd22545..bc620340b0 100644 --- a/.github/workflows/integration_test_8gpu_features.yaml +++ b/.github/workflows/integration_test_8gpu_features.yaml @@ -3,6 +3,8 @@ name: 8 GPU Feature Tests on: push: branches: [ main ] + tags: + - ciflow/8gpu/* paths-ignore: - 'torchtitan/experiments/**' pull_request: @@ -28,20 +30,22 @@ jobs: # Step 1: Dynamically compute the matrix based on conditions set-matrix: runs-on: ubuntu-latest + env: + HAS_8GPU_LABEL: ${{ (github.event_name == 'pull_request') && contains(github.event.pull_request.labels.*.name, 'ciflow/8gpu') }} outputs: matrix: ${{ steps.set.outputs.matrix }} steps: - id: set run: | # Decide which matrix entries to include based on event type - if [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/main" ]] || [[ "${{ github.event_name }}" == "schedule" ]]; then - # Include both CUDA and ROCm + # Runs CUDA and ROCm for push to main, cron schedule and PR label + if [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/main" ]] || [[ "${{ github.event_name }}" == "schedule" ]] || [[ "${HAS_8GPU_LABEL}" == "true" ]]; then echo '{"include":[ {"name":"cuda","runner":"linux.g5.48xlarge.nvidia.gpu","gpu-arch-type":"cuda","gpu-arch-version":"12.6","docker-image":"torchtitan-ubuntu-20.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/cu126"}, {"name":"rocm","runner":"linux.rocm.gpu.gfx942.8","gpu-arch-type":"rocm","gpu-arch-version":"7.0","docker-image":"torchtitan-rocm-ubuntu-22.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/rocm7.0"} ]}' > matrix.json + # Runs CUDA for normal PR (without PR label) else - # Include only CUDA echo '{"include":[ {"name":"cuda","runner":"linux.g5.48xlarge.nvidia.gpu","gpu-arch-type":"cuda","gpu-arch-version":"12.6","docker-image":"torchtitan-ubuntu-20.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/cu126"} ]}' > matrix.json From 19a5a15a94c4ae800fcac81c5a4eb1397f3397f7 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Fri, 5 Dec 2025 13:02:59 -0600 Subject: [PATCH 02/29] Removed tracking_issue & ciflow_tracking_issue. --- .github/pytorch-probot.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index 45eb1ad19f..46ceef2c2b 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -1,5 +1,3 @@ -tracking_issue: 24422 -ciflow_tracking_issue: 64124 ciflow_push_tags: - ciflow/8gpu - ciflow/rocm From a2a55a79a91b5e864d5457014f699d69ffc86c88 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Fri, 5 Dec 2025 13:31:44 -0600 Subject: [PATCH 03/29] Removed ciflow/rocm tag. --- .github/pytorch-probot.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index 46ceef2c2b..fab26355cd 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -1,3 +1,2 @@ ciflow_push_tags: - ciflow/8gpu -- ciflow/rocm From cc29b3985f546befaf502bacc44844d678e93f13 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Mon, 8 Dec 2025 18:38:23 -0600 Subject: [PATCH 04/29] Added condition to run ROCm workflow when label is added. --- .github/workflows/integration_test_8gpu_features.yaml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_features.yaml b/.github/workflows/integration_test_8gpu_features.yaml index bc620340b0..34bae6b2fa 100644 --- a/.github/workflows/integration_test_8gpu_features.yaml +++ b/.github/workflows/integration_test_8gpu_features.yaml @@ -8,6 +8,7 @@ on: paths-ignore: - 'torchtitan/experiments/**' pull_request: + types: [opened, synchronize, reopened, labeled, unlabeled] paths-ignore: - 'torchtitan/experiments/**' schedule: @@ -38,8 +39,13 @@ jobs: - id: set run: | # Decide which matrix entries to include based on event type + # Runs ROCm when label event is triggered + if [[ "${{ github.event_name }}" == "pull_request" && "${{ github.event.action }}" == "labeled" && "${HAS_8GPU_LABEL}" == "true" ]]; then + echo '{"include":[ + {"name":"rocm","runner":"linux.rocm.gpu.gfx942.8","gpu-arch-type":"rocm","gpu-arch-version":"7.0","docker-image":"torchtitan-rocm-ubuntu-22.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/rocm7.0"} + ]}' > matrix.json # Runs CUDA and ROCm for push to main, cron schedule and PR label - if [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/main" ]] || [[ "${{ github.event_name }}" == "schedule" ]] || [[ "${HAS_8GPU_LABEL}" == "true" ]]; then + elif [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/main" ]] || [[ "${{ github.event_name }}" == "schedule" ]] || [[ "${HAS_8GPU_LABEL}" == "true" ]]; then echo '{"include":[ {"name":"cuda","runner":"linux.g5.48xlarge.nvidia.gpu","gpu-arch-type":"cuda","gpu-arch-version":"12.6","docker-image":"torchtitan-ubuntu-20.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/cu126"}, {"name":"rocm","runner":"linux.rocm.gpu.gfx942.8","gpu-arch-type":"rocm","gpu-arch-version":"7.0","docker-image":"torchtitan-rocm-ubuntu-22.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/rocm7.0"} From 4485cd17b189b2665c855961c9af6b5e8bdeb252 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Tue, 9 Dec 2025 20:06:48 -0600 Subject: [PATCH 05/29] Moved dynamic matrix creation to a separate reusable workflow. Added feature to add label automatically. --- .github/labeler.yml | 6 +++ .github/pytorch-probot.yml | 1 + .../integration_test_8gpu_features.yaml | 35 +-------------- .github/workflows/set-matrix.yaml | 44 +++++++++++++++++++ 4 files changed, 52 insertions(+), 34 deletions(-) create mode 100644 .github/labeler.yml create mode 100644 .github/workflows/set-matrix.yaml diff --git a/.github/labeler.yml b/.github/labeler.yml new file mode 100644 index 0000000000..882b137717 --- /dev/null +++ b/.github/labeler.yml @@ -0,0 +1,6 @@ +"ciflow/8gpu/": +- .ci/docker/** +- .github/workflows/** +- scripts/** +- tests/** +- torchtitan/** diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index fab26355cd..bec5930439 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -1,2 +1,3 @@ ciflow_push_tags: - ciflow/8gpu +labeler_config: labeler.yml diff --git a/.github/workflows/integration_test_8gpu_features.yaml b/.github/workflows/integration_test_8gpu_features.yaml index 34bae6b2fa..ba942a1aa0 100644 --- a/.github/workflows/integration_test_8gpu_features.yaml +++ b/.github/workflows/integration_test_8gpu_features.yaml @@ -30,40 +30,7 @@ permissions: jobs: # Step 1: Dynamically compute the matrix based on conditions set-matrix: - runs-on: ubuntu-latest - env: - HAS_8GPU_LABEL: ${{ (github.event_name == 'pull_request') && contains(github.event.pull_request.labels.*.name, 'ciflow/8gpu') }} - outputs: - matrix: ${{ steps.set.outputs.matrix }} - steps: - - id: set - run: | - # Decide which matrix entries to include based on event type - # Runs ROCm when label event is triggered - if [[ "${{ github.event_name }}" == "pull_request" && "${{ github.event.action }}" == "labeled" && "${HAS_8GPU_LABEL}" == "true" ]]; then - echo '{"include":[ - {"name":"rocm","runner":"linux.rocm.gpu.gfx942.8","gpu-arch-type":"rocm","gpu-arch-version":"7.0","docker-image":"torchtitan-rocm-ubuntu-22.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/rocm7.0"} - ]}' > matrix.json - # Runs CUDA and ROCm for push to main, cron schedule and PR label - elif [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/main" ]] || [[ "${{ github.event_name }}" == "schedule" ]] || [[ "${HAS_8GPU_LABEL}" == "true" ]]; then - echo '{"include":[ - {"name":"cuda","runner":"linux.g5.48xlarge.nvidia.gpu","gpu-arch-type":"cuda","gpu-arch-version":"12.6","docker-image":"torchtitan-ubuntu-20.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/cu126"}, - {"name":"rocm","runner":"linux.rocm.gpu.gfx942.8","gpu-arch-type":"rocm","gpu-arch-version":"7.0","docker-image":"torchtitan-rocm-ubuntu-22.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/rocm7.0"} - ]}' > matrix.json - # Runs CUDA for normal PR (without PR label) - else - echo '{"include":[ - {"name":"cuda","runner":"linux.g5.48xlarge.nvidia.gpu","gpu-arch-type":"cuda","gpu-arch-version":"12.6","docker-image":"torchtitan-ubuntu-20.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/cu126"} - ]}' > matrix.json - fi - - # Export matrix to job outputs - { - echo 'matrix<> $GITHUB_OUTPUT - + uses: ./.github/workflows/set-matrix.yaml # Step 2: Use the dynamic matrix in the build-test job build-test: diff --git a/.github/workflows/set-matrix.yaml b/.github/workflows/set-matrix.yaml new file mode 100644 index 0000000000..c1ce6ed911 --- /dev/null +++ b/.github/workflows/set-matrix.yaml @@ -0,0 +1,44 @@ +name: Set Matrix + +on: + workflow_call: + outputs: + matrix: + description: dynamically set matrix + value: ${{ jobs.set.outputs.matrix }} + +jobs: + set: + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.write.outputs.matrix }} + env: + HAS_8GPU_LABEL: ${{ (github.event_name == 'pull_request') && contains(github.event.pull_request.labels.*.name, 'ciflow/8gpu') }} + + steps: + - id: set + run: | + # Decide which matrix entries to include based on event type + # Runs ROCm when label event is triggered + if [[ "${{ github.event_name }}" == "pull_request" && "${{ github.event.action }}" == "labeled" && "${HAS_8GPU_LABEL}" == "true" ]]; then + echo '{"include":[ + {"name":"rocm","runner":"linux.rocm.gpu.gfx942.8","gpu-arch-type":"rocm","gpu-arch-version":"7.0","docker-image":"torchtitan-rocm-ubuntu-22.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/rocm7.0"} + ]}' > matrix.json # Runs CUDA and ROCm for push to main, cron schedule and PR label + elif [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/main" ]] || [[ "${{ github.event_name }}" == "schedule" ]] || [[ "${HAS_8GPU_LABEL}" == "true" ]]; then + echo '{"include":[ + {"name":"cuda","runner":"linux.g5.48xlarge.nvidia.gpu","gpu-arch-type":"cuda","gpu-arch-version":"12.6","docker-image":"torchtitan-ubuntu-20.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/cu126"}, + {"name":"rocm","runner":"linux.rocm.gpu.gfx942.8","gpu-arch-type":"rocm","gpu-arch-version":"7.0","docker-image":"torchtitan-rocm-ubuntu-22.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/rocm7.0"} + ]}' > matrix.json + # Runs CUDA for normal PR (without PR label) + else + echo '{"include":[ + {"name":"cuda","runner":"linux.g5.48xlarge.nvidia.gpu","gpu-arch-type":"cuda","gpu-arch-version":"12.6","docker-image":"torchtitan-ubuntu-20.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/cu126"} + ]}' > matrix.json + fi + + # Export matrix to job outputs + { + echo 'matrix<> $GITHUB_OUTPUT From 132a3ebc4e01aa0650043739ccced667af06a553 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Tue, 9 Dec 2025 20:28:21 -0600 Subject: [PATCH 06/29] Using set instead of write in set-matrix. --- .github/workflows/set-matrix.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/set-matrix.yaml b/.github/workflows/set-matrix.yaml index c1ce6ed911..dd63ab15d9 100644 --- a/.github/workflows/set-matrix.yaml +++ b/.github/workflows/set-matrix.yaml @@ -11,7 +11,7 @@ jobs: set: runs-on: ubuntu-latest outputs: - matrix: ${{ steps.write.outputs.matrix }} + matrix: ${{ steps.set.outputs.matrix }} env: HAS_8GPU_LABEL: ${{ (github.event_name == 'pull_request') && contains(github.event.pull_request.labels.*.name, 'ciflow/8gpu') }} From 75a1d536cef6eff8c569b9de6b74e71dc0802a2f Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Tue, 9 Dec 2025 20:46:25 -0600 Subject: [PATCH 07/29] Removed trailing slash from 'ciflow/8gpu' label. --- .github/labeler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/labeler.yml b/.github/labeler.yml index 882b137717..0f48a9517c 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -1,4 +1,4 @@ -"ciflow/8gpu/": +"ciflow/8gpu": - .ci/docker/** - .github/workflows/** - scripts/** From 8f8d9dfa464cb98d70b936238818c0e30a8ff744 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Wed, 10 Dec 2025 17:01:00 -0600 Subject: [PATCH 08/29] Refactored set-matrix.yaml. Added option to create ROCm matrix for push tags. --- .github/workflows/set-matrix.yaml | 86 +++++++++++++++++++++---------- 1 file changed, 60 insertions(+), 26 deletions(-) diff --git a/.github/workflows/set-matrix.yaml b/.github/workflows/set-matrix.yaml index dd63ab15d9..67d3f6ee4d 100644 --- a/.github/workflows/set-matrix.yaml +++ b/.github/workflows/set-matrix.yaml @@ -13,32 +13,66 @@ jobs: outputs: matrix: ${{ steps.set.outputs.matrix }} env: - HAS_8GPU_LABEL: ${{ (github.event_name == 'pull_request') && contains(github.event.pull_request.labels.*.name, 'ciflow/8gpu') }} + # Event flags evaluated by github actions before the step runs: + IS_MAIN_PUSH: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} + IS_SCHEDULE: ${{ github.event_name == 'schedule' }} + IS_PR: ${{ github.event_name == 'pull_request' }} + HAS_8GPU_LABEL: ${{ github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'ciflow/8gpu') }} + IS_8GPU_TAG: ${{ startsWith(github.ref, 'refs/tags/ciflow/8gpu/') }} steps: - id: set - run: | - # Decide which matrix entries to include based on event type - # Runs ROCm when label event is triggered - if [[ "${{ github.event_name }}" == "pull_request" && "${{ github.event.action }}" == "labeled" && "${HAS_8GPU_LABEL}" == "true" ]]; then - echo '{"include":[ - {"name":"rocm","runner":"linux.rocm.gpu.gfx942.8","gpu-arch-type":"rocm","gpu-arch-version":"7.0","docker-image":"torchtitan-rocm-ubuntu-22.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/rocm7.0"} - ]}' > matrix.json # Runs CUDA and ROCm for push to main, cron schedule and PR label - elif [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/main" ]] || [[ "${{ github.event_name }}" == "schedule" ]] || [[ "${HAS_8GPU_LABEL}" == "true" ]]; then - echo '{"include":[ - {"name":"cuda","runner":"linux.g5.48xlarge.nvidia.gpu","gpu-arch-type":"cuda","gpu-arch-version":"12.6","docker-image":"torchtitan-ubuntu-20.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/cu126"}, - {"name":"rocm","runner":"linux.rocm.gpu.gfx942.8","gpu-arch-type":"rocm","gpu-arch-version":"7.0","docker-image":"torchtitan-rocm-ubuntu-22.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/rocm7.0"} - ]}' > matrix.json - # Runs CUDA for normal PR (without PR label) - else - echo '{"include":[ - {"name":"cuda","runner":"linux.g5.48xlarge.nvidia.gpu","gpu-arch-type":"cuda","gpu-arch-version":"12.6","docker-image":"torchtitan-ubuntu-20.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/cu126"} - ]}' > matrix.json - fi - - # Export matrix to job outputs - { - echo 'matrix<> $GITHUB_OUTPUT + run: | + # Define ROCm matrix + ROCM_MATRIX='{ + "name": "rocm", + "runner": "linux.rocm.gpu.gfx942.8", + "gpu-arch-type": "rocm", + "gpu-arch-version": "7.0", + "docker-image": "torchtitan-rocm-ubuntu-22.04-clang12", + "index-url": "https://download.pytorch.org/whl/nightly/rocm7.0" + }' + + # Define CUDA matrix + CUDA_MATRIX='{ + "name": "cuda", + "runner": "linux.g5.48xlarge.nvidia.gpu", + "gpu-arch-type": "cuda", + "gpu-arch-version": "12.6", + "docker-image": "torchtitan-ubuntu-20.04-clang12", + "index-url": "https://download.pytorch.org/whl/nightly/cu126" + }' + + # Use default value as 'false' for unset environment variables + IS_MAIN_PUSH="${IS_MAIN_PUSH:-false}" + IS_SCHEDULE="${IS_SCHEDULE:-false}" + IS_PR="${IS_PR:-false}" + HAS_8GPU_LABEL="${HAS_8GPU_LABEL:-false}" + IS_8GPU_TAG="${IS_8GPU_TAG:-false}" + + # Decide which matrix entries to include based on event type + # Runs ROCm only for push tag + if [[ "$IS_8GPU_TAG" == "true" ]]; then + cat > matrix.json < matrix.json < matrix.json <> $GITHUB_OUTPUT From 9417b906829da4211d7ef0aea4a37ed63d8d6ed0 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Wed, 10 Dec 2025 17:08:21 -0600 Subject: [PATCH 09/29] Fixed indentation and Lint in set-matrix.yaml. --- .github/workflows/set-matrix.yaml | 90 +++++++++++++++---------------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/.github/workflows/set-matrix.yaml b/.github/workflows/set-matrix.yaml index 67d3f6ee4d..04a8fd1d26 100644 --- a/.github/workflows/set-matrix.yaml +++ b/.github/workflows/set-matrix.yaml @@ -22,57 +22,57 @@ jobs: steps: - id: set - run: | - # Define ROCm matrix - ROCM_MATRIX='{ - "name": "rocm", - "runner": "linux.rocm.gpu.gfx942.8", - "gpu-arch-type": "rocm", - "gpu-arch-version": "7.0", - "docker-image": "torchtitan-rocm-ubuntu-22.04-clang12", - "index-url": "https://download.pytorch.org/whl/nightly/rocm7.0" - }' + run: | + # Define ROCm matrix + ROCM_MATRIX='{ + "name": "rocm", + "runner": "linux.rocm.gpu.gfx942.8", + "gpu-arch-type": "rocm", + "gpu-arch-version": "7.0", + "docker-image": "torchtitan-rocm-ubuntu-22.04-clang12", + "index-url": "https://download.pytorch.org/whl/nightly/rocm7.0" + }' - # Define CUDA matrix - CUDA_MATRIX='{ - "name": "cuda", - "runner": "linux.g5.48xlarge.nvidia.gpu", - "gpu-arch-type": "cuda", - "gpu-arch-version": "12.6", - "docker-image": "torchtitan-ubuntu-20.04-clang12", - "index-url": "https://download.pytorch.org/whl/nightly/cu126" - }' + # Define CUDA matrix + CUDA_MATRIX='{ + "name": "cuda", + "runner": "linux.g5.48xlarge.nvidia.gpu", + "gpu-arch-type": "cuda", + "gpu-arch-version": "12.6", + "docker-image": "torchtitan-ubuntu-20.04-clang12", + "index-url": "https://download.pytorch.org/whl/nightly/cu126" + }' - # Use default value as 'false' for unset environment variables - IS_MAIN_PUSH="${IS_MAIN_PUSH:-false}" - IS_SCHEDULE="${IS_SCHEDULE:-false}" - IS_PR="${IS_PR:-false}" - HAS_8GPU_LABEL="${HAS_8GPU_LABEL:-false}" - IS_8GPU_TAG="${IS_8GPU_TAG:-false}" + # Use default value as 'false' for unset environment variables + IS_MAIN_PUSH="${IS_MAIN_PUSH:-false}" + IS_SCHEDULE="${IS_SCHEDULE:-false}" + IS_PR="${IS_PR:-false}" + HAS_8GPU_LABEL="${HAS_8GPU_LABEL:-false}" + IS_8GPU_TAG="${IS_8GPU_TAG:-false}" - # Decide which matrix entries to include based on event type - # Runs ROCm only for push tag - if [[ "$IS_8GPU_TAG" == "true" ]]; then - cat > matrix.json < matrix.json < matrix.json < matrix.json < matrix.json < matrix.json <> $GITHUB_OUTPUT + # Export matrix to job outputs + { + echo 'matrix<> $GITHUB_OUTPUT From 924444dbe4f4558e135078b9084ec24da0317920 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Wed, 10 Dec 2025 17:11:29 -0600 Subject: [PATCH 10/29] Fixed indentation in set-matrix.yaml. --- .github/workflows/set-matrix.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/set-matrix.yaml b/.github/workflows/set-matrix.yaml index 04a8fd1d26..98316cf88d 100644 --- a/.github/workflows/set-matrix.yaml +++ b/.github/workflows/set-matrix.yaml @@ -55,19 +55,19 @@ jobs: if [[ "$IS_8GPU_TAG" == "true" ]]; then cat > matrix.json < matrix.json < matrix.json < Date: Wed, 10 Dec 2025 22:22:16 -0600 Subject: [PATCH 11/29] Run ROCm workflow only when PR label added. --- .github/workflows/set-matrix.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/set-matrix.yaml b/.github/workflows/set-matrix.yaml index 98316cf88d..f1dc831651 100644 --- a/.github/workflows/set-matrix.yaml +++ b/.github/workflows/set-matrix.yaml @@ -19,6 +19,7 @@ jobs: IS_PR: ${{ github.event_name == 'pull_request' }} HAS_8GPU_LABEL: ${{ github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'ciflow/8gpu') }} IS_8GPU_TAG: ${{ startsWith(github.ref, 'refs/tags/ciflow/8gpu/') }} + ADDED_8GPU_LABEL: ${{ github.event_name == 'pull_request' && github.event.action == 'labeled' }} steps: - id: set @@ -51,8 +52,8 @@ jobs: IS_8GPU_TAG="${IS_8GPU_TAG:-false}" # Decide which matrix entries to include based on event type - # Runs ROCm only for push tag - if [[ "$IS_8GPU_TAG" == "true" ]]; then + # Runs ROCm only for push tag OR label added + if [[ "$IS_8GPU_TAG" == "true" || "$ADDED_8GPU_LABEL" == "true" ]]; then cat > matrix.json < Date: Thu, 11 Dec 2025 00:41:53 -0600 Subject: [PATCH 12/29] Changed the name to TRIGGERED_8GPU_LABEL. Setting default value for TRIGGERED_8GPU_LABEL. --- .github/workflows/set-matrix.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/set-matrix.yaml b/.github/workflows/set-matrix.yaml index f1dc831651..01bc29575a 100644 --- a/.github/workflows/set-matrix.yaml +++ b/.github/workflows/set-matrix.yaml @@ -19,7 +19,7 @@ jobs: IS_PR: ${{ github.event_name == 'pull_request' }} HAS_8GPU_LABEL: ${{ github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'ciflow/8gpu') }} IS_8GPU_TAG: ${{ startsWith(github.ref, 'refs/tags/ciflow/8gpu/') }} - ADDED_8GPU_LABEL: ${{ github.event_name == 'pull_request' && github.event.action == 'labeled' }} + TRIGGERED_8GPU_LABEL: ${{ github.event_name == 'pull_request' && github.event.action == 'labeled' }} steps: - id: set @@ -50,10 +50,11 @@ jobs: IS_PR="${IS_PR:-false}" HAS_8GPU_LABEL="${HAS_8GPU_LABEL:-false}" IS_8GPU_TAG="${IS_8GPU_TAG:-false}" + TRIGGERED_8GPU_LABEL="${TRIGGERED_8GPU_LABEL:-false}" # Decide which matrix entries to include based on event type - # Runs ROCm only for push tag OR label added - if [[ "$IS_8GPU_TAG" == "true" || "$ADDED_8GPU_LABEL" == "true" ]]; then + # Runs ROCm only for push tag OR when PR label gets triggered + if [[ "$IS_8GPU_TAG" == "true" || "$TRIGGERED_8GPU_LABEL" == "true" ]]; then cat > matrix.json < Date: Thu, 11 Dec 2025 15:02:50 -0600 Subject: [PATCH 13/29] Minor comment change. --- .github/workflows/set-matrix.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/set-matrix.yaml b/.github/workflows/set-matrix.yaml index 01bc29575a..5b83f01e4d 100644 --- a/.github/workflows/set-matrix.yaml +++ b/.github/workflows/set-matrix.yaml @@ -65,7 +65,7 @@ jobs: {"include": [$CUDA_MATRIX,$ROCM_MATRIX]} JSON - # Runs CUDA as default (includes normal PR, if PR label is NOT present) + # Runs CUDA only as default (includes normal PR, if PR label is NOT present) else cat > matrix.json < Date: Thu, 11 Dec 2025 15:09:28 -0600 Subject: [PATCH 14/29] re-ordered if conditions in set-matrix.yaml. --- .github/workflows/set-matrix.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/set-matrix.yaml b/.github/workflows/set-matrix.yaml index 5b83f01e4d..b643716087 100644 --- a/.github/workflows/set-matrix.yaml +++ b/.github/workflows/set-matrix.yaml @@ -53,16 +53,16 @@ jobs: TRIGGERED_8GPU_LABEL="${TRIGGERED_8GPU_LABEL:-false}" # Decide which matrix entries to include based on event type - # Runs ROCm only for push tag OR when PR label gets triggered - if [[ "$IS_8GPU_TAG" == "true" || "$TRIGGERED_8GPU_LABEL" == "true" ]]; then + # Runs CUDA and ROCm for normal PR (if PR label is present) OR for push to main, cron schedule + if [[ ("$HAS_8GPU_LABEL" == "true" && "$IS_PR" == "true") || ("$IS_MAIN_PUSH" == "true" || "$IS_SCHEDULE" == "true") ]]; then cat > matrix.json < matrix.json < Date: Thu, 11 Dec 2025 15:16:55 -0600 Subject: [PATCH 15/29] Dummy commit From 1b429ca18114864db488eb356db515747fc79211 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Thu, 11 Dec 2025 15:35:52 -0600 Subject: [PATCH 16/29] Separated condition for PR label trigger in set-matrix.yaml. --- .github/workflows/set-matrix.yaml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/set-matrix.yaml b/.github/workflows/set-matrix.yaml index b643716087..12348e67c2 100644 --- a/.github/workflows/set-matrix.yaml +++ b/.github/workflows/set-matrix.yaml @@ -53,14 +53,20 @@ jobs: TRIGGERED_8GPU_LABEL="${TRIGGERED_8GPU_LABEL:-false}" # Decide which matrix entries to include based on event type + # Runs when PR label gets triggered + if [[ "$TRIGGERED_8GPU_LABEL" == "true" ]]; then + cat > matrix.json < matrix.json < matrix.json < Date: Thu, 11 Dec 2025 16:22:27 -0600 Subject: [PATCH 17/29] DEBUG:rolled back original if conditions in set-matrix.yaml and added debug statements. --- .github/workflows/set-matrix.yaml | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/.github/workflows/set-matrix.yaml b/.github/workflows/set-matrix.yaml index 12348e67c2..b133253ec7 100644 --- a/.github/workflows/set-matrix.yaml +++ b/.github/workflows/set-matrix.yaml @@ -53,26 +53,23 @@ jobs: TRIGGERED_8GPU_LABEL="${TRIGGERED_8GPU_LABEL:-false}" # Decide which matrix entries to include based on event type - # Runs when PR label gets triggered - if [[ "$TRIGGERED_8GPU_LABEL" == "true" ]]; then + # Runs ROCm only for push tag OR when PR label gets triggered + if [[ "$IS_8GPU_TAG" == "true" || "$TRIGGERED_8GPU_LABEL" == "true" ]]; then + echo "***ROCm only for push tag OR when PR label gets triggered" cat > matrix.json < matrix.json < matrix.json < matrix.json < Date: Thu, 11 Dec 2025 16:57:57 -0600 Subject: [PATCH 18/29] Dummy commit From 17e7fd558db1d6c5fc07e20fa430d5302fe5314c Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Thu, 11 Dec 2025 17:52:58 -0600 Subject: [PATCH 19/29] DEBUG: Using 4 GPU ROCm runner. --- .github/workflows/set-matrix.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/set-matrix.yaml b/.github/workflows/set-matrix.yaml index b133253ec7..5388448fcc 100644 --- a/.github/workflows/set-matrix.yaml +++ b/.github/workflows/set-matrix.yaml @@ -27,7 +27,7 @@ jobs: # Define ROCm matrix ROCM_MATRIX='{ "name": "rocm", - "runner": "linux.rocm.gpu.gfx942.8", + "runner": "linux.rocm.gpu.gfx942.4", "gpu-arch-type": "rocm", "gpu-arch-version": "7.0", "docker-image": "torchtitan-rocm-ubuntu-22.04-clang12", From e54daeebe06dda01bc9dc5d5c37f3b7cee34c889 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Thu, 11 Dec 2025 18:42:06 -0600 Subject: [PATCH 20/29] Dummy commit From 255ab3d3f2ab599f170666d3bde633f270c29c6a Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Thu, 11 Dec 2025 23:59:54 -0600 Subject: [PATCH 21/29] Using 8 GPU ROCm runner. --- .github/workflows/set-matrix.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/set-matrix.yaml b/.github/workflows/set-matrix.yaml index 5388448fcc..b133253ec7 100644 --- a/.github/workflows/set-matrix.yaml +++ b/.github/workflows/set-matrix.yaml @@ -27,7 +27,7 @@ jobs: # Define ROCm matrix ROCM_MATRIX='{ "name": "rocm", - "runner": "linux.rocm.gpu.gfx942.4", + "runner": "linux.rocm.gpu.gfx942.8", "gpu-arch-type": "rocm", "gpu-arch-version": "7.0", "docker-image": "torchtitan-rocm-ubuntu-22.04-clang12", From ffd1da398ce203399e97d9af071a95e23a1df054 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Fri, 12 Dec 2025 12:18:22 -0600 Subject: [PATCH 22/29] Corrected indentation in labeler.yml. Removed debug statements from set-matrix.yaml. --- .github/labeler.yml | 10 +++++----- .github/workflows/set-matrix.yaml | 3 --- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/.github/labeler.yml b/.github/labeler.yml index 0f48a9517c..ed5a23bf4e 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -1,6 +1,6 @@ "ciflow/8gpu": -- .ci/docker/** -- .github/workflows/** -- scripts/** -- tests/** -- torchtitan/** + - .ci/docker/** + - .github/workflows/** + - scripts/** + - tests/** + - torchtitan/** diff --git a/.github/workflows/set-matrix.yaml b/.github/workflows/set-matrix.yaml index b133253ec7..5b83f01e4d 100644 --- a/.github/workflows/set-matrix.yaml +++ b/.github/workflows/set-matrix.yaml @@ -55,21 +55,18 @@ jobs: # Decide which matrix entries to include based on event type # Runs ROCm only for push tag OR when PR label gets triggered if [[ "$IS_8GPU_TAG" == "true" || "$TRIGGERED_8GPU_LABEL" == "true" ]]; then - echo "***ROCm only for push tag OR when PR label gets triggered" cat > matrix.json < matrix.json < matrix.json < Date: Fri, 12 Dec 2025 13:37:23 -0600 Subject: [PATCH 23/29] Corrected indentation in pytorch-probot.yml. --- .github/pytorch-probot.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index bec5930439..9eae404af4 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -1,3 +1,3 @@ ciflow_push_tags: -- ciflow/8gpu + - ciflow/8gpu labeler_config: labeler.yml From 4fd8306ad334eed23ef4a5aee7e98b9f38510fb1 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 12 Dec 2025 16:45:23 -0800 Subject: [PATCH 24/29] Simplify pull_request trigger --- .github/workflows/integration_test_8gpu_features.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_features.yaml b/.github/workflows/integration_test_8gpu_features.yaml index ba942a1aa0..e8b2fe63ea 100644 --- a/.github/workflows/integration_test_8gpu_features.yaml +++ b/.github/workflows/integration_test_8gpu_features.yaml @@ -8,7 +8,6 @@ on: paths-ignore: - 'torchtitan/experiments/**' pull_request: - types: [opened, synchronize, reopened, labeled, unlabeled] paths-ignore: - 'torchtitan/experiments/**' schedule: From 158571f6a82aa86e69e86ba721751e798f544bca Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 12 Dec 2025 17:05:59 -0800 Subject: [PATCH 25/29] Don't run both CUDA and ROCm job when ciflow/8gpu is there --- .github/workflows/set-matrix.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/set-matrix.yaml b/.github/workflows/set-matrix.yaml index 5b83f01e4d..29b322da54 100644 --- a/.github/workflows/set-matrix.yaml +++ b/.github/workflows/set-matrix.yaml @@ -60,7 +60,7 @@ jobs: JSON # Runs CUDA and ROCm for normal PR (if PR label is present) OR for push to main, cron schedule - elif [[ ("$HAS_8GPU_LABEL" == "true" && "$IS_PR" == "true") || ("$IS_MAIN_PUSH" == "true" || "$IS_SCHEDULE" == "true") ]]; then + elif [[ ("$HAS_8GPU_LABEL" == "false" && "$IS_PR" == "true") || ("$IS_MAIN_PUSH" == "true" || "$IS_SCHEDULE" == "true") ]]; then cat > matrix.json < Date: Fri, 12 Dec 2025 17:27:23 -0800 Subject: [PATCH 26/29] Remove HAS_8GPU_LABEL --- .github/workflows/set-matrix.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/set-matrix.yaml b/.github/workflows/set-matrix.yaml index 29b322da54..7cd3f52d5b 100644 --- a/.github/workflows/set-matrix.yaml +++ b/.github/workflows/set-matrix.yaml @@ -17,7 +17,6 @@ jobs: IS_MAIN_PUSH: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} IS_SCHEDULE: ${{ github.event_name == 'schedule' }} IS_PR: ${{ github.event_name == 'pull_request' }} - HAS_8GPU_LABEL: ${{ github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'ciflow/8gpu') }} IS_8GPU_TAG: ${{ startsWith(github.ref, 'refs/tags/ciflow/8gpu/') }} TRIGGERED_8GPU_LABEL: ${{ github.event_name == 'pull_request' && github.event.action == 'labeled' }} @@ -48,7 +47,6 @@ jobs: IS_MAIN_PUSH="${IS_MAIN_PUSH:-false}" IS_SCHEDULE="${IS_SCHEDULE:-false}" IS_PR="${IS_PR:-false}" - HAS_8GPU_LABEL="${HAS_8GPU_LABEL:-false}" IS_8GPU_TAG="${IS_8GPU_TAG:-false}" TRIGGERED_8GPU_LABEL="${TRIGGERED_8GPU_LABEL:-false}" @@ -60,7 +58,7 @@ jobs: JSON # Runs CUDA and ROCm for normal PR (if PR label is present) OR for push to main, cron schedule - elif [[ ("$HAS_8GPU_LABEL" == "false" && "$IS_PR" == "true") || ("$IS_MAIN_PUSH" == "true" || "$IS_SCHEDULE" == "true") ]]; then + elif [[ ("$IS_MAIN_PUSH" == "true" || "$IS_SCHEDULE" == "true") ]]; then cat > matrix.json < Date: Fri, 12 Dec 2025 22:50:35 -0600 Subject: [PATCH 27/29] Dummy commit From fab557f9637232edd1bbcf2528b90916cba34266 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Fri, 12 Dec 2025 22:59:47 -0600 Subject: [PATCH 28/29] DEBUG --- .github/workflows/set-matrix.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/set-matrix.yaml b/.github/workflows/set-matrix.yaml index 7cd3f52d5b..72c16c7788 100644 --- a/.github/workflows/set-matrix.yaml +++ b/.github/workflows/set-matrix.yaml @@ -53,18 +53,21 @@ jobs: # Decide which matrix entries to include based on event type # Runs ROCm only for push tag OR when PR label gets triggered if [[ "$IS_8GPU_TAG" == "true" || "$TRIGGERED_8GPU_LABEL" == "true" ]]; then + echo "Runs ROCm only for push tag OR when PR label gets triggered" cat > matrix.json < matrix.json < matrix.json < Date: Fri, 12 Dec 2025 23:31:18 -0600 Subject: [PATCH 29/29] Cleanup set-matrix.yaml. --- .github/workflows/set-matrix.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/set-matrix.yaml b/.github/workflows/set-matrix.yaml index 72c16c7788..5564d8d70b 100644 --- a/.github/workflows/set-matrix.yaml +++ b/.github/workflows/set-matrix.yaml @@ -16,7 +16,6 @@ jobs: # Event flags evaluated by github actions before the step runs: IS_MAIN_PUSH: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} IS_SCHEDULE: ${{ github.event_name == 'schedule' }} - IS_PR: ${{ github.event_name == 'pull_request' }} IS_8GPU_TAG: ${{ startsWith(github.ref, 'refs/tags/ciflow/8gpu/') }} TRIGGERED_8GPU_LABEL: ${{ github.event_name == 'pull_request' && github.event.action == 'labeled' }} @@ -46,28 +45,24 @@ jobs: # Use default value as 'false' for unset environment variables IS_MAIN_PUSH="${IS_MAIN_PUSH:-false}" IS_SCHEDULE="${IS_SCHEDULE:-false}" - IS_PR="${IS_PR:-false}" IS_8GPU_TAG="${IS_8GPU_TAG:-false}" TRIGGERED_8GPU_LABEL="${TRIGGERED_8GPU_LABEL:-false}" # Decide which matrix entries to include based on event type # Runs ROCm only for push tag OR when PR label gets triggered if [[ "$IS_8GPU_TAG" == "true" || "$TRIGGERED_8GPU_LABEL" == "true" ]]; then - echo "Runs ROCm only for push tag OR when PR label gets triggered" cat > matrix.json < matrix.json < matrix.json <