Skip to content

AICOMRCCL-708 fix rccl unit test failures on mi300a #22485

AICOMRCCL-708 fix rccl unit test failures on mi300a

AICOMRCCL-708 fix rccl unit test failures on mi300a #22485

Workflow file for this run

# Workflow-level configuration: triggers, default token permissions and
# run de-duplication for the TheRock CI pipeline.
name: TheRock CI

on:
  # Postsubmit: pushes to the integration branch and to release branches.
  push:
    branches: [develop, 'release/therock-*']
  # Presubmit: new pull requests, new commits on them, and re-opened ones.
  pull_request:
    types: [opened, synchronize, reopened]
  # Manual runs, optionally restricted to an explicit list of projects.
  workflow_dispatch:
    inputs:
      projects:
        description: >-
          Insert space-separated list of projects to test or 'all' to test all
          projects. ex: 'projects/clr projects/rocminfo'
        type: string

# Default token scope for every job; jobs that need more request it themselves.
permissions:
  contents: read

concurrency:
  # The group key is the workflow name followed by the PR number (for pull
  # requests) or the commit SHA (for everything else). A newer run with the
  # same key cancels queued and in-progress runs, so only the latest presubmit
  # per PR / postsubmit per commit keeps running. Prefixing with the workflow
  # name keeps different workflows from cancelling each other.
  group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
  cancel-in-progress: true

jobs:
setup:
  # Decides what the downstream jobs run. Publishes three outputs:
  #   linux_projects    - `projects` output of the Linux configure step
  #   windows_projects  - `projects` output of the Windows configure step
  #   run_linux_rccl_ci - 'true' when the diff against BASE_REF touches
  #                       projects/rccl/, otherwise 'false'
  name: "Setup"
  runs-on: ubuntu-24.04
  env:
    # The commit being checked out is the merge commit for a PR. Its first
    # parent will be the tip of the base branch.
    # NOTE(review): presumably also read by the scripts under .github/scripts;
    # confirm before renaming or removing.
    BASE_REF: HEAD^
  outputs:
    linux_projects: ${{ steps.linux_projects.outputs.projects }}
    windows_projects: ${{ steps.windows_projects.outputs.projects }}
    run_linux_rccl_ci: ${{ steps.rccl_check.outputs.run_linux_rccl_ci }}
  steps:
    - name: Checkout code
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      with:
        # Only the .github directory is materialised in the working tree.
        sparse-checkout: .github
        sparse-checkout-cone-mode: true
        # Two commits of history so that BASE_REF (HEAD^) can be resolved.
        fetch-depth: 2

    - name: Set up Python
      uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
      with:
        python-version: "3.12"

    - name: Install python dependencies
      run: |
        python -m pip install --upgrade pip
        pip install pydantic requests

    # Pull requests only. On other events this step is skipped and
    # `steps.detect.outputs.subtrees` evaluates to an empty string.
    - name: Detect changed subtrees
      id: detect
      if: github.event_name == 'pull_request'
      env:
        GH_TOKEN: ${{ github.token }}
      run: |
        python .github/scripts/pr_detect_changed_subtrees.py \
          --repo "${{ github.repository }}" \
          --pr "${{ github.event.pull_request.number }}" \
          --config ".github/repos-config.json"

    # The two configure steps run the same script and differ only in PLATFORM.
    # Each step's `projects` output feeds the matching job output above.
    - name: Determine Linux projects to run
      id: linux_projects
      env:
        SUBTREES: ${{ steps.detect.outputs.subtrees }}
        PROJECTS: ${{ inputs.projects }}
        PLATFORM: "linux"
      run: |
        python .github/scripts/therock_configure_ci.py

    - name: Determine Windows projects to run
      id: windows_projects
      env:
        SUBTREES: ${{ steps.detect.outputs.subtrees }}
        PROJECTS: ${{ inputs.projects }}
        PLATFORM: "windows"
      run: |
        python .github/scripts/therock_configure_ci.py

    - name: "Detect RCCL file changes"
      id: rccl_check
      shell: bash
      run: |
        # Diff once and reuse the result. Piping `git diff` straight into
        # `grep -q` can misreport under `pipefail` (enabled by `shell: bash`):
        # grep exits on the first match, and if git is then killed by SIGPIPE
        # the pipeline status is non-zero even though a match was found.
        changed_files="$(git diff --name-only "${BASE_REF}")"

        echo "Changed files:"
        printf '%s\n' "${changed_files}"

        if grep -q '^projects/rccl/' <<<"${changed_files}"; then
          echo "run_linux_rccl_ci=true" >> "$GITHUB_OUTPUT"
          echo "RCCL changes detected"
        else
          echo "run_linux_rccl_ci=false" >> "$GITHUB_OUTPUT"
          echo "No RCCL changes detected"
        fi
therock-ci-linux:
  # Generic Linux pipeline: calls the reusable Linux workflow once per entry
  # of the `linux_projects` matrix published by the setup job.
  name: Linux (${{ matrix.projects.projects_to_test }})
  needs: setup
  # Runs only when setup produced a non-empty Linux project list and did not
  # flag RCCL changes (those are handled by therock-rccl-ci-linux instead).
  if: >-
    ${{ needs.setup.outputs.linux_projects != '[]'
    && needs.setup.outputs.run_linux_rccl_ci != 'true' }}
  permissions:
    contents: read
    id-token: write
  strategy:
    # Let every matrix entry finish even if another one fails.
    fail-fast: false
    matrix:
      projects: ${{ fromJSON(needs.setup.outputs.linux_projects) }}
  uses: ./.github/workflows/therock-ci-linux.yml
  with:
    projects_to_test: ${{ matrix.projects.projects_to_test }}
    cmake_options: ${{ matrix.projects.cmake_options }}
  secrets: inherit
therock-rccl-ci-linux:
  # Dedicated RCCL pipeline: calls the reusable RCCL workflow once per GPU
  # family, and only when the setup job flagged changes under projects/rccl/.
  name: TheRock RCCL CI Linux
  needs: setup
  if: ${{ needs.setup.outputs.run_linux_rccl_ci == 'true' }}
  permissions:
    contents: read
    id-token: write
  strategy:
    # Let every GPU family finish even if another one fails.
    fail-fast: false
    matrix:
      amdgpu_family:
        - gfx94X-dcgpu
        - gfx950-dcgpu
  uses: ./.github/workflows/therock-rccl-ci-linux.yml
  with:
    # The matrix value is passed both as the target family and as the
    # artifact group.
    amdgpu_families: ${{ matrix.amdgpu_family }}
    artifact_group: ${{ matrix.amdgpu_family }}
    # Folded scalar: the flags below reach the called workflow as a single
    # space-separated line.
    extra_cmake_options: >
      -DTHEROCK_ENABLE_ALL=OFF
      -DTHEROCK_BUILD_TESTING=ON
      -DTHEROCK_BUNDLE_SYSDEPS=ON
      -DTHEROCK_ENABLE_COMM_LIBS=ON
      -DTHEROCK_ENABLE_ROCPROFV3=ON
      -DTHEROCK_ENABLE_MPI=ON
  secrets: inherit
therock-ci-windows:
  # Generic Windows pipeline: calls the reusable Windows workflow once per
  # entry of the `windows_projects` matrix published by the setup job.
  name: Windows (${{ matrix.projects.projects_to_test }})
  needs: setup
  # Runs only when setup produced a non-empty Windows project list and did
  # not flag RCCL changes.
  if: >-
    ${{ needs.setup.outputs.windows_projects != '[]'
    && needs.setup.outputs.run_linux_rccl_ci != 'true' }}
  permissions:
    contents: read
    id-token: write
  strategy:
    # Let every matrix entry finish even if another one fails.
    fail-fast: false
    matrix:
      projects: ${{ fromJSON(needs.setup.outputs.windows_projects) }}
  uses: ./.github/workflows/therock-ci-windows.yml
  with:
    projects_to_test: ${{ matrix.projects.projects_to_test }}
    cmake_options: ${{ matrix.projects.cmake_options }}
  secrets: inherit
therock_ci_summary:
  # Single aggregate status for the whole workflow: fails if any job listed
  # under `needs` finished with a result other than "success" or "skipped".
  name: TheRock CI Summary
  # Run even when upstream jobs failed, so the summary is always reported.
  if: always()
  needs:
    - setup
    - therock-ci-linux
    - therock-rccl-ci-linux
    - therock-ci-windows
  runs-on: ubuntu-24.04
  steps:
    - name: Output failed jobs
      env:
        # Hand the JSON to the script through the environment instead of
        # pasting it into the shell source: job outputs are part of this
        # object, and a single quote inside one of them would otherwise
        # terminate the quoted string and corrupt (or inject into) the script.
        NEEDS_JSON: ${{ toJson(needs) }}
      run: |
        printf '%s\n' "${NEEDS_JSON}"
        # Keep only the jobs whose result is neither "success" nor "skipped"
        # and join their ids with commas.
        FAILED_JOBS="$(jq --raw-output \
          'map_values(select(.result!="success" and .result!="skipped")) | keys | join(",")' \
          <<<"${NEEDS_JSON}")"
        if [[ "${FAILED_JOBS}" != "" ]]; then
          echo "The following jobs failed: ${FAILED_JOBS}"
          exit 1
        fi