Skip to content
Merged
Show file tree
Hide file tree
Changes from 32 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
4e4a0c6
add Dockerfile.rocm
tjtanaa Dec 11, 2025
8cac62d
Merge remote-tracking branch 'origin/main' into dockerfile-amd-2
tjtanaa Dec 11, 2025
2b7ff41
add dockerfile build instruction
tjtanaa Dec 11, 2025
b03d282
add preliminary CI files
tjtanaa Dec 11, 2025
d5c75c3
fix local error
tjtanaa Dec 11, 2025
e2c24b7
simplify amd test to just test build docker
tjtanaa Dec 11, 2025
46da880
use amd-cpu to build image like in vLLM
tjtanaa Dec 11, 2025
bb03847
apply review feedback
tjtanaa Dec 12, 2025
055090d
fix precommit
tjtanaa Dec 12, 2025
c3f1a06
Merge remote-tracking branch 'origin/main' into dockerfile-amd-2
tjtanaa Dec 12, 2025
e8374b5
test pushing CI docker
tjtanaa Dec 12, 2025
02e68e2
try using cpu_queue_premerge_us_east_1 to build image
tjtanaa Dec 12, 2025
57661bd
add preliminary script to run amd ci
tjtanaa Dec 15, 2025
ae8a0cc
Merge remote-tracking branch 'origin/main' into dockerfile-amd-2
tjtanaa Dec 15, 2025
89987f3
add change the working directory of vllm omni docker image in CI; add…
tjtanaa Dec 16, 2025
81c0dd8
Merge remote-tracking branch 'origin/main' into dockerfile-amd-2
tjtanaa Dec 16, 2025
19c3056
fix test path; add qwen25 omni
tjtanaa Dec 16, 2025
8ae3569
add necessary env flag for mi325 vllm 0.11.0
tjtanaa Dec 16, 2025
442dc44
fix get device; add qwen3-omni unit tests
tjtanaa Dec 16, 2025
3532ec7
fix the file pointed by qwen3 omni test
tjtanaa Dec 16, 2025
056fe9a
trying to fix aiter mi325x arch auto detection issue
tjtanaa Dec 17, 2025
7c689e3
fix the rocm qwen3 omni unit test
tjtanaa Dec 17, 2025
ce45b1f
remove qwen3 unit tests first; reuse AITER_ROCM_ARCH from base image
tjtanaa Dec 17, 2025
055de64
sync with upstream
tjtanaa Dec 17, 2025
5e9c4d3
remove print
tjtanaa Dec 17, 2025
d865d18
simplify more
tjtanaa Dec 17, 2025
5053c1b
keep the template small
tjtanaa Dec 17, 2025
c94d67e
remove unwanted print
tjtanaa Dec 18, 2025
5c104bb
remove qwen3 omni test related file for now
tjtanaa Dec 18, 2025
2ff27c6
Merge remote-tracking branch 'origin/main' into dockerfile-amd-2
tjtanaa Dec 19, 2025
e6d5b32
upgrade vllm version to 0.12.0 following main
tjtanaa Dec 19, 2025
ef7a50d
fix import error ModuleNotFoundError: No module named 'vllm.vllm_flas…
tjtanaa Dec 20, 2025
453ce5c
Merge remote-tracking branch 'origin/main' into dockerfile-amd-2
tjtanaa Dec 20, 2025
cc225dd
add forward_hip instead of sharing the same path with cuda
tjtanaa Dec 20, 2025
858f74e
revert forward_cuda
tjtanaa Dec 20, 2025
5695e2f
add forward hip dispatching logic
tjtanaa Dec 21, 2025
32233ae
try to do torch sync when destructing omni runner in tests
tjtanaa Dec 21, 2025
dd3e6db
revert the create_new_process_for_each_test for test_qwen25omni
tjtanaa Dec 21, 2025
fb1b1d8
fix create_new_process_for_each_test
tjtanaa Dec 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
238 changes: 238 additions & 0 deletions .buildkite/bootstrap-amd-omni.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
#!/bin/bash
# vllm-omni customized version
# Based on: https://github.com/vllm-project/ci-infra/blob/main/buildkite/bootstrap-amd.sh
# Last synced: 2025-12-15
# Modifications: Use local template file instead of downloading from ci-infra

set -euo pipefail

# Caller-tunable knobs; any unset (or empty) value falls back to its default.
RUN_ALL="${RUN_ALL:-0}"                          # 1 => run the full test matrix
NIGHTLY="${NIGHTLY:-0}"                          # 1 => include nightly steps
VLLM_CI_BRANCH="${VLLM_CI_BRANCH:-main}"         # passed through to the template
AMD_MIRROR_HW="${AMD_MIRROR_HW:-amdproduction}"  # AMD mirror hardware pool
DOCS_ONLY_DISABLE="${DOCS_ONLY_DISABLE:-0}"      # 1 => never take docs-only early exit

# Print "true" when the build should fail fast, "false" otherwise.
# Non-PR builds never fail fast; PR builds fail fast unless the PR carries
# the ci-no-fail-fast label.
# Globals read: BUILDKITE_PULL_REQUEST ("false" for non-PR builds).
fail_fast() {
  local disable_label="ci-no-fail-fast"
  if [ "$BUILDKITE_PULL_REQUEST" = "false" ]; then
    # Not a PR (or BUILDKITE_PULL_REQUEST not set): never fail fast.
    echo false
    return
  fi
  local pr_labels
  pr_labels=$(curl -s "https://api.github.com/repos/vllm-project/vllm-omni/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
  if [[ "$pr_labels" == *"$disable_label"* ]]; then
    echo false
  else
    echo true
  fi
}

# Print "true" when the PR carries the ready-run-all-tests label, else "false".
# Non-PR builds always print "false".
# Globals read: BUILDKITE_PULL_REQUEST ("false" for non-PR builds).
check_run_all_label() {
  local run_all_label="ready-run-all-tests"
  if [ "$BUILDKITE_PULL_REQUEST" = "false" ]; then
    # Not a PR (or BUILDKITE_PULL_REQUEST not set): label cannot apply.
    echo false
    return
  fi
  local pr_labels
  pr_labels=$(curl -s "https://api.github.com/repos/vllm-project/vllm-omni/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
  case "$pr_labels" in
    *"$run_all_label"*) echo true ;;
    *) echo false ;;
  esac
}

# Coverage collection is off unless explicitly enabled by the caller.
COV_ENABLED="${COV_ENABLED:-0}"

# Render .buildkite/test-amd.yaml through the Jinja template and upload the
# resulting pipeline to Buildkite.
# Globals read: BUILDKITE_PIPELINE_SLUG, BUILDKITE_BRANCH, LIST_FILE_DIFF,
#   RUN_ALL, NIGHTLY, AMD_MIRROR_HW, VLLM_USE_PRECOMPILED, COV_ENABLED,
#   VLLM_CI_BRANCH
# Never returns: always exits 0 on success (set -e aborts on a failed step).
upload_pipeline() {
  echo "Uploading pipeline..."
  # Install minijinja
  # Sanity check: the .buildkite directory must exist in this checkout.
  ls .buildkite || buildkite-agent annotate --style error 'Please merge upstream main branch for buildkite CI'
  curl -sSfL https://github.com/mitsuhiko/minijinja/releases/download/2.3.1/minijinja-cli-installer.sh | sh
  # The installer drops minijinja-cli under cargo's bin dir; pick up the PATH update.
  source /var/lib/buildkite-agent/.cargo/env

  # fastcheck builds run on the tentative hardware pool instead of production.
  if [[ $BUILDKITE_PIPELINE_SLUG == "fastcheck" ]]; then
    AMD_MIRROR_HW="amdtentative"
  fi

  # Use local template file for vllm-omni
  cp .buildkite/test-template-amd-omni.j2 .buildkite/test-template.j2


  # (WIP) Use pipeline generator instead of jinja template
  # If the generator script exists, it fully replaces the template path below.
  if [ -e ".buildkite/pipeline_generator/pipeline_generator.py" ]; then
    python -m pip install click pydantic
    python .buildkite/pipeline_generator/pipeline_generator.py --run_all=$RUN_ALL --list_file_diff="$LIST_FILE_DIFF" --nightly="$NIGHTLY" --mirror_hw="$AMD_MIRROR_HW"
    buildkite-agent pipeline upload .buildkite/pipeline.yaml
    exit 0
  fi
  echo "List file diff: $LIST_FILE_DIFF"
  echo "Run all: $RUN_ALL"
  echo "Nightly: $NIGHTLY"
  echo "AMD Mirror HW: $AMD_MIRROR_HW"

  FAIL_FAST=$(fail_fast)

  cd .buildkite
  # Subshell keeps `set -x` tracing scoped to the render step only.
  (
    set -x
    # Output pipeline.yaml with all blank lines removed
    minijinja-cli test-template.j2 test-amd.yaml \
      -D branch="$BUILDKITE_BRANCH" \
      -D list_file_diff="$LIST_FILE_DIFF" \
      -D run_all="$RUN_ALL" \
      -D nightly="$NIGHTLY" \
      -D mirror_hw="$AMD_MIRROR_HW" \
      -D fail_fast="$FAIL_FAST" \
      -D vllm_use_precompiled="$VLLM_USE_PRECOMPILED" \
      -D vllm_merge_base_commit="$(git merge-base origin/main HEAD)" \
      -D cov_enabled="$COV_ENABLED" \
      -D vllm_ci_branch="$VLLM_CI_BRANCH" \
      | sed '/^[[:space:]]*$/d' \
      > pipeline.yaml
  )
  cat pipeline.yaml
  # Keep the rendered pipeline as a build artifact for debugging, then upload it.
  buildkite-agent artifact upload pipeline.yaml
  buildkite-agent pipeline upload pipeline.yaml
  exit 0
}

# Emit the names of files changed since the merge base with origin/main,
# flattened onto one space-separated line (callers split on spaces / re-join
# with '|').
# Side effect: stages the whole worktree so new untracked files appear.
get_diff() {
  # Fix: the original wrapped this in $(...), which ran git-add's (empty)
  # stdout as a command instead of simply executing `git add .`.
  git add .
  # shellcheck disable=SC2046,SC2005 -- unquoted echo intentionally collapses
  # the newline-separated diff output into a single space-separated line.
  echo $(git diff --name-only --diff-filter=ACMDR $(git merge-base origin/main HEAD))
}

# Emit the names of files changed by the last commit (used on the main
# branch), flattened onto one space-separated line.
# Side effect: stages the whole worktree so new untracked files appear.
get_diff_main() {
  # Fix: the original wrapped this in $(...), which ran git-add's (empty)
  # stdout as a command instead of simply executing `git add .`.
  git add .
  # shellcheck disable=SC2046,SC2005 -- unquoted echo intentionally collapses
  # the newline-separated diff output into a single space-separated line.
  echo $(git diff --name-only --diff-filter=ACMDR HEAD~1)
}

# Changed-file list: merge-base diff on branches, last-commit diff on main.
file_diff=$(get_diff)
case "$BUILDKITE_BRANCH" in
  main) file_diff=$(get_diff_main) ;;
esac

# ----------------------------------------------------------------------
# Early exit start: skip pipeline if conditions are met
# ----------------------------------------------------------------------

# skip pipeline if all changed files are under docs/
if [[ "${DOCS_ONLY_DISABLE}" != "1" ]]; then
if [[ -n "${file_diff:-}" ]]; then
docs_only=1
# Robust iteration over newline-separated file_diff
while IFS= read -r f; do
[[ -z "$f" ]] && continue
# **Policy:** only skip if *every* path starts with docs/
if [[ "$f" != docs/* ]]; then
docs_only=0
break
fi
done < <(printf '%s\n' "$file_diff" | tr ' ' '\n' | tr -d '\r')

if [[ "$docs_only" -eq 1 ]]; then
buildkite-agent annotate ":memo: CI skipped — docs/** only changes detected

\`\`\`
${file_diff}
\`\`\`" --style "info" || true
echo "[docs-only] All changes are under docs/. Exiting before pipeline upload."
exit 0
fi
fi
fi

# ----------------------------------------------------------------------
# Early exit end
# ----------------------------------------------------------------------

# Prefix patterns for build-critical paths: a change to any of these
# invalidates precompiled wheels and triggers the full test matrix
# (see the matching loop below).
patterns=(
  "docker/Dockerfile"
  "CMakeLists.txt"
  "requirements/common.txt"
  "requirements/cuda.txt"
  "requirements/build.txt"
  "requirements/test.txt"
  "setup.py"
  "csrc/"
  "cmake/"
)

# Exceptions within the patterns above: platform-specific files (CPU/ROCm
# variants, hipify helpers) that on their own do not trigger a full run.
ignore_patterns=(
  "docker/Dockerfile."
  "csrc/cpu"
  "csrc/rocm"
  "cmake/hipify.py"
  "cmake/cpu_extension.cmake"
)

# Force the full test suite when any changed file hits a build-critical
# pattern that is not covered by an ignore pattern.
for file in $file_diff; do
  matched=0
  for pattern in "${patterns[@]}"; do
    if [[ $file == "$pattern"* || $file == "$pattern" ]]; then
      matched=1
      break
    fi
  done

  # Files outside every critical pattern cannot trigger a full run.
  if [[ "$matched" -eq 0 ]]; then
    continue
  fi

  ignored=0
  for ignore in "${ignore_patterns[@]}"; do
    if [[ $file == "$ignore"* || $file == "$ignore" ]]; then
      ignored=1
      break
    fi
  done

  if [[ "$ignored" -eq 0 ]]; then
    RUN_ALL=1
    echo "Found changes: $file. Run all tests"
    break
  fi
done

# A ready-run-all-tests label on the PR forces the complete suite,
# including the nightly-only entries.
LABEL_RUN_ALL=$(check_run_all_label)
if [[ "$LABEL_RUN_ALL" == "true" ]]; then
  RUN_ALL=1
  NIGHTLY=1
  echo "Found 'ready-run-all-tests' label. Running all tests including optional tests."
fi

# Decide whether to use precompiled wheels
# Relies on existing patterns array as a basis.
if [[ -n "${VLLM_USE_PRECOMPILED:-}" ]]; then
  # Caller pinned the choice explicitly; leave it alone.
  echo "VLLM_USE_PRECOMPILED is already set to: $VLLM_USE_PRECOMPILED"
else
  if [[ "$RUN_ALL" -eq 1 ]]; then
    export VLLM_USE_PRECOMPILED=0
    echo "Detected critical changes, building wheels from source"
  else
    export VLLM_USE_PRECOMPILED=1
    echo "No critical changes, using precompiled wheels"
  fi
fi


# Re-derive the diff as a '|'-separated list for the template, then upload.
LIST_FILE_DIFF=$(get_diff | tr ' ' '|')
case "$BUILDKITE_BRANCH" in
  main) LIST_FILE_DIFF=$(get_diff_main | tr ' ' '|') ;;
esac
upload_pipeline
152 changes: 152 additions & 0 deletions .buildkite/scripts/hardware_ci/run-amd-test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
#!/bin/bash
# vllm-omni customized version
# Based on: vllm/.buildkite/scripts/hardware_ci/run-amd-test.sh
# Last synced: 2025-12-15
# Modifications: docker image name for vllm-omni

# This script runs test inside the corresponding ROCm docker container.
set -o pipefail

# Export Python path
export PYTHONPATH=".."

# Print ROCm version
echo "--- Confirming Clean Initial State"
# Block until the host agent marks the GPUs as clean in its state file.
# NOTE(review): no timeout here — a node stuck in a dirty state hangs the
# job indefinitely; confirm the agent always restores "clean".
while true; do
  sleep 3
  if grep -q clean /opt/amdgpu/etc/gpu_state; then
    echo "GPUs state is \"clean\""
    break
  fi
done

echo "--- ROCm info"
rocminfo

# cleanup older docker images
# Prune Docker images and volumes when the filesystem backing Docker's root
# directory is more than 70% full; otherwise leave the cache untouched.
# Exits 1 if the Docker root directory cannot be determined.
cleanup_docker() {
  local docker_root usage
  local threshold=70

  docker_root=$(docker info -f '{{.DockerRootDir}}')
  if [ -z "$docker_root" ]; then
    echo "Failed to determine Docker root directory."
    exit 1
  fi
  echo "Docker root directory: $docker_root"

  # Percentage used on the filesystem that holds Docker's root directory.
  usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')

  if [ "$usage" -le "$threshold" ]; then
    echo "Disk usage is below $threshold%. No cleanup needed."
    return
  fi

  echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
  # Remove dangling images (those that are not tagged and not used by any container)
  docker image prune -f
  # Remove unused volumes / force the system prune for old images as well.
  docker volume prune -f && docker system prune --force --filter "until=72h" --all
  echo "Docker images and volumes cleanup completed."
}

# Call the cleanup docker function
cleanup_docker

echo "--- Resetting GPUs"

# Request a GPU reset via the host state file, then block until the host
# tooling reports a clean state again.
echo "reset" > /opt/amdgpu/etc/gpu_state

# NOTE(review): same untimed busy-wait as the initial check — a failed reset
# hangs the job here.
while true; do
  sleep 3
  if grep -q clean /opt/amdgpu/etc/gpu_state; then
    echo "GPUs state is \"clean\""
    break
  fi
done

echo "--- Pulling container"
# Image tag is produced by the build step for this commit ("-rocm-omni" suffix).
image_name="public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:${BUILDKITE_COMMIT}-rocm-omni"
# Random alphanumeric suffix keeps container names unique across retries.
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull "${image_name}"

# Exit handler: best-effort cleanup of the test container; if removing the
# container fails (e.g. it was never created), fall back to removing the
# pulled image. Always succeeds so the trap never masks the real exit code.
# NOTE(review): in the sharded path the containers are named
# "${container_name}_<gpu>" and run with --rm, so this trap only covers the
# single-container path — confirm that is intended.
remove_docker_container() {
  docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
}
trap remove_docker_container EXIT

echo "--- Running container"

# Host-side HuggingFace cache, bind-mounted into every container below.
HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"

# Join all script arguments into the single command string executed inside
# the container. Fix: the original `commands=$@` assigned an array expansion
# to a scalar (ShellCheck SC2124); `"$*"` is the explicit, equivalent join.
commands="$*"
echo "Commands:$commands"

# Up to 8 shards run in parallel, one per GPU on the host.
PARALLEL_JOB_COUNT=8
MYPYTHONPATH=".."

# Test that we're launching on the machine that has
# proper access to GPUs: ROCm device nodes are owned by the 'render' group,
# whose gid must be passed to docker via --group-add.
render_gid=$(getent group render | cut -d: -f3)
if [[ -z "$render_gid" ]]; then
  echo "Error: 'render' group not found. This is required for GPU access." >&2
  exit 1
fi

# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
  # assign job count as the number of shards used
  # (sed rewrites --num-shards=N to the host's parallel job count; the second
  # sed collapses leftover line-continuation artifacts " \ " in the string).
  commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
    # assign shard-id for each shard
    commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
    echo "Shard ${GPU} commands:$commands_gpu"
    echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
    # One container per shard, pinned to a single GPU via HIP_VISIBLE_DEVICES.
    # Output is prefixed per shard; the pipeline runs in the background and
    # its PID is collected so all shards can be awaited below.
    docker run \
      --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
      --network=host \
      --shm-size=16gb \
      --group-add "$render_gid" \
      --rm \
      -e HIP_VISIBLE_DEVICES="${GPU}" \
      -e HF_TOKEN \
      -e AWS_ACCESS_KEY_ID \
      -e AWS_SECRET_ACCESS_KEY \
      -v "${HF_CACHE}:${HF_MOUNT}" \
      -e "HF_HOME=${HF_MOUNT}" \
      -e "PYTHONPATH=${MYPYTHONPATH}" \
      --name "${container_name}_${GPU}" \
      "${image_name}" \
      /bin/bash -c "${commands_gpu}" \
      |& while read -r line; do echo ">>Shard $GPU: $line"; done &
    PIDS+=($!)
  done
  #wait for all processes to finish and collect exit codes
  for pid in "${PIDS[@]}"; do
    wait "${pid}"
    STATUS+=($?)
  done
  # Propagate the first non-zero shard status as the job's exit code.
  for st in "${STATUS[@]}"; do
    if [[ ${st} -ne 0 ]]; then
      echo "One of the processes failed with $st"
      exit "${st}"
    fi
  done
else
  # Unsharded command: run once with access to all GPUs.
  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
  docker run \
    --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
    --network=host \
    --shm-size=16gb \
    --group-add "$render_gid" \
    --rm \
    -e HF_TOKEN \
    -e AWS_ACCESS_KEY_ID \
    -e AWS_SECRET_ACCESS_KEY \
    -v "${HF_CACHE}:${HF_MOUNT}" \
    -e "HF_HOME=${HF_MOUNT}" \
    -e "PYTHONPATH=${MYPYTHONPATH}" \
    --name "${container_name}" \
    "${image_name}" \
    /bin/bash -c "${commands}"
fi
Loading