Skip to content
Merged
Show file tree
Hide file tree
Changes from 32 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
4e4a0c6
add Dockerfile.rocm
tjtanaa Dec 11, 2025
8cac62d
Merge remote-tracking branch 'origin/main' into dockerfile-amd-2
tjtanaa Dec 11, 2025
2b7ff41
add dockerfile build instruction
tjtanaa Dec 11, 2025
b03d282
add preliminary CI files
tjtanaa Dec 11, 2025
d5c75c3
fix local error
tjtanaa Dec 11, 2025
e2c24b7
simplify amd test to just test build docker
tjtanaa Dec 11, 2025
46da880
use amd-cpu to build image like in vLLM
tjtanaa Dec 11, 2025
bb03847
apply review feedback
tjtanaa Dec 12, 2025
055090d
fix precommit
tjtanaa Dec 12, 2025
c3f1a06
Merge remote-tracking branch 'origin/main' into dockerfile-amd-2
tjtanaa Dec 12, 2025
e8374b5
test pushing CI docker
tjtanaa Dec 12, 2025
02e68e2
try using cpu_queue_premerge_us_east_1 to build image
tjtanaa Dec 12, 2025
57661bd
add preliminary script to run amd ci
tjtanaa Dec 15, 2025
ae8a0cc
Merge remote-tracking branch 'origin/main' into dockerfile-amd-2
tjtanaa Dec 15, 2025
89987f3
add change the working directory of vllm omni docker image in CI; add…
tjtanaa Dec 16, 2025
81c0dd8
Merge remote-tracking branch 'origin/main' into dockerfile-amd-2
tjtanaa Dec 16, 2025
19c3056
fix test path; add qwen25 omni
tjtanaa Dec 16, 2025
8ae3569
add necessary env flag for mi325 vllm 0.11.0
tjtanaa Dec 16, 2025
442dc44
fix get device; add qwen3-omni unit tests
tjtanaa Dec 16, 2025
3532ec7
fix the file pointed by qwen3 omni test
tjtanaa Dec 16, 2025
056fe9a
trying to fix aiter mi325x arch auto detection issue
tjtanaa Dec 17, 2025
7c689e3
fix the rocm qwen3 omni unit test
tjtanaa Dec 17, 2025
ce45b1f
remove qwen3 unit tests first; reuse AITER_ROCM_ARCH from base image
tjtanaa Dec 17, 2025
055de64
sync with upstream
tjtanaa Dec 17, 2025
5e9c4d3
remove print
tjtanaa Dec 17, 2025
d865d18
simplify more
tjtanaa Dec 17, 2025
5053c1b
keep the template small
tjtanaa Dec 17, 2025
c94d67e
remove unwanted print
tjtanaa Dec 18, 2025
5c104bb
remove qwen3 omni test related file for now
tjtanaa Dec 18, 2025
2ff27c6
Merge remote-tracking branch 'origin/main' into dockerfile-amd-2
tjtanaa Dec 19, 2025
e6d5b32
upgrade vllm version to 0.12.0 following main
tjtanaa Dec 19, 2025
ef7a50d
fix import error ModuleNotFoundError: No module named 'vllm.vllm_flas…
tjtanaa Dec 20, 2025
453ce5c
Merge remote-tracking branch 'origin/main' into dockerfile-amd-2
tjtanaa Dec 20, 2025
cc225dd
add forward_hip instead of sharing the same path with cuda
tjtanaa Dec 20, 2025
858f74e
revert forward_cuda
tjtanaa Dec 20, 2025
5695e2f
add forward hip dispatching logic
tjtanaa Dec 21, 2025
32233ae
try to do torch sync when destructing omni runner in tests
tjtanaa Dec 21, 2025
dd3e6db
revert the create_new_process_for_each_test for test_qwen25omni
tjtanaa Dec 21, 2025
fb1b1d8
fix create_new_process_for_each_test
tjtanaa Dec 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
238 changes: 238 additions & 0 deletions .buildkite/bootstrap-amd-omni.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
#!/bin/bash
# vllm-omni customized version
# Based on: https://github.com/vllm-project/ci-infra/blob/main/buildkite/bootstrap-amd.sh
# Last synced: 2025-12-15
# Modifications: Use local template file instead of downloading from ci-infra

set -euo pipefail

# Caller-tunable knobs; any unset (or empty) value falls back to its default.
RUN_ALL="${RUN_ALL:-0}"                          # 1 => run the full test matrix
NIGHTLY="${NIGHTLY:-0}"                          # 1 => include nightly steps
VLLM_CI_BRANCH="${VLLM_CI_BRANCH:-main}"         # passed through to the template
AMD_MIRROR_HW="${AMD_MIRROR_HW:-amdproduction}"  # AMD mirror hardware pool
DOCS_ONLY_DISABLE="${DOCS_ONLY_DISABLE:-0}"      # 1 => never take docs-only early exit

# Print "true" when the build should fail fast, "false" otherwise.
# Non-PR builds never fail fast; PR builds fail fast unless the PR carries
# the ci-no-fail-fast label.
# Globals read: BUILDKITE_PULL_REQUEST ("false" for non-PR builds).
fail_fast() {
  local disable_label="ci-no-fail-fast"
  if [ "$BUILDKITE_PULL_REQUEST" = "false" ]; then
    # Not a PR (or BUILDKITE_PULL_REQUEST not set): never fail fast.
    echo false
    return
  fi
  local pr_labels
  pr_labels=$(curl -s "https://api.github.com/repos/vllm-project/vllm-omni/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
  if [[ "$pr_labels" == *"$disable_label"* ]]; then
    echo false
  else
    echo true
  fi
}

# Print "true" when the PR carries the ready-run-all-tests label, else "false".
# Non-PR builds always print "false".
# Globals read: BUILDKITE_PULL_REQUEST ("false" for non-PR builds).
check_run_all_label() {
  local run_all_label="ready-run-all-tests"
  if [ "$BUILDKITE_PULL_REQUEST" = "false" ]; then
    # Not a PR (or BUILDKITE_PULL_REQUEST not set): label cannot apply.
    echo false
    return
  fi
  local pr_labels
  pr_labels=$(curl -s "https://api.github.com/repos/vllm-project/vllm-omni/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
  case "$pr_labels" in
    *"$run_all_label"*) echo true ;;
    *) echo false ;;
  esac
}

# Coverage collection is off unless explicitly enabled by the caller.
COV_ENABLED="${COV_ENABLED:-0}"

# Render .buildkite/test-amd.yaml through the Jinja template and upload the
# resulting pipeline to Buildkite.
# Globals read: BUILDKITE_PIPELINE_SLUG, BUILDKITE_BRANCH, LIST_FILE_DIFF,
#   RUN_ALL, NIGHTLY, AMD_MIRROR_HW, VLLM_USE_PRECOMPILED, COV_ENABLED,
#   VLLM_CI_BRANCH
# Never returns: always exits 0 on success (set -e aborts on a failed step).
upload_pipeline() {
  echo "Uploading pipeline..."
  # Install minijinja
  # Sanity check: the .buildkite directory must exist in this checkout.
  ls .buildkite || buildkite-agent annotate --style error 'Please merge upstream main branch for buildkite CI'
  curl -sSfL https://github.com/mitsuhiko/minijinja/releases/download/2.3.1/minijinja-cli-installer.sh | sh
  # The installer drops minijinja-cli under cargo's bin dir; pick up the PATH update.
  source /var/lib/buildkite-agent/.cargo/env

  # fastcheck builds run on the tentative hardware pool instead of production.
  if [[ $BUILDKITE_PIPELINE_SLUG == "fastcheck" ]]; then
    AMD_MIRROR_HW="amdtentative"
  fi

  # Use local template file for vllm-omni
  cp .buildkite/test-template-amd-omni.j2 .buildkite/test-template.j2


  # (WIP) Use pipeline generator instead of jinja template
  # If the generator script exists, it fully replaces the template path below.
  if [ -e ".buildkite/pipeline_generator/pipeline_generator.py" ]; then
    python -m pip install click pydantic
    python .buildkite/pipeline_generator/pipeline_generator.py --run_all=$RUN_ALL --list_file_diff="$LIST_FILE_DIFF" --nightly="$NIGHTLY" --mirror_hw="$AMD_MIRROR_HW"
    buildkite-agent pipeline upload .buildkite/pipeline.yaml
    exit 0
  fi
  echo "List file diff: $LIST_FILE_DIFF"
  echo "Run all: $RUN_ALL"
  echo "Nightly: $NIGHTLY"
  echo "AMD Mirror HW: $AMD_MIRROR_HW"

  FAIL_FAST=$(fail_fast)

  cd .buildkite
  # Subshell keeps `set -x` tracing scoped to the render step only.
  (
    set -x
    # Output pipeline.yaml with all blank lines removed
    minijinja-cli test-template.j2 test-amd.yaml \
      -D branch="$BUILDKITE_BRANCH" \
      -D list_file_diff="$LIST_FILE_DIFF" \
      -D run_all="$RUN_ALL" \
      -D nightly="$NIGHTLY" \
      -D mirror_hw="$AMD_MIRROR_HW" \
      -D fail_fast="$FAIL_FAST" \
      -D vllm_use_precompiled="$VLLM_USE_PRECOMPILED" \
      -D vllm_merge_base_commit="$(git merge-base origin/main HEAD)" \
      -D cov_enabled="$COV_ENABLED" \
      -D vllm_ci_branch="$VLLM_CI_BRANCH" \
      | sed '/^[[:space:]]*$/d' \
      > pipeline.yaml
  )
  cat pipeline.yaml
  # Keep the rendered pipeline as a build artifact for debugging, then upload it.
  buildkite-agent artifact upload pipeline.yaml
  buildkite-agent pipeline upload pipeline.yaml
  exit 0
}

# Emit the names of files changed since the merge base with origin/main,
# flattened onto one space-separated line (callers split on spaces / re-join
# with '|').
# Side effect: stages the whole worktree so new untracked files appear.
get_diff() {
  # Fix: the original wrapped this in $(...), which ran git-add's (empty)
  # stdout as a command instead of simply executing `git add .`.
  git add .
  # shellcheck disable=SC2046,SC2005 -- unquoted echo intentionally collapses
  # the newline-separated diff output into a single space-separated line.
  echo $(git diff --name-only --diff-filter=ACMDR $(git merge-base origin/main HEAD))
}

# Emit the names of files changed by the last commit (used on the main
# branch), flattened onto one space-separated line.
# Side effect: stages the whole worktree so new untracked files appear.
get_diff_main() {
  # Fix: the original wrapped this in $(...), which ran git-add's (empty)
  # stdout as a command instead of simply executing `git add .`.
  git add .
  # shellcheck disable=SC2046,SC2005 -- unquoted echo intentionally collapses
  # the newline-separated diff output into a single space-separated line.
  echo $(git diff --name-only --diff-filter=ACMDR HEAD~1)
}

# Changed-file list: merge-base diff on branches, last-commit diff on main.
file_diff=$(get_diff)
case "$BUILDKITE_BRANCH" in
  main) file_diff=$(get_diff_main) ;;
esac

# ----------------------------------------------------------------------
# Early exit start: skip pipeline if conditions are met
# ----------------------------------------------------------------------

# skip pipeline if all changed files are under docs/
if [[ "${DOCS_ONLY_DISABLE}" != "1" ]]; then
if [[ -n "${file_diff:-}" ]]; then
docs_only=1
# Robust iteration over newline-separated file_diff
while IFS= read -r f; do
[[ -z "$f" ]] && continue
# **Policy:** only skip if *every* path starts with docs/
if [[ "$f" != docs/* ]]; then
docs_only=0
break
fi
done < <(printf '%s\n' "$file_diff" | tr ' ' '\n' | tr -d '\r')

if [[ "$docs_only" -eq 1 ]]; then
buildkite-agent annotate ":memo: CI skipped — docs/** only changes detected

\`\`\`
${file_diff}
\`\`\`" --style "info" || true
echo "[docs-only] All changes are under docs/. Exiting before pipeline upload."
exit 0
fi
fi
fi

# ----------------------------------------------------------------------
# Early exit end
# ----------------------------------------------------------------------

# Prefix patterns for build-critical paths: a change to any of these
# invalidates precompiled wheels and triggers the full test matrix
# (see the matching loop below).
patterns=(
  "docker/Dockerfile"
  "CMakeLists.txt"
  "requirements/common.txt"
  "requirements/cuda.txt"
  "requirements/build.txt"
  "requirements/test.txt"
  "setup.py"
  "csrc/"
  "cmake/"
)

# Exceptions within the patterns above: platform-specific files (CPU/ROCm
# variants, hipify helpers) that on their own do not trigger a full run.
ignore_patterns=(
  "docker/Dockerfile."
  "csrc/cpu"
  "csrc/rocm"
  "cmake/hipify.py"
  "cmake/cpu_extension.cmake"
)

# Force the full test suite when any changed file hits a build-critical
# pattern that is not covered by an ignore pattern.
for file in $file_diff; do
  matched=0
  for pattern in "${patterns[@]}"; do
    if [[ $file == "$pattern"* || $file == "$pattern" ]]; then
      matched=1
      break
    fi
  done

  # Files outside every critical pattern cannot trigger a full run.
  if [[ "$matched" -eq 0 ]]; then
    continue
  fi

  ignored=0
  for ignore in "${ignore_patterns[@]}"; do
    if [[ $file == "$ignore"* || $file == "$ignore" ]]; then
      ignored=1
      break
    fi
  done

  if [[ "$ignored" -eq 0 ]]; then
    RUN_ALL=1
    echo "Found changes: $file. Run all tests"
    break
  fi
done

# A ready-run-all-tests label on the PR forces the complete suite,
# including the nightly-only entries.
LABEL_RUN_ALL=$(check_run_all_label)
if [[ "$LABEL_RUN_ALL" == "true" ]]; then
  RUN_ALL=1
  NIGHTLY=1
  echo "Found 'ready-run-all-tests' label. Running all tests including optional tests."
fi

# Decide whether to use precompiled wheels
# Relies on existing patterns array as a basis.
if [[ -n "${VLLM_USE_PRECOMPILED:-}" ]]; then
  # Caller pinned the choice explicitly; leave it alone.
  echo "VLLM_USE_PRECOMPILED is already set to: $VLLM_USE_PRECOMPILED"
else
  if [[ "$RUN_ALL" -eq 1 ]]; then
    export VLLM_USE_PRECOMPILED=0
    echo "Detected critical changes, building wheels from source"
  else
    export VLLM_USE_PRECOMPILED=1
    echo "No critical changes, using precompiled wheels"
  fi
fi


# Re-derive the diff as a '|'-separated list for the template, then upload.
LIST_FILE_DIFF=$(get_diff | tr ' ' '|')
case "$BUILDKITE_BRANCH" in
  main) LIST_FILE_DIFF=$(get_diff_main | tr ' ' '|') ;;
esac
upload_pipeline
152 changes: 152 additions & 0 deletions .buildkite/scripts/hardware_ci/run-amd-test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
#!/bin/bash
# vllm-omni customized version
# Based on: vllm/.buildkite/scripts/hardware_ci/run-amd-test.sh
# Last synced: 2025-12-15
# Modifications: docker image name for vllm-omni

# This script runs test inside the corresponding ROCm docker container.
set -o pipefail

# Export Python path
export PYTHONPATH=".."

# Print ROCm version
echo "--- Confirming Clean Initial State"
# Block until the host agent marks the GPUs as clean in its state file.
# NOTE(review): no timeout here — a node stuck in a dirty state hangs the
# job indefinitely; confirm the agent always restores "clean".
while true; do
  sleep 3
  if grep -q clean /opt/amdgpu/etc/gpu_state; then
    echo "GPUs state is \"clean\""
    break
  fi
done

echo "--- ROCm info"
rocminfo

# cleanup older docker images
# Prune Docker images and volumes when the filesystem backing Docker's root
# directory is more than 70% full; otherwise leave the cache untouched.
# Exits 1 if the Docker root directory cannot be determined.
cleanup_docker() {
  local docker_root usage
  local threshold=70

  docker_root=$(docker info -f '{{.DockerRootDir}}')
  if [ -z "$docker_root" ]; then
    echo "Failed to determine Docker root directory."
    exit 1
  fi
  echo "Docker root directory: $docker_root"

  # Percentage used on the filesystem that holds Docker's root directory.
  usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')

  if [ "$usage" -le "$threshold" ]; then
    echo "Disk usage is below $threshold%. No cleanup needed."
    return
  fi

  echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
  # Remove dangling images (those that are not tagged and not used by any container)
  docker image prune -f
  # Remove unused volumes / force the system prune for old images as well.
  docker volume prune -f && docker system prune --force --filter "until=72h" --all
  echo "Docker images and volumes cleanup completed."
}

# Call the cleanup docker function
cleanup_docker

echo "--- Resetting GPUs"

# Request a GPU reset via the host state file, then block until the host
# tooling reports a clean state again.
echo "reset" > /opt/amdgpu/etc/gpu_state

# NOTE(review): same untimed busy-wait as the initial check — a failed reset
# hangs the job here.
while true; do
  sleep 3
  if grep -q clean /opt/amdgpu/etc/gpu_state; then
    echo "GPUs state is \"clean\""
    break
  fi
done

echo "--- Pulling container"
# Image tag is produced by the build step for this commit ("-rocm-omni" suffix).
image_name="public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:${BUILDKITE_COMMIT}-rocm-omni"
# Random alphanumeric suffix keeps container names unique across retries.
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull "${image_name}"

# Exit handler: best-effort cleanup of the test container; if removing the
# container fails (e.g. it was never created), fall back to removing the
# pulled image. Always succeeds so the trap never masks the real exit code.
# NOTE(review): in the sharded path the containers are named
# "${container_name}_<gpu>" and run with --rm, so this trap only covers the
# single-container path — confirm that is intended.
remove_docker_container() {
  docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
}
trap remove_docker_container EXIT

echo "--- Running container"

# Host-side HuggingFace cache, bind-mounted into every container below.
HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"

# Join all script arguments into the single command string executed inside
# the container. Fix: the original `commands=$@` assigned an array expansion
# to a scalar (ShellCheck SC2124); `"$*"` is the explicit, equivalent join.
commands="$*"
echo "Commands:$commands"

# Up to 8 shards run in parallel, one per GPU on the host.
PARALLEL_JOB_COUNT=8
MYPYTHONPATH=".."

# Test that we're launching on the machine that has
# proper access to GPUs: ROCm device nodes are owned by the 'render' group,
# whose gid must be passed to docker via --group-add.
render_gid=$(getent group render | cut -d: -f3)
if [[ -z "$render_gid" ]]; then
  echo "Error: 'render' group not found. This is required for GPU access." >&2
  exit 1
fi

# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
  # assign job count as the number of shards used
  # (sed rewrites --num-shards=N to the host's parallel job count; the second
  # sed collapses leftover line-continuation artifacts " \ " in the string).
  commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
    # assign shard-id for each shard
    commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
    echo "Shard ${GPU} commands:$commands_gpu"
    echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
    # One container per shard, pinned to a single GPU via HIP_VISIBLE_DEVICES.
    # Output is prefixed per shard; the pipeline runs in the background and
    # its PID is collected so all shards can be awaited below.
    docker run \
      --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
      --network=host \
      --shm-size=16gb \
      --group-add "$render_gid" \
      --rm \
      -e HIP_VISIBLE_DEVICES="${GPU}" \
      -e HF_TOKEN \
      -e AWS_ACCESS_KEY_ID \
      -e AWS_SECRET_ACCESS_KEY \
      -v "${HF_CACHE}:${HF_MOUNT}" \
      -e "HF_HOME=${HF_MOUNT}" \
      -e "PYTHONPATH=${MYPYTHONPATH}" \
      --name "${container_name}_${GPU}" \
      "${image_name}" \
      /bin/bash -c "${commands_gpu}" \
      |& while read -r line; do echo ">>Shard $GPU: $line"; done &
    PIDS+=($!)
  done
  #wait for all processes to finish and collect exit codes
  for pid in "${PIDS[@]}"; do
    wait "${pid}"
    STATUS+=($?)
  done
  # Propagate the first non-zero shard status as the job's exit code.
  for st in "${STATUS[@]}"; do
    if [[ ${st} -ne 0 ]]; then
      echo "One of the processes failed with $st"
      exit "${st}"
    fi
  done
else
  # Unsharded command: run once with access to all GPUs.
  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
  docker run \
    --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
    --network=host \
    --shm-size=16gb \
    --group-add "$render_gid" \
    --rm \
    -e HF_TOKEN \
    -e AWS_ACCESS_KEY_ID \
    -e AWS_SECRET_ACCESS_KEY \
    -v "${HF_CACHE}:${HF_MOUNT}" \
    -e "HF_HOME=${HF_MOUNT}" \
    -e "PYTHONPATH=${MYPYTHONPATH}" \
    --name "${container_name}" \
    "${image_name}" \
    /bin/bash -c "${commands}"
fi
Loading