diff --git a/.buildkite/bootstrap-amd-omni.sh b/.buildkite/bootstrap-amd-omni.sh
new file mode 100755
index 0000000000..a38b762201
--- /dev/null
+++ b/.buildkite/bootstrap-amd-omni.sh
@@ -0,0 +1,251 @@
+#!/bin/bash
+# vllm-omni customized version
+# Based on: https://github.com/vllm-project/ci-infra/blob/main/buildkite/bootstrap-amd.sh
+# Last synced: 2025-12-15
+# Modifications: Use local template file instead of downloading from ci-infra
+
+set -euo pipefail
+
+if [[ -z "${RUN_ALL:-}" ]]; then
+    RUN_ALL=0
+fi
+
+if [[ -z "${NIGHTLY:-}" ]]; then
+    NIGHTLY=0
+fi
+
+if [[ -z "${VLLM_CI_BRANCH:-}" ]]; then
+    VLLM_CI_BRANCH="main"
+fi
+
+if [[ -z "${AMD_MIRROR_HW:-}" ]]; then
+    AMD_MIRROR_HW="amdproduction"
+fi
+
+if [[ -z "${DOCS_ONLY_DISABLE:-}" ]]; then
+    DOCS_ONLY_DISABLE=0
+fi
+
+# Print "true" when the pipeline should fail fast: only for PR builds that do
+# not carry the "ci-no-fail-fast" label. Prints "false" otherwise.
+fail_fast() {
+    DISABLE_LABEL="ci-no-fail-fast"
+    # Non-PR builds have BUILDKITE_PULL_REQUEST="false"; default an unset value
+    # to "false" as well so `set -u` does not abort the script here.
+    if [ "${BUILDKITE_PULL_REQUEST:-false}" != "false" ]; then
+        # Best effort: a failed GitHub API call is treated as "no labels".
+        PR_LABELS=$(curl -sf "https://api.github.com/repos/vllm-project/vllm-omni/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[]?.name') || PR_LABELS=""
+        if [[ $PR_LABELS == *"$DISABLE_LABEL"* ]]; then
+            echo false
+        else
+            echo true
+        fi
+    else
+        echo false # not a PR or BUILDKITE_PULL_REQUEST not set
+    fi
+}
+
+# Print "true" when the PR carries the "ready-run-all-tests" label; prints
+# "false" for unlabeled PRs and for non-PR builds.
+check_run_all_label() {
+    RUN_ALL_LABEL="ready-run-all-tests"
+    # Same PR detection and best-effort label lookup as fail_fast.
+    if [ "${BUILDKITE_PULL_REQUEST:-false}" != "false" ]; then
+        PR_LABELS=$(curl -sf "https://api.github.com/repos/vllm-project/vllm-omni/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[]?.name') || PR_LABELS=""
+        if [[ $PR_LABELS == *"$RUN_ALL_LABEL"* ]]; then
+            echo true
+        else
+            echo false
+        fi
+    else
+        echo false # not a PR or BUILDKITE_PULL_REQUEST not set
+    fi
+}
+
+if [[ -z "${COV_ENABLED:-}" ]]; then
+    COV_ENABLED=0
+fi
+
+# Render .buildkite/test-amd.yaml with the local vllm-omni template and upload
+# the resulting pipeline to Buildkite, then end the script with `exit 0`.
+upload_pipeline() {
+    echo "Uploading pipeline..."
+    # Install minijinja
+    ls .buildkite || buildkite-agent annotate --style error 'Please merge upstream main branch for buildkite CI'
+    curl -sSfL https://github.com/mitsuhiko/minijinja/releases/download/2.3.1/minijinja-cli-installer.sh | sh
+    source /var/lib/buildkite-agent/.cargo/env
+
+    if [[ $BUILDKITE_PIPELINE_SLUG == "fastcheck" ]]; then
+        AMD_MIRROR_HW="amdtentative"
+    fi
+
+    # Use local template file for vllm-omni
+    cp .buildkite/test-template-amd-omni.j2 .buildkite/test-template.j2
+
+
+    # (WIP) Use pipeline generator instead of jinja template
+    if [ -e ".buildkite/pipeline_generator/pipeline_generator.py" ]; then
+        python -m pip install click pydantic
+        python .buildkite/pipeline_generator/pipeline_generator.py --run_all="$RUN_ALL" --list_file_diff="$LIST_FILE_DIFF" --nightly="$NIGHTLY" --mirror_hw="$AMD_MIRROR_HW"
+        buildkite-agent pipeline upload .buildkite/pipeline.yaml
+        exit 0
+    fi
+    echo "List file diff: $LIST_FILE_DIFF"
+    echo "Run all: $RUN_ALL"
+    echo "Nightly: $NIGHTLY"
+    echo "AMD Mirror HW: $AMD_MIRROR_HW"
+
+    FAIL_FAST=$(fail_fast)
+
+    cd .buildkite
+    (
+        set -x
+        # Output pipeline.yaml with all blank lines removed
+        minijinja-cli test-template.j2 test-amd.yaml \
+            -D branch="$BUILDKITE_BRANCH" \
+            -D list_file_diff="$LIST_FILE_DIFF" \
+            -D run_all="$RUN_ALL" \
+            -D nightly="$NIGHTLY" \
+            -D mirror_hw="$AMD_MIRROR_HW" \
+            -D fail_fast="$FAIL_FAST" \
+            -D vllm_use_precompiled="$VLLM_USE_PRECOMPILED" \
+            -D vllm_merge_base_commit="$(git merge-base origin/main HEAD)" \
+            -D cov_enabled="$COV_ENABLED" \
+            -D vllm_ci_branch="$VLLM_CI_BRANCH" \
+            | sed '/^[[:space:]]*$/d' \
+            > pipeline.yaml
+    )
+    cat pipeline.yaml
+    buildkite-agent artifact upload pipeline.yaml
+    buildkite-agent pipeline upload pipeline.yaml
+    exit 0
+}
+
+# Print the space-separated list of files changed relative to the merge base
+# with origin/main. `git add .` runs first; its output is discarded so it can
+# neither be executed as a command nor leak into the captured file list.
+get_diff() {
+    git add . >/dev/null
+    echo $(git diff --name-only --diff-filter=ACMDR $(git merge-base origin/main HEAD))
+}
+
+# Same as get_diff, but relative to the previous commit (HEAD~1).
+get_diff_main() {
+    git add . >/dev/null
+    echo $(git diff --name-only --diff-filter=ACMDR HEAD~1)
+}
+
+file_diff=$(get_diff)
+if [[ $BUILDKITE_BRANCH == "main" ]]; then
+    file_diff=$(get_diff_main)
+fi
+
+# ----------------------------------------------------------------------
+# Early exit start: skip pipeline if conditions are met
+# ----------------------------------------------------------------------
+
+# skip pipeline if all changed files are under docs/
+if [[ "${DOCS_ONLY_DISABLE}" != "1" ]]; then
+    if [[ -n "${file_diff:-}" ]]; then
+        docs_only=1
+        # file_diff is space-separated; split it into one path per line before reading
+        while IFS= read -r f; do
+            [[ -z "$f" ]] && continue
+            # **Policy:** only skip if *every* path starts with docs/
+            if [[ "$f" != docs/* ]]; then
+                docs_only=0
+                break
+            fi
+        done < <(printf '%s\n' "$file_diff" | tr ' ' '\n' | tr -d '\r')
+
+        if [[ "$docs_only" -eq 1 ]]; then
+            buildkite-agent annotate ":memo: CI skipped — docs/** only changes detected
+
+\`\`\`
+${file_diff}
+\`\`\`" --style "info" || true
+            echo "[docs-only] All changes are under docs/. Exiting before pipeline upload."
+            exit 0
+        fi
+    fi
+fi
+
+# ----------------------------------------------------------------------
+# Early exit end
+# ----------------------------------------------------------------------
+
+patterns=(
+    "docker/Dockerfile"
+    "CMakeLists.txt"
+    "requirements/common.txt"
+    "requirements/cuda.txt"
+    "requirements/build.txt"
+    "requirements/test.txt"
+    "setup.py"
+    "csrc/"
+    "cmake/"
+)
+
+ignore_patterns=(
+    "docker/Dockerfile."
+    "csrc/cpu"
+    "csrc/rocm"
+    "cmake/hipify.py"
+    "cmake/cpu_extension.cmake"
+)
+
+for file in $file_diff; do
+    # First check if file matches any pattern
+    matches_pattern=0
+    for pattern in "${patterns[@]}"; do
+        # Literal prefix match: quoting keeps the pattern from acting as a glob
+        if [[ $file == "$pattern"* ]]; then
+            matches_pattern=1
+            break
+        fi
+    done
+
+    # If file matches pattern, check it's not in ignore patterns
+    if [[ $matches_pattern -eq 1 ]]; then
+        matches_ignore=0
+        for ignore in "${ignore_patterns[@]}"; do
+            if [[ $file == "$ignore"* ]]; then
+                matches_ignore=1
+                break
+            fi
+        done
+
+        if [[ $matches_ignore -eq 0 ]]; then
+            RUN_ALL=1
+            echo "Found changes: $file. Run all tests"
+            break
+        fi
+    fi
+done
+
+# Check for ready-run-all-tests label
+LABEL_RUN_ALL=$(check_run_all_label)
+if [[ $LABEL_RUN_ALL == true ]]; then
+    RUN_ALL=1
+    NIGHTLY=1
+    echo "Found 'ready-run-all-tests' label. Running all tests including optional tests."
+fi
+
+# Decide whether to use precompiled wheels
+# Relies on existing patterns array as a basis.
+if [[ -n "${VLLM_USE_PRECOMPILED:-}" ]]; then
+    echo "VLLM_USE_PRECOMPILED is already set to: $VLLM_USE_PRECOMPILED"
+elif [[ $RUN_ALL -eq 1 ]]; then
+    export VLLM_USE_PRECOMPILED=0
+    echo "Detected critical changes, building wheels from source"
+else
+    export VLLM_USE_PRECOMPILED=1
+    echo "No critical changes, using precompiled wheels"
+fi
+
+
+LIST_FILE_DIFF=$(get_diff | tr ' ' '|')
+if [[ $BUILDKITE_BRANCH == "main" ]]; then
+    LIST_FILE_DIFF=$(get_diff_main | tr ' ' '|')
+fi
+upload_pipeline
diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
new file mode 100755
index 0000000000..a291f1b8c4
--- /dev/null
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -0,0 +1,160 @@
+#!/bin/bash
+# vllm-omni customized version
+# Based on: vllm/.buildkite/scripts/hardware_ci/run-amd-test.sh
+# Last synced: 2025-12-15
+# Modifications: docker image name for vllm-omni
+
+# This script runs tests inside the corresponding ROCm docker container.
+set -o pipefail
+
+# Export Python path
+export PYTHONPATH=".."
+
+# Wait until the GPU state file reports "clean" before continuing
+echo "--- Confirming Clean Initial State"
+while true; do
+        sleep 3
+        if grep -q clean /opt/amdgpu/etc/gpu_state; then
+                echo "GPUs state is \"clean\""
+                break
+        fi
+done
+
+echo "--- ROCm info"
+rocminfo
+
+# Prune Docker images and volumes when the filesystem holding Docker's root
+# directory is more than 70% full; exits the script if that directory cannot
+# be determined.
+cleanup_docker() {
+  # Get Docker's root directory
+  docker_root=$(docker info -f '{{.DockerRootDir}}')
+  if [ -z "$docker_root" ]; then
+    echo "Failed to determine Docker root directory."
+    exit 1
+  fi
+  echo "Docker root directory: $docker_root"
+  # Check disk usage of the filesystem where Docker's root directory is located
+  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
+  # Define the threshold
+  threshold=70
+  if [ "$disk_usage" -gt "$threshold" ]; then
+    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
+    # Remove dangling images (those that are not tagged and not used by any container)
+    docker image prune -f
+    # Remove unused volumes / force the system prune for old images as well.
+    docker volume prune -f && docker system prune --force --filter "until=72h" --all
+    echo "Docker images and volumes cleanup completed."
+  else
+    echo "Disk usage is below $threshold%. No cleanup needed."
+  fi
+}
+
+# Call the cleanup docker function
+cleanup_docker
+
+echo "--- Resetting GPUs"
+
+echo "reset" > /opt/amdgpu/etc/gpu_state
+
+while true; do
+        sleep 3
+        if grep -q clean /opt/amdgpu/etc/gpu_state; then
+                echo "GPUs state is \"clean\""
+                break
+        fi
+done
+
+echo "--- Pulling container"
+image_name="public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:${BUILDKITE_COMMIT}-rocm-omni"
+container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+docker pull "${image_name}"
+
+# EXIT trap: force-remove the test container; if that fails, try removing the
+# image instead. The trailing `|| true` keeps the trap from failing.
+remove_docker_container() {
+   docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
+}
+trap remove_docker_container EXIT
+
+echo "--- Running container"
+
+HF_CACHE="$(realpath ~)/huggingface"
+mkdir -p "${HF_CACHE}"
+HF_MOUNT="/root/.cache/huggingface"
+
+# Join all arguments into one command string ("$*" makes the join explicit)
+commands="$*"
+echo "Commands:$commands"
+
+PARALLEL_JOB_COUNT=8
+MYPYTHONPATH=".."
+# Start from empty arrays so inherited PIDS/STATUS values cannot leak into the shard bookkeeping below
+PIDS=()
+STATUS=()
+
+# Test that we're launching on the machine that has
+# proper access to GPUs
+render_gid=$(getent group render | cut -d: -f3)
+if [[ -z "$render_gid" ]]; then
+  echo "Error: 'render' group not found. This is required for GPU access." >&2
+  exit 1
+fi
+
+# If the command contains a shard flag, run all shards in parallel because the host has 8 GPUs.
+if [[ $commands == *"--shard-id="* ]]; then
+  # Rewrite --num-shards so the command is split into PARALLEL_JOB_COUNT shards
+  commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
+  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
+    # Give each container its own --shard-id; the same index is passed as HIP_VISIBLE_DEVICES
+    commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
+    echo "Shard ${GPU} commands:$commands_gpu"
+    echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
+    docker run \
+        --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
+        --network=host \
+        --shm-size=16gb \
+        --group-add "$render_gid" \
+        --rm \
+        -e HIP_VISIBLE_DEVICES="${GPU}" \
+        -e HF_TOKEN \
+        -e AWS_ACCESS_KEY_ID \
+        -e AWS_SECRET_ACCESS_KEY \
+        -v "${HF_CACHE}:${HF_MOUNT}" \
+        -e "HF_HOME=${HF_MOUNT}" \
+        -e "PYTHONPATH=${MYPYTHONPATH}" \
+        --name "${container_name}_${GPU}" \
+        "${image_name}" \
+        /bin/bash -c "${commands_gpu}" \
+        |& while read -r line; do echo ">>Shard $GPU: $line"; done &
+    PIDS+=($!)  # remember the background job so it can be waited on below
+  done
+  # Wait for every shard to finish and collect the exit codes
+  for pid in "${PIDS[@]}"; do
+    wait "${pid}"
+    STATUS+=($?)
+  done
+  for st in "${STATUS[@]}"; do  # exit with the first non-zero shard status
+    if [[ ${st} -ne 0 ]]; then
+      echo "One of the processes failed with $st"
+      exit "${st}"
+    fi
+  done
+else  # no --shard-id flag: run the commands once in a single container
+  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
+  docker run \
+      --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
+      --network=host \
+      --shm-size=16gb \
+      --group-add "$render_gid" \
+      --rm \
+      -e HF_TOKEN \
+      -e AWS_ACCESS_KEY_ID \
+      -e AWS_SECRET_ACCESS_KEY \
+      -v "${HF_CACHE}:${HF_MOUNT}" \
+      -e "HF_HOME=${HF_MOUNT}" \
+      -e "PYTHONPATH=${MYPYTHONPATH}" \
+      --name "${container_name}" \
+      "${image_name}" \
+      /bin/bash -c "${commands}"
+fi
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml new file mode 100644 index 0000000000..57008e59f5 --- /dev/null +++ b/.buildkite/test-amd.yaml @@ -0,0 +1,53 @@ +steps: + +- label: "Diffusion Model Test" + timeout_in_minutes: 15 + agent_pool: mi325_2 + depends_on: amd-build + mirror_hardwares: [amdexperimental, amdproduction, amdtentative] + grade: Blocking + commands: + - export GPU_ARCHS=gfx942 + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 + - export VLLM_ROCM_USE_AITER=1 + - export VLLM_ROCM_USE_AITER_MHA=1 + - export VLLM_ROCM_USE_AITER_LINEAR=0 + - export VLLM_ROCM_USE_AITER_RMSNORM=0 + - pytest -s -v tests/e2e/offline_inference/test_t2i_model.py + +- label: "Diffusion Cache Backend Test" + timeout_in_minutes: 15 + agent_pool: mi325_1 + depends_on: amd-build + mirror_hardwares: [amdexperimental, amdproduction, amdtentative] + grade: Blocking + commands: + - export GPU_ARCHS=gfx942 + - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 + - export VLLM_ROCM_USE_AITER=1 + - export VLLM_ROCM_USE_AITER_MHA=1 + - export VLLM_ROCM_USE_AITER_LINEAR=0 + - export VLLM_ROCM_USE_AITER_RMSNORM=0 + - pytest -s -v tests/e2e/offline_inference/test_cache_dit.py tests/e2e/offline_inference/test_teacache.py + +- label: "Omni 
Model Test Qwen2-5-Omni" + timeout_in_minutes: 15 + agent_pool: mi325_2 + depends_on: amd-build + mirror_hardwares: [amdexperimental, amdproduction, amdtentative] + grade: Blocking + commands: + - export GPU_ARCHS=gfx942 + - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 + - export VLLM_ROCM_USE_AITER=1 + - export VLLM_ROCM_USE_AITER_MHA=1 + - export VLLM_ROCM_USE_AITER_LINEAR=0 + - export VLLM_ROCM_USE_AITER_RMSNORM=0 + - pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py diff --git a/.buildkite/test-template-amd-omni.j2 b/.buildkite/test-template-amd-omni.j2 new file mode 100644 index 0000000000..0b6eb8f54b --- /dev/null +++ b/.buildkite/test-template-amd-omni.j2 @@ -0,0 +1,53 @@ +{# vllm-omni customized version + Based on: https://github.com/vllm-project/ci-infra/blob/main/buildkite/test-template-amd.j2 + Last synced: 2025-12-15 + Modifications: Removed unused CUDA/NVIDIA logic, keeping only AMD tests +#} +{% set docker_image_amd = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT-rocm-omni" %} +{% set default_working_dir = "/app/vllm-omni" %} + + - group: "AMD Tests" + depends_on: ~ + steps: + - label: "AMD: :docker: build image" + depends_on: ~ + soft_fail: false + commands: + - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" + - "docker build -f docker/Dockerfile.rocm -t {{ docker_image_amd }} --progress plain ." 
+ - "docker push {{ docker_image_amd }}" + key: "amd-build" + env: + DOCKER_BUILDKIT: "1" + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 1 + - exit_status: -10 # Agent was lost + limit: 1 + - exit_status: 1 # Machine occasionally fail + limit: 1 + agents: + queue: cpu_queue_premerge_us_east_1 + + {% for step in steps %} + {% if step.mirror_hardwares and mirror_hw in step.mirror_hardwares %} + - label: "{{ step.agent_pool }}: {{ step.label }}" + depends_on: amd-build + agents: + {% if step.agent_pool %} + queue: amd_{{ step.agent_pool }} + {% else %} + queue: amd_mi325_1 + {% endif %} + command: bash .buildkite/scripts/hardware_ci/run-amd-test.sh "(command rocm-smi || true) && cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}" + env: + DOCKER_BUILDKIT: "1" + priority: 100 + {% if step.grade and step.grade == "Blocking" %} + soft_fail: false + {% else %} + soft_fail: true + {% endif%} + {% endif %} + {% endfor %} diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm new file mode 100644 index 0000000000..7fabb9c3c6 --- /dev/null +++ b/docker/Dockerfile.rocm @@ -0,0 +1,42 @@ +ARG BASE_IMAGE=rocm/vllm-dev:nightly_main_20251205 +FROM ${BASE_IMAGE} + +ARG COMMON_WORKDIR=/app +ARG VLLM_VERSION=v0.12.0 +ARG PYTORCH_ROCM_ARCH="gfx942;gfx950" + +WORKDIR ${COMMON_WORKDIR} + +# Step 1: Setup - Install system dependencies +RUN apt-get update && \ + apt-get install -y ffmpeg && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Step 2: Reinstall vllm from source +RUN python3 -m pip uninstall -y vllm && rm -rf vllm &&\ + git clone https://github.com/vllm-project/vllm.git && \ + cd vllm && \ + git checkout ${VLLM_VERSION} && \ + python3 -m pip install -r requirements/rocm.txt && \ + python3 setup.py clean --all && \ + PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} python3 setup.py develop && \ + cd ../ && \ + rm -rf vllm/.git + +RUN mkdir -p ${COMMON_WORKDIR}/vllm-omni + +# Step 3: 
Copy vllm-omni code and install without uv +COPY . ${COMMON_WORKDIR}/vllm-omni +RUN cd ${COMMON_WORKDIR}/vllm-omni && python3 -m pip install --no-cache-dir ".[dev]" + +# Set GPU_ARCHS for AITER, then create the python symlink. +# `GPU_ARCHS` is an environment variable that is used to set the GPU archs for AITER. +# This is needed to prevent the AITER automatic GPU arch detection from failing on MI325X. +# The AITER version used in this dockerfile has issues with handling +# the GPU archs of MI325X (CI machine) correctly. So we manually set the GPU archs here. +# We reuse AITER_ROCM_ARCH from the base image to avoid duplication. +ENV GPU_ARCHS=${AITER_ROCM_ARCH} +RUN ln -sf /usr/bin/python3 /usr/bin/python + +ENTRYPOINT [] diff --git a/docs/getting_started/installation/gpu.md b/docs/getting_started/installation/gpu.md index 5956ed102d..03758fd935 100644 --- a/docs/getting_started/installation/gpu.md +++ b/docs/getting_started/installation/gpu.md @@ -49,11 +49,14 @@ vLLM-Omni is a Python library that supports the following GPU variants. The libr ## Set up using Docker -### Build wheel from source -=== "NVIDIA CUDA" +### Build your own docker image + +=== "AMD ROCm" + + --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:build-docker" - --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:build-wheel-from-source-in-docker" +### Build wheel from source === "AMD ROCm" diff --git a/docs/getting_started/installation/gpu/rocm.inc.md b/docs/getting_started/installation/gpu/rocm.inc.md index 87d7eb27ad..887244766f 100644 --- a/docs/getting_started/installation/gpu/rocm.inc.md +++ b/docs/getting_started/installation/gpu/rocm.inc.md @@ -69,9 +69,6 @@ python -c "import setuptools_scm; print(setuptools_scm.get_version())" PYTORCH_ROCM_ARCH=gfx942 python3 setup.py develop ``` -!!! note - vLLM release wheels based on the branch with prefix `releases/`, not from the tag as vLLM may cherry pick bugfixes after cutting a branch. 
+ - #### Installation of vLLM-Omni @@ -110,6 +107,41 @@ export VLLM_ROCM_USE_AITER_RMSNORM=0 # --8<-- [end:build-wheel-from-source-in-docker] +# --8<-- [start:build-docker] + +#### Build docker image + +```bash +DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm -t vllm-omni-rocm . +``` + +To cut down build time, you can restrict the build to specific GPU architectures: + +```bash +DOCKER_BUILDKIT=1 docker build \ + -f docker/Dockerfile.rocm \ + --build-arg PYTORCH_ROCM_ARCH="gfx942;gfx950" \ + -t vllm-omni-rocm . +``` + +#### Launch the docker image + +```bash +docker run -it \ +--network=host \ +--group-add=video \ +--ipc=host \ +--cap-add=SYS_PTRACE \ +--security-opt seccomp=unconfined \ +--device /dev/kfd \ +--device /dev/dri \ +-v /path/to/model:/app/model \ +vllm-omni-rocm \ +bash +``` + +# --8<-- [end:build-docker] + # --8<-- [start:pre-built-images] # --8<-- [end:pre-built-images] diff --git a/tests/e2e/__init__.py b/tests/e2e/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/e2e/offline_inference/stage_configs/rocm/qwen2_5_omni_ci.yaml b/tests/e2e/offline_inference/stage_configs/rocm/qwen2_5_omni_ci.yaml new file mode 100644 index 0000000000..96e9d7fa72 --- /dev/null +++ b/tests/e2e/offline_inference/stage_configs/rocm/qwen2_5_omni_ci.yaml @@ -0,0 +1,105 @@ +# stage config for running qwen2.5-omni with architecture of OmniLLM. + +# The following config has been verified on 2x 24GB GPU (L4/RTX3090/RTX4090). +# This config is optimized for CI e2e tests. 
+stage_args: + - stage_id: 0 + runtime: + process: true # Run this stage in a separate process + devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) + max_batch_size: 1 + engine_args: + model_stage: thinker + model_arch: Qwen2_5OmniForConditionalGeneration + worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + max_model_len: 896 + max_num_batched_tokens: 896 + max_num_seqs: 1 + gpu_memory_utilization: 0.8 + skip_mm_profiling: true + enforce_eager: true # Now we only support eager mode + trust_remote_code: true + engine_output_type: latent + enable_prefix_caching: false + is_comprehension: true + final_output: true + final_output_type: text + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 128 + seed: 42 + detokenize: True + repetition_penalty: 1.1 + - stage_id: 1 + runtime: + process: true + devices: "1" + max_batch_size: 1 + engine_args: + model_stage: talker + model_arch: Qwen2_5OmniForConditionalGeneration + worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + max_model_len: 896 + max_num_batched_tokens: 896 + max_num_seqs: 1 + gpu_memory_utilization: 0.8 + skip_mm_profiling: true + enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false + engine_output_type: latent + engine_input_source: [0] + custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker + default_sampling_params: + temperature: 0.9 + top_p: 0.8 + top_k: 40 + max_tokens: 128 + seed: 42 + detokenize: True + repetition_penalty: 1.05 + stop_token_ids: [8294] + - stage_id: 2 + runtime: + process: true + devices: "0" # Example: use a different GPU than the previous stage; use "0" if single GPU + max_batch_size: 1 + engine_args: + model_stage: code2wav + model_arch: Qwen2_5OmniForConditionalGeneration + worker_cls: 
vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + gpu_memory_utilization: 0.15 + enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false + engine_output_type: audio + engine_input_source: [1] + final_output: true + final_output_type: audio + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 128 + seed: 42 + detokenize: True + repetition_penalty: 1.1 + +# Top-level runtime config (concise): default windows and stage edges +runtime: + enabled: true + defaults: + window_size: -1 # Simplified: trigger downstream only after full upstream completion + max_inflight: 1 # Simplified: process serially within each stage + edges: + - from: 0 # thinker → talker: trigger only after receiving full input (-1) + to: 1 + window_size: -1 + - from: 1 # talker → code2wav: trigger only after receiving full input (-1) + to: 2 + window_size: -1 diff --git a/tests/e2e/offline_inference/test_qwen2_5_omni.py b/tests/e2e/offline_inference/test_qwen2_5_omni.py index a03399aeba..63eea1ba26 100644 --- a/tests/e2e/offline_inference/test_qwen2_5_omni.py +++ b/tests/e2e/offline_inference/test_qwen2_5_omni.py @@ -13,7 +13,7 @@ from vllm.envs import VLLM_USE_MODELSCOPE from vllm.multimodal.image import convert_image_mode -from vllm_omni.utils import is_npu +from vllm_omni.utils import is_npu, is_rocm from .conftest import OmniRunner from .utils import create_new_process_for_each_test @@ -23,6 +23,9 @@ # CI stage config optimized for 24GB GPU (L4/RTX3090) or NPU if is_npu(): stage_config = str(Path(__file__).parent / "stage_configs" / "npu" / "qwen2_5_omni_ci.yaml") +elif is_rocm(): + # ROCm stage config optimized for MI325 GPU + stage_config = str(Path(__file__).parent / "stage_configs" / "rocm" / "qwen2_5_omni_ci.yaml") else: stage_config = str(Path(__file__).parent / "stage_configs" / "qwen2_5_omni_ci.yaml") diff --git 
a/tests/e2e/offline_inference/test_t2i_model.py b/tests/e2e/offline_inference/test_t2i_model.py index fafd98e555..61f0c7b12a 100644 --- a/tests/e2e/offline_inference/test_t2i_model.py +++ b/tests/e2e/offline_inference/test_t2i_model.py @@ -5,7 +5,7 @@ import pytest import torch -from vllm_omni.utils.platform_utils import is_npu +from vllm_omni.utils.platform_utils import is_npu, is_rocm # ruff: noqa: E402 REPO_ROOT = Path(__file__).resolve().parents[2] @@ -24,6 +24,11 @@ # TODO: When NPU support is ready, remove this branch. if is_npu(): models = ["Qwen/Qwen-Image"] +elif is_rocm(): + # TODO: When ROCm support is ready, remove this branch. + # vLLM V0.11.0 has issues running riverclouds/qwen_image_random + # on ROCm + models = ["Tongyi-MAI/Z-Image-Turbo"] @pytest.mark.parametrize("model_name", models) diff --git a/tests/e2e/offline_inference/utils.py b/tests/e2e/offline_inference/utils.py index 931e7b506c..c491c10b91 100644 --- a/tests/e2e/offline_inference/utils.py +++ b/tests/e2e/offline_inference/utils.py @@ -195,7 +195,11 @@ def create_new_process_for_each_test( A decorator to run test functions in separate processes. """ if method is None: - use_spawn = current_platform.is_rocm() or current_platform.is_xpu() + # TODO: Find out why spawn is not working correctly on ROCm + # The test content will not run and tests passed immediately. + # For now, using `fork` for ROCm as it can run with `fork` + # and tests are running correctly. 
+ use_spawn = current_platform.is_xpu() method = "spawn" if use_spawn else "fork" assert method in ["spawn", "fork"], "Method must be either 'spawn' or 'fork'" diff --git a/vllm_omni/diffusion/layers/custom_op.py b/vllm_omni/diffusion/layers/custom_op.py index 461da0d361..0bf5c4f60e 100644 --- a/vllm_omni/diffusion/layers/custom_op.py +++ b/vllm_omni/diffusion/layers/custom_op.py @@ -3,7 +3,7 @@ import torch.nn as nn -from vllm_omni.utils.platform_utils import detect_device_type +from vllm_omni.utils.platform_utils import detect_device_type, is_rocm class CustomOp(nn.Module): @@ -18,7 +18,9 @@ def __init__(self) -> None: self._forward_method = self.dispatch_forward() def dispatch_forward(self) -> Callable: - if self.is_cuda: + if is_rocm(): + return self.forward_hip + elif self.is_cuda: return self.forward_cuda else: return self.forward_native @@ -36,3 +38,7 @@ def forward_native(self, *args, **kwargs): def forward_cuda(self, *args, **kwargs): raise NotImplementedError + + def forward_hip(self, *args, **kwargs): + # By default, we assume that HIP ops are compatible with CUDA ops. 
+ return self.forward_cuda(*args, **kwargs) diff --git a/vllm_omni/diffusion/layers/rope.py b/vllm_omni/diffusion/layers/rope.py index acc0158fc6..528f2425ef 100644 --- a/vllm_omni/diffusion/layers/rope.py +++ b/vllm_omni/diffusion/layers/rope.py @@ -1,8 +1,13 @@ +from importlib.util import find_spec + import torch from einops import rearrange, repeat +from vllm.logger import init_logger from vllm_omni.diffusion.layers.custom_op import CustomOp +logger = init_logger(__name__) + def rotate_half(x, interleaved=False): if not interleaved: @@ -45,6 +50,11 @@ def __init__( super().__init__() self.is_neox_style = is_neox_style self.interleaved = not is_neox_style + self.apply_rotary_emb_flash_attn = None + if find_spec("flash_attn") is not None: + from flash_attn.ops.triton.rotary import apply_rotary + + self.apply_rotary_emb_flash_attn = apply_rotary def forward_cuda( self, @@ -66,6 +76,27 @@ def forward_cuda( interleaved=self.interleaved, ) + def forward_hip( + self, + x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + ) -> torch.Tensor: + if self.apply_rotary_emb_flash_attn is None: + return self.forward_cuda(x, cos, sin) + + if cos.dim() == 3: + # (B, S, D/2) -> (S, D/2) + cos = cos[0] + sin = sin[0] + + return self.apply_rotary_emb_flash_attn( + x, + cos, + sin, + interleaved=self.interleaved, + ) + def forward_native( self, x: torch.Tensor, diff --git a/vllm_omni/entrypoints/utils.py b/vllm_omni/entrypoints/utils.py index d86ded250b..82c8a4d34a 100644 --- a/vllm_omni/entrypoints/utils.py +++ b/vllm_omni/entrypoints/utils.py @@ -8,7 +8,7 @@ from vllm.logger import init_logger from vllm.transformers_utils.config import get_config -from vllm_omni.utils import detect_device_type +from vllm_omni.utils import detect_device_type, is_rocm # Get the project root directory (2 levels up from this file) PROJECT_ROOT = Path(__file__).parent.parent.parent @@ -86,8 +86,10 @@ def resolve_model_config_path(model: str) -> str: device_type = detect_device_type() # Try 
device-specific config first - if device_type != "cuda": + if device_type != "cuda" or is_rocm(): device_config_file = f"vllm_omni/model_executor/stage_configs/{device_type}/{model_type}.yaml" + if is_rocm(): + device_config_file = f"vllm_omni/model_executor/stage_configs/rocm/{model_type}.yaml" device_config_path = PROJECT_ROOT / device_config_file if os.path.exists(device_config_path): return str(device_config_path) diff --git a/vllm_omni/model_executor/stage_configs/rocm/qwen2_5_omni.yaml b/vllm_omni/model_executor/stage_configs/rocm/qwen2_5_omni.yaml new file mode 100644 index 0000000000..c646aa76a9 --- /dev/null +++ b/vllm_omni/model_executor/stage_configs/rocm/qwen2_5_omni.yaml @@ -0,0 +1,102 @@ +# stage config for running qwen2.5-omni with architecture of OmniLLM. + +# The following config has been verified on 2x H100-80G GPU. +stage_args: + - stage_id: 0 + runtime: + process: true # Run this stage in a separate process + devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) + max_batch_size: 1 + engine_args: + model_stage: thinker + model_arch: Qwen2_5OmniForConditionalGeneration + worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + gpu_memory_utilization: 0.8 + enforce_eager: true # Now we only support eager mode + trust_remote_code: true + engine_output_type: latent + enable_prefix_caching: false + max_num_batched_tokens: 32768 + is_comprehension: true + final_output: true + final_output_type: text + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 2048 + seed: 42 + detokenize: True + repetition_penalty: 1.1 + + - stage_id: 1 + runtime: + process: true + devices: "1" + max_batch_size: 1 + engine_args: + model_stage: talker + model_arch: Qwen2_5OmniForConditionalGeneration + worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + 
gpu_memory_utilization: 0.8 + enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false + max_num_batched_tokens: 32768 + engine_output_type: latent + engine_input_source: [0] + custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker + default_sampling_params: + temperature: 0.9 + top_p: 0.8 + top_k: 40 + max_tokens: 2048 + seed: 42 + detokenize: True + repetition_penalty: 1.05 + stop_token_ids: [8294] + + - stage_id: 2 + runtime: + process: true + devices: "2" # Example: use a different GPU than the previous stage; use "0" if single GPU + max_batch_size: 1 + engine_args: + model_stage: code2wav + model_arch: Qwen2_5OmniForConditionalGeneration + worker_cls: vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + gpu_memory_utilization: 0.15 + enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false + max_num_batched_tokens: 32768 + engine_output_type: audio + engine_input_source: [1] + final_output: true + final_output_type: audio + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 2048 + seed: 42 + detokenize: True + repetition_penalty: 1.1 + +# Top-level runtime config (concise): default windows and stage edges +runtime: + enabled: true + defaults: + window_size: -1 # Simplified: trigger downstream only after full upstream completion + max_inflight: 1 # Simplified: process serially within each stage + + edges: + - from: 0 # thinker → talker: trigger only after receiving full input (-1) + to: 1 + window_size: -1 + - from: 1 # talker → code2wav: trigger only after receiving full input (-1) + to: 2 + window_size: -1 diff --git a/vllm_omni/model_executor/stage_configs/rocm/qwen3_omni_moe.yaml b/vllm_omni/model_executor/stage_configs/rocm/qwen3_omni_moe.yaml new file mode 100644 index 0000000000..73f65ecb55 --- /dev/null +++ 
b/vllm_omni/model_executor/stage_configs/rocm/qwen3_omni_moe.yaml @@ -0,0 +1,97 @@ +# Stage config for running Qwen3-Omni-MoE with 3-stage architecture +# Stage 0: Thinker (multimodal understanding + text generation) +# Stage 1: Talker (text embeddings → 8-layer RVQ codec codes) +# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform) + +# The following config has been verified on 2x H100-80G GPUs (NVIDIA); TODO: confirm on AMD ROCm hardware. +stage_args: + - stage_id: 0 + runtime: + devices: "0,1" + max_batch_size: 1 + engine_args: + model_stage: thinker + model_arch: Qwen3OmniMoeForConditionalGeneration + worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + gpu_memory_utilization: 0.6 + enforce_eager: true + trust_remote_code: true + engine_output_type: latent # Output hidden states for talker + distributed_executor_backend: "mp" + enable_prefix_caching: false + max_num_batched_tokens: 32768 + hf_config_name: thinker_config + tensor_parallel_size: 2 + final_output: true + final_output_type: text + is_comprehension: true + default_sampling_params: + temperature: 0.4 + top_p: 0.9 + top_k: 1 + max_tokens: 2048 + seed: 42 + detokenize: True + repetition_penalty: 1.05 + + - stage_id: 1 + runtime: + devices: "1" + max_batch_size: 1 + engine_args: + model_stage: talker + model_arch: Qwen3OmniMoeForConditionalGeneration + worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + gpu_memory_utilization: 0.3 + enforce_eager: true + trust_remote_code: true + engine_output_type: latent # Output codec codes for code2wav + # tensor_parallel_size: 2 + enable_prefix_caching: false + max_num_batched_tokens: 32768 + distributed_executor_backend: "mp" + hf_config_name: talker_config + engine_input_source: [0] + custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker + # final_output: true + # final_output_type: text + 
default_sampling_params: + temperature: 0.9 + top_k: 50 + max_tokens: 4096 + seed: 42 + detokenize: False + repetition_penalty: 1.05 + stop_token_ids: [2150] + + - stage_id: 2 + runtime: + devices: "0" + max_batch_size: 1 + engine_args: + model_stage: code2wav + model_arch: Qwen3OmniMoeForConditionalGeneration + worker_cls: vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false + engine_output_type: audio # Final output: audio waveform + gpu_memory_utilization: 0.1 + distributed_executor_backend: "mp" + max_num_batched_tokens: 1000000 + hf_config_name: thinker_config + engine_input_source: [1] + custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav + final_output: true + final_output_type: audio + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 65536 + seed: 42 + detokenize: True + repetition_penalty: 1.1 diff --git a/vllm_omni/utils/__init__.py b/vllm_omni/utils/__init__.py index 50dbb478d9..34b2545db5 100644 --- a/vllm_omni/utils/__init__.py +++ b/vllm_omni/utils/__init__.py @@ -2,10 +2,12 @@ detect_device_type, get_device_control_env_var, is_npu, + is_rocm, ) __all__ = [ "detect_device_type", "get_device_control_env_var", "is_npu", + "is_rocm", ] diff --git a/vllm_omni/utils/platform_utils.py b/vllm_omni/utils/platform_utils.py index 385b1a8f36..5f8259ab83 100644 --- a/vllm_omni/utils/platform_utils.py +++ b/vllm_omni/utils/platform_utils.py @@ -19,6 +19,10 @@ def is_npu() -> bool: return detect_device_type() == "npu" +def is_rocm() -> bool: + return current_platform.is_rocm() + + def get_device_control_env_var() -> str: """Return the environment variable name for device visibility control.""" if hasattr(current_platform, "device_control_env_var"):