diff --git a/.buildkite/bootstrap-amd-omni.sh b/.buildkite/bootstrap-amd-omni.sh
new file mode 100755
index 0000000000..a38b762201
--- /dev/null
+++ b/.buildkite/bootstrap-amd-omni.sh
@@ -0,0 +1,238 @@
+#!/bin/bash
+# vllm-omni customized version
+# Based on: https://github.com/vllm-project/ci-infra/blob/main/buildkite/bootstrap-amd.sh
+# Last synced: 2025-12-15
+# Modifications: Use local template file instead of downloading from ci-infra
+
+set -euo pipefail
+
+# Optional settings. Each one keeps a non-empty value inherited from the
+# environment; when it is unset or empty, the default given here is
+# assigned.
+
+# Becomes 1 later in this script when a changed path or PR label demands
+# the full test run.
+: "${RUN_ALL:=0}"
+
+# Passed on to the pipeline template/generator as the nightly flag.
+: "${NIGHTLY:=0}"
+
+# Passed on to the pipeline template as vllm_ci_branch.
+: "${VLLM_CI_BRANCH:=main}"
+
+# Mirror-hardware tier; upload_pipeline replaces it for the fastcheck pipeline.
+: "${AMD_MIRROR_HW:=amdproduction}"
+
+# Set to 1 to disable the docs-only early exit.
+: "${DOCS_ONLY_DISABLE:=0}"
+
+# Print "true" for PR builds where no label contains ci-no-fail-fast, else "false"; an unset/empty BUILDKITE_PULL_REQUEST counts as "not a PR".
+fail_fast() {
+    local disable_label="ci-no-fail-fast" pr="${BUILDKITE_PULL_REQUEST:-false}" pr_labels  # default avoids a `set -u` abort
+    if [[ "$pr" != "false" ]]; then
+        pr_labels=$(curl -s "https://api.github.com/repos/vllm-project/vllm-omni/pulls/$pr" | jq -r '.labels[].name')
+        if [[ $pr_labels == *"$disable_label"* ]]; then
+            echo false
+        else
+            echo true
+        fi
+    else
+        echo false  # not a PR or BUILDKITE_PULL_REQUEST not set
+    fi
+}
+
+# Print "true" for PR builds with a label containing ready-run-all-tests, else "false"; an unset/empty BUILDKITE_PULL_REQUEST counts as "not a PR".
+check_run_all_label() {
+    local run_all_label="ready-run-all-tests" pr="${BUILDKITE_PULL_REQUEST:-false}" pr_labels  # default avoids a `set -u` abort
+    if [[ "$pr" != "false" ]]; then
+        pr_labels=$(curl -s "https://api.github.com/repos/vllm-project/vllm-omni/pulls/$pr" | jq -r '.labels[].name')
+        if [[ $pr_labels == *"$run_all_label"* ]]; then
+            echo true
+        else
+            echo false
+        fi
+    else
+        echo false  # not a PR or BUILDKITE_PULL_REQUEST not set
+    fi
+}
+
+# Coverage toggle handed to the template as cov_enabled; a non-empty value
+# from the environment is kept, otherwise it defaults to 0.
+: "${COV_ENABLED:=0}"
+
+upload_pipeline() {  # Render the pipeline and upload it to Buildkite, then exit the script with status 0.
+    echo "Uploading pipeline..."
+    # Check that .buildkite exists (post an error annotation if not), then install minijinja-cli 2.3.1 via its installer script.
+    ls .buildkite || buildkite-agent annotate --style error 'Please merge upstream main branch for buildkite CI'
+    curl -sSfL https://github.com/mitsuhiko/minijinja/releases/download/2.3.1/minijinja-cli-installer.sh | sh
+    source /var/lib/buildkite-agent/.cargo/env
+    # The cargo env file above is sourced presumably to put minijinja-cli on PATH. The fastcheck pipeline always uses the "amdtentative" mirror tier.
+    if [[ $BUILDKITE_PIPELINE_SLUG == "fastcheck" ]]; then
+        AMD_MIRROR_HW="amdtentative"
+    fi
+
+    # Use local template file for vllm-omni: copy it to test-template.j2, the name the render step below reads.
+    cp .buildkite/test-template-amd-omni.j2 .buildkite/test-template.j2
+
+    # (WIP) Use pipeline generator instead of jinja template
+    # When the generator script is present it writes .buildkite/pipeline.yaml, which is uploaded, and the script exits here.
+    if [ -e ".buildkite/pipeline_generator/pipeline_generator.py" ]; then
+        python -m pip install click pydantic
+        python .buildkite/pipeline_generator/pipeline_generator.py --run_all=$RUN_ALL --list_file_diff="$LIST_FILE_DIFF" --nightly="$NIGHTLY" --mirror_hw="$AMD_MIRROR_HW"
+        buildkite-agent pipeline upload .buildkite/pipeline.yaml
+        exit 0
+    fi
+    echo "List file diff: $LIST_FILE_DIFF"
+    echo "Run all: $RUN_ALL"
+    echo "Nightly: $NIGHTLY"
+    echo "AMD Mirror HW: $AMD_MIRROR_HW"
+    # "true"/"false" as printed by fail_fast; forwarded to the template below.
+    FAIL_FAST=$(fail_fast)
+    # Work from .buildkite so the template, the step list and the output are addressed by bare file names.
+    cd .buildkite
+    (
+        set -x
+        # Output pipeline.yaml with all blank lines removed; the enclosing subshell keeps `set -x` tracing local to this command.
+        minijinja-cli test-template.j2 test-amd.yaml \
+            -D branch="$BUILDKITE_BRANCH" \
+            -D list_file_diff="$LIST_FILE_DIFF" \
+            -D run_all="$RUN_ALL" \
+            -D nightly="$NIGHTLY" \
+            -D mirror_hw="$AMD_MIRROR_HW" \
+            -D fail_fast="$FAIL_FAST" \
+            -D vllm_use_precompiled="$VLLM_USE_PRECOMPILED" \
+            -D vllm_merge_base_commit="$(git merge-base origin/main HEAD)" \
+            -D cov_enabled="$COV_ENABLED" \
+            -D vllm_ci_branch="$VLLM_CI_BRANCH" \
+            | sed '/^[[:space:]]*$/d' \
+            > pipeline.yaml
+    )
+    cat pipeline.yaml
+    buildkite-agent artifact upload pipeline.yaml
+    buildkite-agent pipeline upload pipeline.yaml
+    exit 0
+}
+
+get_diff() {  # Print the paths changed since the merge-base with origin/main, space-separated on one line.
+    git add . >/dev/null  # stage so new files appear in the diff; run directly (its output is discarded, never executed)
+    echo $(git diff --name-only --diff-filter=ACMDR $(git merge-base origin/main HEAD))  # unquoted on purpose: joins the names with spaces
+}
+
+get_diff_main() {  # Print the paths changed relative to HEAD~1, space-separated on one line.
+    git add . >/dev/null  # stage so new files appear in the diff; run directly (its output is discarded, never executed)
+    echo $(git diff --name-only --diff-filter=ACMDR HEAD~1)  # unquoted on purpose: joins the names with spaces
+}
+
+file_diff=$(get_diff)  # default: everything changed since the merge-base with origin/main
+if [[ $BUILDKITE_BRANCH == "main" ]]; then
+    file_diff=$(get_diff_main)  # builds of main: diff against HEAD~1 instead
+fi
+
+# ----------------------------------------------------------------------
+# Early exit start: skip pipeline if conditions are met
+# ----------------------------------------------------------------------
+
+# skip pipeline if all changed files are under docs/ (DOCS_ONLY_DISABLE=1 turns this check off)
+if [[ "${DOCS_ONLY_DISABLE}" != "1" ]]; then
+  if [[ -n "${file_diff:-}" ]]; then
+    docs_only=1
+    # file_diff is a space-separated list; the tr calls below put one path per line and drop CRs before `read` sees it
+    while IFS= read -r f; do
+      [[ -z "$f" ]] && continue
+      # **Policy:** only skip if *every* path starts with docs/
+      if [[ "$f" != docs/* ]]; then
+        docs_only=0
+        break
+      fi
+    done < <(printf '%s\n' "$file_diff" | tr ' ' '\n' | tr -d '\r')
+
+    if [[ "$docs_only" -eq 1 ]]; then
+      buildkite-agent annotate ":memo: CI skipped — docs/** only changes detected
+
+\`\`\`
+${file_diff}
+\`\`\`" --style "info" || true
+      echo "[docs-only] All changes are under docs/. Exiting before pipeline upload."
+      exit 0
+    fi
+  fi
+fi
+
+# ----------------------------------------------------------------------
+# Early exit end
+# ----------------------------------------------------------------------
+
+# Path prefixes whose modification forces the full test suite.
+patterns=(
+    "docker/Dockerfile" "CMakeLists.txt"
+    "requirements/common.txt" "requirements/cuda.txt"
+    "requirements/build.txt" "requirements/test.txt"
+    "setup.py" "csrc/" "cmake/"
+)
+
+# Prefixes exempted from the list above.
+ignore_patterns=(
+    "docker/Dockerfile." "csrc/cpu" "csrc/rocm"
+    "cmake/hipify.py" "cmake/cpu_extension.cmake"
+)
+
+# Scan the changed paths; the first one that starts with a run-all prefix
+# and with none of the exempted prefixes switches RUN_ALL on and ends the scan.
+# $file_diff is deliberately unquoted: it is a space-separated list.
+for changed in $file_diff; do
+    # Does the path start with any run-all prefix?
+    forces_all=0
+    for prefix in "${patterns[@]}"; do
+        if [[ $changed == "$prefix"* ]]; then
+            forces_all=1
+            break
+        fi
+    done
+    if (( forces_all == 0 )); then
+        continue
+    fi
+
+    # Is the path covered by one of the exemptions?
+    exempt=0
+    for prefix in "${ignore_patterns[@]}"; do
+        if [[ $changed == "$prefix"* ]]; then
+            exempt=1
+            break
+        fi
+    done
+    if (( exempt == 1 )); then
+        continue
+    fi
+
+    # Matched and not exempt: one such path is enough.
+    RUN_ALL=1
+    echo "Found changes: $changed. Run all tests"
+    break
+done
+
+# A "ready-run-all-tests" PR label forces the full suite and also sets NIGHTLY=1.
+LABEL_RUN_ALL=$(check_run_all_label)
+if [[ $LABEL_RUN_ALL == true ]]; then
+    RUN_ALL=1
+    NIGHTLY=1
+    echo "Found 'ready-run-all-tests' label. Running all tests including optional tests."
+fi
+
+# Decide whether to use precompiled wheels
+# An explicit VLLM_USE_PRECOMPILED wins; otherwise the choice follows RUN_ALL as decided above.
+if [[ -n "${VLLM_USE_PRECOMPILED:-}" ]]; then
+    echo "VLLM_USE_PRECOMPILED is already set to: $VLLM_USE_PRECOMPILED"
+elif [[ $RUN_ALL -eq 1 ]]; then
+    export VLLM_USE_PRECOMPILED=0
+    echo "Detected critical changes, building wheels from source"
+else
+    export VLLM_USE_PRECOMPILED=1
+    echo "No critical changes, using precompiled wheels"
+fi
+
+# The changed-file list is already in file_diff (space-separated, and already
+# switched to the HEAD~1 diff on main), so reuse it instead of running
+# `git add`/`git diff` again; the pipeline receives it '|'-separated.
+LIST_FILE_DIFF=${file_diff// /|}
+
+upload_pipeline
diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
new file mode 100755
index 0000000000..a291f1b8c4
--- /dev/null
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -0,0 +1,152 @@
+#!/bin/bash
+# vllm-omni customized version
+# Based on: vllm/.buildkite/scripts/hardware_ci/run-amd-test.sh
+# Last synced: 2025-12-15
+# Modifications: docker image name for vllm-omni
+
+# This script runs test inside the corresponding ROCm docker container.
+set -o pipefail
+
+# Export Python path
+export PYTHONPATH=".."
+
+# Poll every 3 seconds until the GPU state file contains "clean"; the loop itself has no timeout.
+echo "--- Confirming Clean Initial State"
+while true; do
+        sleep 3
+        if grep -q clean /opt/amdgpu/etc/gpu_state; then
+                echo "GPUs state is \"clean\""
+                break
+        fi
+done
+# Print ROCm info
+echo "--- ROCm info"
+rocminfo
+
+# Prune Docker data when the filesystem holding Docker's root directory is more than 70% full; exits 1 if that directory is unknown.
+cleanup_docker() {
+  local docker_root disk_usage threshold=70
+  # Get Docker's root directory
+  docker_root=$(docker info -f '{{.DockerRootDir}}')
+  if [ -z "$docker_root" ]; then
+    echo "Failed to determine Docker root directory."
+    exit 1
+  fi
+  echo "Docker root directory: $docker_root"
+  # df -P forces the POSIX one-line-per-filesystem layout, so the usage percentage is
+  # always field 5 of the data line, even when the device name is long.
+  disk_usage=$(df -P "$docker_root" | awk 'NR > 1 { sub(/%/, "", $5); pct = $5 } END { print pct }')
+  if [ "$disk_usage" -gt "$threshold" ]; then
+    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
+    # Remove dangling images (those that are not tagged and not used by any container)
+    docker image prune -f
+    # Remove unused volumes / force the system prune for old images as well.
+    docker volume prune -f && docker system prune --force --filter "until=72h" --all
+    echo "Docker images and volumes cleanup completed."
+  else
+    echo "Disk usage is below $threshold%. No cleanup needed."
+  fi
+}
+
+# Call the cleanup docker function
+cleanup_docker
+
+echo "--- Resetting GPUs"
+# Write "reset" into the GPU state file (what reacts to that file is not visible in this script) ...
+echo "reset" > /opt/amdgpu/etc/gpu_state
+# ... then poll every 3 seconds until the file contains "clean" again; no timeout here.
+while true; do
+        sleep 3
+        if grep -q clean /opt/amdgpu/etc/gpu_state; then
+                echo "GPUs state is \"clean\""
+                break
+        fi
+done
+
+echo "--- Pulling container"
+image_name="public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:${BUILDKITE_COMMIT}-rocm-omni"
+container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"  # commit id plus 10 random alphanumerics
+docker pull "${image_name}"
+# EXIT-trap cleanup: the image is removed only when removing the container fails. NOTE(review): sharded runs name their containers ${container_name}_<GPU> (started with --rm), which this does not target - confirm that is intended.
+remove_docker_container() {
+   docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
+}
+trap remove_docker_container EXIT
+
+echo "--- Running container"
+# Host directory bind-mounted into the container at HF_MOUNT by the docker run calls below.
+HF_CACHE="$(realpath ~)/huggingface"
+mkdir -p "${HF_CACHE}"
+HF_MOUNT="/root/.cache/huggingface"
+# All script arguments joined into a single command string.
+commands=$@
+echo "Commands:$commands"
+# Shard count for sharded runs, and the PYTHONPATH value handed to the container.
+PARALLEL_JOB_COUNT=8
+MYPYTHONPATH=".."
+
+# Test that we're launching on the machine that has
+# proper access to GPUs
+render_gid=$(getent group render | cut -d: -f3)
+if [[ -z "$render_gid" ]]; then
+  echo "Error: 'render' group not found. This is required for GPU access." >&2
+  exit 1
+fi
+
+# If the command contains a --shard-id= flag, run PARALLEL_JOB_COUNT shards in parallel, one container per GPU index (the host is expected to have 8 GPUs); otherwise run a single container.
+if [[ $commands == *"--shard-id="* ]]; then
+  # Rewrite every --num-shards=<n> to --num-shards=${PARALLEL_JOB_COUNT}; the second sed turns each " \ " sequence into a single space.
+  commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
+  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
+    # Give this shard its own id: every --shard-id=<n> becomes --shard-id=${GPU}; the container sees only that GPU via HIP_VISIBLE_DEVICES.
+    commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
+    echo "Shard ${GPU} commands:$commands_gpu"
+    echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
+    docker run \
+        --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
+        --network=host \
+        --shm-size=16gb \
+        --group-add "$render_gid" \
+        --rm \
+        -e HIP_VISIBLE_DEVICES="${GPU}" \
+        -e HF_TOKEN \
+        -e AWS_ACCESS_KEY_ID \
+        -e AWS_SECRET_ACCESS_KEY \
+        -v "${HF_CACHE}:${HF_MOUNT}" \
+        -e "HF_HOME=${HF_MOUNT}" \
+        -e "PYTHONPATH=${MYPYTHONPATH}" \
+        --name "${container_name}_${GPU}" \
+        "${image_name}" \
+        /bin/bash -c "${commands_gpu}" \
+        |& while read -r line; do echo ">>Shard $GPU: $line"; done &
+    PIDS+=($!)  # remember the background job so it can be waited on below
+  done
+  # Wait for every background shard job (docker output is prefixed with ">>Shard <n>: " above) and collect the status `wait` reports for each.
+  for pid in "${PIDS[@]}"; do
+    wait "${pid}"
+    STATUS+=($?)
+  done
+  for st in "${STATUS[@]}"; do  # exit with the first non-zero status found, in shard order
+    if [[ ${st} -ne 0 ]]; then
+      echo "One of the processes failed with $st"
+      exit "${st}"
+    fi
+  done
+else
+  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
+  docker run \
+          --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
+          --network=host \
+          --shm-size=16gb \
+          --group-add "$render_gid" \
+          --rm \
+          -e HF_TOKEN \
+          -e AWS_ACCESS_KEY_ID \
+          -e AWS_SECRET_ACCESS_KEY \
+          -v "${HF_CACHE}:${HF_MOUNT}" \
+          -e "HF_HOME=${HF_MOUNT}" \
+          -e "PYTHONPATH=${MYPYTHONPATH}" \
+          --name "${container_name}" \
+          "${image_name}" \
+          /bin/bash -c "${commands}"
+fi
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
new file mode 100644
index 0000000000..57008e59f5
--- /dev/null
+++ b/.buildkite/test-amd.yaml
@@ -0,0 +1,53 @@
+steps:
+
+- label: "Diffusion Model Test"
+  timeout_in_minutes: 15
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export MIOPEN_DEBUG_CONV_DIRECT=0
+    - export MIOPEN_DEBUG_CONV_GEMM=0
+    - export VLLM_ROCM_USE_AITER=1
+    - export VLLM_ROCM_USE_AITER_MHA=1
+    - export VLLM_ROCM_USE_AITER_LINEAR=0
+    - export VLLM_ROCM_USE_AITER_RMSNORM=0
+    - pytest -s -v tests/e2e/offline_inference/test_t2i_model.py
+
+- label: "Diffusion Cache Backend Test"
+  timeout_in_minutes: 15
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - export MIOPEN_DEBUG_CONV_DIRECT=0
+    - export MIOPEN_DEBUG_CONV_GEMM=0
+    - export VLLM_ROCM_USE_AITER=1
+    - export VLLM_ROCM_USE_AITER_MHA=1
+    - export VLLM_ROCM_USE_AITER_LINEAR=0
+    - export VLLM_ROCM_USE_AITER_RMSNORM=0
+    - pytest -s -v tests/e2e/offline_inference/test_cache_dit.py tests/e2e/offline_inference/test_teacache.py
+
+- label: "Omni Model Test Qwen2-5-Omni"
+  timeout_in_minutes: 15
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - export MIOPEN_DEBUG_CONV_DIRECT=0
+    - export MIOPEN_DEBUG_CONV_GEMM=0
+    - export VLLM_ROCM_USE_AITER=1
+    - export VLLM_ROCM_USE_AITER_MHA=1
+    - export VLLM_ROCM_USE_AITER_LINEAR=0
+    - export VLLM_ROCM_USE_AITER_RMSNORM=0
+    - pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
diff --git a/.buildkite/test-template-amd-omni.j2 b/.buildkite/test-template-amd-omni.j2
new file mode 100644
index 0000000000..0b6eb8f54b
--- /dev/null
+++ b/.buildkite/test-template-amd-omni.j2
@@ -0,0 +1,53 @@
+{# vllm-omni customized version
+   Based on: https://github.com/vllm-project/ci-infra/blob/main/buildkite/test-template-amd.j2
+   Last synced: 2025-12-15
+   Modifications: Removed unused CUDA/NVIDIA logic, keeping only AMD tests
+#}
+{% set docker_image_amd = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT-rocm-omni" %}
+{% set default_working_dir = "/app/vllm-omni" %}
+
+  - group: "AMD Tests"
+    depends_on: ~
+    steps:
+      - label: "AMD: :docker: build image"
+        depends_on: ~
+        soft_fail: false
+        commands:
+          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+          - "docker build -f docker/Dockerfile.rocm -t {{ docker_image_amd }} --progress plain ."
+          - "docker push {{ docker_image_amd }}"
+        key: "amd-build"
+        env:
+          DOCKER_BUILDKIT: "1"
+        retry:
+          automatic:
+            - exit_status: -1  # Agent was lost
+              limit: 1
+            - exit_status: -10  # Agent was lost
+              limit: 1
+            - exit_status: 1  # Machine occasionally fail
+              limit: 1
+        agents:
+          queue: cpu_queue_premerge_us_east_1
+
+    {% for step in steps %}
+    {% if step.mirror_hardwares and mirror_hw in step.mirror_hardwares %}
+      - label: "{{ step.agent_pool }}: {{ step.label }}"
+        depends_on: amd-build
+        agents:
+          {% if step.agent_pool %}
+          queue: amd_{{ step.agent_pool }}
+          {% else %}
+          queue: amd_mi325_1
+          {% endif %}
+        command: bash .buildkite/scripts/hardware_ci/run-amd-test.sh "(command rocm-smi || true) && cd {{ (step.working_dir or default_working_dir) | safe  }} ; {{ step.command  or (step.commands | join(" && ")) | safe }}"
+        env:
+          DOCKER_BUILDKIT: "1"
+        priority: 100
+        {% if step.grade and step.grade == "Blocking" %}
+        soft_fail: false
+        {% else %}
+        soft_fail: true
+        {% endif%}
+    {% endif %}
+    {% endfor %}
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
new file mode 100644
index 0000000000..7fabb9c3c6
--- /dev/null
+++ b/docker/Dockerfile.rocm
@@ -0,0 +1,42 @@
+ARG BASE_IMAGE=rocm/vllm-dev:nightly_main_20251205
+FROM ${BASE_IMAGE}
+
+ARG COMMON_WORKDIR=/app
+ARG VLLM_VERSION=v0.12.0
+ARG PYTORCH_ROCM_ARCH="gfx942;gfx950"
+
+WORKDIR ${COMMON_WORKDIR}
+
+# Step 1: Setup - Install system dependencies
+RUN apt-get update && \
+    apt-get install -y ffmpeg && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Step 2: Reinstall vllm from source
+RUN python3 -m pip uninstall -y vllm && rm -rf vllm &&\
+    git clone https://github.com/vllm-project/vllm.git && \
+    cd vllm && \
+    git checkout ${VLLM_VERSION} && \
+    python3 -m pip install -r requirements/rocm.txt && \
+    python3 setup.py clean --all && \
+    PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} python3 setup.py develop && \
+    cd ../ && \
+    rm -rf vllm/.git
+
+RUN mkdir -p ${COMMON_WORKDIR}/vllm-omni
+
+# Step 3: Copy vllm-omni code and install without uv
+COPY . ${COMMON_WORKDIR}/vllm-omni
+RUN cd ${COMMON_WORKDIR}/vllm-omni && python3 -m pip install --no-cache-dir ".[dev]"
+
+# `GPU_ARCHS` is an environment variable that is used to set the GPU archs for the AITER.
+# This is needed to prevent the AITER automatic GPU arch detection from failing on MI325X.
+# The AITER version used in this dockerfile has issues with handling
+# the GPU archs of MI325X (CI machine) correctly. So we manually set the GPU archs here.
+# We reuse AITER_ROCM_ARCH from the base image to avoid duplication.
+ENV GPU_ARCHS=${AITER_ROCM_ARCH}
+# Create python symlink
+RUN ln -sf /usr/bin/python3 /usr/bin/python
+
+ENTRYPOINT []
diff --git a/docs/getting_started/installation/gpu.md b/docs/getting_started/installation/gpu.md
index 5956ed102d..03758fd935 100644
--- a/docs/getting_started/installation/gpu.md
+++ b/docs/getting_started/installation/gpu.md
@@ -49,11 +49,14 @@ vLLM-Omni is a Python library that supports the following GPU variants. The libr
 
 ## Set up using Docker
 
-### Build wheel from source
 
-=== "NVIDIA CUDA"
+### Build your own docker image
+
+=== "AMD ROCm"
+
+    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:build-docker"
 
-    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:build-wheel-from-source-in-docker"
+### Build wheel from source
 
 === "AMD ROCm"
 
diff --git a/docs/getting_started/installation/gpu/rocm.inc.md b/docs/getting_started/installation/gpu/rocm.inc.md
index 87d7eb27ad..887244766f 100644
--- a/docs/getting_started/installation/gpu/rocm.inc.md
+++ b/docs/getting_started/installation/gpu/rocm.inc.md
@@ -69,9 +69,6 @@ python -c "import setuptools_scm; print(setuptools_scm.get_version())"
 PYTORCH_ROCM_ARCH=gfx942 python3 setup.py develop
 ```
 
-!!! note
-    vLLM release wheels based on the branch with prefix `releases/`, not from the tag as vLLM may cherry pick bugfixes after cutting a branch.
-
 
 #### Installation of vLLM-Omni
 
@@ -110,6 +107,41 @@ export VLLM_ROCM_USE_AITER_RMSNORM=0
 
 # --8<-- [end:build-wheel-from-source-in-docker]
 
+# --8<-- [start:build-docker]
+
+#### Build docker image
+
+```bash
+DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm -t vllm-omni-rocm .
+```
+
+To cut down build time, you can specify which GPU architectures to build for:
+
+```bash
+DOCKER_BUILDKIT=1 docker build \
+  -f docker/Dockerfile.rocm \
+  --build-arg PYTORCH_ROCM_ARCH="gfx942;gfx950" \
+  -t vllm-omni-rocm .
+```
+
+#### Launch the docker image
+
+```bash
+docker run -it \
+--network=host \
+--group-add=video \
+--ipc=host \
+--cap-add=SYS_PTRACE \
+--security-opt seccomp=unconfined \
+--device /dev/kfd \
+--device /dev/dri \
+-v <path/to/model>:/app/model \
+vllm-omni-rocm \
+bash
+```
+
+# --8<-- [end:build-docker]
+
 # --8<-- [start:pre-built-images]
 
 # --8<-- [end:pre-built-images]
diff --git a/tests/e2e/__init__.py b/tests/e2e/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/e2e/offline_inference/stage_configs/rocm/qwen2_5_omni_ci.yaml b/tests/e2e/offline_inference/stage_configs/rocm/qwen2_5_omni_ci.yaml
new file mode 100644
index 0000000000..96e9d7fa72
--- /dev/null
+++ b/tests/e2e/offline_inference/stage_configs/rocm/qwen2_5_omni_ci.yaml
@@ -0,0 +1,105 @@
+# stage config for running qwen2.5-omni with architecture of OmniLLM.
+
+# The following config has been verified on 2x 24GB GPU (L4/RTX3090/RTX4090). NOTE(review): this file lives under rocm/ - confirm the verified hardware for ROCm.
+# This config is optimized for CI e2e tests.
+stage_args:
+  - stage_id: 0
+    runtime:
+      process: true            # Run this stage in a separate process
+      devices: "0"            # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
+      max_batch_size: 1
+    engine_args:
+      model_stage: thinker
+      model_arch: Qwen2_5OmniForConditionalGeneration
+      worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      max_model_len: 896
+      max_num_batched_tokens: 896
+      max_num_seqs: 1
+      gpu_memory_utilization: 0.8
+      skip_mm_profiling: true
+      enforce_eager: true  # Now we only support eager mode
+      trust_remote_code: true
+      engine_output_type: latent
+      enable_prefix_caching: false
+    is_comprehension: true
+    final_output: true
+    final_output_type: text
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 128
+      seed: 42
+      detokenize: True
+      repetition_penalty: 1.1
+  - stage_id: 1
+    runtime:
+      process: true
+      devices: "1"
+      max_batch_size: 1
+    engine_args:
+      model_stage: talker
+      model_arch: Qwen2_5OmniForConditionalGeneration
+      worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      max_model_len: 896
+      max_num_batched_tokens: 896
+      max_num_seqs: 1
+      gpu_memory_utilization: 0.8
+      skip_mm_profiling: true
+      enforce_eager: true
+      trust_remote_code: true
+      enable_prefix_caching: false
+      engine_output_type: latent
+    engine_input_source: [0]
+    custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker
+    default_sampling_params:
+      temperature: 0.9
+      top_p: 0.8
+      top_k: 40
+      max_tokens: 128
+      seed: 42
+      detokenize: True
+      repetition_penalty: 1.05
+      stop_token_ids: [8294]
+  - stage_id: 2
+    runtime:
+      process: true
+      devices: "0"            # Example: use a different GPU than the previous stage; use "0" if single GPU
+      max_batch_size: 1
+    engine_args:
+      model_stage: code2wav
+      model_arch: Qwen2_5OmniForConditionalGeneration
+      worker_cls: vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker
+      scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
+      gpu_memory_utilization: 0.15
+      enforce_eager: true
+      trust_remote_code: true
+      enable_prefix_caching: false
+      engine_output_type: audio
+    engine_input_source: [1]
+    final_output: true
+    final_output_type: audio
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 128
+      seed: 42
+      detokenize: True
+      repetition_penalty: 1.1
+
+# Top-level runtime config (concise): default windows and stage edges
+runtime:
+  enabled: true
+  defaults:
+    window_size: -1             # Simplified: trigger downstream only after full upstream completion
+    max_inflight: 1             # Simplified: process serially within each stage
+  edges:
+    - from: 0                   # thinker → talker: trigger only after receiving full input (-1)
+      to: 1
+      window_size: -1
+    - from: 1                   # talker → code2wav: trigger only after receiving full input (-1)
+      to: 2
+      window_size: -1
diff --git a/tests/e2e/offline_inference/test_qwen2_5_omni.py b/tests/e2e/offline_inference/test_qwen2_5_omni.py
index a03399aeba..63eea1ba26 100644
--- a/tests/e2e/offline_inference/test_qwen2_5_omni.py
+++ b/tests/e2e/offline_inference/test_qwen2_5_omni.py
@@ -13,7 +13,7 @@
 from vllm.envs import VLLM_USE_MODELSCOPE
 from vllm.multimodal.image import convert_image_mode
 
-from vllm_omni.utils import is_npu
+from vllm_omni.utils import is_npu, is_rocm
 
 from .conftest import OmniRunner
 from .utils import create_new_process_for_each_test
@@ -23,6 +23,9 @@
 # CI stage config optimized for 24GB GPU (L4/RTX3090) or NPU
 if is_npu():
     stage_config = str(Path(__file__).parent / "stage_configs" / "npu" / "qwen2_5_omni_ci.yaml")
+elif is_rocm():
+    # ROCm stage config optimized for MI325 GPU
+    stage_config = str(Path(__file__).parent / "stage_configs" / "rocm" / "qwen2_5_omni_ci.yaml")
 else:
     stage_config = str(Path(__file__).parent / "stage_configs" / "qwen2_5_omni_ci.yaml")
 
diff --git a/tests/e2e/offline_inference/test_t2i_model.py b/tests/e2e/offline_inference/test_t2i_model.py
index fafd98e555..61f0c7b12a 100644
--- a/tests/e2e/offline_inference/test_t2i_model.py
+++ b/tests/e2e/offline_inference/test_t2i_model.py
@@ -5,7 +5,7 @@
 import pytest
 import torch
 
-from vllm_omni.utils.platform_utils import is_npu
+from vllm_omni.utils.platform_utils import is_npu, is_rocm
 
 # ruff: noqa: E402
 REPO_ROOT = Path(__file__).resolve().parents[2]
@@ -24,6 +24,11 @@
 # TODO: When NPU support is ready, remove this branch.
 if is_npu():
     models = ["Qwen/Qwen-Image"]
+elif is_rocm():
+    # TODO: When ROCm support is ready, remove this branch.
+    # vLLM V0.11.0 has issues running riverclouds/qwen_image_random
+    # on ROCm
+    models = ["Tongyi-MAI/Z-Image-Turbo"]
 
 
 @pytest.mark.parametrize("model_name", models)
diff --git a/tests/e2e/offline_inference/utils.py b/tests/e2e/offline_inference/utils.py
index 931e7b506c..c491c10b91 100644
--- a/tests/e2e/offline_inference/utils.py
+++ b/tests/e2e/offline_inference/utils.py
@@ -195,7 +195,11 @@ def create_new_process_for_each_test(
         A decorator to run test functions in separate processes.
     """
     if method is None:
-        use_spawn = current_platform.is_rocm() or current_platform.is_xpu()
+        # TODO: Find out why `spawn` does not work correctly on ROCm:
+        # with `spawn`, the test body never runs and the test passes
+        # immediately. Until that is understood, ROCm falls back to
+        # `fork`, under which the tests actually execute.
+        use_spawn = current_platform.is_xpu()
         method = "spawn" if use_spawn else "fork"
 
     assert method in ["spawn", "fork"], "Method must be either 'spawn' or 'fork'"
diff --git a/vllm_omni/diffusion/layers/custom_op.py b/vllm_omni/diffusion/layers/custom_op.py
index 461da0d361..0bf5c4f60e 100644
--- a/vllm_omni/diffusion/layers/custom_op.py
+++ b/vllm_omni/diffusion/layers/custom_op.py
@@ -3,7 +3,7 @@
 
 import torch.nn as nn
 
-from vllm_omni.utils.platform_utils import detect_device_type
+from vllm_omni.utils.platform_utils import detect_device_type, is_rocm
 
 
 class CustomOp(nn.Module):
@@ -18,7 +18,9 @@ def __init__(self) -> None:
         self._forward_method = self.dispatch_forward()
 
     def dispatch_forward(self) -> Callable:
-        if self.is_cuda:
+        if is_rocm():
+            return self.forward_hip
+        elif self.is_cuda:
             return self.forward_cuda
         else:
             return self.forward_native
@@ -36,3 +38,7 @@ def forward_native(self, *args, **kwargs):
 
     def forward_cuda(self, *args, **kwargs):
         raise NotImplementedError
+
+    def forward_hip(self, *args, **kwargs):
+        # By default, we assume that HIP ops are compatible with CUDA ops.
+        return self.forward_cuda(*args, **kwargs)
diff --git a/vllm_omni/diffusion/layers/rope.py b/vllm_omni/diffusion/layers/rope.py
index acc0158fc6..528f2425ef 100644
--- a/vllm_omni/diffusion/layers/rope.py
+++ b/vllm_omni/diffusion/layers/rope.py
@@ -1,8 +1,13 @@
+from importlib.util import find_spec
+
 import torch
 from einops import rearrange, repeat
+from vllm.logger import init_logger
 
 from vllm_omni.diffusion.layers.custom_op import CustomOp
 
+logger = init_logger(__name__)
+
 
 def rotate_half(x, interleaved=False):
     if not interleaved:
@@ -45,6 +50,11 @@ def __init__(
         super().__init__()
         self.is_neox_style = is_neox_style
         self.interleaved = not is_neox_style
+        self.apply_rotary_emb_flash_attn = None
+        if find_spec("flash_attn") is not None:
+            from flash_attn.ops.triton.rotary import apply_rotary
+
+            self.apply_rotary_emb_flash_attn = apply_rotary
 
     def forward_cuda(
         self,
@@ -66,6 +76,27 @@ def forward_cuda(
             interleaved=self.interleaved,
         )
 
+    def forward_hip(  # ROCm/HIP variant of the rotary-embedding forward pass
+        self,
+        x: torch.Tensor,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+    ) -> torch.Tensor:
+        if self.apply_rotary_emb_flash_attn is None:  # flash_attn was not found when this layer was constructed
+            return self.forward_cuda(x, cos, sin)
+        # NOTE(review): only batch element 0 of cos/sin is kept below - assumes they are identical across the batch; confirm.
+        if cos.dim() == 3:
+            # (B, S, D/2) -> (S, D/2)
+            cos = cos[0]
+            sin = sin[0]
+        # `apply_rotary` imported from flash_attn.ops.triton.rotary in __init__.
+        return self.apply_rotary_emb_flash_attn(
+            x,
+            cos,
+            sin,
+            interleaved=self.interleaved,
+        )
+
     def forward_native(
         self,
         x: torch.Tensor,
diff --git a/vllm_omni/entrypoints/utils.py b/vllm_omni/entrypoints/utils.py
index d86ded250b..82c8a4d34a 100644
--- a/vllm_omni/entrypoints/utils.py
+++ b/vllm_omni/entrypoints/utils.py
@@ -8,7 +8,7 @@
 from vllm.logger import init_logger
 from vllm.transformers_utils.config import get_config
 
-from vllm_omni.utils import detect_device_type
+from vllm_omni.utils import detect_device_type, is_rocm
 
 # Get the project root directory (2 levels up from this file)
 PROJECT_ROOT = Path(__file__).parent.parent.parent
@@ -86,8 +86,10 @@ def resolve_model_config_path(model: str) -> str:
     device_type = detect_device_type()
 
     # Try device-specific config first
-    if device_type != "cuda":
+    if device_type != "cuda" or is_rocm():
         device_config_file = f"vllm_omni/model_executor/stage_configs/{device_type}/{model_type}.yaml"
+        if is_rocm():
+            device_config_file = f"vllm_omni/model_executor/stage_configs/rocm/{model_type}.yaml"
         device_config_path = PROJECT_ROOT / device_config_file
         if os.path.exists(device_config_path):
             return str(device_config_path)
diff --git a/vllm_omni/model_executor/stage_configs/rocm/qwen2_5_omni.yaml b/vllm_omni/model_executor/stage_configs/rocm/qwen2_5_omni.yaml
new file mode 100644
index 0000000000..c646aa76a9
--- /dev/null
+++ b/vllm_omni/model_executor/stage_configs/rocm/qwen2_5_omni.yaml
@@ -0,0 +1,102 @@
+# stage config for running qwen2.5-omni with architecture of OmniLLM.
+
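+# The following config was originally verified on 2x H100-80G GPUs. TODO: confirm the ROCm (AMD) hardware and GPU count (the stages below use devices "0", "1" and "2").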
+stage_args:
+  - stage_id: 0
+    runtime:
+      process: true            # Run this stage in a separate process
+      devices: "0"            # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
+      max_batch_size: 1
+    engine_args:
+      model_stage: thinker
+      model_arch: Qwen2_5OmniForConditionalGeneration
+      worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      gpu_memory_utilization: 0.8
+      enforce_eager: true  # Only eager mode is supported for now
+      trust_remote_code: true
+      engine_output_type: latent
+      enable_prefix_caching: false
+      max_num_batched_tokens: 32768
+    is_comprehension: true
+    final_output: true
+    final_output_type: text
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 2048
+      seed: 42
+      detokenize: True
+      repetition_penalty: 1.1
+
+  - stage_id: 1
+    runtime:
+      process: true
+      devices: "1"
+      max_batch_size: 1
+    engine_args:
+      model_stage: talker
+      model_arch: Qwen2_5OmniForConditionalGeneration
+      worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      gpu_memory_utilization: 0.8
+      enforce_eager: true
+      trust_remote_code: true
+      enable_prefix_caching: false
+      max_num_batched_tokens: 32768
+      engine_output_type: latent
+    engine_input_source: [0]
+    custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker
+    default_sampling_params:
+      temperature: 0.9
+      top_p: 0.8
+      top_k: 40
+      max_tokens: 2048
+      seed: 42
+      detokenize: True
+      repetition_penalty: 1.05
+      stop_token_ids: [8294]
+
+  - stage_id: 2
+    runtime:
+      process: true
+      devices: "2"            # Example: use a different GPU than the previous stage; use "0" if single GPU
+      max_batch_size: 1
+    engine_args:
+      model_stage: code2wav
+      model_arch: Qwen2_5OmniForConditionalGeneration
+      worker_cls: vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker
+      scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
+      gpu_memory_utilization: 0.15
+      enforce_eager: true
+      trust_remote_code: true
+      enable_prefix_caching: false
+      max_num_batched_tokens: 32768
+      engine_output_type: audio
+    engine_input_source: [1]
+    final_output: true
+    final_output_type: audio
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 2048
+      seed: 42
+      detokenize: True
+      repetition_penalty: 1.1
+
+# Top-level runtime config (concise): default windows and stage edges
+runtime:
+  enabled: true
+  defaults:
+    window_size: -1             # Simplified: trigger downstream only after full upstream completion
+    max_inflight: 1             # Simplified: process serially within each stage
+
+  edges:
+    - from: 0                   # thinker → talker: trigger only after receiving full input (-1)
+      to: 1
+      window_size: -1
+    - from: 1                   # talker → code2wav: trigger only after receiving full input (-1)
+      to: 2
+      window_size: -1
diff --git a/vllm_omni/model_executor/stage_configs/rocm/qwen3_omni_moe.yaml b/vllm_omni/model_executor/stage_configs/rocm/qwen3_omni_moe.yaml
new file mode 100644
index 0000000000..73f65ecb55
--- /dev/null
+++ b/vllm_omni/model_executor/stage_configs/rocm/qwen3_omni_moe.yaml
@@ -0,0 +1,97 @@
+# Stage config for running Qwen3-Omni-MoE with 3-stage architecture
+# Stage 0: Thinker (multimodal understanding + text generation)
+# Stage 1: Talker (text embeddings → 8-layer RVQ codec codes)
+# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform)
+
+# The following config was originally verified on 2x H100-80G GPUs. TODO: confirm on the ROCm (AMD) hardware this file targets.
+stage_args:
+  - stage_id: 0
+    runtime:
+      devices: "0,1"
+      max_batch_size: 1
+    engine_args:
+      model_stage: thinker
+      model_arch: Qwen3OmniMoeForConditionalGeneration
+      worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      gpu_memory_utilization: 0.6
+      enforce_eager: true
+      trust_remote_code: true
+      engine_output_type: latent  # Output hidden states for talker
+      distributed_executor_backend: "mp"
+      enable_prefix_caching: false
+      max_num_batched_tokens: 32768
+      hf_config_name: thinker_config
+      tensor_parallel_size: 2
+    final_output: true
+    final_output_type: text
+    is_comprehension: true
+    default_sampling_params:
+      temperature: 0.4
+      top_p: 0.9
+      top_k: 1
+      max_tokens: 2048
+      seed: 42
+      detokenize: True
+      repetition_penalty: 1.05
+
+  - stage_id: 1
+    runtime:
+      devices: "1"
+      max_batch_size: 1
+    engine_args:
+      model_stage: talker
+      model_arch: Qwen3OmniMoeForConditionalGeneration
+      worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      gpu_memory_utilization: 0.3
+      enforce_eager: true
+      trust_remote_code: true
+      engine_output_type: latent  # Output codec codes for code2wav
+      # tensor_parallel_size: 2
+      enable_prefix_caching: false
+      max_num_batched_tokens: 32768
+      distributed_executor_backend: "mp"
+      hf_config_name: talker_config
+    engine_input_source: [0]
+    custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker
+    # final_output: true
+    # final_output_type: text
+    default_sampling_params:
+      temperature: 0.9
+      top_k: 50
+      max_tokens: 4096
+      seed: 42
+      detokenize: False
+      repetition_penalty: 1.05
+      stop_token_ids: [2150]
+
+  - stage_id: 2
+    runtime:
+      devices: "0"
+      max_batch_size: 1
+    engine_args:
+      model_stage: code2wav
+      model_arch: Qwen3OmniMoeForConditionalGeneration
+      worker_cls: vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker
+      scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
+      enforce_eager: true
+      trust_remote_code: true
+      enable_prefix_caching: false
+      engine_output_type: audio  # Final output: audio waveform
+      gpu_memory_utilization: 0.1
+      distributed_executor_backend: "mp"
+      max_num_batched_tokens: 1000000
+      hf_config_name: thinker_config
+    engine_input_source: [1]
+    custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav
+    final_output: true
+    final_output_type: audio
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 65536
+      seed: 42
+      detokenize: True
+      repetition_penalty: 1.1
diff --git a/vllm_omni/utils/__init__.py b/vllm_omni/utils/__init__.py
index 50dbb478d9..34b2545db5 100644
--- a/vllm_omni/utils/__init__.py
+++ b/vllm_omni/utils/__init__.py
@@ -2,10 +2,12 @@
     detect_device_type,
     get_device_control_env_var,
     is_npu,
+    is_rocm,
 )
 
 __all__ = [
     "detect_device_type",
     "get_device_control_env_var",
     "is_npu",
+    "is_rocm",
 ]
diff --git a/vllm_omni/utils/platform_utils.py b/vllm_omni/utils/platform_utils.py
index 385b1a8f36..5f8259ab83 100644
--- a/vllm_omni/utils/platform_utils.py
+++ b/vllm_omni/utils/platform_utils.py
@@ -19,6 +19,10 @@ def is_npu() -> bool:
     return detect_device_type() == "npu"
 
 
+def is_rocm() -> bool:  # True when current_platform.is_rocm() reports a ROCm platform
+    return current_platform.is_rocm()  # asks the platform object directly, unlike is_npu, which compares detect_device_type()
+
+
 def get_device_control_env_var() -> str:
     """Return the environment variable name for device visibility control."""
     if hasattr(current_platform, "device_control_env_var"):