Merged

59 commits (changes shown from 47 commits):
- `3059e27` init and registry (JaredforReal, Jan 8, 2026)
- `c0a7684` implement glm_image_transformer.py (JaredforReal, Jan 8, 2026)
- `800cea4` update transformer (JaredforReal, Jan 9, 2026)
- `8664695` init pipeline_glm_image.py (JaredforReal, Jan 9, 2026)
- `b88b4b2` init pipeline_glm_image.py (JaredforReal, Jan 9, 2026)
- `b9108f4` remove pre process (JaredforReal, Jan 9, 2026)
- `371afd5` add check_input(), implement CFG parallel in diffuse(), align generat… (JaredforReal, Jan 9, 2026)
- `3d4f5f2` fix check_input(prompt_embed), add KVCache for Image Edit (JaredforReal, Jan 9, 2026)
- `0810dae` print out vllm version (Jan 13, 2026)
- `8e36c51` update model config (tzhouam, Jan 13, 2026)
- `7f704d5` update worker (tzhouam, Jan 13, 2026)
- `4afb2ff` update one import in AsyncOmniLLM (not finish all, but can run) (tzhouam, Jan 13, 2026)
- `cb2e053` update Qwen3 Omni ViT init based on updated interface (the update for… (tzhouam, Jan 13, 2026)
- `e052c4a` Remove unnecessary override for OmniRequestState (the update for Omni… (tzhouam, Jan 13, 2026)
- `c08dcdd` update model runner dummy run (tzhouam, Jan 13, 2026)
- `166fc78` update ar scheduler (tzhouam, Jan 13, 2026)
- `4db8f0b` update _preprocess, execute model and sample_tokens for AR Model Runner (tzhouam, Jan 13, 2026)
- `63a69a5` debug AR Scheduler (tzhouam, Jan 13, 2026)
- `5bcdb43` update OmniGPUModelRunner._update_states (tzhouam, Jan 13, 2026)
- `2a0f72f` update the offline LLM request sorting due to changed requested id fo… (tzhouam, Jan 14, 2026)
- `f7c8af9` update Qwen3 Omni to fit with the engine core logic (tzhouam, Jan 14, 2026)
- `f12e0af` Merge PR #724 (tzhouam, Jan 14, 2026)
- `e2462d2` update generation model runner (tzhouam, Jan 14, 2026)
- `d89e3c4` debug GLM-Image Model (tzhouam, Jan 14, 2026)
- `f269e0e` remove deleted args from doc string (tzhouam, Jan 14, 2026)
- `12b69f4` [Model][Rebase] Add GLM-Image Model and Partial Rebase to v0.14.0 (Su… (tzhouam, Jan 14, 2026)
- `2773996` disable async scheduling for generation models, avoiding inconsistenc… (tzhouam, Jan 15, 2026)
- `b9ea754` Update Qwen 3 Omni (tzhouam, Jan 15, 2026)
- `b938725` [Fix] GLM Image (#799) (JaredforReal, Jan 15, 2026)
- `d28b059` support online serving for Qwen3 Omni (tzhouam, Jan 16, 2026)
- `85f5269` Merge remote-tracking branch 'remotes/origin/dev/rebase_0.14.0' into … (tzhouam, Jan 16, 2026)
- `236f73e` fix pre-commit (tzhouam, Jan 16, 2026)
- `14e83e7` inherit engine outputs (tzhouam, Jan 16, 2026)
- `b00685c` supporting audio in video(not finished) (tzhouam, Jan 16, 2026)
- `67b279a` Update Qwen2.5 Omni model to version 0.14, adding support for image a… (tzhouam, Jan 17, 2026)
- `156cac7` debug qwen 2.5 Omni (tzhouam, Jan 19, 2026)
- `30880aa` update doc (tzhouam, Jan 19, 2026)
- `bd22edd` rebase to vllm 0.14.0 (tzhouam, Jan 19, 2026)
- `58246d2` Merge branch 'main' into dev/rebase-0.14.0 (tzhouam, Jan 19, 2026)
- `4a7d732` unify query type (tzhouam, Jan 19, 2026)
- `cfd5d32` fix build doc (tzhouam, Jan 19, 2026)
- `52d20a7` Dev/rebase 0.14.0 (#813) (tzhouam, Jan 19, 2026)
- `a296fda` update test import (tzhouam, Jan 20, 2026)
- `9b39782` Merge branch 'dev/rebase_0.14.0' of https://github.com/vllm-project/v… (tzhouam, Jan 20, 2026)
- `8ab20a9` update version from 0.14.0rc2 to 0.14.0 (tzhouam, Jan 20, 2026)
- `5822ab6` set vllm config for all CI (tzhouam, Jan 20, 2026)
- `70e83e5` update CI (tzhouam, Jan 20, 2026)
- `aac988a` update ci (tzhouam, Jan 20, 2026)
- `ea46e59` fit CI for V0.14.0 (Jan 20, 2026)
- `8cee7ca` Add openai-whisper dependency and set multiprocessing method in tests (tzhouam, Jan 20, 2026)
- `3fba90e` Add async_scheduling option to qwen3_omni_ci.yaml configuration (tzhouam, Jan 20, 2026)
- `8d3a006` Update vllm dependency version to 0.14.0 in simple_test.sh (tzhouam, Jan 20, 2026)
- `03a46fb` fix pre commit (tzhouam, Jan 20, 2026)
- `3240917` fix simple unit test (tzhouam, Jan 20, 2026)
- `9d89dbc` fix simple unit test (tzhouam, Jan 20, 2026)
- `e886494` fix simple unit test (tzhouam, Jan 20, 2026)
- `2c7d558` fix build doc (tzhouam, Jan 20, 2026)
- `9a53fdc` Update and Qwen 3 config offline (tzhouam, Jan 20, 2026)
- `d24d9fe` Add async_scheduling option to qwen3_omni_ci.yaml (tzhouam, Jan 20, 2026)
README.md (2 changes: 1 addition & 1 deletion)
@@ -15,7 +15,7 @@ Easy, fast, and cheap omni-modality model serving for everyone
---

*Latest News* 🔥

- [2026/01] We released [0.14.0rc1](https://github.com/vllm-project/vllm-omni/releases/tag/v0.14.0).
- [2026/01] We released [0.12.0rc1](https://github.com/vllm-project/vllm-omni/releases/tag/v0.12.0rc1) - a major RC milestone focused on maturing the diffusion stack, strengthening OpenAI-compatible serving, expanding omni-model coverage, and improving stability across platforms (GPU/NPU/ROCm), please check our latest [design](https://docs.google.com/presentation/d/1qv4qMW1rKAqDREMXiUDLIgqqHQe7TDPj/edit?usp=sharing&ouid=110473603432222024453&rtpof=true&sd=true).
- [2025/11] vLLM community officially released [vllm-project/vllm-omni](https://github.com/vllm-project/vllm-omni) in order to support omni-modality models serving.

docker/Dockerfile.ci (2 changes: 1 addition & 1 deletion)
@@ -1,5 +1,5 @@
ARG VLLM_BASE_IMAGE=vllm/vllm-openai
ARG VLLM_BASE_TAG=v0.12.0
ARG VLLM_BASE_TAG=v0.14.0
FROM ${VLLM_BASE_IMAGE}:${VLLM_BASE_TAG}
ARG APP_DIR=/workspace/vllm-omni
WORKDIR ${APP_DIR}
docker/Dockerfile.rocm (2 changes: 1 addition & 1 deletion)
@@ -2,7 +2,7 @@ ARG BASE_IMAGE=rocm/vllm-dev:nightly_main_20251205
FROM ${BASE_IMAGE}

ARG COMMON_WORKDIR=/app
ARG VLLM_VERSION=v0.12.0
ARG VLLM_VERSION=v0.14.0
ARG PYTORCH_ROCM_ARCH="gfx942;gfx950"

WORKDIR ${COMMON_WORKDIR}
docs/configuration/README.md (2 changes: 1 addition & 1 deletion)
@@ -2,7 +2,7 @@

This section lists the most common options for running vLLM-Omni.

For options within a vLLM Engine. Please refer to [vLLM Configuration](https://docs.vllm.ai/en/v0.12.0/configuration/index.html)
For options within a vLLM Engine. Please refer to [vLLM Configuration](https://docs.vllm.ai/en/v0.14.0/configuration/index.html)

Currently, the main options are maintained by stage configs for each model.

docs/design/architecture_overview.md (2 changes: 1 addition & 1 deletion)
@@ -192,4 +192,4 @@ curl -sS -X POST http://localhost:8091/v1/chat/completions \
}
```

For more usages, please refer to [examples](../examples/README.md).
For more usages, please refer to [examples](https://github.com/vllm-project/vllm-omni/tree/main/examples).
docs/getting_started/installation/gpu/cuda.inc.md (6 changes: 3 additions & 3 deletions)
@@ -58,11 +58,11 @@ If you want to check, modify or debug with source code of vLLM, install the libr
```bash
git clone https://github.com/vllm-project/vllm.git
cd vllm
git checkout v0.12.0
git checkout v0.14.0
```
Set up environment variables to get pre-built wheels. If there are internet problems, just download the whl file manually. And set `VLLM_PRECOMPILED_WHEEL_LOCATION` as your local absolute path of whl file.
```bash
export VLLM_PRECOMPILED_WHEEL_LOCATION=https://github.com/vllm-project/vllm/releases/download/v0.12.0/vllm-0.12.0-cp38-abi3-manylinux_2_31_x86_64.whl
export VLLM_PRECOMPILED_WHEEL_LOCATION=https://github.com/vllm-project/vllm/releases/download/v0.14.0/vllm-0.14.0-cp38-abi3-manylinux_2_31_x86_64.whl
```
Install vllm with command below (If you have no existing PyTorch).
```bash
@@ -93,7 +93,7 @@ docker run --runtime nvidia --gpus 2 \
--env "HF_TOKEN=$HF_TOKEN" \
-p 8091:8091 \
--ipc=host \
vllm/vllm-omni:v0.12.0rc1 \
vllm/vllm-omni:v0.14.0 \
--model Qwen/Qwen3-Omni-30B-A3B-Instruct --port 8091
```

docs/getting_started/installation/gpu/rocm.inc.md (4 changes: 2 additions & 2 deletions)
@@ -68,7 +68,7 @@ docker run -it \
-v <path/to/model>:/app/model \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HF_TOKEN=$HF_TOKEN" \
vllm/vllm-omni-rocm:v0.12.0rc1 \
vllm/vllm-omni-rocm:v0.14.0 \
vllm serve --model Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091
```

@@ -86,7 +86,7 @@ docker run -it \
-v <path/to/model>:/app/model \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HF_TOKEN=$HF_TOKEN" \
vllm/vllm-omni-rocm:v0.12.0rc1 \
vllm/vllm-omni-rocm:v0.14.0 \
bash
```

docs/getting_started/installation/npu/npu.inc.md (8 changes: 4 additions & 4 deletions)
@@ -13,10 +13,10 @@ export DEVICE0=/dev/davinci0
export DEVICE1=/dev/davinci1
# Update the vllm-ascend image
# Atlas A2:
# export IMAGE=quay.io/ascend/vllm-ascend:v0.12.0rc1
# export IMAGE=quay.io/ascend/vllm-ascend:v0.14.0
# Atlas A3:
# export IMAGE=quay.io/ascend/vllm-ascend:v0.12.0rc1-a3
export IMAGE=quay.io/ascend/vllm-ascend:v0.12.0rc1
# export IMAGE=quay.io/ascend/vllm-ascend:v0.14.0-a3
export IMAGE=quay.io/ascend/vllm-ascend:v0.14.0
docker run --rm \
--name vllm-omni-npu \
--shm-size=1g \
@@ -42,7 +42,7 @@ source ~/.bashrc

# Inside the container, install vLLM-Omni from source
cd /vllm-workspace
git clone -b v0.12.0rc1 https://github.com/vllm-project/vllm-omni.git
git clone -b v0.14.0 https://github.com/vllm-project/vllm-omni.git
cd vllm-omni
pip install -v -e .
export VLLM_WORKER_MULTIPROC_METHOD=spawn
examples/offline_inference/qwen2_5_omni/end2end.py (6 changes: 3 additions & 3 deletions)
@@ -278,9 +278,9 @@ def get_audio_query(question: str = None, audio_path: str | None = None, samplin


query_map = {
"mixed_modalities": get_mixed_modalities_query,
"use_mixed_modalities": get_mixed_modalities_query,
"use_audio_in_video": get_use_audio_in_video_query,
"multi_audios": get_multi_audios_query,
"use_multi_audios": get_multi_audios_query,
"use_image": get_image_query,
"use_video": get_video_query,
"use_audio": get_audio_query,
@@ -434,7 +434,7 @@ def parse_args():
"--query-type",
"-q",
type=str,
default="mixed_modalities",
default="use_mixed_modalities",
choices=query_map.keys(),
help="Query type.",
)
@@ -1,2 +1,2 @@
python end2end.py --output-wav output_audio \
--query-type mixed_modalities
--query-type use_mixed_modalities
examples/offline_inference/qwen3_omni/end2end.py (73 changes: 69 additions & 4 deletions)
@@ -12,6 +12,7 @@
import librosa
import numpy as np
import soundfile as sf
import vllm
from PIL import Image
from vllm import SamplingParams
from vllm.assets.audio import AudioAsset
@@ -225,18 +226,76 @@ def get_multi_audios_query() -> QueryResult:
)


# def get_use_audio_in_video_query(video_path: str | None = None) -> QueryResult:
# question = (
# "Describe the content of the video in details, then convert what the "
# "baby say into text."
# )
# prompt = (
# f"<|im_start|>system\n{default_system}<|im_end|>\n"
# "<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>"
# f"{question}<|im_end|>\n"
# f"<|im_start|>assistant\n"
# )
# if video_path:
# if not os.path.exists(video_path):
# raise FileNotFoundError(f"Video file not found: {video_path}")
# video_frames = video_to_ndarrays(video_path, num_frames=16)
# else:
# video_frames = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays
# audio = extract_video_audio(video_path, sampling_rate=16000)
# return QueryResult(
# inputs={
# "prompt": prompt,
# "multi_modal_data": {
# "video": video_frames,
# "audio": audio,
# },
# "mm_processor_kwargs": {
# "use_audio_in_video": True,
# },
# },
# limit_mm_per_prompt={"audio": 1, "video": 1},
# )
def get_use_audio_in_video_query() -> QueryResult:
question = "Describe the content of the video in details, then convert what the baby say into text."
prompt = (
f"<|im_start|>system\n{default_system}<|im_end|>\n"
"<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>"
f"{question}<|im_end|>\n"
f"<|im_start|>assistant\n"
)
asset = VideoAsset(name="baby_reading", num_frames=16)
audio = asset.get_audio(sampling_rate=16000)
return QueryResult(
inputs={
"prompt": prompt,
"multi_modal_data": {
"video": asset.np_ndarrays,
"audio": audio,
},
"mm_processor_kwargs": {
"use_audio_in_video": True,
},
},
limit_mm_per_prompt={"audio": 1, "video": 1},
)


query_map = {
"text": get_text_query,
"use_audio": get_audio_query,
"use_image": get_image_query,
"use_video": get_video_query,
"multi_audios": get_multi_audios_query,
"mixed_modalities": get_mixed_modalities_query,
"use_multi_audios": get_multi_audios_query,
"use_mixed_modalities": get_mixed_modalities_query,
"use_audio_in_video": get_use_audio_in_video_query,
}


def main(args):
model_name = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
print("=" * 20, "\n", f"vllm version: {vllm.__version__}", "\n", "=" * 20)

# Get paths from args
video_path = getattr(args, "video_path", None)
@@ -259,6 +318,10 @@ def main(args):
num_frames=getattr(args, "num_frames", 16),
sampling_rate=getattr(args, "sampling_rate", 16000),
)
elif args.query_type == "multi_audios":
query_result = query_func()
elif args.query_type == "use_audio_in_video":
query_result = query_func()
else:
query_result = query_func()

@@ -270,7 +333,7 @@
)

thinker_sampling_params = SamplingParams(
temperature=0.4,
temperature=0.9,
top_p=0.9,
top_k=-1,
max_tokens=1200,
@@ -331,6 +394,8 @@ def main(args):
total_requests = len(prompts)
processed_count = 0

print(f"query type: {args.query_type}")

for stage_outputs in omni_generator:
if stage_outputs.final_output_type == "text":
for output in stage_outputs.request_output:
@@ -385,7 +450,7 @@ def parse_args():
"--query-type",
"-q",
type=str,
default="mixed_modalities",
default="use_mixed_modalities",
choices=query_map.keys(),
help="Query type.",
)
@@ -305,13 +305,34 @@ def get_multi_audios_query(custom_prompt: str | None = None):
}


def get_use_audio_in_video_query(
video_path: str | None = None,
audio_path: str | None = None,
custom_prompt: str | None = None,
):
question = custom_prompt or (
"Describe the content of the video in details, then convert what the baby say into text."
)
video_url = get_video_url_from_path(video_path)
audio_url = get_audio_url_from_path(audio_path)
return {
"role": "user",
"content": [
{"type": "video_url", "video_url": {"url": video_url}},
{"type": "audio_url", "audio_url": {"url": audio_url}},
{"type": "text", "text": question},
],
}


query_map = {
"text": get_text_query,
"use_audio": get_audio_query,
"use_image": get_image_query,
"use_video": get_video_query,
"use_mixed_modalities": get_mixed_modalities_query,
"use_multi_audios": get_multi_audios_query,
"use_audio_in_video": get_use_audio_in_video_query,
}


@@ -372,6 +393,12 @@ def run_multimodal_generation(args) -> None:
prompt = query_func(audio_path=audio_path, custom_prompt=custom_prompt)
elif args.query_type == "text":
prompt = query_func(custom_prompt=custom_prompt)
elif args.query_type == "use_audio_in_video":
prompt = query_func(
video_path=video_path,
audio_path=audio_path,
custom_prompt=custom_prompt,
)
else:
prompt = query_func()

tests/conftest.py (19 changes: 18 additions & 1 deletion)
@@ -13,11 +13,28 @@
import whisper
import yaml
from vllm.logger import init_logger
from vllm.utils import get_open_port
from vllm.utils.network_utils import get_open_port

logger = init_logger(__name__)


@pytest.fixture(autouse=True)
def default_vllm_config():
"""Set a default VllmConfig for all tests.

This fixture is auto-used for all tests to ensure that any test
that directly instantiates vLLM CustomOps (e.g., RMSNorm, LayerNorm)
or model components has the required VllmConfig context.

This fixture is required for vLLM 0.14.0+ where CustomOp initialization
requires a VllmConfig context set via set_current_vllm_config().
"""
from vllm.config import VllmConfig, set_current_vllm_config

with set_current_vllm_config(VllmConfig()):
yield


@pytest.fixture(autouse=True)
def clean_gpu_memory_between_tests():
if os.getenv("VLLM_TEST_CLEAN_GPU_MEMORY", "0") != "1":
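As context for the new `default_vllm_config` fixture above, the sketch below shows the kind of test it protects. It is not part of the diff; the `RMSNorm` import path, constructor arguments, and behavior without an active config are assumptions based on the fixture's docstring.

```python
# Hypothetical illustration, not part of this PR. Assumes RMSNorm is a vLLM
# CustomOp whose construction needs an active VllmConfig (per the docstring
# above, required for vLLM 0.14.0+) and that the import path below is correct.
import torch
from vllm.config import VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.layernorm import RMSNorm


def test_rmsnorm_forward_without_autouse_fixture():
    # Without the autouse default_vllm_config fixture, a test must open the
    # config context itself before instantiating any CustomOp.
    with set_current_vllm_config(VllmConfig()):
        layer = RMSNorm(hidden_size=64)
        out = layer(torch.randn(2, 64))
    assert out.shape == (2, 64)
```

With the autouse fixture in place, the explicit `with set_current_vllm_config(...)` block in each test becomes unnecessary.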
tests/diffusion/attention/test_sequence_parallel.py (5 changes: 5 additions & 0 deletions)
@@ -170,6 +170,11 @@ def test_sequence_parallel(
"""Test Ulysses attention by comparing with and without SP enabled."""
sequence_parallel_size = ulysses_degree * ring_degree

# Skip if not enough GPUs available
available_gpus = torch_device.device_count()
if available_gpus < sequence_parallel_size:
pytest.skip(f"Test requires {sequence_parallel_size} GPUs but only {available_gpus} available")

# Create temporary files to share results between processes
with tempfile.NamedTemporaryFile(delete=False, suffix=".pkl") as f:
baseline_output_file = f.name
tests/diffusion/distributed/test_comm.py (15 changes: 15 additions & 0 deletions)
@@ -48,6 +48,11 @@ def test_4d_identity(
use_sync: bool,
):
"""Test that two consecutive all-to-all operations return the original input."""
# Skip if not enough GPUs available
available_gpus = torch_device.device_count()
if available_gpus < world_size:
pytest.skip(f"Test requires {world_size} GPUs but only {available_gpus} available")

# Ensure num_heads is divisible by world_size
if num_heads % world_size != 0:
pytest.skip(f"num_heads ({num_heads}) not divisible by world_size ({world_size})")
@@ -177,6 +182,11 @@ def test_5d_identity(
use_sync: bool,
):
"""Test that two consecutive all-to-all operations return the original input."""
# Skip if not enough GPUs available
available_gpus = torch_device.device_count()
if available_gpus < world_size:
pytest.skip(f"Test requires {world_size} GPUs but only {available_gpus} available")

# Ensure num_heads is divisible by world_size
if num_heads % world_size != 0:
pytest.skip(f"num_heads ({num_heads}) not divisible by world_size ({world_size})")
@@ -305,6 +315,11 @@ def test_ring_p2p(
head_size: int,
):
"""Test Ring P2P communication (send_recv)."""
# Skip if not enough GPUs available
available_gpus = torch_device.device_count()
if available_gpus < world_size:
pytest.skip(f"Test requires {world_size} GPUs but only {available_gpus} available")

torch.multiprocessing.spawn(
_test_ring_p2p_worker,
args=(world_size, dtype, batch_size, num_heads, head_size),
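The GPU-count guard added to `test_4d_identity`, `test_5d_identity`, and `test_ring_p2p` above is identical in each test; a small helper along these lines (a sketch only, not part of the diff, and assuming the tests' `torch_device` alias resolves to `torch.cuda`) would keep the check in one place:

```python
# Hypothetical helper, not part of this PR.
import pytest
import torch


def skip_if_insufficient_gpus(required: int) -> None:
    """Skip the calling test when fewer than `required` CUDA devices are visible."""
    available = torch.cuda.device_count()
    if available < required:
        pytest.skip(f"Test requires {required} GPUs but only {available} available")
```

Each test would then call `skip_if_insufficient_gpus(world_size)` before spawning its workers.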
tests/e2e/offline_inference/test_qwen2_5_omni.py (4 changes: 2 additions & 2 deletions)
@@ -35,7 +35,7 @@

@pytest.mark.core_model
@pytest.mark.parametrize("test_config", test_params)
@create_new_process_for_each_test()
@create_new_process_for_each_test("spawn")
def test_mixed_modalities_to_audio(omni_runner: type[OmniRunner], test_config: tuple[str, str]) -> None:
"""Test processing audio, image, and video together, generating audio output."""
model, stage_config_path = test_config
@@ -95,7 +95,7 @@ def test_mixed_modalities_to_audio(omni_runner: type[OmniRunner], test_config: t

@pytest.mark.core_model
@pytest.mark.parametrize("test_config", test_params)
@create_new_process_for_each_test()
@create_new_process_for_each_test("spawn")
def test_mixed_modalities_to_text_only(omni_runner: type[OmniRunner], test_config: tuple[str, str]) -> None:
"""Test processing audio, image, and video together, generating audio output."""
model, stage_config_path = test_config