[Test] Add example test cases for omni online #1086
Merged
Commits (11)
d64e604  Add omni example test cases
c2f8b06  add example test
5fdb4b1  Merge branch 'vllm-project:main' into example (yenuo26)
1d4f6a8  pre-commit
47c1e7d  pre-commit
84f6e59  Update tests/examples/online_serving/test_qwen2_5_omni.py (hsliuustc0106)
33727c9  Merge branch 'main' into example (yenuo26)
9483188  Refactor tests for improved clarity and performance (yenuo26)
c024f15  Merge branch 'example' of https://github.com/yenuo26/vllm-omni into e… (yenuo26)
e9d61e8  Update nightly test configuration to include additional Omni Model te… (yenuo26)
a8271c9  Merge branch 'main' into example (yenuo26)
The diff adds one new file (106 lines), a YAML stage config:

```yaml
# Stage config for running qwen2.5-omni with the OmniLLM architecture.
#
# The following config has been verified on 2x 24GB GPUs (L4/RTX3090/RTX4090).
# This config is optimized for CI e2e tests.
stage_args:
  - stage_id: 0
    runtime:
      process: true  # Run this stage in a separate process
      devices: "0"   # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
      max_batch_size: 1
    engine_args:
      model_stage: thinker
      model_arch: Qwen2_5OmniForConditionalGeneration
      worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
      max_model_len: 32768
      max_num_batched_tokens: 32768
      max_num_seqs: 1
      gpu_memory_utilization: 0.8
      skip_mm_profiling: true
      enforce_eager: true  # Only eager mode is supported for now
      trust_remote_code: true
      engine_output_type: latent
      enable_prefix_caching: false
      is_comprehension: true
      final_output: true
      final_output_type: text
    default_sampling_params:
      temperature: 0.0
      top_p: 1.0
      top_k: -1
      max_tokens: 128
      seed: 42
      detokenize: true
      repetition_penalty: 1.1
  - stage_id: 1
    runtime:
      process: true
      devices: "1"
      max_batch_size: 1
    engine_args:
      model_stage: talker
      model_arch: Qwen2_5OmniForConditionalGeneration
      worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
      max_model_len: 32768
      max_num_batched_tokens: 32768
      max_num_seqs: 1
      gpu_memory_utilization: 0.8
      skip_mm_profiling: true
      enforce_eager: true
      trust_remote_code: true
      enable_prefix_caching: false
      engine_output_type: latent
      engine_input_source: [0]
      custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker
    default_sampling_params:
      temperature: 0.9
      top_p: 0.8
      top_k: 40
      max_tokens: 128
      seed: 42
      detokenize: true
      repetition_penalty: 1.05
      stop_token_ids: [8294]
  - stage_id: 2
    runtime:
      process: true
      devices: "0"  # Example: use a different GPU than the previous stage; use "0" if single GPU
      max_batch_size: 1
    engine_args:
      model_stage: code2wav
      model_arch: Qwen2_5OmniForConditionalGeneration
      worker_cls: vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker
      scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
      gpu_memory_utilization: 0.15
      enforce_eager: true
      trust_remote_code: true
      enable_prefix_caching: false
      engine_output_type: audio
      max_num_batched_tokens: 4069
      engine_input_source: [1]
      final_output: true
      final_output_type: audio
    default_sampling_params:
      temperature: 0.0
      top_p: 1.0
      top_k: -1
      max_tokens: 128
      seed: 42
      detokenize: true
      repetition_penalty: 1.1

# Top-level runtime config (concise): default windows and stage edges
runtime:
  enabled: true
  defaults:
    window_size: -1  # Simplified: trigger downstream only after full upstream completion
    max_inflight: 1  # Simplified: process serially within each stage
  edges:
    - from: 0        # thinker → talker: trigger only after receiving full input (-1)
      to: 1
      window_size: -1
    - from: 1        # talker → code2wav: trigger only after receiving full input (-1)
      to: 2
      window_size: -1
```
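The test file added by this PR (tests/examples/online_serving/test_qwen2_5_omni.py) exercises this config end to end. The sketch below is not that test, just a minimal illustration of how an online-serving e2e test could drive a server running the three-stage pipeline above. It assumes the server is already up and exposes an OpenAI-compatible API; the base URL, served model name, voice name, and the audio response shape are assumptions made for illustration, not vllm-omni's confirmed API.

```python
# Minimal sketch, assuming an OpenAI-compatible server is already running with
# the stage config above. Endpoint, model, and voice names are hypothetical.
import base64

import pytest
from openai import OpenAI

BASE_URL = "http://localhost:8000/v1"  # assumed server address
MODEL = "Qwen/Qwen2.5-Omni-7B"         # assumed served model name


@pytest.fixture(scope="module")
def client() -> OpenAI:
    # Local test servers typically ignore the API key, so any string works.
    return OpenAI(base_url=BASE_URL, api_key="EMPTY")


def test_text_output(client: OpenAI) -> None:
    # Stage 0 (thinker) is configured with final_output_type: text, so a plain
    # chat completion should come back as non-empty text.
    resp = client.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": "Say hello in one short sentence."}],
        max_tokens=128,
        temperature=0.0,
        seed=42,  # mirrors the deterministic sampling params in the config
    )
    text = resp.choices[0].message.content
    assert text is not None and text.strip()


def test_audio_output(client: OpenAI) -> None:
    # Stage 2 (code2wav) is configured with final_output_type: audio. Whether
    # the server returns audio through the OpenAI-style "audio" response field
    # is an assumption made for this sketch.
    resp = client.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": "Say hello."}],
        max_tokens=128,
        modalities=["text", "audio"],
        audio={"voice": "Chelsie", "format": "wav"},  # hypothetical voice/format
    )
    audio = getattr(resp.choices[0].message, "audio", None)
    assert audio is not None
    wav_bytes = base64.b64decode(audio.data)  # SDK returns base64-encoded bytes
    assert len(wav_bytes) > 0
```

In CI, the real test would normally start and stop the server itself (a RemoteOpenAIServer-style fixture, as in vLLM's own test suite) rather than assume one is already running; that plumbing is deliberately left out of the sketch.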