Skip to content
24 changes: 23 additions & 1 deletion .buildkite/test-nightly.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,13 @@ steps:
queue: "cpu_queue_premerge"

- label: "Omni Model Test with H100"
timeout_in_minutes: 180
timeout_in_minutes: 60
depends_on: image-build
if: build.env("NIGHTLY") == "1"
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/online_serving/test_qwen3_omni_expansion.py
- pytest -s -v tests/examples/online_serving/test_qwen3_omni.py
agents:
queue: "mithril-h100-pool"
plugins:
Expand Down Expand Up @@ -44,3 +45,24 @@ steps:
hostPath:
path: /mnt/hf-cache
type: DirectoryOrCreate



# Nightly e2e step: runs the Qwen2.5-Omni online-serving example test inside
# the CI image on a 4x L4 machine. Behavior identical to the original step;
# indentation reconstructed to standard Buildkite pipeline layout.
- label: "Omni Model Test"
  timeout_in_minutes: 60
  depends_on: image-build
  commands:
    # Verbose engine logs so CI failures are diagnosable from the build log.
    - export VLLM_LOGGING_LEVEL=DEBUG
    # spawn (not fork) is required for CUDA-safe multiprocess workers.
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - pytest -s -v tests/examples/online_serving/test_qwen2_5_omni.py
  agents:
    queue: "gpu_4_queue"  # g6.12xlarge instance on AWS, has 4 L4 GPU
  plugins:
    - docker#v5.2.0:
        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
        always-pull: true
        propagate-environment: true
        environment:
          - "HF_HOME=/fsx/hf_cache"
        volumes:
          # Shared HF model cache on FSx so weights aren't re-downloaded per run.
          - "/fsx/hf_cache:/fsx/hf_cache"
33 changes: 18 additions & 15 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,22 +588,31 @@ def convert_audio_to_text(audio_data):
"""
Convert base64 encoded audio data to text using speech recognition.
"""
import whisper

audio_data = base64.b64decode(audio_data)
output_path = f"./test_{int(time.time())}"
with open(output_path, "wb") as audio_file:
audio_file.write(audio_data)

print(f"audio data is saved: {output_path}")

text = convert_audio_file_to_text(output_path=output_path)
return text


def convert_audio_file_to_text(output_path):
import whisper

model = whisper.load_model("base")
text = model.transcribe(
output_path,
temperature=0.0,
word_timestamps=True,
condition_on_previous_text=False,
)["text"]
del model
if torch.cuda.is_available():
gc.collect()
torch.cuda.empty_cache()
if text:
return text
else:
Expand All @@ -614,7 +623,6 @@ def merge_base64_and_convert_to_text(base64_list):
"""
Merge a list of base64 encoded audio data and convert to text.
"""
import whisper
from pydub import AudioSegment

merged_audio = None
Expand All @@ -627,17 +635,8 @@ def merge_base64_and_convert_to_text(base64_list):
merged_audio += seg
output_path = f"./test_{int(time.time())}"
merged_audio.export(output_path, format="wav")
model = whisper.load_model("base")
text = model.transcribe(
output_path,
temperature=0.0,
word_timestamps=True,
condition_on_previous_text=False,
)["text"]
if text:
return text
else:
return ""
text = convert_audio_file_to_text(output_path)
return text


def modify_stage_config(
Expand Down Expand Up @@ -886,6 +885,7 @@ def __init__(
model: str,
serve_args: list[str],
*,
port: int | None = None,
env_dict: dict[str, str] | None = None,
) -> None:
_run_pre_test_cleanup(enable_force=True)
Expand All @@ -896,7 +896,10 @@ def __init__(
self.env_dict = env_dict
self.proc: subprocess.Popen | None = None
self.host = "127.0.0.1"
self.port = get_open_port()
if port is None:
self.port = get_open_port()
else:
self.port = port

def _start_server(self) -> None:
"""Start the vLLM-Omni server subprocess."""
Expand Down
106 changes: 106 additions & 0 deletions tests/e2e/stage_configs/qwen2_5_omni_ci.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# Stage config for running qwen2.5-omni with the OmniLLM architecture.
#
# The following config has been verified on 2x 24GB GPU (L4/RTX3090/RTX4090).
# This config is optimized for CI e2e tests.
#
# Pipeline: stage 0 (thinker, text out) -> stage 1 (talker, latents)
#           -> stage 2 (code2wav, audio out).
stage_args:
  # Stage 0: thinker — produces the final text output and latents for stage 1.
  - stage_id: 0
    runtime:
      process: true   # Run this stage in a separate process
      devices: "0"    # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
      max_batch_size: 1
    engine_args:
      model_stage: thinker
      model_arch: Qwen2_5OmniForConditionalGeneration
      worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
      max_model_len: 32768
      max_num_batched_tokens: 32768
      max_num_seqs: 1
      gpu_memory_utilization: 0.8
      skip_mm_profiling: true
      enforce_eager: true   # Now we only support eager mode
      trust_remote_code: true
      engine_output_type: latent
      enable_prefix_caching: false
      is_comprehension: true
      final_output: true
      final_output_type: text
      default_sampling_params:
        # Deterministic decoding for CI reproducibility.
        temperature: 0.0
        top_p: 1.0
        top_k: -1
        max_tokens: 128
        seed: 42
        detokenize: true
        repetition_penalty: 1.1
  # Stage 1: talker — consumes stage-0 latents, emits latents for code2wav.
  - stage_id: 1
    runtime:
      process: true
      devices: "1"
      max_batch_size: 1
    engine_args:
      model_stage: talker
      model_arch: Qwen2_5OmniForConditionalGeneration
      worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
      max_model_len: 32768
      max_num_batched_tokens: 32768
      max_num_seqs: 1
      gpu_memory_utilization: 0.8
      skip_mm_profiling: true
      enforce_eager: true
      trust_remote_code: true
      enable_prefix_caching: false
      engine_output_type: latent
      engine_input_source: [0]  # Takes its input from stage 0 (thinker)
      custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker
      default_sampling_params:
        # Sampled decoding for speech token generation; seeded for CI stability.
        temperature: 0.9
        top_p: 0.8
        top_k: 40
        max_tokens: 128
        seed: 42
        detokenize: true
        repetition_penalty: 1.05
        stop_token_ids: [8294]
  # Stage 2: code2wav — converts stage-1 latents into the final audio output.
  - stage_id: 2
    runtime:
      process: true
      devices: "0"  # Example: use a different GPU than the previous stage; use "0" if single GPU
      max_batch_size: 1
    engine_args:
      model_stage: code2wav
      model_arch: Qwen2_5OmniForConditionalGeneration
      worker_cls: vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker
      scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
      # Shares GPU 0 with stage 0, hence the small memory fraction.
      gpu_memory_utilization: 0.15
      enforce_eager: true
      trust_remote_code: true
      enable_prefix_caching: false
      engine_output_type: audio
      # NOTE(review): 4069 looks like a typo for 4096 — confirm intended value.
      max_num_batched_tokens: 4069
      engine_input_source: [1]  # Takes its input from stage 1 (talker)
      final_output: true
      final_output_type: audio
      default_sampling_params:
        temperature: 0.0
        top_p: 1.0
        top_k: -1
        max_tokens: 128
        seed: 42
        detokenize: true
        repetition_penalty: 1.1

# Top-level runtime config (concise): default windows and stage edges
runtime:
  enabled: true
  defaults:
    window_size: -1  # Simplified: trigger downstream only after full upstream completion
    max_inflight: 1  # Simplified: process serially within each stage
  edges:
    - from: 0  # thinker → talker: trigger only after receiving full input (-1)
      to: 1
      window_size: -1
    - from: 1  # talker → code2wav: trigger only after receiving full input (-1)
      to: 2
      window_size: -1
Loading