35 changes: 35 additions & 0 deletions .buildkite/pipeline.yml
@@ -268,3 +268,38 @@ steps:
hostPath:
path: /mnt/hf-cache
type: DirectoryOrCreate

- label: "Bagel Text2Img Model Test with H100"
timeout_in_minutes: 30
depends_on: image-build
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py
agents:
queue: "mithril-h100-pool"
plugins:
- kubernetes:
podSpec:
containers:
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
resources:
limits:
nvidia.com/gpu: 1
volumeMounts:
- name: devshm
mountPath: /dev/shm
- name: hf-cache
mountPath: /root/.cache/huggingface
env:
- name: HF_HOME
value: /root/.cache/huggingface
nodeSelector:
node.kubernetes.io/instance-type: gpu-h100-sxm
volumes:
- name: devshm
emptyDir:
medium: Memory
- name: hf-cache
hostPath:
path: /mnt/hf-cache
type: DirectoryOrCreate
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -59,7 +59,8 @@ dev = [
"psutil>=7.2.0",
"soundfile>=0.13.1",
"imageio[ffmpeg]>=0.6.0",
"opencv-python>=4.12.0.88"
"opencv-python>=4.12.0.88",
"mooncake-transfer-engine==0.3.8.post1"
]

docs = [
292 changes: 292 additions & 0 deletions tests/e2e/offline_inference/test_bagel_text2img.py
@@ -0,0 +1,292 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

"""
End-to-end tests for Bagel text2img generation.

These tests validate that the Bagel model generates images whose sampled
pixels match expected reference values within a ±5 per-channel tolerance,
exercising both the shared-memory and Mooncake inter-stage connectors.

Equivalent to running:
python3 examples/offline_inference/bagel/end2end.py \
--prompts "A futuristic city skyline at twilight, cyberpunk style" \
--modality text2img --step 15
"""

import os
import signal
import socket
import subprocess
import tempfile
import time

import yaml

from vllm_omni.entrypoints.omni import Omni

# Reference pixel data extracted from the known-good output image
# Each entry contains (x, y) position and expected (R, G, B) values
REFERENCE_PIXELS = [
{"position": (100, 100), "rgb": (68, 107, 134)},
{"position": (400, 50), "rgb": (95, 139, 166)},
{"position": (700, 100), "rgb": (99, 122, 151)},
{"position": (150, 400), "rgb": (111, 125, 153)},
{"position": (512, 512), "rgb": (97, 107, 131)},
{"position": (700, 400), "rgb": (48, 64, 98)},
{"position": (100, 700), "rgb": (79, 63, 84)},
{"position": (400, 700), "rgb": (40, 58, 79)},
{"position": (700, 700), "rgb": (60, 75, 103)},
{"position": (256, 256), "rgb": (97, 128, 156)},
]

# Maximum allowed difference per color channel
PIXEL_TOLERANCE = 5
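
# For illustration: a sampled pixel passes when every channel is within
# PIXEL_TOLERANCE of its reference. E.g. an actual value of (70, 105, 137)
# against expected (68, 107, 134) gives per-channel deltas (2, 2, 3),
# all <= 5, so it matches.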


def test_bagel_text2img_shared_memory_connector():
"""Test Bagel text2img with shared memory connector."""

omni = Omni(model="ByteDance-Seed/BAGEL-7B-MoT")

try:
params_list = omni.default_sampling_params_list
params_list[0].max_tokens = 1 # type: ignore
if len(params_list) > 1:
params_list[1].num_inference_steps = 15 # type: ignore

omni_outputs = list(
omni.generate(
prompts=[
{
"prompt": "<|im_start|>A futuristic city skyline at twilight, cyberpunk style<|im_end|>",
"modalities": ["image"],
}
],
sampling_params_list=params_list,
)
)

# Extract generated image
generated_image = None
for req_output in omni_outputs:
if images := getattr(req_output, "images", None):
generated_image = images[0]
break
if hasattr(req_output, "request_output") and req_output.request_output:
for stage_out in req_output.request_output:
if hasattr(stage_out, "images") and stage_out.images:
generated_image = stage_out.images[0]
break
if generated_image:
break

assert generated_image is not None, "No images generated"
assert generated_image.size == (1024, 1024), f"Expected 1024x1024, got {generated_image.size}"

# Validate pixels
for ref in REFERENCE_PIXELS:
x, y = ref["position"]
expected = ref["rgb"]
actual = generated_image.getpixel((x, y))[:3]
            assert all(abs(a - e) <= PIXEL_TOLERANCE for a, e in zip(actual, expected)), f"Pixel mismatch at ({x}, {y}): got {actual}, expected {expected}"

finally:
omni.close()


def _find_free_port() -> int:
"""Find and return a free ephemeral port by binding to port 0."""
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(("127.0.0.1", 0))
s.listen(1)
port = s.getsockname()[1]
return port
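
# Note: the OS may hand a port returned by _find_free_port() to another
# process before mooncake_master binds it; the race window is small and
# acceptable for a test environment.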


def test_bagel_text2img_mooncake_connector():
"""Test Bagel text2img with Mooncake connector for inter-stage communication."""

MOONCAKE_HOST = "127.0.0.1"
# Use dynamically allocated ephemeral ports to avoid conflicts
MOONCAKE_RPC_PORT = _find_free_port()
MOONCAKE_HTTP_PORT = _find_free_port()
MOONCAKE_METRICS_PORT = _find_free_port()

def wait_for_port(host: str, port: int, timeout: int = 30) -> bool:
"""Wait for a port to become available."""
for _ in range(timeout):
try:
with socket.create_connection((host, port), timeout=1):
return True
except OSError:
time.sleep(1)
return False

# Stage configuration with Mooncake connector
mooncake_config = {
"stage_args": [
{
"stage_id": 0,
"stage_type": "llm",
"runtime": {"devices": "0", "max_batch_size": 1},
"engine_args": {
"model_stage": "thinker",
"model_arch": "BagelForConditionalGeneration",
"worker_type": "ar",
"scheduler_cls": "vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler",
"gpu_memory_utilization": 0.35,
"enforce_eager": True,
"trust_remote_code": True,
"engine_output_type": "text",
"distributed_executor_backend": "mp",
"enable_prefix_caching": False,
"max_num_batched_tokens": 32768,
"tensor_parallel_size": 1,
"omni_kv_config": {"need_send_cache": True, "kv_transfer_criteria": {"type": "prefill_finished"}},
},
"final_output": True,
"final_output_type": "text",
"is_comprehension": True,
"default_sampling_params": {
"temperature": 0.4,
"top_p": 0.9,
"top_k": 1,
"max_tokens": 2048,
"seed": 52,
"detokenize": True,
"repetition_penalty": 1.05,
},
"output_connectors": {"to_stage_1": "mooncake_connector"},
},
{
"stage_id": 1,
"stage_type": "diffusion",
"runtime": {"devices": "0", "max_batch_size": 1},
"engine_args": {
"model_stage": "dit",
"gpu_memory_utilization": 0.55,
"enforce_eager": True,
"trust_remote_code": True,
"engine_output_type": "image",
"distributed_executor_backend": "mp",
"enable_prefix_caching": False,
"max_num_batched_tokens": 32768,
"tensor_parallel_size": 1,
"omni_kv_config": {"need_recv_cache": True},
},
"engine_input_source": [0],
"final_output": True,
"final_output_type": "image",
"is_comprehension": False,
"default_sampling_params": {"seed": 52},
"input_connectors": {"from_stage_0": "mooncake_connector"},
},
],
"runtime": {
"enabled": True,
"defaults": {"window_size": -1, "max_inflight": 1},
"connectors": {
"mooncake_connector": {
"name": "MooncakeConnector",
"extra": {
"host": MOONCAKE_HOST,
"metadata_server": f"http://{MOONCAKE_HOST}:{MOONCAKE_HTTP_PORT}/metadata",
"master": f"{MOONCAKE_HOST}:{MOONCAKE_RPC_PORT}",
"segment": 512000000,
"localbuf": 64000000,
"proto": "tcp",
},
},
},
"edges": [{"from": 0, "to": 1, "window_size": -1}],
},
}
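
    # The config above wires stage 0 (the AR "thinker") to stage 1 (the
    # diffusion "dit") over Mooncake: stage 0 sends its KV cache once prefill
    # finishes (need_send_cache + prefill_finished) and stage 1 receives it
    # (need_recv_cache). Both stages share device "0", so their GPU memory
    # fractions (0.35 + 0.55) are sized to fit together on one card.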

mooncake_master_proc = None
temp_config_file = None
omni = None

try:
# Clean up existing mooncake_master processes
subprocess.run(["pkill", "-9", "-f", "mooncake_master"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
time.sleep(1)

# Start mooncake_master
mooncake_master_proc = subprocess.Popen(
[
"mooncake_master",
f"--rpc_port={MOONCAKE_RPC_PORT}",
"--enable_http_metadata_server=true",
"--http_metadata_server_host=0.0.0.0",
f"--http_metadata_server_port={MOONCAKE_HTTP_PORT}",
f"--metrics_port={MOONCAKE_METRICS_PORT}",
],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
preexec_fn=os.setsid,
)
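
        # The HTTP metadata server started here serves the connector's
        # "metadata_server" URL in mooncake_config, and the RPC port is the
        # "master" endpoint that the stages register with.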

assert wait_for_port(MOONCAKE_HOST, MOONCAKE_RPC_PORT), "mooncake_master failed to start"

# Create temp config and initialize Omni
temp_config_file = tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False)
yaml.dump(mooncake_config, temp_config_file)
temp_config_file.close()

omni = Omni(model="ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=temp_config_file.name)

# Generate image
params_list = omni.default_sampling_params_list
params_list[0].max_tokens = 1 # type: ignore
if len(params_list) > 1:
params_list[1].num_inference_steps = 15 # type: ignore

omni_outputs = list(
omni.generate(
prompts=[
{
"prompt": "<|im_start|>A futuristic city skyline at twilight, cyberpunk style<|im_end|>",
"modalities": ["image"],
}
],
sampling_params_list=params_list,
)
)

# Extract generated image
generated_image = None
for req_output in omni_outputs:
if images := getattr(req_output, "images", None):
generated_image = images[0]
break
if hasattr(req_output, "request_output") and req_output.request_output:
for stage_out in req_output.request_output:
if hasattr(stage_out, "images") and stage_out.images:
generated_image = stage_out.images[0]
break
if generated_image:
break

assert generated_image is not None, "No images generated"
assert generated_image.size == (1024, 1024), f"Expected 1024x1024, got {generated_image.size}"

# Validate pixels
for ref in REFERENCE_PIXELS:
x, y = ref["position"]
expected = ref["rgb"]
actual = generated_image.getpixel((x, y))[:3]
            assert all(abs(a - e) <= PIXEL_TOLERANCE for a, e in zip(actual, expected)), f"Pixel mismatch at ({x}, {y}): got {actual}, expected {expected}"

finally:
if omni:
omni.close()
if temp_config_file:
try:
os.unlink(temp_config_file.name)
except OSError:
pass
if mooncake_master_proc:
try:
os.killpg(os.getpgid(mooncake_master_proc.pid), signal.SIGKILL)
except OSError:
pass
4 changes: 2 additions & 2 deletions vllm_omni/model_executor/stage_configs/bagel.yaml
@@ -11,7 +11,7 @@ stage_args:
model_arch: BagelForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.4
+      gpu_memory_utilization: 0.35
enforce_eager: true
trust_remote_code: true
engine_output_type: text
@@ -42,7 +42,7 @@ stage_args:
max_batch_size: 1
engine_args:
model_stage: dit
-      gpu_memory_utilization: 0.4
+      gpu_memory_utilization: 0.55
enforce_eager: true
trust_remote_code: true
engine_output_type: image
@@ -11,7 +11,7 @@ stage_args:
model_arch: BagelForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.4
+      gpu_memory_utilization: 0.35
enforce_eager: true
trust_remote_code: true
engine_output_type: text
@@ -41,11 +41,11 @@ stage_args:
- stage_id: 1
stage_type: diffusion
runtime:
devices: "1"
devices: "0"
max_batch_size: 1
engine_args:
model_stage: dit
-      gpu_memory_utilization: 0.4
+      gpu_memory_utilization: 0.55
enforce_eager: true
trust_remote_code: true
engine_output_type: image