Commit 29f17c0

ywang96, JJJYmmm, wulipc, and Isotr0py authored and committed
[Model] Support Qwen3-VL Model Series (vllm-project#24727)
Signed-off-by: Roger Wang <[email protected]>
Signed-off-by: Isotr0py <[email protected]>
Co-authored-by: Huang Jie <[email protected]>
Co-authored-by: 松灵 <[email protected]>
Co-authored-by: Isotr0py <[email protected]>
1 parent 2f28b63 commit 29f17c0

File tree

14 files changed: +2114 -15 lines changed


docs/models/supported_models.md

Lines changed: 2 additions & 0 deletions
@@ -667,6 +667,8 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup> | `Qwen/Qwen2.5-Omni-3B`, `Qwen/Qwen2.5-Omni-7B` | ✅︎ | ✅︎ | ✅︎ |
+| `Qwen3VLForConditionalGeneration` | Qwen3-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-4B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `Qwen3VLMoeForConditionalGeneration` | Qwen3-VL-MOE | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-30B-A3B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `RForConditionalGeneration` | R-VL-4B | T + I<sup>E+</sup> | `YannQi/R-4B` | | ✅︎ | ✅︎ |
 | `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ |
 | `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ |
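With these table entries in place, the new checkpoints load through vLLM's standard entry points. A minimal sketch of offline use, assuming the dense 4B checkpoint (the image URL is a placeholder; `llm.chat` applies the model's chat template, so the `<|vision_start|>`/`<|image_pad|>` tokens from the example script below are inserted automatically):

```python
from vllm import LLM, SamplingParams

# Minimal sketch: load the newly supported dense Qwen3-VL checkpoint.
llm = LLM(
    model="Qwen/Qwen3-VL-4B-Instruct",
    max_model_len=4096,
    limit_mm_per_prompt={"image": 1},
)

# OpenAI-style multimodal message; the URL below is a placeholder.
messages = [{
    "role": "user",
    "content": [
        {"type": "image_url", "image_url": {"url": "https://example.com/demo.jpg"}},
        {"type": "text", "text": "Describe this image."},
    ],
}]

outputs = llm.chat(messages, SamplingParams(max_tokens=64))
print(outputs[0].outputs[0].text)
```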

examples/offline_inference/vision_language.py

Lines changed: 85 additions & 0 deletions
@@ -1499,6 +1499,80 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
     )
 
 
+# Qwen3-VL-Dense
+def run_qwen3_vl(questions: list[str], modality: str) -> ModelRequestData:
+    model_name = "Qwen/Qwen3-VL-4B-Instruct"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=5,
+        mm_processor_kwargs={
+            "min_pixels": 28 * 28,
+            "max_pixels": 1280 * 28 * 28,
+            "fps": 1,
+        },
+        limit_mm_per_prompt={modality: 1},
+    )
+
+    if modality == "image":
+        placeholder = "<|image_pad|>"
+    elif modality == "video":
+        placeholder = "<|video_pad|>"
+
+    prompts = [
+        (
+            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
+            f"{question}<|im_end|>\n"
+            "<|im_start|>assistant\n"
+        )
+        for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
+# Qwen3-VL-MOE
+def run_qwen3_vl_moe(questions: list[str], modality: str) -> ModelRequestData:
+    model_name = "Qwen/Qwen3-VL-30B-A3B-Instruct"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=5,
+        mm_processor_kwargs={
+            "min_pixels": 28 * 28,
+            "max_pixels": 1280 * 28 * 28,
+            "fps": 1,
+        },
+        limit_mm_per_prompt={modality: 1},
+    )
+
+    if modality == "image":
+        placeholder = "<|image_pad|>"
+    elif modality == "video":
+        placeholder = "<|video_pad|>"
+
+    prompts = [
+        (
+            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
+            f"{question}<|im_end|>\n"
+            "<|im_start|>assistant\n"
+        )
+        for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # R-4B
 def run_r_vl(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"

@@ -1709,6 +1783,8 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
     "qwen2_vl": run_qwen2_vl,
     "qwen2_5_vl": run_qwen2_5_vl,
     "qwen2_5_omni": run_qwen2_5_omni,
+    "qwen3_vl": run_qwen3_vl,
+    "qwen3_vl_moe": run_qwen3_vl_moe,
     "rvl": run_r_vl,
     "skywork_chat": run_skyworkr1v,
     "smolvlm": run_smolvlm,

@@ -1718,6 +1794,15 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
 }
 
 
+MODELS_NEED_VIDEO_METADATA = [
+    "glm4_1v",
+    "glm4_5v",
+    "glm4_5v_fp8",
+    "qwen3_vl",
+    "qwen3_vl_moe",
+]
+
+
 def get_multi_modal_input(args):
     """
     return {
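For the model types listed in `MODELS_NEED_VIDEO_METADATA`, the example script passes each video as a `(frames, metadata)` tuple rather than a bare frame array. A minimal sketch of what that wrapping looks like, with the metadata keys mirroring the `qwen3_vl_patch_mm_data` test helper added below (`build_video_metadata` is a hypothetical name, not part of this commit):

```python
import numpy as np

def build_video_metadata(frames: np.ndarray, fps: float = 2.0) -> dict:
    """Hypothetical helper: build the metadata dict expected by
    metadata-aware models such as Qwen3-VL and GLM-4.1V."""
    num_frames = len(frames)
    return {
        "total_num_frames": num_frames,
        "fps": fps,
        "duration": num_frames / fps,
        "video_backend": "opencv",
        "frames_indices": list(range(num_frames)),
        "do_sample_frames": True,
    }

# 16 dummy RGB frames; metadata-aware models receive the video as a
# (frames, metadata) tuple instead of a bare ndarray.
frames = np.zeros((16, 224, 224, 3), dtype=np.uint8)
video_input = (frames, build_video_metadata(frames))
```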

tests/models/multimodal/processing/test_common.py

Lines changed: 34 additions & 1 deletion
@@ -31,6 +31,7 @@ def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
     """
     # Ensure video metadata is included
     if "video" in mm_data:
+        # GLM4.1V doesn't support multiple videos
         video = mm_data["video"]
         mm_data["video"] = (video, {
             "total_num_frames": len(video),

@@ -41,6 +42,34 @@ def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
     return mm_data
 
 
+def qwen3_vl_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
+    """
+    Patch the multimodal data for Qwen3-VL model.
+    """
+
+    def create_metadata(frames: np.ndarray):
+        num_frames = len(frames)
+        return {
+            "total_num_frames": num_frames,
+            "fps": 2.0,
+            "duration": num_frames / 2.0,
+            "video_backend": "opencv",
+            "frames_indices": list(range(num_frames)),
+            "do_sample_frames": True,
+        }
+
+    # Ensure video metadata is included
+    if "video" in mm_data:
+        video = mm_data["video"]
+        if isinstance(video, list):
+            # multiple videos
+            mm_data["video"] = [(vid, create_metadata(vid)) for vid in video]
+        else:
+            # single video
+            mm_data["video"] = (video, create_metadata(video))
+    return mm_data
+
+
 def _test_processing_correctness(
     model_id_or_arch: str,
     hit_rate: float,

@@ -181,8 +210,10 @@ def _test_processing_correctness(
 }
 
 MM_DATA_PATCHES = {
-    # GLM4.1V requires video metadata to be included in the input
+    # GLM4.1V and Qwen3-VL require video metadata to be included in the input
     "glm4v": glm4_1v_patch_mm_data,
+    "qwen3_vl": qwen3_vl_patch_mm_data,
+    "qwen3_vl_moe": qwen3_vl_patch_mm_data,
 }

@@ -328,6 +359,8 @@ def _test_processing_correctness_one(
     "Qwen/Qwen2.5-VL-3B-Instruct",
     "Qwen/Qwen2-Audio-7B-Instruct",
     "Qwen/Qwen2.5-Omni-3B",
+    "Qwen/Qwen3-VL-4B-Instruct",
+    "Qwen/Qwen3-VL-30B-A3B-Instruct",
     "YannQi/R-4B",
     "Skywork/Skywork-R1V-38B",
     "HuggingFaceTB/SmolVLM2-2.2B-Instruct",

tests/models/registry.py

Lines changed: 6 additions & 0 deletions
@@ -559,6 +559,12 @@ def check_available_online(
                                         max_model_len=4096),
     "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B"),
     "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ"),  # noqa: E501
+    "Qwen3VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen3-VL-4B-Instruct",  # noqa: E501
+                                                       max_model_len=4096,
+                                                       min_transformers_version="4.57"),  # noqa: E501
+    "Qwen3VLMoeForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen3-VL-30B-A3B-Instruct",  # noqa: E501
+                                                          max_model_len=4096,
+                                                          min_transformers_version="4.57"),
     "RForConditionalGeneration": _HfExamplesInfo("YannQi/R-4B",
                                                  trust_remote_code=True),
     "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B",

vllm/attention/layer.py

Lines changed: 9 additions & 0 deletions
@@ -30,6 +30,15 @@
 USE_XFORMERS_OPS = None
 
 
+def check_upstream_fa_availability(dtype: torch.dtype):
+    if dtype in (torch.float16, torch.bfloat16) and current_platform.is_cuda(
+    ) and current_platform.has_device_capability(80):
+        from transformers.utils import is_flash_attn_2_available
+        return is_flash_attn_2_available()
+    return False
+
+
 def check_xformers_availability():
     global USE_XFORMERS_OPS
     if USE_XFORMERS_OPS is not None:
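The new helper returns True only for fp16/bf16 on CUDA devices of compute capability 8.0+, deferring the actual probe to transformers' `is_flash_attn_2_available`. A minimal sketch of how a caller might gate on it (the backend-selection logic here is illustrative, not this commit's actual dispatch code):

```python
import torch

from vllm.attention.layer import check_upstream_fa_availability

# Prefer upstream FlashAttention for fp16/bf16 on Ampere+ CUDA GPUs;
# otherwise fall back to a universally available attention path.
if check_upstream_fa_availability(torch.bfloat16):
    attn_backend = "FLASH_ATTN"  # illustrative label, not a vLLM constant
else:
    attn_backend = "TORCH_SDPA"
print(f"Selected ViT attention backend: {attn_backend}")
```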

vllm/model_executor/layers/rotary_embedding/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -103,6 +103,8 @@ def get_rope(
             is_neox_style,
             dtype,
             mrope_section=rope_scaling["mrope_section"],
+            mrope_interleaved=rope_scaling.get("mrope_interleaved",
+                                               False),
         )
     else:
         rotary_emb = RotaryEmbedding(
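With this change, a checkpoint's `rope_scaling` config can opt into interleaved M-RoPE. A minimal sketch of the kind of dict that now flows through `get_rope` (the section sizes are illustrative, not taken from a real Qwen3-VL config, and the surrounding keys may differ per checkpoint):

```python
# Illustrative rope_scaling dict; this commit reads the optional
# "mrope_interleaved" flag and forwards it to the M-RoPE embedding.
# It defaults to False when absent, preserving existing behavior.
rope_scaling = {
    "mrope_section": [16, 24, 24],  # temporal/height/width split (illustrative)
    "mrope_interleaved": True,
}
```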
