@@ -1499,6 +1499,80 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
14991499 )
15001500
15011501
1502+ # Qwen3-VL-Dense
def run_qwen3_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for the dense Qwen3-VL example.

    Args:
        questions: Questions to ask; one prompt is produced per question.
        modality: ``"image"`` or ``"video"``. Selects the vision placeholder
            token and is the key used for ``limit_mm_per_prompt``.

    Returns:
        A ``ModelRequestData`` holding the ``EngineArgs`` and the prompts.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    # Validate up front: previously an unsupported modality left
    # ``placeholder`` unbound and surfaced as an UnboundLocalError while the
    # prompts were being built.
    if modality == "image":
        placeholder = "<|image_pad|>"
    elif modality == "video":
        placeholder = "<|video_pad|>"
    else:
        raise ValueError(
            f"Unsupported modality for Qwen3-VL: {modality!r} "
            "(expected 'image' or 'video')"
        )

    model_name = "Qwen/Qwen3-VL-4B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
            "fps": 1,
        },
        limit_mm_per_prompt={modality: 1},
    )

    # ChatML-style Qwen prompt. The segments are concatenated with no extra
    # whitespace after the "\n" separators or around the vision tokens.
    prompts = [
        (
            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
            f"{question}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
1537+
1538+
1539+ # Qwen3-VL-MOE
def run_qwen3_vl_moe(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for the Qwen3-VL MoE example.

    Args:
        questions: Questions to ask; one prompt is produced per question.
        modality: ``"image"`` or ``"video"``. Selects the vision placeholder
            token and is the key used for ``limit_mm_per_prompt``.

    Returns:
        A ``ModelRequestData`` holding the ``EngineArgs`` and the prompts.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    # Validate up front: previously an unsupported modality left
    # ``placeholder`` unbound and surfaced as an UnboundLocalError while the
    # prompts were being built.
    if modality == "image":
        placeholder = "<|image_pad|>"
    elif modality == "video":
        placeholder = "<|video_pad|>"
    else:
        raise ValueError(
            f"Unsupported modality for Qwen3-VL-MoE: {modality!r} "
            "(expected 'image' or 'video')"
        )

    model_name = "Qwen/Qwen3-VL-30B-A3B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
            "fps": 1,
        },
        limit_mm_per_prompt={modality: 1},
    )

    # ChatML-style Qwen prompt. The segments are concatenated with no extra
    # whitespace after the "\n" separators or around the vision tokens.
    prompts = [
        (
            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
            f"{question}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
1574+
1575+
15021576# R-4B
15031577def run_r_vl (questions : list [str ], modality : str ) -> ModelRequestData :
15041578 assert modality == "image"
@@ -1709,6 +1783,8 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
17091783 "qwen2_vl" : run_qwen2_vl ,
17101784 "qwen2_5_vl" : run_qwen2_5_vl ,
17111785 "qwen2_5_omni" : run_qwen2_5_omni ,
1786+ "qwen3_vl" : run_qwen3_vl ,
1787+ "qwen3_vl_moe" : run_qwen3_vl_moe ,
17121788 "rvl" : run_r_vl ,
17131789 "skywork_chat" : run_skyworkr1v ,
17141790 "smolvlm" : run_smolvlm ,
@@ -1718,6 +1794,15 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
17181794}
17191795
17201796
# Model-name keys (in the same form as the "qwen3_vl" / "qwen3_vl_moe"
# registry entries in this file) that are flagged as needing video metadata.
# NOTE(review): presumably consulted when video inputs are prepared so that
# metadata is passed along with the frames -- confirm against the call site.
MODELS_NEED_VIDEO_METADATA: list[str] = [
    "glm4_1v",
    "glm4_5v",
    "glm4_5v_fp8",
    "qwen3_vl",
    "qwen3_vl_moe",
]
1804+
1805+
17211806def get_multi_modal_input (args ):
17221807 """
17231808 return {
0 commit comments