|
113 | 113 | dtype="bfloat16" if current_platform.is_cpu() else "auto", |
114 | 114 | marks=[pytest.mark.core_model, pytest.mark.cpu_model], |
115 | 115 | ), |
| 116 | + "paligemma": VLMTestInfo( |
| 117 | + models=["google/paligemma-3b-mix-224"], |
| 118 | + test_type=VLMTestType.IMAGE, |
| 119 | + prompt_formatter=identity, |
| 120 | + img_idx_to_prompt=lambda idx: "", |
| 121 | + # PaliGemma uses its own sample prompts because the default one fails |
| 122 | + single_image_prompts=IMAGE_ASSETS.prompts( |
| 123 | + { |
| 124 | + "stop_sign": "caption es", |
| 125 | + "cherry_blossom": "What is in the picture?", |
| 126 | + } |
| 127 | + ), |
| 128 | + auto_cls=AutoModelForImageTextToText, |
| 129 | + vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output, |
| 130 | + dtype="bfloat16", |
| 131 | + marks=[ |
| 132 | + pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask") |
| 133 | + ], |
| 134 | + ), |
116 | 135 | "qwen2_5_vl": VLMTestInfo( |
117 | 136 | models=["Qwen/Qwen2.5-VL-3B-Instruct"], |
118 | 137 | test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO), |
|
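Note: the per-entry `marks` list (e.g. the PaliGemma skip above) is what lets a single registry entry be collected but skipped or gated. A minimal sketch of how such marks can be forwarded to pytest parametrization is shown below; this is not vLLM's actual harness, and names such as `ModelCase` and `CASES` are hypothetical.

# Hypothetical sketch: attaching a per-entry `marks` list to generated test cases.
# Only the pytest APIs (pytest.param, pytest.mark.skip, parametrize) are real;
# ModelCase/CASES are illustrative stand-ins for VLMTestInfo and the registry.
from dataclasses import dataclass, field

import pytest


@dataclass
class ModelCase:
    model: str
    marks: list = field(default_factory=list)


CASES = {
    "qwen2_5_vl": ModelCase("Qwen/Qwen2.5-VL-3B-Instruct"),
    "paligemma": ModelCase(
        "google/paligemma-3b-mix-224",
        marks=[pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")],
    ),
}


@pytest.mark.parametrize(
    "model",
    [pytest.param(case.model, marks=case.marks, id=name) for name, case in CASES.items()],
)
def test_model_case(model: str):
    # The PaliGemma case is still collected, but reported as skipped.
    assert model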
177 | 196 | # Gemma3 has bidirectional mask on images |
178 | 197 | "gemma3-transformers": VLMTestInfo( |
179 | 198 | models=["google/gemma-3-4b-it"], |
180 | | - test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), |
181 | | - prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501 |
182 | | - single_image_prompts=IMAGE_ASSETS.prompts( |
183 | | - { |
184 | | - "stop_sign": "<start_of_image>What's the content in the center of the image?", # noqa: E501 |
185 | | - "cherry_blossom": "<start_of_image>What is the season?", |
186 | | - } |
187 | | - ), |
188 | | - multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501 |
189 | | - max_model_len=8192, |
| 199 | + test_type=VLMTestType.IMAGE, |
| 200 | + prompt_formatter=lambda vid_prompt: f"<bos><start_of_turn>user\n{vid_prompt}<start_of_image><end_of_turn>\n<start_of_turn>model\n", # noqa: E501 |
| 201 | + max_model_len=4096, |
190 | 202 | auto_cls=AutoModelForImageTextToText, |
191 | | - # TODO: Support `do_pan_and_scan` in transformers backend |
192 | | - # patch_hf_runner=model_utils.gemma3_patch_hf_runner, |
193 | 203 | vllm_output_post_proc=model_utils.gemma3_vllm_to_hf_output, |
194 | 204 | image_size_factors=[(0.25, 0.5, 1.0)], |
195 | 205 | vllm_runner_kwargs={ |
196 | 206 | "model_impl": "transformers", |
197 | | - # "mm_processor_kwargs": {"do_pan_and_scan": True}, |
198 | 207 | }, |
199 | 208 | marks=[pytest.mark.core_model], |
200 | 209 | ), |
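Note: the `-transformers` entries differ from the native ones mainly via `vllm_runner_kwargs={"model_impl": "transformers"}`. The sketch below shows what that roughly corresponds to when constructing an engine directly; the `model_impl` keyword is assumed from the kwargs in the diff and may vary by vLLM version.

# Hedged sketch: running the same checkpoint through the Transformers modeling
# backend, mirroring vllm_runner_kwargs={"model_impl": "transformers"} above.
from vllm import LLM, SamplingParams

llm = LLM(
    model="google/gemma-3-4b-it",
    model_impl="transformers",  # assumed keyword: use the HF Transformers implementation
    max_model_len=4096,
)
outputs = llm.generate(["Describe a stop sign."], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)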
|
213 | 222 | }, |
214 | 223 | marks=[pytest.mark.core_model], |
215 | 224 | ), |
216 | | - # PaliGemma has PrefixLM attention |
217 | | - "paligemma-transformers": VLMTestInfo( |
218 | | - models=["google/paligemma-3b-mix-224"], |
219 | | - test_type=VLMTestType.IMAGE, |
220 | | - prompt_formatter=identity, |
221 | | - img_idx_to_prompt=lambda idx: "", |
222 | | - # PaliGemma uses its own sample prompts because the default one fails |
223 | | - single_image_prompts=IMAGE_ASSETS.prompts( |
224 | | - { |
225 | | - "stop_sign": "caption es", |
226 | | - "cherry_blossom": "What is in the picture?", |
227 | | - } |
228 | | - ), |
229 | | - auto_cls=AutoModelForImageTextToText, |
230 | | - vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output, |
231 | | - image_size_factors=[(0.25, 0.5, 1.0)], |
232 | | - vllm_runner_kwargs={ |
233 | | - "model_impl": "transformers", |
234 | | - }, |
235 | | - marks=[pytest.mark.core_model], |
236 | | - ), |
237 | 225 | # Pixel values from processor are not 4D or 5D arrays |
238 | 226 | "qwen2_5_vl-transformers": VLMTestInfo( |
239 | 227 | models=["Qwen/Qwen2.5-VL-3B-Instruct"], |
|
360 | 348 | image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], |
361 | 349 | marks=[large_gpu_mark(min_gb=32)], |
362 | 350 | ), |
| 351 | + "gemma3": VLMTestInfo( |
| 352 | + models=["google/gemma-3-4b-it"], |
| 353 | + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), |
| 354 | + prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501 |
| 355 | + single_image_prompts=IMAGE_ASSETS.prompts( |
| 356 | + { |
| 357 | + "stop_sign": "<start_of_image>What's the content in the center of the image?", # noqa: E501 |
| 358 | + "cherry_blossom": "<start_of_image>What is the season?", |
| 359 | + } |
| 360 | + ), |
| 361 | + multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501 |
| 362 | + max_model_len=4096, |
| 363 | + max_num_seqs=2, |
| 364 | + auto_cls=AutoModelForImageTextToText, |
| 365 | + vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}}, |
| 366 | + patch_hf_runner=model_utils.gemma3_patch_hf_runner, |
| 367 | + num_logprobs=10, |
| 368 | + ), |
363 | 369 | "glm4v": VLMTestInfo( |
364 | 370 | models=["zai-org/glm-4v-9b"], |
365 | 371 | test_type=VLMTestType.IMAGE, |
|
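For reference, the `gemma3` `prompt_formatter` added above wraps a tagged image prompt in the Gemma 3 chat template. The illustration below copies the template string from the entry; everything else is illustrative.

# Illustration only: what the gemma3 prompt_formatter produces for one of the
# single_image_prompts. The template string is taken verbatim from the entry.
def format_prompt(img_prompt: str) -> str:
    return f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n"


prompt = format_prompt("<start_of_image>What's the content in the center of the image?")
print(prompt)
# <bos><start_of_turn>user
# <start_of_image>What's the content in the center of the image?<end_of_turn>
# <start_of_turn>model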