
Commit df47c7a

lucianommartins and Chenyaaang authored and committed
[Model] Revert PR vllm-project#26715: Restore custom PaliGemma and Gemma3-MM impl… (vllm-project#27309)
Signed-off-by: Luciano Martins <[email protected]>
Co-authored-by: Luciano Martins <[email protected]>
1 parent 5f8c69a commit df47c7a

File tree: 12 files changed (+1219 additions, −54 deletions)


docs/models/hardware_supported_models/tpu.md

Lines changed: 2 additions & 2 deletions
@@ -16,8 +16,8 @@
 | meta-llama/Llama-4-* | Llama4ForConditionalGeneration ||
 | microsoft/Phi-3-mini-128k-instruct | Phi3ForCausalLM | 🟨 |
 | microsoft/phi-4 | Phi3ForCausalLM ||
-| google/gemma-3-27b-it | TransformersForMultimodalLM | 🟨 |
-| google/gemma-3-4b-it | TransformersForMultimodalLM ||
+| google/gemma-3-27b-it | Gemma3ForConditionalGeneration | 🟨 |
+| google/gemma-3-4b-it | Gemma3ForConditionalGeneration ||
 | deepseek-ai/DeepSeek-R1 | DeepseekV3ForCausalLM ||
 | deepseek-ai/DeepSeek-V3 | DeepseekV3ForCausalLM ||
 | RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 | LlamaForCausalLM ||

docs/models/supported_models.md

Lines changed: 20 additions & 3 deletions
@@ -641,6 +641,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ |
 | `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I<sup>+</sup>/ V<sup>+</sup> | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ |
 | `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ |
+| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ |
 | `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
 | `GLM4VForCausalLM`<sup>^</sup> | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ |
 | `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ |
@@ -670,6 +671,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ |
 | `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ |
 | `Ovis2_5` | Ovis2.5 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.5-9B`, etc. | | |
+| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ |
 | `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ |
 | `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ |
 | `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ |
@@ -694,8 +696,6 @@ Some models are supported only via the [Transformers backend](#transformers). Th
 | Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
 |--------------|--------|--------|-------------------|-----------------------------|-----------------------------------------|
 | `Emu3ForConditionalGeneration` | Emu3 | T + I | `BAAI/Emu3-Chat-hf` | ✅︎ | ✅︎ |
-| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ |
-| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ |
 
 <sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.
 &nbsp;&nbsp;&nbsp;&nbsp;• For example, to use DeepSeek-VL2 series models:
@@ -704,7 +704,21 @@ Some models are supported only via the [Transformers backend](#transformers). Th
 <sup>+</sup> Multiple items can be inputted per text prompt for this modality.
 
 !!! warning
-    For `Gemma3ForConditionalGeneration`, `{"do_pan_and_scan": true}` is not supported in Transformers backend yet.
+    Both V0 and V1 support `Gemma3ForConditionalGeneration` for text-only inputs.
+    However, there are differences in how they handle text + image inputs:
+
+    V0 correctly implements the model's attention pattern:
+        - Uses bidirectional attention between the image tokens corresponding to the same image
+        - Uses causal attention for other tokens
+        - Implemented via (naive) PyTorch SDPA with masking tensors
+        - Note: May use significant memory for long prompts with image
+
+    V1 currently uses a simplified attention pattern:
+        - Uses causal attention for all tokens, including image tokens
+        - Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": true}`
+        - Will be updated in the future to support the correct behavior
+
+    This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
 
 !!! note
     `Gemma3nForConditionalGeneration` is only supported on V1 due to shared KV caching and it depends on `timm>=1.0.17` to make use of its
@@ -756,6 +770,9 @@ Some models are supported only via the [Transformers backend](#transformers). Th
     The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now.
     For more details, please see: <https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630>
 
+!!! warning
+    Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1.
+
 !!! note
     For Qwen2.5-Omni and Qwen3-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`) is currently work in progress and not yet supported.

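The restored warning above describes a mixed attention pattern: bidirectional attention among tokens of the same image, causal attention everywhere else. As a rough illustration only (not the vLLM backend code), a boolean mask with that shape can be sketched in PyTorch, assuming a per-position `image_ids` tensor that holds -1 for text tokens and the image index for image tokens:

import torch

def build_mixed_attention_mask(image_ids: torch.Tensor) -> torch.Tensor:
    """Return a [seq, seq] boolean mask; True means attention is allowed."""
    seq_len = image_ids.shape[0]
    # Causal base: each token attends to itself and earlier positions.
    allowed = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))
    # Bidirectional attention between tokens that belong to the same image.
    is_image = image_ids >= 0
    same_image = (
        (image_ids.unsqueeze(0) == image_ids.unsqueeze(1))
        & is_image.unsqueeze(0)
        & is_image.unsqueeze(1)
    )
    return allowed | same_image

# Two text tokens, three tokens of image 0, then two more text tokens.
ids = torch.tensor([-1, -1, 0, 0, 0, -1, -1])
mask = build_mixed_attention_mask(ids)
assert mask[2, 4]       # image tokens of the same image see each other
assert not mask[0, 1]   # text tokens stay strictly causal

Per the warning, V0 applies this kind of mask through naive SDPA with masking tensors, while V1 currently falls back to a purely causal mask.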
examples/offline_inference/vision_language.py

Lines changed: 1 addition & 2 deletions
@@ -275,8 +275,7 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         max_model_len=2048,
         max_num_seqs=2,
-        # TODO: Support this in transformers backend
-        # mm_processor_kwargs={"do_pan_and_scan": True},
+        mm_processor_kwargs={"do_pan_and_scan": True},
         limit_mm_per_prompt={modality: 1},
     )

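For context, the example above re-enables pan-and-scan through `mm_processor_kwargs`. A minimal standalone sketch of the same idea with vLLM's offline `LLM` API follows; the image path is a placeholder and the chat prompt format is taken from the Gemma 3 prompts used elsewhere in this commit:

from PIL import Image
from vllm import LLM, SamplingParams

llm = LLM(
    model="google/gemma-3-4b-it",
    max_model_len=2048,
    max_num_seqs=2,
    # Pan-and-scan preprocessing is forwarded to the HF processor.
    mm_processor_kwargs={"do_pan_and_scan": True},
    limit_mm_per_prompt={"image": 1},
)

prompt = (
    "<bos><start_of_turn>user\n"
    "<start_of_image>What's the content in the center of the image?<end_of_turn>\n"
    "<start_of_turn>model\n"
)
image = Image.open("stop_sign.jpg")  # placeholder path, not part of the repo

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)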
tests/models/language/generation/test_gemma.py

Lines changed: 11 additions & 5 deletions
@@ -3,7 +3,7 @@
 import numpy as np
 import pytest
 
-MODELS = ["google/gemma-2b", "google/gemma-2-2b"]
+MODELS = ["google/gemma-2b", "google/gemma-2-2b", "google/gemma-3-4b-it"]
 
 
 @pytest.mark.parametrize("model", MODELS)
@@ -14,8 +14,14 @@ def test_dummy_loader(vllm_runner, monkeypatch, model: str) -> None:
             model,
             load_format="dummy",
         ) as llm:
-            normalizers = llm.apply_model(
-                lambda model: model.model.normalizer.cpu().item()
-            )
-            config = llm.llm.llm_engine.model_config.hf_config
+            if model == "google/gemma-3-4b-it":
+                normalizers = llm.llm.collective_rpc(
+                    lambda self: self.model_runner.model.language_model.model.normalizer.cpu().item()  # noqa: E501
+                )
+                config = llm.llm.llm_engine.model_config.hf_config.text_config
+            else:
+                normalizers = llm.llm.collective_rpc(
+                    lambda self: self.model_runner.model.model.normalizer.cpu().item()
+                )
+                config = llm.llm.llm_engine.model_config.hf_config
             assert np.allclose(normalizers, config.hidden_size**0.5, rtol=2e-3)

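The updated test reaches into worker state through `collective_rpc`. A hedged sketch of the same pattern outside the test harness, reusing the attribute paths from the diff (the multimodal Gemma 3 checkpoint nests its text stack under `language_model` and its config under `hf_config.text_config`):

from vllm import LLM

# Dummy weights are enough here; we only read a model attribute.
llm = LLM(model="google/gemma-3-4b-it", load_format="dummy")

# Depending on the vLLM version, sending a lambda over collective_rpc may
# require extra serialization settings, as in the test environment.
normalizers = llm.collective_rpc(
    lambda self: self.model_runner.model.language_model.model.normalizer.cpu().item()
)
hidden_size = llm.llm_engine.model_config.hf_config.text_config.hidden_size

# The Gemma embedding normalizer should be ~sqrt(hidden_size) on every worker.
print(normalizers, hidden_size**0.5)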
tests/models/multimodal/generation/test_common.py

Lines changed: 40 additions & 34 deletions
@@ -113,6 +113,25 @@
         dtype="bfloat16" if current_platform.is_cpu() else "auto",
         marks=[pytest.mark.core_model, pytest.mark.cpu_model],
     ),
+    "paligemma": VLMTestInfo(
+        models=["google/paligemma-3b-mix-224"],
+        test_type=VLMTestType.IMAGE,
+        prompt_formatter=identity,
+        img_idx_to_prompt=lambda idx: "",
+        # Paligemma uses its own sample prompts because the default one fails
+        single_image_prompts=IMAGE_ASSETS.prompts(
+            {
+                "stop_sign": "caption es",
+                "cherry_blossom": "What is in the picture?",
+            }
+        ),
+        auto_cls=AutoModelForImageTextToText,
+        vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
+        dtype="bfloat16",
+        marks=[
+            pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")
+        ],
+    ),
     "qwen2_5_vl": VLMTestInfo(
         models=["Qwen/Qwen2.5-VL-3B-Instruct"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
@@ -177,24 +196,14 @@
     # Gemma3 has bidirectional mask on images
     "gemma3-transformers": VLMTestInfo(
         models=["google/gemma-3-4b-it"],
-        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n",  # noqa: E501
-        single_image_prompts=IMAGE_ASSETS.prompts(
-            {
-                "stop_sign": "<start_of_image>What's the content in the center of the image?",  # noqa: E501
-                "cherry_blossom": "<start_of_image>What is the season?",
-            }
-        ),
-        multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.",  # noqa: E501
-        max_model_len=8192,
+        test_type=VLMTestType.IMAGE,
+        prompt_formatter=lambda vid_prompt: f"<'<bos><start_of_turn>user\n{vid_prompt}<start_of_image><end_of_turn>\n<start_of_turn>model\n",  # noqa: E501
+        max_model_len=4096,
         auto_cls=AutoModelForImageTextToText,
-        # TODO: Support `do_pan_and_scan` in transformers backend
-        # patch_hf_runner=model_utils.gemma3_patch_hf_runner,
         vllm_output_post_proc=model_utils.gemma3_vllm_to_hf_output,
         image_size_factors=[(0.25, 0.5, 1.0)],
         vllm_runner_kwargs={
             "model_impl": "transformers",
-            # "mm_processor_kwargs": {"do_pan_and_scan": True},
         },
         marks=[pytest.mark.core_model],
     ),
@@ -213,27 +222,6 @@
         },
         marks=[pytest.mark.core_model],
     ),
-    # PaliGemma has PrefixLM attention
-    "paligemma-transformers": VLMTestInfo(
-        models=["google/paligemma-3b-mix-224"],
-        test_type=VLMTestType.IMAGE,
-        prompt_formatter=identity,
-        img_idx_to_prompt=lambda idx: "",
-        # PaliGemma uses its own sample prompts because the default one fails
-        single_image_prompts=IMAGE_ASSETS.prompts(
-            {
-                "stop_sign": "caption es",
-                "cherry_blossom": "What is in the picture?",
-            }
-        ),
-        auto_cls=AutoModelForImageTextToText,
-        vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
-        image_size_factors=[(0.25, 0.5, 1.0)],
-        vllm_runner_kwargs={
-            "model_impl": "transformers",
-        },
-        marks=[pytest.mark.core_model],
-    ),
     # Pixel values from processor are not 4D or 5D arrays
     "qwen2_5_vl-transformers": VLMTestInfo(
         models=["Qwen/Qwen2.5-VL-3B-Instruct"],
@@ -360,6 +348,24 @@
         image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
         marks=[large_gpu_mark(min_gb=32)],
     ),
+    "gemma3": VLMTestInfo(
+        models=["google/gemma-3-4b-it"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n",  # noqa: E501
+        single_image_prompts=IMAGE_ASSETS.prompts(
+            {
+                "stop_sign": "<start_of_image>What's the content in the center of the image?",  # noqa: E501
+                "cherry_blossom": "<start_of_image>What is the season?",
+            }
+        ),
+        multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.",  # noqa: E501
+        max_model_len=4096,
+        max_num_seqs=2,
+        auto_cls=AutoModelForImageTextToText,
+        vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
+        patch_hf_runner=model_utils.gemma3_patch_hf_runner,
+        num_logprobs=10,
+    ),
     "glm4v": VLMTestInfo(
         models=["zai-org/glm-4v-9b"],
         test_type=VLMTestType.IMAGE,

tests/models/multimodal/generation/vlm_utils/model_utils.py

Lines changed: 10 additions & 0 deletions
@@ -328,6 +328,16 @@ def processor(*args, **kwargs):
 
     hf_model.processor = processor
 
+    orig_generate = hf_model.model.generate
+
+    def _generate(self, *args, **kwargs):
+        # FIXME: https://github.com/huggingface/transformers/issues/38333
+        kwargs["disable_compile"] = True
+
+        return orig_generate(*args, **kwargs)
+
+    hf_model.model.generate = types.MethodType(_generate, hf_model.model)
+
     return hf_model

tests/models/multimodal/processing/test_common.py

Lines changed: 4 additions & 0 deletions
@@ -222,6 +222,7 @@ def _to_dummy_options(modality: str, count: int) -> BaseDummyOptions:
 _ADD_SPECIAL_TOKENS_OVERRIDES = {
     "ovis": False,
     "ovis2_5": False,
+    "paligemma": False,
     "ultravox": False,
     "whisper": False,
 }
@@ -333,6 +334,7 @@ def _test_processing_correctness_one(
     "deepseek-ai/deepseek-vl2-tiny",
     "baidu/ERNIE-4.5-VL-28B-A3B-PT",
     "adept/fuyu-8b",
+    "google/gemma-3-4b-it",
     "google/gemma-3n-E2B-it",
     "zai-org/glm-4v-9b",
     "zai-org/GLM-4.1V-9B-Thinking",
@@ -369,6 +371,8 @@ def _test_processing_correctness_one(
     "AIDC-AI/Ovis1.6-Llama3.2-3B",
     "AIDC-AI/Ovis2-1B",
     "AIDC-AI/Ovis2.5-2B",
+    "google/paligemma-3b-mix-224",
+    "google/paligemma2-3b-ft-docci-448",
     "microsoft/Phi-3.5-vision-instruct",
     "microsoft/Phi-4-multimodal-instruct",
     "mistralai/Pixtral-12B-2409",

tests/models/multimodal/processing/test_tensor_schema.py

Lines changed: 1 addition & 0 deletions
@@ -48,6 +48,7 @@
     "Idefics3ForConditionalGeneration",
     "LlavaForConditionalGeneration",
     "MiniCPMV",
+    "PaliGemmaForConditionalGeneration",
 ]
 REPO_ID_TO_SKIP = {
     "nm-testing/pixtral-12b-FP8-dynamic": "duplicated test",
