
Commit be6c09b

Add Gemma3 GGUF multimodal support
- Enable GGUF multimodal detection in ModelConfig
- Implement GGUF loader for vision tower and projector weights
- Add processor support for GGUF Gemma3 models
- Support multimodal embedding gathering in V1 engine

Signed-off-by: Luciano Martins <[email protected]>
1 parent 1994de9 commit be6c09b
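
For context, a hedged sketch of how the feature could be exercised from vLLM's offline API once this lands. The model path, image file, and prompt are placeholders (Gemma3 normally expects its image placeholder token in the prompt, and GGUF runs often pass an explicit tokenizer=), so treat this as the shape of the call rather than a recipe.

# Sketch only: paths, prompt, and image handling are illustrative, not taken from the commit.
from PIL import Image

from vllm import LLM, SamplingParams

# The mmproj*.gguf file is expected to sit next to the text-model GGUF.
llm = LLM(model="/models/gemma-3-4b-it/gemma-3-4b-it-Q4_K_M.gguf")

outputs = llm.generate(
    {
        "prompt": "Describe this image.",
        "multi_modal_data": {"image": Image.open("example.jpg")},
    },
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)
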

File tree: 11 files changed, +703 -66 lines

vllm/config/model.py

Lines changed: 87 additions & 3 deletions
@@ -68,6 +68,36 @@
 
 logger = init_logger(__name__)
 
+
+def _detect_gguf_multimodal_gemma3(model: str) -> bool:
+    """Check if GGUF model has multimodal projector file for Gemma3.
+
+    Args:
+        model: Model path string
+
+    Returns:
+        True if this is a Gemma3 GGUF model with mmproj file, False otherwise
+    """
+    if not model.endswith(".gguf"):
+        return False
+
+    try:
+        from pathlib import Path
+
+        model_path = Path(model)
+        if not model_path.is_file():
+            return False
+
+        model_dir = model_path.parent
+        mmproj_patterns = ["mmproj.gguf", "mmproj-*.gguf", "*mmproj*.gguf"]
+        for pattern in mmproj_patterns:
+            if list(model_dir.glob(pattern)):
+                return True
+        return False
+    except Exception:
+        return False
+
+
 RunnerOption = Literal["auto", RunnerType]
 ConvertType = Literal["none", "embed", "classify", "reward"]
 ConvertOption = Literal["auto", ConvertType]
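
For orientation, a minimal standalone sketch of the directory layout this helper looks for, plus an equivalent condensed check; the file names are hypothetical.

# Expected layout (hypothetical paths): the text-model GGUF next to an mmproj GGUF.
#
#   /models/gemma-3-4b-it/
#       gemma-3-4b-it-Q4_K_M.gguf   <- passed as the model path
#       mmproj-F16.gguf             <- matched by the "*mmproj*.gguf" glob
from pathlib import Path


def has_mmproj(model: str) -> bool:
    """Condensed version of the check done by _detect_gguf_multimodal_gemma3."""
    path = Path(model)
    if not (model.endswith(".gguf") and path.is_file()):
        return False
    # The broadest pattern, "*mmproj*.gguf", subsumes the other two in the diff.
    return bool(list(path.parent.glob("*mmproj*.gguf")))


print(has_mmproj("/models/gemma-3-4b-it/gemma-3-4b-it-Q4_K_M.gguf"))
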
@@ -556,6 +586,46 @@ def __post_init__(
 
         architectures = self.architectures
         registry = self.registry
+
+        # GGUF multimodal: Force Gemma3ForConditionalGeneration architecture
+        # when mmproj file is present, before model resolution
+        if _detect_gguf_multimodal_gemma3(self.model):
+            is_gemma3 = any("gemma3" in str(arch).lower() for arch in architectures)
+            if is_gemma3:
+                architectures = ["Gemma3ForConditionalGeneration"]
+                self.hf_config.architectures = architectures
+                logger.info(
+                    "Detected Gemma3 GGUF with mmproj.gguf, "
+                    "forcing Gemma3ForConditionalGeneration"
+                )
+
+                # Initialize vision_config if not present
+                if (
+                    not hasattr(self.hf_config, "vision_config")
+                    or self.hf_config.vision_config is None
+                ):
+                    from transformers import SiglipVisionConfig
+
+                    self.hf_config.vision_config = SiglipVisionConfig(
+                        hidden_size=1152,
+                        intermediate_size=4304,
+                        num_hidden_layers=27,
+                        num_attention_heads=16,
+                        num_channels=3,
+                        image_size=896,
+                        patch_size=14,
+                        layer_norm_eps=1e-6,
+                        attention_dropout=0.0,
+                        num_image_tokens=256,
+                        # Disable pooling head for Gemma3
+                        vision_use_head=False,
+                    )
+                self.hf_config.mm_tokens_per_image = 256
+                self.hf_config.image_token_index = 262144
+                # DO NOT set boi_token_index - let
+                # gemma3_mm.py fall back to 262143
+                self.hf_config.eoi_token_index = 256000
+
         is_generative_model = registry.is_text_generation_model(architectures, self)
         is_pooling_model = registry.is_pooling_model(architectures, self)
 
@@ -701,8 +771,24 @@ def _task_to_convert(task: TaskOption) -> ConvertType:
 
         self.original_max_model_len = self.max_model_len
         self.max_model_len = self.get_and_verify_max_len(self.max_model_len)
+
+        # GGUF multimodal: Set flag to initialize multimodal_config
+        # when Gemma3 mmproj file is present
+        is_gguf_multimodal = False
+        if _detect_gguf_multimodal_gemma3(self.model):
+            is_gemma3 = any(
+                "gemma3" in str(arch).lower() for arch in self.architectures
+            )
+            if is_gemma3:
+                is_gguf_multimodal = True
+                logger.info(
+                    "Detected Gemma3 GGUF multimodal model "
+                    "with mmproj.gguf, initializing "
+                    "multimodal_config"
+                )
+
         # Init multimodal config if needed
-        if self._model_info.supports_multimodal:
+        if self._model_info.supports_multimodal or is_gguf_multimodal:
             if (
                 mm_encoder_tp_mode == "data"
                 and not self._model_info.supports_multimodal_encoder_tp_data
@@ -888,8 +974,6 @@ def _get_default_runner_type(
                 _, (runner_type, _) = match
                 return runner_type
 
-        return "generate"
-
     def _get_runner_type(
         self,
         architectures: list[str],

vllm/model_executor/layers/quantization/gguf.py

Lines changed: 44 additions & 12 deletions
@@ -36,9 +36,12 @@
 class GGUFConfig(QuantizationConfig):
     """Config class for GGUF."""
 
-    def __init__(self, unquantized_modules: list[str] | None = None) -> None:
+    def __init__(
+        self, unquantized_modules: list[str] | None = None, model_arch: str = ""
+    ) -> None:
         super().__init__()
         self.unquantized_modules = unquantized_modules or []
+        self.model_arch = model_arch
 
     def __repr__(self) -> str:
         return "GGUFConfig()"
@@ -59,17 +62,19 @@ def get_config_filenames(cls) -> list[str]:
 
     @classmethod
     def from_config(cls, config: dict[str, Any]) -> "GGUFConfig":
-        return cls()
+        # Extract model_arch from config if available
+        model_arch = config.get("model_arch", "")
+        return cls(model_arch=model_arch)
 
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
     ) -> Optional["QuantizeMethodBase"]:
         if isinstance(layer, LinearBase):
             if is_layer_skipped_gguf(prefix, self.unquantized_modules):
                 return UnquantizedLinearMethod()
-            return GGUFLinearMethod(self)
+            return GGUFLinearMethod(self, self.model_arch)
         elif isinstance(layer, VocabParallelEmbedding):
-            return GGUFEmbeddingMethod(self)
+            return GGUFEmbeddingMethod(self, self.model_arch)
         elif isinstance(layer, FusedMoE):
             return GGUFMoEMethod(self, layer.moe_config)
         return None
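
A small illustrative sketch of how model_arch flows from the loader config into the quantization methods; the "gemma3" value is assumed for illustration, not taken from the commit.

from vllm.model_executor.layers.quantization.gguf import GGUFConfig

# from_config() now forwards "model_arch" into the config object ...
cfg = GGUFConfig.from_config({"model_arch": "gemma3"})
assert cfg.model_arch == "gemma3"
# ... and get_quant_method() passes it on to GGUFLinearMethod / GGUFEmbeddingMethod,
# whose case-insensitive "gemma3" substring check enables the dtype handling below.
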
@@ -115,7 +120,10 @@ def is_layer_skipped_gguf(prefix: str, unquantized_modules: list[str]):
 
 
 def _fused_mul_mat_gguf(
-    x: torch.Tensor, qweight: torch.Tensor, qweight_type: int
+    x: torch.Tensor,
+    qweight: torch.Tensor,
+    qweight_type: int,
+    target_dtype: torch.dtype | None = None,
 ) -> torch.Tensor:
     if qweight_type in IMATRIX_QUANT_TYPES:
         mmvq_safe = 8 if qweight.shape[0] > 5120 else 16
@@ -125,9 +133,15 @@
     # so input to logits generator is empty which causes invalid parameter
     if x.shape[0] == 0:
         return torch.empty(x.shape[0], qweight.shape[0], dtype=x.dtype, device=x.device)
-    # there is no need to call any kernel for fp16/bf16
+    # Unquantized weights (F16/BF16) use direct matrix multiplication
    if qweight_type in UNQUANTIZED_TYPES:
-        return x @ qweight.T
+        # GGUF stores some weights as F16, but the model may use bfloat16 for
+        # computation. Convert weight dtype to match target_dtype (typically
+        # params_dtype=bfloat16) to ensure consistency in mixed-precision.
+        weight = qweight
+        if target_dtype is not None and weight.dtype != target_dtype:
+            weight = weight.to(target_dtype)
+        return x @ weight.T
     # enable MMVQ in contiguous batching with batch_size=1
     if x.shape[0] <= mmvq_safe and qweight_type in MMVQ_QUANT_TYPES:
         y = ops.ggml_mul_mat_vec_a8(qweight, x, qweight_type, qweight.shape[0])
@@ -138,7 +152,9 @@
     elif qweight_type in DEQUANT_TYPES:
         block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type]
         shape = (qweight.shape[0], qweight.shape[1] // type_size * block_size)
-        weight = ops.ggml_dequantize(qweight, qweight_type, *shape, x.dtype)
+        # Use target_dtype if provided, otherwise fall back to x.dtype
+        dequant_dtype = target_dtype if target_dtype is not None else x.dtype
+        weight = ops.ggml_dequantize(qweight, qweight_type, *shape, dequant_dtype)
         y = x @ weight.T
     else:
         # Raise an error if the quantization type is not supported.
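
The motivation for the cast on the unquantized path, as a standalone sketch (shapes are arbitrary): PyTorch matrix multiplication requires both operands to share a dtype, so a bfloat16 activation cannot be multiplied directly against an F16 GGUF weight.

import torch

x = torch.randn(2, 8, dtype=torch.bfloat16)  # activations in the model's compute dtype
w = torch.randn(4, 8, dtype=torch.float16)   # GGUF weight tensor stored as F16

try:
    _ = x @ w.T  # mixed-dtype matmul raises a RuntimeError
except RuntimeError as err:
    print("mixed dtypes:", err)

y = x @ w.to(torch.bfloat16).T  # cast to target_dtype first, as the patch does
print(y.dtype)  # torch.bfloat16
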
@@ -153,6 +169,7 @@ def _fused_mul_mat_gguf_fake(
     x: torch.Tensor,
     qweight: torch.Tensor,
     qweight_type: int,
+    target_dtype: torch.dtype | None = None,
 ) -> torch.Tensor:
     return torch.empty(x.shape[0], qweight.shape[0], dtype=x.dtype, device=x.device)
 
@@ -311,7 +328,13 @@ def _apply_gguf_embedding(
     dtype: torch.dtype | None = None,
 ) -> torch.Tensor:
     if qweight_type in UNQUANTIZED_TYPES:
-        return torch.embedding(qweight, x)
+        # torch.embedding preserves weight tensor dtype (F16 for GGUF).
+        # Convert result to model's computation dtype (typically bfloat16)
+        # to maintain consistency throughout forward pass.
+        result = torch.embedding(qweight, x)
+        if dtype is not None and result.dtype != dtype:
+            result = result.to(dtype)
+        return result
     elif qweight_type in DEQUANT_TYPES:
         block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type]
         x_flat = x.flatten()
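
The embedding path has the same concern; a short sketch (arbitrary sizes) showing that the lookup result inherits the weight table's dtype rather than the model's compute dtype, hence the explicit cast.

import torch

table = torch.randn(10, 4, dtype=torch.float16)  # GGUF embedding table stored as F16
ids = torch.tensor([1, 3, 5])

emb = torch.embedding(table, ids)  # same op the patched code uses
print(emb.dtype)  # torch.float16 - follows the table's dtype

emb = emb.to(torch.bfloat16)  # cast to the model's compute dtype
print(emb.dtype)  # torch.bfloat16
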
@@ -355,7 +378,9 @@ class GGUFLinearMethod(LinearMethodBase):
         quant_config: The GGUF quantization config.
     """
 
-    def __init__(self, quant_config: GGUFConfig):
+    def __init__(self, quant_config: GGUFConfig, model_arch: str = ""):
+        self.model_arch = model_arch.lower()
+        self.is_gemma3 = "gemma3" in self.model_arch
         self.quant_config = quant_config
 
     def create_weights(
@@ -369,6 +394,9 @@ def create_weights(
         **extra_weight_attrs,
     ):
         self.params_dtype = params_dtype
+        # Gemma3 isolation: only apply dtype conversion for Gemma3 models
+        self.target_dtype = params_dtype if self.is_gemma3 else None
+
         output_size_per_partition = sum(output_partition_sizes)
 
         tensor_shape = (output_size_per_partition, input_size_per_partition)
@@ -467,14 +495,18 @@ def apply(
                 qweight_type = layer.qweight_type.shard_weight_type[idx]
                 result.append(
                     fused_mul_mat_gguf(
-                        x, qweight[start:end, :offset].contiguous(), qweight_type
+                        x,
+                        qweight[start:end, :offset].contiguous(),
+                        qweight_type,
+                        self.target_dtype,
                     )
                 )
             out = torch.cat(result, axis=1)
         else:
             qweight = layer.qweight
             qweight_type = layer.qweight_type.weight_type
-            out = fused_mul_mat_gguf(x, qweight, qweight_type)
+            # Gemma3 isolation: only apply dtype conversion for Gemma3
+            out = fused_mul_mat_gguf(x, qweight, qweight_type, self.target_dtype)
         if bias is not None:
             out.add_(bias)
         return out