
Commit 2551817

Fix GGUF pan-and-scan attention and CUDA graph mask preservation
Fixes four critical issues in GGUF multimodal inference:

1. Attention scaling parameter bug (gemma3.py):
   - Fix F.scaled_dot_product_attention to use named parameters
   - Change positional args to attn_mask=attn_mask, scale=self.scaling
   - Prevents incorrect dropout application (was 6.25% instead of 0%)

2. Custom attention mask persistence (gpu_model_runner.py):
   - Store custom_model_kwargs after mask generation
   - Merge custom_model_kwargs in _dummy_run
   - Prevents loss of attention masks during CUDA graph re-initialization

3. Pan-and-scan attention pattern (gemma3_mm.py):
   - Detect pan-and-scan mode via multimodal_config.do_pan_and_scan
   - Keep pure causal attention for pan-and-scan crops
   - Prevents crop isolation artifacts in sequential processing

4. GGUF unquantized weight loading (weight_utils.py):
   - Add proper dtype conversion for BF16/F16/F32 stored as uint8
   - Handle byte-to-dtype conversion (BF16: 2 bytes, F16: 2 bytes, F32: 4 bytes)
   - Add fallback handling for unexpected dtype/type combinations
   - Fixes weight loading for unquantized GGUF multimodal projector weights

Signed-off-by: Luciano Martins <[email protected]>
1 parent bb47210 commit 2551817
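
For anyone who wants to exercise the affected path end to end, a rough sketch of loading a GGUF Gemma 3 multimodal checkpoint with pan-and-scan enabled follows. The file name, the HF repo used for the tokenizer/processor, the image path, and the prompt format are assumptions for illustration, not taken from this commit; do_pan_and_scan is assumed to be passed through mm_processor_kwargs.

from PIL import Image
from vllm import LLM, SamplingParams

# Hypothetical paths/IDs: point these at your local GGUF file and the matching
# HF repo that provides the tokenizer and image processor.
llm = LLM(
    model="./gemma-3-4b-it.gguf",
    tokenizer="google/gemma-3-4b-it",
    mm_processor_kwargs={"do_pan_and_scan": True},  # exercises the new causal path
)

image = Image.open("./example.jpg")  # any local test image
outputs = llm.generate(
    {
        "prompt": "<start_of_image> Describe this image.",
        "multi_modal_data": {"image": image},
    },
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)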

4 files changed: +96 -7 lines


vllm/model_executor/model_loader/weight_utils.py

Lines changed: 71 additions & 1 deletion
@@ -919,9 +919,79 @@ def gguf_quant_weights_iterator(
             weight = tensor.data
             weight_type = tensor.tensor_type
             name = gguf_to_hf_name_map[tensor.name]
+
             if weight_type.name not in ("F32", "BF16", "F16"):
+                # Quantized tensors: handled by quantization layers
                 name = name.replace("weight", "qweight")
-            param = torch.tensor(weight)
+                param = torch.tensor(weight)
+            else:
+                # Unquantized tensors: may need dtype conversion
+                # GGUF stores BF16/F16 as uint8 bytes but F32 as float32
+
+                # Check if already in target dtype
+                if weight.dtype == np.float32 and weight_type.name == "F32":
+                    # F32 tensors are stored directly as float32
+                    param = torch.from_numpy(np.array(weight))
+
+                elif weight.dtype == np.float16 and weight_type.name == "F16":
+                    # F16 tensors are stored directly as float16
+                    param = torch.from_numpy(np.array(weight))
+
+                elif weight.dtype == np.uint8:
+                    # Stored as bytes: convert to target dtype
+                    if weight_type.name == "BF16":
+                        # BF16: 2 bytes per value
+                        # Input: [..., hidden_dim * 2] uint8
+                        # Output: [..., hidden_dim] bfloat16
+                        weight_uint16 = np.frombuffer(
+                            weight.tobytes(), dtype=np.uint16
+                        )
+                        target_shape = weight.shape[:-1] + (weight.shape[-1] // 2,)
+                        weight_uint16 = weight_uint16.reshape(target_shape)
+                        param = torch.from_numpy(weight_uint16).view(torch.bfloat16)
+
+                    elif weight_type.name == "F16":
+                        # F16 (float16): 2 bytes per value
+                        # Input: [..., hidden_dim * 2] uint8
+                        # Output: [..., hidden_dim] float16
+                        weight_uint16 = np.frombuffer(
+                            weight.tobytes(), dtype=np.uint16
+                        )
+                        target_shape = weight.shape[:-1] + (weight.shape[-1] // 2,)
+                        weight_uint16 = weight_uint16.reshape(target_shape)
+                        param = torch.from_numpy(weight_uint16).view(torch.float16)
+
+                    elif weight_type.name == "F32":
+                        # F32 (float32): 4 bytes per value
+                        # Input: [..., hidden_dim * 4] uint8
+                        # Output: [..., hidden_dim] float32
+                        weight_float32 = np.frombuffer(
+                            weight.tobytes(), dtype=np.float32
+                        )
+                        target_shape = weight.shape[:-1] + (weight.shape[-1] // 4,)
+                        weight_float32 = weight_float32.reshape(target_shape)
+                        param = torch.from_numpy(weight_float32)
+
+                    else:
+                        # Unknown format
+                        logger.warning(
+                            "Unknown uint8-stored weight type '%s' for tensor '%s'.",
+                            weight_type.name,
+                            name,
+                        )
+                        param = torch.tensor(weight)
+
+                else:
+                    # Unexpected dtype/type combination
+                    logger.warning(
+                        "Unexpected dtype '%s' for weight type '%s' in tensor '%s'. "
+                        "Falling back to torch.tensor().",
+                        weight.dtype,
+                        weight_type.name,
+                        name,
+                    )
+                    param = torch.tensor(weight)
+
             yield name, param
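
The key trick in the new else branch is a byte reinterpretation rather than a numeric cast: the uint8 buffer is reread as 16-bit (or 32-bit) words and then viewed as the target dtype, so no values change. A minimal standalone sketch of the BF16 path; the tensor contents and shapes are invented, the extra .copy() only avoids torch's read-only-array warning, and a little-endian host plus a PyTorch recent enough to accept uint16 arrays in from_numpy (which the patch itself relies on) are assumed.

import numpy as np
import torch

# Fabricate what a GGUF reader hands back for a BF16 tensor: raw bytes as
# uint8, so the last dimension is 2x the logical width.
logical = torch.randn(3, 4, dtype=torch.bfloat16)
raw = np.frombuffer(logical.view(torch.int16).numpy().tobytes(), dtype=np.uint8)
weight = raw.reshape(3, 4 * 2)

# Same steps as the patch: bytes -> 16-bit words -> reinterpret as bfloat16.
weight_uint16 = np.frombuffer(weight.tobytes(), dtype=np.uint16)
target_shape = weight.shape[:-1] + (weight.shape[-1] // 2,)
param = torch.from_numpy(weight_uint16.reshape(target_shape).copy()).view(torch.bfloat16)

assert torch.equal(param, logical)  # pure reinterpretation: values survive exactly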

vllm/model_executor/models/gemma3.py

Lines changed: 2 additions & 2 deletions
@@ -286,8 +286,8 @@ def naive_attn_with_masks(
                 query,
                 key,
                 value,
-                attn_mask,
-                self.scaling,
+                attn_mask=attn_mask,
+                scale=self.scaling,
             )
             output = output.transpose(1, 2).flatten(-2, -1)
             out[start_idx:end_idx] = output
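
Why naming the parameters matters: in torch.nn.functional.scaled_dot_product_attention the positional order after value is attn_mask, dropout_p, is_causal, scale, so a positionally passed scale binds to dropout_p. A small self-contained sketch; the shapes are illustrative, and head_dim=256 is chosen only because 256 ** -0.5 == 0.0625, matching the 6.25% figure in the commit message.

import torch
import torch.nn.functional as F

q = torch.randn(1, 8, 16, 256)           # (batch, heads, seq, head_dim), illustrative
k, v = torch.randn_like(q), torch.randn_like(q)
attn_mask = torch.zeros(1, 1, 16, 16)    # additive mask of zeros: nothing blocked
scaling = 256 ** -0.5                    # 0.0625

# Buggy binding: `scaling` lands in dropout_p, so ~6.25% of attention weights are
# randomly dropped and the intended scale override is silently ignored.
buggy = F.scaled_dot_product_attention(q, k, v, attn_mask, scaling)

# Fixed binding: keyword arguments put the scale where it belongs; dropout stays 0.
fixed = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, scale=scaling)

print(torch.allclose(buggy, fixed))      # almost certainly False because of the dropout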

vllm/model_executor/models/gemma3_mm.py

Lines changed: 20 additions & 4 deletions
@@ -708,10 +708,26 @@ def generate_attention_masks(
             # Fill the lower triangle with 0 (causal attention)
             global_attn_mask = global_attn_mask.triu(diagonal=1)

-            # Enable bidirectional attention between image tokens
-            # Use advanced indexing for better performance
-            img_indices = torch.where(img_pos)[0]
-            global_attn_mask[:, :, img_indices[:, None], img_indices] = 0
+            # Conditionally apply bidirectional attention based on pan-and-scan
+            # mode. Pan-and-scan crops require pure causal attention to build
+            # sequential context across crops, matching HF transformers behavior.
+            # Non-pan-and-scan images use bidirectional attention for richer
+            # cross-token interactions within each image.
+            is_pan_and_scan = getattr(self.multimodal_config, "do_pan_and_scan", False)
+
+            if is_pan_and_scan:
+                # Pan-and-scan: Keep pure causal attention (mask unchanged).
+                # Crops are processed sequentially, allowing later crops to
+                # attend to earlier ones, building coherent context across the
+                # entire image. This prevents crop isolation artifacts.
+                pass
+            else:
+                # Non-pan-and-scan: Enable bidirectional attention for image
+                # tokens. This allows all tokens within each image to attend
+                # to each other, improving representation quality.
+                img_indices = torch.where(img_pos)[0]
+                global_attn_mask[:, :, img_indices[:, None], img_indices] = 0
+
             global_attn_masks.append(global_attn_mask)

             # GGUF compatibility: config might be Gemma3TextConfig directly
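
To make the two mask regimes concrete, here is a tiny standalone sketch on a 6-token sequence where positions 1-3 are image tokens; the sizes and the img_pos layout are invented for illustration, and the 0/-inf convention matches the surrounding model code.

import torch

seq_len = 6
img_pos = torch.tensor([False, True, True, True, False, False])  # invented layout

# Causal baseline: 0 means "may attend", -inf means "blocked".
mask = torch.full((1, 1, seq_len, seq_len), float("-inf"))
mask = mask.triu(diagonal=1)

do_pan_and_scan = False  # True would leave the pure causal mask untouched
if not do_pan_and_scan:
    # Bidirectional attention among image tokens: unblock every (image, image)
    # pair, including entries above the diagonal.
    img_indices = torch.where(img_pos)[0]
    mask[:, :, img_indices[:, None], img_indices] = 0

print(mask[0, 0])  # rows 1-3 can now see columns 1-3 in both directions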

vllm/v1/worker/gpu_model_runner.py

Lines changed: 3 additions & 0 deletions
@@ -2484,6 +2484,8 @@ def _preprocess(
                 mask_dtype=self.model.dtype,
             )
             model_kwargs.update(mask_kwargs)
+            # Store for _dummy_run to prevent loss during re-initialization.
+            self.custom_model_kwargs = mask_kwargs
         elif self.enable_prompt_embeds and is_first_rank:
             # Get the input embeddings for the tokens that are not input embeds,
             # then put them into the appropriate positions.
@@ -3952,6 +3954,7 @@ def _dummy_run(
                 model_kwargs = {
                     **model_kwargs,
                     **self._dummy_mm_kwargs(num_reqs),
+                    **getattr(self, "custom_model_kwargs", {}),
                 }
            elif self.enable_prompt_embeds:
                input_ids = None
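
The getattr default is what keeps _dummy_run safe for runners that never generate custom masks: the merge simply contributes an empty dict. A toy sketch of the save-then-merge pattern, independent of the real runner class; every name here is an illustrative stand-in.

class RunnerSketch:
    """Toy stand-in for the model runner; names below are illustrative only."""

    def preprocess(self, has_custom_masks: bool) -> dict:
        model_kwargs = {"input_ids": "..."}
        if has_custom_masks:
            mask_kwargs = {"global_attn_masks": "...", "local_attn_masks": "..."}
            model_kwargs.update(mask_kwargs)
            # Remember the masks so a later dummy run (e.g. CUDA graph capture)
            # sees the same keyword arguments as a real forward pass.
            self.custom_model_kwargs = mask_kwargs
        return model_kwargs

    def dummy_run(self) -> dict:
        # getattr with a default keeps this working when preprocess never
        # stored anything (the attribute simply does not exist yet).
        return {"input_ids": None, **getattr(self, "custom_model_kwargs", {})}


r = RunnerSketch()
r.preprocess(has_custom_masks=True)
print(r.dummy_run())  # the stored mask kwargs survive into the dummy run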
