Skip to content

Commit f784296

Browse files
refactor(gguf): remove redundant vocab_size extraction
Remove extract_vocab_size_from_gguf() and the vocab_size override logic, as transformers now handles this internally via modeling_gguf_pytorch_utils. Also fixed tests/models/multimodal/generation/test_common.py to use the HuggingFace Transformers implementation for Gemma3 testing. Signed-off-by: Luciano Martins <[email protected]>
1 parent 9c071b9 commit f784296

File tree

2 files changed

+1
-62
lines changed

2 files changed

+1
-62
lines changed

tests/models/multimodal/generation/test_common.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -221,7 +221,7 @@
221221
vllm_output_post_proc=model_utils.gemma3_vllm_to_hf_output,
222222
image_size_factors=[(0.25, 0.5, 1.0)],
223223
vllm_runner_kwargs={
224-
"model_impl": "auto",
224+
"model_impl": "transformers",
225225
},
226226
marks=[pytest.mark.core_model],
227227
),

vllm/transformers_utils/gguf_utils.py

Lines changed: 0 additions & 61 deletions
Original file line number | Diff line number | Diff line change
@@ -41,50 +41,6 @@ def detect_gguf_multimodal(model: str) -> Path | None:
4141
return None
4242

4343

44-
def extract_vocab_size_from_gguf(model_path: str) -> int | None:
45-
"""Extract vocabulary size from GGUF file's embedding tensor.
46-
47-
Reads the token_embd.weight tensor shape to determine the actual
48-
vocabulary size in the GGUF file. This is the source of truth for
49-
GGUF models and may differ from the HuggingFace tokenizer config,
50-
particularly for models with extended vocabularies (e.g., Unsloth).
51-
52-
Args:
53-
model_path: Path to GGUF model file
54-
55-
Returns:
56-
Vocabulary size (second dimension of token_embd.weight tensor)
57-
or None if extraction fails
58-
59-
Note:
60-
GGUF embedding tensor format: [hidden_size, vocab_size]
61-
We extract vocab_size from shape[1]
62-
"""
63-
try:
64-
reader = gguf.GGUFReader(model_path)
65-
for tensor in reader.tensors:
66-
if tensor.name == "token_embd.weight":
67-
# Tensor shape: [hidden_size, vocab_size]
68-
vocab_size = int(tensor.shape[1])
69-
logger.info(
70-
"Extracted vocab_size=%d from GGUF embedding tensor",
71-
vocab_size,
72-
)
73-
return vocab_size
74-
75-
logger.warning(
76-
"Could not find token_embd.weight tensor in GGUF file: %s",
77-
model_path,
78-
)
79-
return None
80-
except Exception as e:
81-
logger.warning(
82-
"Failed to extract vocab_size from GGUF file %s: %s",
83-
model_path,
84-
e,
85-
)
86-
return None
87-
8844

8945
def extract_vision_config_from_gguf(mmproj_path: str) -> "SiglipVisionConfig | None":
9046
"""Extract vision config parameters from mmproj.gguf metadata.
@@ -208,21 +164,4 @@ def maybe_patch_hf_config_from_gguf(
208164
)
209165
hf_config = new_hf_config
210166

211-
# Override vocab_size from GGUF embedding tensor for all GGUF models
212-
# This handles models with extended vocabularies (e.g., Unsloth)
213-
if model.endswith(".gguf"):
214-
vocab_size_from_gguf = extract_vocab_size_from_gguf(model)
215-
if vocab_size_from_gguf is not None:
216-
# Get text config (handles both regular and multimodal configs)
217-
text_config = hf_config.get_text_config()
218-
original_vocab_size = text_config.vocab_size
219-
220-
if original_vocab_size != vocab_size_from_gguf:
221-
logger.info(
222-
"Overriding vocab_size: %d (HF config) → %d (GGUF file)",
223-
original_vocab_size,
224-
vocab_size_from_gguf,
225-
)
226-
text_config.vocab_size = vocab_size_from_gguf
227-
228167
return hf_config

0 commit comments

Comments (0)