File tree Expand file tree Collapse file tree 2 files changed +10
-2
lines changed
model_executor/model_loader Expand file tree Collapse file tree 2 files changed +10
-2
lines changed Original file line number Diff line number Diff line change 1111 ChatCompletionContentPartImageParam , ChatCompletionContentPartTextParam ,
1212 MultiModalItemTracker , _ContentPart , _parse_chat_message_content_part )
1313from vllm .inputs import TokensPrompt
14- from vllm .model_executor .model_loader import get_model_cls
1514from vllm .model_executor .models .interfaces import supports_score_template
1615from vllm .multimodal .inputs import MultiModalDataDict
1716from vllm .outputs import PoolingRequestOutput
@@ -140,6 +139,8 @@ def apply_score_template(
140139 prompt_1 : str ,
141140 prompt_2 : str ,
142141) -> str :
142+ # NOTE(Simon): lazy import to avoid bringing in all dependencies (e.g. gguf)
143+ from vllm .model_executor .model_loader import get_model_cls
143144
144145 model = get_model_cls (model_config )
145146 if supports_score_template (model ):
@@ -162,6 +163,9 @@ def post_process_tokens(
162163 Note:
163164 This is an in-place operation.
164165 """
166+ # NOTE(Simon): lazy import to avoid bringing in all dependencies (e.g. gguf)
167+ from vllm .model_executor .model_loader import get_model_cls
168+
165169 model = get_model_cls (model_config )
166170 if supports_score_template (model ):
167171 model .post_process_tokens (prompt )
Original file line number Diff line number Diff line change 1414from typing import Any , Callable , Optional , Union
1515
1616import filelock
17- import gguf
1817import huggingface_hub .constants
1918import numpy as np
2019import torch
4039 SafetensorsStreamer = runai_model_streamer .placeholder_attr (
4140 "SafetensorsStreamer" )
4241
42+ try :
43+ import gguf
44+ except ImportError :
45+ gguf = PlaceholderModule ("gguf" )
46+
4347try :
4448 from fastsafetensors import SafeTensorsFileLoader , SingleGroup
4549except ImportError :
You can’t perform that action at this time.
0 commit comments