diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py
index 3fc4ed606b8a..f3f042355c9e 100644
--- a/vllm/entrypoints/score_utils.py
+++ b/vllm/entrypoints/score_utils.py
@@ -11,7 +11,6 @@
     ChatCompletionContentPartImageParam, ChatCompletionContentPartTextParam,
     MultiModalItemTracker, _ContentPart, _parse_chat_message_content_part)
 from vllm.inputs import TokensPrompt
-from vllm.model_executor.model_loader import get_model_cls
 from vllm.model_executor.models.interfaces import supports_score_template
 from vllm.multimodal.inputs import MultiModalDataDict
 from vllm.outputs import PoolingRequestOutput
@@ -140,6 +139,8 @@ def apply_score_template(
     prompt_1: str,
     prompt_2: str,
 ) -> str:
+    # NOTE(Simon): lazy import to avoid bringing in all dependencies (e.g. gguf)
+    from vllm.model_executor.model_loader import get_model_cls
     model = get_model_cls(model_config)

     if supports_score_template(model):
@@ -162,6 +163,9 @@ def post_process_tokens(
     Note: This is an in-place operation.
     """
+    # NOTE(Simon): lazy import to avoid bringing in all dependencies (e.g. gguf)
+    from vllm.model_executor.model_loader import get_model_cls
+
     model = get_model_cls(model_config)

     if supports_score_template(model):
         model.post_process_tokens(prompt)
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 857f4bca6824..1058ae140b5b 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -14,7 +14,6 @@
 from typing import Any, Callable, Optional, Union

 import filelock
-import gguf
 import huggingface_hub.constants
 import numpy as np
 import torch
@@ -40,6 +39,11 @@
     SafetensorsStreamer = runai_model_streamer.placeholder_attr(
         "SafetensorsStreamer")

+try:
+    import gguf
+except ImportError:
+    gguf = PlaceholderModule("gguf")
+
 try:
     from fastsafetensors import SafeTensorsFileLoader, SingleGroup
 except ImportError:
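
For context, the weight_utils.py hunk relies on vLLM's PlaceholderModule shim, which the surrounding context lines show is already in scope for the runai_model_streamer fallback: instead of failing at import time, a missing optional package only raises once it is actually used. Below is a minimal, self-contained sketch of that shim pattern; the _Placeholder class is a hypothetical stand-in for illustration, not vLLM's real PlaceholderModule implementation.

# Illustrative sketch of the optional-dependency pattern applied above.
# _Placeholder is a hypothetical stand-in, not vLLM's PlaceholderModule.
class _Placeholder:
    def __init__(self, name: str) -> None:
        self._name = name

    def __getattr__(self, attr: str):
        # Defer the failure: only raise when the missing module is actually
        # used, and name the package the user needs to install.
        raise ImportError(
            f"Optional dependency '{self._name}' is required for "
            f"'{self._name}.{attr}'.")

try:
    import gguf
except ImportError:
    gguf = _Placeholder("gguf")  # attribute access now raises a clear error

Together with the lazy imports in score_utils.py, this keeps importing the entrypoint module from transitively pulling in the model loader and its heavy dependencies such as gguf, as the NOTE comments in the diff state.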