From fed1aa3647b0c6121095521774d56d1815fc14ec Mon Sep 17 00:00:00 2001 From: Chenheli Hua Date: Wed, 29 Oct 2025 10:04:32 -0700 Subject: [PATCH 1/5] Make MediaConnector extensible. Signed-off-by: Chenheli Hua --- vllm/entrypoints/chat_utils.py | 16 +++++++++------- vllm/envs.py | 9 +++++++++ vllm/multimodal/registry.py | 19 ++++++++++++++++++- vllm/multimodal/utils.py | 4 ++++ vllm/multimodal/video.py | 24 +++++------------------- 5 files changed, 45 insertions(+), 27 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 09641aaff306..dcb42fa90cb8 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -43,11 +43,12 @@ # pydantic needs the TypedDict from typing_extensions from typing_extensions import Required, TypedDict +from vllm import envs from vllm.config import ModelConfig from vllm.logger import init_logger from vllm.model_executor.models import SupportsMultiModal from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict -from vllm.multimodal.utils import MediaConnector +from vllm.multimodal.utils import MEDIA_CONNECTOR_REGISTRY, MediaConnector from vllm.transformers_utils.chat_templates import get_chat_template_fallback_path from vllm.transformers_utils.processor import cached_get_processor from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer @@ -806,10 +807,11 @@ def __init__(self, tracker: MultiModalItemTracker) -> None: self._tracker = tracker multimodal_config = self._tracker.model_config.multimodal_config media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None) - self._connector = MediaConnector( - media_io_kwargs=media_io_kwargs, + + self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load( + envs.VLLM_MEDIA_CONNECTOR, + media_io_kwargs=self._tracker._model_config.media_io_kwargs, allowed_local_media_path=tracker.allowed_local_media_path, - allowed_media_domains=tracker.allowed_media_domains, ) @property @@ -891,10 +893,10 @@ def __init__(self, tracker: AsyncMultiModalItemTracker) -> None: self._tracker = tracker multimodal_config = self._tracker.model_config.multimodal_config media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None) - self._connector = MediaConnector( - media_io_kwargs=media_io_kwargs, + self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load( + envs.VLLM_MEDIA_CONNECTOR, + media_io_kwargs=self._tracker._model_config.media_io_kwargs, allowed_local_media_path=tracker.allowed_local_media_path, - allowed_media_domains=tracker.allowed_media_domains, ) @property diff --git a/vllm/envs.py b/vllm/envs.py index 018af0e5bba8..c6ed2d0e00ad 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -70,6 +70,7 @@ VLLM_MEDIA_LOADING_THREAD_COUNT: int = 8 VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25 VLLM_VIDEO_LOADER_BACKEND: str = "opencv" + VLLM_MEDIA_CONNECTOR: str = "http" VLLM_MM_INPUT_CACHE_GIB: int = 4 VLLM_TARGET_DEVICE: str = "cuda" VLLM_MAIN_CUDA_VERSION: str = "12.8" @@ -704,6 +705,14 @@ def get_vllm_port() -> int | None: "VLLM_VIDEO_LOADER_BACKEND": lambda: os.getenv( "VLLM_VIDEO_LOADER_BACKEND", "opencv" ), + # Media connector implementation. + # - "http": Default connector that supports fetching media via HTTP. + # + # Custom implementations can be registered + # via `@MEDIA_CONNECTOR_REGISTRY.register("my_custom_media_connector")` and + # imported at runtime. + # If a non-existing backend is used, an AssertionError will be thrown. + "VLLM_MEDIA_CONNECTOR": lambda: os.getenv("VLLM_MEDIA_CONNECTOR", "http"), # [DEPRECATED] Cache size (in GiB per process) for multimodal input cache # Default is 4 GiB per API process + 4 GiB per engine core process "VLLM_MM_INPUT_CACHE_GIB": lambda: int(os.getenv("VLLM_MM_INPUT_CACHE_GIB", "4")), diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 8f9276e84640..7bf6dd753b7f 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Mapping from dataclasses import dataclass -from typing import TYPE_CHECKING, Generic, Protocol, TypeVar +from typing import TYPE_CHECKING, Any, Generic, Protocol, TypeVar import torch.nn as nn @@ -358,3 +358,20 @@ def get_encdec_max_encoder_len(self, model_config: "ModelConfig") -> int: first_modality = next(iter(max_tokens)) return max_tokens[first_modality] + + +class ExtensionManager: + def __init__(self) -> None: + self.name2class: dict[str, type] = {} + + def register(self, name: str): + def wrap(cls_to_register): + self.name2class[name] = cls_to_register + return cls_to_register + + return wrap + + def load(self, cls_name: str, *args, **kwargs) -> Any: + cls = self.name2class.get(cls_name) + assert cls is not None, f"Extension class {cls_name} not found" + return cls(args, kwargs) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 7f259dad08f9..403f738d24e7 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -19,6 +19,7 @@ import vllm.envs as envs from vllm.connections import HTTPConnection, global_http_connection from vllm.logger import init_logger +from vllm.multimodal.registry import ExtensionManager from vllm.utils.jsontree import json_map_leaves from .audio import AudioMediaIO @@ -46,7 +47,10 @@ _M = TypeVar("_M") +MEDIA_CONNECTOR_REGISTRY = ExtensionManager() + +@MEDIA_CONNECTOR_REGISTRY.register("http") class MediaConnector: def __init__( self, diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 666ef275a924..3bd85d752086 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -14,6 +14,7 @@ from vllm import envs from vllm.logger import init_logger +from vllm.multimodal.registry import ExtensionManager from .base import MediaIO from .image import ImageMediaIO @@ -55,6 +56,9 @@ def sample_frames_from_video(frames: npt.NDArray, num_frames: int) -> npt.NDArra class VideoLoader: + def __init__(self, *args, **kwargs) -> None: + pass + @classmethod @abstractmethod def load_bytes( @@ -63,25 +67,7 @@ def load_bytes( raise NotImplementedError -class VideoLoaderRegistry: - def __init__(self) -> None: - self.name2class: dict[str, type] = {} - - def register(self, name: str): - def wrap(cls_to_register): - self.name2class[name] = cls_to_register - return cls_to_register - - return wrap - - @staticmethod - def load(cls_name: str) -> VideoLoader: - cls = VIDEO_LOADER_REGISTRY.name2class.get(cls_name) - assert cls is not None, f"VideoLoader class {cls_name} not found" - return cls() - - -VIDEO_LOADER_REGISTRY = VideoLoaderRegistry() +VIDEO_LOADER_REGISTRY = ExtensionManager() @VIDEO_LOADER_REGISTRY.register("opencv") From 9852c2d3d03337f7b61b621e9014bad30ac5805f Mon Sep 17 00:00:00 2001 From: Chenheli Hua Date: Wed, 29 Oct 2025 10:13:51 -0700 Subject: [PATCH 2/5] address gemini comments Signed-off-by: Chenheli Hua --- vllm/entrypoints/chat_utils.py | 2 ++ vllm/multimodal/registry.py | 2 +- vllm/multimodal/video.py | 3 --- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index dcb42fa90cb8..c89ab1cc10bc 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -812,6 +812,7 @@ def __init__(self, tracker: MultiModalItemTracker) -> None: envs.VLLM_MEDIA_CONNECTOR, media_io_kwargs=self._tracker._model_config.media_io_kwargs, allowed_local_media_path=tracker.allowed_local_media_path, + allowed_media_domains=tracker.allowed_media_domains, ) @property @@ -897,6 +898,7 @@ def __init__(self, tracker: AsyncMultiModalItemTracker) -> None: envs.VLLM_MEDIA_CONNECTOR, media_io_kwargs=self._tracker._model_config.media_io_kwargs, allowed_local_media_path=tracker.allowed_local_media_path, + allowed_media_domains=tracker.allowed_media_domains, ) @property diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 7bf6dd753b7f..bac7f0b26a4e 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -374,4 +374,4 @@ def wrap(cls_to_register): def load(self, cls_name: str, *args, **kwargs) -> Any: cls = self.name2class.get(cls_name) assert cls is not None, f"Extension class {cls_name} not found" - return cls(args, kwargs) + return cls(*args, **kwargs) diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 3bd85d752086..2975f9f602ed 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -56,9 +56,6 @@ def sample_frames_from_video(frames: npt.NDArray, num_frames: int) -> npt.NDArra class VideoLoader: - def __init__(self, *args, **kwargs) -> None: - pass - @classmethod @abstractmethod def load_bytes( From 923a9647c976e21c68b1629792957f9291d4b3ff Mon Sep 17 00:00:00 2001 From: Chenheli Hua Date: Wed, 29 Oct 2025 10:15:40 -0700 Subject: [PATCH 3/5] address codex comments Signed-off-by: Chenheli Hua --- vllm/entrypoints/chat_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index c89ab1cc10bc..d7d6419d643b 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -810,7 +810,7 @@ def __init__(self, tracker: MultiModalItemTracker) -> None: self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load( envs.VLLM_MEDIA_CONNECTOR, - media_io_kwargs=self._tracker._model_config.media_io_kwargs, + media_io_kwargs=media_io_kwargs, allowed_local_media_path=tracker.allowed_local_media_path, allowed_media_domains=tracker.allowed_media_domains, ) @@ -896,7 +896,7 @@ def __init__(self, tracker: AsyncMultiModalItemTracker) -> None: media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None) self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load( envs.VLLM_MEDIA_CONNECTOR, - media_io_kwargs=self._tracker._model_config.media_io_kwargs, + media_io_kwargs=media_io_kwargs, allowed_local_media_path=tracker.allowed_local_media_path, allowed_media_domains=tracker.allowed_media_domains, ) From aab2bb0d3e32a54b9f684ea7090adae40555b9d9 Mon Sep 17 00:00:00 2001 From: Chenheli Hua Date: Mon, 3 Nov 2025 08:43:58 -0800 Subject: [PATCH 4/5] address comments Signed-off-by: Chenheli Hua --- vllm/multimodal/registry.py | 19 +------------------ vllm/multimodal/utils.py | 2 +- vllm/multimodal/video.py | 2 +- vllm/utils/registry.py | 20 ++++++++++++++++++++ 4 files changed, 23 insertions(+), 20 deletions(-) create mode 100644 vllm/utils/registry.py diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index bac7f0b26a4e..8f9276e84640 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Mapping from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Generic, Protocol, TypeVar +from typing import TYPE_CHECKING, Generic, Protocol, TypeVar import torch.nn as nn @@ -358,20 +358,3 @@ def get_encdec_max_encoder_len(self, model_config: "ModelConfig") -> int: first_modality = next(iter(max_tokens)) return max_tokens[first_modality] - - -class ExtensionManager: - def __init__(self) -> None: - self.name2class: dict[str, type] = {} - - def register(self, name: str): - def wrap(cls_to_register): - self.name2class[name] = cls_to_register - return cls_to_register - - return wrap - - def load(self, cls_name: str, *args, **kwargs) -> Any: - cls = self.name2class.get(cls_name) - assert cls is not None, f"Extension class {cls_name} not found" - return cls(*args, **kwargs) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 403f738d24e7..3fad11a2cb4d 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -19,8 +19,8 @@ import vllm.envs as envs from vllm.connections import HTTPConnection, global_http_connection from vllm.logger import init_logger -from vllm.multimodal.registry import ExtensionManager from vllm.utils.jsontree import json_map_leaves +from vllm.utils.registry import ExtensionManager from .audio import AudioMediaIO from .base import MediaIO diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 2975f9f602ed..369c5e6cb4d1 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -14,7 +14,7 @@ from vllm import envs from vllm.logger import init_logger -from vllm.multimodal.registry import ExtensionManager +from vllm.utils.registry import ExtensionManager from .base import MediaIO from .image import ImageMediaIO diff --git a/vllm/utils/registry.py b/vllm/utils/registry.py new file mode 100644 index 000000000000..466be1eddcfd --- /dev/null +++ b/vllm/utils/registry.py @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any + + +class ExtensionManager: + def __init__(self) -> None: + self.name2class: dict[str, type] = {} + + def register(self, name: str): + def wrap(cls_to_register): + self.name2class[name] = cls_to_register + return cls_to_register + + return wrap + + def load(self, cls_name: str, *args, **kwargs) -> Any: + cls = self.name2class.get(cls_name) + assert cls is not None, f"Extension class {cls_name} not found" + return cls(*args, **kwargs) From 84cf6169d75c8a472f271676962cd834de9304b5 Mon Sep 17 00:00:00 2001 From: Chenheli Hua Date: Mon, 3 Nov 2025 08:55:15 -0800 Subject: [PATCH 5/5] add docstring Signed-off-by: Chenheli Hua --- vllm/utils/registry.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/vllm/utils/registry.py b/vllm/utils/registry.py index 466be1eddcfd..ac9b859159ea 100644 --- a/vllm/utils/registry.py +++ b/vllm/utils/registry.py @@ -4,10 +4,36 @@ class ExtensionManager: + """ + A registry for managing pluggable extension classes. + + This class provides a simple mechanism to register and instantiate + extension classes by name. It is commonly used to implement plugin + systems where different implementations can be swapped at runtime. + + Examples: + Basic usage with a registry instance: + + >>> FOO_REGISTRY = ExtensionManager() + >>> @FOO_REGISTRY.register("my_foo_impl") + ... class MyFooImpl(Foo): + ... def __init__(self, value): + ... self.value = value + >>> foo_impl = FOO_REGISTRY.load("my_foo_impl", value=123) + + """ + def __init__(self) -> None: + """ + Initialize an empty extension registry. + """ self.name2class: dict[str, type] = {} def register(self, name: str): + """ + Decorator to register a class with the given name. + """ + def wrap(cls_to_register): self.name2class[name] = cls_to_register return cls_to_register @@ -15,6 +41,9 @@ def wrap(cls_to_register): return wrap def load(self, cls_name: str, *args, **kwargs) -> Any: + """ + Instantiate and return a registered extension class by name. + """ cls = self.name2class.get(cls_name) assert cls is not None, f"Extension class {cls_name} not found" return cls(*args, **kwargs)