From fed1aa3647b0c6121095521774d56d1815fc14ec Mon Sep 17 00:00:00 2001
From: Chenheli Hua <huachenheli@outlook.com>
Date: Wed, 29 Oct 2025 10:04:32 -0700
Subject: [PATCH 1/5] Make MediaConnector extensible.

Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
---
 vllm/entrypoints/chat_utils.py | 16 +++++++++-------
 vllm/envs.py                   |  9 +++++++++
 vllm/multimodal/registry.py    | 19 ++++++++++++++++++-
 vllm/multimodal/utils.py       |  4 ++++
 vllm/multimodal/video.py       | 24 +++++-------------------
 5 files changed, 45 insertions(+), 27 deletions(-)

diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 09641aaff306..dcb42fa90cb8 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -43,11 +43,12 @@
 # pydantic needs the TypedDict from typing_extensions
 from typing_extensions import Required, TypedDict
 
+from vllm import envs
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
 from vllm.model_executor.models import SupportsMultiModal
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict
-from vllm.multimodal.utils import MediaConnector
+from vllm.multimodal.utils import MEDIA_CONNECTOR_REGISTRY, MediaConnector
 from vllm.transformers_utils.chat_templates import get_chat_template_fallback_path
 from vllm.transformers_utils.processor import cached_get_processor
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
@@ -806,10 +807,11 @@ def __init__(self, tracker: MultiModalItemTracker) -> None:
         self._tracker = tracker
         multimodal_config = self._tracker.model_config.multimodal_config
         media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None)
-        self._connector = MediaConnector(
-            media_io_kwargs=media_io_kwargs,
+
+        self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load(
+            envs.VLLM_MEDIA_CONNECTOR,
+            media_io_kwargs=self._tracker._model_config.media_io_kwargs,
             allowed_local_media_path=tracker.allowed_local_media_path,
-            allowed_media_domains=tracker.allowed_media_domains,
         )
 
     @property
@@ -891,10 +893,10 @@ def __init__(self, tracker: AsyncMultiModalItemTracker) -> None:
         self._tracker = tracker
         multimodal_config = self._tracker.model_config.multimodal_config
         media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None)
-        self._connector = MediaConnector(
-            media_io_kwargs=media_io_kwargs,
+        self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load(
+            envs.VLLM_MEDIA_CONNECTOR,
+            media_io_kwargs=self._tracker._model_config.media_io_kwargs,
             allowed_local_media_path=tracker.allowed_local_media_path,
-            allowed_media_domains=tracker.allowed_media_domains,
         )
 
     @property
diff --git a/vllm/envs.py b/vllm/envs.py
index 018af0e5bba8..c6ed2d0e00ad 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -70,6 +70,7 @@
     VLLM_MEDIA_LOADING_THREAD_COUNT: int = 8
     VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25
     VLLM_VIDEO_LOADER_BACKEND: str = "opencv"
+    VLLM_MEDIA_CONNECTOR: str = "http"
     VLLM_MM_INPUT_CACHE_GIB: int = 4
     VLLM_TARGET_DEVICE: str = "cuda"
     VLLM_MAIN_CUDA_VERSION: str = "12.8"
@@ -704,6 +705,14 @@ def get_vllm_port() -> int | None:
     "VLLM_VIDEO_LOADER_BACKEND": lambda: os.getenv(
         "VLLM_VIDEO_LOADER_BACKEND", "opencv"
     ),
+    # Media connector implementation.
+    # - "http": Default connector that supports fetching media via HTTP.
+    #
+    # Custom implementations can be registered
+    # via `@MEDIA_CONNECTOR_REGISTRY.register("my_custom_media_connector")` and
+    # imported at runtime.
+    # If an unregistered connector name is used, an AssertionError is raised.
+    "VLLM_MEDIA_CONNECTOR": lambda: os.getenv("VLLM_MEDIA_CONNECTOR", "http"),
     # [DEPRECATED] Cache size (in GiB per process) for multimodal input cache
     # Default is 4 GiB per API process + 4 GiB per engine core process
     "VLLM_MM_INPUT_CACHE_GIB": lambda: int(os.getenv("VLLM_MM_INPUT_CACHE_GIB", "4")),
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 8f9276e84640..7bf6dd753b7f 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Mapping
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Generic, Protocol, TypeVar
+from typing import TYPE_CHECKING, Any, Generic, Protocol, TypeVar
 
 import torch.nn as nn
 
@@ -358,3 +358,20 @@ def get_encdec_max_encoder_len(self, model_config: "ModelConfig") -> int:
 
         first_modality = next(iter(max_tokens))
         return max_tokens[first_modality]
+
+
+class ExtensionManager:
+    def __init__(self) -> None:
+        self.name2class: dict[str, type] = {}
+
+    def register(self, name: str):
+        def wrap(cls_to_register):
+            self.name2class[name] = cls_to_register
+            return cls_to_register
+
+        return wrap
+
+    def load(self, cls_name: str, *args, **kwargs) -> Any:
+        cls = self.name2class.get(cls_name)
+        assert cls is not None, f"Extension class {cls_name} not found"
+        return cls(args, kwargs)
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 7f259dad08f9..403f738d24e7 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -19,6 +19,7 @@
 import vllm.envs as envs
 from vllm.connections import HTTPConnection, global_http_connection
 from vllm.logger import init_logger
+from vllm.multimodal.registry import ExtensionManager
 from vllm.utils.jsontree import json_map_leaves
 
 from .audio import AudioMediaIO
@@ -46,7 +47,10 @@
 
 _M = TypeVar("_M")
 
+MEDIA_CONNECTOR_REGISTRY = ExtensionManager()
 
+
+@MEDIA_CONNECTOR_REGISTRY.register("http")
 class MediaConnector:
     def __init__(
         self,
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index 666ef275a924..3bd85d752086 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -14,6 +14,7 @@
 
 from vllm import envs
 from vllm.logger import init_logger
+from vllm.multimodal.registry import ExtensionManager
 
 from .base import MediaIO
 from .image import ImageMediaIO
@@ -55,6 +56,9 @@ def sample_frames_from_video(frames: npt.NDArray, num_frames: int) -> npt.NDArra
 
 
 class VideoLoader:
+    def __init__(self, *args, **kwargs) -> None:
+        pass
+
     @classmethod
     @abstractmethod
     def load_bytes(
@@ -63,25 +67,7 @@ def load_bytes(
         raise NotImplementedError
 
 
-class VideoLoaderRegistry:
-    def __init__(self) -> None:
-        self.name2class: dict[str, type] = {}
-
-    def register(self, name: str):
-        def wrap(cls_to_register):
-            self.name2class[name] = cls_to_register
-            return cls_to_register
-
-        return wrap
-
-    @staticmethod
-    def load(cls_name: str) -> VideoLoader:
-        cls = VIDEO_LOADER_REGISTRY.name2class.get(cls_name)
-        assert cls is not None, f"VideoLoader class {cls_name} not found"
-        return cls()
-
-
-VIDEO_LOADER_REGISTRY = VideoLoaderRegistry()
+VIDEO_LOADER_REGISTRY = ExtensionManager()
 
 
 @VIDEO_LOADER_REGISTRY.register("opencv")

From 9852c2d3d03337f7b61b621e9014bad30ac5805f Mon Sep 17 00:00:00 2001
From: Chenheli Hua <huachenheli@outlook.com>
Date: Wed, 29 Oct 2025 10:13:51 -0700
Subject: [PATCH 2/5] address gemini comments

Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
---
 vllm/entrypoints/chat_utils.py | 2 ++
 vllm/multimodal/registry.py    | 2 +-
 vllm/multimodal/video.py       | 3 ---
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index dcb42fa90cb8..c89ab1cc10bc 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -812,6 +812,7 @@ def __init__(self, tracker: MultiModalItemTracker) -> None:
             envs.VLLM_MEDIA_CONNECTOR,
             media_io_kwargs=self._tracker._model_config.media_io_kwargs,
             allowed_local_media_path=tracker.allowed_local_media_path,
+            allowed_media_domains=tracker.allowed_media_domains,
         )
 
     @property
@@ -897,6 +898,7 @@ def __init__(self, tracker: AsyncMultiModalItemTracker) -> None:
             envs.VLLM_MEDIA_CONNECTOR,
             media_io_kwargs=self._tracker._model_config.media_io_kwargs,
             allowed_local_media_path=tracker.allowed_local_media_path,
+            allowed_media_domains=tracker.allowed_media_domains,
         )
 
     @property
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 7bf6dd753b7f..bac7f0b26a4e 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -374,4 +374,4 @@ def wrap(cls_to_register):
     def load(self, cls_name: str, *args, **kwargs) -> Any:
         cls = self.name2class.get(cls_name)
         assert cls is not None, f"Extension class {cls_name} not found"
-        return cls(args, kwargs)
+        return cls(*args, **kwargs)
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index 3bd85d752086..2975f9f602ed 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -56,9 +56,6 @@ def sample_frames_from_video(frames: npt.NDArray, num_frames: int) -> npt.NDArra
 
 
 class VideoLoader:
-    def __init__(self, *args, **kwargs) -> None:
-        pass
-
     @classmethod
     @abstractmethod
     def load_bytes(

From 923a9647c976e21c68b1629792957f9291d4b3ff Mon Sep 17 00:00:00 2001
From: Chenheli Hua <huachenheli@outlook.com>
Date: Wed, 29 Oct 2025 10:15:40 -0700
Subject: [PATCH 3/5] address codex comments

Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
---
 vllm/entrypoints/chat_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index c89ab1cc10bc..d7d6419d643b 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -810,7 +810,7 @@ def __init__(self, tracker: MultiModalItemTracker) -> None:
 
         self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load(
             envs.VLLM_MEDIA_CONNECTOR,
-            media_io_kwargs=self._tracker._model_config.media_io_kwargs,
+            media_io_kwargs=media_io_kwargs,
             allowed_local_media_path=tracker.allowed_local_media_path,
             allowed_media_domains=tracker.allowed_media_domains,
         )
@@ -896,7 +896,7 @@ def __init__(self, tracker: AsyncMultiModalItemTracker) -> None:
         media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None)
         self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load(
             envs.VLLM_MEDIA_CONNECTOR,
-            media_io_kwargs=self._tracker._model_config.media_io_kwargs,
+            media_io_kwargs=media_io_kwargs,
             allowed_local_media_path=tracker.allowed_local_media_path,
             allowed_media_domains=tracker.allowed_media_domains,
         )

From aab2bb0d3e32a54b9f684ea7090adae40555b9d9 Mon Sep 17 00:00:00 2001
From: Chenheli Hua <huachenheli@outlook.com>
Date: Mon, 3 Nov 2025 08:43:58 -0800
Subject: [PATCH 4/5] address comments

Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
---
 vllm/multimodal/registry.py | 19 +------------------
 vllm/multimodal/utils.py    |  2 +-
 vllm/multimodal/video.py    |  2 +-
 vllm/utils/registry.py      | 20 ++++++++++++++++++++
 4 files changed, 23 insertions(+), 20 deletions(-)
 create mode 100644 vllm/utils/registry.py

diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index bac7f0b26a4e..8f9276e84640 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Mapping
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Generic, Protocol, TypeVar
+from typing import TYPE_CHECKING, Generic, Protocol, TypeVar
 
 import torch.nn as nn
 
@@ -358,20 +358,3 @@ def get_encdec_max_encoder_len(self, model_config: "ModelConfig") -> int:
 
         first_modality = next(iter(max_tokens))
         return max_tokens[first_modality]
-
-
-class ExtensionManager:
-    def __init__(self) -> None:
-        self.name2class: dict[str, type] = {}
-
-    def register(self, name: str):
-        def wrap(cls_to_register):
-            self.name2class[name] = cls_to_register
-            return cls_to_register
-
-        return wrap
-
-    def load(self, cls_name: str, *args, **kwargs) -> Any:
-        cls = self.name2class.get(cls_name)
-        assert cls is not None, f"Extension class {cls_name} not found"
-        return cls(*args, **kwargs)
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 403f738d24e7..3fad11a2cb4d 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -19,8 +19,8 @@
 import vllm.envs as envs
 from vllm.connections import HTTPConnection, global_http_connection
 from vllm.logger import init_logger
-from vllm.multimodal.registry import ExtensionManager
 from vllm.utils.jsontree import json_map_leaves
+from vllm.utils.registry import ExtensionManager
 
 from .audio import AudioMediaIO
 from .base import MediaIO
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index 2975f9f602ed..369c5e6cb4d1 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -14,7 +14,7 @@
 
 from vllm import envs
 from vllm.logger import init_logger
-from vllm.multimodal.registry import ExtensionManager
+from vllm.utils.registry import ExtensionManager
 
 from .base import MediaIO
 from .image import ImageMediaIO
diff --git a/vllm/utils/registry.py b/vllm/utils/registry.py
new file mode 100644
index 000000000000..466be1eddcfd
--- /dev/null
+++ b/vllm/utils/registry.py
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any
+
+
+class ExtensionManager:
+    def __init__(self) -> None:
+        self.name2class: dict[str, type] = {}
+
+    def register(self, name: str):
+        def wrap(cls_to_register):
+            self.name2class[name] = cls_to_register
+            return cls_to_register
+
+        return wrap
+
+    def load(self, cls_name: str, *args, **kwargs) -> Any:
+        cls = self.name2class.get(cls_name)
+        assert cls is not None, f"Extension class {cls_name} not found"
+        return cls(*args, **kwargs)

From 84cf6169d75c8a472f271676962cd834de9304b5 Mon Sep 17 00:00:00 2001
From: Chenheli Hua <huachenheli@outlook.com>
Date: Mon, 3 Nov 2025 08:55:15 -0800
Subject: [PATCH 5/5] add docstring

Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
---
 vllm/utils/registry.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/vllm/utils/registry.py b/vllm/utils/registry.py
index 466be1eddcfd..ac9b859159ea 100644
--- a/vllm/utils/registry.py
+++ b/vllm/utils/registry.py
@@ -4,10 +4,36 @@
 
 
 class ExtensionManager:
+    """
+    A registry for managing pluggable extension classes.
+
+    This class provides a simple mechanism to register and instantiate
+    extension classes by name. It is commonly used to implement plugin
+    systems where different implementations can be swapped at runtime.
+
+    Examples:
+        Basic usage with a registry instance:
+
+        >>> FOO_REGISTRY = ExtensionManager()
+        >>> @FOO_REGISTRY.register("my_foo_impl")
+        ... class MyFooImpl(Foo):
+        ...     def __init__(self, value):
+        ...         self.value = value
+        >>> foo_impl = FOO_REGISTRY.load("my_foo_impl", value=123)
+
+    """
+
     def __init__(self) -> None:
+        """
+        Initialize an empty extension registry.
+        """
         self.name2class: dict[str, type] = {}
 
     def register(self, name: str):
+        """
+        Decorator to register a class with the given name.
+        """
+
         def wrap(cls_to_register):
             self.name2class[name] = cls_to_register
             return cls_to_register
@@ -15,6 +41,9 @@ def wrap(cls_to_register):
         return wrap
 
     def load(self, cls_name: str, *args, **kwargs) -> Any:
+        """
+        Instantiate the class registered under `cls_name` and return the instance.
+        """
         cls = self.name2class.get(cls_name)
         assert cls is not None, f"Extension class {cls_name} not found"
         return cls(*args, **kwargs)