Merged
Changes from 4 commits
35 commits
1ded8ae
Refactor: make sure the API calls are backward compatible
KuntaiDu Sep 25, 2025
42040ba
align function signature
KuntaiDu Sep 25, 2025
fbaa51a
fix mypy errors
KuntaiDu Sep 26, 2025
fae4c82
adjust the signature of block_ids
KuntaiDu Sep 26, 2025
f858a9d
merge and resolve merge conflict
KuntaiDu Oct 12, 2025
0aa2b01
allow hybrid kv cache manager + connector
KuntaiDu Oct 12, 2025
89a976c
init using ConnectorVllmConfig
KuntaiDu Oct 12, 2025
9cdd2b0
put the change of function signature inside KVConnectorHMAMixin class
KuntaiDu Oct 13, 2025
e0ac23c
remove unnecessary change of function signature
KuntaiDu Oct 13, 2025
312e065
fix merge conflict
KuntaiDu Oct 13, 2025
b29a257
copy kv cache config instead of just sending the pointer
KuntaiDu Oct 13, 2025
866c404
align the way of checking if the connector supports HMA
KuntaiDu Oct 16, 2025
367b7b7
change class name to SupportsHMA
KuntaiDu Oct 17, 2025
e35f118
avoid using ConnectorVllmConfig, use copy instead
KuntaiDu Oct 17, 2025
7e963b9
use deepcopy instead
KuntaiDu Oct 17, 2025
650d666
adjust the comments
KuntaiDu Oct 17, 2025
37a589d
adjust comments
KuntaiDu Oct 17, 2025
6abc1c2
adjust comments
KuntaiDu Oct 17, 2025
8fc7bca
Merge branch 'main' into kuntai-enable-hma-connector
KuntaiDu Oct 20, 2025
27774f3
change deepcopy to shallowcopy --- shallow copy should be enough
KuntaiDu Oct 20, 2025
1d7f75f
Merge branch 'kuntai-enable-hma-connector' of https://github.com/Kunt…
KuntaiDu Oct 20, 2025
ababeec
fix CPU offloading test
KuntaiDu Oct 20, 2025
1974b5f
fix CI errors
KuntaiDu Oct 20, 2025
9198d3e
fix NIXL-connector-related CI errors
KuntaiDu Oct 23, 2025
eee8c11
Merge branch 'main' into kuntai-enable-hma-connector
KuntaiDu Oct 23, 2025
c6e0bc4
fix CI errors
KuntaiDu Oct 23, 2025
919fe9b
remove hma support from LMCache for now
KuntaiDu Oct 23, 2025
0b67b76
add an extra sanity check for request_finished
KuntaiDu Oct 23, 2025
4bfdcf8
Merge branch 'main' into kuntai-enable-hma-connector
KuntaiDu Oct 23, 2025
36e42a1
fix bug
KuntaiDu Oct 24, 2025
6f6347c
Merge branch 'kuntai-enable-hma-connector' of https://github.com/Kunt…
KuntaiDu Oct 24, 2025
0df4f02
fix CI bug
KuntaiDu Oct 24, 2025
5d88c0d
fix CI issues
KuntaiDu Oct 24, 2025
2fac4fb
fix CI errors
KuntaiDu Oct 24, 2025
4c724a6
Merge branch 'main' into kuntai-enable-hma-connector
KuntaiDu Oct 24, 2025
3 changes: 0 additions & 3 deletions vllm/config/__init__.py
@@ -520,9 +520,6 @@ def __post_init__(self):
if not current_platform.support_hybrid_kv_cache():
# Hybrid KV cache manager is not supported on non-GPU platforms.
self.scheduler_config.disable_hybrid_kv_cache_manager = True
if self.kv_transfer_config is not None:
# Hybrid KV cache manager is not compatible with KV transfer.
self.scheduler_config.disable_hybrid_kv_cache_manager = True
if self.kv_events_config is not None:
# Hybrid KV cache manager is not compatible with KV events.
self.scheduler_config.disable_hybrid_kv_cache_manager = True
4 changes: 2 additions & 2 deletions vllm/distributed/kv_transfer/kv_connector/v1/__init__.py
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.distributed.kv_transfer.kv_connector.v1.base import (
KVConnectorBase_V1, KVConnectorRole)
KVConnectorBase_V1, KVConnectorRole, supports_hma)

__all__ = ["KVConnectorRole", "KVConnectorBase_V1"]
__all__ = ["KVConnectorRole", "KVConnectorBase_V1", "supports_hma"]
17 changes: 15 additions & 2 deletions vllm/distributed/kv_transfer/kv_connector/v1/base.py
@@ -37,7 +37,7 @@
import enum
from abc import ABC, abstractmethod
from collections.abc import Iterable
from typing import TYPE_CHECKING, Any, Callable, Literal, Optional
from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union

import torch

@@ -64,6 +64,19 @@
logger = init_logger(__name__)


class SupportsHMA:
"""
Inherit from this interface if the connector supports the hybrid memory
allocator (HMA). Inheriting it is required to use the connector together
with the hybrid memory allocator.
"""
pass


def supports_hma(cls: type) -> bool:
return isinstance(cls, SupportsHMA)


class KVConnectorRole(enum.Enum):
# Connector running in the scheduler process
SCHEDULER = 0
@@ -323,7 +336,7 @@ def update_connector_output(self, connector_output: KVConnectorOutput):
def request_finished(
self,
request: "Request",
block_ids: list[int],
block_ids: Union[list[int], tuple[list[int], ...]],
) -> tuple[bool, Optional[dict[str, Any]]]:
"""
Called when a request has finished, before its blocks are freed.
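Taken together, the new marker interface and helper can be exercised as in the following minimal, self-contained sketch (the stub connector classes are illustrative and not vLLM code). Note that the diff annotates the helper's parameter as `cls: type`, while the scheduler passes a connector instance, which is what the `isinstance` check expects.

# Minimal sketch (not vLLM code): mirrors the SupportsHMA marker interface
# and the supports_hma() helper added in this diff, using stub connectors.

class SupportsHMA:
    """Inherit from this to declare hybrid memory allocator (HMA) support."""


def supports_hma(connector: object) -> bool:
    # The scheduler calls this with a connector *instance*.
    return isinstance(connector, SupportsHMA)


class LegacyConnector:
    """A connector that has not opted into HMA."""


class HMAConnector(SupportsHMA):
    """A connector that declares HMA support by inheriting the marker."""


assert not supports_hma(LegacyConnector())
assert supports_hma(HMAConnector())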
@@ -1,13 +1,13 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import TYPE_CHECKING, Any, Optional
from typing import TYPE_CHECKING, Any, Optional, Union

import torch
from lmcache.integration.vllm.vllm_v1_adapter import LMCacheConnectorV1Impl

from vllm.config import VllmConfig
from vllm.distributed.kv_transfer.kv_connector.v1.base import (
KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole, SupportsHMA)
from vllm.logger import init_logger
from vllm.v1.core.sched.output import SchedulerOutput

@@ -20,7 +20,7 @@
logger = init_logger(__name__)


class LMCacheConnectorV1(KVConnectorBase_V1):
class LMCacheConnectorV1(KVConnectorBase_V1, SupportsHMA):

def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
super().__init__(vllm_config=vllm_config, role=role)
@@ -153,7 +153,7 @@ def build_connector_meta(
def request_finished(
self,
request: "Request",
block_ids: list[int],
block_ids: Union[list[int], tuple[list[int], ...]],
) -> tuple[bool, Optional[dict[str, Any]]]:
"""
Called when a request has finished, before its blocks are freed.
@@ -3,7 +3,7 @@
import copy
from collections.abc import Iterable
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Optional
from typing import TYPE_CHECKING, Any, Optional, Union

import torch

@@ -245,7 +245,7 @@ def update_connector_output(self, connector_output: KVConnectorOutput):
def request_finished(
self,
request: "Request",
blocks: list[int],
blocks: Union[tuple[list[int], ...], list[int]],
) -> tuple[bool, Optional[dict[str, Any]]]:
async_saves = 0
kv_txfer_params = None
@@ -194,7 +194,7 @@ def build_connector_meta(
def request_finished(
self,
request: "Request",
block_ids: list[int],
block_ids: Union[tuple[list[int], ...], list[int]],
) -> tuple[bool, Optional[dict[str, Any]]]:
assert self.connector_scheduler is not None
return self.connector_scheduler.request_finished(request, block_ids)
@@ -397,7 +397,7 @@ def build_connector_meta(
def request_finished(
self,
request: "Request",
block_ids: list[int],
block_ids: Union[tuple[list[int], ...], list[int]],
) -> tuple[bool, Optional[dict[str, Any]]]:
"""
Once a request is finished, determine whether request blocks
@@ -4,7 +4,7 @@
from collections.abc import Iterable, Iterator
from dataclasses import dataclass
from itertools import islice
from typing import Any, Optional
from typing import Any, Optional, Union

import torch

@@ -108,7 +108,7 @@ def update_connector_output(self, connector_output: KVConnectorOutput):
def request_finished(
self,
request: "Request",
block_ids: list[int],
block_ids: Union[list[int], tuple[list[int], ...]],
) -> tuple[bool, Optional[dict[str, Any]]]:
assert self.connector_scheduler is not None
return self.connector_scheduler.request_finished(request, block_ids)
@@ -344,7 +344,7 @@ def update_connector_output(self, connector_output: KVConnectorOutput):
def request_finished(
self,
request: Request,
block_ids: list[int],
block_ids: Union[tuple[list[int], ...], list[int]],
) -> tuple[bool, Optional[dict[str, Any]]]:
"""
Called when a request has finished, before its blocks are freed.
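Several connectors above now accept either the legacy single-group list or the per-group tuple in request_finished. A hypothetical normalization helper (not part of this PR) shows how an implementation could treat both shapes uniformly:

# Hypothetical helper, not in the PR: normalize the two block_ids shapes
# allowed by the Union[list[int], tuple[list[int], ...]] signature.
from typing import Union


def normalize_block_ids(
    block_ids: Union[list[int], tuple[list[int], ...]],
) -> tuple[list[int], ...]:
    # Treat a bare list as a single KV cache group.
    if isinstance(block_ids, tuple):
        return block_ids
    return (block_ids,)


assert normalize_block_ids([1, 2, 3]) == ([1, 2, 3],)
assert normalize_block_ids(([1, 2], [3, 4])) == ([1, 2], [3, 4])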
33 changes: 26 additions & 7 deletions vllm/v1/core/sched/scheduler.py
@@ -7,14 +7,16 @@
import time
from collections import defaultdict
from collections.abc import Iterable
from copy import deepcopy
from typing import Any, Optional, Union

from vllm.config import VllmConfig
from vllm.distributed.kv_events import EventPublisherFactory, KVEventBatch
from vllm.distributed.kv_transfer.kv_connector.factory import (
KVConnectorFactory)
from vllm.distributed.kv_transfer.kv_connector.v1 import (KVConnectorBase_V1,
KVConnectorRole)
KVConnectorRole,
supports_hma)
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
KVConnectorStats)
from vllm.logger import init_logger
@@ -83,14 +85,22 @@ def __init__(
# KV Connector pushes/pull of remote KVs for P/D and offloading.
self.connector = None
if self.vllm_config.kv_transfer_config is not None:
assert len(self.kv_cache_config.kv_cache_groups) == 1, (
"Multiple KV cache groups are not currently supported "
"with KV connectors")
assert not self.is_encoder_decoder, (
"Encoder-decoder models are not currently supported "
"with KV connectors")

connector_vllm_config = deepcopy(self.vllm_config)
connector_vllm_config.kv_cache_config = kv_cache_config
self.connector = KVConnectorFactory.create_connector(
config=self.vllm_config, role=KVConnectorRole.SCHEDULER)
config=connector_vllm_config, role=KVConnectorRole.SCHEDULER)

# Make sure that the connector supports HMA if HMA is enabled.
num_kv_cache_groups = len(self.kv_cache_config.kv_cache_groups)
if not supports_hma(self.connector) and num_kv_cache_groups > 1:
raise NotImplementedError(
f"Connector {self.connector.__class__.__name__} does not"
f" support HMA but HMA is enabled. Please set "
f"`--disable-hybrid-kv-cache-manager`.")

self.kv_event_publisher = EventPublisherFactory.create(
self.kv_events_config,
@@ -1231,8 +1241,17 @@ def _connector_finished(
if self.connector is None:
return False, None

(block_ids, ) = self.kv_cache_manager.get_block_ids(request.request_id)
return self.connector.request_finished(request, block_ids)
num_kv_cache_groups = len(self.kv_cache_config.kv_cache_groups)

block_ids = self.kv_cache_manager.get_block_ids(request.request_id)

if not supports_hma(self.connector) or num_kv_cache_groups == 1:
# NOTE(Kuntai): this code path is a hack.
# We should remove this code path after all connectors
# support hybrid memory allocator.
return self.connector.request_finished(request, block_ids[0])
else:
return self.connector.request_finished(request, block_ids)
Member

I'd prefer to see this logic in the connectors module ...

def request_finished(
    connector: KVConnectorBase_V1,
    request: "Request",
    block_ids: tuple[list[int], ...],
) -> tuple[bool, dict[str, Any] | None]:
    if isinstance(connector, SupportsHMA):
        return connector.request_finished_all_groups(request, block_ids)
    else:  # for backwards compatibility
        return connector.request_finished(request, block_ids[0])

Collaborator Author

This function _connector_finished is already a small wrapper function that contains < 10 LoC besides comments. Building one more wrapper on top of it may feel a bit over-abstracted.


def _update_waiting_for_remote_kv(self, request: Request) -> bool:
"""
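For reference, here is the reviewer's suggested dispatch helper expanded into a self-contained sketch with stub types. The request_finished_all_groups() method comes from that suggestion and is hypothetical; the merged change instead branches inside Scheduler._connector_finished.

# Self-contained sketch of the reviewer's suggestion; stub classes and
# request_finished_all_groups() are hypothetical, not part of the PR.
from typing import Any, Optional, Union


class SupportsHMA:
    """Marker: the connector understands per-group block ids."""


class LegacyConnector:
    def request_finished(
        self, request: object, block_ids: list[int]
    ) -> tuple[bool, Optional[dict[str, Any]]]:
        return False, {"first_group_blocks": block_ids}


class HMAConnector(SupportsHMA):
    def request_finished_all_groups(
        self, request: object, block_ids: tuple[list[int], ...]
    ) -> tuple[bool, Optional[dict[str, Any]]]:
        return False, {"num_groups": len(block_ids)}


def request_finished(
    connector: Union[LegacyConnector, HMAConnector],
    request: object,
    block_ids: tuple[list[int], ...],
) -> tuple[bool, Optional[dict[str, Any]]]:
    if isinstance(connector, SupportsHMA):
        return connector.request_finished_all_groups(request, block_ids)
    # Backwards compatibility: legacy connectors only see the first group.
    return connector.request_finished(request, block_ids[0])


print(request_finished(LegacyConnector(), None, ([1, 2], [3, 4])))
print(request_finished(HMAConnector(), None, ([1, 2], [3, 4])))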
12 changes: 10 additions & 2 deletions vllm/v1/worker/gpu_worker.py
@@ -5,6 +5,7 @@
import gc
import os
from contextlib import AbstractContextManager, nullcontext
from copy import deepcopy
from typing import TYPE_CHECKING, Any, Optional, Union

import torch
@@ -315,6 +316,15 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None:
"""Allocate GPU KV cache with the specified kv_cache_config."""

# Init kv cache connector here, because it requires
# `kv_cache_config`.
# NOTE(Kuntai): This needs to be done before `initialize_kv_cache`,
# because `initialize_kv_cache` will inject kv cache groups not
# related to the kv cache connector (e.g. kv cache sharing layers).
connector_vllm_config = deepcopy(self.vllm_config)
connector_vllm_config.kv_cache_config = kv_cache_config
ensure_kv_transfer_initialized(connector_vllm_config)

if self.vllm_config.model_config.enable_sleep_mode:
from vllm.device_allocator.cumem import CuMemAllocator

@@ -714,5 +724,3 @@ def init_worker_distributed_environment(
parallel_config.tensor_parallel_size,
parallel_config.pipeline_parallel_size,
parallel_config.decode_context_parallel_size)

ensure_kv_transfer_initialized(vllm_config)
Member

Is it possible that other connectors (out of tree perhaps) might break by initializing earlier?

Collaborator Author

It should be fine, because both locations are still before real model execution and CUDA graph capturing. So in terms of the ability to add extra GPU operations before/after attention and before/after the forward pass, the two locations are the same.
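A minimal sketch of the resulting worker-side ordering, using simplified stand-in types rather than the real vLLM worker API: the connector is handed a copy of the config carrying the scheduler's kv_cache_config before the KV cache is allocated (and before extra groups such as KV-sharing layers are injected).

# Simplified stand-ins, not the real vLLM classes or functions.
from copy import deepcopy
from dataclasses import dataclass, field


@dataclass
class KVCacheConfigStub:
    kv_cache_groups: list[str] = field(default_factory=list)


@dataclass
class VllmConfigStub:
    kv_cache_config: KVCacheConfigStub = field(default_factory=KVCacheConfigStub)


def ensure_kv_transfer_initialized_stub(config: VllmConfigStub) -> None:
    # Stand-in for the real connector init; it reads the config here.
    print("connector sees groups:", config.kv_cache_config.kv_cache_groups)


def initialize_from_config(vllm_config: VllmConfigStub,
                           kv_cache_config: KVCacheConfigStub) -> None:
    # Give the connector its own copy of the config, with the scheduler's
    # kv_cache_config attached, before the KV cache is allocated.
    connector_vllm_config = deepcopy(vllm_config)
    connector_vllm_config.kv_cache_config = kv_cache_config
    ensure_kv_transfer_initialized_stub(connector_vllm_config)
    # The real initialize_kv_cache runs after this point and may add
    # groups that the connector should not be configured with.


initialize_from_config(VllmConfigStub(),
                       KVCacheConfigStub(["full_attention"]))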

Loading