From 1ded8ae1ab57151d778518df098a179d7907773f Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Thu, 25 Sep 2025 14:16:13 -0700 Subject: [PATCH 01/27] Refactor: make sure the API calls are backward compatible Signed-off-by: KuntaiDu --- vllm/config/__init__.py | 3 -- .../kv_transfer/kv_connector/v1/__init__.py | 4 +-- .../kv_transfer/kv_connector/v1/base.py | 16 +++++++++- .../kv_connector/v1/lmcache_connector.py | 6 ++-- vllm/v1/core/sched/scheduler.py | 30 +++++++++++++++---- vllm/v1/worker/gpu_worker.py | 13 ++++++-- 6 files changed, 55 insertions(+), 17 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index df6564077e8a..9973f32b202c 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -520,9 +520,6 @@ def __post_init__(self): if not current_platform.support_hybrid_kv_cache(): # Hybrid KV cache manager is not supported on non-GPU platforms. self.scheduler_config.disable_hybrid_kv_cache_manager = True - if self.kv_transfer_config is not None: - # Hybrid KV cache manager is not compatible with KV transfer. - self.scheduler_config.disable_hybrid_kv_cache_manager = True if self.kv_events_config is not None: # Hybrid KV cache manager is not compatible with KV events. self.scheduler_config.disable_hybrid_kv_cache_manager = True diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py b/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py index f00f31dde915..318d4e64ad79 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.distributed.kv_transfer.kv_connector.v1.base import ( - KVConnectorBase_V1, KVConnectorRole) + KVConnectorBase_V1, KVConnectorRole, supports_hma) -__all__ = ["KVConnectorRole", "KVConnectorBase_V1"] +__all__ = ["KVConnectorRole", "KVConnectorBase_V1", "supports_hma"] diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 184d0a62f2c3..6547a52ff2b0 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -64,6 +64,20 @@ logger = init_logger(__name__) +class SupportsHMA: + """ + Inherent this interface if the connector supports HMA. + If inherented from this interface, vLLM will: + - send `KVCacheConfig` to the connector in `register_kv_caches`. + - send `tuple[list[int], ...]` to the connector in `request_finished`. + """ + pass + + +def supports_hma(cls: type) -> bool: + return isinstance(cls, SupportsHMA) + + class KVConnectorRole(enum.Enum): # Connector running in the scheduler process SCHEDULER = 0 @@ -323,7 +337,7 @@ def update_connector_output(self, connector_output: KVConnectorOutput): def request_finished( self, request: "Request", - block_ids: list[int], + block_ids: list[int] | tuple[list[int], ...], ) -> tuple[bool, Optional[dict[str, Any]]]: """ Called when a request has finished, before its blocks are freed. 
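To make the new opt-in contract concrete, here is a minimal, self-contained sketch of the marker-class pattern this first patch introduces. The classes below are toy stand-ins for `SupportsHMA` and `KVConnectorBase_V1` from `vllm/distributed/kv_transfer/kv_connector/v1/base.py`, and `notify_finished` only mirrors the scheduler-side dispatch in `_connector_finished`; it is illustrative, not the real implementation. (Note that `supports_hma` as first written above is handed a class, and patch 10 in this series reworks it to handle both classes and instances.)

```python
# Toy stand-ins for SupportsHMA / KVConnectorBase_V1; illustrative only.

class SupportsHMA:
    """Marker: the connector accepts per-group block IDs (hybrid allocator)."""


class LegacyConnector:
    def request_finished(self, block_ids: list[int]) -> None:
        # Pre-HMA contract: a single flat list of block IDs.
        print("flat block list:", block_ids)


class HMAConnector(SupportsHMA):
    def request_finished(self, block_ids: tuple[list[int], ...]) -> None:
        # HMA contract: one list of block IDs per KV cache group.
        for group, ids in enumerate(block_ids):
            print(f"group {group}: {ids}")


def notify_finished(connector, per_group: tuple[list[int], ...]) -> None:
    # Mirrors the scheduler-side dispatch: HMA-aware connectors receive the
    # per-group tuple, old connectors keep receiving a single flat list.
    if isinstance(connector, SupportsHMA):
        connector.request_finished(per_group)
    else:
        connector.request_finished(per_group[0])


notify_finished(LegacyConnector(), ([1, 2, 3],))
notify_finished(HMAConnector(), ([1, 2, 3], [4, 5]))
```

Opting in changes the `request_finished` contract from a flat `list[int]` to a `tuple[list[int], ...]` holding one list of block IDs per KV cache group, while untouched connectors keep the old signature.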
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py index 2b0abe983fbb..f5091980fbb4 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py @@ -7,7 +7,7 @@ from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( - KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) + KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole, SupportsHMA) from vllm.logger import init_logger from vllm.v1.core.sched.output import SchedulerOutput @@ -20,7 +20,7 @@ logger = init_logger(__name__) -class LMCacheConnectorV1(KVConnectorBase_V1): +class LMCacheConnectorV1(KVConnectorBase_V1, SupportsHMA): def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): super().__init__(vllm_config=vllm_config, role=role) @@ -153,7 +153,7 @@ def build_connector_meta( def request_finished( self, request: "Request", - block_ids: list[int], + block_ids: list[int] | tuple[list[int], ...], ) -> tuple[bool, Optional[dict[str, Any]]]: """ Called when a request has finished, before its blocks are freed. diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 7fc4776b0261..2839e2bf4974 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -7,6 +7,7 @@ import time from collections import defaultdict from collections.abc import Iterable +from copy import deepcopy from typing import Any, Optional, Union from vllm.config import VllmConfig @@ -14,7 +15,8 @@ from vllm.distributed.kv_transfer.kv_connector.factory import ( KVConnectorFactory) from vllm.distributed.kv_transfer.kv_connector.v1 import (KVConnectorBase_V1, - KVConnectorRole) + KVConnectorRole, + supports_hma) from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( KVConnectorStats) from vllm.logger import init_logger @@ -83,15 +85,26 @@ def __init__( # KV Connector pushes/pull of remote KVs for P/D and offloading. self.connector = None if self.vllm_config.kv_transfer_config is not None: - assert len(self.kv_cache_config.kv_cache_groups) == 1, ( - "Multiple KV cache groups are not currently supported " - "with KV connectors") assert not self.is_encoder_decoder, ( "Encoder-decoder models are not currently supported " "with KV connectors") + num_kv_cache_groups = len(self.kv_cache_config.kv_cache_groups) + + connector_vllm_config = deepcopy(self.vllm_config) + if num_kv_cache_groups > 1: + # NOTE(Kuntai): hybrid allocator is enabled. + # We inject `kv_cache_config` into vllm_config. + connector_vllm_config.kv_cache_config = kv_cache_config self.connector = KVConnectorFactory.create_connector( config=self.vllm_config, role=KVConnectorRole.SCHEDULER) + # Make sure that the connector supports HMA if HMA is enabled. + if not supports_hma(self.connector) and num_kv_cache_groups > 1: + raise NotImplementedError( + f"Connector {self.connector.__class__.__name__} does not" + f" support HMA but HMA is enabled. 
Please set " + f"`--disable-hybrid-kv-cache-manager`.") + self.kv_event_publisher = EventPublisherFactory.create( self.kv_events_config, self.parallel_config.data_parallel_rank, @@ -1231,8 +1244,13 @@ def _connector_finished( if self.connector is None: return False, None - (block_ids, ) = self.kv_cache_manager.get_block_ids(request.request_id) - return self.connector.request_finished(request, block_ids) + if not supports_hma(self.connector): + (block_ids, ) = self.kv_cache_manager.get_block_ids( + request.request_id) + return self.connector.request_finished(request, block_ids) + else: + block_ids = self.kv_cache_manager.get_block_ids(request.request_id) + return self.connector.request_finished(request, block_ids) def _update_waiting_for_remote_kv(self, request: Request) -> bool: """ diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 9082bbfd8f8e..075e62ad7194 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -5,6 +5,7 @@ import gc import os from contextlib import AbstractContextManager, nullcontext +from copy import deepcopy from typing import TYPE_CHECKING, Any, Optional, Union import torch @@ -315,6 +316,16 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None: """Allocate GPU KV cache with the specified kv_cache_config.""" + # Init kv cache connector here, because it requires + # `kv_cache_config`. + # This need to be done before `initialize_kv_cache` because + # `initialize_kv_cache` will inject kv cache groups not + # related to kv cache connector (e.g. kv cache sharing layers) + connector_vllm_config = deepcopy(self.vllm_config) + if len(kv_cache_config.kv_cache_groups) > 1: + connector_vllm_config.kv_cache_config = kv_cache_config + ensure_kv_transfer_initialized(connector_vllm_config) + if self.vllm_config.model_config.enable_sleep_mode: from vllm.device_allocator.cumem import CuMemAllocator @@ -714,5 +725,3 @@ def init_worker_distributed_environment( parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size, parallel_config.decode_context_parallel_size) - - ensure_kv_transfer_initialized(vllm_config) From 42040ba5754694bd9e71263abaa4108a828b2be2 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Thu, 25 Sep 2025 14:27:09 -0700 Subject: [PATCH 02/27] align function signature Signed-off-by: KuntaiDu --- vllm/distributed/kv_transfer/kv_connector/v1/base.py | 7 +++---- .../kv_transfer/kv_connector/v1/multi_connector.py | 2 +- .../kv_transfer/kv_connector/v1/nixl_connector.py | 4 ++-- .../kv_transfer/kv_connector/v1/offloading_connector.py | 2 +- vllm/v1/core/sched/scheduler.py | 9 +++------ vllm/v1/worker/gpu_worker.py | 9 ++++----- 6 files changed, 14 insertions(+), 19 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 6547a52ff2b0..1f366fc92ce4 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -66,10 +66,9 @@ class SupportsHMA: """ - Inherent this interface if the connector supports HMA. - If inherented from this interface, vLLM will: - - send `KVCacheConfig` to the connector in `register_kv_caches`. - - send `tuple[list[int], ...]` to the connector in `request_finished`. + Inherent this interface if the connector supports hybrid memory + allocator (HMA). This is required to use the connector together + with hybrid memory allocator. 
""" pass diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index 6836a71e58d6..9c712e2e52ef 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -245,7 +245,7 @@ def update_connector_output(self, connector_output: KVConnectorOutput): def request_finished( self, request: "Request", - blocks: list[int], + blocks: tuple[list[int], ...] | list[int], ) -> tuple[bool, Optional[dict[str, Any]]]: async_saves = 0 kv_txfer_params = None diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 528d4022bd17..7e74026554b6 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -194,7 +194,7 @@ def build_connector_meta( def request_finished( self, request: "Request", - block_ids: list[int], + block_ids: tuple[list[int], ...] | list[int], ) -> tuple[bool, Optional[dict[str, Any]]]: assert self.connector_scheduler is not None return self.connector_scheduler.request_finished(request, block_ids) @@ -397,7 +397,7 @@ def build_connector_meta( def request_finished( self, request: "Request", - block_ids: list[int], + block_ids: tuple[list[int], ...] | list[int], ) -> tuple[bool, Optional[dict[str, Any]]]: """ Once a request is finished, determine whether request blocks diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py index c23efa604544..1af7c4fed3b3 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py @@ -344,7 +344,7 @@ def update_connector_output(self, connector_output: KVConnectorOutput): def request_finished( self, request: Request, - block_ids: list[int], + block_ids: tuple[list[int], ...] | list[int], ) -> tuple[bool, Optional[dict[str, Any]]]: """ Called when a request has finished, before its blocks are freed. diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 2839e2bf4974..fd151a0f70a3 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -88,17 +88,14 @@ def __init__( assert not self.is_encoder_decoder, ( "Encoder-decoder models are not currently supported " "with KV connectors") - num_kv_cache_groups = len(self.kv_cache_config.kv_cache_groups) connector_vllm_config = deepcopy(self.vllm_config) - if num_kv_cache_groups > 1: - # NOTE(Kuntai): hybrid allocator is enabled. - # We inject `kv_cache_config` into vllm_config. - connector_vllm_config.kv_cache_config = kv_cache_config + connector_vllm_config.kv_cache_config = kv_cache_config self.connector = KVConnectorFactory.create_connector( - config=self.vllm_config, role=KVConnectorRole.SCHEDULER) + config=connector_vllm_config, role=KVConnectorRole.SCHEDULER) # Make sure that the connector supports HMA if HMA is enabled. 
+ num_kv_cache_groups = len(self.kv_cache_config.kv_cache_groups) if not supports_hma(self.connector) and num_kv_cache_groups > 1: raise NotImplementedError( f"Connector {self.connector.__class__.__name__} does not" diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 075e62ad7194..b27129b5ddf3 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -318,12 +318,11 @@ def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None: # Init kv cache connector here, because it requires # `kv_cache_config`. - # This need to be done before `initialize_kv_cache` because - # `initialize_kv_cache` will inject kv cache groups not - # related to kv cache connector (e.g. kv cache sharing layers) + # NOTE(Kuntai): This need to be done before `initialize_kv_cache`, + # because `initialize_kv_cache` will inject kv cache groups not + # related to kv cache connector (e.g. kv cache sharing layers). connector_vllm_config = deepcopy(self.vllm_config) - if len(kv_cache_config.kv_cache_groups) > 1: - connector_vllm_config.kv_cache_config = kv_cache_config + connector_vllm_config.kv_cache_config = kv_cache_config ensure_kv_transfer_initialized(connector_vllm_config) if self.vllm_config.model_config.enable_sleep_mode: From fbaa51a3e3ef297d5533e6275d950d4b44f8a2cc Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Fri, 26 Sep 2025 12:14:04 -0700 Subject: [PATCH 03/27] fix mypy errors Signed-off-by: KuntaiDu --- vllm/distributed/kv_transfer/kv_connector/v1/base.py | 4 ++-- .../kv_transfer/kv_connector/v1/lmcache_connector.py | 4 ++-- .../kv_transfer/kv_connector/v1/multi_connector.py | 4 ++-- .../kv_transfer/kv_connector/v1/nixl_connector.py | 4 ++-- .../kv_transfer/kv_connector/v1/offloading_connector.py | 6 +++--- vllm/v1/core/sched/scheduler.py | 7 ++++++- 6 files changed, 17 insertions(+), 12 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 1f366fc92ce4..7f12d1b11e11 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -37,7 +37,7 @@ import enum from abc import ABC, abstractmethod from collections.abc import Iterable -from typing import TYPE_CHECKING, Any, Callable, Literal, Optional +from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union import torch @@ -336,7 +336,7 @@ def update_connector_output(self, connector_output: KVConnectorOutput): def request_finished( self, request: "Request", - block_ids: list[int] | tuple[list[int], ...], + block_ids: Union[list[int], tuple[list[int], ...]], ) -> tuple[bool, Optional[dict[str, Any]]]: """ Called when a request has finished, before its blocks are freed. 
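One note on the `Union` rewrite above: the commit message cites mypy, and the PEP 604 spelling `list[int] | tuple[list[int], ...]` is also a runtime hazard on interpreters older than Python 3.10, where `|` between types raises `TypeError` when the annotation is evaluated eagerly (unless `from __future__ import annotations` defers it). A small sketch, assuming a pre-3.10 interpreter for the failing case:

```python
from typing import Union

# Portable spelling; evaluates fine on Python 3.9+ (PEP 585 generics):
BlockIds = Union[list[int], tuple[list[int], ...]]

def request_finished(block_ids: BlockIds) -> None:
    ...

# PEP 604 spelling; on Python < 3.10 the definition below raises
#   TypeError: unsupported operand type(s) for |
# at import time, because the annotation is evaluated eagerly:
#
#   def request_finished(block_ids: list[int] | tuple[list[int], ...]) -> None:
#       ...
```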
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py index f5091980fbb4..6143b50e118f 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import TYPE_CHECKING, Any, Optional +from typing import TYPE_CHECKING, Any, Optional, Union import torch from lmcache.integration.vllm.vllm_v1_adapter import LMCacheConnectorV1Impl @@ -153,7 +153,7 @@ def build_connector_meta( def request_finished( self, request: "Request", - block_ids: list[int] | tuple[list[int], ...], + block_ids: Union[list[int], tuple[list[int], ...]], ) -> tuple[bool, Optional[dict[str, Any]]]: """ Called when a request has finished, before its blocks are freed. diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index 9c712e2e52ef..e197b54ae119 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -3,7 +3,7 @@ import copy from collections.abc import Iterable from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Optional +from typing import TYPE_CHECKING, Any, Optional, Union import torch @@ -245,7 +245,7 @@ def update_connector_output(self, connector_output: KVConnectorOutput): def request_finished( self, request: "Request", - blocks: tuple[list[int], ...] | list[int], + blocks: Union[tuple[list[int], ...], list[int]], ) -> tuple[bool, Optional[dict[str, Any]]]: async_saves = 0 kv_txfer_params = None diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 7e74026554b6..a5936cc6e4b8 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -194,7 +194,7 @@ def build_connector_meta( def request_finished( self, request: "Request", - block_ids: tuple[list[int], ...] | list[int], + block_ids: Union[tuple[list[int], ...], list[int]], ) -> tuple[bool, Optional[dict[str, Any]]]: assert self.connector_scheduler is not None return self.connector_scheduler.request_finished(request, block_ids) @@ -397,7 +397,7 @@ def build_connector_meta( def request_finished( self, request: "Request", - block_ids: tuple[list[int], ...] 
| list[int], + block_ids: Union[tuple[list[int], ...], list[int]], ) -> tuple[bool, Optional[dict[str, Any]]]: """ Once a request is finished, determine whether request blocks diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py index 1af7c4fed3b3..c003729db2f3 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py @@ -4,7 +4,7 @@ from collections.abc import Iterable, Iterator from dataclasses import dataclass from itertools import islice -from typing import Any, Optional +from typing import Any, Optional, Union import torch @@ -108,7 +108,7 @@ def update_connector_output(self, connector_output: KVConnectorOutput): def request_finished( self, request: "Request", - block_ids: list[int], + block_ids: Union[list[int], tuple[list[int], ...]], ) -> tuple[bool, Optional[dict[str, Any]]]: assert self.connector_scheduler is not None return self.connector_scheduler.request_finished(request, block_ids) @@ -344,7 +344,7 @@ def update_connector_output(self, connector_output: KVConnectorOutput): def request_finished( self, request: Request, - block_ids: tuple[list[int], ...] | list[int], + block_ids: Union[tuple[list[int], ...], list[int]], ) -> tuple[bool, Optional[dict[str, Any]]]: """ Called when a request has finished, before its blocks are freed. diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index fd151a0f70a3..4dc6bc3b1029 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1241,7 +1241,12 @@ def _connector_finished( if self.connector is None: return False, None - if not supports_hma(self.connector): + num_kv_cache_groups = len(self.kv_cache_config.kv_cache_groups) + + if not supports_hma(self.connector) or num_kv_cache_groups == 1: + # NOTE(Kuntai): this code path is a hack. + # We should remove this code path after all connectors + # support hybrid memory allocator. (block_ids, ) = self.kv_cache_manager.get_block_ids( request.request_id) return self.connector.request_finished(request, block_ids) From fae4c82dfe9fdc048a8ea001167eaab1b36929ad Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Fri, 26 Sep 2025 13:31:19 -0700 Subject: [PATCH 04/27] adjust the signature of block_ids Signed-off-by: KuntaiDu --- vllm/v1/core/sched/scheduler.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 4dc6bc3b1029..65978e8b90e0 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1243,15 +1243,14 @@ def _connector_finished( num_kv_cache_groups = len(self.kv_cache_config.kv_cache_groups) + block_ids = self.kv_cache_manager.get_block_ids(request.request_id) + if not supports_hma(self.connector) or num_kv_cache_groups == 1: # NOTE(Kuntai): this code path is a hack. # We should remove this code path after all connectors # support hybrid memory allocator. 
- (block_ids, ) = self.kv_cache_manager.get_block_ids( - request.request_id) - return self.connector.request_finished(request, block_ids) + return self.connector.request_finished(request, block_ids[0]) else: - block_ids = self.kv_cache_manager.get_block_ids(request.request_id) return self.connector.request_finished(request, block_ids) def _update_waiting_for_remote_kv(self, request: Request) -> bool: From 0aa2b010d8995f1ff2bc1c99b0157f8ed29201a5 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sun, 12 Oct 2025 07:58:07 -0700 Subject: [PATCH 05/27] allow hybrid kv cache manager + connector Signed-off-by: KuntaiDu --- vllm/config/vllm.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 9d156dd8d9de..01f14d1e88bb 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -535,9 +535,6 @@ def __post_init__(self): if not current_platform.support_hybrid_kv_cache(): # Hybrid KV cache manager is not supported on non-GPU platforms. self.scheduler_config.disable_hybrid_kv_cache_manager = True - if self.kv_transfer_config is not None: - # Hybrid KV cache manager is not compatible with KV transfer. - self.scheduler_config.disable_hybrid_kv_cache_manager = True if self.kv_events_config is not None: # Hybrid KV cache manager is not compatible with KV events. self.scheduler_config.disable_hybrid_kv_cache_manager = True From 89a976ced21efebd196e1f6c9fd369b43d0cbfac Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sun, 12 Oct 2025 08:14:34 -0700 Subject: [PATCH 06/27] init using ConnectorVllmConfig Signed-off-by: KuntaiDu --- vllm/config/vllm.py | 17 +++++++++++++++++ vllm/v1/core/sched/scheduler.py | 20 ++++---------------- vllm/v1/worker/gpu_worker.py | 6 ++---- 3 files changed, 23 insertions(+), 20 deletions(-) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 01f14d1e88bb..811f027a0448 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -40,11 +40,14 @@ from transformers import PretrainedConfig from vllm.model_executor.layers.quantization.base_config import QuantizationConfig + from vllm.v1.kv_cache_interface import KVCacheConfig else: PretrainedConfig = Any QuantizationConfig = Any + KVCacheConfig = Any + logger = init_logger(__name__) @@ -884,3 +887,17 @@ def get_layers_from_vllm_config( for layer_name in layer_names if isinstance(forward_context[layer_name], layer_type) } + + +class ConnectorVllmConfig(VllmConfig): + kv_cache_config: Optional[KVCacheConfig] = None + + def __init__(self, vllm_config: VllmConfig, kv_cache_config: KVCacheConfig): + # Copy over all fields from vllm_config into the base class + super().__init__(**vllm_config.__dict__) + self.kv_cache_config = kv_cache_config + + def __post_init__(self): + # NOTE(Kuntai): bypass post init so that it copies the exact same thing + # as VllmConfig. 
+ pass diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 89d6e8a46656..90e04a1c44b4 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -9,7 +9,7 @@ from collections.abc import Iterable from typing import Any, Union -from vllm.config import VllmConfig +from vllm.config import ConnectorVllmConfig, VllmConfig from vllm.distributed.kv_events import EventPublisherFactory, KVEventBatch from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory from vllm.distributed.kv_transfer.kv_connector.v1 import ( @@ -88,23 +88,13 @@ def __init__( "Encoder-decoder models are not currently supported with KV connectors" ) - from copy import deepcopy - - connector_vllm_config = deepcopy(self.vllm_config) - connector_vllm_config.kv_cache_config = kv_cache_config + connector_vllm_config = ConnectorVllmConfig( + self.vllm_config, kv_cache_config + ) self.connector = KVConnectorFactory.create_connector( config=connector_vllm_config, role=KVConnectorRole.SCHEDULER ) - # Make sure that the connector supports HMA if HMA is enabled. - num_kv_cache_groups = len(self.kv_cache_config.kv_cache_groups) - if not supports_hma(self.connector) and num_kv_cache_groups > 1: - raise NotImplementedError( - f"Connector {self.connector.__class__.__name__} does not" - f" support HMA but HMA is enabled. Please set " - f"`--disable-hybrid-kv-cache-manager`." - ) - self.kv_event_publisher = EventPublisherFactory.create( self.kv_events_config, self.parallel_config.data_parallel_rank, @@ -1309,8 +1299,6 @@ def _connector_finished( if self.connector is None: return False, None - num_kv_cache_groups = len(self.kv_cache_config.kv_cache_groups) - block_ids = self.kv_cache_manager.get_block_ids(request.request_id) if not supports_hma(self.connector): diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index cf5872125ead..ebb72efdca1e 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -6,7 +6,6 @@ import gc import os from contextlib import AbstractContextManager, nullcontext -from copy import deepcopy from typing import TYPE_CHECKING, Any, Optional, Union import torch @@ -14,7 +13,7 @@ import torch.nn as nn import vllm.envs as envs -from vllm.config import VllmConfig +from vllm.config import ConnectorVllmConfig, VllmConfig from vllm.distributed import ( ensure_model_parallel_initialized, init_distributed_environment, @@ -332,8 +331,7 @@ def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None: # NOTE(Kuntai): This need to be done before `initialize_kv_cache`, # because `initialize_kv_cache` will inject kv cache groups not # related to kv cache connector (e.g. kv cache sharing layers). 
- connector_vllm_config = deepcopy(self.vllm_config) - connector_vllm_config.kv_cache_config = kv_cache_config + connector_vllm_config = ConnectorVllmConfig(self.vllm_config, kv_cache_config) ensure_kv_transfer_initialized(connector_vllm_config) if self.vllm_config.model_config.enable_sleep_mode: From 9cdd2b0e0c8a21830a8638bb9d85749879764285 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sun, 12 Oct 2025 18:50:55 -0700 Subject: [PATCH 07/27] put the change of function signature inside KVConnectorHMAMixin class Signed-off-by: KuntaiDu --- vllm/config/__init__.py | 2 ++ .../kv_transfer/kv_connector/factory.py | 18 ++++++++-- .../kv_transfer/kv_connector/v1/__init__.py | 8 ++++- .../kv_transfer/kv_connector/v1/base.py | 36 ++++++++++++++----- .../kv_connector/v1/lmcache_connector.py | 19 ++++++---- .../kv_connector/v1/multi_connector.py | 4 +-- .../kv_connector/v1/nixl_connector.py | 2 +- .../kv_connector/v1/offloading_connector.py | 2 +- 8 files changed, 67 insertions(+), 24 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 6a0197d044dc..0351462d2354 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -35,6 +35,7 @@ update_config, ) from vllm.config.vllm import ( + ConnectorVllmConfig, VllmConfig, get_cached_compilation_config, get_current_vllm_config, @@ -92,6 +93,7 @@ "update_config", # From vllm.config.vllm "VllmConfig", + "ConnectorVllmConfig", "get_cached_compilation_config", "get_current_vllm_config", "set_current_vllm_config", diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index 395a4e20e0ba..08d0780cb1c9 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -5,15 +5,18 @@ from typing import TYPE_CHECKING, Callable import vllm.envs as envs +from vllm.config import ConnectorVllmConfig from vllm.distributed.kv_transfer.kv_connector.base import ( KVConnectorBase, KVConnectorBaseType, ) -from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorRole +from vllm.distributed.kv_transfer.kv_connector.v1 import ( + KVConnectorHMAMixin, + KVConnectorRole, +) from vllm.logger import init_logger if TYPE_CHECKING: - from vllm.config import VllmConfig from vllm.config.kv_transfer import KVTransferConfig logger = init_logger(__name__) @@ -37,7 +40,7 @@ def loader() -> type[KVConnectorBase]: @classmethod def create_connector( cls, - config: "VllmConfig", + config: ConnectorVllmConfig, role: KVConnectorRole, ) -> KVConnectorBase: if not envs.VLLM_USE_V1: @@ -48,6 +51,15 @@ def create_connector( kv_transfer_config = config.kv_transfer_config connector_cls = cls.get_connector_class(kv_transfer_config) + + # check if the connector supports HMA + hma_enabled = not config.scheduler_config.disable_hybrid_kv_cache_manager + if not issubclass(connector_cls, KVConnectorHMAMixin) and hma_enabled: + raise ValueError( + f"Connector {connector_cls.__name__} does not support HMA but " + f"HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`." 
+ ) + logger.info( "Creating v1 connector with name: %s and engine_id: %s", connector_cls.__name__, diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py b/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py index 2a1a942ee4fe..d3a4577f521d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py @@ -2,8 +2,14 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, + KVConnectorHMAMixin, KVConnectorRole, supports_hma, ) -__all__ = ["KVConnectorRole", "KVConnectorBase_V1", "supports_hma"] +__all__ = [ + "KVConnectorRole", + "KVConnectorBase_V1", + "supports_hma", + "KVConnectorHMAMixin", +] diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 9a9cb35e54a0..f555acd16670 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -38,7 +38,7 @@ import enum from abc import ABC, abstractmethod from collections.abc import Iterable -from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Literal, Optional import torch @@ -70,18 +70,36 @@ logger = init_logger(__name__) -class SupportsHMA: +class KVConnectorHMAMixin: """ - Inherent this interface if the connector supports hybrid memory - allocator (HMA). This is required to use the connector together - with hybrid memory allocator. + Mixin class for connectors that support hybrid memory allocator (HMA). + This is required to use the connector together with hybrid memory allocator. """ - pass + def request_finished( + self, + request: "Request", + block_ids: tuple[list[int], ...], + ) -> tuple[bool, Optional[dict[str, Any]]]: + """ + Called exactly once when a request has finished, before its blocks are + freed. + + The connector may assumes responsibility for freeing the the blocks + asynchronously by returning True. + + Returns: + True if the request is being saved/sent asynchronously and blocks + should not be freed until the request_id is returned from + get_finished(). + Optional KVTransferParams to be included in the request outputs + returned by the engine. 
+ """ + return False, None -def supports_hma(cls: type) -> bool: - return isinstance(cls, SupportsHMA) +def supports_hma(connector: Any) -> bool: + return isinstance(connector, KVConnectorHMAMixin) class KVConnectorRole(enum.Enum): @@ -374,7 +392,7 @@ def update_connector_output(self, connector_output: KVConnectorOutput): def request_finished( self, request: "Request", - block_ids: Union[list[int], tuple[list[int], ...]], + block_ids: list[int], ) -> tuple[bool, Optional[dict[str, Any]]]: """ Called exactly once when a request has finished, before its blocks are diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py index e3f6f7d3ac2b..495b188fb708 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py @@ -1,16 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import TYPE_CHECKING, Any, Optional, Union +from typing import TYPE_CHECKING, Any, Optional import torch from lmcache.integration.vllm.vllm_v1_adapter import LMCacheConnectorV1Impl -from vllm.config import VllmConfig +from vllm.config import ConnectorVllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, + KVConnectorHMAMixin, KVConnectorMetadata, KVConnectorRole, - SupportsHMA, ) from vllm.logger import init_logger from vllm.v1.core.sched.output import SchedulerOutput @@ -24,8 +24,8 @@ logger = init_logger(__name__) -class LMCacheConnectorV1(KVConnectorBase_V1, SupportsHMA): - def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): +class LMCacheConnectorV1(KVConnectorHMAMixin, KVConnectorBase_V1): + def __init__(self, vllm_config: ConnectorVllmConfig, role: KVConnectorRole): super().__init__(vllm_config=vllm_config, role=role) self._lmcache_engine = LMCacheConnectorV1Impl(vllm_config, role, self) @@ -158,10 +158,10 @@ def build_connector_meta( """ return self._lmcache_engine.build_connector_meta(scheduler_output) - def request_finished( + def request_finished( # type: ignore[override] self, request: "Request", - block_ids: Union[list[int], tuple[list[int], ...]], + block_ids: tuple[list[int], ...], ) -> tuple[bool, Optional[dict[str, Any]]]: """ Called when a request has finished, before its blocks are freed. @@ -172,5 +172,10 @@ def request_finished( get_finished(). Optional KVTransferParams to be included in the request outputs returned by the engine. + + Note: + This method intentionally uses tuple[list[int], ...] from + KVConnectorHMAMixin interface instead of list[int] from + KVConnectorBase_V1 to support hybrid memory allocation. 
""" return self._lmcache_engine.request_finished(request, block_ids) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index d22925b60b3e..e48d4ccd1d6c 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -3,7 +3,7 @@ import copy from collections.abc import Iterable from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Optional, Union +from typing import TYPE_CHECKING, Any, Optional import torch @@ -257,7 +257,7 @@ def update_connector_output(self, connector_output: KVConnectorOutput): def request_finished( self, request: "Request", - blocks: Union[tuple[list[int], ...], list[int]], + blocks: list[int], ) -> tuple[bool, Optional[dict[str, Any]]]: async_saves = 0 kv_txfer_params = None diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 8ecc0e29990f..3b78123d21de 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -213,7 +213,7 @@ def build_connector_meta( def request_finished( self, request: "Request", - block_ids: Union[tuple[list[int], ...], list[int]], + block_ids: list[int], ) -> tuple[bool, Optional[dict[str, Any]]]: assert self.connector_scheduler is not None return self.connector_scheduler.request_finished(request, block_ids) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py index b98db3847638..1ac42cbc7ce9 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py @@ -112,7 +112,7 @@ def update_connector_output(self, connector_output: KVConnectorOutput): def request_finished( self, request: "Request", - block_ids: Union[list[int], tuple[list[int], ...]], + block_ids: list[int], ) -> tuple[bool, Optional[dict[str, Any]]]: assert self.connector_scheduler is not None return self.connector_scheduler.request_finished(request, block_ids) From e0ac23c7eb61fde2b26d5a5c2f0fd51e4aa43e5e Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sun, 12 Oct 2025 18:56:18 -0700 Subject: [PATCH 08/27] remove unnecessary change of function signature Signed-off-by: KuntaiDu --- .../distributed/kv_transfer/kv_connector/v1/nixl_connector.py | 2 +- .../kv_transfer/kv_connector/v1/offloading_connector.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 3b78123d21de..365d1a1ff280 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -444,7 +444,7 @@ def build_connector_meta( def request_finished( self, request: "Request", - block_ids: Union[tuple[list[int], ...], list[int]], + block_ids: list[int], ) -> tuple[bool, Optional[dict[str, Any]]]: """ Once a request is finished, determine whether request blocks diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py index 1ac42cbc7ce9..745af0efba18 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +++ 
b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py @@ -4,7 +4,7 @@ from collections.abc import Iterable, Iterator from dataclasses import dataclass from itertools import islice -from typing import Any, Optional, Union +from typing import Any, Optional import torch @@ -353,7 +353,7 @@ def update_connector_output(self, connector_output: KVConnectorOutput): def request_finished( self, request: Request, - block_ids: Union[tuple[list[int], ...], list[int]], + block_ids: list[int], ) -> tuple[bool, Optional[dict[str, Any]]]: """ Called when a request has finished, before its blocks are freed. From b29a257f791cbb0e501c1918e22d3b634be940ea Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sun, 12 Oct 2025 20:11:13 -0700 Subject: [PATCH 09/27] copy kv cache config instead of just sending the pointer Signed-off-by: KuntaiDu --- vllm/config/vllm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index bab4ba927394..4c56c1d7a97e 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -895,7 +895,7 @@ class ConnectorVllmConfig(VllmConfig): def __init__(self, vllm_config: VllmConfig, kv_cache_config: KVCacheConfig): # Copy over all fields from vllm_config into the base class super().__init__(**vllm_config.__dict__) - self.kv_cache_config = kv_cache_config + self.kv_cache_config = copy.deepcopy(kv_cache_config) def __post_init__(self): # NOTE(Kuntai): bypass post init so that it copies the exact same thing From 866c404e69477e969e726e2930cfd6f8accaec72 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Wed, 15 Oct 2025 22:46:00 -0700 Subject: [PATCH 10/27] align the way of checking if the connector supports HMA Signed-off-by: KuntaiDu --- vllm/distributed/kv_transfer/kv_connector/factory.py | 4 ++-- vllm/distributed/kv_transfer/kv_connector/v1/base.py | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index 38013a5f9c36..57a616fdf42a 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -12,8 +12,8 @@ KVConnectorBaseType, ) from vllm.distributed.kv_transfer.kv_connector.v1 import ( - KVConnectorHMAMixin, KVConnectorRole, + supports_hma, ) from vllm.logger import init_logger @@ -55,7 +55,7 @@ def create_connector( # check if the connector supports HMA hma_enabled = not config.scheduler_config.disable_hybrid_kv_cache_manager - if not issubclass(connector_cls, KVConnectorHMAMixin) and hma_enabled: + if hma_enabled and not supports_hma(connector_cls): raise ValueError( f"Connector {connector_cls.__name__} does not support HMA but " f"HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`." 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index c26f51beb446..baf28d13a486 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -99,7 +99,10 @@ def request_finished( def supports_hma(connector: Any) -> bool: - return isinstance(connector, KVConnectorHMAMixin) + if isinstance(connector, type): + return issubclass(connector, KVConnectorHMAMixin) + else: + return isinstance(connector, KVConnectorHMAMixin) class KVConnectorRole(enum.Enum): From 367b7b76bbe4496001c58e2f7c2c5ebdc0959d40 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Fri, 17 Oct 2025 05:26:52 -0700 Subject: [PATCH 11/27] change class name to SupportsHMA Signed-off-by: KuntaiDu --- .../kv_transfer/kv_connector/v1/__init__.py | 4 ++-- vllm/distributed/kv_transfer/kv_connector/v1/base.py | 10 +++++----- .../kv_transfer/kv_connector/v1/lmcache_connector.py | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py b/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py index d3a4577f521d..3981d74cbcc4 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py @@ -2,8 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, - KVConnectorHMAMixin, KVConnectorRole, + SupportsHMA, supports_hma, ) @@ -11,5 +11,5 @@ "KVConnectorRole", "KVConnectorBase_V1", "supports_hma", - "KVConnectorHMAMixin", + "SupportsHMA", ] diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index baf28d13a486..e3983705de76 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -70,7 +70,7 @@ logger = init_logger(__name__) -class KVConnectorHMAMixin: +class SupportsHMA: """ Mixin class for connectors that support hybrid memory allocator (HMA). This is required to use the connector together with hybrid memory allocator. @@ -85,7 +85,7 @@ def request_finished( Called exactly once when a request has finished, before its blocks are freed. - The connector may assumes responsibility for freeing the the blocks + The connector may assumes responsibility for freeing the blocks asynchronously by returning True. Returns: @@ -100,9 +100,9 @@ def request_finished( def supports_hma(connector: Any) -> bool: if isinstance(connector, type): - return issubclass(connector, KVConnectorHMAMixin) + return issubclass(connector, SupportsHMA) else: - return isinstance(connector, KVConnectorHMAMixin) + return isinstance(connector, SupportsHMA) class KVConnectorRole(enum.Enum): @@ -401,7 +401,7 @@ def request_finished( Called exactly once when a request has finished, before its blocks are freed. - The connector may assumes responsibility for freeing the the blocks + The connector may assumes responsibility for freeing the blocks asynchronously by returning True. 
Returns: diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py index ed2bcd9d921e..e559c3c5a7b3 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py @@ -8,9 +8,9 @@ from vllm.config import ConnectorVllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, - KVConnectorHMAMixin, KVConnectorMetadata, KVConnectorRole, + SupportsHMA, ) from vllm.logger import init_logger from vllm.v1.core.sched.output import SchedulerOutput @@ -24,7 +24,7 @@ logger = init_logger(__name__) -class LMCacheConnectorV1(KVConnectorHMAMixin, KVConnectorBase_V1): +class LMCacheConnectorV1(SupportsHMA, KVConnectorBase_V1): def __init__(self, vllm_config: ConnectorVllmConfig, role: KVConnectorRole): super().__init__(vllm_config=vllm_config, role=role) self._lmcache_engine = LMCacheConnectorV1Impl(vllm_config, role, self) @@ -175,7 +175,7 @@ def request_finished( # type: ignore[override] Note: This method intentionally uses tuple[list[int], ...] from - KVConnectorHMAMixin interface instead of list[int] from + SupportsHMA interface instead of list[int] from KVConnectorBase_V1 to support hybrid memory allocation. """ return self._lmcache_engine.request_finished(request, block_ids) From e35f118d2e96d5542f43e1d88a84d847610e16bd Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Fri, 17 Oct 2025 05:35:04 -0700 Subject: [PATCH 12/27] avoid using ConnectorVllmConfig, use copy instead Signed-off-by: KuntaiDu --- vllm/config/__init__.py | 2 -- vllm/config/vllm.py | 14 -------------- .../kv_transfer/kv_connector/factory.py | 4 ++-- .../kv_connector/v1/lmcache_connector.py | 4 ++-- vllm/v1/core/sched/scheduler.py | 8 ++++---- vllm/v1/worker/gpu_worker.py | 5 +++-- 6 files changed, 11 insertions(+), 26 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 0351462d2354..6a0197d044dc 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -35,7 +35,6 @@ update_config, ) from vllm.config.vllm import ( - ConnectorVllmConfig, VllmConfig, get_cached_compilation_config, get_current_vllm_config, @@ -93,7 +92,6 @@ "update_config", # From vllm.config.vllm "VllmConfig", - "ConnectorVllmConfig", "get_cached_compilation_config", "get_current_vllm_config", "set_current_vllm_config", diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 4c56c1d7a97e..41942d583d4c 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -887,17 +887,3 @@ def get_layers_from_vllm_config( for layer_name in layer_names if isinstance(forward_context[layer_name], layer_type) } - - -class ConnectorVllmConfig(VllmConfig): - kv_cache_config: KVCacheConfig | None = None - - def __init__(self, vllm_config: VllmConfig, kv_cache_config: KVCacheConfig): - # Copy over all fields from vllm_config into the base class - super().__init__(**vllm_config.__dict__) - self.kv_cache_config = copy.deepcopy(kv_cache_config) - - def __post_init__(self): - # NOTE(Kuntai): bypass post init so that it copies the exact same thing - # as VllmConfig. 
- pass diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index 57a616fdf42a..f7acdd8be9d5 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING import vllm.envs as envs -from vllm.config import ConnectorVllmConfig +from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.base import ( KVConnectorBase, KVConnectorBaseType, @@ -41,7 +41,7 @@ def loader() -> type[KVConnectorBase]: @classmethod def create_connector( cls, - config: ConnectorVllmConfig, + config: VllmConfig, role: KVConnectorRole, ) -> KVConnectorBase: if not envs.VLLM_USE_V1: diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py index e559c3c5a7b3..0b90e86b8156 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py @@ -5,7 +5,7 @@ import torch from lmcache.integration.vllm.vllm_v1_adapter import LMCacheConnectorV1Impl -from vllm.config import ConnectorVllmConfig +from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, KVConnectorMetadata, @@ -25,7 +25,7 @@ class LMCacheConnectorV1(SupportsHMA, KVConnectorBase_V1): - def __init__(self, vllm_config: ConnectorVllmConfig, role: KVConnectorRole): + def __init__(self, vllm_config: VllmConfig, role: KVConnectorRole): super().__init__(vllm_config=vllm_config, role=role) self._lmcache_engine = LMCacheConnectorV1Impl(vllm_config, role, self) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index c7b27b5c660e..c60e1d71c444 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1,13 +1,14 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import copy import itertools import time from collections import defaultdict from collections.abc import Iterable from typing import Any -from vllm.config import ConnectorVllmConfig, VllmConfig +from vllm.config import VllmConfig from vllm.distributed.kv_events import EventPublisherFactory, KVEventBatch from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory from vllm.distributed.kv_transfer.kv_connector.v1 import ( @@ -86,9 +87,8 @@ def __init__( "Encoder-decoder models are not currently supported with KV connectors" ) - connector_vllm_config = ConnectorVllmConfig( - self.vllm_config, kv_cache_config - ) + connector_vllm_config = copy.deepcopy(self.vllm_config) + connector_vllm_config.kv_cache_config = kv_cache_config self.connector = KVConnectorFactory.create_connector( config=connector_vllm_config, role=KVConnectorRole.SCHEDULER ) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 137e317bbe4b..7f6d2269f6bb 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -13,7 +13,7 @@ import torch.nn as nn import vllm.envs as envs -from vllm.config import ConnectorVllmConfig, VllmConfig +from vllm.config import VllmConfig from vllm.distributed import ( ensure_model_parallel_initialized, init_distributed_environment, @@ -331,7 +331,8 @@ def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None: # NOTE(Kuntai): This need to be done before `initialize_kv_cache`, # because `initialize_kv_cache` 
will inject kv cache groups not # related to kv cache connector (e.g. kv cache sharing layers). - connector_vllm_config = ConnectorVllmConfig(self.vllm_config, kv_cache_config) + connector_vllm_config = copy.deepcopy(self.vllm_config) + connector_vllm_config.kv_cache_config = kv_cache_config ensure_kv_transfer_initialized(connector_vllm_config) if self.vllm_config.model_config.enable_sleep_mode: From 7e963b962d8fb1172f297ed126830758c60a003d Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Fri, 17 Oct 2025 05:36:52 -0700 Subject: [PATCH 13/27] use deepcopy instead Signed-off-by: KuntaiDu --- vllm/v1/core/sched/scheduler.py | 2 +- vllm/v1/worker/gpu_worker.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index c60e1d71c444..6b0ad8829a55 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -88,7 +88,7 @@ def __init__( ) connector_vllm_config = copy.deepcopy(self.vllm_config) - connector_vllm_config.kv_cache_config = kv_cache_config + connector_vllm_config.kv_cache_config = copy.deepcopy(kv_cache_config) self.connector = KVConnectorFactory.create_connector( config=connector_vllm_config, role=KVConnectorRole.SCHEDULER ) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 7f6d2269f6bb..c91870e9a53c 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -332,7 +332,7 @@ def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None: # because `initialize_kv_cache` will inject kv cache groups not # related to kv cache connector (e.g. kv cache sharing layers). connector_vllm_config = copy.deepcopy(self.vllm_config) - connector_vllm_config.kv_cache_config = kv_cache_config + connector_vllm_config.kv_cache_config = copy.deepcopy(kv_cache_config) ensure_kv_transfer_initialized(connector_vllm_config) if self.vllm_config.model_config.enable_sleep_mode: From 650d666b99a27e0f1f55051fb4cda5851d1608fc Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Fri, 17 Oct 2025 05:39:03 -0700 Subject: [PATCH 14/27] adjust the comments Signed-off-by: KuntaiDu --- vllm/distributed/kv_transfer/kv_connector/v1/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index e3983705de76..69f7178541d9 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -72,7 +72,8 @@ class SupportsHMA: """ - Mixin class for connectors that support hybrid memory allocator (HMA). + The class that indicates the corresponding connector supports hybrid memory + allocator (HMA). This is required to use the connector together with hybrid memory allocator. """ From 37a589d89ff1f3cdf77fc4223db4f753564a4b8d Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Fri, 17 Oct 2025 05:40:55 -0700 Subject: [PATCH 15/27] adjust comments Signed-off-by: KuntaiDu --- vllm/v1/core/sched/scheduler.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 6b0ad8829a55..1b1b52b117e7 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1303,9 +1303,8 @@ def _connector_finished( block_ids = self.kv_cache_manager.get_block_ids(request.request_id) if not supports_hma(self.connector): - # NOTE(Kuntai): this code path is a hack. 
- # We should remove this code path after all connectors - # support hybrid memory allocator. + # NOTE(Kuntai): We should remove this code path after we require + # all connectors to support HMA. return self.connector.request_finished(request, block_ids[0]) else: return self.connector.request_finished(request, block_ids) From 6abc1c20b0fe9d54baf704d228025a8c7dce6f36 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Fri, 17 Oct 2025 05:41:16 -0700 Subject: [PATCH 16/27] adjust comments Signed-off-by: KuntaiDu --- vllm/v1/core/sched/scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 1b1b52b117e7..582739ab33b7 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1303,7 +1303,7 @@ def _connector_finished( block_ids = self.kv_cache_manager.get_block_ids(request.request_id) if not supports_hma(self.connector): - # NOTE(Kuntai): We should remove this code path after we require + # NOTE(Kuntai): We should remove this code path after we enforce # all connectors to support HMA. return self.connector.request_finished(request, block_ids[0]) else: From 27774f37f08b66362bf887a95cfa5bd9ea5decb7 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Mon, 20 Oct 2025 08:19:17 -0700 Subject: [PATCH 17/27] change deepcopy to shallowcopy --- shallow copy should be enough Signed-off-by: KuntaiDu --- vllm/v1/core/sched/scheduler.py | 5 ++--- vllm/v1/worker/gpu_worker.py | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 582739ab33b7..bb2c3cc642ed 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - import copy import itertools import time @@ -87,8 +86,8 @@ def __init__( "Encoder-decoder models are not currently supported with KV connectors" ) - connector_vllm_config = copy.deepcopy(self.vllm_config) - connector_vllm_config.kv_cache_config = copy.deepcopy(kv_cache_config) + connector_vllm_config = copy.copy(self.vllm_config) + connector_vllm_config.kv_cache_config = copy.copy(kv_cache_config) self.connector = KVConnectorFactory.create_connector( config=connector_vllm_config, role=KVConnectorRole.SCHEDULER ) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index c91870e9a53c..59a16f6718fa 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -331,8 +331,8 @@ def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None: # NOTE(Kuntai): This need to be done before `initialize_kv_cache`, # because `initialize_kv_cache` will inject kv cache groups not # related to kv cache connector (e.g. kv cache sharing layers). 
From 27774f37f08b66362bf887a95cfa5bd9ea5decb7 Mon Sep 17 00:00:00 2001
From: KuntaiDu
Date: Mon, 20 Oct 2025 08:19:17 -0700
Subject: [PATCH 17/27] change deepcopy to shallow copy

---

shallow copy should be enough

Signed-off-by: KuntaiDu
---
 vllm/v1/core/sched/scheduler.py | 5 ++---
 vllm/v1/worker/gpu_worker.py    | 4 ++--
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index 582739ab33b7..bb2c3cc642ed 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
 import copy
 import itertools
 import time
@@ -87,8 +86,8 @@ def __init__(
                 "Encoder-decoder models are not currently supported with KV connectors"
             )

-        connector_vllm_config = copy.deepcopy(self.vllm_config)
-        connector_vllm_config.kv_cache_config = copy.deepcopy(kv_cache_config)
+        connector_vllm_config = copy.copy(self.vllm_config)
+        connector_vllm_config.kv_cache_config = copy.copy(kv_cache_config)
         self.connector = KVConnectorFactory.create_connector(
             config=connector_vllm_config, role=KVConnectorRole.SCHEDULER
         )
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index c91870e9a53c..59a16f6718fa 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -331,8 +331,8 @@ def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None:
         # NOTE(Kuntai): This needs to be done before `initialize_kv_cache`,
         # because `initialize_kv_cache` will inject kv cache groups not
         # related to kv cache connector (e.g. kv cache sharing layers).
-        connector_vllm_config = copy.deepcopy(self.vllm_config)
-        connector_vllm_config.kv_cache_config = copy.deepcopy(kv_cache_config)
+        connector_vllm_config = copy.copy(self.vllm_config)
+        connector_vllm_config.kv_cache_config = copy.copy(kv_cache_config)
         ensure_kv_transfer_initialized(connector_vllm_config)

         if self.vllm_config.model_config.enable_sleep_mode:

From ababeec101de93b5297ed77fde168d82e0312236 Mon Sep 17 00:00:00 2001
From: KuntaiDu
Date: Mon, 20 Oct 2025 12:05:11 -0700
Subject: [PATCH 18/27] fix CPU offloading test

Signed-off-by: KuntaiDu
---
 tests/v1/kv_offload/test_cpu_offloading.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/v1/kv_offload/test_cpu_offloading.py b/tests/v1/kv_offload/test_cpu_offloading.py
index 0d90cc715fd4..e9c255b1ee99 100644
--- a/tests/v1/kv_offload/test_cpu_offloading.py
+++ b/tests/v1/kv_offload/test_cpu_offloading.py
@@ -27,6 +27,7 @@ def test_cpu_offloading(cpu_block_size: int) -> None:
         model="meta-llama/Llama-3.2-1B-Instruct",
         gpu_memory_utilization=0.5,
         kv_transfer_config=kv_transfer_config,
+        disable_hybrid_kv_cache_manager=True,
     )

     prompts = ["Hi " * 100]
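
A small sketch of why the shallow copy above is sufficient: rebinding `kv_cache_config` on a `copy.copy` of the config never mutates the original object, while all other fields stay shared. `Config` is a hypothetical stand-in for `VllmConfig`:

import copy
from dataclasses import dataclass

@dataclass
class Config:  # hypothetical stand-in for VllmConfig
    kv_cache_config: object = None
    model_config: object = None

base = Config(kv_cache_config="old", model_config=["shared"])
clone = copy.copy(base)
clone.kv_cache_config = "new"          # rebinds the attribute on the copy only
assert base.kv_cache_config == "old"   # the original is untouched
assert clone.model_config is base.model_config  # other fields remain shared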
From 1974b5f496f73894b626fe7025e97e386fe7d5d3 Mon Sep 17 00:00:00 2001
From: KuntaiDu
Date: Mon, 20 Oct 2025 12:15:56 -0700
Subject: [PATCH 19/27] fix CI errors

Signed-off-by: KuntaiDu
---
 tests/v1/core/test_scheduler.py | 6 ++++++
 tests/v1/core/utils.py          | 2 ++
 2 files changed, 8 insertions(+)

diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py
index aaac2deb12ac..05b85d4e3009 100644
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -899,6 +899,7 @@ def test_kv_connector_basic():
     scheduler = create_scheduler(
         enable_prefix_caching=True,
         use_kv_connector=True,
+        disable_hybrid_kv_cache_manager=True,
     )
     NUM_TOTAL_BLOCKS = scheduler.kv_cache_manager.block_pool.get_num_free_blocks()
     BLOCK_SIZE = scheduler.cache_config.block_size
@@ -1028,6 +1029,7 @@ def test_kv_connector_unable_to_allocate():
         use_kv_connector=True,
         block_size=BLOCK_SIZE,
         num_blocks=NUM_BLOCKS,
+        disable_hybrid_kv_cache_manager=True,
     )
     NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE * 2
     scheduler.connector.get_num_new_matched_tokens = Mock(name="method")
@@ -1111,6 +1113,7 @@ def test_kv_connector_handles_preemption():
         use_kv_connector=True,
         block_size=BLOCK_SIZE,
         num_blocks=NUM_BLOCKS,
+        disable_hybrid_kv_cache_manager=True,
     )
     NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE
@@ -1327,6 +1330,7 @@ def create_scheduler_with_priority(
     block_size: int = 16,
     max_model_len: int | None = None,
     num_speculative_tokens: int | None = None,
+    disable_hybrid_kv_cache_manager: bool = False,
 ) -> Scheduler:
     """Create scheduler with priority policy enabled.
@@ -1351,6 +1355,7 @@ def create_scheduler_with_priority(
         disable_chunked_mm_input=disable_chunked_mm_input,
         enable_chunked_prefill=True,
         policy="priority",  # Enable priority scheduling
+        disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager,
     )
     model_config = ModelConfig(
         model=model,
@@ -1958,6 +1963,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv():
         num_blocks=5,  # Can hold 64 tokens (first block is null)
         block_size=16,  # Standard block size
         use_kv_connector=True,
+        disable_hybrid_kv_cache_manager=True,
     )

     # Create a request and schedule it
diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py
index 6e739d6b0e77..3f5e1b9eeaf7 100644
--- a/tests/v1/core/utils.py
+++ b/tests/v1/core/utils.py
@@ -46,6 +46,7 @@ def create_scheduler(
     num_speculative_tokens: int | None = None,
     skip_tokenizer_init: bool = False,
     async_scheduling: bool = False,
+    disable_hybrid_kv_cache_manager: bool = False,
 ) -> Scheduler | AsyncScheduler:
     """Create scheduler under test.
@@ -70,6 +71,7 @@ def create_scheduler(
         disable_chunked_mm_input=disable_chunked_mm_input,
         enable_chunked_prefill=True,
         async_scheduling=async_scheduling,
+        disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager,
     )
     model_config = ModelConfig(
         model=model,

From 9198d3ecf44a9cb881a5f190da4353df6d79fb59 Mon Sep 17 00:00:00 2001
From: KuntaiDu
Date: Thu, 23 Oct 2025 13:27:15 -0700
Subject: [PATCH 20/27] fix NIXL-connector-related CI errors

Signed-off-by: KuntaiDu
---
 tests/v1/kv_connector/unit/utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py
index e3f30bd7698f..46ea46e53084 100644
--- a/tests/v1/kv_connector/unit/utils.py
+++ b/tests/v1/kv_connector/unit/utils.py
@@ -91,6 +91,9 @@ def create_vllm_config(
         max_num_batched_tokens=max_num_batched_tokens,
         max_model_len=max_model_len,
         enable_chunked_prefill=enable_chunked_prefill,
+        # Disable hybrid KV cache manager for testing.
+        # Should be removed once these tests support the hybrid KV cache manager.
+        disable_hybrid_kv_cache_manager=True,
     )
     model_config = ModelConfig(
         model=model,
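
Usage-wise, the keyword plumbed through the test helpers above would be exercised as below. This is a sketch: `create_scheduler` is the helper from `tests/v1/core/utils.py`, and the argument values are illustrative.

# KV-connector tests opt out of the hybrid KV cache manager explicitly,
# since the test connectors do not implement HMA support yet.
scheduler = create_scheduler(
    use_kv_connector=True,
    disable_hybrid_kv_cache_manager=True,
)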
From c6e0bc4a44634411d55142909b38e2b93d7c6243 Mon Sep 17 00:00:00 2001
From: KuntaiDu
Date: Thu, 23 Oct 2025 14:16:36 -0700
Subject: [PATCH 21/27] fix CI errors

Signed-off-by: KuntaiDu
---
 tests/v1/core/test_scheduler.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py
index b0e79cf66a46..fba577239682 100644
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -1025,6 +1025,7 @@ def test_external_prefix_cache_metrics():
     scheduler = create_scheduler(
         enable_prefix_caching=False,
         use_kv_connector=True,
+        disable_hybrid_kv_cache_manager=True,
     )

     # Mock connector to simulate a partial external cache hit

From 919fe9b347e884140de47f0213776688ee554645 Mon Sep 17 00:00:00 2001
From: KuntaiDu
Date: Thu, 23 Oct 2025 14:30:21 -0700
Subject: [PATCH 22/27] remove hma support from LMCache for now

Signed-off-by: KuntaiDu
---
 .../kv_transfer/kv_connector/v1/lmcache_connector.py | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
index 0b90e86b8156..3abb7791057a 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
@@ -10,7 +10,6 @@
     KVConnectorBase_V1,
     KVConnectorMetadata,
     KVConnectorRole,
-    SupportsHMA,
 )
 from vllm.logger import init_logger
 from vllm.v1.core.sched.output import SchedulerOutput
@@ -24,8 +23,8 @@
 logger = init_logger(__name__)

-class LMCacheConnectorV1(SupportsHMA, KVConnectorBase_V1):
-    def __init__(self, vllm_config: VllmConfig, role: KVConnectorRole):
+class LMCacheConnectorV1(KVConnectorBase_V1):
+    def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
         super().__init__(vllm_config=vllm_config, role=role)
         self._lmcache_engine = LMCacheConnectorV1Impl(vllm_config, role, self)
@@ -158,10 +157,10 @@ def build_connector_meta(
         """
         return self._lmcache_engine.build_connector_meta(scheduler_output)

-    def request_finished(  # type: ignore[override]
+    def request_finished(
         self,
         request: "Request",
-        block_ids: tuple[list[int], ...],
+        block_ids: list[int],
     ) -> tuple[bool, dict[str, Any] | None]:
         """
         Called when a request has finished, before its blocks are freed.
@@ -172,10 +171,5 @@ def request_finished(  # type: ignore[override]
             get_finished().
             Optional KVTransferParams to be included in the request outputs
             returned by the engine.
-
-        Note:
-            This method intentionally uses tuple[list[int], ...] from
-            SupportsHMA interface instead of list[int] from
-            KVConnectorBase_V1 to support hybrid memory allocation.
         """
         return self._lmcache_engine.request_finished(request, block_ids)
""" return self._lmcache_engine.request_finished(request, block_ids) From 0b67b7699110f66b31abe84b466bc4afd0207923 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Thu, 23 Oct 2025 14:44:05 -0700 Subject: [PATCH 23/27] add an extra sanity check for request_finished Signed-off-by: KuntaiDu --- .../distributed/kv_transfer/kv_connector/v1/base.py | 13 ++++++++----- vllm/v1/core/sched/scheduler.py | 5 ++++- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 78917c03fd6e..2562eb9ce70e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -70,21 +70,24 @@ logger = init_logger(__name__) -class SupportsHMA: +class SupportsHMA(ABC): """ The class that indicates the corresponding connector supports hybrid memory allocator (HMA). This is required to use the connector together with hybrid memory allocator. """ - def request_finished( + @abstractmethod + def request_finished_all_groups( self, request: "Request", block_ids: tuple[list[int], ...], ) -> tuple[bool, dict[str, Any] | None]: """ - Called exactly once when a request has finished, before its blocks are - freed. + Called exactly once when a request has finished for all kv cache groups, + before its blocks are freed for each group. + + NOTE(Kuntai): This function is only supported by connectors that support HMA. The connector may assumes responsibility for freeing the blocks asynchronously by returning True. @@ -96,7 +99,7 @@ def request_finished( Optional KVTransferParams to be included in the request outputs returned by the engine. """ - return False, None + raise NotImplementedError def supports_hma(connector: Any) -> bool: diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index a5ec664b89b4..7afee15a2da6 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1327,8 +1327,11 @@ def _connector_finished( block_ids = self.kv_cache_manager.get_block_ids(request.request_id) if not supports_hma(self.connector): - # NOTE(Kuntai): We should remove this code path after we enforce + # NOTE(Kuntai): We should deprecate this code path after we enforce # all connectors to support HMA. + # Hybrid memory allocator should be already turned off for this + # code path, but let's double-check here. 
From 36e42a1a96484a701f5a3433a4f70922654829ff Mon Sep 17 00:00:00 2001
From: KuntaiDu
Date: Thu, 23 Oct 2025 22:11:37 -0700
Subject: [PATCH 24/27] fix bug

Signed-off-by: KuntaiDu
---
 tests/v1/kv_connector/unit/test_multi_connector.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/v1/kv_connector/unit/test_multi_connector.py b/tests/v1/kv_connector/unit/test_multi_connector.py
index 74ae3ca9a863..793c4afc837b 100644
--- a/tests/v1/kv_connector/unit/test_multi_connector.py
+++ b/tests/v1/kv_connector/unit/test_multi_connector.py
@@ -80,6 +80,7 @@ def test_multi_shared_storage_connector_consistency():
         enforce_eager=True,
         gpu_memory_utilization=0.5,
         kv_transfer_config=kv_transfer_config,
+        disable_hybrid_kv_cache_manager=True,
     )
     # Run generation - this should trigger saving KV cache
     _ = llm.generate(PROMPTS, SAMPLING_PARAMS)

From 0df4f02b9bc546df697505f1e3e63bc4027e9016 Mon Sep 17 00:00:00 2001
From: KuntaiDu
Date: Thu, 23 Oct 2025 22:22:43 -0700
Subject: [PATCH 25/27] fix CI bug

Signed-off-by: KuntaiDu
---
 tests/v1/kv_connector/unit/test_shared_storage_connector.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/v1/kv_connector/unit/test_shared_storage_connector.py b/tests/v1/kv_connector/unit/test_shared_storage_connector.py
index e7013a794a8c..6040ed5a6806 100644
--- a/tests/v1/kv_connector/unit/test_shared_storage_connector.py
+++ b/tests/v1/kv_connector/unit/test_shared_storage_connector.py
@@ -132,6 +132,7 @@ def test_shared_storage_connector_hashes(tmp_path):
         enforce_eager=True,
         kv_transfer_config=kv_transfer_config,
         limit_mm_per_prompt={"image": 2},
+        disable_hybrid_kv_cache_manager=True,
     )

     # don't put this import at the top level

From 5d88c0dcb8475032dc6e030ef8bdf3c1e407148b Mon Sep 17 00:00:00 2001
From: KuntaiDu
Date: Thu, 23 Oct 2025 22:24:52 -0700
Subject: [PATCH 26/27] fix CI issues

Signed-off-by: KuntaiDu
---
 tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh  | 2 ++
 tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
index a9817313cf02..a756858e2cc5 100755
--- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
+++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
@@ -136,6 +136,7 @@ run_tests_for_model() {
         vllm serve $model_name \
         --port $PORT \
         --enforce-eager \
+        --disable-hybrid-kv-cache-manager \
         --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
         --tensor-parallel-size $PREFILLER_TP_SIZE \
         --kv-transfer-config '$KV_CONFIG'"
@@ -178,6 +179,7 @@ run_tests_for_model() {
         --port $PORT \
         --enforce-eager \
         --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
+        --disable-hybrid-kv-cache-manager \
         --kv-transfer-config '$KV_CONFIG'"

         # DP-EP attention mode
diff --git a/tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh b/tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh
index c48b452e24cd..a3eeedb2e514 100755
--- a/tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh
+++ b/tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh
@@ -85,6 +85,7 @@ run_tests_for_model() {
         --port $PREFILL_PORT \
         --enforce-eager \
         --gpu-memory-utilization 0.2 \
+        --disable-hybrid-kv-cache-manager \
         --kv-transfer-config '$KV_CONFIG'"

     if [ -n "$model_args" ]; then
@@ -103,6 +104,7 @@ run_tests_for_model() {
         --port $DECODE_PORT \
         --enforce-eager \
         --gpu-memory-utilization 0.2 \
+        --disable-hybrid-kv-cache-manager \
         --kv-transfer-config '$KV_CONFIG'"

     if [ -n "$model_args" ]; then
From 2fac4fbcfe38d97a274549dc071563fe567a1232 Mon Sep 17 00:00:00 2001
From: KuntaiDu
Date: Fri, 24 Oct 2025 00:43:14 -0700
Subject: [PATCH 27/27] fix CI errors

Signed-off-by: KuntaiDu
---
 tests/v1/kv_connector/unit/test_nixl_connector.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py
index e073321c637b..445d115010cd 100644
--- a/tests/v1/kv_connector/unit/test_nixl_connector.py
+++ b/tests/v1/kv_connector/unit/test_nixl_connector.py
@@ -932,6 +932,7 @@ def test_abort_timeout_on_prefiller(monkeypatch, distributed_executor_backend):
         "gpu_memory_utilization": 0.5,
         "kv_transfer_config": kv_transfer_config,
         "distributed_executor_backend": distributed_executor_backend,
+        "disable_hybrid_kv_cache_manager": True,
     }
     timeout = 6
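
Taken together, the test fixes in this series pin down one user-visible knob. A sketch of how it is exercised offline, mirroring the tests above; the connector choice and storage-free config are illustrative, not prescriptive (the serve scripts use the `--disable-hybrid-kv-cache-manager` CLI form instead):

from vllm import LLM
from vllm.config import KVTransferConfig

llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    gpu_memory_utilization=0.5,
    kv_transfer_config=KVTransferConfig(
        kv_connector="SharedStorageConnector",  # illustrative connector
        kv_role="kv_both",
    ),
    # Required until the connector in use implements HMA support.
    disable_hybrid_kv_cache_manager=True,
)
print(llm.generate(["Hi " * 100])[0].outputs[0].text)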