Skip to content

Commit 8e1eadc

Browse files
updated
Signed-off-by: [email protected] <[email protected]>
1 parent 05349a5 commit 8e1eadc

File tree

4 files changed

+133
-104
lines changed

4 files changed

+133
-104
lines changed

vllm/distributed/kv_transfer/kv_connector/v1/base.py

Lines changed: 19 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,6 @@
1919
from vllm.attention.backends.abstract import AttentionMetadata
2020
from vllm.config import VllmConfig
2121
from vllm.forward_context import ForwardContext
22-
from vllm.v1.core.kv_cache_manager import KVCacheManager
23-
from vllm.v1.core.kv_cache_utils import KVCacheBlock
2422
from vllm.v1.request import Request
2523

2624

@@ -148,32 +146,33 @@ def wait_for_save(self):
148146
# Scheduler-side methods
149147
# ==============================
150148
@abstractmethod
151-
def get_external_prefix_cache_blocks(
149+
def get_num_matched_tokens(
152150
self,
153151
request: "Request",
154-
computed_blocks: list["KVCacheBlock"],
155152
num_computed_tokens: int,
156-
kv_cache_manager: "KVCacheManager",
157-
) -> list["KVCacheBlock"]:
153+
) -> int:
158154
"""
159-
Get the external prefix cache blocks from the connector.
160-
161-
This function may change the state of the connector, which will
162-
be used by `build_connector_meta` later.
163-
164-
This function will also allocate/free the blocks dynamically when
165-
there is remote cache hit.
166-
155+
Check for external KV cache hit.
156+
167157
Args:
168158
request (Request): the request object.
169-
computed_blocks (list[KVCacheBlock]): the 'local' computed blocks.
170-
num_computed_tokens (int): the number of 'local' computed tokens.
171-
kv_cache_manager (KVCacheManager): the KV cache manager to
172-
allocate/free the blocks if needed.
159+
num_computed_tokens (int): the number of locally
160+
computed tokens for this request
173161
174162
Returns:
175-
The updated list of the computed blocks (appended with the remote
176-
cached blocks)
163+
the number of tokens that can be loaded from the
164+
external KV cache beyond what is already computed.
165+
"""
166+
pass
167+
168+
@abstractmethod
169+
def update_state_after_alloc(self, request: Request,
170+
num_allocated_blocks: int):
171+
"""
172+
Update KVConnector state after temporary buffer alloc.
173+
174+
For SharedStorageConnector, update _requests_need_load
175+
if the CacheManager allocated blocks for us.
177176
"""
178177
pass
179178

vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py

Lines changed: 33 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,6 @@
1616
if TYPE_CHECKING:
1717
from vllm.attention.backends.abstract import AttentionMetadata
1818
from vllm.forward_context import ForwardContext
19-
from vllm.v1.core.kv_cache_manager import KVCacheManager
20-
from vllm.v1.core.kv_cache_utils import KVCacheBlock
2119
from vllm.v1.request import Request
2220

2321
logger = init_logger(__name__)
@@ -152,7 +150,7 @@ def inject_kv_into_layer(
152150
kv_cache_layer = attn_layer.kv_cache[\
153151
forward_context.virtual_engine]
154152

155-
filename = self.generate_filename_debug(
153+
filename = self._generate_filename_debug(
156154
layer_name, request.token_ids)
157155
kv_cache = safetensors.torch.load_file(
158156
filename)["kv_cache"].cuda()
@@ -201,7 +199,7 @@ def extract_kv_from_layer(
201199
assert isinstance(connector_metadata, SharedStorageConnectorMetadata)
202200
for request in connector_metadata.requests:
203201
if request.is_store:
204-
filename = self.generate_filename_debug(
202+
filename = self._generate_filename_debug(
205203
layer_name, request.token_ids)
206204
kv_cache = extract_kv_from_layer(kv_layer,
207205
request.slot_mapping)
@@ -211,78 +209,47 @@ def extract_kv_from_layer(
211209
def wait_for_save(self):
212210
return
213211

214-
def get_external_prefix_cache_blocks(
212+
def get_num_matched_tokens(
215213
self,
216214
request: "Request",
217-
computed_blocks: list["KVCacheBlock"],
218215
num_computed_tokens: int,
219-
kv_cache_manager: "KVCacheManager",
220-
) -> list["KVCacheBlock"]:
221-
"""Get the external prefix cache blocks from the connector.
222-
223-
This function may change the state of the connector, which will be
224-
used by `build_connector_meta` later.
225-
226-
Args:
227-
request (Request): the request object.
228-
computed_blocks (list[KVCacheBlock]): the 'local' computed blocks.
229-
num_computed_tokens (int): the number of 'local' computed tokens.
230-
kv_cache_manager (KVCacheManager): the KV cache manager to
231-
allocate/free the blocks if needed.
232-
233-
Returns:
234-
The updated list of the computed blocks (appended with the remote
235-
cached blocks)
216+
) -> int:
236217
"""
218+
Check for external KV cache hit.
219+
220+
Returns the number of tokens that can be loaded from the
221+
external KV cache beyond what is already computed.
222+
"""
223+
237224
# NOTE: in this debug implementation, we assume that the prompt is
238225
# cached_prompt + newly_generated_single_token
239226
# Therefore, we use prompt_token_ids[:-1] to determine the folder name
240227

241228
# NOTE: in current v1 scheduler, the num_computed_tokens is aligned
242229
# with the block granularity. And it expects the returned blocks and
243230
# num_computed_tokens to also be aligned with the block granularity.
244-
if not self.found_match_for_request(request):
245-
return computed_blocks
231+
if not self._found_match_for_request(request):
232+
return 0
233+
234+
logger.info("External Cache Hit!")
246235

247236
# Now, first num_tokens_to_check tokens are hit, we need to prepare
248237
# the metadata for the worker connector to correctly load the KV
249-
250-
logger.info("Hit the cache! Allocate new blocks!")
251238
num_tokens_to_check = align_to_block_size(
252239
len(request.prompt_token_ids) - 1, self._block_size)
253-
need_to_allocate = num_tokens_to_check - num_computed_tokens
254-
if need_to_allocate > 0:
255-
# HACK: We don't want the scheduler see the blocks are allocated
256-
# and associated with the current request. Instead, we want the
257-
# scheduler find that the blocks are already allocated and they
258-
# are associated with some other requests (i.e., the case of
259-
prefix caching).
260-
261-
# HACK: KVCacheManager.allocate_slots will pre-allocate a few
262-
# blocks, which will cause problems in the later allocations.
263-
# We should make sure the pre allocation does not happen.
264-
old_req_id = request.request_id
265-
request.request_id = "temp-req-id-for-connector"
266-
allocated_blocks = kv_cache_manager.allocate_slots(
267-
request,
268-
need_to_allocate,
269-
computed_blocks,
270-
skip_preallocate=True,
271-
skip_inc_ref_count=True)
272-
request.request_id = old_req_id
273-
kv_cache_manager.req_to_blocks.pop("temp-req-id-for-connector")
274-
kv_cache_manager.num_cached_block.pop("temp-req-id-for-connector")
275-
276-
num_expected_blocks = need_to_allocate // self._block_size
277-
if len(allocated_blocks) > num_expected_blocks:
278-
logger.error("Detected pre-allocated blocks in the connector!"
279-
"This should not happen!")
280-
allocated_blocks = allocated_blocks[:num_expected_blocks]
281240

241+
return num_tokens_to_check - num_computed_tokens
242+
243+
def update_state_after_alloc(self, request: Request,
244+
num_allocated_blocks: int):
245+
"""
246+
Update KVConnector state after temporary buffer alloc.
247+
248+
For SharedStorageConnector, update _requests_need_load
249+
if the CacheManager allocated blocks for us.
250+
"""
251+
if num_allocated_blocks > 0:
282252
self._requests_need_load.append(request.request_id)
283-
return computed_blocks + allocated_blocks
284-
else:
285-
return computed_blocks
286253

287254
def build_connector_meta(
288255
self, scheduler_output: SchedulerOutput) -> KVConnectorMetadata:
@@ -302,7 +269,7 @@ def build_connector_meta(
302269
# NOTE: here, we set the store and load being exclusive,
303270
# but in LMCache use case, a single request can have both
304271
# store and load status
305-
if not self.found_match_for_request(request):
272+
if not self._found_match_for_request(request):
306273
meta.add_request(request, self._block_size, is_store=True)
307274

308275
self._requests_need_load.clear()
@@ -312,20 +279,20 @@ def build_connector_meta(
312279
# Helper functions
313280
# ==============================
314281

315-
def found_match_for_request(
282+
def _found_match_for_request(
316283
self,
317284
request: "Request",
318285
) -> bool:
319286
"""Check if the cache is hit for the request.
320287
"""
321288
num_tokens_to_check = align_to_block_size(
322289
len(request.prompt_token_ids) - 1, self._block_size)
323-
foldername = self.generate_foldername_debug(torch.tensor(
290+
foldername = self._generate_foldername_debug(torch.tensor(
324291
request.prompt_token_ids)[:num_tokens_to_check],
325-
create_folder=False)
292+
create_folder=False)
326293
return os.path.exists(foldername)
327294

328-
def generate_foldername_debug(
295+
def _generate_foldername_debug(
329296
self,
330297
input_ids: torch.Tensor,
331298
create_folder=False,
@@ -340,16 +307,16 @@ def generate_foldername_debug(
340307
os.makedirs(foldername, exist_ok=True)
341308
return foldername
342309

343-
def generate_filename_debug(
310+
def _generate_filename_debug(
344311
self,
345312
layer_name: str,
346313
input_ids: torch.Tensor,
347314
) -> str:
348315
"""Generate a file name based on the layer name and the hash
349316
of the bytes of the input ids.
350317
"""
351-
foldername = self.generate_foldername_debug(input_ids,
352-
create_folder=True)
318+
foldername = self._generate_foldername_debug(input_ids,
319+
create_folder=True)
353320
return os.path.join(foldername, f"{layer_name}.safetensors")
354321

355322

vllm/v1/core/kv_cache_manager.py

Lines changed: 62 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,11 @@ def __init__(
8484
# data for reempted ones.
8585
self.num_cached_block: dict[str, int] = {}
8686
self.prefix_cache_stats = PrefixCacheStats()
87-
self.connector = connector
87+
88+
# KVConnector: buffer reqs for KVConnector. We write
89+
# the external KVs to the "buffer" req and leverage
90+
# prefix caching to share with the "real" req
91+
self.kv_connector_buffer_reqs: list[Request] = []
8892

8993
@property
9094
def usage(self) -> float:
@@ -159,13 +163,6 @@ def get_computed_blocks(
159163
# we shouldn't modify it directly.
160164
block_hashes.append(last_block_hash)
161165

162-
# Check the remote cache for the external prefix cache blocks.
163-
if self.connector is not None:
164-
computed_blocks =\
165-
self.connector.get_external_prefix_cache_blocks(
166-
request, computed_blocks,
167-
len(computed_blocks) * self.block_size, self)
168-
169166
# NOTE(woosuk): Since incomplete blocks are not eligible for
170167
# sharing, `num_computed_tokens` is always a multiple of
171168
# `block_size`.
@@ -178,7 +175,6 @@ def allocate_slots(
178175
num_tokens: int,
179176
new_computed_blocks: Optional[list[KVCacheBlock]] = None,
180177
skip_preallocate: bool = False,
181-
skip_inc_ref_count: bool = False,
182178
) -> Optional[list[KVCacheBlock]]:
183179
"""Add slots for a request with new tokens to append.
184180
@@ -188,11 +184,7 @@ def allocate_slots(
188184
not include the tokens that have already been computed.
189185
new_computed_blocks: A list of new computed blocks just hitting the
190186
prefix caching.
191-
skip_preallocate: Whether to skip preallocating blocks for
192-
the request.
193-
skip_preallocate: Whether to skip incrementing the ref count. This
194-
is useful for the KVConnector to allocate blocks which will be
195-
filled by the remote KVs for a single model step().
187+
skip_preallocate: Whether to skip preallocating blocks.
196188
197189
Blocks layout:
198190
-----------------------------------------------------------------------
@@ -246,12 +238,11 @@ def allocate_slots(
246238
return None
247239

248240
# Touch the computed blocks to make sure they won't be evicted.
249-
if self.enable_caching and not skip_inc_ref_count:
241+
if self.enable_caching:
250242
self.block_pool.touch(new_computed_blocks)
251243
else:
252-
assert not new_computed_blocks, (
253-
"Computed blocks should be empty when "
254-
"prefix caching is disabled")
244+
assert not new_computed_blocks, "Computed blocks should "\
245+
"be empty when prefix caching is disabled"
255246

256247
# Append the new computed blocks to the request blocks until now to
257248
# avoid the case where the new blocks cannot be allocated.
@@ -396,3 +387,56 @@ def free_block_hashes(self, request: Request) -> None:
396387
is finished, not when it is preempted.
397388
"""
398389
self.req_to_block_hashes.pop(request.request_id, None)
390+
391+
def alloc_and_get_external_blocks(
392+
self,
393+
request: "Request",
394+
computed_blocks: list["KVCacheBlock"],
395+
num_computed_tokens: int,
396+
kv_connector: KVConnectorBase_V1,
397+
) -> tuple[list["KVCacheBlock"], int]:
398+
399+
# Check for cache hit.
400+
need_to_allocate = kv_connector.get_num_matched_tokens(
401+
request, num_computed_tokens)
402+
num_allocated_blocks = 0
403+
404+
# Cache hit: allocate buffer.
405+
if need_to_allocate > 0:
406+
# HACK: We don't want the scheduler see the blocks are allocated
407+
# and associated with the current request. Instead, we want the
408+
# scheduler find that the blocks are already allocated and they
409+
# are associated with some other requests (i.e., the case of
410+
prefix caching).
411+
412+
old_req_id = request.request_id
413+
request.request_id = f"{old_req_id}-buf-for-kv-connector"
414+
allocated_blocks = self.allocate_slots(
415+
request,
416+
need_to_allocate,
417+
computed_blocks,
418+
skip_preallocate=True,
419+
)
420+
request.request_id = old_req_id
421+
422+
num_expected_blocks = need_to_allocate // self.block_size
423+
num_allocated_blocks = len(
424+
allocated_blocks) if allocated_blocks else 0
425+
assert num_allocated_blocks <= num_expected_blocks, ""\
426+
"Detected pre-allocated blocks in the connector! "\
427+
"This should not happen!"
428+
429+
# Update internal state. In case of:
430+
# * SharedStorageConnector: add req_id to _requests_need_load
431+
# so that we know to load this requests KVs later.
432+
kv_connector.update_state_after_alloc(request, num_allocated_blocks)
433+
num_computed_blocks = len(computed_blocks) * self.block_size
434+
return computed_blocks, num_computed_blocks
435+
436+
def free_buffer_requests(self) -> None:
437+
"""Free buffer requests for the KV connector."""
438+
439+
for buffer_req in self.kv_connector_buffer_reqs:
440+
self.free(buffer_req)
441+
self.free_block_hashes(buffer_req)
442+
self.kv_connector_buffer_reqs.clear()

vllm/v1/core/sched/scheduler.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,20 @@ def schedule(self) -> SchedulerOutput:
308308
# Get already-cached tokens.
309309
computed_blocks, num_computed_tokens = \
310310
self.kv_cache_manager.get_computed_blocks(request)
311+
312+
# KVConnector: get blocks for externally-cached tokens.
313+
# Internally, this allocates a "buffer" req with a prompt
314+
# corresponding to externally cached tokens. In alloc_slots
315+
# below, we will compute a cache hit and thus skip the
316+
# computation for externally cached tokens.
317+
# NOTE: since this allocates temporary buffer requests,
318+
# we must call kv_cache_manager.free_buffer_requests() below.
319+
if self.connector is not None:
320+
computed_blocks, num_computed_tokens = \
321+
self.kv_cache_manager.alloc_and_get_external_blocks(
322+
request, computed_blocks,
323+
num_computed_tokens, self.connector)
324+
311325
# Number of tokens to be scheduled.
312326
# We use `request.num_tokens` instead of
313327
# `request.num_prompt_tokens` to consider the resumed requests,
@@ -467,6 +481,11 @@ def schedule(self) -> SchedulerOutput:
467481
for req_id, num_scheduled_token in num_scheduled_tokens.items():
468482
self.requests[req_id].num_computed_tokens += num_scheduled_token
469483

484+
# KVConnector: once we have allocated the buffer blocks to the
485+
# "real" requests (via prefix caching), free the tmp buffer reqs.
486+
if self.connector is not None:
487+
self.kv_cache_manager.free_buffer_requests()
488+
470489
self.finished_req_ids = set()
471490
return scheduler_output
472491

0 commit comments

Comments
 (0)