Fix PR comments

David Ben-David · David Ben-David · commit 3d41b4712f57 · 2025-09-30T12:28:01.000+03:00
Signed-off-by: David Ben-David <davidb@pliops.com>
diff --git a/tests/v1/kv_connector/unit/test_offloading_connector.py b/tests/v1/kv_connector/unit/test_offloading_connector.py
@@ -281,8 +281,8 @@ def _run(self, decoded_tokens: list[int]):
 
             model_runner_output = create_model_runner_output(
                 reqs=self.scheduler.running,
-                finished_sending=list(finished_sending),
-                finished_recving=list(finished_recving),
+                finished_sending=finished_sending,
+                finished_recving=finished_recving,
                 token_id=token_id)
 
             if self.scheduler.running:
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
@@ -236,6 +236,16 @@ def get_block_ids_with_load_errors(self) -> set[int]:
         Returns:
             Set of block IDs that encountered load errors.
             Empty set if no load errors occurred.
+
+        Notes:
+            - Applies to both sync- and async-loading requests.
+            - Async loading: failed blocks may be reported in any forward pass
+              up to and including the pass where the request ID is returned by
+              `get_finished()`. Even if failures occur, the request must still
+              be reported via `get_finished()`, and the failed block IDs must
+              appear here no later than that same pass.
+            - Sync loading: failed blocks should be reported in the forward
+              pass in which they are detected.
         """
         return set()
 
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
@@ -879,12 +879,12 @@ def update_from_output(
         kv_connector_stats = (kv_connector_output.kv_connector_stats
                               if kv_connector_output else None)
 
-        affected_req_ids = None
+        failed_kv_load_req_ids = None
         if kv_connector_output and kv_connector_output.invalid_block_ids:
             # These blocks contain externally computed tokens that failed to
             # load. Identify affected requests and adjust their computed token
             # count to trigger recomputation of the invalid blocks.
-            affected_req_ids = self._handle_invalid_blocks(
+            failed_kv_load_req_ids = self._handle_invalid_blocks(
                 kv_connector_output.invalid_block_ids)
 
         # NOTE(woosuk): As len(num_scheduled_tokens) can be up to 1K or more,
@@ -894,7 +894,7 @@ def update_from_output(
         stopped_preempted_reqs: set[Request] = set()
         for req_id, num_tokens_scheduled in num_scheduled_tokens.items():
             assert num_tokens_scheduled > 0
-            if affected_req_ids and req_id in affected_req_ids:
+            if failed_kv_load_req_ids and req_id in failed_kv_load_req_ids:
                 # Skip requests that were recovered from KV load failure
                 continue
             request = self.requests.get(req_id)
@@ -1325,11 +1325,30 @@ def _update_from_kv_xfer_finished(self,
 
     def _update_requests_with_invalid_blocks(
             self, requests: Iterable[Request],
-            invalid_block_ids: set[int]) -> tuple[set[str], int, set[int]]:
+            invalid_block_ids: set[int]) -> tuple[set[str], int]:
+        """
+        Identify and update requests affected by invalid KV cache blocks.
+
+        This method scans the given requests, detects those with invalid blocks
+        and adjusts their `num_computed_tokens` to the longest valid prefix.
+        For observability, it also accumulates the total number of tokens that
+        will need to be recomputed across all affected requests.
+
+        Args:
+            requests: The set of requests to scan for invalid blocks.
+            invalid_block_ids: IDs of invalid blocks.
+
+        Returns:
+            tuple:
+                - affected_req_ids (set[str]): IDs of requests impacted by
+                invalid blocks.
+                - total_affected_tokens (int): Total number of tokens that must
+                be recomputed across all affected requests (for observability).
+        """
         affected_req_ids: set[str] = set()
         total_affected_tokens = 0
         # If a block is invalid and shared by multiple requests in the batch,
-        # all requests must be rescheduled, but only the first will recompute
+        # these requests must be rescheduled, but only the first will recompute
         # it. This set tracks blocks already marked for recomputation.
         marked_invalid_block_ids: set[int] = set()
         for request in requests:
@@ -1341,12 +1360,14 @@ def _update_requests_with_invalid_blocks(
             # We iterate only over blocks that may contain externally computed
             # tokens
             if request.status == RequestStatus.WAITING_FOR_REMOTE_KVS:
+                # Async loading. If num_computed_tokens is set it implies we
+                # already processed some block failures for it in a prior step
                 req_num_computed_tokens = (
-                    request.num_computed_tokens if request.request_id
+                    request.num_computed_tokens if req_id
                     in self.failed_recving_kv_req_ids else len(req_block_ids) *
                     self.block_size)
             else:
-                # In sync load, num_computed_tokens includes new tokens
+                # Sync loading. num_computed_tokens includes new tokens
                 req_num_computed_tokens = request.num_cached_tokens
 
             req_num_computed_blocks = (req_num_computed_tokens +
@@ -1364,6 +1385,8 @@ def _update_requests_with_invalid_blocks(
                     # and was already marked for recomputation.
                     # This means this request can still consider this block
                     # as computed when rescheduled.
+                    # Currently this only applies to sync loading; async
+                    # loading does not yet support block sharing.
                     continue
 
                 marked_invalid_block_ids.add(block_id)
@@ -1374,6 +1397,7 @@ def _update_requests_with_invalid_blocks(
                     continue
 
                 marked_invalid_block = True
+                # Truncate the computed tokens at the first failed block
                 request.num_computed_tokens = idx * self.block_size
                 total_affected_tokens += (req_num_computed_tokens -
                                           request.num_computed_tokens)
@@ -1383,14 +1407,15 @@ def _update_requests_with_invalid_blocks(
                     # All invalid blocks of this request are shared with
                     # previous requests and will be recomputed by them.
                     # Revert to considering only cached tokens as computed.
+                    # Currently this only applies to sync loading; async
+                    # loading does not yet support block sharing.
                     total_affected_tokens += (request.num_computed_tokens -
                                               request.num_cached_tokens)
                     request.num_computed_tokens = request.num_cached_tokens
 
                 affected_req_ids.add(request.request_id)
 
-        return (affected_req_ids, total_affected_tokens,
-                marked_invalid_block_ids)
+        return (affected_req_ids, total_affected_tokens)
 
     def _handle_invalid_blocks(self, invalid_block_ids: set[int]) -> set[str]:
         total_requests_to_reschedule = 0
@@ -1400,36 +1425,31 @@ def _handle_invalid_blocks(self, invalid_block_ids: set[int]) -> set[str]:
         async_load_reqs = (
             req for req in self.waiting
             if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS)
-        (affected_req_ids, num_tokens_to_reschedule,
-         marked_invalid_block_ids) = (
-             self._update_requests_with_invalid_blocks(async_load_reqs,
-                                                       invalid_block_ids))
+        async_affected_req_ids, num_tokens_to_reschedule = (
+            self._update_requests_with_invalid_blocks(async_load_reqs,
+                                                      invalid_block_ids))
 
-        total_requests_to_reschedule += len(affected_req_ids)
+        total_requests_to_reschedule += len(async_affected_req_ids)
         total_tokens_to_reschedule += num_tokens_to_reschedule
 
         # Mark requests with async KV load failures; they will be rescheduled
         # once loading completes
-        self.failed_recving_kv_req_ids |= affected_req_ids
-
-        # Remove async loaded invalid blocks already handled,
-        # as they cannot be shared with running requests.
-        invalid_block_ids.difference_update(marked_invalid_block_ids)
+        self.failed_recving_kv_req_ids |= async_affected_req_ids
 
         # --- Handle sync KV loads (running requests) ---
-        affected_req_ids, num_tokens_to_reschedule, _ = (
+        sync_affected_req_ids, num_tokens_to_reschedule = (
             self._update_requests_with_invalid_blocks(self.running,
                                                       invalid_block_ids))
 
-        total_requests_to_reschedule += len(affected_req_ids)
+        total_requests_to_reschedule += len(sync_affected_req_ids)
         total_tokens_to_reschedule += num_tokens_to_reschedule
 
         if total_requests_to_reschedule:
-            logger.info(
+            logger.warning(
                 "Recovered from KV load failure: "
                 "%d request(s) rescheduled (%d tokens affected).",
                 total_requests_to_reschedule, total_tokens_to_reschedule)
 
         # Return the IDs of affected running requests to skip in
         # update_from_output.
-        return affected_req_ids
+        return sync_affected_req_ids
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
@@ -48,9 +48,9 @@ class CachedRequestState:
     def __post_init__(self):
         self.num_prompt_tokens = length_from_prompt_token_ids_or_embeds(
             self.prompt_token_ids, self.prompt_embeds)
-        # 'last_generator_offset' and 'last_gelen_last_output_token_ids' are
-        # used to allow safe rollback in case a sampled token turns out to be
-        # invalid (e.g., due to KV load errors).
+        # 'last_generator_offset' and 'len_last_output_token_ids' are used to
+        # allow safe rollback in case a sampled token turns out to be invalid
+        # (e.g., due to KV load errors).
         self.last_generator_offset = 0 if self.generator else None
         self.len_last_output_token_ids = len(self.output_token_ids)
 
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
@@ -636,7 +636,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
             resumed_from_preemption = req_data.resumed_from_preemption[i]
 
             # Update the cached states.
-            if (num_computed_tokens <= req_state.num_computed_tokens):
+            if num_computed_tokens <= req_state.num_computed_tokens:
                 # The request was rescheduled after a KV load failure. Clear
                 # the last sampled tokens and rewind the generator state
                 len_output_token_ids = len(req_state.output_token_ids)