From 5c5e75169a1bddeba45d04ab5a2313b88b6aa9cb Mon Sep 17 00:00:00 2001
From: "rshaw@neuralmagic.com" <rshaw@neuralmagic.com>
Date: Wed, 30 Apr 2025 00:13:18 +0000
Subject: [PATCH] updated

Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
---
 .../openai_completion_client.py  |  6 ++++--
 vllm/v1/core/block_pool.py       |  2 --
 vllm/v1/core/kv_cache_manager.py | 19 +++++++++++++------
 3 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/examples/online_serving/openai_completion_client.py b/examples/online_serving/openai_completion_client.py
index b31ebcccce3b..1f8a7e5b078c 100644
--- a/examples/online_serving/openai_completion_client.py
+++ b/examples/online_serving/openai_completion_client.py
@@ -6,6 +6,9 @@
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8192/v1"
 
+PROMPT = "The absolute best part about working for Red Hat is that we get to work on open source software. Red Hat is a leader in many key open source infrastructure technologies like Linux, Kubernetes, and recently vLLM, which means that there is a lot of opportunity to work with community and customers on key infrastructure projects. This means"  # noqa: E501
+PROMPT = "The absolute best part about working for Red Hat is that we get to work on open source software. Red Hat is a leader in many key open source infrastructure technologies like Linux, Kubernetes, and recently vLLM, "  # noqa: E501
+
 
 def main():
     client = OpenAI(
@@ -21,8 +24,7 @@ def main():
     stream = True
     completion = client.completions.create(
         model="meta-llama/Llama-3.1-8B-Instruct",
-        prompt=
-        "The absolute best part about working for Red Hat is that we get to work on open source software. Red Hat is a leader in many key open source infrastructure technologies like Linux, Kubernetes, and recently vLLM, which means that there is a lot of opportunity to work with community and customers on key infrastructure projects. This means",  # noqa: E501
+        prompt=PROMPT,
         echo=False,
         stream=stream)
 
diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py
index ffbe6d74e868..74f3f7852c9a 100644
--- a/vllm/v1/core/block_pool.py
+++ b/vllm/v1/core/block_pool.py
@@ -117,8 +117,6 @@ def cache_full_blocks(
             prev_block_hash_value = prev_block.block_hash.hash_value
 
         for i, blk in enumerate(new_full_blocks):
-            if blk.block_hash is not None:
-                continue
             assert blk.block_hash is None
 
             if i < len(new_block_hashes):
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index ff5485932dcb..41d9f1b65c23 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -279,12 +279,19 @@ def allocate_slots(
             new_blocks = self.block_pool.get_new_blocks(num_new_blocks)
             req_blocks.extend(new_blocks)
 
-        if not self.enable_caching or skip_cache_blocks:
-            # If self.enable_caching, this is true since can only
-            # get to this codepath when we have never been scheduled.
-            assert request.request_id not in self.num_cached_block
+        if not self.enable_caching:
             return new_blocks
 
+        if skip_cache_blocks:
+            # NOTE(rob): this assert is valid because we only pass
+            # skip_cache_blocks=True the first time a request is in
+            # WAITING during a P/D setup.
+            assert request.request_id not in self.num_cached_block
+            # NOTE(rob): this is necessary so we don't double-cache
+            # a block after it has finished receiving.
+            self.num_cached_block[request.request_id] = len(
+                new_computed_blocks)
+
         self.cache_blocks(
             request=request,
             num_tokens=num_tokens,
@@ -313,8 +320,8 @@ def cache_blocks(
         # Speculated tokens might be rejected in the future, so we do
         # not cache any speculated tokens. We only cache blocks with
         # generated (accepted) tokens.
-        num_full_blocks_after_append = (
-            num_computed_tokens + num_tokens - len(request.spec_token_ids)) // self.block_size
+        num_full_blocks_after_append = (num_computed_tokens + num_tokens - len(
+            request.spec_token_ids)) // self.block_size
 
         self.block_pool.cache_full_blocks(
             request=request,
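
To make the arithmetic in the final kv_cache_manager.py hunk concrete, here is a minimal sketch (illustrative only, not part of the patch; the helper name and the sample numbers are invented, and block_size stands in for self.block_size) of how the number of full blocks eligible for prefix caching is counted while leaving speculated tokens out:

# Illustrative sketch only; not part of the patch.
def full_blocks_after_append(num_computed_tokens: int, num_tokens: int,
                             num_spec_tokens: int, block_size: int) -> int:
    # Count only blocks completely filled by computed plus newly
    # scheduled tokens; speculated tokens are excluded because they
    # may still be rejected and must not be prefix-cached.
    return (num_computed_tokens + num_tokens - num_spec_tokens) // block_size

if __name__ == "__main__":
    # 48 tokens already computed, 20 newly scheduled, 4 of them
    # speculative, block size 16: (48 + 20 - 4) // 16 = 4 full blocks.
    print(full_blocks_after_append(48, 20, 4, 16))  # prints 4

Subtracting len(request.spec_token_ids) before the floor division keeps any block whose contents could still be rolled back out of the prefix cache, matching the comment in the hunk above.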