From 5c5e75169a1bddeba45d04ab5a2313b88b6aa9cb Mon Sep 17 00:00:00 2001
From: "rshaw@neuralmagic.com" <rshaw@neuralmagic.com>
Date: Wed, 30 Apr 2025 00:13:18 +0000
Subject: [PATCH] updated

Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
---
 .../openai_completion_client.py  |  6 ++++--
 vllm/v1/core/block_pool.py       |  2 --
 vllm/v1/core/kv_cache_manager.py | 19 +++++++++++++------
 3 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/examples/online_serving/openai_completion_client.py b/examples/online_serving/openai_completion_client.py
index b31ebcccce3b..1f8a7e5b078c 100644
--- a/examples/online_serving/openai_completion_client.py
+++ b/examples/online_serving/openai_completion_client.py
@@ -6,6 +6,9 @@
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8192/v1"
 
+PROMPT = "The absolute best part about working for Red Hat is that we get to work on open source software. Red Hat is a leader in many key open source infrastructure technologies like Linux, Kubernetes, and recently vLLM, which means that there is a lot of opportunity to work with community and customers on key infrastructure projects. This means"  # noqa: E501
+PROMPT = "The absolute best part about working for Red Hat is that we get to work on open source software. Red Hat is a leader in many key open source infrastructure technologies like Linux, Kubernetes, and recently vLLM, "  # noqa: E501
+
 
 def main():
     client = OpenAI(
@@ -21,8 +24,7 @@ def main():
     stream = True
     completion = client.completions.create(
         model="meta-llama/Llama-3.1-8B-Instruct",
-        prompt=
-        "The absolute best part about working for Red Hat is that we get to work on open source software. Red Hat is a leader in many key open source infrastructure technologies like Linux, Kubernetes, and recently vLLM, which means that there is a lot of opportunity to work with community and customers on key infrastructure projects. This means",  # noqa: E501
+        prompt=PROMPT,
         echo=False,
         stream=stream)
 
diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py
index ffbe6d74e868..74f3f7852c9a 100644
--- a/vllm/v1/core/block_pool.py
+++ b/vllm/v1/core/block_pool.py
@@ -117,8 +117,6 @@ def cache_full_blocks(
             prev_block_hash_value = prev_block.block_hash.hash_value
 
         for i, blk in enumerate(new_full_blocks):
-            if blk.block_hash is not None:
-                continue
             assert blk.block_hash is None
 
             if i < len(new_block_hashes):
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index ff5485932dcb..41d9f1b65c23 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -279,12 +279,19 @@ def allocate_slots(
             new_blocks = self.block_pool.get_new_blocks(num_new_blocks)
             req_blocks.extend(new_blocks)
 
-        if not self.enable_caching or skip_cache_blocks:
-            # If self.enable_caching, this is true since can only
-            # get to this codepath when we have never been scheduled.
-            assert request.request_id not in self.num_cached_block
+        if not self.enable_caching:
             return new_blocks
 
+        if skip_cache_blocks:
+            # NOTE(rob): this assert is valid because we only pass
+            # skip_cache_blocks=True the first time a request is in
+            # WAITING during a P/D setup.
+            assert request.request_id not in self.num_cached_block
+            # NOTE(rob): this is necessary so we don't double-cache
+            # a block after it has finished receiving.
+            self.num_cached_block[request.request_id] = len(
+                new_computed_blocks)
+
         self.cache_blocks(
             request=request,
             num_tokens=num_tokens,
@@ -313,8 +320,8 @@ def cache_blocks(
         # Speculated tokens might be rejected in the future, so we do
         # not cache any speculated tokens. We only cache blocks with
         # generated (accepted) tokens.
-        num_full_blocks_after_append = (
-            num_computed_tokens + num_tokens - len(request.spec_token_ids)) // self.block_size
+        num_full_blocks_after_append = (num_computed_tokens + num_tokens - len(
+            request.spec_token_ids)) // self.block_size
 
         self.block_pool.cache_full_blocks(
             request=request,
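
To make the arithmetic in the final kv_cache_manager.py hunk concrete, here is a minimal sketch (illustrative only, not part of the patch; the helper name and the sample numbers are invented, and block_size stands in for self.block_size) of how the number of full blocks eligible for prefix caching is counted while leaving speculated tokens out:

# Illustrative sketch only; not part of the patch.
def full_blocks_after_append(num_computed_tokens: int, num_tokens: int,
                             num_spec_tokens: int, block_size: int) -> int:
    # Count only blocks completely filled by computed plus newly
    # scheduled tokens; speculated tokens are excluded because they
    # may still be rejected and must not be prefix-cached.
    return (num_computed_tokens + num_tokens - num_spec_tokens) // block_size

if __name__ == "__main__":
    # 48 tokens already computed, 20 newly scheduled, 4 of them
    # speculative, block size 16: (48 + 20 - 4) // 16 = 4 full blocks.
    print(full_blocks_after_append(48, 20, 4, 16))  # prints 4

Subtracting len(request.spec_token_ids) before the floor division keeps any block whose contents could still be rolled back out of the prefix cache, matching the comment in the hunk above.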