
Commit 0be087a

zhuohan123 authored and rtourgeman committed

[Core] Don't count preempted tokens in prefix cache hit rate (vllm-project#25787)

Signed-off-by: Zhuohan Li <[email protected]>

1 parent ffe5fcc · commit 0be087a

File tree

4 files changed: +60 −41 lines

- vllm/v1/core/kv_cache_manager.py
- vllm/v1/core/sched/scheduler.py
- vllm/v1/metrics/stats.py
- vllm/v1/request.py

vllm/v1/core/kv_cache_manager.py

Lines changed: 16 additions & 8 deletions

```diff
@@ -27,8 +27,8 @@ class KVCacheBlocks:
     `blocks[i][j]` refers to the i-th kv_cache_group
     and the j-th block of tokens.We don't use block of
     tokens as the outer dimension because it assumes all
-    kv_cache_groups have the same number of blocks, which is true for now but
-    will be broken if we want to give different block_size to different
+    kv_cache_groups have the same number of blocks, which is true for now but
+    will be broken if we want to give different block_size to different
     kv_cache_groups in the future.
     """

@@ -184,9 +184,17 @@ def get_computed_blocks(self,

         if self.log_stats:
             assert self.prefix_cache_stats is not None
-            self.prefix_cache_stats.requests += 1
-            self.prefix_cache_stats.queries += request.num_tokens
-            self.prefix_cache_stats.hits += num_new_computed_tokens
+            if request.num_preemptions > 0:
+                # Previously preempted request
+                self.prefix_cache_stats.preempted_requests += 1
+                self.prefix_cache_stats.preempted_queries += request.num_tokens
+                self.prefix_cache_stats.preempted_hits += (
+                    num_new_computed_tokens)
+            else:
+                # New request
+                self.prefix_cache_stats.requests += 1
+                self.prefix_cache_stats.queries += request.num_tokens
+                self.prefix_cache_stats.hits += num_new_computed_tokens

         return KVCacheBlocks(computed_blocks), num_new_computed_tokens

@@ -209,10 +217,10 @@ def allocate_slots(
                 already been computed locally (i.e. new_computed_blocks).
             num_new_computed_tokens: The number of new computed tokens just
                 hitting the prefix caching, excluding external tokens.
-            new_computed_blocks: The cached blocks for the above new computed
+            new_computed_blocks: The cached blocks for the above new computed
                 tokens.
             num_lookahead_tokens: The number of speculative tokens to allocate.
-                This is used by spec decode proposers with kv-cache such
+                This is used by spec decode proposers with kv-cache such
                 as eagle.
             delay_cache_blocks: Whether to skip caching the blocks. This is
                 used by P/D when allocating blocks used in a KV transfer
@@ -365,7 +373,7 @@ def get_num_common_prefix_blocks(
             requests in the current step.

         Returns:
-            list[int]: The number of common prefix blocks for each kv cache
+            list[int]: The number of common prefix blocks for each kv cache
                 group.
         """
         assert request.status == RequestStatus.RUNNING
```
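Read together, the branch above keeps re-queries from preempted requests out of the headline hit rate. Below is a minimal, self-contained sketch of the accounting with made-up numbers; the `Stats` and `Req` classes are hypothetical stand-ins for illustration, not vLLM's actual classes:

```python
from dataclasses import dataclass


@dataclass
class Stats:
    # Mirrors the PrefixCacheStats fields touched by this commit.
    requests: int = 0
    queries: int = 0
    hits: int = 0
    preempted_requests: int = 0
    preempted_queries: int = 0
    preempted_hits: int = 0


@dataclass
class Req:
    num_tokens: int           # tokens queried against the prefix cache
    num_preemptions: int = 0  # counter added in this commit


def record(stats: Stats, req: Req, num_new_computed_tokens: int) -> None:
    # Same branching as get_computed_blocks() in the diff above.
    if req.num_preemptions > 0:
        # Previously preempted request: account for it separately.
        stats.preempted_requests += 1
        stats.preempted_queries += req.num_tokens
        stats.preempted_hits += num_new_computed_tokens
    else:
        # New request.
        stats.requests += 1
        stats.queries += req.num_tokens
        stats.hits += num_new_computed_tokens


stats = Stats()
# A fresh 100-token prompt hitting a 60-token cached prefix.
record(stats, Req(num_tokens=100), num_new_computed_tokens=60)
# The same request after a preemption: it re-queries all 100 tokens and
# hits nearly all of them, which says nothing about real cache quality.
record(stats, Req(num_tokens=100, num_preemptions=1), num_new_computed_tokens=96)

print(stats.hits / stats.queries)                      # 0.6, new requests only
print(stats.preempted_hits / stats.preempted_queries)  # 0.96, tracked apart
```

Without the split, the second call would have pushed the reported rate from 0.6 to 0.78 even though the cache served no new work.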

vllm/v1/core/sched/scheduler.py

Lines changed: 34 additions & 32 deletions

```diff
@@ -251,46 +251,48 @@ def schedule(self) -> SchedulerOutput:
                 req_index += 1
                 continue

+            # Schedule newly needed KV blocks for the request.
             while True:
                 new_blocks = self.kv_cache_manager.allocate_slots(
                     request,
                     num_new_tokens,
                     num_lookahead_tokens=self.num_lookahead_tokens)
-                if new_blocks is None:
-                    # The request cannot be scheduled.
-                    # Preempt the lowest-priority request.
-                    if self.policy == SchedulingPolicy.PRIORITY:
-                        preempted_req = max(
-                            self.running,
-                            key=lambda r: (r.priority, r.arrival_time),
-                        )
-                        self.running.remove(preempted_req)
-                        if preempted_req in scheduled_running_reqs:
-                            scheduled_running_reqs.remove(preempted_req)
-                    else:
-                        preempted_req = self.running.pop()
-
-                    self.kv_cache_manager.free(preempted_req)
-                    self.encoder_cache_manager.free(preempted_req)
-                    preempted_req.status = RequestStatus.PREEMPTED
-                    preempted_req.num_computed_tokens = 0
-                    if self.log_stats:
-                        preempted_req.record_event(
-                            EngineCoreEventType.PREEMPTED, scheduled_timestamp)
-
-                    self.waiting.prepend_request(preempted_req)
-                    preempted_reqs.append(preempted_req)
-                    if preempted_req == request:
-                        # No more request to preempt.
-                        can_schedule = False
-                        break
-                else:
+
+                if new_blocks is not None:
                     # The request can be scheduled.
-                    can_schedule = True
                     break
-            if not can_schedule:
+
+                # The request cannot be scheduled.
+                # Preempt the lowest-priority request.
+                if self.policy == SchedulingPolicy.PRIORITY:
+                    preempted_req = max(
+                        self.running,
+                        key=lambda r: (r.priority, r.arrival_time),
+                    )
+                    self.running.remove(preempted_req)
+                    if preempted_req in scheduled_running_reqs:
+                        scheduled_running_reqs.remove(preempted_req)
+                else:
+                    preempted_req = self.running.pop()
+
+                self.kv_cache_manager.free(preempted_req)
+                self.encoder_cache_manager.free(preempted_req)
+                preempted_req.status = RequestStatus.PREEMPTED
+                preempted_req.num_computed_tokens = 0
+                preempted_req.num_preemptions += 1
+                if self.log_stats:
+                    preempted_req.record_event(EngineCoreEventType.PREEMPTED,
+                                               scheduled_timestamp)
+
+                self.waiting.prepend_request(preempted_req)
+                preempted_reqs.append(preempted_req)
+                if preempted_req == request:
+                    # No more request to preempt. Cannot schedule this request.
+                    break
+
+            if new_blocks is None:
+                # Cannot schedule this request.
                 break
-            assert new_blocks is not None

             # Schedule the request.
             scheduled_running_reqs.append(request)
```
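The rewrite above flattens the old `can_schedule` flag into early exits: allocate, else preempt the lowest-priority running request and retry, giving up once the request ends up preempting itself. A stripped-down sketch of that control flow; `Req`, `try_allocate`, and `free` are hypothetical stand-ins for the scheduler's real collaborators, and only the FCFS (`pop()`) policy is modeled:

```python
from dataclasses import dataclass


@dataclass
class Req:
    name: str
    num_preemptions: int = 0


def allocate_with_preemption(request, running, try_allocate, free):
    """Shape of the rewritten loop: allocate, else preempt the newest
    running request (FCFS policy) and retry; give up once `request`
    itself has been preempted."""
    while True:
        new_blocks = try_allocate(request)
        if new_blocks is not None:
            # The request can be scheduled.
            return new_blocks

        # The request cannot be scheduled: preempt and retry.
        preempted = running.pop()
        free(preempted)
        preempted.num_preemptions += 1  # the counter added by this commit
        if preempted is request:
            # No more requests to preempt. Cannot schedule this request.
            return None


# Tiny demo: no free blocks until a victim is preempted.
pool = {"free": 0}

def try_allocate(req):
    if pool["free"] > 0:
        pool["free"] -= 1
        return ["<block>"]
    return None

def free(req):
    pool["free"] += 1

a, b = Req("a"), Req("b")
print(allocate_with_preemption(a, [a, b], try_allocate, free))  # ['<block>']
print(b.num_preemptions)  # 1: b was preempted so that a could be scheduled
```

Structuring the loop this way lets the `num_preemptions` bump sit on the single preemption path instead of being duplicated across flag-handling branches.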

vllm/v1/metrics/stats.py

Lines changed: 7 additions & 1 deletion

```diff
@@ -17,13 +17,19 @@ class PrefixCacheStats:
     """Stores prefix cache hit statistics."""
     # Whether reset_prefix_cache was invoked.
     reset: bool = False
-    # The number of requests in this update.
+    # The number of new requests in this update.
     requests: int = 0
     # The number of queries in these requests. Note that "queries" here
     # means the number of tokens that were queried from the cache.
     queries: int = 0
     # The number of hits in these requests.
     hits: int = 0
+    # The number of previously preempted requests in this update.
+    preempted_requests: int = 0
+    # The `queries` number for preempted requests.
+    preempted_queries: int = 0
+    # The `hits` number for preempted requests.
+    preempted_hits: int = 0


 @dataclass
```
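With the split fields, a metrics consumer can report the two hit rates separately. A sketch under the assumption that downstream code divides hits by queries per bucket; the `hit_rate` helper is hypothetical, not part of this commit:

```python
from dataclasses import dataclass


@dataclass
class PrefixCacheStats:
    # Same fields as the dataclass in the diff above.
    reset: bool = False
    requests: int = 0
    queries: int = 0
    hits: int = 0
    preempted_requests: int = 0
    preempted_queries: int = 0
    preempted_hits: int = 0


def hit_rate(stats: PrefixCacheStats, include_preempted: bool = False) -> float:
    """Hypothetical helper: by default, measure only new requests, so
    re-queried tokens from preempted requests cannot inflate the rate."""
    queries, hits = stats.queries, stats.hits
    if include_preempted:
        queries += stats.preempted_queries
        hits += stats.preempted_hits
    return hits / queries if queries else 0.0


s = PrefixCacheStats(queries=1000, hits=300,
                     preempted_queries=500, preempted_hits=490)
print(f"{hit_rate(s):.2%}")                          # 30.00%
print(f"{hit_rate(s, include_preempted=True):.2%}")  # 52.67%
```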

vllm/v1/request.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -115,6 +115,9 @@ def __init__(
         # indicates that the output is corrupted
         self.num_nans_in_logits = 0

+        # The number of requests being preempted by the scheduler
+        self.num_preemptions = 0
+
         self.block_hashes: list[BlockHash] = []
         self.get_hash_new_full_blocks: Optional[Callable[
             [], list[BlockHash]]] = None
```
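The counter initialized here is the glue between the two earlier diffs: the scheduler increments it on preemption, and the KV-cache manager reads it to pick a stats bucket. A toy trace under those assumptions; the `Request` class below is a stand-in, not vLLM's:

```python
from dataclasses import dataclass


@dataclass
class Request:
    num_preemptions: int = 0  # the field added in this commit


req = Request()
assert req.num_preemptions == 0  # fresh request: stats go to the new buckets

req.num_preemptions += 1         # the scheduler preempts it (scheduler.py)
# When it is rescheduled, num_preemptions > 0 routes its queries and hits
# into the preempted_* buckets instead (kv_cache_manager.py).
assert req.num_preemptions > 0
```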
