88from typing import Set , Tuple
99
1010from vllm .block import BlockTable , PhysicalTokenBlock
11+ from vllm .core .block .common import CacheMetricData
1112from vllm .core .block .utils import check_no_caching_or_swa_for_blockmgr_encdec
1213from vllm .core .evictor_v1 import EvictionPolicy , Evictor , make_evictor
1314from vllm .core .interfaces import AllocStatus , BlockSpaceManager
@@ -60,6 +61,11 @@ def contains_block(self, block_hash: int) -> bool:
6061 def update_hash (self , block_hash : int , block : PhysicalTokenBlock ):
6162 pass
6263
@abstractmethod
def get_prefix_cache_hit_rate(self) -> float:
    """Return the prefix cache hit rate of this allocator.

    A value of -1 means prefix caching is not supported or is disabled.
    """
    ...
68+
6369
6470class CachedBlockAllocator (BlockAllocatorBase ):
6571 """Manages free physical token blocks for a device.
@@ -85,6 +91,8 @@ def __init__(self,
8591
8692 self .default_hash_ctr = count ()
8793
94+ self .cache_metric_data = CacheMetricData ()
95+
8896 def allocate_block (self , block_hash : int ,
8997 num_hashed_tokens : int ) -> PhysicalTokenBlock :
9098 if self .current_num_blocks == self .num_blocks :
@@ -105,15 +113,17 @@ def allocate(self,
105113 num_hashed_tokens : int = 0 ) -> PhysicalTokenBlock :
106114 if block_hash is None :
107115 block_hash = next (self .default_hash_ctr )
116+
108117 if block_hash in self .evictor :
109118 assert block_hash not in self .cached_blocks
110119 block = self .evictor .remove (block_hash )
111120 assert block .ref_count == 0
112121 self .cached_blocks [block_hash ] = block
113- block .ref_count += 1
114- assert block .block_hash == block_hash
115- return block
116- if block_hash not in self .cached_blocks :
122+
123+ if block_hash in self .cached_blocks :
124+ self .cache_metric_data .query (hit = True )
125+ else :
126+ self .cache_metric_data .query (hit = False )
117127 self .cached_blocks [block_hash ] = self .allocate_block (
118128 block_hash , num_hashed_tokens )
119129 block = self .cached_blocks [block_hash ]
@@ -150,6 +160,9 @@ def update_hash(self, block_hash: int, block: PhysicalTokenBlock):
150160 del self .cached_blocks [old_hash ]
151161 self .cached_blocks [block_hash ] = block
152162
def get_prefix_cache_hit_rate(self) -> float:
    """Return the hit rate reported by this allocator's cache metrics.

    The value is whatever ``self.cache_metric_data.get_hit_rate()``
    yields; no post-processing is applied here.
    """
    metrics = self.cache_metric_data
    hit_rate = metrics.get_hit_rate()
    return hit_rate
165+
153166
154167class UncachedBlockAllocator (BlockAllocatorBase ):
155168 """Manages free physical token blocks for a device.
@@ -209,6 +222,9 @@ def update_hash(self, block_hash: int, block: PhysicalTokenBlock):
209222 raise NotImplementedError (
210223 "Invalid codepath for uncached block allocator." )
211224
def get_prefix_cache_hit_rate(self) -> float:
    """Return the sentinel hit rate for an allocator without caching.

    Returns:
        Always ``-1.0``. It is a float literal so the value matches the
        declared return type; it still compares equal to ``-1``.
    """
    return -1.0
227+
212228
213229class BlockSpaceManagerV1 (BlockSpaceManager ):
214230 """Manages the mapping between logical and physical token blocks."""
@@ -705,3 +721,10 @@ def mark_blocks_as_computed(self, seq_group: SequenceGroup):
705721 if self .enable_caching :
706722 for seq in seq_group .get_seqs ():
707723 self .compute_full_blocks_in_seq (seq )
724+
def get_prefix_cache_hit_rate(self, device: Device) -> float:
    """Return the prefix cache hit rate of the allocator for ``device``.

    Args:
        device: ``Device.GPU`` selects ``self.gpu_allocator``;
            ``Device.CPU`` selects ``self.cpu_allocator``.

    Returns:
        The value of the selected allocator's
        ``get_prefix_cache_hit_rate()``.

    Raises:
        ValueError: If ``device`` is neither ``Device.GPU`` nor
            ``Device.CPU``.
    """
    # Pick the allocator first so the delegating call appears only once.
    if device == Device.GPU:
        allocator = self.gpu_allocator
    elif device == Device.CPU:
        allocator = self.cpu_allocator
    else:
        raise ValueError(f"Invalid device: { device } ")
    return allocator.get_prefix_cache_hit_rate()
0 commit comments