From 37915dee051c57df212869d22c2019ea3c716b95 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Thu, 30 Jan 2025 23:07:08 +0000 Subject: [PATCH 1/7] done Signed-off-by: Cody Yu --- tests/entrypoints/openai/test_metrics.py | 1 + vllm/v1/core/kv_cache_manager.py | 28 +++++++++++++++++++++++- vllm/v1/core/kv_cache_utils.py | 12 +++++++++- vllm/v1/core/scheduler.py | 2 ++ vllm/v1/metrics/loggers.py | 14 ++++++++++-- vllm/v1/metrics/stats.py | 2 +- 6 files changed, 54 insertions(+), 5 deletions(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index de2333901cc9..06477495840c 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -203,6 +203,7 @@ async def test_metrics_counts(server: RemoteOpenAIServer, "vllm:num_requests_running", "vllm:num_requests_waiting", "vllm:gpu_cache_usage_perc", + "vllm:gpu_prefix_cache_hit_rate_perc", "vllm:prompt_tokens_total", "vllm:generation_tokens_total", "vllm:request_success_total", diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index f8d08d0e4023..2cbdfd443a42 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -6,7 +6,7 @@ from vllm.logger import init_logger from vllm.utils import cdiv from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue, - KVCacheBlock, + KVCacheBlock, PrefixCachingMetrics, generate_block_hash_extra_keys, hash_block_tokens, hash_request_tokens) @@ -78,11 +78,34 @@ def __init__( self.req_to_block_hashes: DefaultDict[ str, List[BlockHashType]] = defaultdict(list) + # Prefix cache metrics. + self.prefix_caching_metrics: PrefixCachingMetrics = { + "query_total": 0, + "query_hit": 0, + } + @property def usage(self) -> float: + """Get the KV cache usage. + + Returns: + The KV cache usage (between 0.0 and 1.0). + """ return 1.0 - (self.free_block_queue.num_free_blocks / self.num_gpu_blocks) + @property + def prefix_cache_hit_rate(self) -> float: + """Get the overall hit rate of prefix caching. + + Returns: + The hit rate of prefix caching (between 0.0 and 1.0). + """ + if self.prefix_caching_metrics["query_total"] == 0: + return 0.0 + return self.prefix_caching_metrics[ + "query_hit"] / self.prefix_caching_metrics["query_total"] + def get_computed_blocks( self, request: Request) -> Tuple[List[KVCacheBlock], int]: """Get the computed (cached) blocks for the request. @@ -118,6 +141,9 @@ def get_computed_blocks( else: break + self.prefix_caching_metrics["query_total"] += len(block_hashes) + self.prefix_caching_metrics["query_hit"] += len(computed_blocks) + # NOTE(woosuk): Since incomplete blocks are not eligible for # sharing, `num_computed_tokens` is always a multiple of # `block_size`. 
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 6888f1a3e182..df5d4656a36a 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -2,7 +2,7 @@ """KV-Cache Utilities.""" from collections.abc import Sequence from dataclasses import dataclass -from typing import Any, List, NamedTuple, Optional, Tuple +from typing import Any, List, NamedTuple, Optional, Tuple, TypedDict from vllm.config import VllmConfig from vllm.logger import init_logger @@ -28,6 +28,16 @@ class BlockHashType(NamedTuple): extra_keys: Optional[Any] = None +class PrefixCachingMetrics(TypedDict): + """Metrics for prefix caching.""" + + query_total: int + """The total number of queries.""" + + query_hit: int + """The number of queries that hit the prefix cache.""" + + @dataclass class KVCacheBlock: """KV-cache block metadata.""" diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 1aa34ee38602..c418486b489e 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -599,6 +599,8 @@ def make_stats(self) -> SchedulerStats: num_running_reqs=len(self.running), num_waiting_reqs=len(self.waiting), gpu_cache_usage=self.kv_cache_manager.usage, + gpu_prefix_cache_hit_rate=self.kv_cache_manager. + prefix_cache_hit_rate, ) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index eb1acf584c6b..1b06896f5496 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -72,13 +72,15 @@ def log(self, scheduler_stats: SchedulerStats, logger.info( "Avg prompt throughput: %.1f tokens/s, " "Avg generation throughput: %.1f tokens/s, " - "Running: %d reqs, Waiting: %d reqs " - "GPU KV cache usage: %.1f%%.", + "Running: %d reqs, Waiting: %d reqs, " + "GPU KV cache usage: %.1f%%, " + "GPU prefix cache hit rate: %.1f%%", prompt_throughput, generation_throughput, scheduler_stats.num_running_reqs, scheduler_stats.num_waiting_reqs, scheduler_stats.gpu_cache_usage * 100, + scheduler_stats.gpu_prefix_cache_hit_rate * 100, ) @@ -107,6 +109,12 @@ def __init__(self, model_config: ModelConfig): documentation="GPU KV-cache usage. 1 means 100 percent usage.", labelnames=labelnames).labels(*labelvalues) + self.gauge_gpu_prefix_cache_hit_rate = prometheus_client.Gauge( + name="vllm:gpu_prefix_cache_hit_rate_perc", + documentation= + "GPU prefix cache hit rate. 
1 means 100 percent hit rate.", + labelnames=labelnames).labels(*labelvalues) + self.counter_prompt_tokens = prometheus_client.Counter( name="vllm:prompt_tokens_total", documentation="Number of prefill tokens processed.", @@ -169,6 +177,8 @@ def log(self, scheduler_stats: SchedulerStats, self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs) self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage) + self.gauge_gpu_prefix_cache_hit_rate.set( + scheduler_stats.gpu_prefix_cache_hit_rate) self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens) self.counter_generation_tokens.inc( diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 5e588d35ea4d..aee91cfead31 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -17,7 +17,7 @@ class SchedulerStats: num_waiting_reqs: int = 0 gpu_cache_usage: float = 0.0 - # gpu_prefix_cache_hit_rate: float = 0.0 + gpu_prefix_cache_hit_rate: float = 0.0 @dataclass From 6494dde9e1788242083f76371669bcf11fbafe90 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Fri, 31 Jan 2025 17:23:13 +0000 Subject: [PATCH 2/7] reset Signed-off-by: Cody Yu --- vllm/v1/core/kv_cache_manager.py | 18 +++++++++++------- vllm/v1/core/scheduler.py | 2 +- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 2cbdfd443a42..da18a2c63055 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -94,17 +94,21 @@ def usage(self) -> float: return 1.0 - (self.free_block_queue.num_free_blocks / self.num_gpu_blocks) - @property - def prefix_cache_hit_rate(self) -> float: - """Get the overall hit rate of prefix caching. + def get_and_reset_prefix_cache_hit_rate(self) -> float: + """Get the overall hit rate of prefix caching and reset + the metrics. Returns: The hit rate of prefix caching (between 0.0 and 1.0). """ - if self.prefix_caching_metrics["query_total"] == 0: - return 0.0 - return self.prefix_caching_metrics[ - "query_hit"] / self.prefix_caching_metrics["query_total"] + hit_rate = 0.0 + if self.prefix_caching_metrics["query_total"] > 0: + hit_rate = self.prefix_caching_metrics[ + "query_hit"] / self.prefix_caching_metrics["query_total"] + + self.prefix_caching_metrics["query_hit"] = 0 + self.prefix_caching_metrics["query_total"] = 0 + return hit_rate def get_computed_blocks( self, request: Request) -> Tuple[List[KVCacheBlock], int]: diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index c418486b489e..e58dfdf6bd01 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -600,7 +600,7 @@ def make_stats(self) -> SchedulerStats: num_waiting_reqs=len(self.waiting), gpu_cache_usage=self.kv_cache_manager.usage, gpu_prefix_cache_hit_rate=self.kv_cache_manager. 
- prefix_cache_hit_rate, + get_and_reset_prefix_cache_hit_rate(), ) From 7d0bed5d85ce5607a230d027ba8d0785a19c053e Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Fri, 31 Jan 2025 19:16:09 +0000 Subject: [PATCH 3/7] improve Signed-off-by: Cody Yu --- vllm/v1/core/kv_cache_manager.py | 31 ++++++++----------- vllm/v1/core/kv_cache_utils.py | 53 +++++++++++++++++++++++++++----- vllm/v1/core/scheduler.py | 2 +- 3 files changed, 60 insertions(+), 26 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index da18a2c63055..043b8c2aa56e 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -79,10 +79,7 @@ def __init__( str, List[BlockHashType]] = defaultdict(list) # Prefix cache metrics. - self.prefix_caching_metrics: PrefixCachingMetrics = { - "query_total": 0, - "query_hit": 0, - } + self.prefix_caching_metrics = PrefixCachingMetrics() @property def usage(self) -> float: @@ -94,21 +91,14 @@ def usage(self) -> float: return 1.0 - (self.free_block_queue.num_free_blocks / self.num_gpu_blocks) - def get_and_reset_prefix_cache_hit_rate(self) -> float: - """Get the overall hit rate of prefix caching and reset - the metrics. + @property + def prefix_cache_hit_rate(self) -> float: + """Get the prefix caching hit rate. Returns: - The hit rate of prefix caching (between 0.0 and 1.0). + The prefix caching hit rate. """ - hit_rate = 0.0 - if self.prefix_caching_metrics["query_total"] > 0: - hit_rate = self.prefix_caching_metrics[ - "query_hit"] / self.prefix_caching_metrics["query_total"] - - self.prefix_caching_metrics["query_hit"] = 0 - self.prefix_caching_metrics["query_total"] = 0 - return hit_rate + return self.prefix_caching_metrics.hit_rate def get_computed_blocks( self, request: Request) -> Tuple[List[KVCacheBlock], int]: @@ -145,8 +135,10 @@ def get_computed_blocks( else: break - self.prefix_caching_metrics["query_total"] += len(block_hashes) - self.prefix_caching_metrics["query_hit"] += len(computed_blocks) + self.prefix_caching_metrics.add_request_query( + num_queries=len(block_hashes), + num_hits=len(computed_blocks), + ) # NOTE(woosuk): Since incomplete blocks are not eligible for # sharing, `num_computed_tokens` is always a multiple of @@ -310,6 +302,9 @@ def reset_prefix_cache(self) -> bool: for block in self.block_pool: block.reset_hash() + # Reset the prefix caching metrics. + self.prefix_caching_metrics.reset() + logger.info("Successfully reset prefix cache") return True diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index df5d4656a36a..d23061911eb1 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -1,8 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 """KV-Cache Utilities.""" +from collections import deque from collections.abc import Sequence from dataclasses import dataclass -from typing import Any, List, NamedTuple, Optional, Tuple, TypedDict +from typing import Any, List, NamedTuple, Optional, Tuple from vllm.config import VllmConfig from vllm.logger import init_logger @@ -28,14 +29,52 @@ class BlockHashType(NamedTuple): extra_keys: Optional[Any] = None -class PrefixCachingMetrics(TypedDict): - """Metrics for prefix caching.""" +class PrefixCachingMetrics: + """Metrics for prefix caching with a hit rate of the most recent N requests. - query_total: int - """The total number of queries.""" + Args: + interval: The number of the most recent requests to aggregate. + Defaults to 1000. 
+ """ + + def __init__(self, interval: int = 1000): + self.interval = interval + self.aggregated_query_total = 0 + self.aggregated_query_hit = 0 + self.request_queries: deque[Tuple[int, int]] = deque() - query_hit: int - """The number of queries that hit the prefix cache.""" + def add_request_query(self, num_queries: int, num_hits: int): + """Add a request to the metrics. This function is called when + a new request is being scheduled and is looking for computed blocks. + When there are more than `interval` requests, the oldest request + is removed from the metrics. + + Args: + num_queries: The number of queries in the request. + num_hits: The number of hits in the request. + """ + + self.request_queries.append((num_queries, num_hits)) + if len(self.request_queries) > self.interval: + old_num_queries, old_num_hits = self.request_queries.popleft() + self.aggregated_query_total -= old_num_queries + self.aggregated_query_hit -= old_num_hits + + self.aggregated_query_total += num_queries + self.aggregated_query_hit += num_hits + + def reset(self): + """Reset the metrics.""" + self.aggregated_query_total = 0 + self.aggregated_query_hit = 0 + self.request_queries.clear() + + @property + def hit_rate(self) -> float: + """Calculate the hit rate for the past N requests.""" + if self.aggregated_query_total == 0: + return 0.0 + return self.aggregated_query_hit / self.aggregated_query_total @dataclass diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index e58dfdf6bd01..c418486b489e 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -600,7 +600,7 @@ def make_stats(self) -> SchedulerStats: num_waiting_reqs=len(self.waiting), gpu_cache_usage=self.kv_cache_manager.usage, gpu_prefix_cache_hit_rate=self.kv_cache_manager. - get_and_reset_prefix_cache_hit_rate(), + prefix_cache_hit_rate, ) From fd10665c78273d1d633459b8c46d0263eb477b7e Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Fri, 31 Jan 2025 19:32:33 +0000 Subject: [PATCH 4/7] test Signed-off-by: Cody Yu --- tests/v1/core/test_kv_cache_utils.py | 33 +++++++++++++++++++++++++++- vllm/v1/core/kv_cache_manager.py | 4 ++-- vllm/v1/core/kv_cache_utils.py | 12 +++++----- 3 files changed, 41 insertions(+), 8 deletions(-) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 8df4cbe1be71..de6c99c092be 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -5,7 +5,7 @@ from vllm.multimodal.inputs import MultiModalKwargs from vllm.sampling_params import SamplingParams from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue, - KVCacheBlock, + KVCacheBlock, PrefixCachingMetrics, generate_block_hash_extra_keys, hash_block_tokens, hash_request_tokens) @@ -277,3 +277,34 @@ def test_hash_request_tokens_no_mm_inputs(): assert block_hashes[0].extra_keys is None assert block_hashes[1].token_ids == (3, 4, 5) assert block_hashes[1].extra_keys is None + + +def test_metrics(): + """ + Test the prefix caching metrics. 
+ """ + metrics = PrefixCachingMetrics(interval=5) + assert metrics.hit_rate == 0.0 + + metrics.add_request_query(20, 9) + # 9 / 20 = 0.45 + assert metrics.hit_rate == 0.45 + + for _ in range(4): + metrics.add_request_query(20, 4) + + # 25 / 100 = 0.25 + assert metrics.hit_rate == 0.25 + + metrics.add_request_query(10, 2) + + # Remove (20, 9) and add (10, 2): 18 / 90 = 0.2 + assert metrics.aggregated_query_total == 90 + assert metrics.aggregated_query_hit == 18 + assert metrics.hit_rate == 0.2 + + metrics.reset() + assert metrics.hit_rate == 0.0 + assert metrics.aggregated_query_total == 0 + assert metrics.aggregated_query_hit == 0 + assert not metrics.query_queue diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 043b8c2aa56e..6a322be8a72f 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -78,8 +78,8 @@ def __init__( self.req_to_block_hashes: DefaultDict[ str, List[BlockHashType]] = defaultdict(list) - # Prefix cache metrics. - self.prefix_caching_metrics = PrefixCachingMetrics() + # Prefix cache metrics. TODO: Make the interval configurable. + self.prefix_caching_metrics = PrefixCachingMetrics(interval=1000) @property def usage(self) -> float: diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index d23061911eb1..497891557972 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -39,9 +39,11 @@ class PrefixCachingMetrics: def __init__(self, interval: int = 1000): self.interval = interval + # The current aggregated query total and hit. self.aggregated_query_total = 0 self.aggregated_query_hit = 0 - self.request_queries: deque[Tuple[int, int]] = deque() + # A deque of (num_queries, num_hits) for the most recent requests. + self.query_queue: deque[Tuple[int, int]] = deque() def add_request_query(self, num_queries: int, num_hits: int): """Add a request to the metrics. This function is called when @@ -54,9 +56,9 @@ def add_request_query(self, num_queries: int, num_hits: int): num_hits: The number of hits in the request. """ - self.request_queries.append((num_queries, num_hits)) - if len(self.request_queries) > self.interval: - old_num_queries, old_num_hits = self.request_queries.popleft() + self.query_queue.append((num_queries, num_hits)) + if len(self.query_queue) > self.interval: + old_num_queries, old_num_hits = self.query_queue.popleft() self.aggregated_query_total -= old_num_queries self.aggregated_query_hit -= old_num_hits @@ -67,7 +69,7 @@ def reset(self): """Reset the metrics.""" self.aggregated_query_total = 0 self.aggregated_query_hit = 0 - self.request_queries.clear() + self.query_queue.clear() @property def hit_rate(self) -> float: From 60e1637801c3f32c39d350ec6e5ac1b1c2ddc18e Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Wed, 5 Feb 2025 08:21:40 -0500 Subject: [PATCH 5/7] [V1][Metrics] Replace prefix cache hit rate with queries/hits counters With a hit rate metric, we need to avoid the situation where if the hit rate drops due to a sudden increase in misses, the change gets diluted by the large accumulated totals. With Prometheus, it is better to log monotonic counters and use Prometheus queries to calculate rates according to whatever time interval makes sense to the operator, also allowing them to easily adjust the time interval in their PromQL queries. With the logging logger, we can continue to aggregate across an interval of (roughly) 1000 requests. 
Signed-off-by: Mark McLoughlin --- tests/entrypoints/openai/test_metrics.py | 3 +- tests/v1/core/test_kv_cache_utils.py | 14 +++++--- vllm/v1/core/kv_cache_manager.py | 27 +++++++------- vllm/v1/core/kv_cache_utils.py | 46 +++++++++++++++--------- vllm/v1/core/scheduler.py | 3 +- vllm/v1/metrics/loggers.py | 29 +++++++++++---- vllm/v1/metrics/stats.py | 15 ++++++-- 7 files changed, 90 insertions(+), 47 deletions(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 06477495840c..8c1bb1a897e3 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -203,7 +203,8 @@ async def test_metrics_counts(server: RemoteOpenAIServer, "vllm:num_requests_running", "vllm:num_requests_waiting", "vllm:gpu_cache_usage_perc", - "vllm:gpu_prefix_cache_hit_rate_perc", + "vllm:gpu_prefix_cache_queries", + "vllm:gpu_prefix_cache_hits", "vllm:prompt_tokens_total", "vllm:generation_tokens_total", "vllm:request_success_total", diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index de6c99c092be..ba08b83ec54e 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -9,6 +9,7 @@ generate_block_hash_extra_keys, hash_block_tokens, hash_request_tokens) +from vllm.v1.metrics.stats import PrefixCacheStats from vllm.v1.request import Request @@ -283,28 +284,33 @@ def test_metrics(): """ Test the prefix caching metrics. """ + + def stats(requests, queries, hits): + return PrefixCacheStats(requests=requests, queries=queries, hits=hits) + metrics = PrefixCachingMetrics(interval=5) assert metrics.hit_rate == 0.0 - metrics.add_request_query(20, 9) + metrics.observe(stats(1, 20, 9)) # 9 / 20 = 0.45 assert metrics.hit_rate == 0.45 - for _ in range(4): - metrics.add_request_query(20, 4) + metrics.observe(stats(4, 80, 16)) # 25 / 100 = 0.25 assert metrics.hit_rate == 0.25 - metrics.add_request_query(10, 2) + metrics.observe(stats(1, 10, 2)) # Remove (20, 9) and add (10, 2): 18 / 90 = 0.2 + assert metrics.aggregated_requests == 5 assert metrics.aggregated_query_total == 90 assert metrics.aggregated_query_hit == 18 assert metrics.hit_rate == 0.2 metrics.reset() assert metrics.hit_rate == 0.0 + assert metrics.aggregated_requests == 0 assert metrics.aggregated_query_total == 0 assert metrics.aggregated_query_hit == 0 assert not metrics.query_queue diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 6a322be8a72f..71114e0d0e08 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -6,10 +6,11 @@ from vllm.logger import init_logger from vllm.utils import cdiv from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue, - KVCacheBlock, PrefixCachingMetrics, + KVCacheBlock, generate_block_hash_extra_keys, hash_block_tokens, hash_request_tokens) +from vllm.v1.metrics.stats import PrefixCacheStats from vllm.v1.request import Request, RequestStatus logger = init_logger(__name__) @@ -78,8 +79,7 @@ def __init__( self.req_to_block_hashes: DefaultDict[ str, List[BlockHashType]] = defaultdict(list) - # Prefix cache metrics. TODO: Make the interval configurable. 
- self.prefix_caching_metrics = PrefixCachingMetrics(interval=1000)
+ self.prefix_cache_stats = PrefixCacheStats()
 @property
 def usage(self) -> float:
@@ -91,14 +91,15 @@ def usage(self) -> float:
 return 1.0 - (self.free_block_queue.num_free_blocks /
 self.num_gpu_blocks)
- @property
- def prefix_cache_hit_rate(self) -> float:
- """Get the prefix caching hit rate.
+ def make_prefix_cache_stats(self) -> PrefixCacheStats:
+ """Get (and reset) the prefix cache query and hit counts.
 Returns:
- The prefix caching hit rate.
+ The prefix caching stats - query count, and hit count.
 """
- return self.prefix_caching_metrics.hit_rate
+ stats = self.prefix_cache_stats
+ self.prefix_cache_stats = PrefixCacheStats()
+ return stats
 def get_computed_blocks(
 self, request: Request) -> Tuple[List[KVCacheBlock], int]:
@@ -135,10 +136,9 @@ def get_computed_blocks(
 else:
 break
- self.prefix_caching_metrics.add_request_query(
- num_queries=len(block_hashes),
- num_hits=len(computed_blocks),
- )
+ self.prefix_cache_stats.requests += 1
+ self.prefix_cache_stats.queries += len(block_hashes)
+ self.prefix_cache_stats.hits += len(computed_blocks)
 # NOTE(woosuk): Since incomplete blocks are not eligible for
 # sharing, `num_computed_tokens` is always a multiple of
@@ -302,8 +302,7 @@ def reset_prefix_cache(self) -> bool:
 for block in self.block_pool:
 block.reset_hash()
- # Reset the prefix caching metrics.
- self.prefix_caching_metrics.reset()
+ self.prefix_cache_stats.reset = True
 logger.info("Successfully reset prefix cache")
 return True
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 497891557972..44c5ff5e083b 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -9,6 +9,7 @@
 from vllm.logger import init_logger
 from vllm.v1.kv_cache_interface import (KVCacheConfig, KVCacheSpec,
 KVCacheTensor)
+from vllm.v1.metrics.stats import PrefixCacheStats
 from vllm.v1.request import Request
 logger = init_logger(__name__)
@@ -40,33 +41,44 @@ class PrefixCachingMetrics:
 def __init__(self, interval: int = 1000):
 self.interval = interval
 # The current aggregated query total and hit.
+ self.aggregated_requests = 0
 self.aggregated_query_total = 0
 self.aggregated_query_hit = 0
- # A deque of (num_queries, num_hits) for the most recent requests.
- self.query_queue: deque[Tuple[int, int]] = deque()
+ # A deque of (requests, queries, hits) for the most recent requests.
+ self.query_queue: deque[Tuple[int, int, int]] = deque()
- def add_request_query(self, num_queries: int, num_hits: int):
- """Add a request to the metrics. This function is called when
- a new request is being scheduled and is looking for computed blocks.
- When there are more than `interval` requests, the oldest request
- is removed from the metrics.
+ def observe(self, stats: PrefixCacheStats):
+ """Observe the prefix caching for a set of requests.
- Args:
- num_queries: The number of queries in the request.
- num_hits: The number of hits in the request.
+ This function is called with information gathered when new requests
+ are being scheduled and are looking for computed blocks.
+
+ When there are more than `interval` requests, the oldest set of
+ requests are removed from the metrics.
+
+ Stats:
+ reset: Whether reset_prefix_cache was invoked.
+ requests: The number of requests in this update.
+ queries: The number of queries in these requests.
+ hits: The number of hits in these requests.
""" + if stats.reset: + self.reset() - self.query_queue.append((num_queries, num_hits)) - if len(self.query_queue) > self.interval: - old_num_queries, old_num_hits = self.query_queue.popleft() - self.aggregated_query_total -= old_num_queries - self.aggregated_query_hit -= old_num_hits + self.query_queue.append((stats.requests, stats.queries, stats.hits)) + self.aggregated_requests += stats.requests + self.aggregated_query_total += stats.queries + self.aggregated_query_hit += stats.hits - self.aggregated_query_total += num_queries - self.aggregated_query_hit += num_hits + if self.aggregated_requests > self.interval: + old_requests, old_queries, old_hits = self.query_queue.popleft() + self.aggregated_requests -= old_requests + self.aggregated_query_total -= old_queries + self.aggregated_query_hit -= old_hits def reset(self): """Reset the metrics.""" + self.aggregated_requests = 0 self.aggregated_query_total = 0 self.aggregated_query_hit = 0 self.query_queue.clear() diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index c418486b489e..10b8f8d0dd9c 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -599,8 +599,7 @@ def make_stats(self) -> SchedulerStats: num_running_reqs=len(self.running), num_waiting_reqs=len(self.waiting), gpu_cache_usage=self.kv_cache_manager.usage, - gpu_prefix_cache_hit_rate=self.kv_cache_manager. - prefix_cache_hit_rate, + prefix_cache_stats=self.kv_cache_manager.make_prefix_cache_stats(), ) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 1b06896f5496..3472761dc180 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -9,6 +9,7 @@ from vllm.config import ModelConfig from vllm.logger import init_logger +from vllm.v1.core.kv_cache_utils import PrefixCachingMetrics from vllm.v1.engine import FinishReason from vllm.v1.metrics.stats import IterationStats, SchedulerStats @@ -37,6 +38,9 @@ def _reset(self, now): self.num_prompt_tokens: List[int] = [] self.num_generation_tokens: List[int] = [] + # Prefix cache metrics. TODO: Make the interval configurable. + self.prefix_caching_metrics = PrefixCachingMetrics() + def _local_interval_elapsed(self, now: float) -> bool: # Log every _LOCAL_LOGGING_INTERVAL_SEC. elapsed_time = now - self.last_log_time @@ -58,6 +62,8 @@ def log(self, scheduler_stats: SchedulerStats, self._track_iteration_stats(iteration_stats) + self.prefix_caching_metrics.observe(scheduler_stats.prefix_cache_stats) + now = time.monotonic() if not self._local_interval_elapsed(now): return @@ -74,13 +80,13 @@ def log(self, scheduler_stats: SchedulerStats, "Avg generation throughput: %.1f tokens/s, " "Running: %d reqs, Waiting: %d reqs, " "GPU KV cache usage: %.1f%%, " - "GPU prefix cache hit rate: %.1f%%", + "Prefix cache hit rate: %.1f%%", prompt_throughput, generation_throughput, scheduler_stats.num_running_reqs, scheduler_stats.num_waiting_reqs, scheduler_stats.gpu_cache_usage * 100, - scheduler_stats.gpu_prefix_cache_hit_rate * 100, + self.prefix_caching_metrics.hit_rate * 100, ) @@ -109,10 +115,16 @@ def __init__(self, model_config: ModelConfig): documentation="GPU KV-cache usage. 
1 means 100 percent usage.", labelnames=labelnames).labels(*labelvalues) - self.gauge_gpu_prefix_cache_hit_rate = prometheus_client.Gauge( - name="vllm:gpu_prefix_cache_hit_rate_perc", + self.counter_gpu_prefix_cache_queries = prometheus_client.Counter( + name="vllm:gpu_prefix_cache_queries", + documentation= + "GPU prefix cache queries, in terms of number of queried blocks.", + labelnames=labelnames).labels(*labelvalues) + + self.counter_gpu_prefix_cache_hits = prometheus_client.Counter( + name="vllm:gpu_prefix_cache_hits", documentation= - "GPU prefix cache hit rate. 1 means 100 percent hit rate.", + "GPU prefix cache hits, in terms of number of cached blocks.", labelnames=labelnames).labels(*labelvalues) self.counter_prompt_tokens = prometheus_client.Counter( @@ -177,8 +189,11 @@ def log(self, scheduler_stats: SchedulerStats, self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs) self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage) - self.gauge_gpu_prefix_cache_hit_rate.set( - scheduler_stats.gpu_prefix_cache_hit_rate) + + self.counter_gpu_prefix_cache_queries.inc( + scheduler_stats.prefix_cache_stats.queries) + self.counter_gpu_prefix_cache_hits.inc( + scheduler_stats.prefix_cache_stats.hits) self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens) self.counter_generation_tokens.inc( diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index aee91cfead31..17d10a7e43a8 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import time -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import TYPE_CHECKING, List if TYPE_CHECKING: @@ -9,6 +9,15 @@ from vllm.v1.engine import EngineCoreOutput, FinishReason +@dataclass +class PrefixCacheStats: + """Stores prefix cache hit statistics.""" + reset: bool = False + requests: int = 0 + hits: int = 0 + queries: int = 0 + + @dataclass class SchedulerStats: """Stats associated with the scheduler.""" @@ -17,7 +26,9 @@ class SchedulerStats: num_waiting_reqs: int = 0 gpu_cache_usage: float = 0.0 - gpu_prefix_cache_hit_rate: float = 0.0 + + prefix_cache_stats: PrefixCacheStats = field( + default_factory=PrefixCacheStats) @dataclass From c9f8cf3ae502d0624b85b726a01c336339e81ec5 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Fri, 7 Feb 2025 11:05:57 -0800 Subject: [PATCH 6/7] minor Signed-off-by: Cody Yu --- vllm/v1/core/kv_cache_manager.py | 4 ++-- vllm/v1/core/kv_cache_utils.py | 13 +++++++------ vllm/v1/metrics/stats.py | 6 +++++- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 71114e0d0e08..f75d31f542cf 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -92,10 +92,10 @@ def usage(self) -> float: self.num_gpu_blocks) def make_prefix_cache_stats(self) -> PrefixCacheStats: - """Get (and reset) the prefix cache query and hit counts. + """Get (and reset) the prefix cache stats. Returns: - The prefix caching stats - query count, and hit count. + The current prefix caching stats. 
""" stats = self.prefix_cache_stats self.prefix_cache_stats = PrefixCacheStats() diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 44c5ff5e083b..bddb482d2916 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -40,7 +40,7 @@ class PrefixCachingMetrics: def __init__(self, interval: int = 1000): self.interval = interval - # The current aggregated query total and hit. + # The current aggregated values. self.aggregated_requests = 0 self.aggregated_query_total = 0 self.aggregated_query_hit = 0 @@ -56,20 +56,21 @@ def observe(self, stats: PrefixCacheStats): When there are more than `interval` requests, the oldest set of requestsare removed from the metrics. - Stats: - reset: Whether reset_prefix_cache was invoked. - requests: The number of requests in this update. - queries: The number of queries in these requests. - hits: The number of hits in these requests. + Args: + stats: The prefix cache stats. """ + # reset_prefix_cache was invoked before the current update. + # Reset the metrics before aggregating the current stats. if stats.reset: self.reset() + # Update the metrics. self.query_queue.append((stats.requests, stats.queries, stats.hits)) self.aggregated_requests += stats.requests self.aggregated_query_total += stats.queries self.aggregated_query_hit += stats.hits + # Remove the oldest stats if the number of requests exceeds. if self.aggregated_requests > self.interval: old_requests, old_queries, old_hits = self.query_queue.popleft() self.aggregated_requests -= old_requests diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 17d10a7e43a8..4ad7bc55c815 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -12,10 +12,14 @@ @dataclass class PrefixCacheStats: """Stores prefix cache hit statistics.""" + # Whether reset_prefix_cache was invoked. reset: bool = False + # The number of requests in this update. requests: int = 0 - hits: int = 0 + # The number of queries in these requests. queries: int = 0 + # The number of hits in these requests. + hits: int = 0 @dataclass From 6d474330e588033ffeacc7afd30bb9edc38fc727 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Mon, 10 Feb 2025 09:28:42 -0800 Subject: [PATCH 7/7] comment Signed-off-by: Cody Yu --- vllm/v1/metrics/stats.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 4ad7bc55c815..f806b0adf5d5 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -16,7 +16,8 @@ class PrefixCacheStats: reset: bool = False # The number of requests in this update. requests: int = 0 - # The number of queries in these requests. + # The number of queries in these requests. Note that "queries" here + # means the number of blocks that were queried from the cache. queries: int = 0 # The number of hits in these requests. hits: int = 0