From df51e19772d40c4f94406ed63ff9055e2cb60bc0 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Mon, 3 Mar 2025 06:21:00 -0500 Subject: [PATCH 1/2] [V0][Metrics] Deprecated duplicate queue time metric vllm:time_in_queue_requests appears to be an exact duplicate of vllm:request_queue_time_seconds. Both record first_scheduled_time-arrival_time: ``` if seq_group.is_finished(): time_queue_requests.append( seq_group.metrics.first_scheduled_time - seq_group.metrics.arrival_time) ``` ``` def maybe_set_first_scheduled_time(self, time: float) -> None: if self.metrics.first_scheduled_time is None: self.metrics.first_scheduled_time = time self.metrics.time_in_queue = time - self.metrics.arrival_time ``` vllm:time_in_queue_requests was added by #9659 and vllm:request_queue_time_seconds was later added by #4464. However, neither existed when each PR was first created. The latter seems like the right one to keep since it is implemented in V1, used in the Grafana dashboard, and has test coverage. Signed-off-by: Mark McLoughlin --- vllm/engine/metrics.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index cb3ca7a11881..cf3155613369 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -181,10 +181,13 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig): "Histogram of time spent in DECODE phase for request.", labelnames=labelnames, buckets=request_latency_buckets) + # Deprecated in 0.8 - duplicates vllm:request_queue_time_seconds: + # TODO: in 0.9, only enable if show_hidden_metrics=True self.histogram_time_in_queue_request = self._histogram_cls( name="vllm:time_in_queue_requests", - documentation= - "Histogram of time the request spent in the queue in seconds.", + documentation=( + "Histogram of time the request spent in the queue in seconds. 
" + "DEPRECATED: use vllm:request_queue_time_seconds instead."), labelnames=labelnames, buckets=request_latency_buckets) self.histogram_model_forward_time_request = self._histogram_cls( From 20a15cc67d77ba46a940bbc11ff19c8815e5925d Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Mon, 3 Mar 2025 13:13:31 -0500 Subject: [PATCH 2/2] [V1][Metrics] Deprecate vllm:model_forward/execute_time_milliseconds Metrics originally added by #9659 These seem to be of questionable value relative to the existing prefill, decode, and inference time metrics. And since they would be challenging to implement in V1, and they don't conform to the standard of using seconds as units, let's deprecate them Signed-off-by: Mark McLoughlin --- vllm/engine/metrics.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index cf3155613369..97fddedd1cb4 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -190,18 +190,26 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig): "DEPRECATED: use vllm:request_queue_time_seconds instead."), labelnames=labelnames, buckets=request_latency_buckets) + + # Deprecated in 0.8 - use prefill/decode/inference time metrics + # TODO: in 0.9, only enable if show_hidden_metrics=True self.histogram_model_forward_time_request = self._histogram_cls( name="vllm:model_forward_time_milliseconds", - documentation= - "Histogram of time spent in the model forward pass in ms.", + documentation=( + "Histogram of time spent in the model forward pass in ms. " + "DEPRECATED: use prefill/decode/inference time metrics instead." + ), labelnames=labelnames, buckets=build_1_2_3_5_8_buckets(3000)) self.histogram_model_execute_time_request = self._histogram_cls( name="vllm:model_execute_time_milliseconds", - documentation= - "Histogram of time spent in the model execute function in ms.", + documentation=( + "Histogram of time spent in the model execute function in ms." 
+ " DEPRECATED: use prefill/decode/inference time metrics instead." + ), labelnames=labelnames, buckets=build_1_2_3_5_8_buckets(3000)) + # Metadata self.histogram_num_prompt_tokens_request = self._histogram_cls( name="vllm:request_prompt_tokens",