From df51e19772d40c4f94406ed63ff9055e2cb60bc0 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Mon, 3 Mar 2025 06:21:00 -0500
Subject: [PATCH 1/2] [V0][Metrics] Deprecate duplicate queue time metric

vllm:time_in_queue_requests appears to be an exact duplicate
of vllm:request_queue_time_seconds.

Both record first_scheduled_time - arrival_time:

```
if seq_group.is_finished():
    time_queue_requests.append(
        seq_group.metrics.first_scheduled_time -
        seq_group.metrics.arrival_time)
```

```
def maybe_set_first_scheduled_time(self, time: float) -> None:
    if self.metrics.first_scheduled_time is None:
        self.metrics.first_scheduled_time = time
        self.metrics.time_in_queue = time - self.metrics.arrival_time
```

vllm:time_in_queue_requests was added by #9659 and
vllm:request_queue_time_seconds was later added by #4464. However,
neither metric existed yet at the time either PR was first created.

The latter seems like the right one to keep since it is implemented
in V1, used in the Grafana dashboard, and has test coverage.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 vllm/engine/metrics.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index cb3ca7a11881..cf3155613369 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -181,10 +181,13 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
             "Histogram of time spent in DECODE phase for request.",
             labelnames=labelnames,
             buckets=request_latency_buckets)
+        # Deprecated in 0.8 - duplicates vllm:request_queue_time_seconds:
+        # TODO: in 0.9, only enable if show_hidden_metrics=True
         self.histogram_time_in_queue_request = self._histogram_cls(
             name="vllm:time_in_queue_requests",
-            documentation=
-            "Histogram of time the request spent in the queue in seconds.",
+            documentation=(
+                "Histogram of time the request spent in the queue in seconds. "
+                "DEPRECATED: use vllm:request_queue_time_seconds instead."),
             labelnames=labelnames,
             buckets=request_latency_buckets)
         self.histogram_model_forward_time_request = self._histogram_cls(

From 20a15cc67d77ba46a940bbc11ff19c8815e5925d Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Mon, 3 Mar 2025 13:13:31 -0500
Subject: [PATCH 2/2] [V1][Metrics] Deprecate
 vllm:model_forward/execute_time_milliseconds

Metrics originally added by #9659

These seem to be of questionable value relative to the existing
prefill, decode, and inference time metrics. And since they would
be challenging to implement in V1, and they don't conform to the
standard of using seconds as units, let's deprecate them.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 vllm/engine/metrics.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index cf3155613369..97fddedd1cb4 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -190,18 +190,26 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
                 "DEPRECATED: use vllm:request_queue_time_seconds instead."),
             labelnames=labelnames,
             buckets=request_latency_buckets)
+
+        # Deprecated in 0.8 - use prefill/decode/inference time metrics
+        # TODO: in 0.9, only enable if show_hidden_metrics=True
         self.histogram_model_forward_time_request = self._histogram_cls(
             name="vllm:model_forward_time_milliseconds",
-            documentation=
-            "Histogram of time spent in the model forward pass in ms.",
+            documentation=(
+                "Histogram of time spent in the model forward pass in ms. "
+                "DEPRECATED: use prefill/decode/inference time metrics instead."
+            ),
             labelnames=labelnames,
             buckets=build_1_2_3_5_8_buckets(3000))
         self.histogram_model_execute_time_request = self._histogram_cls(
             name="vllm:model_execute_time_milliseconds",
-            documentation=
-            "Histogram of time spent in the model execute function in ms.",
+            documentation=(
+                "Histogram of time spent in the model execute function in ms. "
+                "DEPRECATED: use prefill/decode/inference time metrics instead."
+            ),
             labelnames=labelnames,
             buckets=build_1_2_3_5_8_buckets(3000))
+
         #   Metadata
         self.histogram_num_prompt_tokens_request = self._histogram_cls(
             name="vllm:request_prompt_tokens",