Commit fdab236

[V1][Metrics] Implement vllm:request_params_max_tokens
This just observes SamplingParams.max_tokens values in a histogram.

Signed-off-by: Mark McLoughlin <[email protected]>
1 parent d66c82b commit fdab236
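For orientation, here is a minimal, self-contained sketch of the pattern this commit applies: create a prometheus_client Histogram with explicit buckets and feed it one observation per finished request. The metric name, label, and bucket values below are illustrative stand-ins, not vLLM code; the real wiring is in vllm/v1/metrics/loggers.py (see the diff below).

```python
# Illustrative sketch only (not vLLM code): observe a per-request
# parameter in a Prometheus histogram, as this commit does for max_tokens.
import prometheus_client

histogram_max_tokens = prometheus_client.Histogram(
    name="demo:request_params_max_tokens",
    documentation="Histogram of the max_tokens request parameter.",
    # The commit derives buckets from max_model_len via build_1_2_5_buckets();
    # these values are just an example.
    buckets=[1, 2, 5, 10, 20, 50, 100, 200, 500, 1000],
    labelnames=["model_name"],
).labels("demo-model")

# One observation per finished request; Prometheus then exposes
# matching _bucket, _sum, and _count series.
histogram_max_tokens.observe(128)
```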

File tree: 4 files changed, +20 −0 lines changed

- tests/entrypoints/openai/test_metrics.py
- vllm/v1/engine/output_processor.py
- vllm/v1/metrics/loggers.py
- vllm/v1/metrics/stats.py

tests/entrypoints/openai/test_metrics.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -242,6 +242,9 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
     "vllm:request_params_n_sum",
     "vllm:request_params_n_bucket",
     "vllm:request_params_n_count",
+    "vllm:request_params_max_tokens_sum",
+    "vllm:request_params_max_tokens_bucket",
+    "vllm:request_params_max_tokens_count",
     "vllm:time_to_first_token_seconds_sum",
     "vllm:time_to_first_token_seconds_bucket",
     "vllm:time_to_first_token_seconds_count",
```

vllm/v1/engine/output_processor.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -36,6 +36,7 @@ def __init__(
         prompt_token_ids: list[int],
         logprobs_processor: LogprobsProcessor,
         detokenizer: IncrementalDetokenizer,
+        max_tokens_param: Optional[int],
         arrival_time: float,
         queue: Optional[asyncio.Queue[RequestOutput]],
         log_stats: bool,
@@ -50,6 +51,7 @@ def __init__(
         self.prompt_len = len(prompt_token_ids)
         self.logprobs_processor = logprobs_processor
         self.detokenizer = detokenizer
+        self.max_tokens_param = max_tokens_param
         self.is_prefilling = True
         self.queue = queue

@@ -83,6 +85,8 @@ def from_new_request(
                 tokenizer=tokenizer,
                 request=request,
             ),
+            max_tokens_param=(request.sampling_params.max_tokens if
+                              request.sampling_params is not None else None),
             arrival_time=request.arrival_time,
             queue=queue,
             log_stats=log_stats,
@@ -354,6 +358,7 @@ def _update_stats_from_finished(self, req_state: RequestState,
         iteration_stats.update_from_finished_request(
             finish_reason=finish_reason,
             num_prompt_tokens=len(req_state.prompt_token_ids),
+            max_tokens_param=req_state.max_tokens_param,
             req_stats=req_state.stats)
         self.lora_states.finish_request(req_state)
```
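In short, this file threads the requested max_tokens from SamplingParams into RequestState when the request is created and hands it to the stats path when the request finishes; the field is Optional because a request may carry no sampling parameters at all, in which case None is recorded.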

vllm/v1/metrics/loggers.py

Lines changed: 9 additions & 0 deletions
```diff
@@ -206,6 +206,13 @@ def __init__(self, vllm_config: VllmConfig):
             buckets=[1, 2, 5, 10, 20],
             labelnames=labelnames).labels(*labelvalues)

+        self.histogram_max_tokens_request = \
+            prometheus_client.Histogram(
+                name="vllm:request_params_max_tokens",
+                documentation="Histogram of the max_tokens request parameter.",
+                buckets=build_1_2_5_buckets(max_model_len),
+                labelnames=labelnames).labels(*labelvalues)
+
         #
         # Histogram of timing intervals
         #
@@ -357,6 +364,8 @@ def log(self, scheduler_stats: SchedulerStats,
                 finished_request.num_prompt_tokens)
             self.histogram_num_generation_tokens_request.observe(
                 finished_request.num_generation_tokens)
+            self.histogram_max_tokens_request.observe(
+                finished_request.max_tokens_param)

         if self.gauge_lora_info is not None:
             running_lora_adapters = \
```
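The bucket helper used above already exists in vLLM for other request-size histograms. As a rough sketch of its behavior (an assumption for illustration, not the actual implementation), it expands a 1-2-5 progression by powers of ten until it would exceed the model's context length:

```python
# Sketch of the assumed behavior of build_1_2_5_buckets(); the real helper
# lives elsewhere in vLLM's metrics code.
def build_1_2_5_buckets_sketch(max_value: int) -> list[int]:
    buckets: list[int] = []
    exponent = 0
    while True:
        for mantissa in (1, 2, 5):
            value = mantissa * 10 ** exponent
            if value > max_value:
                return buckets
            buckets.append(value)
        exponent += 1

# For a 2048-token context: [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000]
print(build_1_2_5_buckets_sketch(2048))
```

Sizing the buckets to max_model_len lets the histogram resolve requested max_tokens values anywhere up to the full context window.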

vllm/v1/metrics/stats.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -66,6 +66,7 @@ class FinishedRequestStats:
     e2e_latency: float = 0.0
     num_prompt_tokens: int = 0
     num_generation_tokens: int = 0
+    max_tokens_param: Optional[int] = None
     queued_time: float = 0.0
     prefill_time: float = 0.0
     inference_time: float = 0.0
@@ -152,6 +153,7 @@ def update_from_events(self, req_id: str, events: list["EngineCoreEvent"],

     def update_from_finished_request(self, finish_reason: "FinishReason",
                                      num_prompt_tokens: int,
+                                     max_tokens_param: Optional[int],
                                      req_stats: RequestStateStats):
         e2e_latency = self._time_since(req_stats.arrival_time)

@@ -175,6 +177,7 @@ def update_from_finished_request(self, finish_reason: "FinishReason",
             e2e_latency=e2e_latency,
             num_prompt_tokens=num_prompt_tokens,
             num_generation_tokens=req_stats.num_generation_tokens,
+            max_tokens_param=max_tokens_param,
             queued_time=queued_time,
             prefill_time=prefill_time,
             inference_time=inference_time,
```
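Once the new field reaches the Prometheus logger, standard histogram semantics apply: the mean requested max_tokens over a scrape window can be derived from vllm:request_params_max_tokens_sum divided by vllm:request_params_max_tokens_count, and the _bucket series supports quantile estimates; these are the three series the updated test above asserts are exported.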
