From 319bc37dca95161c5dafe699de3ab1027c615a79 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Wed, 31 Jan 2024 12:05:26 +0200 Subject: [PATCH 01/40] Add vllm:request_max_tokens --- vllm/engine/llm_engine.py | 9 +++++++++ vllm/engine/metrics.py | 12 ++++++++++++ 2 files changed, 21 insertions(+) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 02c673c96fd9..a131063b7782 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -845,6 +845,7 @@ def _get_stats(self, # Iteration stats if we have scheduler output. num_prompt_tokens = 0 num_generation_tokens = 0 + max_tokens = [] time_to_first_tokens = [] time_per_output_tokens = [] time_e2e_requests = [] @@ -857,6 +858,13 @@ def _get_stats(self, else: num_generation_tokens = scheduler_outputs.num_batched_tokens + # Sampling Params. + if prompt_run: + max_tokens = [ + seq_group.sampling_params.max_tokens + for seq_group in scheduler_outputs.scheduled_seq_groups + ] + # Latency Timings. time_last_iters = [] for seq_group in scheduler_outputs.scheduled_seq_groups: @@ -878,6 +886,7 @@ def _get_stats(self, cpu_cache_usage=cpu_cache_usage, num_prompt_tokens=num_prompt_tokens, num_generation_tokens=num_generation_tokens, + max_tokens=max_tokens, time_to_first_tokens=time_to_first_tokens, time_per_output_tokens=time_per_output_tokens, time_e2e_requests=time_e2e_requests, diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index e613b9f551b2..d9657d9743ab 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -61,6 +61,15 @@ def add_global_metrics_labels(**kwargs): "vllm:e2e_request_latency_seconds", "Histogram of end to end request latency in seconds.", buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0]) + +histogram_max_tokens = Histogram( + "vllm:request_max_tokens", + "Histogram of the max_tokens request parameter.", + buckets=[ + 1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000, 2_000, 5_000, 10_000, + 20_000, 50_000, 100_000 + ], +) # end-metrics-definitions @@ -79,6 +88,7 @@ class Stats: # Raw stats from last model iteration. num_prompt_tokens: int num_generation_tokens: int + max_tokens: List[int] time_to_first_tokens: List[float] time_per_output_tokens: List[float] time_e2e_requests: List[float] @@ -116,6 +126,8 @@ def _log_prometheus(self, stats: Stats) -> None: counter_generation_tokens.add(labels, stats.num_generation_tokens) # Observe request level latencies in histograms. + for val in stats.max_tokens: + histogram_max_tokens.observe(labels, val) for ttft in stats.time_to_first_tokens: histogram_time_to_first_token.observe(labels, ttft) for tpot in stats.time_per_output_tokens: From 9d4ce95ac0d9e160766c96c52074e91db3af81e1 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Wed, 31 Jan 2024 12:08:05 +0200 Subject: [PATCH 02/40] Remove trailing dots from comments that are not sentences --- vllm/engine/llm_engine.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index a131063b7782..d6bbb9ae960f 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -852,20 +852,20 @@ def _get_stats(self, if scheduler_outputs is not None: prompt_run = scheduler_outputs.prompt_run - # Number of Tokens. + # Number of Tokens if prompt_run: num_prompt_tokens = scheduler_outputs.num_batched_tokens else: num_generation_tokens = scheduler_outputs.num_batched_tokens - # Sampling Params. 
+ # Sampling Params if prompt_run: max_tokens = [ seq_group.sampling_params.max_tokens for seq_group in scheduler_outputs.scheduled_seq_groups ] - # Latency Timings. + # Latency Timings time_last_iters = [] for seq_group in scheduler_outputs.scheduled_seq_groups: # Time since last token. (n.b. updates seq_group.last_token_time) From ae7eb6e243cd3f6c13e91f6e6ac497864c2fec9c Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Wed, 31 Jan 2024 14:35:28 +0200 Subject: [PATCH 03/40] Add vllm:request_success --- vllm/engine/llm_engine.py | 13 ++++++++++++- vllm/engine/metrics.py | 11 +++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index d6bbb9ae960f..492956062997 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1,5 +1,5 @@ import copy -from collections import defaultdict +from collections import defaultdict, Counter as CollectionsCounter import os import time from typing import (TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, @@ -849,6 +849,7 @@ def _get_stats(self, time_to_first_tokens = [] time_per_output_tokens = [] time_e2e_requests = [] + finished_reason_counter = CollectionsCounter() if scheduler_outputs is not None: prompt_run = scheduler_outputs.prompt_run @@ -877,6 +878,15 @@ def _get_stats(self, time_to_first_tokens = time_last_iters if prompt_run else [] time_per_output_tokens = [] if prompt_run else time_last_iters + # Finished Requests + for seq_group in scheduler_outputs.scheduled_seq_groups: + if not seq_group.is_finished(): + continue + finished_reason_counter += CollectionsCounter([ + SequenceStatus.get_finished_reason(seq.status) + for seq in seq_group.get_finished_seqs() + ]) + return Stats( now=now, num_running=num_running, @@ -884,6 +894,7 @@ def _get_stats(self, num_waiting=num_waiting, gpu_cache_usage=gpu_cache_usage, cpu_cache_usage=cpu_cache_usage, + finished_reason_counter=finished_reason_counter, num_prompt_tokens=num_prompt_tokens, num_generation_tokens=num_generation_tokens, max_tokens=max_tokens, diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index d9657d9743ab..357180885d66 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -5,6 +5,7 @@ import numpy as np from typing import List from dataclasses import dataclass +from collections import Counter as CollectionsCounter logger = init_logger(__name__) @@ -28,6 +29,8 @@ def add_global_metrics_labels(**kwargs): "Number of prefill tokens processed.") counter_generation_tokens = Counter("vllm:generation_tokens_total", "Number of generation tokens processed.") +counter_request_success = Counter("vllm:request_success", + "Count of successfully processed requests.") gauge_scheduler_running = Gauge( "vllm:num_requests_running", @@ -86,6 +89,7 @@ class Stats: cpu_cache_usage: float # Raw stats from last model iteration. + finished_reason_counter: CollectionsCounter[str, int] num_prompt_tokens: int num_generation_tokens: int max_tokens: List[int] @@ -125,6 +129,13 @@ def _log_prometheus(self, stats: Stats) -> None: counter_prompt_tokens.add(labels, stats.num_prompt_tokens) counter_generation_tokens.add(labels, stats.num_generation_tokens) + for finished_reason, count in stats.finished_reason_counter.items(): + counter_request_success.add( + { + **labels, + "finished_reason": finished_reason, + }, count) + # Observe request level latencies in histograms. 
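# A minimal sketch, not part of the patch: a per-reason success counter like
# the one added above, written against plain prometheus_client (the metric
# class used in this file takes a labels dict via .add() instead). All names
# here are illustrative only.
from prometheus_client import Counter

demo_request_success = Counter("demo_request_success",
                               "Requests finished, by reason.",
                               labelnames=["model_name", "finished_reason"])
demo_request_success.labels(model_name="demo", finished_reason="stop").inc(3)
demo_request_success.labels(model_name="demo", finished_reason="length").inc(1)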
for val in stats.max_tokens: histogram_max_tokens.observe(labels, val) From 2188daa75f67f6e5c0a707dd66d11a5800a49409 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Wed, 31 Jan 2024 14:41:51 +0200 Subject: [PATCH 04/40] Remove redundant space --- vllm/core/scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 4fdf9ec341cf..2c2c728f6d74 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -136,7 +136,7 @@ def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None: for seq_group in state_queue: if not request_ids: # Using 'break' here may add two extra iterations, - # but is acceptable to reduce complexity . + # but is acceptable to reduce complexity. break if seq_group.request_id in request_ids: # Appending aborted group into pending list. From e41c15f4f574560462332aaaf7edc2095a4acc20 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Wed, 31 Jan 2024 17:07:10 +0200 Subject: [PATCH 05/40] Add vllm:request_n --- vllm/engine/llm_engine.py | 6 ++++++ vllm/engine/metrics.py | 8 ++++++++ 2 files changed, 14 insertions(+) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 492956062997..7506f4a1643a 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -846,6 +846,7 @@ def _get_stats(self, num_prompt_tokens = 0 num_generation_tokens = 0 max_tokens = [] + request_n = [] time_to_first_tokens = [] time_per_output_tokens = [] time_e2e_requests = [] @@ -865,6 +866,10 @@ def _get_stats(self, seq_group.sampling_params.max_tokens for seq_group in scheduler_outputs.scheduled_seq_groups ] + request_n = [ + seq_group.sampling_params.n + for seq_group in scheduler_outputs.scheduled_seq_groups + ] # Latency Timings time_last_iters = [] @@ -898,6 +903,7 @@ def _get_stats(self, num_prompt_tokens=num_prompt_tokens, num_generation_tokens=num_generation_tokens, max_tokens=max_tokens, + request_n=request_n, time_to_first_tokens=time_to_first_tokens, time_per_output_tokens=time_per_output_tokens, time_e2e_requests=time_e2e_requests, diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 357180885d66..b9ecc18bc99c 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -73,6 +73,11 @@ def add_global_metrics_labels(**kwargs): 20_000, 50_000, 100_000 ], ) +histogram_request_n = Histogram( + "vllm:request_n", + "Histogram of the n request parameter.", + buckets=[1, 2, 5, 10, 20], +) # end-metrics-definitions @@ -93,6 +98,7 @@ class Stats: num_prompt_tokens: int num_generation_tokens: int max_tokens: List[int] + request_n: List[int] time_to_first_tokens: List[float] time_per_output_tokens: List[float] time_e2e_requests: List[float] @@ -139,6 +145,8 @@ def _log_prometheus(self, stats: Stats) -> None: # Observe request level latencies in histograms. 
for val in stats.max_tokens: histogram_max_tokens.observe(labels, val) + for n in stats.request_n: + histogram_request_n.observe(labels, n) for ttft in stats.time_to_first_tokens: histogram_time_to_first_token.observe(labels, ttft) for tpot in stats.time_per_output_tokens: From 71ec7c3c7b16ca0423ddf2aec5bff8a1309cbae2 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Mon, 5 Feb 2024 12:19:36 +0200 Subject: [PATCH 06/40] Add vllm:prompt_tokens --- vllm/engine/llm_engine.py | 6 ++++++ vllm/engine/metrics.py | 11 +++++++++++ 2 files changed, 17 insertions(+) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 7506f4a1643a..0bc8ca8acf6d 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -845,6 +845,7 @@ def _get_stats(self, # Iteration stats if we have scheduler output. num_prompt_tokens = 0 num_generation_tokens = 0 + num_prompt_tokens_lst = [] max_tokens = [] request_n = [] time_to_first_tokens = [] @@ -870,6 +871,10 @@ def _get_stats(self, seq_group.sampling_params.n for seq_group in scheduler_outputs.scheduled_seq_groups ] + num_prompt_tokens_lst = [ + len(seq_group.prompt_token_ids) + for seq_group in scheduler_outputs.scheduled_seq_groups + ] # Latency Timings time_last_iters = [] @@ -902,6 +907,7 @@ def _get_stats(self, finished_reason_counter=finished_reason_counter, num_prompt_tokens=num_prompt_tokens, num_generation_tokens=num_generation_tokens, + num_prompt_tokens_lst=num_prompt_tokens_lst, max_tokens=max_tokens, request_n=request_n, time_to_first_tokens=time_to_first_tokens, diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index b9ecc18bc99c..aba056f1a813 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -47,6 +47,14 @@ def add_global_metrics_labels(**kwargs): "vllm:cpu_cache_usage_perc", "CPU KV-cache usage. 1 means 100 percent usage.") +histogram_prompt_tokens = Histogram( + "vllm:prompt_tokens", + "Number of prefill tokens processed.", + buckets=[ + 1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000, 2_000, 5_000, 10_000, + 20_000, 50_000, 100_000 + ], +) histogram_time_to_first_token = Histogram( "vllm:time_to_first_token_seconds", "Histogram of time to first token in seconds.", @@ -97,6 +105,7 @@ class Stats: finished_reason_counter: CollectionsCounter[str, int] num_prompt_tokens: int num_generation_tokens: int + num_prompt_tokens_lst: List[int] max_tokens: List[int] request_n: List[int] time_to_first_tokens: List[float] @@ -134,6 +143,8 @@ def _log_prometheus(self, stats: Stats) -> None: # Add to token counters. 
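# A minimal sketch, not part of the patch: recording per-request prompt lengths
# in a prometheus_client histogram with explicit buckets, mirroring the
# vllm:prompt_tokens histogram defined above (whose wrapper takes a labels dict
# in .observe()). Names and values are illustrative only.
from prometheus_client import Histogram

demo_prompt_tokens = Histogram("demo_request_prompt_tokens",
                               "Prompt tokens per request.",
                               buckets=[1, 2, 5, 10, 20, 50, 100, 200, 500])
for prompt_len in (12, 300, 480):  # one observation per scheduled request
    demo_prompt_tokens.observe(prompt_len)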
counter_prompt_tokens.add(labels, stats.num_prompt_tokens) counter_generation_tokens.add(labels, stats.num_generation_tokens) + for val in stats.num_prompt_tokens_lst: + histogram_prompt_tokens.observe(labels, val) for finished_reason, count in stats.finished_reason_counter.items(): counter_request_success.add( From 45bd8394320a3362aa4ac303a471b193b9db3a05 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Mon, 5 Feb 2024 14:19:03 +0200 Subject: [PATCH 07/40] Add vllm:generation_tokens --- vllm/engine/llm_engine.py | 15 +++++++++++---- vllm/engine/metrics.py | 12 +++++++++++- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 0bc8ca8acf6d..983f54c5d136 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -846,6 +846,7 @@ def _get_stats(self, num_prompt_tokens = 0 num_generation_tokens = 0 num_prompt_tokens_lst = [] + num_generation_tokens_lst = [] max_tokens = [] request_n = [] time_to_first_tokens = [] @@ -858,8 +859,17 @@ def _get_stats(self, # Number of Tokens if prompt_run: num_prompt_tokens = scheduler_outputs.num_batched_tokens + num_prompt_tokens_lst = [ + len(seq_group.prompt_token_ids) + for seq_group in scheduler_outputs.scheduled_seq_groups + ] else: num_generation_tokens = scheduler_outputs.num_batched_tokens + num_generation_tokens_lst = [ + seq.get_output_len() + for seq_group in scheduler_outputs.scheduled_seq_groups + for seq in seq_group.get_finished_seqs() + ] # Sampling Params if prompt_run: @@ -871,10 +881,6 @@ def _get_stats(self, seq_group.sampling_params.n for seq_group in scheduler_outputs.scheduled_seq_groups ] - num_prompt_tokens_lst = [ - len(seq_group.prompt_token_ids) - for seq_group in scheduler_outputs.scheduled_seq_groups - ] # Latency Timings time_last_iters = [] @@ -908,6 +914,7 @@ def _get_stats(self, num_prompt_tokens=num_prompt_tokens, num_generation_tokens=num_generation_tokens, num_prompt_tokens_lst=num_prompt_tokens_lst, + num_generation_tokens_lst=num_generation_tokens_lst, max_tokens=max_tokens, request_n=request_n, time_to_first_tokens=time_to_first_tokens, diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index aba056f1a813..b5c8a1baf6dd 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -55,6 +55,14 @@ def add_global_metrics_labels(**kwargs): 20_000, 50_000, 100_000 ], ) +histogram_generation_tokens = Histogram( + "vllm:generation_tokens", + "Number of generation tokens processed.", + buckets=[ + 1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000, 2_000, 5_000, 10_000, + 20_000, 50_000, 100_000 + ], +) histogram_time_to_first_token = Histogram( "vllm:time_to_first_token_seconds", "Histogram of time to first token in seconds.", @@ -106,6 +114,7 @@ class Stats: num_prompt_tokens: int num_generation_tokens: int num_prompt_tokens_lst: List[int] + num_generation_tokens_lst: List[int] max_tokens: List[int] request_n: List[int] time_to_first_tokens: List[float] @@ -145,7 +154,8 @@ def _log_prometheus(self, stats: Stats) -> None: counter_generation_tokens.add(labels, stats.num_generation_tokens) for val in stats.num_prompt_tokens_lst: histogram_prompt_tokens.observe(labels, val) - + for val in stats.num_generation_tokens_lst: + histogram_generation_tokens.observe(labels, val) for finished_reason, count in stats.finished_reason_counter.items(): counter_request_success.add( { From f237c5015a4912388b68f7a71ddb6aa73a151363 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Mon, 5 Feb 2024 15:26:56 +0200 Subject: [PATCH 08/40] Add comments --- 
vllm/engine/metrics.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index b5c8a1baf6dd..dde200efc96a 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -152,10 +152,8 @@ def _log_prometheus(self, stats: Stats) -> None: # Add to token counters. counter_prompt_tokens.add(labels, stats.num_prompt_tokens) counter_generation_tokens.add(labels, stats.num_generation_tokens) - for val in stats.num_prompt_tokens_lst: - histogram_prompt_tokens.observe(labels, val) - for val in stats.num_generation_tokens_lst: - histogram_generation_tokens.observe(labels, val) + + # Add to request counters. for finished_reason, count in stats.finished_reason_counter.items(): counter_request_success.add( { @@ -163,11 +161,19 @@ def _log_prometheus(self, stats: Stats) -> None: "finished_reason": finished_reason, }, count) - # Observe request level latencies in histograms. + # Observe number of tokens in histograms. + for val in stats.num_prompt_tokens_lst: + histogram_prompt_tokens.observe(labels, val) + for val in stats.num_generation_tokens_lst: + histogram_generation_tokens.observe(labels, val) + + # Observe sampling params in histograms. for val in stats.max_tokens: histogram_max_tokens.observe(labels, val) for n in stats.request_n: histogram_request_n.observe(labels, n) + + # Observe request level latencies in histograms. for ttft in stats.time_to_first_tokens: histogram_time_to_first_token.observe(labels, ttft) for tpot in stats.time_per_output_tokens: From f17a966f244c18966c744b0d7520ab6ab84145b7 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Mon, 5 Feb 2024 17:12:30 +0200 Subject: [PATCH 09/40] Rename metrics --- vllm/engine/metrics.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index dde200efc96a..7112918881a4 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -47,16 +47,16 @@ def add_global_metrics_labels(**kwargs): "vllm:cpu_cache_usage_perc", "CPU KV-cache usage. 1 means 100 percent usage.") -histogram_prompt_tokens = Histogram( - "vllm:prompt_tokens", +histogram_request_prompt_tokens = Histogram( + "vllm:request_prompt_tokens", "Number of prefill tokens processed.", buckets=[ 1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000, 2_000, 5_000, 10_000, 20_000, 50_000, 100_000 ], ) -histogram_generation_tokens = Histogram( - "vllm:generation_tokens", +histogram_request_generation_tokens = Histogram( + "vllm:request_generation_tokens", "Number of generation tokens processed.", buckets=[ 1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000, 2_000, 5_000, 10_000, @@ -163,9 +163,9 @@ def _log_prometheus(self, stats: Stats) -> None: # Observe number of tokens in histograms. for val in stats.num_prompt_tokens_lst: - histogram_prompt_tokens.observe(labels, val) + histogram_request_prompt_tokens.observe(labels, val) for val in stats.num_generation_tokens_lst: - histogram_generation_tokens.observe(labels, val) + histogram_request_generation_tokens.observe(labels, val) # Observe sampling params in histograms. 
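# For reference (a sketch, not part of the patch): a Prometheus histogram such
# as vllm:request_prompt_tokens is exported as several series, which is what
# later lets the plain token counters be deprecated in its favor:
#
#   vllm:request_prompt_tokens_bucket{le="..."}   cumulative bucket counts
#   vllm:request_prompt_tokens_sum                total prompt tokens observed
#   vllm:request_prompt_tokens_count              number of requests observed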
for val in stats.max_tokens: From 8e0d8c15cfc4820c07ef50c09a6a7c62e0892c91 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Wed, 7 Feb 2024 16:52:46 +0200 Subject: [PATCH 10/40] Make type hint compatible with python 3.8 --- vllm/engine/metrics.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 7112918881a4..cc15dd7f6a1d 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -3,9 +3,8 @@ import time import numpy as np -from typing import List +from typing import List, Counter as CollectionsCounter from dataclasses import dataclass -from collections import Counter as CollectionsCounter logger = init_logger(__name__) @@ -110,7 +109,7 @@ class Stats: cpu_cache_usage: float # Raw stats from last model iteration. - finished_reason_counter: CollectionsCounter[str, int] + finished_reason_counter: CollectionsCounter[str] num_prompt_tokens: int num_generation_tokens: int num_prompt_tokens_lst: List[int] From 9ed04ef3e1e8fb66a566f9adc747c74ba1bfb6b1 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Mon, 12 Feb 2024 18:38:35 +0200 Subject: [PATCH 11/40] Rename metrics vllm:request_max_tokens -> vllm:request_params_max_tokens vllm:request_n -> vllm:request_params_n --- vllm/engine/metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index cc15dd7f6a1d..9d9d0a67608e 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -81,7 +81,7 @@ def add_global_metrics_labels(**kwargs): buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0]) histogram_max_tokens = Histogram( - "vllm:request_max_tokens", + "vllm:request_params_max_tokens", "Histogram of the max_tokens request parameter.", buckets=[ 1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000, 2_000, 5_000, 10_000, @@ -89,7 +89,7 @@ def add_global_metrics_labels(**kwargs): ], ) histogram_request_n = Histogram( - "vllm:request_n", + "vllm:request_params_n", "Histogram of the n request parameter.", buckets=[1, 2, 5, 10, 20], ) From 76cd7742c65105351a9b4df3b8c3e400340ce21f Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Mon, 26 Feb 2024 16:38:56 +0200 Subject: [PATCH 12/40] Consider the value of `max_model_len` when building buckets --- vllm/engine/llm_engine.py | 3 ++- vllm/engine/metrics.py | 45 ++++++++++++++++++++++++++------------- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 139253de3afa..b9f22b9d7da6 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -129,7 +129,8 @@ def __init__( if self.log_stats: self.stat_logger = StatLogger( local_interval=_LOCAL_LOGGING_INTERVAL_SEC, - labels=dict(model_name=model_config.model)) + labels=dict(model_name=model_config.model), + max_model_len=self.model_config.max_model_len) self.forward_dag = None if USE_RAY_COMPILED_DAG: diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 33163e39df38..d11d80bb5953 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -17,7 +17,7 @@ # begin-metrics-definitions class Metrics: - def __init__(self, labelnames: List[str]): + def __init__(self, labelnames: List[str], max_model_len: int): # Unregister any existing vLLM collectors for collector in list(REGISTRY._collector_to_names): if hasattr(collector, "_name") and "vllm" in collector._name: @@ -62,19 +62,13 @@ def __init__(self, labelnames: List[str]): name="vllm:request_prompt_tokens", documentation="Number of prefill 
tokens processed.", labelnames=labelnames, - buckets=[ - 1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000, 2_000, 5_000, - 10_000, 20_000, 50_000, 100_000 - ], + buckets=build_1_2_5_buckets(max_model_len), ) self.histogram_request_generation_tokens = Histogram( name="vllm:request_generation_tokens", documentation="Number of generation tokens processed.", labelnames=labelnames, - buckets=[ - 1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000, 2_000, 5_000, - 10_000, 20_000, 50_000, 100_000 - ], + buckets=build_1_2_5_buckets(max_model_len), ) self.histogram_time_to_first_token = Histogram( name="vllm:time_to_first_token_seconds", @@ -101,10 +95,7 @@ def __init__(self, labelnames: List[str]): name="vllm:request_params_max_tokens", documentation="Histogram of the max_tokens request parameter.", labelnames=labelnames, - buckets=[ - 1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000, 2_000, 5_000, - 10_000, 20_000, 50_000, 100_000 - ], + buckets=build_1_2_5_buckets(max_model_len), ) self.histogram_request_n = Histogram( name="vllm:request_params_n", @@ -129,6 +120,28 @@ def __init__(self, labelnames: List[str]): # end-metrics-definitions +def build_1_2_5_buckets(max_value: int): + """ + Builds a list of buckets with increasing powers of 10 multiplied by mantissa + values (1, 2, 5) until the value exceeds the specified maximum. + + Example: + >>> build_1_2_5_buckets(100) + [1, 2, 5, 10, 20, 50, 100] + """ + mantissa_lst = [1, 2, 5] + exponent = 0 + buckets = [] + while True: + for m in mantissa_lst: + value = m * 10**exponent + if value <= max_value: + buckets.append(value) + else: + return buckets + exponent += 1 + + @dataclass class Stats: """Created by LLMEngine for use by StatLogger.""" @@ -157,7 +170,8 @@ class Stats: class StatLogger: """StatLogger is used LLMEngine to log to Promethus and Stdout.""" - def __init__(self, local_interval: float, labels: Dict[str, str]) -> None: + def __init__(self, local_interval: float, labels: Dict[str, str], + max_model_len: int) -> None: # Metadata for logging locally. self.last_local_log = time.monotonic() self.local_interval = local_interval @@ -168,7 +182,8 @@ def __init__(self, local_interval: float, labels: Dict[str, str]) -> None: # Prometheus metrics self.labels = labels - self.metrics = Metrics(labelnames=list(labels.keys())) + self.metrics = Metrics(labelnames=list(labels.keys()), + max_model_len=max_model_len) def _get_throughput(self, tracked_stats: List[int], now: float) -> float: return float(np.sum(tracked_stats) / (now - self.last_local_log)) From 60f1049ed99baf45c10f25585741388feabf2705 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Wed, 13 Mar 2024 11:41:54 +0200 Subject: [PATCH 13/40] Fix too long line warning --- vllm/engine/metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 03c309b18159..5800fb0e2d03 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -128,8 +128,8 @@ def __init__(self, labelnames: List[str], max_model_len: int): def build_1_2_5_buckets(max_value: int): """ - Builds a list of buckets with increasing powers of 10 multiplied by mantissa - values (1, 2, 5) until the value exceeds the specified maximum. + Builds a list of buckets with increasing powers of 10 multiplied by + mantissa values (1, 2, 5) until the value exceeds the specified maximum. 
Example: >>> build_1_2_5_buckets(100) From 95daee75e3e20bc77342f136b3360467dd172965 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Tue, 26 Mar 2024 18:51:47 +0200 Subject: [PATCH 14/40] Add HTTP metrics from prometheus-fastapi-instrumentator --- requirements.txt | 1 + vllm/entrypoints/openai/api_server.py | 9 +++------ 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index 05ec2e804e13..106dc9b9bb2f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,7 @@ fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. prometheus_client >= 0.18.0 +prometheus-fastapi-instrumentator pynvml == 11.5.0 triton >= 2.1.0 outlines >= 0.0.27 diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 00407bc0e809..a061b16a3fcc 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -6,7 +6,7 @@ import importlib import inspect -from prometheus_client import make_asgi_app +from prometheus_fastapi_instrumentator import Instrumentator import fastapi import uvicorn from http import HTTPStatus @@ -48,6 +48,8 @@ async def _force_log(): app = fastapi.FastAPI(lifespan=lifespan) +# Instrument the app with HTTP metrics and expose it on /metrics +Instrumentator().instrument(app).expose(app, endpoint="/metrics") class LoRAParserAction(argparse.Action): @@ -146,11 +148,6 @@ def parse_args(): return parser.parse_args() -# Add prometheus asgi middleware to route /metrics requests -metrics_app = make_asgi_app() -app.mount("/metrics", metrics_app) - - @app.exception_handler(RequestValidationError) async def validation_exception_handler(_, exc): err = openai_serving_chat.create_error_response(message=str(exc)) From 0f8dae94595a4ba6f99f96c8abfd21819f7b84cb Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Tue, 26 Mar 2024 19:06:11 +0200 Subject: [PATCH 15/40] Make ruff happy --- vllm/engine/llm_engine.py | 2 +- vllm/engine/metrics.py | 3 ++- vllm/entrypoints/openai/api_server.py | 3 +-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 30c317502b35..5467ae0dc7dd 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1,5 +1,5 @@ -from collections import Counter as CollectionsCounter import time +from collections import Counter as CollectionsCounter from typing import Iterable, List, Optional, Tuple, Type, Union from transformers import PreTrainedTokenizer diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index c6816fa33411..ce36fdfed450 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -1,6 +1,7 @@ import time from dataclasses import dataclass -from typing import Dict, List, Counter as CollectionsCounter +from typing import Counter as CollectionsCounter +from typing import Dict, List import numpy as np from prometheus_client import (REGISTRY, Counter, Gauge, Histogram, Info, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index cb8ddaceae23..1343bbf89868 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -5,14 +5,13 @@ from contextlib import asynccontextmanager from http import HTTPStatus -from prometheus_fastapi_instrumentator import Instrumentator import fastapi import uvicorn from fastapi import Request from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, 
Response, StreamingResponse -from prometheus_client import make_asgi_app +from prometheus_fastapi_instrumentator import Instrumentator import vllm from vllm.engine.arg_utils import AsyncEngineArgs From bce096c1dcd340da7910cc30f1a382f74db8ed05 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Thu, 28 Mar 2024 15:42:34 +0200 Subject: [PATCH 16/40] Remove vllm:request_params_max_tokens --- vllm/engine/llm_engine.py | 6 ------ vllm/engine/metrics.py | 10 ---------- 2 files changed, 16 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 5467ae0dc7dd..fd443a7cb0c1 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -677,7 +677,6 @@ def _get_stats(self, num_generation_tokens = 0 num_prompt_tokens_lst = [] num_generation_tokens_lst = [] - max_tokens = [] request_n = [] time_to_first_tokens = [] time_per_output_tokens = [] @@ -706,10 +705,6 @@ def _get_stats(self, # Sampling Params if prompt_run: - max_tokens = [ - seq_group.sampling_params.max_tokens - for seq_group in scheduler_outputs.scheduled_seq_groups - ] request_n = [ seq_group.sampling_params.n for seq_group in scheduler_outputs.scheduled_seq_groups @@ -750,7 +745,6 @@ def _get_stats(self, num_generation_tokens=num_generation_tokens, num_prompt_tokens_lst=num_prompt_tokens_lst, num_generation_tokens_lst=num_generation_tokens_lst, - max_tokens=max_tokens, request_n=request_n, time_to_first_tokens=time_to_first_tokens, time_per_output_tokens=time_per_output_tokens, diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index ce36fdfed450..ce5d0120b81d 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -99,12 +99,6 @@ def __init__(self, labelnames: List[str], max_model_len: int): documentation="Histogram of end to end request latency in seconds.", labelnames=labelnames, buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0]) - self.histogram_max_tokens = Histogram( - name="vllm:request_params_max_tokens", - documentation="Histogram of the max_tokens request parameter.", - labelnames=labelnames, - buckets=build_1_2_5_buckets(max_model_len), - ) self.histogram_request_n = Histogram( name="vllm:request_params_n", documentation="Histogram of the n request parameter.", @@ -168,7 +162,6 @@ class Stats: num_generation_tokens: int num_prompt_tokens_lst: List[int] num_generation_tokens_lst: List[int] - max_tokens: List[int] request_n: List[int] time_to_first_tokens: List[float] time_per_output_tokens: List[float] @@ -240,9 +233,6 @@ def _log_prometheus(self, stats: Stats) -> None: **self.labels).observe(val) # Observe sampling params in histograms. 
- for val in stats.max_tokens: - self.metrics.histogram_max_tokens.labels( - **self.labels).observe(val) for n in stats.request_n: self.metrics.histogram_request_n.labels(**self.labels).observe(n) From e15f65394ba296ca3fe5937aa1342f6ac93e8eaf Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Fri, 29 Mar 2024 12:35:46 +0300 Subject: [PATCH 17/40] Move deprecated metrics to legacy section --- vllm/engine/metrics.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index ce5d0120b81d..e4743b7f62f7 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -54,14 +54,6 @@ def __init__(self, labelnames: List[str], max_model_len: int): labelnames=labelnames) # Raw stats from last model iteration - self.counter_prompt_tokens = Counter( - name="vllm:prompt_tokens_total", - documentation="Number of prefill tokens processed.", - labelnames=labelnames) - self.counter_generation_tokens = Counter( - name="vllm:generation_tokens_total", - documentation="Number of generation tokens processed.", - labelnames=labelnames) self.counter_request_success = Counter( name="vllm:request_success", documentation="Count of successfully processed requests.", @@ -117,6 +109,16 @@ def __init__(self, labelnames: List[str], max_model_len: int): documentation="Average generation throughput in tokens/s.", labelnames=labelnames, ) + # Deprecated in favor of vllm:request_prompt_tokens_sum + self.counter_prompt_tokens = Counter( + name="vllm:prompt_tokens_total", + documentation="Number of prefill tokens processed.", + labelnames=labelnames) + # Deprecated in favor of vllm:request_generation_tokens_sum + self.counter_generation_tokens = Counter( + name="vllm:generation_tokens_total", + documentation="Number of generation tokens processed.", + labelnames=labelnames) # end-metrics-definitions From 7b05baa309df75b861babca835c0bcf18908cf2a Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Mon, 1 Apr 2024 09:54:16 +0300 Subject: [PATCH 18/40] Add metric vllm:request_params_best_of --- vllm/engine/llm_engine.py | 6 ++++++ vllm/engine/metrics.py | 10 ++++++++++ 2 files changed, 16 insertions(+) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index fd443a7cb0c1..7b5cc4307da8 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -678,6 +678,7 @@ def _get_stats(self, num_prompt_tokens_lst = [] num_generation_tokens_lst = [] request_n = [] + request_best_of = [] time_to_first_tokens = [] time_per_output_tokens = [] time_e2e_requests = [] @@ -709,6 +710,10 @@ def _get_stats(self, seq_group.sampling_params.n for seq_group in scheduler_outputs.scheduled_seq_groups ] + request_best_of = [ + seq_group.sampling_params.best_of + for seq_group in scheduler_outputs.scheduled_seq_groups + ] # Latency Timings time_last_iters = [] @@ -746,6 +751,7 @@ def _get_stats(self, num_prompt_tokens_lst=num_prompt_tokens_lst, num_generation_tokens_lst=num_generation_tokens_lst, request_n=request_n, + request_best_of=request_best_of, time_to_first_tokens=time_to_first_tokens, time_per_output_tokens=time_per_output_tokens, time_e2e_requests=time_e2e_requests, diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index e4743b7f62f7..5659627f720f 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -97,6 +97,12 @@ def __init__(self, labelnames: List[str], max_model_len: int): labelnames=labelnames, buckets=[1, 2, 5, 10, 20], ) + self.histogram_request_best_of = Histogram( + name="vllm:request_params_best_of", 
+ documentation="Histogram of the best_of request parameter.", + labelnames=labelnames, + buckets=[1, 2, 5, 10, 20], + ) # Legacy metrics self.gauge_avg_prompt_throughput = Gauge( @@ -165,6 +171,7 @@ class Stats: num_prompt_tokens_lst: List[int] num_generation_tokens_lst: List[int] request_n: List[int] + request_best_of: List[int] time_to_first_tokens: List[float] time_per_output_tokens: List[float] time_e2e_requests: List[float] @@ -237,6 +244,9 @@ def _log_prometheus(self, stats: Stats) -> None: # Observe sampling params in histograms. for n in stats.request_n: self.metrics.histogram_request_n.labels(**self.labels).observe(n) + for best_of in stats.request_best_of: + self.metrics.histogram_request_best_of.labels( + **self.labels).observe(best_of) # Observe request level latencies in histograms. for ttft in stats.time_to_first_tokens: From 095825925cb96926df11f8b02b0a9b786464f89e Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Mon, 1 Apr 2024 10:00:23 +0300 Subject: [PATCH 19/40] Revert to exposing /metrics using make_asgi_app() --- vllm/entrypoints/openai/api_server.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 1343bbf89868..704ee20ba133 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -11,6 +11,7 @@ from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, Response, StreamingResponse +from prometheus_client import make_asgi_app from prometheus_fastapi_instrumentator import Instrumentator import vllm @@ -45,8 +46,12 @@ async def _force_log(): app = fastapi.FastAPI(lifespan=lifespan) -# Instrument the app with HTTP metrics and expose it on /metrics -Instrumentator().instrument(app).expose(app, endpoint="/metrics") +# Instrument the app with HTTP metrics +Instrumentator().instrument(app) + +# Add prometheus ASGI middleware to route /metrics requests +metrics_app = make_asgi_app() +app.mount("/metrics", metrics_app) def parse_args(): From 5e2c246f060e7ffa5a73f263e46e0f14d6ed9608 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Mon, 1 Apr 2024 10:21:29 +0300 Subject: [PATCH 20/40] Register 'finished_reason' label name on metric creation --- vllm/engine/metrics.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 5659627f720f..9da90747d568 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -13,6 +13,8 @@ disable_created_metrics() +LABEL_NAME_FINISHED_REASON = "finished_reason" + # The begin-* and end* here are used by the documentation generator # to extract the metrics definitions. @@ -57,7 +59,7 @@ def __init__(self, labelnames: List[str], max_model_len: int): self.counter_request_success = Counter( name="vllm:request_success", documentation="Count of successfully processed requests.", - labelnames=labelnames) + labelnames=labelnames + [LABEL_NAME_FINISHED_REASON]) self.histogram_request_prompt_tokens = Histogram( name="vllm:request_prompt_tokens", documentation="Number of prefill tokens processed.", @@ -227,11 +229,11 @@ def _log_prometheus(self, stats: Stats) -> None: # Add to request counters. 
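# A sketch, not part of the patch, of the call convention relied on just below:
# prometheus_client's .labels() accepts label values positionally or as keyword
# arguments, not as a single dict, hence the ** expansion in the lines that
# follow. Names are illustrative only.
from prometheus_client import Counter

demo_success = Counter("demo_success", "Demo.", ["model_name", "finished_reason"])
labels = {"model_name": "demo"}
demo_success.labels(**{**labels, "finished_reason": "stop"}).inc()  # works
# demo_success.labels({**labels, "finished_reason": "stop"})  # raises ValueError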
for finished_reason, count in stats.finished_reason_counter.items(): - self.metrics.counter_request_success.labels({ - **self.labels, - "finished_reason": - finished_reason, - }).inc(count) + self.metrics.counter_request_success.labels( + **{ + **self.labels, + LABEL_NAME_FINISHED_REASON: finished_reason, + }).inc(count) # Observe number of tokens in histograms. for val in stats.num_prompt_tokens_lst: From 1eeb31d775968ae2dc226a2b8b73773747cec81f Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Mon, 1 Apr 2024 13:31:27 +0300 Subject: [PATCH 21/40] Fix merge issues --- vllm/engine/llm_engine.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 013fd29b039e..ea11db60f135 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -736,20 +736,22 @@ def _get_stats(self, else: num_generation_tokens = scheduler_outputs.num_batched_tokens num_generation_tokens_lst = [ - seq.get_output_len() - for seq_group in scheduler_outputs.scheduled_seq_groups - for seq in seq_group.get_finished_seqs() + seq.get_output_len() for scheduled_seq_group in + scheduler_outputs.scheduled_seq_groups for seq in + scheduled_seq_group.seq_group.get_finished_seqs() ] # Sampling Params if prompt_run: request_n = [ - seq_group.sampling_params.n - for seq_group in scheduler_outputs.scheduled_seq_groups + scheduled_seq_group.seq_group.sampling_params.n + for scheduled_seq_group in + scheduler_outputs.scheduled_seq_groups ] request_best_of = [ - seq_group.sampling_params.best_of - for seq_group in scheduler_outputs.scheduled_seq_groups + scheduled_seq_group.seq_group.sampling_params.best_of + for scheduled_seq_group in + scheduler_outputs.scheduled_seq_groups ] # Latency Timings @@ -768,12 +770,12 @@ def _get_stats(self, time_per_output_tokens = [] if prompt_run else time_last_iters # Finished Requests - for seq_group in scheduler_outputs.scheduled_seq_groups: - if not seq_group.is_finished(): + for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups: + if not scheduled_seq_group.seq_group.is_finished(): continue finished_reason_counter += CollectionsCounter([ - SequenceStatus.get_finished_reason(seq.status) - for seq in seq_group.get_finished_seqs() + SequenceStatus.get_finished_reason(seq.status) for seq in + scheduled_seq_group.seq_group.get_finished_seqs() ]) return Stats( From 4c41a8968e127f93e4089a1583a44c3e2b620fba Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Wed, 17 Apr 2024 16:38:11 +0300 Subject: [PATCH 22/40] Fix merge issues --- vllm/engine/llm_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index e36c636657ed..e365cf6fc866 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -23,7 +23,7 @@ from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams from vllm.sequence import (MultiModalData, SamplerOutput, Sequence, - SequenceGroup) + SequenceGroup, SequenceStatus) from vllm.transformers_utils.detokenizer import Detokenizer from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, get_tokenizer_group) From ac8435bff5b2e53ca7a22da01bd339e2a0c13174 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Wed, 17 Apr 2024 18:25:52 +0300 Subject: [PATCH 23/40] Add 3 panels to Grafana dashboard --- examples/production_monitoring/grafana.json | 283 ++++++++++++++++++++ 1 file changed, 283 insertions(+) diff --git 
a/examples/production_monitoring/grafana.json b/examples/production_monitoring/grafana.json index 071f134c6e5e..5e9bd5bd0386 100644 --- a/examples/production_monitoring/grafana.json +++ b/examples/production_monitoring/grafana.json @@ -873,6 +873,289 @@ ], "title": "Cache Utilization", "type": "timeseries" + }, + { + "type": "heatmap", + "title": "Request Prompt Length", + "description": "Heatmap of request prompt length", + "gridPos": { + "x": 0, + "y": 24, + "w": 12, + "h": 8 + }, + "datasource": { + "uid": "prometheus", + "type": "prometheus" + }, + "id": 12, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "refId": "A", + "expr": "sum by(le) (increase(vllm:request_prompt_tokens_bucket{model_name=\"$model_name\"}[$__rate_interval]))", + "range": true, + "instant": false, + "editorMode": "builder", + "legendFormat": "{{le}}", + "useBackend": false, + "disableTextWrap": false, + "fullMetaSearch": false, + "includeNullMetadata": true, + "format": "heatmap" + } + ], + "options": { + "calculate": false, + "yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "none", + "axisLabel": "Prompt Length" + }, + "rowsFrame": { + "layout": "auto", + "value": "Request count" + }, + "color": { + "mode": "scheme", + "fill": "dark-orange", + "scale": "exponential", + "exponent": 0.5, + "scheme": "Spectral", + "steps": 64, + "reverse": false, + "min": 0 + }, + "cellGap": 1, + "filterValues": { + "le": 1e-9 + }, + "tooltip": { + "show": true, + "yHistogram": true + }, + "legend": { + "show": true + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "cellValues": { + "unit": "none" + } + }, + "fieldConfig": { + "defaults": { + "custom": { + "scaleDistribution": { + "type": "linear" + }, + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + } + } + }, + "overrides": [] + }, + "pluginVersion": "10.2.0" + }, + { + "datasource": { + "uid": "prometheus", + "type": "prometheus" + }, + "type": "heatmap", + "title": "Request Generation Length", + "description": "Heatmap of request generation length", + "gridPos": { + "x": 12, + "y": 24, + "w": 12, + "h": 8 + }, + "id": 13, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "refId": "A", + "expr": "sum by(le) (increase(vllm:request_generation_tokens_bucket{model_name=\"$model_name\"}[$__rate_interval]))", + "range": true, + "instant": false, + "editorMode": "builder", + "legendFormat": "{{le}}", + "useBackend": false, + "disableTextWrap": false, + "fullMetaSearch": false, + "includeNullMetadata": true, + "format": "heatmap" + } + ], + "options": { + "calculate": false, + "yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "none", + "axisLabel": "Generation Length" + }, + "rowsFrame": { + "layout": "auto", + "value": "Request count" + }, + "color": { + "mode": "scheme", + "fill": "dark-orange", + "scale": "exponential", + "exponent": 0.5, + "scheme": "Spectral", + "steps": 64, + "reverse": false, + "min": 0 + }, + "cellGap": 1, + "filterValues": { + "le": 1e-9 + }, + "tooltip": { + "show": true, + "yHistogram": true + }, + "legend": { + "show": true + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "cellValues": { + "unit": "none" + } + }, + "fieldConfig": { + "defaults": { + "custom": { + "scaleDistribution": { + "type": "linear" + }, + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + } + } + }, + "overrides": [] + }, + "pluginVersion": "10.2.0" + }, + { + "datasource": { + "type": "prometheus", + 
"uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "lineInterpolation": "linear", + "barAlignment": 0, + "lineWidth": 1, + "fillOpacity": 0, + "gradientMode": "none", + "spanNulls": false, + "insertNulls": false, + "showPoints": "auto", + "pointSize": 5, + "stacking": { + "mode": "none", + "group": "A" + }, + "axisPlacement": "auto", + "axisLabel": "", + "axisColorMode": "text", + "axisBorderShow": false, + "scaleDistribution": { + "type": "linear" + }, + "axisCenteredZero": false, + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 11, + "options": { + "tooltip": { + "mode": "single", + "sort": "none" + }, + "legend": { + "showLegend": true, + "displayMode": "list", + "placement": "bottom", + "calcs": [] + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum by(finished_reason) (increase(vllm:request_success_total{model_name=\"$model_name\"}[$__rate_interval]))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "interval": "", + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Finish Reason", + "description": "Number of finished requests by their finish reason: either an EOS token was generated or the max sequence length was reached.", + "type": "timeseries" } ], "refresh": "", From f22abf5b61f5e1d43de40b07a0d88edba443ff37 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Fri, 19 Apr 2024 11:55:30 +0300 Subject: [PATCH 24/40] Change order of deprecated metrics and add comments --- vllm/engine/metrics.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index ee11856e2a2a..dfcc5e8b7d17 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -106,17 +106,6 @@ def __init__(self, labelnames: List[str], max_model_len: int): buckets=[1, 2, 5, 10, 20], ) - # Legacy metrics - self.gauge_avg_prompt_throughput = Gauge( - name="vllm:avg_prompt_throughput_toks_per_s", - documentation="Average prefill throughput in tokens/s.", - labelnames=labelnames, - ) - self.gauge_avg_generation_throughput = Gauge( - name="vllm:avg_generation_throughput_toks_per_s", - documentation="Average generation throughput in tokens/s.", - labelnames=labelnames, - ) # Deprecated in favor of vllm:request_prompt_tokens_sum self.counter_prompt_tokens = Counter( name="vllm:prompt_tokens_total", @@ -127,6 +116,18 @@ def __init__(self, labelnames: List[str], max_model_len: int): name="vllm:generation_tokens_total", documentation="Number of generation tokens processed.", labelnames=labelnames) + # Deprecated in favor of vllm:prompt_tokens_total + self.gauge_avg_prompt_throughput = Gauge( + name="vllm:avg_prompt_throughput_toks_per_s", + documentation="Average prefill throughput in tokens/s.", + labelnames=labelnames, + ) + # Deprecated in favor of vllm:generation_tokens_total + self.gauge_avg_generation_throughput = Gauge( + name="vllm:avg_generation_throughput_toks_per_s", + documentation="Average generation throughput 
in tokens/s.", + labelnames=labelnames, + ) # end-metrics-definitions From 9352ce731785511d5f5bd05f0ac82bf8cbda0041 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Fri, 19 Apr 2024 12:02:41 +0300 Subject: [PATCH 25/40] Rename LABEL_NAME_FINISHED_REASON and make it a class variable of Metrics --- vllm/engine/metrics.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index dfcc5e8b7d17..a01bfc6e3a0f 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -13,14 +13,13 @@ disable_created_metrics() -LABEL_NAME_FINISHED_REASON = "finished_reason" - # The begin-* and end* here are used by the documentation generator # to extract the metrics definitions. # begin-metrics-definitions class Metrics: + labelname_finish_reason = "finished_reason" def __init__(self, labelnames: List[str], max_model_len: int): # Unregister any existing vLLM collectors @@ -59,7 +58,7 @@ def __init__(self, labelnames: List[str], max_model_len: int): self.counter_request_success = Counter( name="vllm:request_success", documentation="Count of successfully processed requests.", - labelnames=labelnames + [LABEL_NAME_FINISHED_REASON]) + labelnames=labelnames + [Metrics.labelname_finish_reason]) self.histogram_request_prompt_tokens = Histogram( name="vllm:request_prompt_tokens", documentation="Number of prefill tokens processed.", @@ -236,11 +235,11 @@ def _log_prometheus(self, stats: Stats) -> None: # Add to request counters. for finished_reason, count in stats.finished_reason_counter.items(): - self.metrics.counter_request_success.labels( - **{ - **self.labels, - LABEL_NAME_FINISHED_REASON: finished_reason, - }).inc(count) + self.metrics.counter_request_success.labels(**{ + **self.labels, + Metrics.labelname_finish_reason: + finished_reason, + }).inc(count) # Observe number of tokens in histograms. for val in stats.num_prompt_tokens_lst: From b2c0445bb10fb99e0d900235ffe40e4a49de84fe Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Fri, 19 Apr 2024 12:07:59 +0300 Subject: [PATCH 26/40] Set minimum version to prometheus-fastapi-instrumentator --- requirements-common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-common.txt b/requirements-common.txt index 13183a4ae090..4dec739595ba 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -10,7 +10,7 @@ fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. 
prometheus_client >= 0.18.0 -prometheus-fastapi-instrumentator +prometheus-fastapi-instrumentator >= 7.0.0 tiktoken == 0.6.0 # Required for DBRX tokenizer lm-format-enforcer == 0.9.3 outlines == 0.0.34 # Requires torch >= 2.1.0 From e147575d16ffe45e6eada2c6d072a9df3a86e12d Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Fri, 19 Apr 2024 13:12:36 +0300 Subject: [PATCH 27/40] Change finished_reason from counter to list --- vllm/engine/llm_engine.py | 18 ++++++++---------- vllm/engine/metrics.py | 5 +++-- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index e365cf6fc866..914e605fae39 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1,5 +1,4 @@ import time -from collections import Counter as CollectionsCounter from typing import Iterable, List, Optional, Type, Union from transformers import PreTrainedTokenizer @@ -584,7 +583,7 @@ def _get_stats(self, time_to_first_tokens = [] time_per_output_tokens = [] time_e2e_requests = [] - finished_reason_counter = CollectionsCounter() + finished_reason_lst = [] if scheduler_outputs is not None: prompt_run = scheduler_outputs.num_prefill_groups > 0 @@ -637,13 +636,12 @@ def _get_stats(self, time_per_output_tokens = [] if prompt_run else time_last_iters # Finished Requests - for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups: - if not scheduled_seq_group.seq_group.is_finished(): - continue - finished_reason_counter += CollectionsCounter([ - SequenceStatus.get_finished_reason(seq.status) for seq in - scheduled_seq_group.seq_group.get_finished_seqs() - ]) + finished_reason_lst = [ + SequenceStatus.get_finished_reason(seq.status) for + scheduled_seq_group in scheduler_outputs.scheduled_seq_groups + if scheduled_seq_group.seq_group.is_finished() + for seq in scheduled_seq_group.seq_group.get_finished_seqs() + ] return Stats( now=now, @@ -652,7 +650,7 @@ def _get_stats(self, num_waiting=num_waiting, gpu_cache_usage=gpu_cache_usage, cpu_cache_usage=cpu_cache_usage, - finished_reason_counter=finished_reason_counter, + finished_reason_lst=finished_reason_lst, num_prompt_tokens=num_prompt_tokens, num_generation_tokens=num_generation_tokens, num_prompt_tokens_lst=num_prompt_tokens_lst, diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index a01bfc6e3a0f..73f1ae5dda92 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -167,7 +167,7 @@ class Stats: cpu_cache_usage: float # Raw stats from last model iteration. - finished_reason_counter: CollectionsCounter[str] + finished_reason_lst: List[str] num_prompt_tokens: int num_generation_tokens: int num_prompt_tokens_lst: List[int] @@ -234,7 +234,8 @@ def _log_prometheus(self, stats: Stats) -> None: stats.num_generation_tokens) # Add to request counters. 
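# A small sketch, not part of the patch, of the aggregation performed below:
# the raw list of finish reasons coming from the engine is collapsed into
# per-reason counts with collections.Counter before the labeled counter is
# incremented.
from collections import Counter as CollectionsCounter

finished_reason_lst = ["stop", "length", "stop", "stop"]
finished_reason_counter = CollectionsCounter(finished_reason_lst)
print(finished_reason_counter)  # Counter({'stop': 3, 'length': 1})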
- for finished_reason, count in stats.finished_reason_counter.items(): + finished_reason_counter = CollectionsCounter(stats.finished_reason_lst) + for finished_reason, count in finished_reason_counter.items(): self.metrics.counter_request_success.labels(**{ **self.labels, Metrics.labelname_finish_reason: From f9bc64ea9bedb006608574f429356a621ea06f8e Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Fri, 19 Apr 2024 14:03:54 +0300 Subject: [PATCH 28/40] Compute deprecated metrics using the newer version Deprecated metrics: - num_prompt_tokens - num_generation_tokens --- vllm/engine/llm_engine.py | 10 ---------- vllm/engine/metrics.py | 10 ++++------ 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 914e605fae39..d99150ebc1ff 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -574,8 +574,6 @@ def _get_stats(self, num_waiting = len(self.scheduler.waiting) # Iteration stats if we have scheduler output. - num_prompt_tokens = 0 - num_generation_tokens = 0 num_prompt_tokens_lst = [] num_generation_tokens_lst = [] request_n = [] @@ -594,13 +592,7 @@ def _get_stats(self, for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups ] - num_prompt_tokens = sum(num_prompt_tokens_lst) - num_generation_tokens = sum( - scheduled_seq_group.seq_group.num_seqs() - for scheduled_seq_group in - scheduler_outputs.scheduled_seq_groups) else: - num_generation_tokens = scheduler_outputs.num_batched_tokens num_generation_tokens_lst = [ seq.get_output_len() for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups for seq in @@ -651,8 +643,6 @@ def _get_stats(self, gpu_cache_usage=gpu_cache_usage, cpu_cache_usage=cpu_cache_usage, finished_reason_lst=finished_reason_lst, - num_prompt_tokens=num_prompt_tokens, - num_generation_tokens=num_generation_tokens, num_prompt_tokens_lst=num_prompt_tokens_lst, num_generation_tokens_lst=num_generation_tokens_lst, request_n=request_n, diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 73f1ae5dda92..6e0afc81e98e 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -168,8 +168,6 @@ class Stats: # Raw stats from last model iteration. finished_reason_lst: List[str] - num_prompt_tokens: int - num_generation_tokens: int num_prompt_tokens_lst: List[int] num_generation_tokens_lst: List[int] request_n: List[int] @@ -229,9 +227,9 @@ def _log_prometheus(self, stats: Stats) -> None: # Add to token counters. self.metrics.counter_prompt_tokens.labels(**self.labels).inc( - stats.num_prompt_tokens) + sum(stats.num_prompt_tokens_lst)) self.metrics.counter_generation_tokens.labels(**self.labels).inc( - stats.num_generation_tokens) + sum(stats.num_generation_tokens_lst)) # Add to request counters. finished_reason_counter = CollectionsCounter(stats.finished_reason_lst) @@ -291,8 +289,8 @@ def log(self, stats: Stats) -> None: self._log_prometheus(stats) # Save tracked stats for token counters. - self.num_prompt_tokens.append(stats.num_prompt_tokens) - self.num_generation_tokens.append(stats.num_generation_tokens) + self.num_prompt_tokens.append(sum(stats.num_prompt_tokens_lst)) + self.num_generation_tokens.append(sum(stats.num_generation_tokens_lst)) # Log locally every local_interval seconds. if self._local_interval_elapsed(stats.now): From 5ded719d9cfd09b300562ec1d4df21fb0a4e79a3 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Fri, 19 Apr 2024 14:15:53 +0300 Subject: [PATCH 29/40] Rename variables. Strip '_lst' suffix. 
variables: - num_prompt_tokens_lst -> num_prompt_tokens - num_generation_tokens_lst -> num_generation_tokens --- vllm/engine/llm_engine.py | 12 ++++++------ vllm/engine/metrics.py | 16 ++++++++-------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index d99150ebc1ff..97c696720c70 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -574,8 +574,8 @@ def _get_stats(self, num_waiting = len(self.scheduler.waiting) # Iteration stats if we have scheduler output. - num_prompt_tokens_lst = [] - num_generation_tokens_lst = [] + num_prompt_tokens = [] + num_generation_tokens = [] request_n = [] request_best_of = [] time_to_first_tokens = [] @@ -587,13 +587,13 @@ def _get_stats(self, # Number of Tokens if prompt_run: - num_prompt_tokens_lst = [ + num_prompt_tokens = [ len(scheduled_seq_group.seq_group.prompt_token_ids) for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups ] else: - num_generation_tokens_lst = [ + num_generation_tokens = [ seq.get_output_len() for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups for seq in scheduled_seq_group.seq_group.get_finished_seqs() @@ -643,8 +643,8 @@ def _get_stats(self, gpu_cache_usage=gpu_cache_usage, cpu_cache_usage=cpu_cache_usage, finished_reason_lst=finished_reason_lst, - num_prompt_tokens_lst=num_prompt_tokens_lst, - num_generation_tokens_lst=num_generation_tokens_lst, + num_prompt_tokens=num_prompt_tokens, + num_generation_tokens=num_generation_tokens, request_n=request_n, request_best_of=request_best_of, time_to_first_tokens=time_to_first_tokens, diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 6e0afc81e98e..4aab78260ccf 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -168,8 +168,8 @@ class Stats: # Raw stats from last model iteration. finished_reason_lst: List[str] - num_prompt_tokens_lst: List[int] - num_generation_tokens_lst: List[int] + num_prompt_tokens: List[int] + num_generation_tokens: List[int] request_n: List[int] request_best_of: List[int] time_to_first_tokens: List[float] @@ -227,9 +227,9 @@ def _log_prometheus(self, stats: Stats) -> None: # Add to token counters. self.metrics.counter_prompt_tokens.labels(**self.labels).inc( - sum(stats.num_prompt_tokens_lst)) + sum(stats.num_prompt_tokens)) self.metrics.counter_generation_tokens.labels(**self.labels).inc( - sum(stats.num_generation_tokens_lst)) + sum(stats.num_generation_tokens)) # Add to request counters. finished_reason_counter = CollectionsCounter(stats.finished_reason_lst) @@ -241,10 +241,10 @@ def _log_prometheus(self, stats: Stats) -> None: }).inc(count) # Observe number of tokens in histograms. - for val in stats.num_prompt_tokens_lst: + for val in stats.num_prompt_tokens: self.metrics.histogram_request_prompt_tokens.labels( **self.labels).observe(val) - for val in stats.num_generation_tokens_lst: + for val in stats.num_generation_tokens: self.metrics.histogram_request_generation_tokens.labels( **self.labels).observe(val) @@ -289,8 +289,8 @@ def log(self, stats: Stats) -> None: self._log_prometheus(stats) # Save tracked stats for token counters. - self.num_prompt_tokens.append(sum(stats.num_prompt_tokens_lst)) - self.num_generation_tokens.append(sum(stats.num_generation_tokens_lst)) + self.num_prompt_tokens.append(sum(stats.num_prompt_tokens)) + self.num_generation_tokens.append(sum(stats.num_generation_tokens)) # Log locally every local_interval seconds. 
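The branch that follows only fires every local_interval seconds; the local log then reports throughput as the tokens buffered since the previous log divided by the elapsed wall time. A rough sketch of that calculation, with assumed inputs:

    from typing import List

    def interval_throughput(tracked_counts: List[int], now: float,
                            last_local_log: float) -> float:
        # Tokens accumulated since the previous local log / elapsed seconds.
        return float(sum(tracked_counts)) / max(now - last_local_log, 1e-9)

    # Three iterations of prompt tokens over a 5 second window -> 76.8 tok/s.
    print(interval_throughput([128, 0, 256], now=105.0, last_local_log=100.0))
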
if self._local_interval_elapsed(stats.now): From dd84d515a0da84d23fbb92e4735cdf1a07d606f9 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Fri, 19 Apr 2024 17:43:49 +0300 Subject: [PATCH 30/40] Update naming schema Stats to have the _suffix pattern Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> --- vllm/engine/llm_engine.py | 92 +++++++++++++++++++++++---------------- vllm/engine/metrics.py | 80 +++++++++++++++++++--------------- 2 files changed, 98 insertions(+), 74 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 97c696720c70..d7b552473560 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -553,47 +553,53 @@ def do_log_stats(self) -> None: def _get_stats(self, scheduler_outputs: Optional[SchedulerOutputs]) -> Stats: - """Get Stats to be Logged to Prometheus.""" + """Get Stats to be Logged to StdOut and Prometheus.""" now = time.time() - # KV Cache Usage in %. + # System State + # Scheduler State + num_running_sys = len(self.scheduler.running) + num_swapped_sys = len(self.scheduler.swapped) + num_waiting_sys = len(self.scheduler.waiting) + + # KV Cache Usage in % num_total_gpu = self.cache_config.num_gpu_blocks num_free_gpu = self.scheduler.block_manager.get_num_free_gpu_blocks() - gpu_cache_usage = 1.0 - (num_free_gpu / num_total_gpu) + gpu_cache_usage_sys = 1.0 - (num_free_gpu / num_total_gpu) num_total_cpu = self.cache_config.num_cpu_blocks - cpu_cache_usage = 0. + cpu_cache_usage_sys = 0. if num_total_cpu > 0: num_free_cpu = self.scheduler.block_manager.get_num_free_cpu_blocks( ) - cpu_cache_usage = 1.0 - (num_free_cpu / num_total_cpu) - - # Scheduler State - num_running = len(self.scheduler.running) - num_swapped = len(self.scheduler.swapped) - num_waiting = len(self.scheduler.waiting) - - # Iteration stats if we have scheduler output. 
- num_prompt_tokens = [] - num_generation_tokens = [] - request_n = [] - request_best_of = [] - time_to_first_tokens = [] - time_per_output_tokens = [] + cpu_cache_usage_sys = 1.0 - (num_free_cpu / num_total_cpu) + + # Iteration stats + time_to_first_tokens_iter = [] + time_per_output_tokens_iter = [] + + # Request stats + # Latency time_e2e_requests = [] - finished_reason_lst = [] + # Metadata + num_prompt_tokens_requests = [] + num_generation_tokens_requests = [] + best_of_requests = [] + n__requests = [] + finished_reason_requests = [] + if scheduler_outputs is not None: prompt_run = scheduler_outputs.num_prefill_groups > 0 # Number of Tokens if prompt_run: - num_prompt_tokens = [ + num_prompt_tokens_requests = [ len(scheduled_seq_group.seq_group.prompt_token_ids) for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups ] else: - num_generation_tokens = [ + num_generation_tokens_requests = [ seq.get_output_len() for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups for seq in scheduled_seq_group.seq_group.get_finished_seqs() @@ -601,12 +607,12 @@ def _get_stats(self, # Sampling Params if prompt_run: - request_n = [ + n__requests = [ scheduled_seq_group.seq_group.sampling_params.n for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups ] - request_best_of = [ + best_of_requests = [ scheduled_seq_group.seq_group.sampling_params.best_of for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups @@ -624,11 +630,11 @@ def _get_stats(self, time_e2e_requests.append(now - seq_group.metrics.arrival_time) - time_to_first_tokens = time_last_iters if prompt_run else [] - time_per_output_tokens = [] if prompt_run else time_last_iters + time_to_first_tokens_iter = time_last_iters if prompt_run else [] + time_per_output_tokens_iter = [] if prompt_run else time_last_iters # Finished Requests - finished_reason_lst = [ + finished_reason_requests = [ SequenceStatus.get_finished_reason(seq.status) for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups if scheduled_seq_group.seq_group.is_finished() @@ -637,19 +643,29 @@ def _get_stats(self, return Stats( now=now, - num_running=num_running, - num_swapped=num_swapped, - num_waiting=num_waiting, - gpu_cache_usage=gpu_cache_usage, - cpu_cache_usage=cpu_cache_usage, - finished_reason_lst=finished_reason_lst, - num_prompt_tokens=num_prompt_tokens, - num_generation_tokens=num_generation_tokens, - request_n=request_n, - request_best_of=request_best_of, - time_to_first_tokens=time_to_first_tokens, - time_per_output_tokens=time_per_output_tokens, + + # System stats + # Scheduler State + num_running_sys=num_running_sys, + num_swapped_sys=num_swapped_sys, + num_waiting_sys=num_waiting_sys, + # KV Cache Usage in % + gpu_cache_usage_sys=gpu_cache_usage_sys, + cpu_cache_usage_sys=cpu_cache_usage_sys, + + # Iteration stats + time_to_first_tokens_iter=time_to_first_tokens_iter, + time_per_output_tokens_iter=time_per_output_tokens_iter, + + # Request stats + # Latency time_e2e_requests=time_e2e_requests, + # Metadata + num_prompt_tokens_requests=num_prompt_tokens_requests, + num_generation_tokens_requests=num_generation_tokens_requests, + best_of_requests=best_of_requests, + n_requests=n__requests, + finished_reason_requests=finished_reason_requests, ) def add_lora(self, lora_request: LoRARequest) -> bool: diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 4aab78260ccf..54602ffc1b7b 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -159,22 +159,28 @@ class Stats: """Created by LLMEngine for 
use by StatLogger.""" now: float - # System stats. - num_running: int - num_waiting: int - num_swapped: int - gpu_cache_usage: float - cpu_cache_usage: float - - # Raw stats from last model iteration. - finished_reason_lst: List[str] - num_prompt_tokens: List[int] - num_generation_tokens: List[int] - request_n: List[int] - request_best_of: List[int] - time_to_first_tokens: List[float] - time_per_output_tokens: List[float] + # System stats (should have _sys suffix) + # Scheduler State + num_running_sys: int + num_waiting_sys: int + num_swapped_sys: int + # KV Cache Usage in % + gpu_cache_usage_sys: float + cpu_cache_usage_sys: float + + # Iteration stats (should have _iter suffix) + time_to_first_tokens_iter: List[float] + time_per_output_tokens_iter: List[float] + + # Request stats (should have _requests suffix) + # Latency time_e2e_requests: List[float] + # Metadata + num_prompt_tokens_requests: List[int] + num_generation_tokens_requests: List[int] + best_of_requests: List[int] + n_requests: List[int] + finished_reason_requests: List[str] class SupportsMetricsInfo(Protocol): @@ -215,24 +221,25 @@ def _local_interval_elapsed(self, now: float) -> bool: def _log_prometheus(self, stats: Stats) -> None: # Set system stat gauges. self.metrics.gauge_scheduler_running.labels(**self.labels).set( - stats.num_running) + stats.num_running_sys) self.metrics.gauge_scheduler_swapped.labels(**self.labels).set( - stats.num_swapped) + stats.num_swapped_sys) self.metrics.gauge_scheduler_waiting.labels(**self.labels).set( - stats.num_waiting) + stats.num_waiting_sys) self.metrics.gauge_gpu_cache_usage.labels(**self.labels).set( - stats.gpu_cache_usage) + stats.gpu_cache_usage_sys) self.metrics.gauge_cpu_cache_usage.labels(**self.labels).set( - stats.cpu_cache_usage) + stats.cpu_cache_usage_sys) # Add to token counters. self.metrics.counter_prompt_tokens.labels(**self.labels).inc( - sum(stats.num_prompt_tokens)) + sum(stats.num_prompt_tokens_requests)) self.metrics.counter_generation_tokens.labels(**self.labels).inc( - sum(stats.num_generation_tokens)) + sum(stats.num_generation_tokens_requests)) # Add to request counters. - finished_reason_counter = CollectionsCounter(stats.finished_reason_lst) + finished_reason_counter = CollectionsCounter( + stats.finished_reason_requests) for finished_reason, count in finished_reason_counter.items(): self.metrics.counter_request_success.labels(**{ **self.labels, @@ -241,25 +248,25 @@ def _log_prometheus(self, stats: Stats) -> None: }).inc(count) # Observe number of tokens in histograms. - for val in stats.num_prompt_tokens: + for val in stats.num_prompt_tokens_requests: self.metrics.histogram_request_prompt_tokens.labels( **self.labels).observe(val) - for val in stats.num_generation_tokens: + for val in stats.num_generation_tokens_requests: self.metrics.histogram_request_generation_tokens.labels( **self.labels).observe(val) # Observe sampling params in histograms. - for n in stats.request_n: + for n in stats.n_requests: self.metrics.histogram_request_n.labels(**self.labels).observe(n) - for best_of in stats.request_best_of: + for best_of in stats.best_of_requests: self.metrics.histogram_request_best_of.labels( **self.labels).observe(best_of) # Observe request level latencies in histograms. 
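Each of the loops renamed below pushes one sample at a time into a labelled prometheus_client Histogram. A minimal standalone sketch of that call pattern (metric name, label and buckets are illustrative):

    from prometheus_client import Histogram

    ttft_seconds = Histogram(
        "demo_time_to_first_token_seconds",  # illustrative name only
        "Histogram of time to first token in seconds.",
        labelnames=["model_name"],
        buckets=[0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0])

    # One observe() per measured value; Prometheus exposes the result as
    # _bucket, _sum and _count series that can be aggregated at query time.
    for ttft in [0.021, 0.087, 0.143]:
        ttft_seconds.labels(model_name="demo-model").observe(ttft)
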
- for ttft in stats.time_to_first_tokens: + for ttft in stats.time_to_first_tokens_iter: self.metrics.histogram_time_to_first_token.labels( **self.labels).observe(ttft) - for tpot in stats.time_per_output_tokens: + for tpot in stats.time_per_output_tokens_iter: self.metrics.histogram_time_per_output_token.labels( **self.labels).observe(tpot) for e2e in stats.time_e2e_requests: @@ -289,8 +296,9 @@ def log(self, stats: Stats) -> None: self._log_prometheus(stats) # Save tracked stats for token counters. - self.num_prompt_tokens.append(sum(stats.num_prompt_tokens)) - self.num_generation_tokens.append(sum(stats.num_generation_tokens)) + self.num_prompt_tokens.append(sum(stats.num_prompt_tokens_requests)) + self.num_generation_tokens.append( + sum(stats.num_generation_tokens_requests)) # Log locally every local_interval seconds. if self._local_interval_elapsed(stats.now): @@ -309,11 +317,11 @@ def log(self, stats: Stats) -> None: f"Avg prompt throughput: {prompt_throughput:.1f} tokens/s, " f"Avg generation throughput: " f"{generation_throughput:.1f} tokens/s, " - f"Running: {stats.num_running} reqs, " - f"Swapped: {stats.num_swapped} reqs, " - f"Pending: {stats.num_waiting} reqs, " - f"GPU KV cache usage: {stats.gpu_cache_usage * 100:.1f}%, " - f"CPU KV cache usage: {stats.cpu_cache_usage * 100:.1f}%") + f"Running: {stats.num_running_sys} reqs, " + f"Swapped: {stats.num_swapped_sys} reqs, " + f"Pending: {stats.num_waiting_sys} reqs, " + f"GPU KV cache usage: {stats.gpu_cache_usage_sys * 100:.1f}%, " + f"CPU KV cache usage: {stats.cpu_cache_usage_sys * 100:.1f}%") # Reset tracked stats for next interval. self.num_prompt_tokens = [] From e127a4c71d3abf5485c087ae63c8458a545e4613 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Fri, 19 Apr 2024 18:22:30 +0300 Subject: [PATCH 31/40] Fix the incorrect logic for chunked prefill Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> --- vllm/engine/llm_engine.py | 85 ++++++++++++++++++--------------------- vllm/sequence.py | 18 +++++++-- 2 files changed, 53 insertions(+), 50 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index d7b552473560..64d6169e0db5 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -585,61 +585,54 @@ def _get_stats(self, num_prompt_tokens_requests = [] num_generation_tokens_requests = [] best_of_requests = [] - n__requests = [] + n_requests = [] finished_reason_requests = [] if scheduler_outputs is not None: - prompt_run = scheduler_outputs.num_prefill_groups > 0 - - # Number of Tokens - if prompt_run: - num_prompt_tokens_requests = [ - len(scheduled_seq_group.seq_group.prompt_token_ids) - for scheduled_seq_group in - scheduler_outputs.scheduled_seq_groups - ] - else: - num_generation_tokens_requests = [ - seq.get_output_len() for scheduled_seq_group in - scheduler_outputs.scheduled_seq_groups for seq in - scheduled_seq_group.seq_group.get_finished_seqs() - ] - - # Sampling Params - if prompt_run: - n__requests = [ - scheduled_seq_group.seq_group.sampling_params.n - for scheduled_seq_group in - scheduler_outputs.scheduled_seq_groups - ] - best_of_requests = [ - scheduled_seq_group.seq_group.sampling_params.best_of - for scheduled_seq_group in - scheduler_outputs.scheduled_seq_groups - ] - - # Latency Timings - time_last_iters = [] for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups: seq_group = scheduled_seq_group.seq_group - # Time since last token. + + # Iteration time # (n.b. 
updates seq_group.metrics.last_token_time) - time_last_iters.append(seq_group.get_last_latency(now)) - # Time since arrival for all finished requests. + latency = seq_group.maybe_get_last_latency(now) + + # Number of tokens (for throughput calculations). + # Because of chunked prefill, we can have prefills and decodes + # in the same batch. + # If token_chunk_size > 1, this seq_group is a chunk of a + # prefill. + # If token_chunk_size = 1, this seq_group is a decode. + if (seq_group.is_prefill() + or scheduled_seq_group.token_chunk_size > 1): + num_prompt_tokens_iter += ( + scheduled_seq_group.token_chunk_size) + if latency is not None: + time_to_first_tokens_iter.append(latency) + else: + assert latency is not None + time_per_output_tokens_iter.append(latency) + + # Because of chunked prefill, we can have a single sequence + # group that does multiple prompt_runs. To prevent logging + # the same metadata more than once per request, we standardize + # on logging request level information for finished requests, + # which can only happen once. if seq_group.is_finished(): + # Latency timings time_e2e_requests.append(now - seq_group.metrics.arrival_time) - time_to_first_tokens_iter = time_last_iters if prompt_run else [] - time_per_output_tokens_iter = [] if prompt_run else time_last_iters - - # Finished Requests - finished_reason_requests = [ - SequenceStatus.get_finished_reason(seq.status) for - scheduled_seq_group in scheduler_outputs.scheduled_seq_groups - if scheduled_seq_group.seq_group.is_finished() - for seq in scheduled_seq_group.seq_group.get_finished_seqs() - ] + # Metadata + num_prompt_tokens_requests.append( + len(seq_group.prompt_token_ids)) + num_generation_tokens_requests.extend( + seq.get_output_len() + for seq in seq_group.get_finished_seqs()) + best_of_requests.append(seq_group.sampling_params.best_of) + n_requests.append(seq_group.sampling_params.n) + finished_reason_requests.extend( + SequenceStatus.get_finished_reason(seq.status) + for seq in seq_group.get_finished_seqs()) return Stats( now=now, @@ -664,7 +657,7 @@ def _get_stats(self, num_prompt_tokens_requests=num_prompt_tokens_requests, num_generation_tokens_requests=num_generation_tokens_requests, best_of_requests=best_of_requests, - n_requests=n__requests, + n_requests=n_requests, finished_reason_requests=finished_reason_requests, ) diff --git a/vllm/sequence.py b/vllm/sequence.py index 92362a9a5d2a..2f3cda77e822 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -160,7 +160,7 @@ def reset_state_for_recompute(self) -> None: self._stage = SequenceStage.PREFILL def get_num_uncomputed_tokens(self) -> int: - """Return the number of prefil tokens that are not computed.""" + """Return the number of prefill tokens that are not computed.""" # we use `get_len()` which includes prompt_len + output_len instead # of prompt_len here. This is because during recompute we need to # prefill for both prompt and output. @@ -442,15 +442,25 @@ def prompt_token_ids(self) -> List[int]: def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 - def get_last_latency(self, now: float) -> float: - """Gets last token latency for Request level timings.""" + def maybe_get_last_latency(self, now: float) -> Optional[float]: + """Sets the last token time for Request level timings.""" + # If prompt not done, return None. + if self.is_prefill(): + return None + + # Otherwise return token latency. 
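The method here is plain stopwatch bookkeeping: report the time since the previous token, then reset the reference point. The same idea in isolation (class and attribute names are stand-ins, not the engine's types):

    import time

    class TokenClock:
        """Stand-in for the seq_group.metrics.last_token_time bookkeeping."""

        def __init__(self) -> None:
            self.last_token_time = time.time()

        def last_latency(self, now: float) -> float:
            # Seconds since the previous token, then restart the measurement.
            latency = now - self.last_token_time
            self.last_token_time = now
            return latency
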
latency = now - self.metrics.last_token_time self.metrics.last_token_time = now return latency def maybe_set_first_token_time(self, time: float) -> None: """Sets the first token time for Request level timings.""" - if self.metrics.first_token_time is None: + # Note: in a case where a sequence_group is swapped and + # recomputed, the time between iterations is counted + # in TPOT, rather than recalculating TTFT (since from the ) + # POV of the user, there is simply a long generation delay. + if (self.metrics.first_token_time is None + and self.get_seqs()[0].get_output_len() == 1): self.metrics.first_token_time = time def maybe_set_first_scheduled_time(self, time: float) -> None: From 2d366096f0e3b755b2ae973e16483ccc7825cc7e Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Fri, 19 Apr 2024 18:38:21 +0300 Subject: [PATCH 32/40] Restore num_prompt_tokens_iter and num_generation_tokens_iter Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> --- vllm/engine/llm_engine.py | 6 ++++++ vllm/engine/metrics.py | 29 ++++++++++++++--------------- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 64d6169e0db5..7e9255771e26 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -575,6 +575,8 @@ def _get_stats(self, cpu_cache_usage_sys = 1.0 - (num_free_cpu / num_total_cpu) # Iteration stats + num_prompt_tokens_iter = 0 + num_generation_tokens_iter = 0 time_to_first_tokens_iter = [] time_per_output_tokens_iter = [] @@ -609,6 +611,8 @@ def _get_stats(self, if latency is not None: time_to_first_tokens_iter.append(latency) else: + num_generation_tokens_iter += seq_group.num_unfinished_seqs( + ) assert latency is not None time_per_output_tokens_iter.append(latency) @@ -647,6 +651,8 @@ def _get_stats(self, cpu_cache_usage_sys=cpu_cache_usage_sys, # Iteration stats + num_prompt_tokens_iter=num_prompt_tokens_iter, + num_generation_tokens_iter=num_generation_tokens_iter, time_to_first_tokens_iter=time_to_first_tokens_iter, time_per_output_tokens_iter=time_per_output_tokens_iter, diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 54602ffc1b7b..43306afa4132 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -55,6 +55,14 @@ def __init__(self, labelnames: List[str], max_model_len: int): labelnames=labelnames) # Raw stats from last model iteration + self.counter_prompt_tokens = Counter( + name="vllm:prompt_tokens_total", + documentation="Number of prefill tokens processed.", + labelnames=labelnames) + self.counter_generation_tokens = Counter( + name="vllm:generation_tokens_total", + documentation="Number of generation tokens processed.", + labelnames=labelnames) self.counter_request_success = Counter( name="vllm:request_success", documentation="Count of successfully processed requests.", @@ -105,16 +113,6 @@ def __init__(self, labelnames: List[str], max_model_len: int): buckets=[1, 2, 5, 10, 20], ) - # Deprecated in favor of vllm:request_prompt_tokens_sum - self.counter_prompt_tokens = Counter( - name="vllm:prompt_tokens_total", - documentation="Number of prefill tokens processed.", - labelnames=labelnames) - # Deprecated in favor of vllm:request_generation_tokens_sum - self.counter_generation_tokens = Counter( - name="vllm:generation_tokens_total", - documentation="Number of generation tokens processed.", - labelnames=labelnames) # Deprecated in favor of vllm:prompt_tokens_total self.gauge_avg_prompt_throughput = Gauge( 
name="vllm:avg_prompt_throughput_toks_per_s", @@ -169,6 +167,8 @@ class Stats: cpu_cache_usage_sys: float # Iteration stats (should have _iter suffix) + num_prompt_tokens_iter: int + num_generation_tokens_iter: int time_to_first_tokens_iter: List[float] time_per_output_tokens_iter: List[float] @@ -233,9 +233,9 @@ def _log_prometheus(self, stats: Stats) -> None: # Add to token counters. self.metrics.counter_prompt_tokens.labels(**self.labels).inc( - sum(stats.num_prompt_tokens_requests)) + stats.num_prompt_tokens_iter) self.metrics.counter_generation_tokens.labels(**self.labels).inc( - sum(stats.num_generation_tokens_requests)) + stats.num_generation_tokens_iter) # Add to request counters. finished_reason_counter = CollectionsCounter( @@ -296,9 +296,8 @@ def log(self, stats: Stats) -> None: self._log_prometheus(stats) # Save tracked stats for token counters. - self.num_prompt_tokens.append(sum(stats.num_prompt_tokens_requests)) - self.num_generation_tokens.append( - sum(stats.num_generation_tokens_requests)) + self.num_prompt_tokens.append(stats.num_prompt_tokens_iter) + self.num_generation_tokens.append(stats.num_generation_tokens_iter) # Log locally every local_interval seconds. if self._local_interval_elapsed(stats.now): From e81d95a55a3bd13518d33260d26182d30b19acf7 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Fri, 19 Apr 2024 19:00:58 +0300 Subject: [PATCH 33/40] Refactor metrics logging methods Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> --- vllm/engine/metrics.py | 110 ++++++++++++++++++++++------------------- 1 file changed, 58 insertions(+), 52 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 43306afa4132..251db938ccb0 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -1,7 +1,7 @@ import time from dataclasses import dataclass from typing import Counter as CollectionsCounter -from typing import Dict, List, Protocol +from typing import Dict, List, Protocol, Union import numpy as np from prometheus_client import (REGISTRY, Counter, Gauge, Histogram, Info, @@ -219,59 +219,65 @@ def _local_interval_elapsed(self, now: float) -> bool: return elapsed_time > self.local_interval def _log_prometheus(self, stats: Stats) -> None: - # Set system stat gauges. - self.metrics.gauge_scheduler_running.labels(**self.labels).set( - stats.num_running_sys) - self.metrics.gauge_scheduler_swapped.labels(**self.labels).set( - stats.num_swapped_sys) - self.metrics.gauge_scheduler_waiting.labels(**self.labels).set( - stats.num_waiting_sys) - self.metrics.gauge_gpu_cache_usage.labels(**self.labels).set( - stats.gpu_cache_usage_sys) - self.metrics.gauge_cpu_cache_usage.labels(**self.labels).set( - stats.cpu_cache_usage_sys) - - # Add to token counters. - self.metrics.counter_prompt_tokens.labels(**self.labels).inc( - stats.num_prompt_tokens_iter) - self.metrics.counter_generation_tokens.labels(**self.labels).inc( - stats.num_generation_tokens_iter) - - # Add to request counters. 
+ # System state data + self._log_gauge(self.metrics.gauge_scheduler_running, + stats.num_running_sys) + self._log_gauge(self.metrics.gauge_scheduler_swapped, + stats.num_swapped_sys) + self._log_gauge(self.metrics.gauge_scheduler_waiting, + stats.num_waiting_sys) + self._log_gauge(self.metrics.gauge_gpu_cache_usage, + stats.gpu_cache_usage_sys) + self._log_gauge(self.metrics.gauge_cpu_cache_usage, + stats.cpu_cache_usage_sys) + + # Iteration level data + self._log_counter(self.metrics.counter_prompt_tokens, + stats.num_prompt_tokens_iter) + self._log_counter(self.metrics.counter_generation_tokens, + stats.num_generation_tokens_iter) + self._log_histogram(self.metrics.histogram_time_to_first_token, + stats.time_to_first_tokens_iter) + self._log_histogram(self.metrics.histogram_time_per_output_token, + stats.time_per_output_tokens_iter) + + # Request level data + # Latency + self._log_histogram(self.metrics.histogram_e2e_request_latency, + stats.time_e2e_requests) + # Metadata finished_reason_counter = CollectionsCounter( stats.finished_reason_requests) - for finished_reason, count in finished_reason_counter.items(): - self.metrics.counter_request_success.labels(**{ - **self.labels, - Metrics.labelname_finish_reason: - finished_reason, - }).inc(count) - - # Observe number of tokens in histograms. - for val in stats.num_prompt_tokens_requests: - self.metrics.histogram_request_prompt_tokens.labels( - **self.labels).observe(val) - for val in stats.num_generation_tokens_requests: - self.metrics.histogram_request_generation_tokens.labels( - **self.labels).observe(val) - - # Observe sampling params in histograms. - for n in stats.n_requests: - self.metrics.histogram_request_n.labels(**self.labels).observe(n) - for best_of in stats.best_of_requests: - self.metrics.histogram_request_best_of.labels( - **self.labels).observe(best_of) - - # Observe request level latencies in histograms. - for ttft in stats.time_to_first_tokens_iter: - self.metrics.histogram_time_to_first_token.labels( - **self.labels).observe(ttft) - for tpot in stats.time_per_output_tokens_iter: - self.metrics.histogram_time_per_output_token.labels( - **self.labels).observe(tpot) - for e2e in stats.time_e2e_requests: - self.metrics.histogram_e2e_request_latency.labels( - **self.labels).observe(e2e) + self._log_counter_labels(self.metrics.counter_request_success, + finished_reason_counter, + Metrics.labelname_finish_reason) + self._log_histogram(self.metrics.histogram_request_prompt_tokens, + stats.num_prompt_tokens_requests) + self._log_histogram(self.metrics.histogram_request_generation_tokens, + stats.num_generation_tokens_requests) + self._log_histogram(self.metrics.histogram_request_n, stats.n_requests) + self._log_histogram(self.metrics.histogram_request_best_of, + stats.best_of_requests) + + def _log_gauge(self, gauge: Gauge, data: Union[int, float]) -> None: + # Convenience function for logging to gauge. + gauge.labels(**self.labels).set(data) + + def _log_counter(self, counter: Counter, data: Union[int, float]) -> None: + # Convenience function for logging to counter. + counter.labels(**self.labels).inc(data) + + def _log_counter_labels(self, counter: Counter, data: CollectionsCounter, + label_key: str) -> None: + # Convenience function for collection counter of labels. + for label, count in data.items(): + counter.labels(**{**self.labels, label_key: label}).inc(count) + + def _log_histogram(self, histogram: Histogram, + data: Union[List[int], List[float]]) -> None: + # Convenience function for logging list to histogram. 
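The _log_gauge/_log_counter/_log_histogram helpers introduced in this hunk are thin typed wrappers whose only job is to apply the same base labels at every call site. Sketched in isolation (names and label values are illustrative):

    from typing import Dict, List, Union

    from prometheus_client import Gauge, Histogram

    base_labels: Dict[str, str] = {"model_name": "demo-model"}

    gauge_running = Gauge("demo_num_requests_running",
                          "Number of requests currently running.",
                          labelnames=["model_name"])
    hist_e2e = Histogram("demo_e2e_request_latency_seconds",
                         "End to end request latency in seconds.",
                         labelnames=["model_name"])

    def log_gauge(gauge: Gauge, value: Union[int, float]) -> None:
        gauge.labels(**base_labels).set(value)

    def log_histogram(hist: Histogram, values: List[float]) -> None:
        for value in values:
            hist.labels(**base_labels).observe(value)

    log_gauge(gauge_running, 3)
    log_histogram(hist_e2e, [0.8, 2.4])
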
+ for datum in data: + histogram.labels(**self.labels).observe(datum) def _log_prometheus_interval(self, prompt_throughput: float, generation_throughput: float) -> None: From ece2ec0ffe7602f8914d6616564f107627a2ca0e Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Sat, 20 Apr 2024 00:18:57 +0300 Subject: [PATCH 34/40] Reorder metrics definition to match Stats order Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> --- vllm/engine/metrics.py | 56 +++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 251db938ccb0..8664d9a91c40 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -33,18 +33,20 @@ def __init__(self, labelnames: List[str], max_model_len: int): documentation='information of cache_config') # System stats + # Scheduler State self.gauge_scheduler_running = Gauge( name="vllm:num_requests_running", documentation="Number of requests currently running on GPU.", labelnames=labelnames) - self.gauge_scheduler_swapped = Gauge( - name="vllm:num_requests_swapped", - documentation="Number of requests swapped to CPU.", - labelnames=labelnames) self.gauge_scheduler_waiting = Gauge( name="vllm:num_requests_waiting", documentation="Number of requests waiting to be processed.", labelnames=labelnames) + self.gauge_scheduler_swapped = Gauge( + name="vllm:num_requests_swapped", + documentation="Number of requests swapped to CPU.", + labelnames=labelnames) + # KV Cache Usage in % self.gauge_gpu_cache_usage = Gauge( name="vllm:gpu_cache_usage_perc", documentation="GPU KV-cache usage. 1 means 100 percent usage.", @@ -54,7 +56,7 @@ def __init__(self, labelnames: List[str], max_model_len: int): documentation="CPU KV-cache usage. 
1 means 100 percent usage.", labelnames=labelnames) - # Raw stats from last model iteration + # Iteration stats self.counter_prompt_tokens = Counter( name="vllm:prompt_tokens_total", documentation="Number of prefill tokens processed.", @@ -63,22 +65,6 @@ def __init__(self, labelnames: List[str], max_model_len: int): name="vllm:generation_tokens_total", documentation="Number of generation tokens processed.", labelnames=labelnames) - self.counter_request_success = Counter( - name="vllm:request_success", - documentation="Count of successfully processed requests.", - labelnames=labelnames + [Metrics.labelname_finish_reason]) - self.histogram_request_prompt_tokens = Histogram( - name="vllm:request_prompt_tokens", - documentation="Number of prefill tokens processed.", - labelnames=labelnames, - buckets=build_1_2_5_buckets(max_model_len), - ) - self.histogram_request_generation_tokens = Histogram( - name="vllm:request_generation_tokens", - documentation="Number of generation tokens processed.", - labelnames=labelnames, - buckets=build_1_2_5_buckets(max_model_len), - ) self.histogram_time_to_first_token = Histogram( name="vllm:time_to_first_token_seconds", documentation="Histogram of time to first token in seconds.", @@ -95,16 +81,26 @@ def __init__(self, labelnames: List[str], max_model_len: int): 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.5 ]) + + # Request stats + # Latency self.histogram_e2e_request_latency = Histogram( name="vllm:e2e_request_latency_seconds", documentation="Histogram of end to end request latency in seconds.", labelnames=labelnames, buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0]) - self.histogram_request_n = Histogram( - name="vllm:request_params_n", - documentation="Histogram of the n request parameter.", + # Metadata + self.histogram_request_prompt_tokens = Histogram( + name="vllm:request_prompt_tokens", + documentation="Number of prefill tokens processed.", labelnames=labelnames, - buckets=[1, 2, 5, 10, 20], + buckets=build_1_2_5_buckets(max_model_len), + ) + self.histogram_request_generation_tokens = Histogram( + name="vllm:request_generation_tokens", + documentation="Number of generation tokens processed.", + labelnames=labelnames, + buckets=build_1_2_5_buckets(max_model_len), ) self.histogram_request_best_of = Histogram( name="vllm:request_params_best_of", @@ -112,6 +108,16 @@ def __init__(self, labelnames: List[str], max_model_len: int): labelnames=labelnames, buckets=[1, 2, 5, 10, 20], ) + self.histogram_request_n = Histogram( + name="vllm:request_params_n", + documentation="Histogram of the n request parameter.", + labelnames=labelnames, + buckets=[1, 2, 5, 10, 20], + ) + self.counter_request_success = Counter( + name="vllm:request_success", + documentation="Count of successfully processed requests.", + labelnames=labelnames + [Metrics.labelname_finish_reason]) # Deprecated in favor of vllm:prompt_tokens_total self.gauge_avg_prompt_throughput = Gauge( From 5a658c8fcb5dc3a34ea44fb33bd45c073c018f20 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Sat, 20 Apr 2024 00:25:34 +0300 Subject: [PATCH 35/40] Rename metric variables to match suffix convention Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> --- vllm/engine/metrics.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 8664d9a91c40..fb5aaef328d7 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -84,31 +84,31 @@ 
def __init__(self, labelnames: List[str], max_model_len: int): # Request stats # Latency - self.histogram_e2e_request_latency = Histogram( + self.histogram_e2e_time_request = Histogram( name="vllm:e2e_request_latency_seconds", documentation="Histogram of end to end request latency in seconds.", labelnames=labelnames, buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0]) # Metadata - self.histogram_request_prompt_tokens = Histogram( + self.histogram_num_prompt_tokens_request = Histogram( name="vllm:request_prompt_tokens", documentation="Number of prefill tokens processed.", labelnames=labelnames, buckets=build_1_2_5_buckets(max_model_len), ) - self.histogram_request_generation_tokens = Histogram( + self.histogram_num_generation_tokens_request = Histogram( name="vllm:request_generation_tokens", documentation="Number of generation tokens processed.", labelnames=labelnames, buckets=build_1_2_5_buckets(max_model_len), ) - self.histogram_request_best_of = Histogram( + self.histogram_best_of_request = Histogram( name="vllm:request_params_best_of", documentation="Histogram of the best_of request parameter.", labelnames=labelnames, buckets=[1, 2, 5, 10, 20], ) - self.histogram_request_n = Histogram( + self.histogram_n_request = Histogram( name="vllm:request_params_n", documentation="Histogram of the n request parameter.", labelnames=labelnames, @@ -249,7 +249,7 @@ def _log_prometheus(self, stats: Stats) -> None: # Request level data # Latency - self._log_histogram(self.metrics.histogram_e2e_request_latency, + self._log_histogram(self.metrics.histogram_e2e_time_request, stats.time_e2e_requests) # Metadata finished_reason_counter = CollectionsCounter( @@ -257,12 +257,13 @@ def _log_prometheus(self, stats: Stats) -> None: self._log_counter_labels(self.metrics.counter_request_success, finished_reason_counter, Metrics.labelname_finish_reason) - self._log_histogram(self.metrics.histogram_request_prompt_tokens, + self._log_histogram(self.metrics.histogram_num_prompt_tokens_request, stats.num_prompt_tokens_requests) - self._log_histogram(self.metrics.histogram_request_generation_tokens, - stats.num_generation_tokens_requests) - self._log_histogram(self.metrics.histogram_request_n, stats.n_requests) - self._log_histogram(self.metrics.histogram_request_best_of, + self._log_histogram( + self.metrics.histogram_num_generation_tokens_request, + stats.num_generation_tokens_requests) + self._log_histogram(self.metrics.histogram_n_request, stats.n_requests) + self._log_histogram(self.metrics.histogram_best_of_request, stats.best_of_requests) def _log_gauge(self, gauge: Gauge, data: Union[int, float]) -> None: From 717b5593be8c41107a5de9f8f73c278f0a01025a Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Sat, 20 Apr 2024 15:06:01 +0300 Subject: [PATCH 36/40] Make mypy happy --- vllm/engine/llm_engine.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 7e9255771e26..b8fe2adf70bf 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -577,18 +577,18 @@ def _get_stats(self, # Iteration stats num_prompt_tokens_iter = 0 num_generation_tokens_iter = 0 - time_to_first_tokens_iter = [] - time_per_output_tokens_iter = [] + time_to_first_tokens_iter: List[float] = [] + time_per_output_tokens_iter: List[float] = [] # Request stats # Latency - time_e2e_requests = [] + time_e2e_requests: List[float] = [] # Metadata - num_prompt_tokens_requests = [] - num_generation_tokens_requests = [] - 
best_of_requests = [] - n_requests = [] - finished_reason_requests = [] + num_prompt_tokens_requests: List[int] = [] + num_generation_tokens_requests: List[int] = [] + best_of_requests: List[int] = [] + n_requests: List[int] = [] + finished_reason_requests: List[str] = [] if scheduler_outputs is not None: for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups: From f103ad89c49bfe4980eeacad0b0d0ac75087bc57 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 25 Apr 2024 21:36:14 +0000 Subject: [PATCH 37/40] ./format --- vllm/engine/llm_engine.py | 4 ++-- vllm/engine/metrics.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 0879500c9e82..182aab0c54d4 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -22,7 +22,7 @@ from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams from vllm.sequence import (MultiModalData, SamplerOutput, Sequence, - SequenceGroup, SequenceStage) + SequenceGroup, SequenceStage, SequenceStatus) from vllm.transformers_utils.detokenizer import Detokenizer from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, get_tokenizer_group) @@ -705,7 +705,7 @@ def _get_stats( time_to_first_tokens_iter=time_to_first_tokens_iter, time_per_output_tokens_iter=time_per_output_tokens_iter, spec_decode_metrics=spec_decode_metrics, - + # Request stats # Latency time_e2e_requests=time_e2e_requests, diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 4ded6e8863a6..a2776a8cecea 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -1,7 +1,8 @@ import time from dataclasses import dataclass +from typing import TYPE_CHECKING from typing import Counter as CollectionsCounter -from typing import TYPE_CHECKING, Dict, List, Optional, Protocol +from typing import Dict, List, Optional, Protocol, Union import numpy as np from prometheus_client import (REGISTRY, Counter, Gauge, Histogram, Info, From cc0d5eb140655b4fa27df4e1fd3283e0ac6183cd Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 28 Apr 2024 20:00:52 +0000 Subject: [PATCH 38/40] fixed chunked prefill logic --- vllm/engine/llm_engine.py | 40 ++++++++++++--------------- vllm/entrypoints/openai/api_server.py | 11 +++----- vllm/sequence.py | 2 +- 3 files changed, 23 insertions(+), 30 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index e0ebeeb24370..fa5c0c2a332d 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -22,8 +22,7 @@ from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams from vllm.sequence import (MultiModalData, SamplerOutput, Sequence, - SequenceGroup, SequenceGroupMetadata, - SequenceStage, SequenceStatus) + SequenceGroup, SequenceGroupMetadata, SequenceStatus) from vllm.transformers_utils.detokenizer import Detokenizer from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, get_tokenizer_group) @@ -656,29 +655,29 @@ def _get_stats( finished_reason_requests: List[str] = [] if scheduler_outputs is not None: - for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups: + for idx, scheduled_seq_group in enumerate(scheduler_outputs.scheduled_seq_groups): seq_group = scheduled_seq_group.seq_group - # Iteration time + # Note: Here we assume the sequence groups are ordered prefill > decode. + is_prefill_group = idx < scheduler_outputs.num_prefill_groups + + # Last token time (None is still in prefill phase). # (n.b. 
updates seq_group.metrics.last_token_time) latency = seq_group.maybe_get_last_latency(now) # Number of tokens (for throughput calculations). - # Because of chunked prefill, we can have prefills and decodes - # in the same batch. - # If token_chunk_size > 1, this seq_group is a chunk of a - # prefill. - # If token_chunk_size = 1, this seq_group is a decode. - if (seq_group.is_prefill() - or scheduled_seq_group.token_chunk_size > 1): - num_prompt_tokens_iter += ( - scheduled_seq_group.token_chunk_size) + if is_prefill_group: + num_prompt_tokens_iter += scheduled_seq_group.token_chunk_size if latency is not None: time_to_first_tokens_iter.append(latency) else: - num_generation_tokens_iter += seq_group.num_unfinished_seqs( - ) - assert latency is not None + num_generation_tokens_iter += seq_group.num_unfinished_seqs() + if latency is None: + raise ValueError( + "seq_group.maybe_get_last_latency(now) returned None, " + "which should only happen in prefill phase but detected " + "that the current seq_group is in decode phase." + ) time_per_output_tokens_iter.append(latency) # Because of chunked prefill, we can have a single sequence @@ -692,16 +691,13 @@ def _get_stats( seq_group.metrics.arrival_time) # Metadata - num_prompt_tokens_requests.append( - len(seq_group.prompt_token_ids)) + num_prompt_tokens_requests.append(len(seq_group.prompt_token_ids)) num_generation_tokens_requests.extend( - seq.get_output_len() - for seq in seq_group.get_finished_seqs()) + [seq.get_output_len() for seq in seq_group.get_finished_seqs()]) best_of_requests.append(seq_group.sampling_params.best_of) n_requests.append(seq_group.sampling_params.n) finished_reason_requests.extend( - SequenceStatus.get_finished_reason(seq.status) - for seq in seq_group.get_finished_seqs()) + [SequenceStatus.get_finished_reason(seq.status) for seq in seq_group.get_finished_seqs()]) # Spec decode, if enabled, emits specialized metrics from the worker in # sampler output. diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 9aa8eacd7d51..5cf0c1e5aebe 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -12,7 +12,6 @@ from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, Response, StreamingResponse from prometheus_client import make_asgi_app -from prometheus_fastapi_instrumentator import Instrumentator import vllm from vllm.engine.arg_utils import AsyncEngineArgs @@ -48,12 +47,6 @@ async def _force_log(): app = fastapi.FastAPI(lifespan=lifespan) -# Instrument the app with HTTP metrics -Instrumentator().instrument(app) - -# Add prometheus ASGI middleware to route /metrics requests -metrics_app = make_asgi_app() -app.mount("/metrics", metrics_app) def parse_args(): @@ -61,6 +54,10 @@ def parse_args(): return parser.parse_args() +# Add prometheus asgi middleware to route /metrics requests +metrics_app = make_asgi_app() +app.mount("/metrics", metrics_app) + @app.exception_handler(RequestValidationError) async def validation_exception_handler(_, exc): err = openai_serving_chat.create_error_response(message=str(exc)) diff --git a/vllm/sequence.py b/vllm/sequence.py index 35dd43ab0ab0..a145b0a723b1 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -444,7 +444,7 @@ def lora_int_id(self) -> int: def maybe_get_last_latency(self, now: float) -> Optional[float]: """Sets the last token time for Request level timings.""" - # If prompt not done, return None. + # If still in prefill phase, return None. 
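With the Prometheus ASGI app mounted at /metrics in the api_server.py hunk above, the exposition can be spot-checked with a plain HTTP GET; a minimal sketch, where the host, the port and the use of the requests package are assumptions rather than anything prescribed by this patch:

    import requests

    # Host and port are assumed for a local deployment.
    text = requests.get("http://localhost:8000/metrics", timeout=5).text

    # Show only the vllm:request_success samples.
    for line in text.splitlines():
        if line.startswith("vllm:request_success"):
            print(line)
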
if self.is_prefill(): return None From d7f493be7e900b4fd5b5612074aabe1b12bb46ee Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 28 Apr 2024 20:07:27 +0000 Subject: [PATCH 39/40] make linter happy --- vllm/engine/llm_engine.py | 38 ++++++++++++++++----------- vllm/engine/metrics.py | 10 +++---- vllm/entrypoints/openai/api_server.py | 1 + 3 files changed, 29 insertions(+), 20 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index fa5c0c2a332d..4e8848e5bee6 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -22,7 +22,8 @@ from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams from vllm.sequence import (MultiModalData, SamplerOutput, Sequence, - SequenceGroup, SequenceGroupMetadata, SequenceStatus) + SequenceGroup, SequenceGroupMetadata, + SequenceStatus) from vllm.transformers_utils.detokenizer import Detokenizer from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, get_tokenizer_group) @@ -655,29 +656,31 @@ def _get_stats( finished_reason_requests: List[str] = [] if scheduler_outputs is not None: - for idx, scheduled_seq_group in enumerate(scheduler_outputs.scheduled_seq_groups): + for idx, scheduled_seq_group in enumerate( + scheduler_outputs.scheduled_seq_groups): seq_group = scheduled_seq_group.seq_group - # Note: Here we assume the sequence groups are ordered prefill > decode. + # Here we assume the seq_groups are ordered prefill > decode. is_prefill_group = idx < scheduler_outputs.num_prefill_groups - # Last token time (None is still in prefill phase). + # Last token time (None is still in prefill phase). # (n.b. updates seq_group.metrics.last_token_time) latency = seq_group.maybe_get_last_latency(now) # Number of tokens (for throughput calculations). if is_prefill_group: - num_prompt_tokens_iter += scheduled_seq_group.token_chunk_size + num_prompt_tokens_iter += ( + scheduled_seq_group.token_chunk_size) if latency is not None: time_to_first_tokens_iter.append(latency) else: - num_generation_tokens_iter += seq_group.num_unfinished_seqs() + num_generation_tokens_iter += seq_group.num_unfinished_seqs( + ) if latency is None: raise ValueError( - "seq_group.maybe_get_last_latency(now) returned None, " - "which should only happen in prefill phase but detected " - "that the current seq_group is in decode phase." - ) + "seq_group.maybe_get_last_latency(now)=None " + "which should only happen in prefill but detected " + "that the current seq_group is in decode phase.") time_per_output_tokens_iter.append(latency) # Because of chunked prefill, we can have a single sequence @@ -691,13 +694,18 @@ def _get_stats( seq_group.metrics.arrival_time) # Metadata - num_prompt_tokens_requests.append(len(seq_group.prompt_token_ids)) - num_generation_tokens_requests.extend( - [seq.get_output_len() for seq in seq_group.get_finished_seqs()]) + num_prompt_tokens_requests.append( + len(seq_group.prompt_token_ids)) + num_generation_tokens_requests.extend([ + seq.get_output_len() + for seq in seq_group.get_finished_seqs() + ]) best_of_requests.append(seq_group.sampling_params.best_of) n_requests.append(seq_group.sampling_params.n) - finished_reason_requests.extend( - [SequenceStatus.get_finished_reason(seq.status) for seq in seq_group.get_finished_seqs()]) + finished_reason_requests.extend([ + SequenceStatus.get_finished_reason(seq.status) + for seq in seq_group.get_finished_seqs() + ]) # Spec decode, if enabled, emits specialized metrics from the worker in # sampler output. 
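The loop in the hunk above relies on one ordering assumption, namely that scheduled_seq_groups lists every prefill group before any decode group, and then splits the work per group: the chunk size feeds the prompt-token count, a prefill that just completed contributes a TTFT sample, and a decode group contributes a TPOT sample. A condensed sketch of that classification with stand-in objects (not the engine's real types):

    from dataclasses import dataclass
    from typing import List, Tuple

    @dataclass
    class FakeGroup:
        token_chunk_size: int
        still_prefill: bool
        last_latency: float

    def classify(groups: List[FakeGroup],
                 num_prefill_groups: int) -> Tuple[int, List[float], List[float]]:
        num_prompt_tokens = 0
        ttfts: List[float] = []
        tpots: List[float] = []
        for idx, group in enumerate(groups):
            if idx < num_prefill_groups:      # scheduled as (a chunk of) a prefill
                num_prompt_tokens += group.token_chunk_size
                if not group.still_prefill:   # prefill finished -> first token out
                    ttfts.append(group.last_latency)
            else:                             # decode group -> inter-token latency
                tpots.append(group.last_latency)
        return num_prompt_tokens, ttfts, tpots

    # Two prefill chunks (the second finishing) followed by one decode group.
    print(classify([FakeGroup(512, True, 0.0),
                    FakeGroup(128, False, 0.09),
                    FakeGroup(1, False, 0.03)], num_prefill_groups=2))
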
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index c45937ee2c2f..45bfad03ec86 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -339,11 +339,11 @@ def log(self, stats: Stats) -> None: "CPU KV cache usage: %.1f%%", prompt_throughput, generation_throughput, - stats.num_running, - stats.num_swapped, - stats.num_waiting, - stats.gpu_cache_usage * 100, - stats.cpu_cache_usage * 100, + stats.num_running_sys, + stats.num_swapped_sys, + stats.num_waiting_sys, + stats.gpu_cache_usage_sys * 100, + stats.cpu_cache_usage_sys * 100, ) # Reset tracked stats for next interval. diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 5cf0c1e5aebe..af9ba7a3bc82 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -58,6 +58,7 @@ def parse_args(): metrics_app = make_asgi_app() app.mount("/metrics", metrics_app) + @app.exception_handler(RequestValidationError) async def validation_exception_handler(_, exc): err = openai_serving_chat.create_error_response(message=str(exc)) From 54bf260f3532a9e8e5f57df605c7571ee64c5757 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 28 Apr 2024 21:32:43 +0000 Subject: [PATCH 40/40] fixed issues with chunked prefill X metrics --- vllm/engine/llm_engine.py | 52 ++++++++++++++++++++++++++------------- vllm/sequence.py | 8 +++--- 2 files changed, 40 insertions(+), 20 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 4e8848e5bee6..835803fd4e75 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -655,32 +655,40 @@ def _get_stats( n_requests: List[int] = [] finished_reason_requests: List[str] = [] + # NOTE: This loop assumes prefill seq_groups are before + # decode seq_groups in scheduled_seq_groups. if scheduler_outputs is not None: + num_generation_tokens_from_prefill_groups = 0. + if scheduler_outputs.num_prefill_groups > 0 and len( + scheduler_outputs.scheduled_seq_groups + ) != scheduler_outputs.num_prefill_groups: + print("DETECTED CHUNKED") + for idx, scheduled_seq_group in enumerate( scheduler_outputs.scheduled_seq_groups): + group_was_prefill = idx < scheduler_outputs.num_prefill_groups seq_group = scheduled_seq_group.seq_group - # Here we assume the seq_groups are ordered prefill > decode. - is_prefill_group = idx < scheduler_outputs.num_prefill_groups - - # Last token time (None is still in prefill phase). - # (n.b. updates seq_group.metrics.last_token_time) - latency = seq_group.maybe_get_last_latency(now) - - # Number of tokens (for throughput calculations). - if is_prefill_group: + # NOTE: a seq_group that completed all of its prefill tokens + # in the last iteration will have seq_group.is_prefill() = False + # with group_was_prefill = True + if group_was_prefill: + # Number of prompt tokens. num_prompt_tokens_iter += ( scheduled_seq_group.token_chunk_size) - if latency is not None: + + # If the seq_group just finished the prefill state + # get TTFT. + if not seq_group.is_prefill(): + latency = seq_group.get_last_latency(now) time_to_first_tokens_iter.append(latency) + + # One generation token per finished prefill. + num_generation_tokens_from_prefill_groups += ( + seq_group.num_seqs()) else: - num_generation_tokens_iter += seq_group.num_unfinished_seqs( - ) - if latency is None: - raise ValueError( - "seq_group.maybe_get_last_latency(now)=None " - "which should only happen in prefill but detected " - "that the current seq_group is in decode phase.") + # TPOTs. 
+ latency = seq_group.get_last_latency(now) time_per_output_tokens_iter.append(latency) # Because of chunked prefill, we can have a single sequence @@ -707,6 +715,16 @@ def _get_stats( for seq in seq_group.get_finished_seqs() ]) + # Number of generation tokens. + # num_batched_tokens equals the number of prompt_tokens plus the + # number of decode_tokens in a single iteration. So, + # num_generation_tokens = num_batched_tokens - num_prompt_tokens + # + num_generation_tokens_from_prefill_groups (since we generate + # one token on prefills on iters where the prefill finishes). + num_generation_tokens_iter = ( + scheduler_outputs.num_batched_tokens - num_prompt_tokens_iter + + num_generation_tokens_from_prefill_groups) + # Spec decode, if enabled, emits specialized metrics from the worker in # sampler output. if model_output and (model_output[0].spec_decode_worker_metrics diff --git a/vllm/sequence.py b/vllm/sequence.py index a145b0a723b1..0e931ebbb657 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -442,11 +442,13 @@ def prompt_token_ids(self) -> List[int]: def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 - def maybe_get_last_latency(self, now: float) -> Optional[float]: + def get_last_latency(self, now: float) -> Optional[float]: """Sets the last token time for Request level timings.""" - # If still in prefill phase, return None. + # If still in prefill phase, raise Error. if self.is_prefill(): - return None + raise ValueError( + "seq_group.get_last_latency() should not be called " + "if the seq_group is in prefill phase.") # Otherwise return token latency. latency = now - self.metrics.last_token_time