From 5ddba6aa8c0faf427d5a6d618accba9099353e26 Mon Sep 17 00:00:00 2001 From: harrywu <904714159@qq.com> Date: Tue, 30 Apr 2024 01:45:59 +0000 Subject: [PATCH 01/16] add max_num_generation_tokens_requests --- vllm/engine/llm_engine.py | 26 ++++++++++++++++++++++++++ vllm/engine/metrics.py | 8 ++++++++ vllm/engine/metrics_types.py | 5 +++++ 3 files changed, 39 insertions(+) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 94271c4a9315..ddb006fc307c 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1808,11 +1808,16 @@ def _get_stats(self, # Request stats # Latency time_e2e_requests: List[float] = [] + time_queue_requests: List[float] = [] + time_inference_requests: List[float] = [] + time_prefill_requests: List[float] = [] + time_decode_requests: List[float] = [] # Metadata num_prompt_tokens_requests: List[int] = [] num_generation_tokens_requests: List[int] = [] best_of_requests: List[int] = [] n_requests: List[int] = [] + max_num_generation_tokens_requests = [] finished_reason_requests: List[str] = [] # NOTE: This loop assumes prefill seq_groups are before @@ -1869,6 +1874,19 @@ def _get_stats(self, # Latency timings time_e2e_requests.append(now - seq_group.metrics.arrival_time) + time_queue_requests.append( + seq_group.metrics.first_scheduled_time - + seq_group.metrics.arrival_time) + time_prefill_requests.append( + seq_group.metrics.first_token_time - + seq_group.metrics.first_scheduled_time + ) + time_decode_requests.append( + now - + seq_group.metrics.first_token_time + ) + time_inference_requests.append( + now - seq_group.metrics.first_scheduled_time) # Metadata num_prompt_tokens_requests.append( len(seq_group.prompt_token_ids)) @@ -1876,6 +1894,9 @@ def _get_stats(self, seq.get_output_len() for seq in seq_group.get_finished_seqs() ]) + max_num_generation_tokens_requests.append( + max(seq.get_output_len() for seq in seq_group.get_seqs()) + ) if seq_group.sampling_params is not None: best_of_requests.append( seq_group.sampling_params.best_of) @@ -1928,11 +1949,16 @@ def _get_stats(self, # Request stats # Latency time_e2e_requests=time_e2e_requests, + time_queue_requests=time_queue_requests, + time_inference_requests=time_inference_requests, + time_prefill_requests=time_prefill_requests, + time_decode_requests=time_decode_requests, # Metadata num_prompt_tokens_requests=num_prompt_tokens_requests, num_generation_tokens_requests=num_generation_tokens_requests, best_of_requests=best_of_requests, n_requests=n_requests, + max_num_generation_tokens_requests=max_num_generation_tokens_requests, finished_reason_requests=finished_reason_requests, ) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 74277cae7c8e..7c9b74ee78f1 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -140,6 +140,12 @@ def __init__(self, labelnames: List[str], max_model_len: int): labelnames=labelnames, buckets=[1, 2, 5, 10, 20], ) + self.histogram_max_num_generation_tokens_request = self._histogram_cls( + name="vllm:request_max_num_generation_tokens", + documentation= + "Histogram of maximum number of requested generation tokens.", + labelnames=labelnames, + buckets=build_1_2_5_buckets(max_model_len)) self.histogram_n_request = self._histogram_cls( name="vllm:request_params_n", documentation="Histogram of the n request parameter.", @@ -472,6 +478,8 @@ def _log_prometheus(self, stats: Stats) -> None: self._log_histogram( self.metrics.histogram_num_generation_tokens_request, stats.num_generation_tokens_requests) + 
self._log_histogram(self.metrics.histogram_max_num_generation_tokens_request, + stats.max_num_generation_tokens_requests) self._log_histogram(self.metrics.histogram_n_request, stats.n_requests) self._log_histogram(self.metrics.histogram_best_of_request, stats.best_of_requests) diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index 1eccb2359340..b6e08c6d711c 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -46,11 +46,16 @@ class Stats: # Request stats (should have _requests suffix) # Latency time_e2e_requests: List[float] + time_queue_requests: List[float] + time_inference_requests: List[float] + time_prefill_requests: List[float] + time_decode_requests: List[float] # Metadata num_prompt_tokens_requests: List[int] num_generation_tokens_requests: List[int] best_of_requests: List[int] n_requests: List[int] + max_num_generation_tokens_requests: List[int] finished_reason_requests: List[str] spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None From 29c14394d3a71ea7c9a4276838ee03e31ebd95d6 Mon Sep 17 00:00:00 2001 From: harrywu <904714159@qq.com> Date: Tue, 30 Apr 2024 02:03:40 +0000 Subject: [PATCH 02/16] format --- vllm/engine/llm_engine.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index ddb006fc307c..42e92cb8a2b5 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1895,7 +1895,10 @@ def _get_stats(self, for seq in seq_group.get_finished_seqs() ]) max_num_generation_tokens_requests.append( - max(seq.get_output_len() for seq in seq_group.get_seqs()) + max( + seq.get_output_len() + for seq in seq_group.get_seqs() + ) ) if seq_group.sampling_params is not None: best_of_requests.append( From 82759e27546dd5537c7f170e1d160a6d1dcaa82f Mon Sep 17 00:00:00 2001 From: harrywu <904714159@qq.com> Date: Wed, 1 May 2024 09:49:11 +0000 Subject: [PATCH 03/16] update --- vllm/engine/metrics.py | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 7c9b74ee78f1..41f4446848e9 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -120,6 +120,30 @@ def __init__(self, labelnames: List[str], max_model_len: int): documentation="Histogram of end to end request latency in seconds.", labelnames=labelnames, buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0]) + self.histogram_queue_time_request = Histogram( + name="vllm:request_queue_time_seconds", + documentation= + "Histogram of time spent in WAITING phase for request.", + labelnames=labelnames, + buckets=[0.1, 1.0, 2.5, 5.0, 10.0, 20.0, 50.0, 100]) + self.histogram_inference_time_request = Histogram( + name="vllm:request_inference_time_seconds", + documentation= + "Histogram of time spent in RUNNING phase for request.", + labelnames=labelnames, + buckets=[0.1, 1.0, 2.5, 5.0, 10.0, 20.0, 50.0, 100]) + self.histogram_prefill_time_request = Histogram( + name="vllm:request_prefill_time_seconds", + documentation= + "Histogram of time spent in PREFILL phase for request.", + labelnames=labelnames, + buckets=[0.1, 1.0, 2.5, 5.0, 10.0, 20.0, 50.0, 100]) + self.histogram_decode_time_request = Histogram( + name="vllm:request_decode_time_seconds", + documentation= + "Histogram of time spent in DECODE phase for request.", + labelnames=labelnames, + buckets=[0.1, 1.0, 2.5, 5.0, 10.0, 20.0, 50.0, 100]) # Metadata self.histogram_num_prompt_tokens_request = self._histogram_cls( 
name="vllm:request_prompt_tokens", @@ -467,6 +491,14 @@ def _log_prometheus(self, stats: Stats) -> None: # Latency self._log_histogram(self.metrics.histogram_e2e_time_request, stats.time_e2e_requests) + self._log_histogram(self.metrics.histogram_queue_time_request, + stats.time_queue_requests) + self._log_histogram(self.metrics.histogram_inference_time_request, + stats.time_inference_requests) + self._log_histogram(self.metrics.histogram_decode_time_request, + stats.time_prefill_requests) + self._log_histogram(self.metrics.histogram_prefill_time_request, + stats.time_decode_requests) # Metadata finished_reason_counter = CollectionsCounter( stats.finished_reason_requests) @@ -478,8 +510,9 @@ def _log_prometheus(self, stats: Stats) -> None: self._log_histogram( self.metrics.histogram_num_generation_tokens_request, stats.num_generation_tokens_requests) - self._log_histogram(self.metrics.histogram_max_num_generation_tokens_request, - stats.max_num_generation_tokens_requests) + self._log_histogram( + self.metrics.histogram_max_num_generation_tokens_request, + stats.max_num_generation_tokens_requests) self._log_histogram(self.metrics.histogram_n_request, stats.n_requests) self._log_histogram(self.metrics.histogram_best_of_request, stats.best_of_requests) From b0017c66d081e0c3267a3d6f7120599988c78372 Mon Sep 17 00:00:00 2001 From: harrywu <904714159@qq.com> Date: Wed, 1 May 2024 10:42:18 +0000 Subject: [PATCH 04/16] format --- vllm/engine/llm_engine.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 42e92cb8a2b5..fa31bc78e506 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1878,13 +1878,10 @@ def _get_stats(self, seq_group.metrics.first_scheduled_time - seq_group.metrics.arrival_time) time_prefill_requests.append( - seq_group.metrics.first_token_time - - seq_group.metrics.first_scheduled_time - ) + seq_group.metrics.first_token_time - + seq_group.metrics.first_scheduled_time) time_decode_requests.append( - now - - seq_group.metrics.first_token_time - ) + now - seq_group.metrics.first_token_time) time_inference_requests.append( now - seq_group.metrics.first_scheduled_time) # Metadata @@ -1961,7 +1958,8 @@ def _get_stats(self, num_generation_tokens_requests=num_generation_tokens_requests, best_of_requests=best_of_requests, n_requests=n_requests, - max_num_generation_tokens_requests=max_num_generation_tokens_requests, + max_num_generation_tokens_requests= + max_num_generation_tokens_requests, finished_reason_requests=finished_reason_requests, ) From db3fc2a1aae2fa20175f3599a574fa77e915c5cd Mon Sep 17 00:00:00 2001 From: harrywu <904714159@qq.com> Date: Wed, 1 May 2024 14:39:22 +0000 Subject: [PATCH 05/16] num_tokens_iter = num_generation_tokens_iter+num_prompt_tokens_iter --- vllm/engine/llm_engine.py | 5 ++++- vllm/engine/metrics.py | 5 +++++ vllm/engine/metrics_types.py | 1 + 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index fa31bc78e506..5952e9624a7e 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1800,6 +1800,7 @@ def _get_stats(self, # Iteration stats num_prompt_tokens_iter = 0 num_generation_tokens_iter = 0 + num_tokens_iter = 0 time_to_first_tokens_iter: List[float] = [] time_per_output_tokens_iter: List[float] = [] num_preemption_iter = (0 if scheduler_outputs is None else @@ -1915,7 +1916,8 @@ def _get_stats(self, num_generation_tokens_iter = ( actual_num_batched_tokens - 
num_prompt_tokens_iter + num_generation_tokens_from_prefill_groups) - + num_tokens_iter = (num_generation_tokens_iter + + num_prompt_tokens_iter) # Spec decode, if enabled, emits specialized metrics from the worker in # sampler output. if model_output and (model_output[0].spec_decode_worker_metrics @@ -1941,6 +1943,7 @@ def _get_stats(self, # Iteration stats num_prompt_tokens_iter=num_prompt_tokens_iter, num_generation_tokens_iter=num_generation_tokens_iter, + num_tokens_iter=num_tokens_iter, time_to_first_tokens_iter=time_to_first_tokens_iter, time_per_output_tokens_iter=time_per_output_tokens_iter, spec_decode_metrics=spec_decode_metrics, diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 41f4446848e9..43845a500d14 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -96,6 +96,10 @@ def __init__(self, labelnames: List[str], max_model_len: int): name="vllm:generation_tokens_total", documentation="Number of generation tokens processed.", labelnames=labelnames) + self.counter_tokens = self._counter_cls( + name="vllm:tokens_total", + documentation="Number of prefill plus generation tokens processed.", + labelnames=labelnames) self.histogram_time_to_first_token = self._histogram_cls( name="vllm:time_to_first_token_seconds", documentation="Histogram of time to first token in seconds.", @@ -482,6 +486,7 @@ def _log_prometheus(self, stats: Stats) -> None: stats.num_prompt_tokens_iter) self._log_counter(self.metrics.counter_generation_tokens, stats.num_generation_tokens_iter) + self._log_counter(self.metrics.counter_tokens, stats.num_tokens_iter) self._log_histogram(self.metrics.histogram_time_to_first_token, stats.time_to_first_tokens_iter) self._log_histogram(self.metrics.histogram_time_per_output_token, diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index b6e08c6d711c..2468afd5a2d1 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -39,6 +39,7 @@ class Stats: # Iteration stats (should have _iter suffix) num_prompt_tokens_iter: int num_generation_tokens_iter: int + num_tokens_iter: int time_to_first_tokens_iter: List[float] time_per_output_tokens_iter: List[float] num_preemption_iter: int From 706dbe1dc84eb96443627c0ac36b529e6c66d3c7 Mon Sep 17 00:00:00 2001 From: harrywu <904714159@qq.com> Date: Thu, 2 May 2024 02:45:28 +0000 Subject: [PATCH 06/16] dashboard update --- examples/production_monitoring/grafana.json | 343 ++++++++++++++------ 1 file changed, 240 insertions(+), 103 deletions(-) diff --git a/examples/production_monitoring/grafana.json b/examples/production_monitoring/grafana.json index d1389f5392c8..d6b041048455 100644 --- a/examples/production_monitoring/grafana.json +++ b/examples/production_monitoring/grafana.json @@ -58,12 +58,25 @@ "links": [], "liveNow": false, "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 14, + "panels": [], + "title": "System stats", + "type": "row" + }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "description": "End to end request latency measured in seconds.", + "description": "Number of requests in RUNNING, WAITING, and SWAPPED state", "fieldConfig": { "defaults": { "color": { @@ -115,17 +128,17 @@ } ] }, - "unit": "s" + "unit": "none" }, "overrides": [] }, "gridPos": { - "h": 8, + "h": 9, "w": 12, "x": 0, - "y": 0 + "y": 1 }, - "id": 9, + "id": 3, "options": { "legend": { "calcs": [], @@ -145,12 +158,12 @@ "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, - "editorMode": 
"builder", - "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "editorMode": "code", + "expr": "vllm:num_requests_running{model_name=\"$model_name\"}", "fullMetaSearch": false, - "includeNullMetadata": false, + "includeNullMetadata": true, "instant": false, - "legendFormat": "P99", + "legendFormat": "Num Running", "range": true, "refId": "A", "useBackend": false @@ -162,12 +175,12 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "expr": "vllm:num_requests_swapped{model_name=\"$model_name\"}", "fullMetaSearch": false, "hide": false, - "includeNullMetadata": false, + "includeNullMetadata": true, "instant": false, - "legendFormat": "P95", + "legendFormat": "Num Swapped", "range": true, "refId": "B", "useBackend": false @@ -179,12 +192,12 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "expr": "vllm:num_requests_waiting{model_name=\"$model_name\"}", "fullMetaSearch": false, "hide": false, - "includeNullMetadata": false, + "includeNullMetadata": true, "instant": false, - "legendFormat": "P90", + "legendFormat": "Num Waiting", "range": true, "refId": "C", "useBackend": false @@ -220,7 +233,7 @@ "refId": "E" } ], - "title": "E2E Request Latency", + "title": "Scheduler State", "type": "timeseries" }, { @@ -228,7 +241,7 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "description": "Number of tokens processed per second", + "description": "Percentage of used cache blocks by vLLM.", "fieldConfig": { "defaults": { "color": { @@ -279,17 +292,18 @@ "value": 80 } ] - } + }, + "unit": "percentunit" }, "overrides": [] }, "gridPos": { - "h": 8, + "h": 9, "w": 12, "x": 12, - "y": 0 + "y": 1 }, - "id": 8, + "id": 4, "options": { "legend": { "calcs": [], @@ -308,44 +322,49 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "rate(vllm:prompt_tokens_total{model_name=\"$model_name\"}[$__rate_interval])", - "fullMetaSearch": false, - "includeNullMetadata": false, + "editorMode": "code", + "expr": "vllm:gpu_cache_usage_perc{model_name=\"$model_name\"}", "instant": false, - "legendFormat": "Prompt Tokens/Sec", + "legendFormat": "GPU Cache Usage", "range": true, - "refId": "A", - "useBackend": false + "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "rate(vllm:generation_tokens_total{model_name=\"$model_name\"}[$__rate_interval])", - "fullMetaSearch": false, + "editorMode": "code", + "expr": "vllm:cpu_cache_usage_perc{model_name=\"$model_name\"}", "hide": false, - "includeNullMetadata": false, "instant": false, - "legendFormat": "Generation Tokens/Sec", + "legendFormat": "CPU Cache Usage", "range": true, - "refId": "B", - "useBackend": false + "refId": "B" } ], - "title": "Token Throughput", + "title": "Cache Utilization", "type": "timeseries" }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 15, + "panels": [], + "title": "Iteration stats", + "type": "row" + }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "description": "Inter token latency in seconds.", + "description": 
"P50, P90, P95, and P99 TTFT latency in seconds.", "fieldConfig": { "defaults": { "color": { @@ -405,9 +424,9 @@ "h": 8, "w": 12, "x": 0, - "y": 8 + "y": 11 }, - "id": 10, + "id": 5, "options": { "legend": { "calcs": [], @@ -428,8 +447,9 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, + "hide": false, "includeNullMetadata": false, "instant": false, "legendFormat": "P99", @@ -444,9 +464,8 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, - "hide": false, "includeNullMetadata": false, "instant": false, "legendFormat": "P95", @@ -461,7 +480,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -478,7 +497,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -494,15 +513,15 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "rate(vllm:time_per_output_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])", + "expr": "rate(vllm:time_to_first_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_to_first_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])", "hide": false, "instant": false, - "legendFormat": "Mean", + "legendFormat": "Average", "range": true, "refId": "E" } ], - "title": "Time Per Output Token Latency", + "title": "Time To First Token Latency", "type": "timeseries" }, { @@ -510,7 +529,7 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "description": "Number of requests in RUNNING, WAITING, and SWAPPED state", + "description": "Number of tokens processed per second", "fieldConfig": { "defaults": { "color": { @@ -561,8 +580,7 @@ "value": 80 } ] - }, - "unit": "none" + } }, "overrides": [] }, @@ -570,9 +588,9 @@ "h": 8, "w": 12, "x": 12, - "y": 8 + "y": 11 }, - "id": 3, + "id": 8, "options": { "legend": { "calcs": [], @@ -592,12 +610,12 @@ "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, - "editorMode": "builder", - "expr": "vllm:num_requests_running{model_name=\"$model_name\"}", + "editorMode": "code", + "expr": "rate(vllm:prompt_tokens_total{model_name=\"$model_name\"}[$__rate_interval])", "fullMetaSearch": false, - "includeNullMetadata": true, + 
"includeNullMetadata": false, "instant": false, - "legendFormat": "Num Running", + "legendFormat": "Prompt Tokens/Sec", "range": true, "refId": "A", "useBackend": false @@ -609,12 +627,12 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "vllm:num_requests_swapped{model_name=\"$model_name\"}", + "expr": "rate(vllm:generation_tokens_total{model_name=\"$model_name\"}[$__rate_interval])", "fullMetaSearch": false, "hide": false, - "includeNullMetadata": true, + "includeNullMetadata": false, "instant": false, - "legendFormat": "Num Swapped", + "legendFormat": "Generation Tokens/Sec", "range": true, "refId": "B", "useBackend": false @@ -624,20 +642,16 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "vllm:num_requests_waiting{model_name=\"$model_name\"}", - "fullMetaSearch": false, + "editorMode": "code", + "expr": "rate(vllm:tokens_total{model_name=\"$model_name\"}[$__rate_interval])", "hide": false, - "includeNullMetadata": true, "instant": false, - "legendFormat": "Num Waiting", + "legendFormat": "Total Tokens/Sec", "range": true, - "refId": "C", - "useBackend": false + "refId": "C" } ], - "title": "Scheduler State", + "title": "Token Throughput", "type": "timeseries" }, { @@ -645,7 +659,7 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "description": "P50, P90, P95, and P99 TTFT latency in seconds.", + "description": "Inter token latency in seconds.", "fieldConfig": { "defaults": { "color": { @@ -705,9 +719,9 @@ "h": 8, "w": 12, "x": 0, - "y": 16 + "y": 19 }, - "id": 5, + "id": 10, "options": { "legend": { "calcs": [], @@ -728,9 +742,8 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, - "hide": false, "includeNullMetadata": false, "instant": false, "legendFormat": "P99", @@ -745,8 +758,9 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, + "hide": false, "includeNullMetadata": false, "instant": false, "legendFormat": "P95", @@ -761,7 +775,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -778,7 +792,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -794,23 +808,36 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": 
"rate(vllm:time_to_first_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_to_first_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])", + "expr": "rate(vllm:time_per_output_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])", "hide": false, "instant": false, - "legendFormat": "Average", + "legendFormat": "Mean", "range": true, "refId": "E" } ], - "title": "Time To First Token Latency", + "title": "Time Per Output Token Latency", "type": "timeseries" }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 27 + }, + "id": 16, + "panels": [], + "title": "Request status", + "type": "row" + }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "description": "Percentage of used cache blocks by vLLM.", + "description": "End to end request latency measured in seconds.", "fieldConfig": { "defaults": { "color": { @@ -853,8 +880,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -862,17 +888,17 @@ } ] }, - "unit": "percentunit" + "unit": "s" }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 16 + "x": 0, + "y": 28 }, - "id": 4, + "id": 9, "options": { "legend": { "calcs": [], @@ -891,23 +917,30 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "disableTextWrap": false, "editorMode": "code", - "expr": "vllm:gpu_cache_usage_perc{model_name=\"$model_name\"}", + "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": false, "instant": false, - "legendFormat": "GPU Cache Usage", + "legendFormat": "P99", "range": true, - "refId": "A" + "refId": "A", + "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "editorMode": "code", - "expr": "vllm:cpu_cache_usage_perc{model_name=\"$model_name\"}", + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, "hide": false, + "includeNullMetadata": false, "instant": false, - "legendFormat": "CPU Cache Usage", + "legendFormat": "P95", "range": true, "refId": "B" } @@ -1147,8 +1180,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1163,9 +1195,9 @@ "h": 8, "w": 12, "x": 0, - "y": 32 + "y": 60 }, - "id": 11, + "id": 24, "options": { "legend": { "calcs": [], @@ -1185,20 +1217,125 @@ "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum by(finished_reason) (increase(vllm:request_success_total{model_name=\"$model_name\"}[$__rate_interval]))", + "editorMode": "code", + "expr": "(increase(vllm:request_params_n_sum{model_name=\"$model_name\"}[$__rate_interval]))", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "interval": "", - "legendFormat": "__auto", + "legendFormat": "params_n", "range": true, "refId": "A", "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddkg692mcrzeob" + }, + "editorMode": "code", + "expr": "(increase(vllm:request_params_best_of_sum{model_name=\"$model_name\"}[$__rate_interval]))", + "hide": false, + "instant": false, + "legendFormat": "best_of", + 
"range": true, + "refId": "B" } ], "title": "Finish Reason", "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddkg692mcrzeob" + }, + "description": "Heatmap of num of request max generation tokens", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 60 + }, + "id": 26, + "options": { + "calculate": false, + "cellGap": 1, + "cellValues": { + "unit": "none" + }, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "min": 0, + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Spectral", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto", + "value": "Request count" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisLabel": "Generation Length", + "axisPlacement": "left", + "reverse": false, + "unit": "none" + } + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum by(le) (increase(vllm:request_max_num_generation_tokens_bucket{model_name=\"$model_name\"}[$__rate_interval]))", + "format": "heatmap", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{le}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Request Max Generation Tokens", + "type": "heatmap" } ], "refresh": "", @@ -1242,8 +1379,8 @@ ] }, "time": { - "from": "now-5m", - "to": "now" + "from": "2024-05-02T01:45:00.000Z", + "to": "2024-05-02T01:55:00.000Z" }, "timepicker": {}, "timezone": "", From 6979673cea01d83c1cbbcb0bafafe6fa05f47867 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 10 May 2024 21:49:29 +0000 Subject: [PATCH 07/16] updated grafana --- examples/production_monitoring/grafana.json | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/production_monitoring/grafana.json b/examples/production_monitoring/grafana.json index d6b041048455..8cdd678cc4e5 100644 --- a/examples/production_monitoring/grafana.json +++ b/examples/production_monitoring/grafana.json @@ -1389,4 +1389,3 @@ "version": 1, "weekStart": "" } - From 890a7e5762d2ac20580a74f4521df17f48711902 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 10 May 2024 21:50:25 +0000 Subject: [PATCH 08/16] updated buckets to be consistent --- vllm/engine/metrics.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 43845a500d14..080a7b3044f4 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -119,35 +119,36 @@ def __init__(self, labelnames: List[str], max_model_len: int): # Request stats # Latency + request_latency_buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0] self.histogram_e2e_time_request = self._histogram_cls( name="vllm:e2e_request_latency_seconds", documentation="Histogram of end to end request latency in seconds.", labelnames=labelnames, - buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0]) - self.histogram_queue_time_request = Histogram( + buckets=request_latency_buckets) + self.histogram_queue_time_request = self._histogram_cls( 
name="vllm:request_queue_time_seconds", documentation= "Histogram of time spent in WAITING phase for request.", labelnames=labelnames, - buckets=[0.1, 1.0, 2.5, 5.0, 10.0, 20.0, 50.0, 100]) - self.histogram_inference_time_request = Histogram( + buckets=request_latency_buckets) + self.histogram_inference_time_request = self._histogram_cls( name="vllm:request_inference_time_seconds", documentation= "Histogram of time spent in RUNNING phase for request.", labelnames=labelnames, - buckets=[0.1, 1.0, 2.5, 5.0, 10.0, 20.0, 50.0, 100]) - self.histogram_prefill_time_request = Histogram( + buckets=request_latency_buckets) + self.histogram_prefill_time_request = self._histogram_cls( name="vllm:request_prefill_time_seconds", documentation= "Histogram of time spent in PREFILL phase for request.", labelnames=labelnames, - buckets=[0.1, 1.0, 2.5, 5.0, 10.0, 20.0, 50.0, 100]) - self.histogram_decode_time_request = Histogram( + buckets=request_latency_buckets) + self.histogram_decode_time_request = self._histogram_cls( name="vllm:request_decode_time_seconds", documentation= "Histogram of time spent in DECODE phase for request.", labelnames=labelnames, - buckets=[0.1, 1.0, 2.5, 5.0, 10.0, 20.0, 50.0, 100]) + buckets=request_latency_buckets) # Metadata self.histogram_num_prompt_tokens_request = self._histogram_cls( name="vllm:request_prompt_tokens", From d8911cdb74bb59ab1ac85e15a514ecbcf556e0cc Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 10 May 2024 21:53:20 +0000 Subject: [PATCH 09/16] format --- vllm/engine/metrics.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 080a7b3044f4..eec3a3da3d6d 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -119,7 +119,9 @@ def __init__(self, labelnames: List[str], max_model_len: int): # Request stats # Latency - request_latency_buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0] + request_latency_buckets = [ + 1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0 + ] self.histogram_e2e_time_request = self._histogram_cls( name="vllm:e2e_request_latency_seconds", documentation="Histogram of end to end request latency in seconds.", From 153b2ecacc910af0d0649b219d05dcff3ba42260 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 10 May 2024 22:24:03 +0000 Subject: [PATCH 10/16] updated iteration num tokens to be a histogram --- vllm/engine/metrics.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index eec3a3da3d6d..ab6b9325cb52 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -100,6 +100,11 @@ def __init__(self, labelnames: List[str], max_model_len: int): name="vllm:tokens_total", documentation="Number of prefill plus generation tokens processed.", labelnames=labelnames) + self.histogram_iteration_tokens = self._histogram_cls( + name="vllm:iteration_tokens_total", + documentation="Histogram of number of tokens per engine_step.", + labelnames=labelnames, + buckets=[1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096]) self.histogram_time_to_first_token = self._histogram_cls( name="vllm:time_to_first_token_seconds", documentation="Histogram of time to first token in seconds.", @@ -489,7 +494,8 @@ def _log_prometheus(self, stats: Stats) -> None: stats.num_prompt_tokens_iter) self._log_counter(self.metrics.counter_generation_tokens, stats.num_generation_tokens_iter) - self._log_counter(self.metrics.counter_tokens, stats.num_tokens_iter) + 
self._log_histogram(self.metrics.histogram_iteration_tokens, + [stats.num_tokens_iter]) self._log_histogram(self.metrics.histogram_time_to_first_token, stats.time_to_first_tokens_iter) self._log_histogram(self.metrics.histogram_time_per_output_token, From 36eb9ac46842156a1ff8d4968c0db8199c96bf66 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Mon, 27 May 2024 22:41:06 +0000 Subject: [PATCH 11/16] removed max_num_requested_tokens --- vllm/engine/llm_engine.py | 4 +--- vllm/engine/metrics.py | 3 --- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 5952e9624a7e..22e29ec6a8d2 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1818,7 +1818,7 @@ def _get_stats(self, num_generation_tokens_requests: List[int] = [] best_of_requests: List[int] = [] n_requests: List[int] = [] - max_num_generation_tokens_requests = [] + max_num_generation_tokens_requests: List[int] = [] finished_reason_requests: List[str] = [] # NOTE: This loop assumes prefill seq_groups are before @@ -1961,8 +1961,6 @@ def _get_stats(self, num_generation_tokens_requests=num_generation_tokens_requests, best_of_requests=best_of_requests, n_requests=n_requests, - max_num_generation_tokens_requests= - max_num_generation_tokens_requests, finished_reason_requests=finished_reason_requests, ) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index ab6b9325cb52..ec6609123a63 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -524,9 +524,6 @@ def _log_prometheus(self, stats: Stats) -> None: self._log_histogram( self.metrics.histogram_num_generation_tokens_request, stats.num_generation_tokens_requests) - self._log_histogram( - self.metrics.histogram_max_num_generation_tokens_request, - stats.max_num_generation_tokens_requests) self._log_histogram(self.metrics.histogram_n_request, stats.n_requests) self._log_histogram(self.metrics.histogram_best_of_request, stats.best_of_requests) From 3937c1f56a1cf4f6e68c28587c14d27d52999e10 Mon Sep 17 00:00:00 2001 From: harrywu <904714159@qq.com> Date: Sun, 8 Sep 2024 17:10:30 +0800 Subject: [PATCH 12/16] log max_num_generation_tokens --- examples/production_monitoring/grafana.json | 343 ++++++-------------- vllm/engine/llm_engine.py | 9 +- vllm/engine/metrics.py | 3 + 3 files changed, 110 insertions(+), 245 deletions(-) diff --git a/examples/production_monitoring/grafana.json b/examples/production_monitoring/grafana.json index 8cdd678cc4e5..1af710115730 100644 --- a/examples/production_monitoring/grafana.json +++ b/examples/production_monitoring/grafana.json @@ -58,25 +58,12 @@ "links": [], "liveNow": false, "panels": [ - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 14, - "panels": [], - "title": "System stats", - "type": "row" - }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "description": "Number of requests in RUNNING, WAITING, and SWAPPED state", + "description": "End to end request latency measured in seconds.", "fieldConfig": { "defaults": { "color": { @@ -128,17 +115,17 @@ } ] }, - "unit": "none" + "unit": "s" }, "overrides": [] }, "gridPos": { - "h": 9, + "h": 8, "w": 12, "x": 0, - "y": 1 + "y": 0 }, - "id": 3, + "id": 9, "options": { "legend": { "calcs": [], @@ -158,12 +145,12 @@ "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, - "editorMode": "code", - "expr": "vllm:num_requests_running{model_name=\"$model_name\"}", + "editorMode": "builder", + "expr": "histogram_quantile(0.99, sum by(le) 
(rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, - "includeNullMetadata": true, + "includeNullMetadata": false, "instant": false, - "legendFormat": "Num Running", + "legendFormat": "P99", "range": true, "refId": "A", "useBackend": false @@ -175,12 +162,12 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "vllm:num_requests_swapped{model_name=\"$model_name\"}", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, - "includeNullMetadata": true, + "includeNullMetadata": false, "instant": false, - "legendFormat": "Num Swapped", + "legendFormat": "P95", "range": true, "refId": "B", "useBackend": false @@ -192,12 +179,12 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "vllm:num_requests_waiting{model_name=\"$model_name\"}", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, - "includeNullMetadata": true, + "includeNullMetadata": false, "instant": false, - "legendFormat": "Num Waiting", + "legendFormat": "P90", "range": true, "refId": "C", "useBackend": false @@ -233,7 +220,7 @@ "refId": "E" } ], - "title": "Scheduler State", + "title": "E2E Request Latency", "type": "timeseries" }, { @@ -241,7 +228,7 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "description": "Percentage of used cache blocks by vLLM.", + "description": "Number of tokens processed per second", "fieldConfig": { "defaults": { "color": { @@ -292,18 +279,17 @@ "value": 80 } ] - }, - "unit": "percentunit" + } }, "overrides": [] }, "gridPos": { - "h": 9, + "h": 8, "w": 12, "x": 12, - "y": 1 + "y": 0 }, - "id": 4, + "id": 8, "options": { "legend": { "calcs": [], @@ -322,49 +308,44 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "editorMode": "code", - "expr": "vllm:gpu_cache_usage_perc{model_name=\"$model_name\"}", + "disableTextWrap": false, + "editorMode": "builder", + "expr": "rate(vllm:prompt_tokens_total{model_name=\"$model_name\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": false, "instant": false, - "legendFormat": "GPU Cache Usage", + "legendFormat": "Prompt Tokens/Sec", "range": true, - "refId": "A" + "refId": "A", + "useBackend": false }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "editorMode": "code", - "expr": "vllm:cpu_cache_usage_perc{model_name=\"$model_name\"}", + "disableTextWrap": false, + "editorMode": "builder", + "expr": "rate(vllm:generation_tokens_total{model_name=\"$model_name\"}[$__rate_interval])", + "fullMetaSearch": false, "hide": false, + "includeNullMetadata": false, "instant": false, - "legendFormat": "CPU Cache Usage", + "legendFormat": "Generation Tokens/Sec", "range": true, - "refId": "B" + "refId": "B", + "useBackend": false } ], - "title": "Cache Utilization", + "title": "Token Throughput", "type": "timeseries" }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 10 - }, - "id": 15, - "panels": [], - "title": "Iteration stats", - "type": "row" - }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "description": "P50, P90, P95, and P99 TTFT latency in seconds.", + "description": "Inter token latency in seconds.", "fieldConfig": { "defaults": { "color": { @@ -424,9 +405,9 @@ "h": 8, "w": 12, "x": 0, - "y": 11 + 
"y": 8 }, - "id": 5, + "id": 10, "options": { "legend": { "calcs": [], @@ -447,9 +428,8 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, - "hide": false, "includeNullMetadata": false, "instant": false, "legendFormat": "P99", @@ -464,8 +444,9 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, + "hide": false, "includeNullMetadata": false, "instant": false, "legendFormat": "P95", @@ -480,7 +461,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -497,7 +478,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -513,15 +494,15 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "rate(vllm:time_to_first_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_to_first_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])", + "expr": "rate(vllm:time_per_output_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])", "hide": false, "instant": false, - "legendFormat": "Average", + "legendFormat": "Mean", "range": true, "refId": "E" } ], - "title": "Time To First Token Latency", + "title": "Time Per Output Token Latency", "type": "timeseries" }, { @@ -529,7 +510,7 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "description": "Number of tokens processed per second", + "description": "Number of requests in RUNNING, WAITING, and SWAPPED state", "fieldConfig": { "defaults": { "color": { @@ -580,7 +561,8 @@ "value": 80 } ] - } + }, + "unit": "none" }, "overrides": [] }, @@ -588,9 +570,9 @@ "h": 8, "w": 12, "x": 12, - "y": 11 + "y": 8 }, - "id": 8, + "id": 3, "options": { "legend": { "calcs": [], @@ -610,12 +592,12 @@ "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, - "editorMode": "code", - "expr": "rate(vllm:prompt_tokens_total{model_name=\"$model_name\"}[$__rate_interval])", + "editorMode": "builder", + "expr": "vllm:num_requests_running{model_name=\"$model_name\"}", "fullMetaSearch": false, - "includeNullMetadata": false, + "includeNullMetadata": true, "instant": false, - "legendFormat": "Prompt Tokens/Sec", + "legendFormat": "Num Running", "range": true, "refId": "A", 
"useBackend": false @@ -627,12 +609,12 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "rate(vllm:generation_tokens_total{model_name=\"$model_name\"}[$__rate_interval])", + "expr": "vllm:num_requests_swapped{model_name=\"$model_name\"}", "fullMetaSearch": false, "hide": false, - "includeNullMetadata": false, + "includeNullMetadata": true, "instant": false, - "legendFormat": "Generation Tokens/Sec", + "legendFormat": "Num Swapped", "range": true, "refId": "B", "useBackend": false @@ -642,16 +624,20 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "editorMode": "code", - "expr": "rate(vllm:tokens_total{model_name=\"$model_name\"}[$__rate_interval])", + "disableTextWrap": false, + "editorMode": "builder", + "expr": "vllm:num_requests_waiting{model_name=\"$model_name\"}", + "fullMetaSearch": false, "hide": false, + "includeNullMetadata": true, "instant": false, - "legendFormat": "Total Tokens/Sec", + "legendFormat": "Num Waiting", "range": true, - "refId": "C" + "refId": "C", + "useBackend": false } ], - "title": "Token Throughput", + "title": "Scheduler State", "type": "timeseries" }, { @@ -659,7 +645,7 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "description": "Inter token latency in seconds.", + "description": "P50, P90, P95, and P99 TTFT latency in seconds.", "fieldConfig": { "defaults": { "color": { @@ -719,9 +705,9 @@ "h": 8, "w": 12, "x": 0, - "y": 19 + "y": 16 }, - "id": 10, + "id": 5, "options": { "legend": { "calcs": [], @@ -742,8 +728,9 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, + "hide": false, "includeNullMetadata": false, "instant": false, "legendFormat": "P99", @@ -758,9 +745,8 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, - "hide": false, "includeNullMetadata": false, "instant": false, "legendFormat": "P95", @@ -775,7 +761,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -792,7 +778,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -808,36 +794,23 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": 
"rate(vllm:time_per_output_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])", + "expr": "rate(vllm:time_to_first_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_to_first_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])", "hide": false, "instant": false, - "legendFormat": "Mean", + "legendFormat": "Average", "range": true, "refId": "E" } ], - "title": "Time Per Output Token Latency", + "title": "Time To First Token Latency", "type": "timeseries" }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 27 - }, - "id": 16, - "panels": [], - "title": "Request status", - "type": "row" - }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "description": "End to end request latency measured in seconds.", + "description": "Percentage of used cache blocks by vLLM.", "fieldConfig": { "defaults": { "color": { @@ -880,7 +853,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -888,17 +862,17 @@ } ] }, - "unit": "s" + "unit": "percentunit" }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, - "x": 0, - "y": 28 + "x": 12, + "y": 16 }, - "id": 9, + "id": 4, "options": { "legend": { "calcs": [], @@ -917,30 +891,23 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "disableTextWrap": false, "editorMode": "code", - "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", - "fullMetaSearch": false, - "includeNullMetadata": false, + "expr": "vllm:gpu_cache_usage_perc{model_name=\"$model_name\"}", "instant": false, - "legendFormat": "P99", + "legendFormat": "GPU Cache Usage", "range": true, - "refId": "A", - "useBackend": false + "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", - "fullMetaSearch": false, + "editorMode": "code", + "expr": "vllm:cpu_cache_usage_perc{model_name=\"$model_name\"}", "hide": false, - "includeNullMetadata": false, "instant": false, - "legendFormat": "P95", + "legendFormat": "CPU Cache Usage", "range": true, "refId": "B" } @@ -1180,7 +1147,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1195,9 +1163,9 @@ "h": 8, "w": 12, "x": 0, - "y": 60 + "y": 32 }, - "id": 24, + "id": 11, "options": { "legend": { "calcs": [], @@ -1217,125 +1185,20 @@ "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, - "editorMode": "code", - "expr": "(increase(vllm:request_params_n_sum{model_name=\"$model_name\"}[$__rate_interval]))", + "editorMode": "builder", + "expr": "sum by(finished_reason) (increase(vllm:request_success_total{model_name=\"$model_name\"}[$__rate_interval]))", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "interval": "", - "legendFormat": "params_n", + "legendFormat": "__auto", "range": true, "refId": "A", "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "ddkg692mcrzeob" - }, - "editorMode": "code", - "expr": "(increase(vllm:request_params_best_of_sum{model_name=\"$model_name\"}[$__rate_interval]))", - "hide": false, - "instant": false, - "legendFormat": "best_of", - 
"range": true, - "refId": "B" } ], "title": "Finish Reason", "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "ddkg692mcrzeob" - }, - "description": "Heatmap of num of request max generation tokens", - "fieldConfig": { - "defaults": { - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "scaleDistribution": { - "type": "linear" - } - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 60 - }, - "id": 26, - "options": { - "calculate": false, - "cellGap": 1, - "cellValues": { - "unit": "none" - }, - "color": { - "exponent": 0.5, - "fill": "dark-orange", - "min": 0, - "mode": "scheme", - "reverse": false, - "scale": "exponential", - "scheme": "Spectral", - "steps": 64 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, - "legend": { - "show": true - }, - "rowsFrame": { - "layout": "auto", - "value": "Request count" - }, - "tooltip": { - "mode": "single", - "showColorScale": false, - "yHistogram": true - }, - "yAxis": { - "axisLabel": "Generation Length", - "axisPlacement": "left", - "reverse": false, - "unit": "none" - } - }, - "pluginVersion": "10.4.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "disableTextWrap": false, - "editorMode": "code", - "expr": "sum by(le) (increase(vllm:request_max_num_generation_tokens_bucket{model_name=\"$model_name\"}[$__rate_interval]))", - "format": "heatmap", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "{{le}}", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Request Max Generation Tokens", - "type": "heatmap" } ], "refresh": "", @@ -1379,8 +1242,8 @@ ] }, "time": { - "from": "2024-05-02T01:45:00.000Z", - "to": "2024-05-02T01:55:00.000Z" + "from": "now-5m", + "to": "now" }, "timepicker": {}, "timezone": "", diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 22e29ec6a8d2..02fdb90b953a 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1893,11 +1893,8 @@ def _get_stats(self, for seq in seq_group.get_finished_seqs() ]) max_num_generation_tokens_requests.append( - max( - seq.get_output_len() - for seq in seq_group.get_seqs() - ) - ) + max(seq.get_output_len() + for seq in seq_group.get_seqs())) if seq_group.sampling_params is not None: best_of_requests.append( seq_group.sampling_params.best_of) @@ -1959,6 +1956,8 @@ def _get_stats(self, # Metadata num_prompt_tokens_requests=num_prompt_tokens_requests, num_generation_tokens_requests=num_generation_tokens_requests, + max_num_generation_tokens_requests= + max_num_generation_tokens_requests, best_of_requests=best_of_requests, n_requests=n_requests, finished_reason_requests=finished_reason_requests, diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index ec6609123a63..1742f4d70ba8 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -527,6 +527,9 @@ def _log_prometheus(self, stats: Stats) -> None: self._log_histogram(self.metrics.histogram_n_request, stats.n_requests) self._log_histogram(self.metrics.histogram_best_of_request, stats.best_of_requests) + self._log_histogram( + self.metrics.histogram_max_num_generation_tokens_request, + stats.max_num_generation_tokens_requests) def _log_prometheus_interval(self, prompt_throughput: float, generation_throughput: float) -> None: From 982f9116104ab3cb7ec696b9b6569f9a9f55fccd Mon Sep 17 00:00:00 2001 From: harrywu <904714159@qq.com> 
Date: Sun, 8 Sep 2024 19:00:44 +0800 Subject: [PATCH 13/16] first_scheduled_time can be None --- vllm/engine/llm_engine.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 02fdb90b953a..e36234212038 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1875,16 +1875,18 @@ def _get_stats(self, # Latency timings time_e2e_requests.append(now - seq_group.metrics.arrival_time) - time_queue_requests.append( - seq_group.metrics.first_scheduled_time - - seq_group.metrics.arrival_time) - time_prefill_requests.append( - seq_group.metrics.first_token_time - - seq_group.metrics.first_scheduled_time) - time_decode_requests.append( - now - seq_group.metrics.first_token_time) - time_inference_requests.append( - now - seq_group.metrics.first_scheduled_time) + if (seq_group.metrics.first_scheduled_time is not None and + seq_group.metrics.first_token_time is not None): + time_queue_requests.append( + seq_group.metrics.first_scheduled_time - + seq_group.metrics.arrival_time) + time_prefill_requests.append( + seq_group.metrics.first_token_time - + seq_group.metrics.first_scheduled_time) + time_decode_requests.append( + now - seq_group.metrics.first_token_time) + time_inference_requests.append( + now - seq_group.metrics.first_scheduled_time) # Metadata num_prompt_tokens_requests.append( len(seq_group.prompt_token_ids)) From 9caa213c63cb313f0a6bded1b5c1192610f49b12 Mon Sep 17 00:00:00 2001 From: harrywu <904714159@qq.com> Date: Sun, 8 Sep 2024 19:50:41 +0800 Subject: [PATCH 14/16] add some panels in grafana dashboard --- examples/production_monitoring/grafana.json | 383 ++++++++++++++++++-- 1 file changed, 343 insertions(+), 40 deletions(-) diff --git a/examples/production_monitoring/grafana.json b/examples/production_monitoring/grafana.json index 1af710115730..f76a61bb5eec 100644 --- a/examples/production_monitoring/grafana.json +++ b/examples/production_monitoring/grafana.json @@ -1,33 +1,4 @@ { - "__inputs": [ - ], - "__elements": {}, - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "10.4.2" - }, - { - "type": "panel", - "id": "heatmap", - "name": "Heatmap", - "version": "" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "timeseries", - "name": "Time series", - "version": "" - } - ], "annotations": { "list": [ { @@ -54,7 +25,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": null, + "id": 1, "links": [], "liveNow": false, "panels": [ @@ -76,6 +47,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -241,6 +213,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -358,6 +331,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -523,6 +497,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -658,6 +633,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -823,6 +799,7 @@ "axisLabel": "", "axisPlacement": "auto", 
"barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -984,7 +961,7 @@ "unit": "none" } }, - "pluginVersion": "10.4.2", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -1076,7 +1053,7 @@ "unit": "none" } }, - "pluginVersion": "10.4.2", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -1117,6 +1094,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -1147,8 +1125,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1199,6 +1176,319 @@ ], "title": "Finish Reason", "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "seconds", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 14, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "edx8memhpd9tsa" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(vllm:request_queue_time_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Queue Time", + "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "id": 15, + "options": { + "legend": { + "calcs": [], + 
"displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "edx8memhpd9tsa" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(vllm:request_prefill_time_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Prefill", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(vllm:request_decode_time_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])", + "hide": false, + "instant": false, + "legendFormat": "Decode", + "range": true, + "refId": "B" + } + ], + "title": "Requests Prefill and Decode Time", + "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 40 + }, + "id": 16, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "edx8memhpd9tsa" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(vllm:request_max_num_generation_tokens_sum{model_name=\"$model_name\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Tokens", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Max Generation Token in Sequence Group", + "type": "timeseries" } ], "refresh": "", @@ -1207,21 +1497,34 @@ "templating": { "list": [ { - "type": "datasource", - "name": "DS_PROMETHEUS", - "label": "datasource", - "current": {}, + "current": { + "selected": false, + "text": "prometheus", + "value": "edx8memhpd9tsa" + }, "hide": 0, "includeAll": false, + "label": "datasource", "multi": false, + "name": "DS_PROMETHEUS", "options": [], "query": "prometheus", "queryValue": "", "refresh": 1, "regex": "", - "skipUrlSync": false + "skipUrlSync": false, + "type": "datasource" }, { + "current": { + "selected": false, + "text": "/share/datasets/public_models/Meta-Llama-3-8B-Instruct", + "value": "/share/datasets/public_models/Meta-Llama-3-8B-Instruct" + }, + "datasource": { + "type": "prometheus", + "uid": "edx8memhpd9tsa" + }, "definition": "label_values(model_name)", "hide": 0, "includeAll": false, @@ -1249,6 +1552,6 @@ "timezone": "", "title": 
"vLLM", "uid": "b281712d-8bff-41ef-9f3f-71ad43c05e9b", - "version": 1, + "version": 8, "weekStart": "" } From fa66f541763c55e6aea9a06a28ef358a41e188a7 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 8 Nov 2024 04:33:27 +0000 Subject: [PATCH 15/16] Fix wrong category Signed-off-by: DarkLight1337 --- vllm/engine/llm_engine.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 6e745a74692d..9ae9b0d7de4e 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1875,14 +1875,14 @@ def _get_stats(self, time_inference_requests=time_inference_requests, time_prefill_requests=time_prefill_requests, time_decode_requests=time_decode_requests, + time_in_queue_requests=time_in_queue_requests, + model_forward_time_requests=model_forward_time_requests, + model_execute_time_requests=model_execute_time_requests, # Metadata num_prompt_tokens_requests=num_prompt_tokens_requests, num_generation_tokens_requests=num_generation_tokens_requests, max_num_generation_tokens_requests= max_num_generation_tokens_requests, - time_in_queue_requests=time_in_queue_requests, - model_forward_time_requests=model_forward_time_requests, - model_execute_time_requests=model_execute_time_requests, n_requests=n_requests, max_tokens_requests=max_tokens_requests, finished_reason_requests=finished_reason_requests, From 8bb930b4b0beed2913ba51577206a37d05ec34cc Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 8 Nov 2024 04:34:42 +0000 Subject: [PATCH 16/16] Remove best_of Signed-off-by: DarkLight1337 --- vllm/engine/metrics.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 9e937d5ae0ba..e896bcdded2d 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -204,12 +204,6 @@ def __init__(self, labelnames: List[str], max_model_len: int): labelnames=labelnames, buckets=build_1_2_5_buckets(max_model_len), ) - self.histogram_best_of_request = self._histogram_cls( - name="vllm:request_params_best_of", - documentation="Histogram of the best_of request parameter.", - labelnames=labelnames, - buckets=[1, 2, 5, 10, 20], - ) self.histogram_max_num_generation_tokens_request = self._histogram_cls( name="vllm:request_max_num_generation_tokens", documentation=