diff --git a/examples/online_serving/dashboards/perses/performance_statistics.yaml b/examples/online_serving/dashboards/perses/performance_statistics.yaml index 2e8d24c3324b..8030fe2f00a9 100644 --- a/examples/online_serving/dashboards/perses/performance_statistics.yaml +++ b/examples/online_serving/dashboards/perses/performance_statistics.yaml @@ -530,7 +530,7 @@ spec: name: accelerators-thanos-querier-datasource # Multiply by 100 so we can read it as a percentage without setting a unit (avoids CUE unit conflicts) query: > - 100 * avg(vllm:gpu_cache_usage_perc) + 100 * avg(vllm:kv_cache_usage_perc) "18": kind: Panel diff --git a/examples/online_serving/dashboards/perses/query_statistics.yaml b/examples/online_serving/dashboards/perses/query_statistics.yaml index 28109aae8151..ad8e047f6dfe 100644 --- a/examples/online_serving/dashboards/perses/query_statistics.yaml +++ b/examples/online_serving/dashboards/perses/query_statistics.yaml @@ -98,7 +98,7 @@ spec: kind: PrometheusTimeSeriesQuery spec: datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource } - query: avg(vllm:gpu_cache_usage_perc{namespace="$NS",service="$SVC"}) or vector(0) + query: avg(vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) or vector(0) minStep: "15s" core_running_ts: @@ -168,7 +168,7 @@ spec: spec: datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource } # multiply by 100 to present percentage; omit format.unit to avoid schema conflicts - query: (avg(vllm:gpu_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0) + query: (avg(vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0) minStep: "15s" core_kv_usage_pct_ts: @@ -187,7 +187,7 @@ spec: kind: PrometheusTimeSeriesQuery spec: datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource } - query: (avg by (service) (vllm:gpu_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0) + query: (avg by (service) (vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0) minStep: "15s" # --- Per-Pod breakdowns (works on Simulator & Real) --- @@ -246,7 +246,7 @@ spec: spec: datasource: { kind: PrometheusDatasource, name: accelerators-thanos-querier-datasource } # if your exporter labels kv metric with pod (the sim does), this works; otherwise it will just return empty - query: (avg by (pod) (vllm:gpu_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0) + query: (avg by (pod) (vllm:kv_cache_usage_perc{namespace="$NS",service="$SVC"}) * 100) or vector(0) minStep: "15s" # --- Real vLLM only (zeros on simulator) --- diff --git a/examples/online_serving/prometheus_grafana/grafana.json b/examples/online_serving/prometheus_grafana/grafana.json index 37abc9de926f..1c89d4593830 100644 --- a/examples/online_serving/prometheus_grafana/grafana.json +++ b/examples/online_serving/prometheus_grafana/grafana.json @@ -852,7 +852,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "vllm:gpu_cache_usage_perc{model_name=\"$model_name\"}", + "expr": "vllm:kv_cache_usage_perc{model_name=\"$model_name\"}", "instant": false, "legendFormat": "GPU Cache Usage", "range": true,