@@ -19,7 +19,7 @@ spec:
protocolType: http
port: '8000'
path: metrics
-targetMetric: gpu_cache_usage_perc
+targetMetric: kv_cache_usage_perc
targetValue: '0.5'
scaleTargetRef:
apiVersion: apps/v1
@@ -15,7 +15,7 @@ spec:
protocolType: http
port: '8000'
path: /metrics
-targetMetric: gpu_cache_usage_perc
+targetMetric: kv_cache_usage_perc
targetValue: '50'
scaleTargetRef:
apiVersion: apps/v1
@@ -17,7 +17,7 @@ spec:
protocolType: http
port: '8000'
path: metrics
-targetMetric: gpu_cache_usage_perc
+targetMetric: kv_cache_usage_perc
targetValue: '0.5'
scaleTargetRef:
apiVersion: apps/v1
4 changes: 2 additions & 2 deletions development/app/README.md
@@ -201,7 +201,7 @@ The following keys can be included in the JSON payload to override metrics:
- `swapped` → `vllm:num_requests_swapped`
- `avg_prompt_throughput` → `vllm:avg_prompt_throughput_toks_per_s`
- `avg_generation_throughput` → `vllm:avg_generation_throughput_toks_per_s`
-- `gpu_cache_usage_perc` → `vllm:gpu_cache_usage_perc`
+- `kv_cache_usage_perc` → `vllm:kv_cache_usage_perc`
- `cpu_cache_usage_perc` → `vllm:cpu_cache_usage_perc`
- `model_name` – sets the `model_name` label on all metrics

@@ -217,7 +217,7 @@ curl -X GET http://localhost:8000/metrics
curl -X POST http://localhost:8000/set_metrics \
-H "Content-Type: application/json" \
-d '{
"gpu_cache_usage_perc": 75.0,
"kv_cache_usage_perc": 75.0,
"running": 50,
"waiting": 10,
"success_total": 200
8 changes: 4 additions & 4 deletions development/app/app.py
@@ -886,8 +886,8 @@ def metrics():
waiting = overrides.get("waiting", randint(1, 100))
swapped = overrides.get("swapped", randint(1, 100))
max_running_capacity = 100
-gpu_cache_usage_perc = overrides.get(
-    "gpu_cache_usage_perc", min(100.0, (running / max_running_capacity) * 100)
+kv_cache_usage_perc = overrides.get(
+    "kv_cache_usage_perc", min(100.0, (running / max_running_capacity) * 100)
)
cpu_cache_usage_perc = overrides.get(
"cpu_cache_usage_perc", min(100.0, (cpu_running / max_running_capacity) * 100)
@@ -946,10 +946,10 @@ def metrics():
),
},
{
"name": "gpu_cache_usage_perc",
"name": "kv_cache_usage_perc",
"type": "gauge",
"description": "GPU KV-cache usage. 1 means 100 percent usage.",
"value": overrides.get("gpu_cache_usage_perc", gpu_cache_usage_perc),
"value": overrides.get("kv_cache_usage_perc", kv_cache_usage_perc),
},
{
"name": "cpu_cache_usage_perc",
2 changes: 1 addition & 1 deletion development/tutorials/distributed/fleet-autoscaling.yaml
@@ -16,7 +16,7 @@ spec:
protocolType: http
port: '8000'
path: metrics
-targetMetric: gpu_cache_usage_perc
+targetMetric: kv_cache_usage_perc
targetValue: '70'
scaleTargetRef:
apiVersion: orchestration.aibrix.ai/v1alpha1
2 changes: 1 addition & 1 deletion development/tutorials/podautoscaler/hpa.yaml
@@ -11,7 +11,7 @@ spec:
metrics:
- pods:
metric:
-name: gpu_cache_usage_perc
+name: kv_cache_usage_perc
target:
averageValue: "40"
type: AverageValue
2 changes: 1 addition & 1 deletion development/tutorials/podautoscaler/pa.yaml
@@ -18,6 +18,6 @@ spec:
protocolType: "http"
port: "8000"
path: "/metrics"
targetMetric: "gpu_cache_usage_perc"
targetMetric: "kv_cache_usage_perc"
targetValue: "40"
scalingStrategy: "HPA"
@@ -120,7 +120,7 @@ check its logs in this way.

kubectl logs <aibrix-controller-manager-podname> -n aibrix-system -f

-Expected log output. You can see the current metric is gpu_cache_usage_perc. You can check each pod's current metric value.
+Expected log output. You can see the current metric is kv_cache_usage_perc. You can check each pod's current metric value.

.. image:: ../../assets/images/autoscaler/aibrix-controller-manager-output.png
:alt: AiBrix controller manager output
@@ -98,7 +98,7 @@ Here we show the preliminary experiment results to show how different autoscalin
Experiments Results
^^^^^^^^^^^^^^^^^^^

-- gpu_cache_usage_perc: 70
+- kv_cache_usage_perc: 70

.. image:: ../../assets/images/autoscaler/optimizer-based-autoscaling-70-results.png
:alt: result
4 changes: 2 additions & 2 deletions docs/source/features/multi-engine.rst
@@ -104,8 +104,8 @@ We only support limited number of metrics from different engines and we will con
- vllm:request_prefill_time_seconds
- N/A
- N/A
-* - gpu_cache_usage_perc
-  - vllm:gpu_cache_usage_perc
+* - kv_cache_usage_perc
+  - vllm:kv_cache_usage_perc
- sglang:token_usage [1]_
- kv_cache_utilization
* - engine_utilization
6 changes: 3 additions & 3 deletions docs/source/features/runtime.rst
@@ -170,9 +170,9 @@ Unified metrics help to standardize the metrics for different inference engines
# HELP vllm:num_requests_waiting Number of requests waiting to be processed.
# TYPE vllm:num_requests_waiting gauge
vllm:num_requests_waiting{model_name="Qwen/Qwen2.5-Coder-1.5B-Instruct"} 0.0
-# HELP vllm:gpu_cache_usage_perc GPU KV-cache usage. 1 means 100 percent usage.
-# TYPE vllm:gpu_cache_usage_perc gauge
-vllm:gpu_cache_usage_perc{model_name="Qwen/Qwen2.5-Coder-1.5B-Instruct"} 0.0
+# HELP vllm:kv_cache_usage_perc GPU KV-cache usage. 1 means 100 percent usage.
+# TYPE vllm:kv_cache_usage_perc gauge
+vllm:kv_cache_usage_perc{model_name="Qwen/Qwen2.5-Coder-1.5B-Instruct"} 0.0
# HELP vllm:cpu_cache_usage_perc CPU KV-cache usage. 1 means 100 percent usage.
# TYPE vllm:cpu_cache_usage_perc gauge
vllm:cpu_cache_usage_perc{model_name="Qwen/Qwen2.5-Coder-1.5B-Instruct"} 0.0
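As a side note (not part of this diff), the unified exposition shown above can be read back with the Prometheus text parser; the small, self-contained Go sketch below illustrates this, which is the read direction of what the fetcher tests later in this PR exercise in the write direction. The metric value and model name are illustrative only.

```go
package main

import (
	"fmt"
	"strings"

	"github.com/prometheus/common/expfmt"
)

func main() {
	// A tiny exposition snippet using the renamed metric; values are made up.
	exposition := `# HELP vllm:kv_cache_usage_perc GPU KV-cache usage. 1 means 100 percent usage.
# TYPE vllm:kv_cache_usage_perc gauge
vllm:kv_cache_usage_perc{model_name="Qwen/Qwen2.5-Coder-1.5B-Instruct"} 0.42
`

	// Parse the Prometheus text format into metric families.
	var parser expfmt.TextParser
	families, err := parser.TextToMetricFamilies(strings.NewReader(exposition))
	if err != nil {
		panic(err)
	}

	// Look up the gauge by its (renamed) metric name and print each sample.
	if mf, ok := families["vllm:kv_cache_usage_perc"]; ok {
		for _, m := range mf.GetMetric() {
			fmt.Println(m.GetGauge().GetValue()) // 0.42
		}
	}
}
```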
2 changes: 1 addition & 1 deletion observability/grafana/AIBrix_vLLM_Engine_Dashboard.json
@@ -1769,7 +1769,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "avg(vllm:gpu_cache_usage_perc{model_name=\"$model_name\",job=\"${job}\"})",
"expr": "avg(vllm:kv_cache_usage_perc{model_name=\"$model_name\",job=\"${job}\"})",
"instant": false,
"legendFormat": "GPU Cache Usage",
"range": true,
@@ -50,7 +50,7 @@ type mockScalingContext struct {
PanicWindow time.Duration
ScaleDownDelay time.Duration

-MetricTargets map[string]scalingctx.MetricTarget // key: metric name (e.g., "cpu", "gpu_cache_usage_perc")
+MetricTargets map[string]scalingctx.MetricTarget // key: metric name (e.g., "cpu", "kv_cache_usage_perc")
}

// Ensure MockScalingContext implements the ScalingContext interface
4 changes: 2 additions & 2 deletions pkg/controller/podautoscaler/autoscaler_test.go
@@ -45,7 +45,7 @@ func TestComputeDesiredReplicas(t *testing.T) {
metricsSources: []autoscalingv1alpha1.MetricSource{
{
MetricSourceType: autoscalingv1alpha1.POD,
-TargetMetric: "gpu_cache_usage_perc",
+TargetMetric: "kv_cache_usage_perc",
TargetValue: "50",
},
},
@@ -56,7 +56,7 @@ func TestComputeDesiredReplicas(t *testing.T) {
metricsSources: []autoscalingv1alpha1.MetricSource{
{
MetricSourceType: autoscalingv1alpha1.POD,
-TargetMetric: "gpu_cache_usage_perc",
+TargetMetric: "kv_cache_usage_perc",
TargetValue: "50",
},
{
2 changes: 1 addition & 1 deletion pkg/controller/podautoscaler/context/context.go
@@ -87,7 +87,7 @@ type baseScalingContext struct {
// Panic mode state
InPanicMode bool
// MetricTargets used to store multiple metrics
-MetricTargets map[string]MetricTarget // key: metric name (e.g., "cpu", "gpu_cache_usage_perc")
+MetricTargets map[string]MetricTarget // key: metric name (e.g., "cpu", "kv_cache_usage_perc")
}

type MetricTarget struct {
6 changes: 3 additions & 3 deletions pkg/controller/podautoscaler/context/context_test.go
@@ -37,7 +37,7 @@ func TestUpdateByPaTypes_MetricsSources(t *testing.T) {
MetricsSources: []autoscalingv1alpha1.MetricSource{
{
MetricSourceType: autoscalingv1alpha1.POD,
-TargetMetric: "gpu_cache_usage_perc",
+TargetMetric: "kv_cache_usage_perc",
TargetValue: "50",
},
{
@@ -55,10 +55,10 @@
}

expectedMetricTargets := map[string]MetricTarget{
"gpu_cache_usage_perc": {
"kv_cache_usage_perc": {
TargetValue: 50,
TotalValue: 100,
-ScalingMetric: "gpu_cache_usage_perc",
+ScalingMetric: "kv_cache_usage_perc",
MetricType: autoscalingv1alpha1.POD,
},
"cpu": {
4 changes: 2 additions & 2 deletions pkg/controller/podautoscaler/hpa_resources_test.go
@@ -55,7 +55,7 @@ func TestMakeHPA(t *testing.T) {
MetricsSources: []autoscalingv1alpha1.MetricSource{
{
MetricSourceType: autoscalingv1alpha1.POD,
-TargetMetric: "gpu_cache_usage_perc",
+TargetMetric: "kv_cache_usage_perc",
TargetValue: "50",
},
{
@@ -131,7 +131,7 @@ func TestMakeHPA(t *testing.T) {
Type: autoscalingv2.PodsMetricSourceType,
Pods: &autoscalingv2.PodsMetricSource{
Metric: autoscalingv2.MetricIdentifier{
Name: "gpu_cache_usage_perc",
Name: "kv_cache_usage_perc",
},
Target: autoscalingv2.MetricTarget{
Type: autoscalingv2.AverageValueMetricType,
12 changes: 6 additions & 6 deletions pkg/controller/podautoscaler/metrics/client_test.go
@@ -31,7 +31,7 @@ func TestUpdateMetrics(t *testing.T) {
metricKey := types.MetricKey{
Namespace: "default",
Name: "test-llm",
-MetricName: "gpu_cache_usage_perc",
+MetricName: "kv_cache_usage_perc",
PaNamespace: "default",
PaName: "test-llm-apa",
}
@@ -45,22 +45,22 @@
assert.NoError(t, err)

assert.Len(t, client.stableWindows, 1)
-tw := client.stableWindows["default/test-llm-apa/gpu_cache_usage_perc"]
+tw := client.stableWindows["default/test-llm-apa/kv_cache_usage_perc"]
assert.NotNil(t, tw)
assert.Len(t, tw.Values(), 1)
assert.Equal(t, expectedValue, tw.Values()[0])

assert.Len(t, client.panicWindows, 1)
-tw = client.panicWindows["default/test-llm-apa/gpu_cache_usage_perc"]
+tw = client.panicWindows["default/test-llm-apa/kv_cache_usage_perc"]
assert.NotNil(t, tw)
assert.Len(t, tw.Values(), 1)
assert.Equal(t, expectedValue, tw.Values()[0])

assert.Len(t, client.stableHistory, 1)
-assert.NotNil(t, client.stableHistory["default/test-llm-apa/gpu_cache_usage_perc"])
+assert.NotNil(t, client.stableHistory["default/test-llm-apa/kv_cache_usage_perc"])

assert.Len(t, client.panicHistory, 1)
-assert.NotNil(t, client.panicHistory["default/test-llm-apa/gpu_cache_usage_perc"])
+assert.NotNil(t, client.panicHistory["default/test-llm-apa/kv_cache_usage_perc"])
}

func TestGetMetricValue(t *testing.T) {
@@ -69,7 +69,7 @@ func TestGetMetricValue(t *testing.T) {
metricKey := types.MetricKey{
Namespace: "default",
Name: "test-llm",
-MetricName: "gpu_cache_usage_perc",
+MetricName: "kv_cache_usage_perc",
PaNamespace: "default",
PaName: "test-llm-apa",
}
4 changes: 2 additions & 2 deletions pkg/controller/podautoscaler/metrics/fetcher_test.go
@@ -48,7 +48,7 @@ func TestRestMetricsFetcher_FetchPodMetrics(t *testing.T) {

server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
_, err := expfmt.MetricFamilyToText(w, &dto.MetricFamily{
-Name: ptr.To("vllm:gpu_cache_usage_perc"),
+Name: ptr.To("vllm:kv_cache_usage_perc"),
Type: dto.MetricType_GAUGE.Enum(),
Metric: []*dto.Metric{
{
@@ -84,7 +84,7 @@ func TestRestMetricsFetcher_FetchPodMetrics(t *testing.T) {
}
source := autoscalingv1alpha1.MetricSource{
MetricSourceType: autoscalingv1alpha1.POD,
-TargetMetric: "gpu_cache_usage_perc",
+TargetMetric: "kv_cache_usage_perc",
Port: port,
}

6 changes: 3 additions & 3 deletions pkg/metrics/engine_fetcher_test.go
@@ -36,9 +36,9 @@ vllm_num_requests_running{model_name="meta-llama/Llama-2-7b-chat-hf"} 2.0
# HELP vllm_num_requests_waiting Number of requests waiting to be processed.
# TYPE vllm_num_requests_waiting gauge
vllm_num_requests_waiting{model_name="meta-llama/Llama-2-7b-chat-hf"} 3.0
-# HELP vllm_gpu_cache_usage_perc GPU KV-cache usage. 1.0 means 100 percent usage.
-# TYPE vllm_gpu_cache_usage_perc gauge
-vllm_gpu_cache_usage_perc 0.75
+# HELP vllm_kv_cache_usage_perc GPU KV-cache usage. 1.0 means 100 percent usage.
+# TYPE vllm_kv_cache_usage_perc gauge
+vllm_kv_cache_usage_perc 0.75
# HELP vllm_time_to_first_token_seconds Histogram of time to first token in seconds.
# TYPE vllm_time_to_first_token_seconds histogram
vllm_time_to_first_token_seconds_bucket{model_name="meta-llama/Llama-2-7b-chat-hf",le="0.001"} 0.0
4 changes: 2 additions & 2 deletions pkg/metrics/metrics.go
@@ -39,7 +39,7 @@ const (
AvgTPOT5mPod = "avg_tpot_pod_5m"
AvgPromptToksPerReq = "avg_prompt_toks_per_req"
AvgGenerationToksPerReq = "avg_generation_toks_per_req"
GPUCacheUsagePerc = "gpu_cache_usage_perc"
GPUCacheUsagePerc = "kv_cache_usage_perc"
Review comment (Contributor, medium):

For consistency with the new metric name kv_cache_usage_perc, it would be clearer to rename this constant to KVCacheUsagePerc. This would also require updating its usage in the Metrics map at line 300.

Suggested change:

-GPUCacheUsagePerc = "kv_cache_usage_perc"
+KVCacheUsagePerc = "kv_cache_usage_perc"

GPUBusyTimeRatio = "gpu_busy_time_ratio"
CPUCacheUsagePerc = "cpu_cache_usage_perc"
EngineUtilization = "engine_utilization"
@@ -304,7 +304,7 @@ var (
Raw: Counter,
},
EngineMetricsNameMapping: map[string]string{
"vllm": "vllm:gpu_cache_usage_perc",
"vllm": "vllm:kv_cache_usage_perc",
"sglang": "sglang:token_usage", // Based on https://github.com/sgl-project/sglang/issues/5979
"xllm": "kv_cache_utilization",
},
6 changes: 3 additions & 3 deletions pkg/metrics/utils_test.go
@@ -33,9 +33,9 @@ func TestParseHistogramWithLabels(t *testing.T) {
# HELP vllm:num_requests_waiting Number of requests waiting to be processed.
# TYPE vllm:num_requests_waiting gauge
vllm:num_requests_waiting{model_name="Qwen/Qwen2.5-1.5B-Instruct"} 0.0
-# HELP vllm:gpu_cache_usage_perc GPU KV-cache usage. 1 means 100 percent usage.
-# TYPE vllm:gpu_cache_usage_perc gauge
-vllm:gpu_cache_usage_perc{model_name="Qwen/Qwen2.5-1.5B-Instruct"} 0.0
+# HELP vllm:kv_cache_usage_perc GPU KV-cache usage. 1 means 100 percent usage.
+# TYPE vllm:kv_cache_usage_perc gauge
+vllm:kv_cache_usage_perc{model_name="Qwen/Qwen2.5-1.5B-Instruct"} 0.0
# HELP vllm:time_per_output_token_seconds histogram
vllm:time_per_output_token_seconds_sum{model_name="Qwen/Qwen2.5-1.5B-Instruct"} 0.23455095291137695
vllm:time_per_output_token_seconds_count{model_name="Qwen/Qwen2.5-1.5B-Instruct"} 29.0
4 changes: 2 additions & 2 deletions python/aibrix/aibrix/metrics/engine_rules.py
@@ -26,8 +26,8 @@
"vllm:num_requests_waiting": RenameStandardRule(
"vllm:num_requests_waiting", "aibrix:queue_size"
),
"vllm:gpu_cache_usage_perc": RenameStandardRule(
"vllm:gpu_cache_usage_perc", "aibrix:gpu_cache_usage_perc"
"vllm:kv_cache_usage_perc": RenameStandardRule(
"vllm:kv_cache_usage_perc", "aibrix:kv_cache_usage_perc"
),
# Token processing metrics
"vllm:prompt_tokens_total": RenameStandardRule(
6 changes: 3 additions & 3 deletions python/aibrix/tests/metrics/test_metrics_multi_engine.py
@@ -33,9 +33,9 @@ def test_get_metric_rules_vllm(self):
assert isinstance(rules["vllm:num_requests_waiting"], RenameStandardRule)
assert rules["vllm:num_requests_waiting"].new_name == "aibrix:queue_size"

assert "vllm:gpu_cache_usage_perc" in rules
assert "vllm:kv_cache_usage_perc" in rules
assert (
rules["vllm:gpu_cache_usage_perc"].new_name == "aibrix:gpu_cache_usage_perc"
rules["vllm:kv_cache_usage_perc"].new_name == "aibrix:kv_cache_usage_perc"
)

# Check token processing metrics exist
@@ -307,7 +307,7 @@ def test_vllm_hybrid_coverage(self):
# Essential metrics should be renamed
essential_mapped = [
"vllm:num_requests_waiting",
"vllm:gpu_cache_usage_perc",
"vllm:kv_cache_usage_perc",
"vllm:prompt_tokens_total",
"vllm:generation_tokens_total",
"vllm:time_to_first_token_seconds",
@@ -2087,7 +2087,7 @@ index 000000000..311169501
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr": "avg(vllm:gpu_cache_usage_perc{model_name=\"$model_name\", job=\"pods\"})",
+ "expr": "avg(vllm:kv_cache_usage_perc{model_name=\"$model_name\", job=\"pods\"})",
+ "instant": false,
+ "legendFormat": "GPU Cache Usage",
+ "range": true,
@@ -1648,7 +1648,7 @@ index 000000000..311169501
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr": "avg(vllm:gpu_cache_usage_perc{model_name=\"$model_name\", job=\"pods\"})",
+ "expr": "avg(vllm:kv_cache_usage_perc{model_name=\"$model_name\", job=\"pods\"})",
+ "instant": false,
+ "legendFormat": "GPU Cache Usage",
+ "range": true,
2 changes: 1 addition & 1 deletion samples/autoscaling/apa.yaml
@@ -19,7 +19,7 @@ spec:
protocolType: http
port: '8000'
path: metrics
-targetMetric: gpu_cache_usage_perc
+targetMetric: kv_cache_usage_perc
targetValue: '0.5'
scaleTargetRef:
apiVersion: apps/v1
2 changes: 1 addition & 1 deletion samples/autoscaling/hpa.yaml
@@ -15,7 +15,7 @@ spec:
protocolType: http
port: '8000'
path: /metrics
-targetMetric: gpu_cache_usage_perc
+targetMetric: kv_cache_usage_perc
targetValue: '50'
scaleTargetRef:
apiVersion: apps/v1