diff --git a/benchmarks/scenarios/autoscaling/deepseek-llm-7b-chat/apa.yaml b/benchmarks/scenarios/autoscaling/deepseek-llm-7b-chat/apa.yaml
index a81d13815..f9be15c00 100644
--- a/benchmarks/scenarios/autoscaling/deepseek-llm-7b-chat/apa.yaml
+++ b/benchmarks/scenarios/autoscaling/deepseek-llm-7b-chat/apa.yaml
@@ -19,7 +19,7 @@ spec:
       protocolType: http
       port: '8000'
       path: metrics
-      targetMetric: gpu_cache_usage_perc
+      targetMetric: kv_cache_usage_perc
       targetValue: '0.5'
   scaleTargetRef:
     apiVersion: apps/v1
diff --git a/benchmarks/scenarios/autoscaling/deepseek-llm-7b-chat/hpa.yaml b/benchmarks/scenarios/autoscaling/deepseek-llm-7b-chat/hpa.yaml
index 55adfc001..80d69cfe5 100644
--- a/benchmarks/scenarios/autoscaling/deepseek-llm-7b-chat/hpa.yaml
+++ b/benchmarks/scenarios/autoscaling/deepseek-llm-7b-chat/hpa.yaml
@@ -15,7 +15,7 @@ spec:
       protocolType: http
       port: '8000'
       path: /metrics
-      targetMetric: gpu_cache_usage_perc
+      targetMetric: kv_cache_usage_perc
       targetValue: '50'
   scaleTargetRef:
     apiVersion: apps/v1
diff --git a/benchmarks/scenarios/autoscaling/deepseek-llm-7b-chat/kpa.yaml b/benchmarks/scenarios/autoscaling/deepseek-llm-7b-chat/kpa.yaml
index c49d4546b..81d5b2d25 100644
--- a/benchmarks/scenarios/autoscaling/deepseek-llm-7b-chat/kpa.yaml
+++ b/benchmarks/scenarios/autoscaling/deepseek-llm-7b-chat/kpa.yaml
@@ -17,7 +17,7 @@ spec:
       protocolType: http
       port: '8000'
       path: metrics
-      targetMetric: gpu_cache_usage_perc
+      targetMetric: kv_cache_usage_perc
       targetValue: '0.5'
   scaleTargetRef:
     apiVersion: apps/v1
diff --git a/development/app/README.md b/development/app/README.md
index 7fe310d91..608e410aa 100644
--- a/development/app/README.md
+++ b/development/app/README.md
@@ -201,7 +201,7 @@ The following keys can be included in the JSON payload to override metrics:
 - `swapped` → `vllm:num_requests_swapped`
 - `avg_prompt_throughput` → `vllm:avg_prompt_throughput_toks_per_s`
 - `avg_generation_throughput` → `vllm:avg_generation_throughput_toks_per_s`
-- `gpu_cache_usage_perc` → `vllm:gpu_cache_usage_perc`
+- `kv_cache_usage_perc` → `vllm:kv_cache_usage_perc`
 - `cpu_cache_usage_perc` → `vllm:cpu_cache_usage_perc`
 - `model_name` – sets the `model_name` label on all metrics
 
@@ -217,7 +217,7 @@ curl -X GET http://localhost:8000/metrics
 curl -X POST http://localhost:8000/set_metrics \
   -H "Content-Type: application/json" \
   -d '{
-    "gpu_cache_usage_perc": 75.0,
+    "kv_cache_usage_perc": 75.0,
     "running": 50,
     "waiting": 10,
     "success_total": 200
diff --git a/development/app/app.py b/development/app/app.py
index fe64cd1e8..b778e6230 100644
--- a/development/app/app.py
+++ b/development/app/app.py
@@ -886,8 +886,8 @@ def metrics():
     waiting = overrides.get("waiting", randint(1, 100))
     swapped = overrides.get("swapped", randint(1, 100))
     max_running_capacity = 100
-    gpu_cache_usage_perc = overrides.get(
-        "gpu_cache_usage_perc", min(100.0, (running / max_running_capacity) * 100)
+    kv_cache_usage_perc = overrides.get(
+        "kv_cache_usage_perc", min(100.0, (running / max_running_capacity) * 100)
     )
     cpu_cache_usage_perc = overrides.get(
         "cpu_cache_usage_perc", min(100.0, (cpu_running / max_running_capacity) * 100)
@@ -946,10 +946,10 @@ def metrics():
             ),
         },
         {
-            "name": "gpu_cache_usage_perc",
+            "name": "kv_cache_usage_perc",
             "type": "gauge",
             "description": "GPU KV-cache usage. 1 means 100 percent usage.",
-            "value": overrides.get("gpu_cache_usage_perc", gpu_cache_usage_perc),
+            "value": overrides.get("kv_cache_usage_perc", kv_cache_usage_perc),
         },
         {
             "name": "cpu_cache_usage_perc",
diff --git a/development/tutorials/distributed/fleet-autoscaling.yaml b/development/tutorials/distributed/fleet-autoscaling.yaml
index 605bb08a8..a3a940d00 100644
--- a/development/tutorials/distributed/fleet-autoscaling.yaml
+++ b/development/tutorials/distributed/fleet-autoscaling.yaml
@@ -16,7 +16,7 @@ spec:
       protocolType: http
       port: '8000'
       path: metrics
-      targetMetric: gpu_cache_usage_perc
+      targetMetric: kv_cache_usage_perc
       targetValue: '70'
   scaleTargetRef:
     apiVersion: orchestration.aibrix.ai/v1alpha1
diff --git a/development/tutorials/podautoscaler/hpa.yaml b/development/tutorials/podautoscaler/hpa.yaml
index 502ba7fc3..acd067fd2 100644
--- a/development/tutorials/podautoscaler/hpa.yaml
+++ b/development/tutorials/podautoscaler/hpa.yaml
@@ -11,7 +11,7 @@ spec:
   metrics:
     - pods:
         metric:
-          name: gpu_cache_usage_perc
+          name: kv_cache_usage_perc
         target:
           averageValue: "40"
           type: AverageValue
diff --git a/development/tutorials/podautoscaler/pa.yaml b/development/tutorials/podautoscaler/pa.yaml
index ce1a15985..39c66ef9f 100644
--- a/development/tutorials/podautoscaler/pa.yaml
+++ b/development/tutorials/podautoscaler/pa.yaml
@@ -18,6 +18,6 @@ spec:
       protocolType: "http"
       port: "8000"
       path: "/metrics"
-      targetMetric: "gpu_cache_usage_perc"
+      targetMetric: "kv_cache_usage_perc"
       targetValue: "40"
   scalingStrategy: "HPA"
diff --git a/docs/source/features/autoscaling/metric-based-autoscaling.rst b/docs/source/features/autoscaling/metric-based-autoscaling.rst
index 9fccf91b7..8e46c7c1d 100644
--- a/docs/source/features/autoscaling/metric-based-autoscaling.rst
+++ b/docs/source/features/autoscaling/metric-based-autoscaling.rst
@@ -120,7 +120,7 @@ check its logs in this way.
 
     kubectl logs -n aibrix-system -f
 
-Expected log output. You can see the current metric is gpu_cache_usage_perc. You can check each pod's current metric value.
+Expected log output. You can see the current metric is kv_cache_usage_perc. You can check each pod's current metric value.
 
 .. image:: ../../assets/images/autoscaler/aibrix-controller-manager-output.png
   :alt: AiBrix controller manager output
diff --git a/docs/source/features/autoscaling/optimizer-based-autoscaling.rst b/docs/source/features/autoscaling/optimizer-based-autoscaling.rst
index 7af39a16d..98ff4cdf6 100644
--- a/docs/source/features/autoscaling/optimizer-based-autoscaling.rst
+++ b/docs/source/features/autoscaling/optimizer-based-autoscaling.rst
@@ -98,7 +98,7 @@ Here we show the preliminary experiment results to show how different autoscalin
 Experiments Results
 ^^^^^^^^^^^^^^^^^^^
 
-- gpu_cache_usage_perc: 70
+- kv_cache_usage_perc: 70
 
 .. image:: ../../assets/images/autoscaler/optimizer-based-autoscaling-70-results.png
   :alt: result
diff --git a/docs/source/features/multi-engine.rst b/docs/source/features/multi-engine.rst
index 22508796c..07285ff60 100644
--- a/docs/source/features/multi-engine.rst
+++ b/docs/source/features/multi-engine.rst
@@ -104,8 +104,8 @@ We only support limited number of metrics from different engines and we will con
      - vllm:request_prefill_time_seconds
      - N/A
      - N/A
-   * - gpu_cache_usage_perc
-     - vllm:gpu_cache_usage_perc
+   * - kv_cache_usage_perc
+     - vllm:kv_cache_usage_perc
      - sglang:token_usage [1]_
      - kv_cache_utilization
    * - engine_utilization
diff --git a/docs/source/features/runtime.rst b/docs/source/features/runtime.rst
index 1f6118b35..7598cd9ab 100644
--- a/docs/source/features/runtime.rst
+++ b/docs/source/features/runtime.rst
@@ -170,9 +170,9 @@ Unified metrics help to standardize the metrics for different inference engines
     # HELP vllm:num_requests_waiting Number of requests waiting to be processed.
     # TYPE vllm:num_requests_waiting gauge
     vllm:num_requests_waiting{model_name="Qwen/Qwen2.5-Coder-1.5B-Instruct"} 0.0
-    # HELP vllm:gpu_cache_usage_perc GPU KV-cache usage. 1 means 100 percent usage.
-    # TYPE vllm:gpu_cache_usage_perc gauge
-    vllm:gpu_cache_usage_perc{model_name="Qwen/Qwen2.5-Coder-1.5B-Instruct"} 0.0
+    # HELP vllm:kv_cache_usage_perc GPU KV-cache usage. 1 means 100 percent usage.
+    # TYPE vllm:kv_cache_usage_perc gauge
+    vllm:kv_cache_usage_perc{model_name="Qwen/Qwen2.5-Coder-1.5B-Instruct"} 0.0
     # HELP vllm:cpu_cache_usage_perc CPU KV-cache usage. 1 means 100 percent usage.
     # TYPE vllm:cpu_cache_usage_perc gauge
     vllm:cpu_cache_usage_perc{model_name="Qwen/Qwen2.5-Coder-1.5B-Instruct"} 0.0
diff --git a/observability/grafana/AIBrix_vLLM_Engine_Dashboard.json b/observability/grafana/AIBrix_vLLM_Engine_Dashboard.json
index af6717010..2a698722c 100644
--- a/observability/grafana/AIBrix_vLLM_Engine_Dashboard.json
+++ b/observability/grafana/AIBrix_vLLM_Engine_Dashboard.json
@@ -1769,7 +1769,7 @@
             "uid": "${DS_PROMETHEUS}"
           },
           "editorMode": "code",
-          "expr": "avg(vllm:gpu_cache_usage_perc{model_name=\"$model_name\",job=\"${job}\"})",
+          "expr": "avg(vllm:kv_cache_usage_perc{model_name=\"$model_name\",job=\"${job}\"})",
           "instant": false,
           "legendFormat": "GPU Cache Usage",
           "range": true,
diff --git a/pkg/controller/podautoscaler/algorithm/mock_context_test.go b/pkg/controller/podautoscaler/algorithm/mock_context_test.go
index 97934175b..bc957f6a0 100644
--- a/pkg/controller/podautoscaler/algorithm/mock_context_test.go
+++ b/pkg/controller/podautoscaler/algorithm/mock_context_test.go
@@ -50,7 +50,7 @@ type mockScalingContext struct {
     PanicWindow time.Duration
     ScaleDownDelay time.Duration
 
-    MetricTargets map[string]scalingctx.MetricTarget // key: metric name (e.g., "cpu", "gpu_cache_usage_perc")
+    MetricTargets map[string]scalingctx.MetricTarget // key: metric name (e.g., "cpu", "kv_cache_usage_perc")
 }
 
 // Ensure MockScalingContext implements the ScalingContext interface
diff --git a/pkg/controller/podautoscaler/autoscaler_test.go b/pkg/controller/podautoscaler/autoscaler_test.go
index c4f340647..67b7904bd 100644
--- a/pkg/controller/podautoscaler/autoscaler_test.go
+++ b/pkg/controller/podautoscaler/autoscaler_test.go
@@ -45,7 +45,7 @@ func TestComputeDesiredReplicas(t *testing.T) {
             metricsSources: []autoscalingv1alpha1.MetricSource{
                 {
                     MetricSourceType: autoscalingv1alpha1.POD,
-                    TargetMetric: "gpu_cache_usage_perc",
+                    TargetMetric: "kv_cache_usage_perc",
                     TargetValue: "50",
                 },
             },
@@ -56,7 +56,7 @@ func TestComputeDesiredReplicas(t *testing.T) {
             metricsSources: []autoscalingv1alpha1.MetricSource{
                 {
                     MetricSourceType: autoscalingv1alpha1.POD,
-                    TargetMetric: "gpu_cache_usage_perc",
+                    TargetMetric: "kv_cache_usage_perc",
                     TargetValue: "50",
                 },
                 {
diff --git a/pkg/controller/podautoscaler/context/context.go b/pkg/controller/podautoscaler/context/context.go
index 106343452..0fa7d242a 100644
--- a/pkg/controller/podautoscaler/context/context.go
+++ b/pkg/controller/podautoscaler/context/context.go
@@ -87,7 +87,7 @@ type baseScalingContext struct {
     // Panic mode state
     InPanicMode bool
     // MetricTargets used to store multiple metrics
-    MetricTargets map[string]MetricTarget // key: metric name (e.g., "cpu", "gpu_cache_usage_perc")
+    MetricTargets map[string]MetricTarget // key: metric name (e.g., "cpu", "kv_cache_usage_perc")
 }
 
 type MetricTarget struct {
diff --git a/pkg/controller/podautoscaler/context/context_test.go b/pkg/controller/podautoscaler/context/context_test.go
index b2ce7560b..7c919ed34 100644
--- a/pkg/controller/podautoscaler/context/context_test.go
+++ b/pkg/controller/podautoscaler/context/context_test.go
@@ -37,7 +37,7 @@ func TestUpdateByPaTypes_MetricsSources(t *testing.T) {
         MetricsSources: []autoscalingv1alpha1.MetricSource{
             {
                 MetricSourceType: autoscalingv1alpha1.POD,
-                TargetMetric: "gpu_cache_usage_perc",
+                TargetMetric: "kv_cache_usage_perc",
                 TargetValue: "50",
             },
             {
@@ -55,10 +55,10 @@ func TestUpdateByPaTypes_MetricsSources(t *testing.T) {
     }
 
     expectedMetricTargets := map[string]MetricTarget{
-        "gpu_cache_usage_perc": {
+        "kv_cache_usage_perc": {
             TargetValue: 50,
             TotalValue: 100,
-            ScalingMetric: "gpu_cache_usage_perc",
+            ScalingMetric: "kv_cache_usage_perc",
             MetricType: autoscalingv1alpha1.POD,
         },
         "cpu": {
diff --git a/pkg/controller/podautoscaler/hpa_resources_test.go b/pkg/controller/podautoscaler/hpa_resources_test.go
index 3de9157f0..d21afb507 100644
--- a/pkg/controller/podautoscaler/hpa_resources_test.go
+++ b/pkg/controller/podautoscaler/hpa_resources_test.go
@@ -55,7 +55,7 @@ func TestMakeHPA(t *testing.T) {
             MetricsSources: []autoscalingv1alpha1.MetricSource{
                 {
                     MetricSourceType: autoscalingv1alpha1.POD,
-                    TargetMetric: "gpu_cache_usage_perc",
+                    TargetMetric: "kv_cache_usage_perc",
                     TargetValue: "50",
                 },
                 {
@@ -131,7 +131,7 @@ func TestMakeHPA(t *testing.T) {
                     Type: autoscalingv2.PodsMetricSourceType,
                     Pods: &autoscalingv2.PodsMetricSource{
                         Metric: autoscalingv2.MetricIdentifier{
-                            Name: "gpu_cache_usage_perc",
+                            Name: "kv_cache_usage_perc",
                         },
                         Target: autoscalingv2.MetricTarget{
                             Type: autoscalingv2.AverageValueMetricType,
diff --git a/pkg/controller/podautoscaler/metrics/client_test.go b/pkg/controller/podautoscaler/metrics/client_test.go
index 6b249b9cd..55f84aacc 100644
--- a/pkg/controller/podautoscaler/metrics/client_test.go
+++ b/pkg/controller/podautoscaler/metrics/client_test.go
@@ -31,7 +31,7 @@ func TestUpdateMetrics(t *testing.T) {
     metricKey := types.MetricKey{
         Namespace: "default",
         Name: "test-llm",
-        MetricName: "gpu_cache_usage_perc",
+        MetricName: "kv_cache_usage_perc",
         PaNamespace: "default",
         PaName: "test-llm-apa",
     }
@@ -45,22 +45,22 @@ func TestUpdateMetrics(t *testing.T) {
     assert.NoError(t, err)
     assert.Len(t, client.stableWindows, 1)
 
-    tw := client.stableWindows["default/test-llm-apa/gpu_cache_usage_perc"]
+    tw := client.stableWindows["default/test-llm-apa/kv_cache_usage_perc"]
     assert.NotNil(t, tw)
     assert.Len(t, tw.Values(), 1)
     assert.Equal(t, expectedValue, tw.Values()[0])
 
     assert.Len(t, client.panicWindows, 1)
-    tw = client.panicWindows["default/test-llm-apa/gpu_cache_usage_perc"]
+    tw = client.panicWindows["default/test-llm-apa/kv_cache_usage_perc"]
     assert.NotNil(t, tw)
     assert.Len(t, tw.Values(), 1)
     assert.Equal(t, expectedValue, tw.Values()[0])
 
     assert.Len(t, client.stableHistory, 1)
-    assert.NotNil(t, client.stableHistory["default/test-llm-apa/gpu_cache_usage_perc"])
+    assert.NotNil(t, client.stableHistory["default/test-llm-apa/kv_cache_usage_perc"])
 
     assert.Len(t, client.panicHistory, 1)
-    assert.NotNil(t, client.panicHistory["default/test-llm-apa/gpu_cache_usage_perc"])
+    assert.NotNil(t, client.panicHistory["default/test-llm-apa/kv_cache_usage_perc"])
 }
 
 func TestGetMetricValue(t *testing.T) {
@@ -69,7 +69,7 @@ func TestGetMetricValue(t *testing.T) {
     metricKey := types.MetricKey{
         Namespace: "default",
         Name: "test-llm",
-        MetricName: "gpu_cache_usage_perc",
+        MetricName: "kv_cache_usage_perc",
         PaNamespace: "default",
         PaName: "test-llm-apa",
     }
diff --git a/pkg/controller/podautoscaler/metrics/fetcher_test.go b/pkg/controller/podautoscaler/metrics/fetcher_test.go
index e4ba2e61a..1091f3e34 100644
--- a/pkg/controller/podautoscaler/metrics/fetcher_test.go
+++ b/pkg/controller/podautoscaler/metrics/fetcher_test.go
@@ -48,7 +48,7 @@ func TestRestMetricsFetcher_FetchPodMetrics(t *testing.T) {
 
     server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
         _, err := expfmt.MetricFamilyToText(w, &dto.MetricFamily{
-            Name: ptr.To("vllm:gpu_cache_usage_perc"),
+            Name: ptr.To("vllm:kv_cache_usage_perc"),
             Type: dto.MetricType_GAUGE.Enum(),
             Metric: []*dto.Metric{
                 {
@@ -84,7 +84,7 @@ func TestRestMetricsFetcher_FetchPodMetrics(t *testing.T) {
     }
     source := autoscalingv1alpha1.MetricSource{
         MetricSourceType: autoscalingv1alpha1.POD,
-        TargetMetric: "gpu_cache_usage_perc",
+        TargetMetric: "kv_cache_usage_perc",
         Port: port,
     }
 
diff --git a/pkg/metrics/engine_fetcher_test.go b/pkg/metrics/engine_fetcher_test.go
index 14f7ed253..1560cc217 100644
--- a/pkg/metrics/engine_fetcher_test.go
+++ b/pkg/metrics/engine_fetcher_test.go
@@ -36,9 +36,9 @@ vllm_num_requests_running{model_name="meta-llama/Llama-2-7b-chat-hf"} 2.0
 # HELP vllm_num_requests_waiting Number of requests waiting to be processed.
 # TYPE vllm_num_requests_waiting gauge
 vllm_num_requests_waiting{model_name="meta-llama/Llama-2-7b-chat-hf"} 3.0
-# HELP vllm_gpu_cache_usage_perc GPU KV-cache usage. 1.0 means 100 percent usage.
-# TYPE vllm_gpu_cache_usage_perc gauge
-vllm_gpu_cache_usage_perc 0.75
+# HELP vllm_kv_cache_usage_perc GPU KV-cache usage. 1.0 means 100 percent usage.
+# TYPE vllm_kv_cache_usage_perc gauge
+vllm_kv_cache_usage_perc 0.75
 # HELP vllm_time_to_first_token_seconds Histogram of time to first token in seconds.
 # TYPE vllm_time_to_first_token_seconds histogram
 vllm_time_to_first_token_seconds_bucket{model_name="meta-llama/Llama-2-7b-chat-hf",le="0.001"} 0.0
diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go
index f672ebf3a..21896d691 100644
--- a/pkg/metrics/metrics.go
+++ b/pkg/metrics/metrics.go
@@ -39,7 +39,7 @@ const (
     AvgTPOT5mPod = "avg_tpot_pod_5m"
     AvgPromptToksPerReq = "avg_prompt_toks_per_req"
     AvgGenerationToksPerReq = "avg_generation_toks_per_req"
-    GPUCacheUsagePerc = "gpu_cache_usage_perc"
+    GPUCacheUsagePerc = "kv_cache_usage_perc"
     GPUBusyTimeRatio = "gpu_busy_time_ratio"
     CPUCacheUsagePerc = "cpu_cache_usage_perc"
     EngineUtilization = "engine_utilization"
@@ -304,7 +304,7 @@ var (
             Raw: Counter,
         },
         EngineMetricsNameMapping: map[string]string{
-            "vllm": "vllm:gpu_cache_usage_perc",
+            "vllm": "vllm:kv_cache_usage_perc",
             "sglang": "sglang:token_usage", // Based on https://github.com/sgl-project/sglang/issues/5979
             "xllm": "kv_cache_utilization",
         },
diff --git a/pkg/metrics/utils_test.go b/pkg/metrics/utils_test.go
index cb2e0966b..9795be224 100644
--- a/pkg/metrics/utils_test.go
+++ b/pkg/metrics/utils_test.go
@@ -33,9 +33,9 @@ func TestParseHistogramWithLabels(t *testing.T) {
 # HELP vllm:num_requests_waiting Number of requests waiting to be processed.
 # TYPE vllm:num_requests_waiting gauge
 vllm:num_requests_waiting{model_name="Qwen/Qwen2.5-1.5B-Instruct"} 0.0
-# HELP vllm:gpu_cache_usage_perc GPU KV-cache usage. 1 means 100 percent usage.
-# TYPE vllm:gpu_cache_usage_perc gauge
-vllm:gpu_cache_usage_perc{model_name="Qwen/Qwen2.5-1.5B-Instruct"} 0.0
+# HELP vllm:kv_cache_usage_perc GPU KV-cache usage. 1 means 100 percent usage.
+# TYPE vllm:kv_cache_usage_perc gauge
+vllm:kv_cache_usage_perc{model_name="Qwen/Qwen2.5-1.5B-Instruct"} 0.0
 # HELP vllm:time_per_output_token_seconds histogram
 vllm:time_per_output_token_seconds_sum{model_name="Qwen/Qwen2.5-1.5B-Instruct"} 0.23455095291137695
 vllm:time_per_output_token_seconds_count{model_name="Qwen/Qwen2.5-1.5B-Instruct"} 29.0
diff --git a/python/aibrix/aibrix/metrics/engine_rules.py b/python/aibrix/aibrix/metrics/engine_rules.py
index acb0610cc..0dc30d0b2 100644
--- a/python/aibrix/aibrix/metrics/engine_rules.py
+++ b/python/aibrix/aibrix/metrics/engine_rules.py
@@ -26,8 +26,8 @@
     "vllm:num_requests_waiting": RenameStandardRule(
         "vllm:num_requests_waiting", "aibrix:queue_size"
     ),
-    "vllm:gpu_cache_usage_perc": RenameStandardRule(
-        "vllm:gpu_cache_usage_perc", "aibrix:gpu_cache_usage_perc"
+    "vllm:kv_cache_usage_perc": RenameStandardRule(
+        "vllm:kv_cache_usage_perc", "aibrix:kv_cache_usage_perc"
     ),
     # Token processing metrics
     "vllm:prompt_tokens_total": RenameStandardRule(
diff --git a/python/aibrix/tests/metrics/test_metrics_multi_engine.py b/python/aibrix/tests/metrics/test_metrics_multi_engine.py
index a7b4d197d..002628d5d 100644
--- a/python/aibrix/tests/metrics/test_metrics_multi_engine.py
+++ b/python/aibrix/tests/metrics/test_metrics_multi_engine.py
@@ -33,9 +33,9 @@ def test_get_metric_rules_vllm(self):
         assert isinstance(rules["vllm:num_requests_waiting"], RenameStandardRule)
         assert rules["vllm:num_requests_waiting"].new_name == "aibrix:queue_size"
 
-        assert "vllm:gpu_cache_usage_perc" in rules
+        assert "vllm:kv_cache_usage_perc" in rules
         assert (
-            rules["vllm:gpu_cache_usage_perc"].new_name == "aibrix:gpu_cache_usage_perc"
+            rules["vllm:kv_cache_usage_perc"].new_name == "aibrix:kv_cache_usage_perc"
         )
 
         # Check token processing metrics exist
@@ -307,7 +307,7 @@ def test_vllm_hybrid_coverage(self):
         # Essential metrics should be renamed
         essential_mapped = [
             "vllm:num_requests_waiting",
-            "vllm:gpu_cache_usage_perc",
+            "vllm:kv_cache_usage_perc",
             "vllm:prompt_tokens_total",
             "vllm:generation_tokens_total",
             "vllm:time_to_first_token_seconds",
diff --git a/python/aibrix_kvcache/integration/vllm/patches/vllm_v0.8.5-aibrix-kvcache.patch b/python/aibrix_kvcache/integration/vllm/patches/vllm_v0.8.5-aibrix-kvcache.patch
index 747c7955c..68b93447e 100644
--- a/python/aibrix_kvcache/integration/vllm/patches/vllm_v0.8.5-aibrix-kvcache.patch
+++ b/python/aibrix_kvcache/integration/vllm/patches/vllm_v0.8.5-aibrix-kvcache.patch
@@ -2087,7 +2087,7 @@ index 000000000..311169501
 +            "uid": "${DS_PROMETHEUS}"
 +          },
 +          "editorMode": "code",
-+          "expr": "avg(vllm:gpu_cache_usage_perc{model_name=\"$model_name\", job=\"pods\"})",
++          "expr": "avg(vllm:kv_cache_usage_perc{model_name=\"$model_name\", job=\"pods\"})",
 +          "instant": false,
 +          "legendFormat": "GPU Cache Usage",
 +          "range": true,
diff --git a/python/aibrix_kvcache/integration/vllm/patches/vllm_v0.9.1-aibrix-kvcache.patch b/python/aibrix_kvcache/integration/vllm/patches/vllm_v0.9.1-aibrix-kvcache.patch
index c1eee2899..c450f5187 100644
--- a/python/aibrix_kvcache/integration/vllm/patches/vllm_v0.9.1-aibrix-kvcache.patch
+++ b/python/aibrix_kvcache/integration/vllm/patches/vllm_v0.9.1-aibrix-kvcache.patch
@@ -1648,7 +1648,7 @@ index 000000000..311169501
 +            "uid": "${DS_PROMETHEUS}"
 +          },
 +          "editorMode": "code",
-+          "expr": "avg(vllm:gpu_cache_usage_perc{model_name=\"$model_name\", job=\"pods\"})",
++          "expr": "avg(vllm:kv_cache_usage_perc{model_name=\"$model_name\", job=\"pods\"})",
 +          "instant": false,
 +          "legendFormat": "GPU Cache Usage",
 +          "range": true,
diff --git a/samples/autoscaling/apa.yaml b/samples/autoscaling/apa.yaml
index 56b1e1bc3..ef5a81c59 100644
--- a/samples/autoscaling/apa.yaml
+++ b/samples/autoscaling/apa.yaml
@@ -19,7 +19,7 @@ spec:
       protocolType: http
       port: '8000'
       path: metrics
-      targetMetric: gpu_cache_usage_perc
+      targetMetric: kv_cache_usage_perc
       targetValue: '0.5'
   scaleTargetRef:
     apiVersion: apps/v1
diff --git a/samples/autoscaling/hpa.yaml b/samples/autoscaling/hpa.yaml
index abcde92fe..32b8d37b3 100644
--- a/samples/autoscaling/hpa.yaml
+++ b/samples/autoscaling/hpa.yaml
@@ -15,7 +15,7 @@ spec:
       protocolType: http
       port: '8000'
      path: /metrics
-      targetMetric: gpu_cache_usage_perc
+      targetMetric: kv_cache_usage_perc
       targetValue: '50'
   scaleTargetRef:
     apiVersion: apps/v1
diff --git a/samples/autoscaling/kpa.yaml b/samples/autoscaling/kpa.yaml
index 013043919..19499978a 100644
--- a/samples/autoscaling/kpa.yaml
+++ b/samples/autoscaling/kpa.yaml
@@ -17,7 +17,7 @@ spec:
       protocolType: http
       port: '8000'
       path: metrics
-      targetMetric: gpu_cache_usage_perc
+      targetMetric: kv_cache_usage_perc
       targetValue: '0.5'
   scaleTargetRef:
     apiVersion: apps/v1
diff --git a/samples/autoscaling/multimetrics-apa.yaml b/samples/autoscaling/multimetrics-apa.yaml
index f634c0730..6f731b057 100644
--- a/samples/autoscaling/multimetrics-apa.yaml
+++ b/samples/autoscaling/multimetrics-apa.yaml
@@ -19,7 +19,7 @@ spec:
       protocolType: http
       port: '8000'
       path: metrics
-      targetMetric: gpu_cache_usage_perc
+      targetMetric: kv_cache_usage_perc
       targetValue: '0.5'
     - metricSourceType: pod
       protocolType: http
diff --git a/samples/deepseek-r1/deepseek-r1-autoscaling.yaml b/samples/deepseek-r1/deepseek-r1-autoscaling.yaml
index cceda0edb..569095337 100644
--- a/samples/deepseek-r1/deepseek-r1-autoscaling.yaml
+++ b/samples/deepseek-r1/deepseek-r1-autoscaling.yaml
@@ -16,7 +16,7 @@ spec:
       protocolType: http
       port: '8000'
       path: metrics
-      targetMetric: gpu_cache_usage_perc
+      targetMetric: kv_cache_usage_perc
       targetValue: '50'
   scaleTargetRef:
     apiVersion: orchestration.aibrix.ai/v1alpha1
diff --git a/samples/deepseek-r1/static/AIBrix Engine Dashboard (vLLM)-1741078999667.json b/samples/deepseek-r1/static/AIBrix Engine Dashboard (vLLM)-1741078999667.json
index c5a4e5118..5ed6ef579 100644
--- a/samples/deepseek-r1/static/AIBrix Engine Dashboard (vLLM)-1741078999667.json
+++ b/samples/deepseek-r1/static/AIBrix Engine Dashboard (vLLM)-1741078999667.json
@@ -1607,7 +1607,7 @@
             "uid": "${DS_PROMETHEUS}"
           },
           "editorMode": "code",
-          "expr": "avg(vllm:gpu_cache_usage_perc{model_name=\"$model_name\"})",
+          "expr": "avg(vllm:kv_cache_usage_perc{model_name=\"$model_name\"})",
           "instant": false,
           "legendFormat": "GPU Cache Usage",
           "range": true,
diff --git a/samples/volcano-engine/autoscaler.yaml b/samples/volcano-engine/autoscaler.yaml
index 89e9cd322..de338deea 100644
--- a/samples/volcano-engine/autoscaler.yaml
+++ b/samples/volcano-engine/autoscaler.yaml
@@ -17,7 +17,7 @@ spec:
       protocolType: http
       port: '8000'
       path: metrics
-      targetMetric: gpu_cache_usage_perc
+      targetMetric: kv_cache_usage_perc
       targetValue: '0.3'
   scaleTargetRef:
     apiVersion: apps/v1
diff --git a/samples/volcano-engine/hpa-r1.yaml b/samples/volcano-engine/hpa-r1.yaml
index 815c4fcf6..f4f48f8c6 100644
--- a/samples/volcano-engine/hpa-r1.yaml
+++ b/samples/volcano-engine/hpa-r1.yaml
@@ -14,7 +14,7 @@ spec:
       protocolType: http
       port: '8000'
       path: metrics
-      targetMetric: gpu_cache_usage_perc
+      targetMetric: kv_cache_usage_perc
       targetValue: '50'
   scaleTargetRef:
     apiVersion: orchestration.aibrix.ai/v1alpha1
diff --git a/test/integration/controller/podautoscaler_test.go b/test/integration/controller/podautoscaler_test.go
index e6adb8238..45fa0b520 100644
--- a/test/integration/controller/podautoscaler_test.go
+++ b/test/integration/controller/podautoscaler_test.go
@@ -914,7 +914,7 @@ var _ = ginkgo.Describe("PodAutoscaler controller test", func() {
                     ScaleTargetRefWithKind("StormService", "orchestration.aibrix.ai/v1alpha1", "test-stormservice-role").
                     SubTargetSelector("prefill"). // Only scale "prefill" role
                     MetricSource(wrapper.MakeMetricSourcePod(
-                        autoscalingv1alpha1.HTTP, "8080", "/metrics", "gpu_cache_usage_perc", "0.7")).
+                        autoscalingv1alpha1.HTTP, "8080", "/metrics", "kv_cache_usage_perc", "0.7")).
                     Obj()
             },
             updates: []*update{
@@ -956,7 +956,7 @@ var _ = ginkgo.Describe("PodAutoscaler controller test", func() {
                     ScaleTargetRefWithKind("StormService", "orchestration.aibrix.ai/v1alpha1", "test-stormservice-conflict").
                     SubTargetSelector("prefill"). // Same role as PA1
                     MetricSource(wrapper.MakeMetricSourcePod(
-                        autoscalingv1alpha1.HTTP, "8080", "/metrics", "gpu_cache_usage_perc", "0.7")).
+                        autoscalingv1alpha1.HTTP, "8080", "/metrics", "kv_cache_usage_perc", "0.7")).
                     Obj()
             },
             updates: []*update{
@@ -1011,7 +1011,7 @@ var _ = ginkgo.Describe("PodAutoscaler controller test", func() {
                 ScaleTargetRefWithKind("StormService", "orchestration.aibrix.ai/v1alpha1", "test-stormservice-conflict").
                 SubTargetSelector("prefill"). // Same role
                 MetricSource(wrapper.MakeMetricSourcePod(
-                    autoscalingv1alpha1.HTTP, "8080", "/metrics", "gpu_cache_usage_perc", "0.7")).
+                    autoscalingv1alpha1.HTTP, "8080", "/metrics", "kv_cache_usage_perc", "0.7")).
                 Obj()
             gomega.Expect(k8sClient.Create(ctx, pa1)).To(gomega.Succeed())
             time.Sleep(time.Second * 2)
@@ -1053,7 +1053,7 @@ var _ = ginkgo.Describe("PodAutoscaler controller test", func() {
                     Annotations(annotations).
                     ScaleTargetRefWithKind("Deployment", "apps/v1", "cooldown-deployment").
                     MetricSource(wrapper.MakeMetricSourcePod(
-                        autoscalingv1alpha1.HTTP, "8080", "/metrics", "gpu_cache_usage_perc", "0.5")).
+                        autoscalingv1alpha1.HTTP, "8080", "/metrics", "kv_cache_usage_perc", "0.5")).
                     Obj()
             },
             updates: []*update{
@@ -1089,7 +1089,7 @@ var _ = ginkgo.Describe("PodAutoscaler controller test", func() {
                     Annotations(annotations).
                     ScaleTargetRefWithKind("Deployment", "apps/v1", "delay-deployment").
                     MetricSource(wrapper.MakeMetricSourcePod(
-                        autoscalingv1alpha1.HTTP, "8080", "/metrics", "gpu_cache_usage_perc", "0.5")).
+                        autoscalingv1alpha1.HTTP, "8080", "/metrics", "kv_cache_usage_perc", "0.5")).
                     Obj()
             },
             updates: []*update{
@@ -1129,7 +1129,7 @@ var _ = ginkgo.Describe("PodAutoscaler controller test", func() {
                     Annotations(annotations).
                     ScaleTargetRefWithKind("Deployment", "apps/v1", "annotations-deployment").
                     MetricSource(wrapper.MakeMetricSourcePod(
-                        autoscalingv1alpha1.HTTP, "8080", "/metrics", "gpu_cache_usage_perc", "0.5")).
+                        autoscalingv1alpha1.HTTP, "8080", "/metrics", "kv_cache_usage_perc", "0.5")).
                     Obj()
             },
             updates: []*update{
diff --git a/test/integration/webhook/podautoscaler_webhook_test.go b/test/integration/webhook/podautoscaler_webhook_test.go
index 2be54fc6d..fe25ba24b 100644
--- a/test/integration/webhook/podautoscaler_webhook_test.go
+++ b/test/integration/webhook/podautoscaler_webhook_test.go
@@ -85,7 +85,7 @@ var _ = ginkgo.Describe("podautoscaler default and validation", func() {
                     MaxReplicas(10).
                     ScaleTargetRefWithKind("Deployment", "apps/v1", "test").
                     MetricSource(wrapper.MakeMetricSourcePod(autoscalingapi.HTTP,
-                        "8080", "/metrics", "gpu_cache_usage_perc", "0.5")).
+                        "8080", "/metrics", "kv_cache_usage_perc", "0.5")).
                     Obj()
             },
             failed: false,
@@ -99,7 +99,7 @@ var _ = ginkgo.Describe("podautoscaler default and validation", func() {
                     MaxReplicas(3).
                     ScaleTargetRefWithKind("Deployment", "apps/v1", "test-ss").
                     MetricSource(wrapper.MakeMetricSourceExternal(autoscalingapi.HTTP,
-                        "monitoring.example.com", "/metrics", "gpu_cache_usage_perc", "0.5")).
+                        "monitoring.example.com", "/metrics", "kv_cache_usage_perc", "0.5")).
                     Obj()
             },
             failed: false,