diff --git a/benchmarks/scenarios/autoscaling/deepseek-llm-7b-chat/apa.yaml b/benchmarks/scenarios/autoscaling/deepseek-llm-7b-chat/apa.yaml
index a81d13815..f9be15c00 100644
--- a/benchmarks/scenarios/autoscaling/deepseek-llm-7b-chat/apa.yaml
+++ b/benchmarks/scenarios/autoscaling/deepseek-llm-7b-chat/apa.yaml
@@ -19,7 +19,7 @@ spec:
       protocolType: http
       port: '8000'
       path: metrics
-      targetMetric: gpu_cache_usage_perc
+      targetMetric: kv_cache_usage_perc
       targetValue: '0.5'
   scaleTargetRef:
     apiVersion: apps/v1
diff --git a/benchmarks/scenarios/autoscaling/deepseek-llm-7b-chat/hpa.yaml b/benchmarks/scenarios/autoscaling/deepseek-llm-7b-chat/hpa.yaml
index 55adfc001..80d69cfe5 100644
--- a/benchmarks/scenarios/autoscaling/deepseek-llm-7b-chat/hpa.yaml
+++ b/benchmarks/scenarios/autoscaling/deepseek-llm-7b-chat/hpa.yaml
@@ -15,7 +15,7 @@ spec:
       protocolType: http
       port: '8000'
       path: /metrics
-      targetMetric: gpu_cache_usage_perc
+      targetMetric: kv_cache_usage_perc
       targetValue: '50'
   scaleTargetRef:
     apiVersion: apps/v1
diff --git a/benchmarks/scenarios/autoscaling/deepseek-llm-7b-chat/kpa.yaml b/benchmarks/scenarios/autoscaling/deepseek-llm-7b-chat/kpa.yaml
index c49d4546b..81d5b2d25 100644
--- a/benchmarks/scenarios/autoscaling/deepseek-llm-7b-chat/kpa.yaml
+++ b/benchmarks/scenarios/autoscaling/deepseek-llm-7b-chat/kpa.yaml
@@ -17,7 +17,7 @@ spec:
       protocolType: http
       port: '8000'
       path: metrics
-      targetMetric: gpu_cache_usage_perc
+      targetMetric: kv_cache_usage_perc
       targetValue: '0.5'
   scaleTargetRef:
     apiVersion: apps/v1
diff --git a/development/app/README.md b/development/app/README.md
index 7fe310d91..608e410aa 100644
--- a/development/app/README.md
+++ b/development/app/README.md
@@ -201,7 +201,7 @@ The following keys can be included in the JSON payload to override metrics:
 - `swapped` → `vllm:num_requests_swapped`
 - `avg_prompt_throughput` → `vllm:avg_prompt_throughput_toks_per_s`
 - `avg_generation_throughput` → `vllm:avg_generation_throughput_toks_per_s`
-- `gpu_cache_usage_perc` → `vllm:gpu_cache_usage_perc`
+- `kv_cache_usage_perc` → `vllm:kv_cache_usage_perc`
 - `cpu_cache_usage_perc` → `vllm:cpu_cache_usage_perc`
 - `model_name` – sets the `model_name` label on all metrics
 
@@ -217,7 +217,7 @@ curl -X GET http://localhost:8000/metrics
 curl -X POST http://localhost:8000/set_metrics \
   -H "Content-Type: application/json" \
   -d '{
-    "gpu_cache_usage_perc": 75.0,
+    "kv_cache_usage_perc": 75.0,
     "running": 50,
     "waiting": 10,
     "success_total": 200
diff --git a/development/app/app.py b/development/app/app.py
index fe64cd1e8..b778e6230 100644
--- a/development/app/app.py
+++ b/development/app/app.py
@@ -886,8 +886,8 @@ def metrics():
     waiting = overrides.get("waiting", randint(1, 100))
     swapped = overrides.get("swapped", randint(1, 100))
     max_running_capacity = 100
-    gpu_cache_usage_perc = overrides.get(
-        "gpu_cache_usage_perc", min(100.0, (running / max_running_capacity) * 100)
+    kv_cache_usage_perc = overrides.get(
+        "kv_cache_usage_perc", min(100.0, (running / max_running_capacity) * 100)
     )
     cpu_cache_usage_perc = overrides.get(
         "cpu_cache_usage_perc", min(100.0, (cpu_running / max_running_capacity) * 100)
@@ -946,10 +946,10 @@ def metrics():
             ),
         },
         {
-            "name": "gpu_cache_usage_perc",
+            "name": "kv_cache_usage_perc",
             "type": "gauge",
             "description": "GPU KV-cache usage. 1 means 100 percent usage.",
-            "value": overrides.get("gpu_cache_usage_perc", gpu_cache_usage_perc),
+            "value": overrides.get("kv_cache_usage_perc", kv_cache_usage_perc),
         },
         {
             "name": "cpu_cache_usage_perc",
diff --git a/development/tutorials/distributed/fleet-autoscaling.yaml b/development/tutorials/distributed/fleet-autoscaling.yaml
index 605bb08a8..a3a940d00 100644
--- a/development/tutorials/distributed/fleet-autoscaling.yaml
+++ b/development/tutorials/distributed/fleet-autoscaling.yaml
@@ -16,7 +16,7 @@ spec:
       protocolType: http
       port: '8000'
       path: metrics
-      targetMetric: gpu_cache_usage_perc
+      targetMetric: kv_cache_usage_perc
       targetValue: '70'
   scaleTargetRef:
     apiVersion: orchestration.aibrix.ai/v1alpha1
diff --git a/development/tutorials/podautoscaler/hpa.yaml b/development/tutorials/podautoscaler/hpa.yaml
index 502ba7fc3..acd067fd2 100644
--- a/development/tutorials/podautoscaler/hpa.yaml
+++ b/development/tutorials/podautoscaler/hpa.yaml
@@ -11,7 +11,7 @@ spec:
   metrics:
     - pods:
         metric:
-          name: gpu_cache_usage_perc
+          name: kv_cache_usage_perc
         target:
           averageValue: "40"
           type: AverageValue
diff --git a/development/tutorials/podautoscaler/pa.yaml b/development/tutorials/podautoscaler/pa.yaml
index ce1a15985..39c66ef9f 100644
--- a/development/tutorials/podautoscaler/pa.yaml
+++ b/development/tutorials/podautoscaler/pa.yaml
@@ -18,6 +18,6 @@ spec:
       protocolType: "http"
       port: "8000"
       path: "/metrics"
-      targetMetric: "gpu_cache_usage_perc"
+      targetMetric: "kv_cache_usage_perc"
       targetValue: "40"
   scalingStrategy: "HPA"
diff --git a/docs/source/features/autoscaling/metric-based-autoscaling.rst b/docs/source/features/autoscaling/metric-based-autoscaling.rst
index 9fccf91b7..8e46c7c1d 100644
--- a/docs/source/features/autoscaling/metric-based-autoscaling.rst
+++ b/docs/source/features/autoscaling/metric-based-autoscaling.rst
@@ -120,7 +120,7 @@ check its logs in this way.
 
     kubectl logs -n aibrix-system -f
 
-Expected log output. You can see the current metric is gpu_cache_usage_perc. You can check each pod's current metric value.
+Expected log output. You can see the current metric is kv_cache_usage_perc. You can check each pod's current metric value.
 
 .. image:: ../../assets/images/autoscaler/aibrix-controller-manager-output.png
   :alt: AiBrix controller manager output
diff --git a/docs/source/features/autoscaling/optimizer-based-autoscaling.rst b/docs/source/features/autoscaling/optimizer-based-autoscaling.rst
index 7af39a16d..98ff4cdf6 100644
--- a/docs/source/features/autoscaling/optimizer-based-autoscaling.rst
+++ b/docs/source/features/autoscaling/optimizer-based-autoscaling.rst
@@ -98,7 +98,7 @@ Here we show the preliminary experiment results to show how different autoscalin
 Experiments Results
 ^^^^^^^^^^^^^^^^^^^
 
-- gpu_cache_usage_perc: 70
+- kv_cache_usage_perc: 70
 
 .. image:: ../../assets/images/autoscaler/optimizer-based-autoscaling-70-results.png
   :alt: result
diff --git a/docs/source/features/multi-engine.rst b/docs/source/features/multi-engine.rst
index 22508796c..07285ff60 100644
--- a/docs/source/features/multi-engine.rst
+++ b/docs/source/features/multi-engine.rst
@@ -104,8 +104,8 @@ We only support limited number of metrics from different engines and we will con
      - vllm:request_prefill_time_seconds
      - N/A
      - N/A
-   * - gpu_cache_usage_perc
-     - vllm:gpu_cache_usage_perc
+   * - kv_cache_usage_perc
+     - vllm:kv_cache_usage_perc
      - sglang:token_usage [1]_
      - kv_cache_utilization
    * - engine_utilization
diff --git a/docs/source/features/runtime.rst b/docs/source/features/runtime.rst
index 1f6118b35..7598cd9ab 100644
--- a/docs/source/features/runtime.rst
+++ b/docs/source/features/runtime.rst
@@ -170,9 +170,9 @@ Unified metrics help to standardize the metrics for different inference engines
     # HELP vllm:num_requests_waiting Number of requests waiting to be processed.
     # TYPE vllm:num_requests_waiting gauge
     vllm:num_requests_waiting{model_name="Qwen/Qwen2.5-Coder-1.5B-Instruct"} 0.0
-    # HELP vllm:gpu_cache_usage_perc GPU KV-cache usage. 1 means 100 percent usage.
-    # TYPE vllm:gpu_cache_usage_perc gauge
-    vllm:gpu_cache_usage_perc{model_name="Qwen/Qwen2.5-Coder-1.5B-Instruct"} 0.0
+    # HELP vllm:kv_cache_usage_perc GPU KV-cache usage. 1 means 100 percent usage.
+    # TYPE vllm:kv_cache_usage_perc gauge
+    vllm:kv_cache_usage_perc{model_name="Qwen/Qwen2.5-Coder-1.5B-Instruct"} 0.0
     # HELP vllm:cpu_cache_usage_perc CPU KV-cache usage. 1 means 100 percent usage.
     # TYPE vllm:cpu_cache_usage_perc gauge
     vllm:cpu_cache_usage_perc{model_name="Qwen/Qwen2.5-Coder-1.5B-Instruct"} 0.0
diff --git a/observability/grafana/AIBrix_vLLM_Engine_Dashboard.json b/observability/grafana/AIBrix_vLLM_Engine_Dashboard.json
index af6717010..2a698722c 100644
--- a/observability/grafana/AIBrix_vLLM_Engine_Dashboard.json
+++ b/observability/grafana/AIBrix_vLLM_Engine_Dashboard.json
@@ -1769,7 +1769,7 @@
             "uid": "${DS_PROMETHEUS}"
           },
           "editorMode": "code",
-          "expr": "avg(vllm:gpu_cache_usage_perc{model_name=\"$model_name\",job=\"${job}\"})",
+          "expr": "avg(vllm:kv_cache_usage_perc{model_name=\"$model_name\",job=\"${job}\"})",
           "instant": false,
           "legendFormat": "GPU Cache Usage",
           "range": true,
diff --git a/pkg/controller/podautoscaler/algorithm/mock_context_test.go b/pkg/controller/podautoscaler/algorithm/mock_context_test.go
index 97934175b..bc957f6a0 100644
--- a/pkg/controller/podautoscaler/algorithm/mock_context_test.go
+++ b/pkg/controller/podautoscaler/algorithm/mock_context_test.go
@@ -50,7 +50,7 @@ type mockScalingContext struct {
     PanicWindow time.Duration
     ScaleDownDelay time.Duration
 
-    MetricTargets map[string]scalingctx.MetricTarget // key: metric name (e.g., "cpu", "gpu_cache_usage_perc")
+    MetricTargets map[string]scalingctx.MetricTarget // key: metric name (e.g., "cpu", "kv_cache_usage_perc")
 }
 
 // Ensure MockScalingContext implements the ScalingContext interface
diff --git a/pkg/controller/podautoscaler/autoscaler_test.go b/pkg/controller/podautoscaler/autoscaler_test.go
index c4f340647..67b7904bd 100644
--- a/pkg/controller/podautoscaler/autoscaler_test.go
+++ b/pkg/controller/podautoscaler/autoscaler_test.go
@@ -45,7 +45,7 @@ func TestComputeDesiredReplicas(t *testing.T) {
             metricsSources: []autoscalingv1alpha1.MetricSource{
                 {
                     MetricSourceType: autoscalingv1alpha1.POD,
-                    TargetMetric: "gpu_cache_usage_perc",
+                    TargetMetric: "kv_cache_usage_perc",
                     TargetValue: "50",
                 },
             },
@@ -56,7 +56,7 @@ func TestComputeDesiredReplicas(t *testing.T) {
             metricsSources: []autoscalingv1alpha1.MetricSource{
                 {
                     MetricSourceType: autoscalingv1alpha1.POD,
-                    TargetMetric: "gpu_cache_usage_perc",
+                    TargetMetric: "kv_cache_usage_perc",
                     TargetValue: "50",
                 },
                 {
diff --git a/pkg/controller/podautoscaler/context/context.go b/pkg/controller/podautoscaler/context/context.go
index 106343452..0fa7d242a 100644
--- a/pkg/controller/podautoscaler/context/context.go
+++ b/pkg/controller/podautoscaler/context/context.go
@@ -87,7 +87,7 @@ type baseScalingContext struct {
     // Panic mode state
     InPanicMode bool
     // MetricTargets used to store multiple metrics
-    MetricTargets map[string]MetricTarget // key: metric name (e.g., "cpu", "gpu_cache_usage_perc")
+    MetricTargets map[string]MetricTarget // key: metric name (e.g., "cpu", "kv_cache_usage_perc")
 }
 
 type MetricTarget struct {
diff --git a/pkg/controller/podautoscaler/context/context_test.go b/pkg/controller/podautoscaler/context/context_test.go
index b2ce7560b..7c919ed34 100644
--- a/pkg/controller/podautoscaler/context/context_test.go
+++ b/pkg/controller/podautoscaler/context/context_test.go
@@ -37,7 +37,7 @@ func TestUpdateByPaTypes_MetricsSources(t *testing.T) {
         MetricsSources: []autoscalingv1alpha1.MetricSource{
             {
                 MetricSourceType: autoscalingv1alpha1.POD,
-                TargetMetric: "gpu_cache_usage_perc",
+                TargetMetric: "kv_cache_usage_perc",
                 TargetValue: "50",
             },
             {
@@ -55,10 +55,10 @@ func TestUpdateByPaTypes_MetricsSources(t *testing.T) {
     }
 
     expectedMetricTargets := map[string]MetricTarget{
-        "gpu_cache_usage_perc": {
+        "kv_cache_usage_perc": {
             TargetValue: 50,
             TotalValue: 100,
-            ScalingMetric: "gpu_cache_usage_perc",
+            ScalingMetric: "kv_cache_usage_perc",
             MetricType: autoscalingv1alpha1.POD,
         },
         "cpu": {
diff --git a/pkg/controller/podautoscaler/hpa_resources_test.go b/pkg/controller/podautoscaler/hpa_resources_test.go
index 3de9157f0..d21afb507 100644
--- a/pkg/controller/podautoscaler/hpa_resources_test.go
+++ b/pkg/controller/podautoscaler/hpa_resources_test.go
@@ -55,7 +55,7 @@ func TestMakeHPA(t *testing.T) {
             MetricsSources: []autoscalingv1alpha1.MetricSource{
                 {
                     MetricSourceType: autoscalingv1alpha1.POD,
-                    TargetMetric: "gpu_cache_usage_perc",
+                    TargetMetric: "kv_cache_usage_perc",
                     TargetValue: "50",
                 },
                 {
@@ -131,7 +131,7 @@ func TestMakeHPA(t *testing.T) {
                     Type: autoscalingv2.PodsMetricSourceType,
                     Pods: &autoscalingv2.PodsMetricSource{
                         Metric: autoscalingv2.MetricIdentifier{
-                            Name: "gpu_cache_usage_perc",
+                            Name: "kv_cache_usage_perc",
                         },
                         Target: autoscalingv2.MetricTarget{
                             Type: autoscalingv2.AverageValueMetricType,
diff --git a/pkg/controller/podautoscaler/metrics/client_test.go b/pkg/controller/podautoscaler/metrics/client_test.go
index 6b249b9cd..55f84aacc 100644
--- a/pkg/controller/podautoscaler/metrics/client_test.go
+++ b/pkg/controller/podautoscaler/metrics/client_test.go
@@ -31,7 +31,7 @@ func TestUpdateMetrics(t *testing.T) {
     metricKey := types.MetricKey{
         Namespace: "default",
         Name: "test-llm",
-        MetricName: "gpu_cache_usage_perc",
+        MetricName: "kv_cache_usage_perc",
         PaNamespace: "default",
         PaName: "test-llm-apa",
     }
@@ -45,22 +45,22 @@ func TestUpdateMetrics(t *testing.T) {
     assert.NoError(t, err)
     assert.Len(t, client.stableWindows, 1)
 
-    tw := client.stableWindows["default/test-llm-apa/gpu_cache_usage_perc"]
+    tw := client.stableWindows["default/test-llm-apa/kv_cache_usage_perc"]
     assert.NotNil(t, tw)
     assert.Len(t, tw.Values(), 1)
     assert.Equal(t, expectedValue, tw.Values()[0])
 
     assert.Len(t, client.panicWindows, 1)
-    tw = client.panicWindows["default/test-llm-apa/gpu_cache_usage_perc"]
+    tw = client.panicWindows["default/test-llm-apa/kv_cache_usage_perc"]
     assert.NotNil(t, tw)
     assert.Len(t, tw.Values(), 1)
     assert.Equal(t, expectedValue, tw.Values()[0])
 
     assert.Len(t, client.stableHistory, 1)
-    assert.NotNil(t, client.stableHistory["default/test-llm-apa/gpu_cache_usage_perc"])
+    assert.NotNil(t, client.stableHistory["default/test-llm-apa/kv_cache_usage_perc"])
 
     assert.Len(t, client.panicHistory, 1)
-    assert.NotNil(t, client.panicHistory["default/test-llm-apa/gpu_cache_usage_perc"])
+    assert.NotNil(t, client.panicHistory["default/test-llm-apa/kv_cache_usage_perc"])
 }
 
 func TestGetMetricValue(t *testing.T) {
@@ -69,7 +69,7 @@ func TestGetMetricValue(t *testing.T) {
     metricKey := types.MetricKey{
         Namespace: "default",
         Name: "test-llm",
-        MetricName: "gpu_cache_usage_perc",
+        MetricName: "kv_cache_usage_perc",
         PaNamespace: "default",
         PaName: "test-llm-apa",
     }
diff --git a/pkg/controller/podautoscaler/metrics/fetcher_test.go b/pkg/controller/podautoscaler/metrics/fetcher_test.go
index e4ba2e61a..1091f3e34 100644
--- a/pkg/controller/podautoscaler/metrics/fetcher_test.go
+++ b/pkg/controller/podautoscaler/metrics/fetcher_test.go
@@ -48,7 +48,7 @@ func TestRestMetricsFetcher_FetchPodMetrics(t *testing.T) {
 
     server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
         _, err := expfmt.MetricFamilyToText(w, &dto.MetricFamily{
-            Name: ptr.To("vllm:gpu_cache_usage_perc"),
+            Name: ptr.To("vllm:kv_cache_usage_perc"),
             Type: dto.MetricType_GAUGE.Enum(),
             Metric: []*dto.Metric{
                 {
@@ -84,7 +84,7 @@ func TestRestMetricsFetcher_FetchPodMetrics(t *testing.T) {
     }
     source := autoscalingv1alpha1.MetricSource{
         MetricSourceType: autoscalingv1alpha1.POD,
-        TargetMetric: "gpu_cache_usage_perc",
+        TargetMetric: "kv_cache_usage_perc",
         Port: port,
     }
 
diff --git a/pkg/metrics/engine_fetcher_test.go b/pkg/metrics/engine_fetcher_test.go
index 14f7ed253..1560cc217 100644
--- a/pkg/metrics/engine_fetcher_test.go
+++ b/pkg/metrics/engine_fetcher_test.go
@@ -36,9 +36,9 @@ vllm_num_requests_running{model_name="meta-llama/Llama-2-7b-chat-hf"} 2.0
 # HELP vllm_num_requests_waiting Number of requests waiting to be processed.
 # TYPE vllm_num_requests_waiting gauge
 vllm_num_requests_waiting{model_name="meta-llama/Llama-2-7b-chat-hf"} 3.0
-# HELP vllm_gpu_cache_usage_perc GPU KV-cache usage. 1.0 means 100 percent usage.
-# TYPE vllm_gpu_cache_usage_perc gauge
-vllm_gpu_cache_usage_perc 0.75
+# HELP vllm_kv_cache_usage_perc GPU KV-cache usage. 1.0 means 100 percent usage.
+# TYPE vllm_kv_cache_usage_perc gauge
+vllm_kv_cache_usage_perc 0.75
 # HELP vllm_time_to_first_token_seconds Histogram of time to first token in seconds.
 # TYPE vllm_time_to_first_token_seconds histogram
 vllm_time_to_first_token_seconds_bucket{model_name="meta-llama/Llama-2-7b-chat-hf",le="0.001"} 0.0
diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go
index f672ebf3a..21896d691 100644
--- a/pkg/metrics/metrics.go
+++ b/pkg/metrics/metrics.go
@@ -39,7 +39,7 @@ const (
     AvgTPOT5mPod = "avg_tpot_pod_5m"
     AvgPromptToksPerReq = "avg_prompt_toks_per_req"
     AvgGenerationToksPerReq = "avg_generation_toks_per_req"
-    GPUCacheUsagePerc = "gpu_cache_usage_perc"
+    GPUCacheUsagePerc = "kv_cache_usage_perc"
     GPUBusyTimeRatio = "gpu_busy_time_ratio"
     CPUCacheUsagePerc = "cpu_cache_usage_perc"
     EngineUtilization = "engine_utilization"
@@ -304,7 +304,7 @@ var (
             Raw: Counter,
         },
         EngineMetricsNameMapping: map[string]string{
-            "vllm": "vllm:gpu_cache_usage_perc",
+            "vllm": "vllm:kv_cache_usage_perc",
             "sglang": "sglang:token_usage", // Based on https://github.com/sgl-project/sglang/issues/5979
             "xllm": "kv_cache_utilization",
         },
diff --git a/pkg/metrics/utils_test.go b/pkg/metrics/utils_test.go
index cb2e0966b..9795be224 100644
--- a/pkg/metrics/utils_test.go
+++ b/pkg/metrics/utils_test.go
@@ -33,9 +33,9 @@ func TestParseHistogramWithLabels(t *testing.T) {
 # HELP vllm:num_requests_waiting Number of requests waiting to be processed.
 # TYPE vllm:num_requests_waiting gauge
 vllm:num_requests_waiting{model_name="Qwen/Qwen2.5-1.5B-Instruct"} 0.0
-# HELP vllm:gpu_cache_usage_perc GPU KV-cache usage. 1 means 100 percent usage.
-# TYPE vllm:gpu_cache_usage_perc gauge
-vllm:gpu_cache_usage_perc{model_name="Qwen/Qwen2.5-1.5B-Instruct"} 0.0
+# HELP vllm:kv_cache_usage_perc GPU KV-cache usage. 1 means 100 percent usage.
+# TYPE vllm:kv_cache_usage_perc gauge
+vllm:kv_cache_usage_perc{model_name="Qwen/Qwen2.5-1.5B-Instruct"} 0.0
 # HELP vllm:time_per_output_token_seconds histogram
 vllm:time_per_output_token_seconds_sum{model_name="Qwen/Qwen2.5-1.5B-Instruct"} 0.23455095291137695
 vllm:time_per_output_token_seconds_count{model_name="Qwen/Qwen2.5-1.5B-Instruct"} 29.0
diff --git a/python/aibrix/aibrix/metrics/engine_rules.py b/python/aibrix/aibrix/metrics/engine_rules.py
index acb0610cc..0dc30d0b2 100644
--- a/python/aibrix/aibrix/metrics/engine_rules.py
+++ b/python/aibrix/aibrix/metrics/engine_rules.py
@@ -26,8 +26,8 @@
     "vllm:num_requests_waiting": RenameStandardRule(
         "vllm:num_requests_waiting", "aibrix:queue_size"
     ),
-    "vllm:gpu_cache_usage_perc": RenameStandardRule(
-        "vllm:gpu_cache_usage_perc", "aibrix:gpu_cache_usage_perc"
+    "vllm:kv_cache_usage_perc": RenameStandardRule(
+        "vllm:kv_cache_usage_perc", "aibrix:kv_cache_usage_perc"
     ),
     # Token processing metrics
     "vllm:prompt_tokens_total": RenameStandardRule(
diff --git a/python/aibrix/tests/metrics/test_metrics_multi_engine.py b/python/aibrix/tests/metrics/test_metrics_multi_engine.py
index a7b4d197d..002628d5d 100644
--- a/python/aibrix/tests/metrics/test_metrics_multi_engine.py
+++ b/python/aibrix/tests/metrics/test_metrics_multi_engine.py
@@ -33,9 +33,9 @@ def test_get_metric_rules_vllm(self):
         assert isinstance(rules["vllm:num_requests_waiting"], RenameStandardRule)
         assert rules["vllm:num_requests_waiting"].new_name == "aibrix:queue_size"
 
-        assert "vllm:gpu_cache_usage_perc" in rules
+        assert "vllm:kv_cache_usage_perc" in rules
         assert (
-            rules["vllm:gpu_cache_usage_perc"].new_name == "aibrix:gpu_cache_usage_perc"
+            rules["vllm:kv_cache_usage_perc"].new_name == "aibrix:kv_cache_usage_perc"
         )
 
         # Check token processing metrics exist
@@ -307,7 +307,7 @@ def test_vllm_hybrid_coverage(self):
         # Essential metrics should be renamed
         essential_mapped = [
             "vllm:num_requests_waiting",
-            "vllm:gpu_cache_usage_perc",
+            "vllm:kv_cache_usage_perc",
             "vllm:prompt_tokens_total",
             "vllm:generation_tokens_total",
             "vllm:time_to_first_token_seconds",
diff --git a/python/aibrix_kvcache/integration/vllm/patches/vllm_v0.8.5-aibrix-kvcache.patch b/python/aibrix_kvcache/integration/vllm/patches/vllm_v0.8.5-aibrix-kvcache.patch
index 747c7955c..68b93447e 100644
--- a/python/aibrix_kvcache/integration/vllm/patches/vllm_v0.8.5-aibrix-kvcache.patch
+++ b/python/aibrix_kvcache/integration/vllm/patches/vllm_v0.8.5-aibrix-kvcache.patch
@@ -2087,7 +2087,7 @@ index 000000000..311169501
 +            "uid": "${DS_PROMETHEUS}"
 +          },
 +          "editorMode": "code",
-+          "expr": "avg(vllm:gpu_cache_usage_perc{model_name=\"$model_name\", job=\"pods\"})",
++          "expr": "avg(vllm:kv_cache_usage_perc{model_name=\"$model_name\", job=\"pods\"})",
 +          "instant": false,
 +          "legendFormat": "GPU Cache Usage",
 +          "range": true,
diff --git a/python/aibrix_kvcache/integration/vllm/patches/vllm_v0.9.1-aibrix-kvcache.patch b/python/aibrix_kvcache/integration/vllm/patches/vllm_v0.9.1-aibrix-kvcache.patch
index c1eee2899..c450f5187 100644
--- a/python/aibrix_kvcache/integration/vllm/patches/vllm_v0.9.1-aibrix-kvcache.patch
+++ b/python/aibrix_kvcache/integration/vllm/patches/vllm_v0.9.1-aibrix-kvcache.patch
@@ -1648,7 +1648,7 @@ index 000000000..311169501
 +            "uid": "${DS_PROMETHEUS}"
 +          },
 +          "editorMode": "code",
-+          "expr": "avg(vllm:gpu_cache_usage_perc{model_name=\"$model_name\", job=\"pods\"})",
++          "expr": "avg(vllm:kv_cache_usage_perc{model_name=\"$model_name\", job=\"pods\"})",
 +          "instant": false,
 +          "legendFormat": "GPU Cache Usage",
 +          "range": true,
diff --git a/samples/autoscaling/apa.yaml b/samples/autoscaling/apa.yaml
index 56b1e1bc3..ef5a81c59 100644
--- a/samples/autoscaling/apa.yaml
+++ b/samples/autoscaling/apa.yaml
@@ -19,7 +19,7 @@ spec:
       protocolType: http
       port: '8000'
       path: metrics
-      targetMetric: gpu_cache_usage_perc
+      targetMetric: kv_cache_usage_perc
       targetValue: '0.5'
   scaleTargetRef:
     apiVersion: apps/v1
diff --git a/samples/autoscaling/hpa.yaml b/samples/autoscaling/hpa.yaml
index abcde92fe..32b8d37b3 100644
--- a/samples/autoscaling/hpa.yaml
+++ b/samples/autoscaling/hpa.yaml
@@ -15,7 +15,7 @@ spec:
       protocolType: http
       port: '8000'
      path: /metrics
-      targetMetric: gpu_cache_usage_perc
+      targetMetric: kv_cache_usage_perc
       targetValue: '50'
   scaleTargetRef:
     apiVersion: apps/v1
diff --git a/samples/autoscaling/kpa.yaml b/samples/autoscaling/kpa.yaml
index 013043919..19499978a 100644
--- a/samples/autoscaling/kpa.yaml
+++ b/samples/autoscaling/kpa.yaml
@@ -17,7 +17,7 @@ spec:
       protocolType: http
       port: '8000'
       path: metrics
-      targetMetric: gpu_cache_usage_perc
+      targetMetric: kv_cache_usage_perc
       targetValue: '0.5'
   scaleTargetRef:
     apiVersion: apps/v1
diff --git a/samples/autoscaling/multimetrics-apa.yaml b/samples/autoscaling/multimetrics-apa.yaml
index f634c0730..6f731b057 100644
--- a/samples/autoscaling/multimetrics-apa.yaml
+++ b/samples/autoscaling/multimetrics-apa.yaml
@@ -19,7 +19,7 @@ spec:
       protocolType: http
       port: '8000'
       path: metrics
-      targetMetric: gpu_cache_usage_perc
+      targetMetric: kv_cache_usage_perc
       targetValue: '0.5'
     - metricSourceType: pod
       protocolType: http
diff --git a/samples/deepseek-r1/deepseek-r1-autoscaling.yaml b/samples/deepseek-r1/deepseek-r1-autoscaling.yaml
index cceda0edb..569095337 100644
--- a/samples/deepseek-r1/deepseek-r1-autoscaling.yaml
+++ b/samples/deepseek-r1/deepseek-r1-autoscaling.yaml
@@ -16,7 +16,7 @@ spec:
       protocolType: http
       port: '8000'
       path: metrics
-      targetMetric: gpu_cache_usage_perc
+      targetMetric: kv_cache_usage_perc
       targetValue: '50'
   scaleTargetRef:
     apiVersion: orchestration.aibrix.ai/v1alpha1
diff --git a/samples/deepseek-r1/static/AIBrix Engine Dashboard (vLLM)-1741078999667.json b/samples/deepseek-r1/static/AIBrix Engine Dashboard (vLLM)-1741078999667.json
index c5a4e5118..5ed6ef579 100644
--- a/samples/deepseek-r1/static/AIBrix Engine Dashboard (vLLM)-1741078999667.json
+++ b/samples/deepseek-r1/static/AIBrix Engine Dashboard (vLLM)-1741078999667.json
@@ -1607,7 +1607,7 @@
             "uid": "${DS_PROMETHEUS}"
           },
           "editorMode": "code",
-          "expr": "avg(vllm:gpu_cache_usage_perc{model_name=\"$model_name\"})",
+          "expr": "avg(vllm:kv_cache_usage_perc{model_name=\"$model_name\"})",
           "instant": false,
           "legendFormat": "GPU Cache Usage",
           "range": true,
diff --git a/samples/volcano-engine/autoscaler.yaml b/samples/volcano-engine/autoscaler.yaml
index 89e9cd322..de338deea 100644
--- a/samples/volcano-engine/autoscaler.yaml
+++ b/samples/volcano-engine/autoscaler.yaml
@@ -17,7 +17,7 @@ spec:
       protocolType: http
       port: '8000'
       path: metrics
-      targetMetric: gpu_cache_usage_perc
+      targetMetric: kv_cache_usage_perc
       targetValue: '0.3'
   scaleTargetRef:
     apiVersion: apps/v1
diff --git a/samples/volcano-engine/hpa-r1.yaml b/samples/volcano-engine/hpa-r1.yaml
index 815c4fcf6..f4f48f8c6 100644
--- a/samples/volcano-engine/hpa-r1.yaml
+++ b/samples/volcano-engine/hpa-r1.yaml
@@ -14,7 +14,7 @@ spec:
       protocolType: http
       port: '8000'
       path: metrics
-      targetMetric: gpu_cache_usage_perc
+      targetMetric: kv_cache_usage_perc
       targetValue: '50'
   scaleTargetRef:
     apiVersion: orchestration.aibrix.ai/v1alpha1
diff --git a/test/integration/controller/podautoscaler_test.go b/test/integration/controller/podautoscaler_test.go
index e6adb8238..45fa0b520 100644
--- a/test/integration/controller/podautoscaler_test.go
+++ b/test/integration/controller/podautoscaler_test.go
@@ -914,7 +914,7 @@ var _ = ginkgo.Describe("PodAutoscaler controller test", func() {
                     ScaleTargetRefWithKind("StormService", "orchestration.aibrix.ai/v1alpha1", "test-stormservice-role").
                     SubTargetSelector("prefill"). // Only scale "prefill" role
                     MetricSource(wrapper.MakeMetricSourcePod(
-                        autoscalingv1alpha1.HTTP, "8080", "/metrics", "gpu_cache_usage_perc", "0.7")).
+                        autoscalingv1alpha1.HTTP, "8080", "/metrics", "kv_cache_usage_perc", "0.7")).
                     Obj()
             },
             updates: []*update{
@@ -956,7 +956,7 @@ var _ = ginkgo.Describe("PodAutoscaler controller test", func() {
                     ScaleTargetRefWithKind("StormService", "orchestration.aibrix.ai/v1alpha1", "test-stormservice-conflict").
                     SubTargetSelector("prefill"). // Same role as PA1
                     MetricSource(wrapper.MakeMetricSourcePod(
-                        autoscalingv1alpha1.HTTP, "8080", "/metrics", "gpu_cache_usage_perc", "0.7")).
+                        autoscalingv1alpha1.HTTP, "8080", "/metrics", "kv_cache_usage_perc", "0.7")).
                     Obj()
             },
             updates: []*update{
@@ -1011,7 +1011,7 @@ var _ = ginkgo.Describe("PodAutoscaler controller test", func() {
                 ScaleTargetRefWithKind("StormService", "orchestration.aibrix.ai/v1alpha1", "test-stormservice-conflict").
                 SubTargetSelector("prefill"). // Same role
                 MetricSource(wrapper.MakeMetricSourcePod(
-                    autoscalingv1alpha1.HTTP, "8080", "/metrics", "gpu_cache_usage_perc", "0.7")).
+                    autoscalingv1alpha1.HTTP, "8080", "/metrics", "kv_cache_usage_perc", "0.7")).
                 Obj()
             gomega.Expect(k8sClient.Create(ctx, pa1)).To(gomega.Succeed())
             time.Sleep(time.Second * 2)
@@ -1053,7 +1053,7 @@ var _ = ginkgo.Describe("PodAutoscaler controller test", func() {
                     Annotations(annotations).
                     ScaleTargetRefWithKind("Deployment", "apps/v1", "cooldown-deployment").
                     MetricSource(wrapper.MakeMetricSourcePod(
-                        autoscalingv1alpha1.HTTP, "8080", "/metrics", "gpu_cache_usage_perc", "0.5")).
+                        autoscalingv1alpha1.HTTP, "8080", "/metrics", "kv_cache_usage_perc", "0.5")).
                     Obj()
             },
             updates: []*update{
@@ -1089,7 +1089,7 @@ var _ = ginkgo.Describe("PodAutoscaler controller test", func() {
                     Annotations(annotations).
                     ScaleTargetRefWithKind("Deployment", "apps/v1", "delay-deployment").
                     MetricSource(wrapper.MakeMetricSourcePod(
-                        autoscalingv1alpha1.HTTP, "8080", "/metrics", "gpu_cache_usage_perc", "0.5")).
+                        autoscalingv1alpha1.HTTP, "8080", "/metrics", "kv_cache_usage_perc", "0.5")).
                     Obj()
             },
             updates: []*update{
@@ -1129,7 +1129,7 @@ var _ = ginkgo.Describe("PodAutoscaler controller test", func() {
                     Annotations(annotations).
                     ScaleTargetRefWithKind("Deployment", "apps/v1", "annotations-deployment").
                     MetricSource(wrapper.MakeMetricSourcePod(
-                        autoscalingv1alpha1.HTTP, "8080", "/metrics", "gpu_cache_usage_perc", "0.5")).
+                        autoscalingv1alpha1.HTTP, "8080", "/metrics", "kv_cache_usage_perc", "0.5")).
                     Obj()
             },
             updates: []*update{
diff --git a/test/integration/webhook/podautoscaler_webhook_test.go b/test/integration/webhook/podautoscaler_webhook_test.go
index 2be54fc6d..fe25ba24b 100644
--- a/test/integration/webhook/podautoscaler_webhook_test.go
+++ b/test/integration/webhook/podautoscaler_webhook_test.go
@@ -85,7 +85,7 @@ var _ = ginkgo.Describe("podautoscaler default and validation", func() {
                     MaxReplicas(10).
                     ScaleTargetRefWithKind("Deployment", "apps/v1", "test").
                     MetricSource(wrapper.MakeMetricSourcePod(autoscalingapi.HTTP,
-                        "8080", "/metrics", "gpu_cache_usage_perc", "0.5")).
+                        "8080", "/metrics", "kv_cache_usage_perc", "0.5")).
                     Obj()
             },
             failed: false,
@@ -99,7 +99,7 @@ var _ = ginkgo.Describe("podautoscaler default and validation", func() {
                     MaxReplicas(3).
                     ScaleTargetRefWithKind("Deployment", "apps/v1", "test-ss").
                     MetricSource(wrapper.MakeMetricSourceExternal(autoscalingapi.HTTP,
-                        "monitoring.example.com", "/metrics", "gpu_cache_usage_perc", "0.5")).
+                        "monitoring.example.com", "/metrics", "kv_cache_usage_perc", "0.5")).
                     Obj()
             },
             failed: false,