34 changes: 34 additions & 0 deletions experiments/precise-prefix-cache-aware-blog.yaml
@@ -0,0 +1,34 @@
# This follows https://llm-d.ai/blog/kvcache-wins-you-can-see
setup:
  constants:
    - LLMDBENCH_VLLM_COMMON_MAX_MODEL_LEN: 12000
    - LLMDBENCH_VLLM_COMMON_BLOCK_SIZE: 64
    - LLMDBENCH_DEPLOY_MODEL_LIST: Qwen/Qwen3-32B
    - LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM: 2
    - LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS: 8
    - LLMDBENCH_HARNESS_EXPERIMENT_PROFILE: shared_prefix_synthetic_aggressive.yaml
    - LLMDBENCH_VLLM_MODELSERVICE_INFERENCE_MODEL: true
    - LLMDBENCH_CONTROL_WAIT_TIMEOUT: 900000
    - LLMDBENCH_HARNESS_WAIT_TIMEOUT: 900000
    - LLMDBENCH_IMAGE_REGISTRY: quay.io
    - LLMDBENCH_IMAGE_REPO: namasluk
    - LLMDBENCH_IMAGE_NAME: llm-d-benchmark
    - LLMDBENCH_IMAGE_TAG: 251002.1
  factors:
    - LLMDBENCH_VLLM_MODELSERVICE_GAIE_PLUGINS_CONFIGFILE
  levels:
    LLMDBENCH_VLLM_MODELSERVICE_GAIE_PLUGINS_CONFIGFILE: random-scheduling,estimated-scheduling,load-scheduling,precise-scheduling
  treatments:
    random_scheduling: random-scheduling
    estimated_scheduling: estimated-scheduling
    load_scheduling: load-scheduling
    precise_scheduling: precise-scheduling
run:
  factors:
    - question_len
    - output_len
  levels:
    question_len: "1200"
    output_len: "1000"
  treatments:
    - "1200,1000"
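How the factor/level/treatment blocks above expand: the harness crosses every setup-level treatment (one GAIE plugins config per scheduler variant) with every run-level treatment (one question_len/output_len shape), yielding four benchmark runs here. A minimal sketch of that expansion, using only values from this file; the loop and variable names are illustrative, not the harness's actual internals:

```python
from itertools import product

# Levels copied from the YAML above; everything else is illustrative.
scheduler_configs = [
    "random-scheduling", "estimated-scheduling",
    "load-scheduling", "precise-scheduling",
]
shapes = [("1200", "1000")]  # (question_len, output_len) run treatments

# 4 setup treatments x 1 run treatment = 4 benchmark runs
for config, (qlen, olen) in product(scheduler_configs, shapes):
    print(f"run: plugins={config}.yaml question_len={qlen} output_len={olen}")
```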
20 changes: 13 additions & 7 deletions experiments/precise-prefix-cache-aware.yaml
@@ -14,12 +14,18 @@ run:
   constants:
     - streaming: true
   factors:
-    - num_groups
-    - system_prompt_len
+    - question_len
+    - output_len
   levels:
-    num_groups: "40,60"
-    system_prompt_len: "80000,5000,1000"
+    question_len: "100,300,1000"
+    output_len: "100,300,1000"
   treatments:
-    long: "40,8000"
-    medium: "60,5000"
-    short: "60,1000"
+    - "100,100"
+    - "100,300"
+    - "100,1000"
+    - "300,100"
+    - "300,300"
+    - "300,1000"
+    - "1000,100"
+    - "1000,300"
+    - "1000,1000"
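The nine added treatments are exactly the full cross of the question_len and output_len levels; a quick sketch confirms the count and the ordering used above:

```python
from itertools import product

question_len = ["100", "300", "1000"]
output_len = ["100", "300", "1000"]

# Reproduces the nine "q,o" treatment strings above, in the same order.
treatments = [f"{q},{o}" for q, o in product(question_len, output_len)]
assert len(treatments) == 9 and treatments[0] == "100,100"
```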
22 changes: 22 additions & 0 deletions setup/presets/gaie/estimated-scheduling.yaml
@@ -0,0 +1,22 @@
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: EndpointPickerConfig
plugins:
  - type: single-profile-handler
  - type: prefix-cache-scorer
    parameters:
      hashBlockSize: 64
      maxPrefixBlocksToMatch: 256
      lruCapacityPerServer: 31250
  - type: kv-cache-scorer
  - type: queue-scorer
  - type: max-score-picker
schedulingProfiles:
  - name: default
    plugins:
      - pluginRef: kv-cache-scorer
        weight: 1.0
      - pluginRef: queue-scorer
        weight: 1.0
      - pluginRef: prefix-cache-scorer
        weight: 1.0
      - pluginRef: max-score-picker
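To see what hashBlockSize, maxPrefixBlocksToMatch, and lruCapacityPerServer govern in an estimated (approximate) prefix scorer: prompts are split into fixed-size token blocks, each block's hash is chained to its predecessor so one hash identifies a whole prefix, and a per-server LRU of recently seen block hashes lets the scorer estimate how much of a new prompt's prefix a server already holds. The sketch below is a hedged illustration of that idea only; the chained SHA-256 hashing and the contiguous-match rule are assumptions, and the real scorer in the Gateway API Inference Extension differs in detail:

```python
import hashlib
from collections import OrderedDict

HASH_BLOCK_SIZE = 64     # tokens per block (hashBlockSize)
MAX_PREFIX_BLOCKS = 256  # maxPrefixBlocksToMatch
LRU_CAPACITY = 31250     # lruCapacityPerServer

def block_hashes(tokens: list[int]) -> list[str]:
    """Chain-hash fixed-size token blocks so each hash identifies a full prefix."""
    hashes, prev = [], b""
    for i in range(0, len(tokens), HASH_BLOCK_SIZE):
        block = tokens[i:i + HASH_BLOCK_SIZE]
        if len(block) < HASH_BLOCK_SIZE:  # ignore the ragged tail block
            break
        h = hashlib.sha256(prev + str(block).encode()).hexdigest()
        hashes.append(h)
        prev = h.encode()
        if len(hashes) >= MAX_PREFIX_BLOCKS:
            break
    return hashes

def record(server_lru: OrderedDict, hashes: list[str]) -> None:
    """Remember served blocks, evicting least-recently-used beyond capacity."""
    for h in hashes:
        server_lru[h] = True
        server_lru.move_to_end(h)
    while len(server_lru) > LRU_CAPACITY:
        server_lru.popitem(last=False)

def prefix_score(prompt_hashes: list[str], server_lru: OrderedDict) -> float:
    """Fraction of leading prompt blocks the server is estimated to hold."""
    matched = 0
    for h in prompt_hashes:
        if h not in server_lru:
            break  # the prefix match must be contiguous from the start
        matched += 1
    return matched / max(len(prompt_hashes), 1)
```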
17 changes: 17 additions & 0 deletions setup/presets/gaie/load-scheduling.yaml
@@ -0,0 +1,17 @@
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: EndpointPickerConfig
plugins:
  - type: queue-scorer
  - type: kv-cache-scorer
  - type: max-score-picker
    parameters:
      maxNumOfEndpoints: 1
  - type: single-profile-handler
schedulingProfiles:
  - name: default
    plugins:
      - pluginRef: queue-scorer
        weight: 1
      - pluginRef: kv-cache-scorer
        weight: 1
      - pluginRef: max-score-picker
27 changes: 27 additions & 0 deletions setup/presets/gaie/precise-scheduling.yaml
@@ -0,0 +1,27 @@
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: EndpointPickerConfig
plugins:
  - type: single-profile-handler
  - type: prefix-cache-scorer
    parameters:
      mode: cache_tracking
      indexerConfig:
        tokenProcessorConfig:
          blockSize: 64
          hashSeed: "42"
        kvBlockIndexConfig:
          enableMetrics: true
          metricsLoggingInterval: 60000000000
  - type: kv-cache-scorer
  - type: queue-scorer
  - type: max-score-picker
schedulingProfiles:
  - name: default
    plugins:
      - pluginRef: prefix-cache-scorer
        weight: 3.0
      - pluginRef: kv-cache-scorer
        weight: 2.0
      - pluginRef: queue-scorer
        weight: 2.0
      - pluginRef: max-score-picker
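With the weights above, endpoint selection reduces to a weighted sum: prefix-cache affinity counts 3.0 against 2.0 each for free KV-cache space and queue depth, and max-score-picker takes the argmax (the load-scheduling preset uses the same mechanism with equal weights and maxNumOfEndpoints: 1). A small sketch of that combination, under the assumption that each scorer returns a normalized score in [0, 1]; the function and score names are illustrative:

```python
# Scorer weights from the schedulingProfile above.
WEIGHTS = {"prefix-cache": 3.0, "kv-cache": 2.0, "queue": 2.0}

def pick_endpoint(endpoints: dict[str, dict[str, float]]) -> str:
    """max-score-picker over weighted scorer outputs (scores assumed in [0,1])."""
    def total(scores: dict[str, float]) -> float:
        return sum(WEIGHTS[name] * scores.get(name, 0.0) for name in WEIGHTS)
    return max(endpoints, key=lambda ep: total(endpoints[ep]))

# Example: pod-a has a strong prefix hit but a longer queue; the 3.0 prefix
# weight makes it win: 3*0.9 + 2*0.5 + 2*0.3 = 4.3 vs 3*0.1 + 2*0.8 + 2*0.9 = 3.7
print(pick_endpoint({
    "pod-a": {"prefix-cache": 0.9, "kv-cache": 0.5, "queue": 0.3},
    "pod-b": {"prefix-cache": 0.1, "kv-cache": 0.8, "queue": 0.9},
}))  # -> pod-a
```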
9 changes: 9 additions & 0 deletions setup/presets/gaie/random-scheduling.yaml
@@ -0,0 +1,9 @@
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: EndpointPickerConfig
plugins:
  - type: single-profile-handler
  - type: random-picker
schedulingProfiles:
  - name: default
    plugins:
      - pluginRef: random-picker
97 changes: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
# Follows precise prefix scheduling well-lit path
# https://llm-d.ai/blog/kvcache-wins-you-can-see
#
# X-70R-6kSys — 8×307,328 KV; target 70%-80% total usage
#
# CLUSTER
# - Pods: 8
# - KV/pod: 307,328 → Cluster capacity C_cap = 2,458,624
#
# SHAPE
# - system_prompt_len = 6,000
# - question_len = 1,200        # ≤ 1,500; cached per user
# - num_prompts_per_group = 5   # users per group
# - output_len = 1,000          # observed live ≈ 200–230 tokens/running request
#
# RESIDENT SIZING
# - Resident per group R_group = 6,000 + 5×1,200 = 12,000
# - num_groups G = 150
#   Resident total = 150 × 12,000 = 1,800,000 → 1,800,000 / 2,458,624 ≈ 73.2%
#
# LIVE SIZING
#
# WORKING SET
# - One FULL SET S = G × P = 150 × 5 = 750 requests
#
load:
  type: poisson
  stages:
    # Warmup — seat residents (~1×S)
    - rate: 15
      duration: 50 # 15*50 = 750

    # Main ladder
    - rate: 3
      duration: 20
    - rate: 10
      duration: 20
    - rate: 15
      duration: 20
    - rate: 20
      duration: 38
    - rate: 22
      duration: 34
    - rate: 25
      duration: 30
    - rate: 30
      duration: 25
    - rate: 35
      duration: 21
    - rate: 40
      duration: 38
    - rate: 43
      duration: 36
    - rate: 46
      duration: 33
    - rate: 49
      duration: 30
    - rate: 52
      duration: 29
    - rate: 55
      duration: 27
    - rate: 57
      duration: 26
    - rate: 60
      duration: 25

api:
  type: completion
  streaming: true

server:
  type: vllm
  model_name: REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL
  base_url: REPLACE_ENV_LLMDBENCH_HARNESS_STACK_ENDPOINT_URL
  ignore_eos: true

tokenizer:
  pretrained_model_name_or_path: REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL

data:
  type: shared_prefix
  shared_prefix:
    num_groups: 150            # Number of distinct shared prefixes
    num_prompts_per_group: 5   # Number of unique questions per shared prefix
    system_prompt_len: 6000    # Length of the shared prefix (in tokens)
    question_len: 1200         # Length of the unique question part (in tokens)
    output_len: 1000           # Target length for the model's generated output (in tokens)

report:
  request_lifecycle:
    summary: true
    per_stage: true
    per_request: true

storage:
  local_storage:
    path: /workspace
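A quick sanity check of the sizing comments and the Poisson ladder above, recomputing cluster capacity, resident occupancy, and expected requests per stage. Every number is taken from this file; nothing new is assumed:

```python
# Resident sizing from the header comments.
pods, kv_per_pod = 8, 307_328
c_cap = pods * kv_per_pod                                       # 2,458,624

system_prompt_len, question_len = 6_000, 1_200
prompts_per_group, num_groups = 5, 150
r_group = system_prompt_len + prompts_per_group * question_len  # 12,000
resident_total = num_groups * r_group                           # 1,800,000
print(f"resident occupancy: {resident_total / c_cap:.1%}")      # ~73.2%

full_set = num_groups * prompts_per_group                       # S = 750

# (rate, duration) pairs from load.stages; expected requests = rate * duration.
stages = [(15, 50), (3, 20), (10, 20), (15, 20), (20, 38), (22, 34),
          (25, 30), (30, 25), (35, 21), (40, 38), (43, 36), (46, 33),
          (49, 30), (52, 29), (55, 27), (57, 26), (60, 25)]
requests = [rate * dur for rate, dur in stages]
print(f"warmup covers {requests[0] / full_set:.0%} of one full set")  # 100%
print(f"total expected requests: {sum(requests)}")                    # 17,084
```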