34 changes: 34 additions & 0 deletions experiments/precise-prefix-cache-aware-blog.yaml
@@ -0,0 +1,34 @@
# This follows https://llm-d.ai/blog/kvcache-wins-you-can-see
setup:
  constants:
    - LLMDBENCH_VLLM_COMMON_MAX_MODEL_LEN: 12000
    - LLMDBENCH_VLLM_COMMON_BLOCK_SIZE: 64
    - LLMDBENCH_DEPLOY_MODEL_LIST: Qwen/Qwen3-32B
    - LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM: 2
    - LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS: 8
    - LLMDBENCH_HARNESS_EXPERIMENT_PROFILE: shared_prefix_synthetic_aggressive.yaml
    - LLMDBENCH_VLLM_MODELSERVICE_INFERENCE_MODEL: true
    - LLMDBENCH_CONTROL_WAIT_TIMEOUT: 900000
    - LLMDBENCH_HARNESS_WAIT_TIMEOUT: 900000
    - LLMDBENCH_IMAGE_REGISTRY: quay.io
    - LLMDBENCH_IMAGE_REPO: namasluk
    - LLMDBENCH_IMAGE_NAME: llm-d-benchmark
    - LLMDBENCH_IMAGE_TAG: 251002.1
  factors:
    - LLMDBENCH_VLLM_MODELSERVICE_GAIE_PLUGINS_CONFIGFILE
  levels:
    LLMDBENCH_VLLM_MODELSERVICE_GAIE_PLUGINS_CONFIGFILE: random-scheduling,estimated-scheduling,load-scheduling,precise-scheduling
  treatments:
    random_scheduling: random-scheduling
    estimated_scheduling: estimated-scheduling
    load_scheduling: load-scheduling
    precise_scheduling: precise-scheduling
run:
  factors:
    - question_len
    - output_len
  levels:
    question_len: "1200"
    output_len: "1000"
  treatments:
    - "1200,1000"
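How the factor/level/treatment blocks above expand: the harness crosses every setup-level treatment (one GAIE plugins config per scheduler variant) with every run-level treatment (one question_len/output_len shape), yielding four benchmark runs here. A minimal sketch of that expansion, using only values from this file; the loop and variable names are illustrative, not the harness's actual internals:

```python
from itertools import product

# Levels copied from the YAML above; everything else is illustrative.
scheduler_configs = [
    "random-scheduling", "estimated-scheduling",
    "load-scheduling", "precise-scheduling",
]
shapes = [("1200", "1000")]  # (question_len, output_len) run treatments

# 4 setup treatments x 1 run treatment = 4 benchmark runs
for config, (qlen, olen) in product(scheduler_configs, shapes):
    print(f"run: plugins={config}.yaml question_len={qlen} output_len={olen}")
```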
20 changes: 13 additions & 7 deletions experiments/precise-prefix-cache-aware.yaml
@@ -14,12 +14,18 @@ run:
   constants:
     - streaming: true
   factors:
-    - num_groups
-    - system_prompt_len
+    - question_len
+    - output_len
   levels:
-    num_groups: "40,60"
-    system_prompt_len: "80000,5000,1000"
+    question_len: "100,300,1000"
+    output_len: "100,300,1000"
   treatments:
-    long: "40,8000"
-    medium: "60,5000"
-    short: "60,1000"
+    - "100,100"
+    - "100,300"
+    - "100,1000"
+    - "300,100"
+    - "300,300"
+    - "300,1000"
+    - "1000,100"
+    - "1000,300"
+    - "1000,1000"
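The nine added treatments are exactly the full cross of the question_len and output_len levels; a quick sketch confirms the count and the ordering used above:

```python
from itertools import product

question_len = ["100", "300", "1000"]
output_len = ["100", "300", "1000"]

# Reproduces the nine "q,o" treatment strings above, in the same order.
treatments = [f"{q},{o}" for q, o in product(question_len, output_len)]
assert len(treatments) == 9 and treatments[0] == "100,100"
```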
22 changes: 22 additions & 0 deletions setup/presets/gaie/estimated-scheduling.yaml
@@ -0,0 +1,22 @@
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: EndpointPickerConfig
plugins:
  - type: single-profile-handler
  - type: prefix-cache-scorer
    parameters:
      hashBlockSize: 64
      maxPrefixBlocksToMatch: 256
      lruCapacityPerServer: 31250
  - type: kv-cache-scorer
  - type: queue-scorer
  - type: max-score-picker
schedulingProfiles:
  - name: default
    plugins:
      - pluginRef: kv-cache-scorer
        weight: 1.0
      - pluginRef: queue-scorer
        weight: 1.0
      - pluginRef: prefix-cache-scorer
        weight: 1.0
      - pluginRef: max-score-picker
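To see what hashBlockSize, maxPrefixBlocksToMatch, and lruCapacityPerServer govern in an estimated (approximate) prefix scorer: prompts are split into fixed-size token blocks, each block's hash is chained to its predecessor so one hash identifies a whole prefix, and a per-server LRU of recently seen block hashes lets the scorer estimate how much of a new prompt's prefix a server already holds. The sketch below is a hedged illustration of that idea only; the chained SHA-256 hashing and the contiguous-match rule are assumptions, and the real scorer in the Gateway API Inference Extension differs in detail:

```python
import hashlib
from collections import OrderedDict

HASH_BLOCK_SIZE = 64     # tokens per block (hashBlockSize)
MAX_PREFIX_BLOCKS = 256  # maxPrefixBlocksToMatch
LRU_CAPACITY = 31250     # lruCapacityPerServer

def block_hashes(tokens: list[int]) -> list[str]:
    """Chain-hash fixed-size token blocks so each hash identifies a full prefix."""
    hashes, prev = [], b""
    for i in range(0, len(tokens), HASH_BLOCK_SIZE):
        block = tokens[i:i + HASH_BLOCK_SIZE]
        if len(block) < HASH_BLOCK_SIZE:  # ignore the ragged tail block
            break
        h = hashlib.sha256(prev + str(block).encode()).hexdigest()
        hashes.append(h)
        prev = h.encode()
        if len(hashes) >= MAX_PREFIX_BLOCKS:
            break
    return hashes

def record(server_lru: OrderedDict, hashes: list[str]) -> None:
    """Remember served blocks, evicting least-recently-used beyond capacity."""
    for h in hashes:
        server_lru[h] = True
        server_lru.move_to_end(h)
    while len(server_lru) > LRU_CAPACITY:
        server_lru.popitem(last=False)

def prefix_score(prompt_hashes: list[str], server_lru: OrderedDict) -> float:
    """Fraction of leading prompt blocks the server is estimated to hold."""
    matched = 0
    for h in prompt_hashes:
        if h not in server_lru:
            break  # the prefix match must be contiguous from the start
        matched += 1
    return matched / max(len(prompt_hashes), 1)
```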
17 changes: 17 additions & 0 deletions setup/presets/gaie/load-scheduling.yaml
@@ -0,0 +1,17 @@
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: EndpointPickerConfig
plugins:
  - type: queue-scorer
  - type: kv-cache-scorer
  - type: max-score-picker
    parameters:
      maxNumOfEndpoints: 1
  - type: single-profile-handler
schedulingProfiles:
  - name: default
    plugins:
      - pluginRef: queue-scorer
        weight: 1
      - pluginRef: kv-cache-scorer
        weight: 1
      - pluginRef: max-score-picker
27 changes: 27 additions & 0 deletions setup/presets/gaie/precise-scheduling.yaml
@@ -0,0 +1,27 @@
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: EndpointPickerConfig
plugins:
  - type: single-profile-handler
  - type: prefix-cache-scorer
    parameters:
      mode: cache_tracking
      indexerConfig:
        tokenProcessorConfig:
          blockSize: 64
          hashSeed: "42"
        kvBlockIndexConfig:
          enableMetrics: true
          metricsLoggingInterval: 60000000000
  - type: kv-cache-scorer
  - type: queue-scorer
  - type: max-score-picker
schedulingProfiles:
  - name: default
    plugins:
      - pluginRef: prefix-cache-scorer
        weight: 3.0
      - pluginRef: kv-cache-scorer
        weight: 2.0
      - pluginRef: queue-scorer
        weight: 2.0
      - pluginRef: max-score-picker
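With the weights above, endpoint selection reduces to a weighted sum: prefix-cache affinity counts 3.0 against 2.0 each for free KV-cache space and queue depth, and max-score-picker takes the argmax (the load-scheduling preset uses the same mechanism with equal weights and maxNumOfEndpoints: 1). A small sketch of that combination, under the assumption that each scorer returns a normalized score in [0, 1]; the function and score names are illustrative:

```python
# Scorer weights from the schedulingProfile above.
WEIGHTS = {"prefix-cache": 3.0, "kv-cache": 2.0, "queue": 2.0}

def pick_endpoint(endpoints: dict[str, dict[str, float]]) -> str:
    """max-score-picker over weighted scorer outputs (scores assumed in [0,1])."""
    def total(scores: dict[str, float]) -> float:
        return sum(WEIGHTS[name] * scores.get(name, 0.0) for name in WEIGHTS)
    return max(endpoints, key=lambda ep: total(endpoints[ep]))

# Example: pod-a has a strong prefix hit but a longer queue; the 3.0 prefix
# weight makes it win: 3*0.9 + 2*0.5 + 2*0.3 = 4.3 vs 3*0.1 + 2*0.8 + 2*0.9 = 3.7
print(pick_endpoint({
    "pod-a": {"prefix-cache": 0.9, "kv-cache": 0.5, "queue": 0.3},
    "pod-b": {"prefix-cache": 0.1, "kv-cache": 0.8, "queue": 0.9},
}))  # -> pod-a
```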
9 changes: 9 additions & 0 deletions setup/presets/gaie/random-scheduling.yaml
@@ -0,0 +1,9 @@
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: EndpointPickerConfig
plugins:
  - type: single-profile-handler
  - type: random-picker
schedulingProfiles:
  - name: default
    plugins:
      - pluginRef: random-picker
97 changes: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
# Follows precise prefix scheduling well-lit path
# https://llm-d.ai/blog/kvcache-wins-you-can-see
#
# X-70R-6kSys — 8×307,328 KV; target 70%-80% total usage
#
# CLUSTER
# - Pods: 8
# - KV/pod: 307,328 → Cluster capacity C_cap = 2,458,624
#
# SHAPE
# - system_prompt_len = 6,000
# - question_len = 1,200        # ≤ 1,500; cached per user
# - num_prompts_per_group = 5   # users per group
# - output_len = 1,000          # observed live ≈ 200–230 tokens/running request
#
# RESIDENT SIZING
# - Resident per group R_group = 6,000 + 5×1,200 = 12,000
# - num_groups G = 150
#   Resident total = 150 × 12,000 = 1,800,000 → 1,800,000 / 2,458,624 ≈ 73.2%
#
# LIVE SIZING
#
# WORKING SET
# - One FULL SET S = G × P = 150 × 5 = 750 requests
#
load:
  type: poisson
  stages:
    # Warmup — seat residents (~1×S)
    - rate: 15
      duration: 50 # 15*50 = 750

    # Main ladder
    - rate: 3
      duration: 20
    - rate: 10
      duration: 20
    - rate: 15
      duration: 20
    - rate: 20
      duration: 38
    - rate: 22
      duration: 34
    - rate: 25
      duration: 30
    - rate: 30
      duration: 25
    - rate: 35
      duration: 21
    - rate: 40
      duration: 38
    - rate: 43
      duration: 36
    - rate: 46
      duration: 33
    - rate: 49
      duration: 30
    - rate: 52
      duration: 29
    - rate: 55
      duration: 27
    - rate: 57
      duration: 26
    - rate: 60
      duration: 25

api:
  type: completion
  streaming: true

server:
  type: vllm
  model_name: REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL
  base_url: REPLACE_ENV_LLMDBENCH_HARNESS_STACK_ENDPOINT_URL
  ignore_eos: true

tokenizer:
  pretrained_model_name_or_path: REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL

data:
  type: shared_prefix
  shared_prefix:
    num_groups: 150            # Number of distinct shared prefixes
    num_prompts_per_group: 5   # Number of unique questions per shared prefix
    system_prompt_len: 6000    # Length of the shared prefix (in tokens)
    question_len: 1200         # Length of the unique question part (in tokens)
    output_len: 1000           # Target length for the model's generated output (in tokens)

report:
  request_lifecycle:
    summary: true
    per_stage: true
    per_request: true

storage:
  local_storage:
    path: /workspace
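A quick sanity check of the sizing comments and the Poisson ladder above, recomputing cluster capacity, resident occupancy, and expected requests per stage. Every number is taken from this file; nothing new is assumed:

```python
# Resident sizing from the header comments.
pods, kv_per_pod = 8, 307_328
c_cap = pods * kv_per_pod                                       # 2,458,624

system_prompt_len, question_len = 6_000, 1_200
prompts_per_group, num_groups = 5, 150
r_group = system_prompt_len + prompts_per_group * question_len  # 12,000
resident_total = num_groups * r_group                           # 1,800,000
print(f"resident occupancy: {resident_total / c_cap:.1%}")      # ~73.2%

full_set = num_groups * prompts_per_group                       # S = 750

# (rate, duration) pairs from load.stages; expected requests = rate * duration.
stages = [(15, 50), (3, 20), (10, 20), (15, 20), (20, 38), (22, 34),
          (25, 30), (30, 25), (35, 21), (40, 38), (43, 36), (46, 33),
          (49, 30), (52, 29), (55, 27), (57, 26), (60, 25)]
requests = [rate * dur for rate, dur in stages]
print(f"warmup covers {requests[0] / full_set:.0%} of one full set")  # 100%
print(f"total expected requests: {sum(requests)}")                    # 17,084
```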