diff --git a/benchmarks/benchmark_cache_engine.py b/benchmarks/benchmark_cache_engine.py index a75a2008b7..d0072c402f 100644 --- a/benchmarks/benchmark_cache_engine.py +++ b/benchmarks/benchmark_cache_engine.py @@ -31,7 +31,7 @@ def main(args): num_put_requests = 0 request_id = 0 for req in reqs: - fake_slot_mapping = torch.arange(req.token_mask[req.token_mask].sum(), dtype=torch.int64) + fake_slot_mapping = torch.arange(req.token_mask[req.token_mask].sum(), dtype=torch.int64).numpy() local_vars = { 'cache_engine': cache_engine, 'req': req, @@ -41,23 +41,25 @@ def main(args): if req.request_type == "get": num_get_requests += 1 if not args.only_put: - profiler.runctx('graph, return_mask, transfer_call_back, finished_ops_ids = ' + profiler.runctx('graph, return_mask, transfer_call_back, op_callback_dict, finished_ops_ids = ' 'cache_engine.get(request_id, req.token_ids, req.token_mask, ' 'fake_slot_mapping, -1, -1)', globals(), local_vars) else: - graph, return_mask, transfer_call_back, finished_ops_ids = \ + graph, return_mask, transfer_call_back, op_callback_dict, finished_ops_ids = \ cache_engine.get(request_id, req.token_ids, req.token_mask, fake_slot_mapping, -1, -1) local_vars.update({ 'graph': graph, 'return_mask': return_mask, 'transfer_call_back': transfer_call_back, + 'op_callback_dict': op_callback_dict, 'finished_ops_ids': finished_ops_ids }) profiler.runctx('transfer_call_back()', globals(), local_vars) return_mask = local_vars['return_mask'] + op_callback_dict = local_vars['op_callback_dict'] cache_hit_ratio = return_mask.sum() / req.token_mask.sum() cache_hit_ratio_list.append(cache_hit_ratio) flexkv_logger.info(f"need get {req.token_mask.sum()} tokens, " @@ -66,16 +68,17 @@ def main(args): elif req.request_type == "put": num_put_requests += 1 if not args.only_get: - profiler.runctx('graph, return_mask, transfer_call_back, finished_ops_ids = ' + profiler.runctx('graph, return_mask, transfer_call_back, op_callback_dict, finished_ops_ids = ' 
'cache_engine.put(request_id, req.token_ids, req.token_mask, fake_slot_mapping)', globals(), local_vars) else: - graph, return_mask, transfer_call_back, finished_ops_ids = \ + graph, return_mask, transfer_call_back, op_callback_dict, finished_ops_ids = \ cache_engine.put(request_id, req.token_ids, req.token_mask, fake_slot_mapping) local_vars.update({ 'graph': graph, 'return_mask': return_mask, 'transfer_call_back': transfer_call_back, + 'op_callback_dict': op_callback_dict, 'finished_ops_ids': finished_ops_ids }) @@ -105,7 +108,7 @@ def parse_args(): parser = ArgumentParser() parser.add_argument("--config", type=str, - default="./benchmarks/example_config.json") + default="./benchmarks/example_config.yml") parser.add_argument("--only-get", action="store_true") parser.add_argument("--only-put", action="store_true") parser.add_argument("--num-users", type=int, default=20) diff --git a/benchmarks/benchmark_single_batch.py b/benchmarks/benchmark_single_batch.py index 397030becd..91b7c3b948 100644 --- a/benchmarks/benchmark_single_batch.py +++ b/benchmarks/benchmark_single_batch.py @@ -7,7 +7,7 @@ import torch from flexkv.server.client import KVTPClient -from flexkv.common.storage import KVCacheLayout +from flexkv.common.storage import KVCacheLayout, KVCacheLayoutType from flexkv.common.debug import flexkv_logger from flexkv.common.config import ModelConfig, CacheConfig from utils import load_config @@ -33,7 +33,7 @@ def run_tp_client(dp_client_id, tp_rank, server_recv_port, model_config, cache_c num_gpu_blocks = cache_config.num_gpu_blocks gpu_kv_layout = KVCacheLayout( - type=cache_config.gpu_kv_layout_type, + type=KVCacheLayoutType.LAYERFIRST, num_layer=model_config.num_layers, num_block=num_gpu_blocks, tokens_per_block=cache_config.tokens_per_block, @@ -66,13 +66,12 @@ def shutdown_tp_client(tp_client_processes): def benchmark_flexkv(model_config: ModelConfig, cache_config: CacheConfig, benchmark_config: BenchmarkConfig, - gpu_register_port: str, - 
server_recv_port: str): + ): if model_config.tp_size * model_config.dp_size > torch.cuda.device_count(): raise ValueError(f"tp_size {model_config.tp_size} * dp_size {model_config.dp_size} is greater than " f"the number of available GPUs {torch.cuda.device_count()}") print(f"{benchmark_config = }") - kvmanager = KVManager(model_config, cache_config, gpu_register_port, server_recv_port) + kvmanager = KVManager(model_config, cache_config) kvmanager.start() tp_client_processes = [] @@ -85,7 +84,7 @@ def benchmark_flexkv(model_config: ModelConfig, for tp_rank in range(model_config.tp_size): tp_client_process = Process( target=run_tp_client, - args=(0, tp_rank, gpu_register_port, + args=(0, tp_rank, kvmanager.gpu_register_port, model_config, cache_config), daemon=True ) @@ -161,7 +160,7 @@ def benchmark_flexkv(model_config: ModelConfig, def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument("--config", type=str, default="benchmarks/example_config.json") + parser.add_argument("--config", type=str, default="benchmarks/example_config.yml") # benchmark config parser.add_argument("--num-layers", type=int, default=-1) parser.add_argument("--batch-size", type=int, default=1) @@ -184,8 +183,5 @@ def parse_args(): # pad sequence length to divisible by tokens_per_block benchmark_config.sequence_length = \ ((benchmark_config.sequence_length - 1) // cache_config.tokens_per_block + 1) * cache_config.tokens_per_block - import uuid - gpu_register_port = f"ipc:///tmp/flexkv_gpu_{uuid.uuid4().hex[:8]}" - server_recv_port = f"ipc:///tmp/flexkv_srv_{uuid.uuid4().hex[:8]}" - benchmark_flexkv(model_config, cache_config, benchmark_config, gpu_register_port, server_recv_port) + benchmark_flexkv(model_config, cache_config, benchmark_config) diff --git a/benchmarks/benchmark_workers.py b/benchmarks/benchmark_workers.py index f1694ccd52..009d6cec69 100644 --- a/benchmarks/benchmark_workers.py +++ b/benchmarks/benchmark_workers.py @@ -13,9 +13,9 @@ from flexkv.transfer.worker 
import GPUCPUTransferWorker, CPUSSDDiskTransferWorker, WorkerHandle, tpGPUCPUTransferWorker from flexkv.storage.allocator import CPUAllocator, GPUAllocator, SSDAllocator from flexkv.common.storage import KVCacheLayoutType, KVCacheLayout -from flexkv.common.config import ModelConfig, CacheConfig +from flexkv.common.config import ModelConfig, CacheConfig, GLOBAL_CONFIG_FROM_ENV from flexkv.common.debug import flexkv_logger - +from utils import load_config # flexkv_logger.set_level("OFF") @@ -32,30 +32,32 @@ class BenchmarkConfig: def make_configs(args: dict) -> Tuple[ModelConfig, CacheConfig, BenchmarkConfig]: config_file = args.config try: - with open(config_file) as f: - config = json.load(f) - model_config = ModelConfig(**config["ModelConfig"]) - model_config.dtype = eval(f"torch.{model_config.dtype}") - cache_config = CacheConfig(**config["CacheConfig"]) - cache_config.num_gpu_blocks = args.num_blocks - bench_config = BenchmarkConfig() - bench_config.transfer_type = TransferType(args.transfer_type) - bench_config.num_layers_to_transfer = args.num_layers - bench_config.num_blocks_to_transfer = args.num_blocks - bench_config.shuffle_ids = args.shuffle_ids - bench_config.warmup_round = args.warmup_round - bench_config.benchmark_round = args.benchmark_round - bench_config.bidirectional = args.bi - return model_config, cache_config, bench_config + model_config, cache_config = load_config(config_file) + if args.transfer_type == "H2D" or args.transfer_type == "D2H": + cache_config.enable_ssd = False + elif args.transfer_type == "H2DISK" or args.transfer_type == "DISK2H": + assert cache_config.enable_ssd, "SSD cache must be enabled for DISK2H or H2DISK benchmark" + bench_config = BenchmarkConfig( + transfer_type=TransferType(args.transfer_type), + num_layers_to_transfer=args.num_layers, + num_blocks_to_transfer=args.num_blocks, + shuffle_ids=args.shuffle_ids, + warmup_round=args.warmup_round, + benchmark_round=args.benchmark_round, + bidirectional=args.bi + ) + 
cache_config.num_ssd_blocks = max(cache_config.num_ssd_blocks, bench_config.num_blocks_to_transfer) + return model_config, cache_config, bench_config except Exception as e: raise ValueError(f"Failed to load config file {config_file}: {e}") from None def create_cpu_gpu_worker( model_config: ModelConfig, - cache_config: CacheConfig) -> Tuple[WorkerHandle, mp.Queue]: + cache_config: CacheConfig, + num_gpu_blocks: int) -> Tuple[WorkerHandle, mp.Queue]: mp.set_start_method('spawn', force=True) cpu_layout = KVCacheLayout( - type=KVCacheLayoutType(cache_config.cpu_kv_layout_type), + type=GLOBAL_CONFIG_FROM_ENV.cpu_layout_type, num_layer=model_config.num_layers, num_block=cache_config.num_cpu_blocks, tokens_per_block=cache_config.tokens_per_block, @@ -63,9 +65,9 @@ def create_cpu_gpu_worker( head_size=model_config.head_size, ) gpu_layout = KVCacheLayout( - type=KVCacheLayoutType.LAYERWISE, + type=KVCacheLayoutType.LAYERFIRST, num_layer=model_config.num_layers, - num_block=cache_config.num_gpu_blocks, + num_block=num_gpu_blocks, tokens_per_block=cache_config.tokens_per_block, num_head=model_config.num_kv_heads, head_size=model_config.head_size, @@ -132,7 +134,7 @@ def create_cpu_ssd_worker( cache_config: CacheConfig) -> Tuple[WorkerHandle, mp.Queue]: mp.set_start_method('spawn', force=True) cpu_layout = KVCacheLayout( - type=KVCacheLayoutType(cache_config.cpu_kv_layout_type), + type=GLOBAL_CONFIG_FROM_ENV.cpu_layout_type, num_layer=model_config.num_layers, num_block=cache_config.num_cpu_blocks, tokens_per_block=cache_config.tokens_per_block, @@ -140,7 +142,7 @@ def create_cpu_ssd_worker( head_size=model_config.head_size, ) ssd_layout = KVCacheLayout( - type=KVCacheLayoutType(cache_config.ssd_kv_layout_type), + type=GLOBAL_CONFIG_FROM_ENV.ssd_layout_type, num_layer=model_config.num_layers, num_block=cache_config.num_ssd_blocks, tokens_per_block=cache_config.tokens_per_block, @@ -157,6 +159,7 @@ def create_cpu_ssd_worker( dtype=model_config.dtype, 
num_chunks=model_config.num_layers, cache_dir=cache_config.ssd_cache_dir, + max_file_size_gb=GLOBAL_CONFIG_FROM_ENV.max_file_size_gb, ) finished_ops_queue = mp.Queue() # Create a shared memory buffer for transfer operations @@ -216,7 +219,7 @@ def bench_worker(args): bidirectional = bench_config.bidirectional if transfer_type == TransferType.H2D or transfer_type == TransferType.D2H: - worker_handle, finished_ops_queue = create_cpu_gpu_worker(model_config, cache_config) + worker_handle, finished_ops_queue = create_cpu_gpu_worker(model_config, cache_config, num_blocks_to_transfer) elif transfer_type == TransferType.H2DISK or transfer_type == TransferType.DISK2H: worker_handle, finished_ops_queue = create_cpu_ssd_worker(model_config, cache_config) else: @@ -325,7 +328,7 @@ def parse_args(): default=16) parser.add_argument("--config", type=str, - default="./benchmarks/example_config.json") + default="./benchmarks/example_config.yml") parser.add_argument("--shuffle-ids", action="store_true") parser.add_argument("--warmup-round", diff --git a/benchmarks/example_config.json b/benchmarks/example_config.json deleted file mode 100644 index 0aea8e5e3f..0000000000 --- a/benchmarks/example_config.json +++ /dev/null @@ -1,46 +0,0 @@ -{ - "ModelConfig": { - "num_layers": 64, - "num_kv_heads": 8, - "head_size": 128, - "dtype": "bfloat16", - "use_mla": false, - "tp_size": 1, - "dp_size": 1 - }, - "CacheConfig": { - "enable_cpu": true, - "enable_ssd": true, - "enable_remote": false, - "tokens_per_block": 16, - "enable_gds": false, - "gpu_kv_layout_type": "LAYERWISE", - "cpu_kv_layout_type": "BLOCKWISE", - "ssd_kv_layout_type": "BLOCKWISE", - "remote_kv_layout_type": "BLOCKWISE", - "num_cpu_blocks": 2048, - "num_ssd_blocks": 4096, - "num_remote_blocks": null, - "use_ce_transfer_h2d": false, - "use_ce_transfer_d2h": false, - "transfer_sms_h2d": 8, - "transfer_sms_d2h": 8, - "max_blocks_per_file": 32000, - "ssd_cache_dir": "./ssd_cache1/", - "ssd_cache_iouring_entries": 512, - 
"ssd_cache_iouring_flags": 1, - "remote_cache_size_mode": "file_size", - "remote_file_size": null, - "remote_file_num": null, - "remote_file_prefix": null, - "remote_cache_path": null, - "remote_config_custom": null, - "enable_trace": false, - "trace_file_path": "./flexkv_trace.log", - "trace_max_file_size_mb": 100, - "trace_max_files": 5, - "trace_flush_interval_ms": 1000, - "evict_ratio": 0.05, - "index_accel": true - } -} diff --git a/benchmarks/example_config.yml b/benchmarks/example_config.yml new file mode 100644 index 0000000000..a59827c4b7 --- /dev/null +++ b/benchmarks/example_config.yml @@ -0,0 +1,13 @@ +num_layers: 64 +num_kv_heads: 8 +head_size: 128 +dtype: bfloat16 +use_mla: false +tp_size: 1 +dp_size: 1 +tokens_per_block: 16 + +cpu_cache_gb: 8 +ssd_cache_gb: 16 +ssd_cache_dir: ./ssd_cache1/;./ssd_cache2/ +enable_gds: false diff --git a/benchmarks/utils.py b/benchmarks/utils.py index cdde8bddcf..1ebabc402e 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -1,14 +1,15 @@ import asyncio -import json import random import time from dataclasses import dataclass, field from typing import Optional, List, Tuple, Any +import yaml import torch +import numpy as np from tqdm import tqdm -from flexkv.common.config import ModelConfig, CacheConfig +from flexkv.common.config import * from flexkv.common.storage import KVCacheLayoutType @@ -17,9 +18,9 @@ class KVRequest: user_id: int turn_id: int request_type: str # "get" or "put" - token_ids: torch.Tensor - token_mask: torch.Tensor - slot_mapping: Optional[torch.Tensor] = None + token_ids: np.ndarray + token_mask: np.ndarray + slot_mapping: Optional[np.ndarray] = None request_id: int = field(init=False) _request_id_counter: int = field(init=False, default=0) @@ -28,6 +29,12 @@ def __post_init__(self): self.request_id = KVRequest._request_id_counter KVRequest._request_id_counter += 1 + if isinstance(self.token_ids, torch.Tensor): + self.token_ids = self.token_ids.numpy().astype(np.int64) + if 
isinstance(self.token_mask, torch.Tensor): + self.token_mask = self.token_mask.numpy().astype(np.int64) + if isinstance(self.slot_mapping, torch.Tensor): + self.slot_mapping = self.slot_mapping.numpy().astype(np.int64) def generate_random_multiturn(num_user_requests: int, num_turns: int, @@ -88,27 +95,32 @@ def generate_random_multiturn(num_user_requests: int, def load_config(config_path: str) -> Tuple[ModelConfig, CacheConfig]: with open(config_path) as f: - config = json.load(f) - if "ModelConfig" not in config: - print("ModelConfig not found in config, using default values") - config["ModelConfig"] = {} - if "CacheConfig" not in config: - print("CacheConfig not found in config, using default values") - config["CacheConfig"] = {} - if "dtype" in config["ModelConfig"]: - config["ModelConfig"]["dtype"] = eval(f"torch.{config['ModelConfig']['dtype']}") - if "gpu_kv_layout_type" in config["CacheConfig"]: - config["CacheConfig"]["gpu_kv_layout_type"] = \ - KVCacheLayoutType(config["CacheConfig"]["gpu_kv_layout_type"]) - if "cpu_kv_layout_type" in config["CacheConfig"]: - config["CacheConfig"]["cpu_kv_layout_type"] = \ - KVCacheLayoutType(config["CacheConfig"]["cpu_kv_layout_type"]) - if "ssd_kv_layout_type" in config["CacheConfig"]: - config["CacheConfig"]["ssd_kv_layout_type"] = \ - KVCacheLayoutType(config["CacheConfig"]["ssd_kv_layout_type"]) - if "remote_kv_layout_type" in config["CacheConfig"]: - config["CacheConfig"]["remote_kv_layout_type"] = \ - KVCacheLayoutType(config["CacheConfig"]["remote_kv_layout_type"]) - model_config = ModelConfig(**config["ModelConfig"]) - cache_config = CacheConfig(**config["CacheConfig"]) + config = yaml.load(f, Loader=yaml.SafeLoader) + print(config) + model_config = ModelConfig() + cache_config = CacheConfig() + user_config = UserConfig() + model_config.num_layers = config["num_layers"] + model_config.num_kv_heads = config["num_kv_heads"] + model_config.head_size = config["head_size"] + model_config.dtype = 
eval(f"torch.{config['dtype']}") + model_config.use_mla = config["use_mla"] + model_config.tp_size = config["tp_size"] + model_config.dp_size = config["dp_size"] + cache_config.tokens_per_block = config["tokens_per_block"] + + if "cpu_cache_gb" in config: + user_config.cpu_cache_gb = config["cpu_cache_gb"] + if "ssd_cache_gb" in config: + user_config.ssd_cache_gb = config["ssd_cache_gb"] + if "ssd_cache_dir" in config: + user_config.ssd_cache_dir = parse_path_list(config["ssd_cache_dir"]) + if "enable_gds" in config: + user_config.enable_gds = config["enable_gds"] + update_default_config_from_user_config(model_config, cache_config, user_config) return model_config, cache_config + +if __name__ == "__main__": + model_config, cache_config = load_config("./benchmarks/example_config.yml") + print(model_config) + print(cache_config) diff --git a/docs/dynamo_integration/README_en.md b/docs/dynamo_integration/README_en.md index f456004c3c..cb62e5cba5 100644 --- a/docs/dynamo_integration/README_en.md +++ b/docs/dynamo_integration/README_en.md @@ -32,7 +32,7 @@ cd FlexKV && ./build.sh ```bash # Navigate to vLLM directory -cd /opt/vllm +cd /opt/vllm # apply patch git apply /your/path/to/FlexKV/examples/vllm_adaption/vllm_0_10_1_1-flexkv-connector.patch ``` @@ -82,48 +82,29 @@ python -m dynamo.frontend --router-mode kv --http-port 8000 & # Define number of worker nodes NUM_WORKERS=4 -# When using multiple workers, ensure FlexKV ports are different to avoid hanging at flexkv init -# Adjust num_cpu_blocks and num_ssd_blocks values according to your server configuration -for i in $(seq 0 $((NUM_WORKERS-1))); do - cat < ./flexkv_config_${i}.json -{ - "enable_flexkv": true, - "server_recv_port": "ipc:///tmp/flexkv_${i}_test", - "cache_config": { - "enable_cpu": true, - "enable_ssd": false, - "enable_remote": false, - "enable_gds": false, - "enable_trace": false, - "ssd_cache_iouring_entries": 512, - "tokens_per_block": 64, - "num_cpu_blocks": 10240, - "num_ssd_blocks": 256000, - 
"ssd_cache_dir": "/data/flexkv_ssd/", - "evict_ratio": 0.05, - "index_accel": true - - }, - "num_log_interval_requests": 200 -} -EOF -done - +# Configure FlexKV using environment variables, disabling config file +unset FLEXKV_CONFIG_PATH +# Adjust CPU and SSD space sizes according to your server configuration +export FLEXKV_CPU_CACHE_GB=32 +export FLEXKV_SSD_CACHE_GB=128 +export FLEXKV_SSD_CACHE_DIR="/data/flexkv_ssd/" # Use a loop to start worker nodes for i in $(seq 0 $((NUM_WORKERS-1))); do # Calculate GPU device IDs GPU_START=$((i*2)) GPU_END=$((i*2+1)) - + if [ $i -lt $((NUM_WORKERS-1)) ]; then - FLEXKV_CONFIG_PATH="./flexkv_config_${i}.json" CUDA_VISIBLE_DEVICES=${GPU_START},${GPU_END} python3 -m dynamo.vllm --model deepseek-ai/DeepSeek-R1-Distill-Llama-70B --tensor_parallel_size 2 --block-size 64 --gpu-memory-utilization 0.9 --max-model-len 100310 & + # When using multiple workers, ensure FlexKV ports are different to avoid hanging at flexkv init + # Set FlexKV port via the `FLEXKV_SERVER_RECV_PORT` environment variable + FLEXKV_SERVER_RECV_PORT="ipc:///tmp/flexkv_server_${i}" CUDA_VISIBLE_DEVICES=${GPU_START},${GPU_END} python3 -m dynamo.vllm --model deepseek-ai/DeepSeek-R1-Distill-Llama-70B --tensor_parallel_size 2 --block-size 64 --gpu-memory-utilization 0.9 --max-model-len 100310 & else - FLEXKV_CONFIG_PATH="./flexkv_config_${i}.json" CUDA_VISIBLE_DEVICES=${GPU_START},${GPU_END} python3 -m dynamo.vllm --model deepseek-ai/DeepSeek-R1-Distill-Llama-70B --tensor_parallel_size 2 --block-size 64 --gpu-memory-utilization 0.9 --max-model-len 100310 + FLEXKV_SERVER_RECV_PORT="ipc:///tmp/flexkv_server_${i}" CUDA_VISIBLE_DEVICES=${GPU_START},${GPU_END} python3 -m dynamo.vllm --model deepseek-ai/DeepSeek-R1-Distill-Llama-70B --tensor_parallel_size 2 --block-size 64 --gpu-memory-utilization 0.9 --max-model-len 100310 fi done ``` -> Note: The `flexkv_config.json` configuration is provided as a simple example only. 
For full parameter options, please refer to [`docs/flexkv_config_reference/README_en.md`](../../docs/flexkv_config_reference/README_en.md) +> Note: You can configure using YAML or JSON files. The above configuration is provided as a simple example only. For full parameter options, please refer to [`docs/flexkv_config_reference/README_en.md`](../../docs/flexkv_config_reference/README_en.md) ### Verification @@ -152,4 +133,4 @@ genai-perf can send requests according to the timestamps in the trace file and c ```bash genai-perf profile --model deepseek-ai/DeepSeek-R1-Distill-Llama-70B --tokenizer deepseek-ai/DeepSeek-R1-Distill-Llama-70B --endpoint-type chat --endpoint /v1/chat/completions --streaming --url http://localhost:8000 --input-file payload:mooncake_trace_1_6.jsonl --random-seed 100 -v -H 'Authorization: Bearer NOT USED' -H 'Accept: text/event-stream' -- --stability-percentage 99 -``` \ No newline at end of file +``` diff --git a/docs/dynamo_integration/README_zh.md b/docs/dynamo_integration/README_zh.md index 53a1b49f93..70349fa6c0 100644 --- a/docs/dynamo_integration/README_zh.md +++ b/docs/dynamo_integration/README_zh.md @@ -32,7 +32,7 @@ cd FlexKV && ./build.sh ```bash # 进入 vLLM 目录 -cd /opt/vllm +cd /opt/vllm # apply patch git apply /your/path/to/FlexKV/examples/vllm_adaption/vllm_0_10_1_1-flexkv-connector.patch ``` @@ -83,48 +83,29 @@ python -m dynamo.frontend --router-mode kv --http-port 8000 & # 定义工作节点数量 NUM_WORKERS=4 -# 多个worker时注意FlexKV的端口应不同,否则会卡在flexkv init这一步 -# 请根据服务器的配置,调整num_cpu_blocks和num_ssd_blocks的数值 -for i in $(seq 0 $((NUM_WORKERS-1))); do - cat < ./flexkv_config_${i}.json -{ - "enable_flexkv": true, - "server_recv_port": "ipc:///tmp/flexkv_${i}_test", - "cache_config": { - "enable_cpu": true, - "enable_ssd": false, - "enable_remote": false, - "enable_gds": false, - "enable_trace": false, - "ssd_cache_iouring_entries": 512, - "tokens_per_block": 64, - "num_cpu_blocks": 10240, - "num_ssd_blocks": 256000, - "ssd_cache_dir": 
"/data/flexkv_ssd/", - "evict_ratio": 0.05, - "index_accel": true - - }, - "num_log_interval_requests": 200 -} -EOF -done - +# 使用环境变量配置Flexkv,禁用配置文件 +unset FLEXKV_CONFIG_PATH +# 请根据服务器的配置,调整CPU和SSD的空间大小 +export FLEXKV_CPU_CACHE_GB=32 +export FLEXKV_SSD_CACHE_GB=128 +export FLEXKV_SSD_CACHE_DIR="/data/flexkv_ssd/" # 使用for循环启动工作节点 for i in $(seq 0 $((NUM_WORKERS-1))); do # 计算GPU设备ID GPU_START=$((i*2)) GPU_END=$((i*2+1)) - + if [ $i -lt $((NUM_WORKERS-1)) ]; then - FLEXKV_CONFIG_PATH="./flexkv_config_${i}.json" CUDA_VISIBLE_DEVICES=${GPU_START},${GPU_END} python3 -m dynamo.vllm --model deepseek-ai/DeepSeek-R1-Distill-Llama-70B --tensor_parallel_size 2 --block-size 64 --gpu-memory-utilization 0.9 --max-model-len 100310 & + # 多个worker时注意Flexkv的端口应不同,否则会卡在flexkv init这一步 + # 通过环境变量 `FLEXKV_SERVER_RECV_PORT` 设置Flexkv的端口 + FLEXKV_SERVER_RECV_PORT="ipc:///tmp/flexkv_server_${i}" CUDA_VISIBLE_DEVICES=${GPU_START},${GPU_END} python3 -m dynamo.vllm --model deepseek-ai/DeepSeek-R1-Distill-Llama-70B --tensor_parallel_size 2 --block-size 64 --gpu-memory-utilization 0.9 --max-model-len 100310 & else - FLEXKV_CONFIG_PATH="./flexkv_config_${i}.json" CUDA_VISIBLE_DEVICES=${GPU_START},${GPU_END} python3 -m dynamo.vllm --model deepseek-ai/DeepSeek-R1-Distill-Llama-70B --tensor_parallel_size 2 --block-size 64 --gpu-memory-utilization 0.9 --max-model-len 100310 + FLEXKV_SERVER_RECV_PORT="ipc:///tmp/flexkv_server_${i}" CUDA_VISIBLE_DEVICES=${GPU_START},${GPU_END} python3 -m dynamo.vllm --model deepseek-ai/DeepSeek-R1-Distill-Llama-70B --tensor_parallel_size 2 --block-size 64 --gpu-memory-utilization 0.9 --max-model-len 100310 fi done ``` -> 注:`flexkv_config.json`配置仅为简单示例,选项请参考[`docs/flexkv_config_reference/README_zh.md`](../../docs/flexkv_config_reference/README_zh.md) +> 注:可使用 YAML 或 JSON 文件配置,上述配置仅为简单示例,更多选项请参考[`docs/flexkv_config_reference/README_zh.md`](../../docs/flexkv_config_reference/README_zh.md) ### 验证 @@ -152,4 +133,4 @@ genai-perf可以根据trace文件里的时间戳来发送请求,统计LLM服 ```bash genai-perf 
profile --model deepseek-ai/DeepSeek-R1-Distill-Llama-70B --tokenizer deepseek-ai/DeepSeek-R1-Distill-Llama-70B --endpoint-type chat --endpoint /v1/chat/completions --streaming --url http://localhost:8000 --input-file payload:mooncake_trace_1_6.jsonl --random-seed 100 -v -H 'Authorization: Bearer NOT USED' -H 'Accept: text/event-stream' -- --stability-percentage 99 -``` \ No newline at end of file +``` diff --git a/docs/flexkv_config_reference/README_en.md b/docs/flexkv_config_reference/README_en.md index c20116dead..a416656dfc 100644 --- a/docs/flexkv_config_reference/README_en.md +++ b/docs/flexkv_config_reference/README_en.md @@ -1,147 +1,160 @@ # FlexKV Configuration Guide -This guide explains how to configure and use the FlexKV online serving configuration file (`flexkv_config.json`), including the meaning of all parameters, recommended values, and typical usage scenarios. +This guide provides detailed instructions on how to configure and use FlexKV's online service configuration file (`flexkv_config.json`), covering the meaning of all parameters, recommended values, and typical usage scenarios. --- -## Recommended Configuration +## Basic Configuration Options -Below is a production-grade recommended configuration that balances performance and stability: +### 1. Configuration via Config File +If the `FLEXKV_CONFIG_PATH` environment variable is set, the configuration file specified by this variable will be used with priority. Both yml and json file formats are supported. 
+ +Below is a recommended configuration example that enables both CPU and SSD cache layers: + +YML configuration: +```yml +cpu_cache_gb: 32 +ssd_cache_gb: 1024 +ssd_cache_dir: /data/flexkv_ssd/ +enable_gds: false +``` +Or using JSON configuration: ```json { - "enable_flexkv": true, - "server_recv_port": "ipc:///tmp/flexkv_test", - "cache_config": { - "enable_cpu": true, - "enable_ssd": true, - "enable_remote": false, - "enable_gds": false, - "enable_trace": false, - "ssd_cache_iouring_entries": 512, - "tokens_per_block": 64, - "num_cpu_blocks": 233000, - "num_ssd_blocks": 4096000, - "ssd_cache_dir": "/data/flexkv_ssd/", - "evict_ratio": 0.05, - "index_accel": true - }, - "num_log_interval_requests": 2000 + "cpu_cache_gb": 32, + "ssd_cache_gb": 1024, + "ssd_cache_dir": "/data/flexkv_ssd/", + "enable_gds": false } ``` -- `num_cpu_blocks` and `num_ssd_blocks` represent the total number of blocks in CPU memory and SSD respectively. These values must be configured according to your machine specs and model size. See [Cache Capacity Configuration](#cache-capacity-config) for calculation details. -- `ssd_cache_dir` specifies the directory where SSD-stored KV cache files are saved. +- `cpu_cache_gb`: CPU cache layer capacity in GB, must not exceed physical memory. +- `ssd_cache_gb`: SSD cache layer capacity in GB. Recommended to be greater than `cpu_cache_gb` and a multiple of `FLEXKV_MAX_FILE_SIZE_GB`. Set to 0 if only using CPU cache (SSD cache will not be enabled). +- `ssd_cache_dir`: Directory where SSD cache data is stored. If multiple SSDs are available, separate multiple mount paths with semicolons `;`. For example, `ssd_cache_dir: /data0/flexkv_ssd/;/data1/flexkv_ssd/` to improve bandwidth. +- `enable_gds`: Whether to enable GPU Direct Storage (GDS). If hardware and drivers support it, enabling this can improve SSD to GPU data throughput. Disabled by default. --- -## Configuration File Structure Overview +### 2. 
Configuration via Environment Variables -The FlexKV configuration file is a JSON file, primarily consisting of three parts: +If the `FLEXKV_CONFIG_PATH` environment variable is not set, configuration can be done through the following environment variables. -- `enable_flexkv`: Whether to enable FlexKV (must be set to `true` to take effect). -- `server_recv_port`: The IPC port on which the FlexKV service listens. -- `cache_config`: The core cache configuration object, containing all cache behavior parameters. -- `num_log_interval_requests`: Log statistics interval (outputs performance log every N requests). +> Note: If `FLEXKV_CONFIG_PATH` is set, the configuration file specified by `FLEXKV_CONFIG_PATH` will take priority, and the following environment variables will be ignored. ---- +| Environment Variable | Type | Default | Description | +|----------------------|------|---------|-------------| +| `FLEXKV_CPU_CACHE_GB` | int | 16 | CPU cache layer capacity in GB, must not exceed physical memory | +| `FLEXKV_SSD_CACHE_GB` | int | 0 | SSD cache layer capacity in GB. Recommended to be greater than `FLEXKV_CPU_CACHE_GB` and a multiple of `FLEXKV_MAX_FILE_SIZE_GB`. Set to 0 if only using CPU cache (SSD cache will not be enabled) | +| `FLEXKV_SSD_CACHE_DIR` | str | "./flexkv_ssd" | Directory where SSD cache data is stored. If multiple SSDs are available, separate multiple mount paths with semicolons `;`. For example, `"/data0/flexkv_ssd/;/data1/flexkv_ssd/"` to improve bandwidth | +| `FLEXKV_ENABLE_GDS` | bool | 0 | Whether to enable GPU Direct Storage (GDS). If hardware and drivers support it, enabling this can improve SSD to GPU data throughput. 
Disabled by default, set to 1 to enable | -## Complete `cache_config` Parameter Reference (from [`flexkv/common/config.py`](../../flexkv/common/config.py)) +--- -### Basic Configuration +## Advanced Configuration Options +Advanced configuration is mainly for users who need fine-tuned performance optimization or custom special requirements. It is recommended for users with some understanding of FlexKV. +All advanced configurations support configuration via environment variables or yml/json configuration files. In case of conflicts with multiple configuration levels, the final priority order is: **Configuration file > Environment variables > Built-in default parameters**. +If setting in a configuration file, remove the `FLEXKV_` prefix and convert everything to lowercase. For example, setting `server_client_mode: 1` in a yml file will override the value of the `FLEXKV_SERVER_CLIENT_MODE` environment variable. +Some configurations can only be set through environment variables. -| Parameter Name | Type | Default | Description | -|----------------|------|---------|-------------| -| `tokens_per_block` | int | 16 | Number of tokens per KV block. Must match the `block_size` used in the acceleration framework (e.g., vLLM). | -| `enable_cpu` | bool | true | Whether to enable CPU memory as a cache layer. Strongly recommended to enable. | -| `enable_ssd` | bool | false | Whether to enable SSD as a cache layer. Recommended if NVMe SSD is available. | -| `enable_remote` | bool | false | Whether to enable remote cache (e.g., scalable cloud storage). Requires remote cache engine and custom implementation. | -| `enable_gds` | bool | false | Whether to use GPU Direct Storage (GDS) to accelerate SSD I/O. Not currently supported. | -| `index_accel` | bool | false | Whether to enable C++ RadixTree. Recommended to enable. 
| +### Enable/Disable FLEXKV ---- +> Note: This configuration can only be set through environment variables -### KV Cache Layout Types (Generally No Need to Modify) +| Environment Variable | Type | Default | Description | +|---------------------|------|---------|-------------| +| `ENABLE_FLEXKV` | bool | 1 | 0-Disable FLEXKV, 1-Enable FLEXKV | -| Parameter Name | Type | Default | Description | -|----------------|------|---------|-------------| -| `gpu_kv_layout_type` | enum | LAYERWISE | Organization of KV cache on GPU (layer-wise or block-wise). Must match vLLM’s layout (currently `LAYERWISE`). | -| `cpu_kv_layout_type` | enum | BLOCKWISE | Organization on CPU. Recommended to use `BLOCKWISE`. Does not need to match vLLM. | -| `ssd_kv_layout_type` | enum | BLOCKWISE | Organization on SSD. Recommended to use `BLOCKWISE`. Does not need to match vLLM. | -| `remote_kv_layout_type` | enum | BLOCKWISE | Organization for remote cache. Must be defined according to remote backend’s layout. | -> Note: Do not modify layout types unless you have specific performance requirements. --- -### Cache Capacity Configuration +### Server Mode Configuration + +| Environment Variable | Type | Default | Description | +|---------------------|------|---------|-------------| +| `FLEXKV_SERVER_CLIENT_MODE` | bool | 0 | `server_client_mode`: Whether to force enable server-client mode | +| `FLEXKV_SERVER_RECV_PORT` | str | "ipc:///tmp/flexkv_server" | `server_recv_port`: Server receive port configuration | -| Parameter Name | Type | Default | Description | -|----------------|------|---------|-------------| -| `num_cpu_blocks` | int | 1000000 | Number of blocks allocated in CPU memory. Adjust based on available RAM. | -| `num_ssd_blocks` | int | 10000000 | Number of blocks allocated on SSD. | -| `num_remote_blocks` | int \| None | None | Number of blocks allocated in remote cache. | +--- -> Note: Block size in all cache levels (CPU/SSD/Remote) matches the GPU block size. 
Estimate cache capacities based on GPU KV cache memory usage and block count. +### KV Cache Layout Types -> Note: `block_size = num_layer * _kv_dim * tokens_per_block * num_head * head_size * dtype_size`. +| Environment Variable | Type | Default | Description | +|---------------------|------|---------|-------------| +| `FLEXKV_CPU_LAYOUT` | str | BLOCKFIRST | CPU storage layout, options: `LAYERFIRST` and `BLOCKFIRST`, recommended to use `BLOCKFIRST` | +| `FLEXKV_SSD_LAYOUT` | str | BLOCKFIRST | SSD storage layout, options: `LAYERFIRST` and `BLOCKFIRST`, recommended to use `BLOCKFIRST` | +| `FLEXKV_REMOTE_LAYOUT` | str | BLOCKFIRST | REMOTE storage layout, options: `LAYERFIRST` and `BLOCKFIRST`, recommended to use `BLOCKFIRST` | +| `FLEXKV_GDS_LAYOUT` | str | BLOCKFIRST | GDS storage layout, options: `LAYERFIRST` and `BLOCKFIRST`, recommended to use `BLOCKFIRST` | --- ### CPU-GPU Transfer Optimization -| Parameter Name | Type | Default | Description | -|----------------|------|---------|-------------| -| `use_ce_transfer_h2d` | bool | false | Whether to use CUDA Copy Engine for Host→Device transfers. Reduces SM usage but may slightly reduce bandwidth. Real-world difference is minimal. | -| `use_ce_transfer_d2h` | bool | false | Whether to use CUDA Copy Engine for Device→Host transfers. | -| `transfer_sms_h2d` | int | 8 | Number of SMs (Streaming Multiprocessors) allocated for H2D transfers. | -| `transfer_sms_d2h` | int | 8 | Number of SMs allocated for D2H transfers. | +| Environment Variable | Type | Default | Description | +|---------------------|------|---------|-------------| +| `FLEXKV_USE_CE_TRANSFER_H2D` | bool | 0 | Whether to use cudaMemcpyAsync for Host→Device transfers. Can avoid occupying SM, but transfer speed will be reduced | +| `FLEXKV_USE_CE_TRANSFER_D2H` | bool | 0 | Whether to use cudaMemcpyAsync for Device→Host transfers. 
Can avoid occupying SM, but transfer speed will be reduced | +| `FLEXKV_TRANSFER_SMS_H2D` | int | 8 | Number of streaming multiprocessors used for H2D transfer, only effective when `FLEXKV_USE_CE_TRANSFER_H2D` is 0 | +| `FLEXKV_TRANSFER_SMS_D2H` | int | 8 | Number of streaming multiprocessors used for D2H transfer, only effective when `FLEXKV_USE_CE_TRANSFER_D2H` is 0 | --- -### SSD Cache Configuration +### SSD I/O Optimization -| Parameter Name | Type | Default | Description | -|----------------|------|---------|-------------| -| `max_blocks_per_file` | int | 32000 | Maximum number of blocks per SSD file. `-1` means unlimited. | -| `ssd_cache_dir` | str \| List[str] | None | **Required.** Path to SSD cache directory, e.g., `"/data/flexkv_ssd/"`. | -| `ssd_cache_iouring_entries` | int | 0 | io_uring queue depth. Recommended: `512` for significantly improved concurrent I/O performance. | -| `ssd_cache_iouring_flags` | int | 0 | io_uring flags. Recommended: `1`.| +> Note: Setting `iouring_entries` to 0 disables iouring. Not recommended to set to 0. + +| Environment Variable | Type | Default | Description | +|---------------------|------|---------|-------------| +| `FLEXKV_MAX_FILE_SIZE_GB` | float | -1 | Maximum size of a single SSD file, -1 means unlimited | +| `FLEXKV_IORING_ENTRIES` | int | 512 | io_uring queue depth. Recommended to set to `512` to improve concurrent I/O performance | +| `FLEXKV_IORING_FLAGS` | int | 0 | io_uring flags, default is 0 | -> Note: To maximize bandwidth across multiple SSDs, bind each SSD to a separate directory and specify them as a list: -> `"ssd_cache_dir": ["/data0/flexkv_ssd/", "/data1/flexkv_ssd/"]`. -> KV blocks will be evenly distributed across all SSDs. -> Note: Setting `ssd_cache_iouring_entries` to `0` disables io_uring. Not recommended. 
--- -### Remote Cache Configuration (Skip if not enabled) +### Multi-Node TP + +> Note: These configurations can only be set through environment variables + +| Environment Variable | Type | Default | Description | +|---------------------|------|---------|-------------| +| `FLEXKV_MASTER_HOST` | str | "localhost" | Master node IP for multi-node TP | +| `FLEXKV_MASTER_PORTS` | str | "5556,5557,5558" | Master node ports for multi-node TP. Uses three ports, separated by commas | -| Parameter Name | Type | Default | Description | -|----------------|------|---------|-------------| -| `remote_cache_size_mode` | str | "file_size" | Allocate remote cache space by file size or block count. | -| `remote_file_size` | int \| None | None | Size (in bytes) of each remote file. | -| `remote_file_num` | int \| None | None | Number of remote files. | -| `remote_file_prefix` | str \| None | None | Prefix for remote file names. | -| `remote_cache_path` | str \| List[str] | None | Remote cache path (e.g., Redis URL, S3 path). | -| `remote_config_custom` | dict \| None | None | Custom remote cache configurations (e.g., timeout, authentication). | --- -### Tracing and Logging +### Logging Configuration + +> Note: These configurations can only be set through environment variables + +| Environment Variable | Type | Default | Description | +|---------------------|------|---------|-------------| +| `FLEXKV_LOGGING_PREFIX` | str | "FLEXKV" | Logging prefix | +| `FLEXKV_LOG_LEVEL` | str | "INFO" | Log output level, options: "DEBUG" "INFO" "WARNING" "ERROR" "CRITICAL" "OFF" | +| `FLEXKV_NUM_LOG_INTERVAL_REQUESTS` | int | 200 | Log output interval request count | + + + +--- + +### Tracing and Debugging + +| Environment Variable | Type | Default | Description | +|---------------------|------|---------|-------------| +| `FLEXKV_ENABLE_TRACE` | bool | 0 | Whether to enable performance tracing. 
Recommended to disable (`0`) in production to reduce overhead | +| `FLEXKV_TRACE_FILE_PATH` | str | "./flexkv_trace.log" | Trace log file path | +| `FLEXKV_TRACE_MAX_FILE_SIZE_MB` | int | 100 | Maximum size (MB) per trace log file | +| `FLEXKV_TRACE_MAX_FILES` | int | 5 | Maximum number of trace log files to retain | +| `FLEXKV_TRACE_FLUSH_INTERVAL_MS` | int | 1000 | Trace log flush interval (milliseconds) | -| Parameter Name | Type | Default | Description | -|----------------|------|---------|-------------| -| `enable_trace` | bool | true | Whether to enable performance tracing. Disable (`false`) in production to reduce overhead. | -| `trace_file_path` | str | "./flexkv_trace.log" | Path to trace log file. | -| `trace_max_file_size_mb` | int | 100 | Maximum size (MB) per trace log file. | -| `trace_max_files` | int | 5 | Maximum number of trace log files to retain. | -| `trace_flush_interval_ms` | int | 1000 | Trace log flush interval (milliseconds). | --- -### Cache Eviction Policy +### Control Plane Optimization -| Parameter Name | Type | Default | Description | -|----------------|------|---------|-------------| -| `evict_ratio` | float | 0.0 | Ratio of blocks to proactively evict from CPU/SSD per eviction cycle. `0.0` = evict only the minimal necessary blocks (more eviction cycles may impact performance). Recommended: `0.05` (evict 5% of least recently used blocks per cycle). | +| Environment Variable | Type | Default | Description | +|---------------------|------|---------|-------------| +| `FLEXKV_INDEX_ACCEL` | bool | 1 | 0-Enable Python version RadixTree implementation, 1-Enable C++ version RadixTree implementation | +| `FLEXKV_EVICT_RATIO` | float | 0.05 | CPU and SSD eviction ratio for proactive eviction per cycle (0.0 = only evict the minimal necessary blocks). 
Recommended to keep at `0.05`, i.e., evict 5% of least recently used blocks per cycle | diff --git a/docs/flexkv_config_reference/README_zh.md b/docs/flexkv_config_reference/README_zh.md index d1f7a3a279..34821c5dd4 100644 --- a/docs/flexkv_config_reference/README_zh.md +++ b/docs/flexkv_config_reference/README_zh.md @@ -4,142 +4,152 @@ --- -## 推荐配置方案 +## 基础配置选项 -以下是一个兼顾性能与稳定性的生产级推荐配置: +### 一、通过文件配置 +如果设置了环境变量 `FLEXKV_CONFIG_PATH`,将优先使用该变量指定的配置文件。支持yml和json两种文件类型。 + +以下是一个同时开启 CPU 和 SSD 缓存层的推荐配置示例: + +yml配置: +```yml +cpu_cache_gb: 32 +ssd_cache_gb: 1024 +ssd_cache_dir: /data/flexkv_ssd/ +enable_gds: false +``` +或使用json配置: ```json { - "enable_flexkv": true, - "server_recv_port": "ipc:///tmp/flexkv_test", - "cache_config": { - "enable_cpu": true, - "enable_ssd": true, - "enable_remote": false, - "enable_gds": false, - "enable_trace": false, - "ssd_cache_iouring_entries": 512, - "tokens_per_block": 64, - "num_cpu_blocks": 233000, - "num_ssd_blocks": 4096000, - "ssd_cache_dir": "/data/flexkv_ssd/", - "evict_ratio": 0.05, - "index_accel": true - }, - "num_log_interval_requests": 2000 + "cpu_cache_gb": 32, + "ssd_cache_gb": 1024, + "ssd_cache_dir": "/data/flexkv_ssd/", + "enable_gds": false } ``` -- 其中的`num_cpu_blocks`和`num_ssd_blocks`分别代表内存和SSD中block的总数量,需要根据实际机器配置和模型来配置,具体计算方式见下文[缓存容量配置](#cache-capacity-config) -- `ssd_cache_dir`为ssd中KVCache存放的文件目录 +- `cpu_cache_gb`:CPU 缓存层容量,单位为 GB,不能超过物理内存。 +- `ssd_cache_gb`:SSD 缓存层容量,单位为 GB。建议大于 `cpu_cache_gb`并为`FLEXKV_MAX_FILE_SIZE_GB`的整数倍,若仅用CPU缓存则设为 0(此时不启用 SSD 缓存)。 +- `ssd_cache_dir`:SSD 缓存数据的存放目录。若有多块 SSD,可通过分号 `;` 分隔多个挂载路径。例如 `ssd_cache_dir: /data0/flexkv_ssd/;/data1/flexkv_ssd/`,以提升带宽。 +- `enable_gds`:是否启用 GPU Direct Storage(GDS)。如硬件和驱动支持,开启后可提升 SSD 到 GPU 的数据吞吐能力。默认关闭。 --- -## 配置文件结构概览 +### 二、通过环境变量配置 -FlexKV 的配置文件是一个 JSON 文件,主要包含三个部分: +如果未设置 `FLEXKV_CONFIG_PATH`环境变量,则可通过以下环境变量进行配置。 -- `enable_flexkv`: 是否启用 FlexKV 功能(必须设为 `true` 才生效) -- `server_recv_port`: FlexKV 服务监听的 IPC 端口 -- `cache_config`: 核心缓存配置对象,包含所有缓存行为参数 -- 
`num_log_interval_requests`: 日志统计间隔(每处理 N 个请求输出一次性能日志) +> 注:如果设置了`FLEXKV_CONFIG_PATH`,将优先使用`FLEXKV_CONFIG_PATH`指定的配置文件,以下环境变量将被忽略。 + +| 环境变量 | 类型 | 默认值 | 说明 | +|----------------------|-------|-------------|----------------------------------------------------------------------------------------------------------------| +| `FLEXKV_CPU_CACHE_GB` | int | 16 | CPU 缓存层容量,单位为 GB,不能超过物理内存 +| `FLEXKV_SSD_CACHE_GB` | int | 0 | SSD 缓存层容量,单位为 GB。建议设置大于 `FLEXKV_CPU_CACHE_GB`并为`FLEXKV_MAX_FILE_SIZE_GB`的整数倍,若仅用CPU缓存则设为 0(此时不启用 SSD 缓存) | +| `FLEXKV_SSD_CACHE_DIR` | str | "./flexkv_ssd" | SSD 缓存数据的存放目录。若有多块 SSD,可通过分号 `;` 分隔多个挂载路径。例如 `"/data0/flexkv_ssd/;/data1/flexkv_ssd/"`,以提升带宽 | +| `FLEXKV_ENABLE_GDS` | bool | 0 | 是否启用 GPU Direct Storage(GDS)。如硬件和驱动支持,开启后可提升 SSD 到 GPU 的数据吞吐能力。默认关闭,开启请设为 1 | --- -## cache_config完整参数详解(来自 [`flexkv/common/config.py`](../../flexkv/common/config.py)) +## 高级配置选项 +高级配置主要针对需要精细化性能优化或自定义特殊需求的用户,建议对 FlexKV 具备一定理解的用户使用。 +所有高级配置均支持通过环境变量或 yml/json 配置文件进行设置,如有多级配置冲突,最终生效顺序为:**配置文件 > 环境变量 > 默认内置参数**。 +如果在配置文件中设置,请去除`FLEXKV_`前缀并全部转换为小写,例如在yml文件中设置`server_client_mode: 1`将会覆盖`FLEXKV_SERVER_CLIENT_MODE`环境变量的值。 +部分配置只能通过环境变量设置。 + +### 启用/禁用FLEXKV -### 基础配置 +> 注:该配置只能通过环境变量设置 -| 参数名 | 类型 | 默认值 | 说明 | +| 环境变量 | 类型 | 默认值 | 说明 | |--------|------|--------|------| -| `tokens_per_block` | int | 16 | 每个 KV Block 包含的 token 数量。需要与加速框架(如vLLM)中`block_size`保持一致 | -| `enable_cpu` | bool | true | 是否启用 CPU 内存作为缓存层。强烈建议开启。 | -| `enable_ssd` | bool | false | 是否启用 SSD 作为缓存层。如配备 NVMe SSD,建议开启。 | -| `enable_remote` | bool | false | 是否启用远程缓存(如可扩展云存储等)。需要配合远程缓存和自定义的远程缓存引擎使用 | -| `enable_gds` | bool | false | 是否使用 GPU Direct Storage(GDS)加速 SSD 读写。目前暂不支持。 | -| `index_accel` | bool | false | 是否启用C++ RadixTree。推荐开启。 | +| `ENABLE_FLEXKV` | bool | 1 | 0-禁用FLEXKV,1-启用FLEXKV | --- -### KV 缓存布局类型(一般无需修改) +### 服务器模式配置 -| 参数名 | 类型 | 默认值 | 说明 | +| 环境变量 | 类型 | 默认值 | 说明 | |--------|------|--------|------| -| `gpu_kv_layout_type` | enum | LAYERWISE | GPU 上 KV Cache 
的组织方式(按层或按块)。目前vLLM在GPU组织方式为`LAYERWISE`,因此FlexKV的`gpu_kv_layout_type`须与vLLM保持一致 | -| `cpu_kv_layout_type` | enum | BLOCKWISE | CPU 上按块组织, 推荐使用`BLOCKWISE`,不需要与vLLM保持一致 | -| `ssd_kv_layout_type` | enum | BLOCKWISE | SSD 上按块组织, 推荐使用`BLOCKWISE`,不需要与vLLM保持一致 | -| `remote_kv_layout_type` | enum | BLOCKWISE | 远程缓存按块组织, 需要按照remote组织形式定义 | - -> 注:除非有特殊性能需求,否则不建议修改布局类型。 +| `FLEXKV_SERVER_CLIENT_MODE` | bool | 0 | `server_client_mode`: 是否强制启用服务器-客户端模式 | +| `FLEXKV_SERVER_RECV_PORT` | str | "ipc:///tmp/flexkv_server" | `server_recv_port`: 服务器接收端口配置 | --- -### 缓存容量配置 +### KV 缓存布局类型 -| 参数名 | 类型 | 默认值 | 说明 | +| 环境变量 | 类型 | 默认值 | 说明 | |--------|------|--------|------| -| `num_cpu_blocks` | int | 1000000 | CPU 缓存块数。根据内存大小调整。| -| `num_ssd_blocks` | int | 10000000 | SSD 缓存块数。| -| `num_remote_blocks` | int \| None | None | 远程缓存块数。| - -> 注:FlexKV里的各级缓存的block大小与GPU中的block大小保持一致,可以参考GPU的KVCache显存大小与block数量估算各级缓存中的block数量。 - -> 注:block_size = num_layer * _kv_dim * tokens_per_block * num_head * self.head_size * torch_dtype.size()。 +| `FLEXKV_CPU_LAYOUT` | str | BLOCKFIRST | CPU 存储布局,可选`LAYERFIRST`和`BLOCKFIRST`, 推荐使用`BLOCKFIRST` | +| `FLEXKV_SSD_LAYOUT` | str | BLOCKFIRST | SSD 存储布局,可选`LAYERFIRST`和`BLOCKFIRST`, 推荐使用`BLOCKFIRST` | +| `FLEXKV_REMOTE_LAYOUT` | str | BLOCKFIRST | REMOTE 存储布局,可选`LAYERFIRST`和`BLOCKFIRST`, 推荐使用`BLOCKFIRST` | +| `FLEXKV_GDS_LAYOUT` | str | BLOCKFIRST | GDS 存储布局,可选`LAYERFIRST`和`BLOCKFIRST`, 推荐使用`BLOCKFIRST` | --- ### CPU-GPU 传输优化 -| 参数名 | 类型 | 默认值 | 说明 | +| 环境变量 | 类型 | 默认值 | 说明 | |--------|------|--------|------| -| `use_ce_transfer_h2d` | bool | false | 是否使用 cuda copy engine 优化 Host→Device 传输,使用CE可以减少GPU SM在传输上的使用,但是传输速度会降低,实际测试差距不大 | -| `use_ce_transfer_d2h` | bool | false | 是否使用 cuda copy engine 优化 Device→Host 传输 | -| `transfer_sms_h2d` | int | 8 | H2D 传输使用的流处理器数量 | -| `transfer_sms_d2h` | int | 8 | D2H 传输使用的流处理器数量 | +| `FLEXKV_USE_CE_TRANSFER_H2D` | bool | 0 | 是否使用 cudaMemcpyAsync 实现 Host→Device 传输,可以避免占用 SM,但是传输速度会降低 | +| `FLEXKV_USE_CE_TRANSFER_D2H` | bool | 0 
| 是否使用 cudaMemcpyAsync 实现 Device→Host 传输,可以避免占用 SM,但是传输速度会降低 | +| `FLEXKV_TRANSFER_SMS_H2D` | int | 8 | H2D 传输使用的流处理器数量,仅在`FLEXKV_USE_CE_TRANSFER_H2D`为0时生效 | +| `FLEXKV_TRANSFER_SMS_D2H` | int | 8 | D2H 传输使用的流处理器数量,仅在`FLEXKV_USE_CE_TRANSFER_D2H`为0时生效 | --- -### SSD 缓存配置 +### SSD I/O优化 + +> 注:`iouring_entries`设置为0即禁用iouring,不推荐设置为0。 -| 参数名 | 类型 | 默认值 | 说明 | +| 环境变量 | 类型 | 默认值 | 说明 | |--------|------|--------|------| -| `max_blocks_per_file` | int | 32000 | 单个 SSD 文件最多包含的 block 数。-1 表示无限制 | -| `ssd_cache_dir` | str \| List[str] | None | SSD 缓存目录路径,**必须设置**,如 `"/data/flexkv_ssd/"` | -| `ssd_cache_iouring_entries` | int | 0 | io_uring 队列深度,推荐设为 `512` 以提升并发 IO 性能,实测比不使用iouring提升较大,推荐使用512 | -| `ssd_cache_iouring_flags` | int | 1 | io_uring 标志位,推荐设置为 1。| +| `FLEXKV_MAX_FILE_SIZE_GB` | float | -1 | 单个 SSD 文件的最大大小,-1表示不限 | +| `FLEXKV_IORING_ENTRIES` | int | 512 | io_uring 队列深度,推荐设为 `512` 以提升并发 IO 性能 | +| `FLEXKV_IORING_FLAGS` | int | 0 | io_uring 标志位,默认为 0| -> 注:为了充分利用多块SSD的带宽上限,可以将多块SSD绑定至不同目录,并使用如 `"ssd cache dir": ["/data0/flexkv_ssd/", "/data1/flexkv_ssd/"]`方式初始化,SSD KVCache会均匀分布在所有SSD中,充分利用多个SSD带宽。 -> 注:`ssd_cache_iouring_entries`设置为0即不适用iouring,不推荐设置为0 --- -### 远程缓存配置(不启用时无需配置) +### 多节点TP -| 参数名 | 类型 | 默认值 | 说明 | +> 注:这些配置只能通过环境变量设置 + +| 环境变量 | 类型 | 默认值 | 说明 | +|--------|------|--------|------| +| `FLEXKV_MASTER_HOST` | str | "localhost" | 多节点TP的主节点IP | +| `FLEXKV_MASTER_PORTS` | str | "5556,5557,5558" | 多节点TP的主节点端口。使用三个端口,用逗号分隔 | + +--- + +### 日志配置 + +> 注:这些配置只能通过环境变量设置 + +| 环境变量 | 类型 | 默认值 | 说明 | |--------|------|--------|------| -| `remote_cache_size_mode` | str | "file_size" | 按文件大小或块数分配远程缓存空间 | -| `remote_file_size` | int \| None | None | 单个远程文件大小(字节) | -| `remote_file_num` | int \| None | None | 远程文件数量 | -| `remote_file_prefix` | str \| None | None | 远程文件名前缀 | -| `remote_cache_path` | str \| List[str] | None | 远程缓存路径(如 Redis URL、S3 路径等) | -| `remote_config_custom` | dict \| None | None | 自定义远程缓存配置(如超时、认证等) | +| `FLEXKV_LOGGING_PREFIX` | str | "FLEXKV" | 日志前缀 | 
+| `FLEXKV_LOG_LEVEL` | str | "INFO" | 日志输出等级,可选:"DEBUG" "INFO" "WARNING" "ERROR" "CRITICAL" "OFF" | +| `FLEXKV_NUM_LOG_INTERVAL_REQUESTS` | int | 200 | 日志输出间隔请求数 | --- -### 追踪与日志 +### 追踪和调试 -| 参数名 | 类型 | 默认值 | 说明 | +| 环境变量 | 类型 | 默认值 | 说明 | |--------|------|--------|------| -| `enable_trace` | bool | true | 是否启用性能追踪。生产环境建议关闭(`false`)以减少开销 | -| `trace_file_path` | str | "./flexkv_trace.log" | 追踪日志路径 | -| `trace_max_file_size_mb` | int | 100 | 单个追踪文件最大大小(MB) | -| `trace_max_files` | int | 5 | 最多保留的追踪文件数 | -| `trace_flush_interval_ms` | int | 1000 | 追踪日志刷新间隔(毫秒) | +| `FLEXKV_ENABLE_TRACE` | bool | 0 | 是否启用性能追踪。生产环境建议关闭(`0`)以减少开销 | +| `FLEXKV_TRACE_FILE_PATH` | str | "./flexkv_trace.log" | 追踪日志路径 | +| `FLEXKV_TRACE_MAX_FILE_SIZE_MB` | int | 100 | 单个追踪文件最大大小(MB) | +| `FLEXKV_TRACE_MAX_FILES` | int | 5 | 最多保留的追踪文件数 | +| `FLEXKV_TRACE_FLUSH_INTERVAL_MS` | int | 1000 | 追踪日志刷新间隔(毫秒) | + --- -### 缓存淘汰策略 +### 控制面优化 -| 参数名 | 类型 | 默认值 | 说明 | +| 环境变量 | 类型 | 默认值 | 说明 | |--------|------|--------|------| -| `evict_ratio` | float | 0.0 | cpu,ssd一次evict主动淘汰比例(0.0 = 只淘汰最小的必要的block数量,较多的淘汰次数会影响性能)。建议保持 `0.05`,即每一次淘汰5%的最久未使用的block | +| `FLEXKV_INDEX_ACCEL` | bool | 1 | 0-启用Python版本RadixTree实现,1-启用C++版本RadixTree实现 | +| `FLEXKV_EVICT_RATIO` | float | 0.05 | cpu,ssd一次evict主动淘汰比例(0.0 = 只淘汰最小的必要的block数)。建议保持 `0.05`,即每一次淘汰5%的最久未使用的block | diff --git a/docs/gds/README_en.md b/docs/gds/README_en.md index e6dfa9e732..5bc4155fe0 100644 --- a/docs/gds/README_en.md +++ b/docs/gds/README_en.md @@ -109,17 +109,13 @@ docker run -itd \ ### 2.2 Configure FlexKV to Use GDS `export FLEXKV_ENABLE_GDS=1` to compile -Configuration example after compilation `config.json`: +Configuration example after compilation `config.yaml`: -```json -{ - "cache_config": { - "enable_ssd": False, - "enable_gds": True, - "num_gds_blocks": 10000000, - "gds_cache_dir": ["./gdstest"] - }, -} +```yaml +cpu_cache_gb: 32 +ssd_cache_gb: 1024 +ssd_cache_dir: /data/flexkv_ssd/ +enable_gds: true ``` --- diff --git 
a/docs/gds/README_zh.md b/docs/gds/README_zh.md index 2bd7a45950..50e57aa9b3 100644 --- a/docs/gds/README_zh.md +++ b/docs/gds/README_zh.md @@ -109,17 +109,13 @@ docker run -itd \ `export FLEXKV_ENABLE_GDS=1` 进行编译 -编译后config例子 `config.json`: +编译后config例子 `config.yaml`: -```json -{ - "cache_config": { - "enable_ssd": False, - "enable_gds": True, - "num_gds_blocks": 10000000, - "gds_cache_dir": ["./gdstest"] - }, -} +```yaml +cpu_cache_gb: 32 +ssd_cache_gb: 1024 +ssd_cache_dir: /data/flexkv_ssd/ +enable_gds: true ``` --- diff --git a/docs/vllm_adapter/README_en.md b/docs/vllm_adapter/README_en.md index 972cade803..c9ca1b7697 100644 --- a/docs/vllm_adapter/README_en.md +++ b/docs/vllm_adapter/README_en.md @@ -15,9 +15,32 @@ This change involves significant API adjustments. Therefore, please note: ### Supported Versions - FlexKV >= `1.0.0` -- vLLM versions >= `0.8.5` can generally follow this version for adaptation +- vLLM versions >= `0.8.5` can generally follow the example code for adaptation -### Example +### Configuration + +#### Example 1: CPU Offloading Only +Use 32GB of CPU memory as secondary cache. +```bash +unset FLEXKV_CONFIG_PATH +export FLEXKV_CPU_CACHE_GB=32 +``` +#### Example 2: SSD Offloading +Use 32GB of CPU memory and 1TB of SSD storage as secondary and tertiary cache respectively. (Assume the machine has two SSDs mounted at /data0 and /data1 respectively.) +```bash +# generate config +cat <<EOF > ./flexkv_config.yml +cpu_cache_gb: 32 +ssd_cache_gb: 1024 +ssd_cache_dir: /data0/flexkv_ssd/;/data1/flexkv_ssd/ +enable_gds: false +EOF +export FLEXKV_CONFIG_PATH="./flexkv_config.yml" +``` + +> Note: The `flexkv_config.yml` configuration is provided as a simple example only. For full parameter options, please refer to [`docs/flexkv_config_reference/README_en.md`](../../docs/flexkv_config_reference/README_en.md) + +### Running We provide an adaptation example based on **vLLM 0.10.1.1**: 1. 
apply patch @@ -34,19 +57,6 @@ python examples/offline_inference/prefix_caching_flexkv.py 3. online serving ```bash -# generate config -cat <<EOF > ./flexkv_config.json -{ - "server_recv_port": "ipc:///tmp/flexkv_test", - "cache_config": { - "enable_cpu": true, - "num_cpu_blocks": 10240, - }, - "num_log_interval_requests": 200 -} -EOF -export FLEXKV_CONFIG_PATH="./flexkv_config.json" - VLLM_USE_V1=1 python -m vllm.entrypoints.cli.main serve Qwen3/Qwen3-32B \ --tensor-parallel-size 8 \ --trust-remote-code \ @@ -63,14 +73,30 @@ VLLM_USE_V1=1 python -m vllm.entrypoints.cli.main serve Qwen3/Qwen3-32B \ ``` -> Note: The `flexkv_config.json` configuration is provided as a simple example only. For full parameter options, please refer to [`docs/flexkv_config_reference/README_en.md`](../../docs/flexkv_config_reference/README_en.md) - ## Legacy Version (<= 0.1.0) – Not Recommended for Current Use ### Supported Versions - FlexKV <= `0.1.0` -### Example +### Configuration + +Legacy version configuration: +```bash +# generate config +cat <<EOF > ./flexkv_config.json +{ + "server_recv_port": "ipc:///tmp/flexkv_test", + "cache_config": { + "enable_cpu": true, + "num_cpu_blocks": 10240 + }, + "num_log_interval_requests": 200 +} +EOF +export FLEXKV_CONFIG_PATH="./flexkv_config.json" +``` + +### Running Apply the patch `examples/vllm_adaption_legacy/flexkv_vllm_0_8_4.patch` to vLLM 0.8.4, then start FlexKV, vLLM, and the benchmark script: ```bash diff --git a/docs/vllm_adapter/README_zh.md b/docs/vllm_adapter/README_zh.md index bb9b51c292..52e3199755 100644 --- a/docs/vllm_adapter/README_zh.md +++ b/docs/vllm_adapter/README_zh.md @@ -16,7 +16,30 @@ - FlexKV >= `1.0.0` - vLLM 原则上>= `0.8.5`版本均可参考示例代码进行修改 -### 示例 +### 配置 + +#### 示例一:仅启用CPU卸载 +使用32GB的CPU内存作为二级缓存。 +```bash +unset FLEXKV_CONFIG_PATH +export FLEXKV_CPU_CACHE_GB=32 +``` +#### 示例二:启用SSD卸载 +使用32GB的CPU内存和1T的SSD存储分别作为二级和三级缓存。(假设机器有两个SSD,并分别挂载在/data0和/data1两个路径上。) +```bash +# generate config +cat <<EOF > ./flexkv_config.yml +cpu_cache_gb: 32 
+ssd_cache_gb: 1024 +ssd_cache_dir: /data0/flexkv_ssd/;/data1/flexkv_ssd/ +enable_gds: false +EOF +export FLEXKV_CONFIG_PATH="./flexkv_config.yml" +``` + +> 注:`flexkv_config.yml`配置仅为简单示例,选项请参考[`docs/flexkv_config_reference/README_zh.md`](../../docs/flexkv_config_reference/README_zh.md) + +### 运行 我们提供了基于 **vLLM 0.10.1.1** 的适配示例: 1. apply patch @@ -33,19 +56,6 @@ python examples/offline_inference/prefix_caching_flexkv.py 3. online serving ```bash -# generate config -cat <<EOF > ./flexkv_config.json -{ - "server_recv_port": "ipc:///tmp/flexkv_test", - "cache_config": { - "enable_cpu": true, - "num_cpu_blocks": 10240, - }, - "num_log_interval_requests": 200 -} -EOF -export FLEXKV_CONFIG_PATH="./flexkv_config.json" - VLLM_USE_V1=1 python -m vllm.entrypoints.cli.main serve Qwen3/Qwen3-32B \ --tensor-parallel-size 8 \ --trust-remote-code \ @@ -62,14 +72,30 @@ VLLM_USE_V1=1 python -m vllm.entrypoints.cli.main serve Qwen3/Qwen3-32B \ ``` -> 注:`flexkv_config.json`配置仅为简单示例,选项请参考[`docs/flexkv_config_reference/README_zh.md`](../../docs/flexkv_config_reference/README_zh.md) - ## Legacy版本(<= 0.1.0),目前的版本尽量不要使用 ### 适用版本 - FlexKV <= `0.1.0` -### 示例 +### 配置 + +旧版本配置方式如下 +```bash +# generate config +cat <<EOF > ./flexkv_config.json +{ + "server_recv_port": "ipc:///tmp/flexkv_test", + "cache_config": { + "enable_cpu": true, + "num_cpu_blocks": 10240 + }, + "num_log_interval_requests": 200 +} +EOF +export FLEXKV_CONFIG_PATH="./flexkv_config.json" +``` + +### 运行 在 vLLM 0.8.4 版本中应用patch `examples/vllm_adaption_legacy/flexkv_vllm_0_8_4.patch`,分别启动 FlexKV、vLLM 和测试脚本: ```bash diff --git a/examples/.gitkeep b/examples/.gitkeep deleted file mode 100755 index e69de29bb2..0000000000 diff --git a/examples/run_server.py b/examples/run_server.py deleted file mode 100644 index 4016f446c4..0000000000 --- a/examples/run_server.py +++ /dev/null @@ -1,97 +0,0 @@ -import argparse - -from transformers import AutoConfig, PretrainedConfig - -from flexkv.common.config import CacheConfig, ModelConfig -from 
flexkv.common.debug import flexkv_logger -from flexkv.common.storage import KVCacheLayout, KVCacheLayoutType -from flexkv.server.server import KVServer - - - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser() - - # NAME - parser.add_argument("--enable-cpu", - action=argparse.BooleanOptionalAction, - default=True) - parser.add_argument("--enable-ssd", - action=argparse.BooleanOptionalAction, - default=False,) - parser.add_argument("--enable-remote", - action=argparse.BooleanOptionalAction, - default=False,) - parser.add_argument("--model-path", type=str, help="model path", default="") - parser.add_argument("--tp-size", type=int, default=1) - parser.add_argument("--dp-size", type=int, default=1) - parser.add_argument("--block-size", type=int, default=16) - parser.add_argument("--num-cpu-blocks", type=int, default=8192) - parser.add_argument("--num-ssd-blocks", type=int, default=8192) - parser.add_argument("--num-remote-blocks", type=int, default=8192) - parser.add_argument("--server-recv-port", type=str, default=None) - parser.add_argument("--remote-cache-size-mode", type=str, default="block_num") - parser.add_argument( - "--ssd-cache-dir", - type=str, - nargs='+', - default=[], - help="SSD cache file paths (multiple paths supported, separated by spaces)" - ) - parser.add_argument( - "--remote-cache-path", - type=str, - nargs='+', - default=[], - help="remote cache paths (multiple paths supported, separated by spaces)" - ) - - args = parser.parse_args() - return args - - -if __name__ == "__main__": - args = parse_args() - hf_config = AutoConfig.from_pretrained(args.model_path) - - num_layers=hf_config.num_hidden_layers - if hasattr(hf_config, 'num_key_value_heads'): - num_kv_heads=hf_config.num_key_value_heads - elif hasattr(hf_config, 'num_attention_heads'): - num_kv_heads=hf_config.num_attention_heads - else: - raise NotImplementedError - head_size=(hf_config.head_dim if hasattr(hf_config, 'head_dim') - else 
hf_config.hidden_size//hf_config.num_attention_heads) - use_mla=hf_config.architectures[0].startswith("Deepseek") - - # TODO: different model config may have different attribute name - model_config = ModelConfig( - num_layers=num_layers, - num_kv_heads=num_kv_heads, - head_size=head_size, - use_mla=use_mla, - tp_size=args.tp_size, - dp_size=args.dp_size, - dtype=hf_config.torch_dtype - ) - - cache_config = CacheConfig( - enable_cpu=args.enable_cpu, - enable_ssd=args.enable_ssd, - enable_remote=args.enable_remote, - enable_gds=False, - enable_trace=False, - ssd_cache_iouring_entries=512, - tokens_per_block=args.block_size, - num_cpu_blocks=args.num_cpu_blocks, - num_ssd_blocks=args.num_ssd_blocks, - num_remote_blocks=args.num_remote_blocks, - ssd_cache_dir=args.ssd_cache_dir, - remote_cache_size_mode=args.remote_cache_size_mode, - remote_cache_path=args.remote_cache_path, - ) - - kvserver = KVServer(model_config, cache_config, args.server_recv_port) - kvserver.run() diff --git a/examples/scheduler_server_example.py b/examples/scheduler_server_example.py deleted file mode 100644 index 9dfea43355..0000000000 --- a/examples/scheduler_server_example.py +++ /dev/null @@ -1,246 +0,0 @@ -#!/usr/bin/env python3 -""" -SchedulerServer Usage Example - -Demonstrates how to use the new SchedulerServer to replace the original KVServer + KVDPClient mode -""" - -import torch -import time -from multiprocessing import Process -from flexkv.common.config import ModelConfig, CacheConfig -from flexkv.common.storage import KVCacheLayout, KVCacheLayoutType -from flexkv.server.scheduler_server import SchedulerServer - - -def run_tp_client_process(dp_client_id, tp_rank, device_id, server_recv_port, model_config, gpu_kv_layout): - """Run TP client process""" - from flexkv.server.client import KVTPClient - - print(f"Starting TP client: dp_client_id={dp_client_id}, tp_rank={tp_rank}, device_id={device_id}") - - try: - # Set CUDA device for this process - if torch.cuda.is_available(): - 
torch.cuda.set_device(device_id) - # Initialize CUDA context - torch.cuda.init() - # Clear cache - torch.cuda.empty_cache() - - tp_client = KVTPClient(server_recv_port, dp_client_id, device_id) - - # Create GPU blocks for this TP client - gpu_blocks = [] - for layer_id in range(model_config.num_layers): - kv_dim = 2 if not model_config.use_mla else 1 - kv_tensor = torch.zeros( - size=(kv_dim, gpu_kv_layout.num_block, gpu_kv_layout.tokens_per_block, - model_config.num_kv_heads // model_config.tp_size, - model_config.head_size), - dtype=model_config.dtype, - device=f"cuda:{device_id}" - ) - gpu_blocks.append(kv_tensor) - - print(f"TP client {tp_rank} registering to server...") - # Register to server - tp_client.register_to_server(gpu_blocks, gpu_kv_layout) - print(f"TP client {tp_rank} registered to server") - - # Keep TP client running - while True: - time.sleep(1) - except Exception as e: - print(f"TP client {tp_rank} error: {e}") - import traceback - traceback.print_exc() - raise - - -def main(): - # Configuration parameters - num_layers = 32 - num_kv_heads = 8 - head_size = 128 - num_cpu_blocks = 300 - num_gpu_blocks = 30 - tp_size = 1 - tokens_per_block = 4 - - # Create model and cache configuration - model_config = ModelConfig( - num_layers=num_layers, - num_kv_heads=num_kv_heads, - head_size=head_size, - use_mla=False, - tp_size=tp_size, - dtype=torch.float16 - ) - - cache_config = CacheConfig( - enable_cpu=True, - enable_ssd=False, - enable_remote=False, - enable_gds=False, - tokens_per_block=tokens_per_block, - num_cpu_blocks=num_cpu_blocks, - ) - - # Create GPU KV layout - gpu_kv_layout = KVCacheLayout( - type=KVCacheLayoutType.LAYERWISE, - num_layer=num_layers, - num_block=num_gpu_blocks, - tokens_per_block=tokens_per_block, - num_head=num_kv_heads // tp_size, - head_size=head_size, - is_mla=False - ) - - # Create SchedulerServer (integrates server and dpclient functionality) - scheduler_server = SchedulerServer( - model_config=model_config, - 
cache_config=cache_config, - server_recv_port="ipc:///tmp/scheduler_server_example" # TPClient connects to this port - ) - - # Start background server thread to handle TPClient registration - scheduler_server.start_server_thread() - - print("SchedulerServer started!") - print(f"TPClient can connect to: {scheduler_server.get_server_port()}") - print("Starting TP client processes...") - - # Start TP client processes - tp_client_processes = [] - for tp_rank in range(tp_size): - device_id = tp_rank # Use TP rank as device ID - # Check available GPUs - available_gpus = torch.cuda.device_count() - if device_id >= available_gpus: - device_id = device_id % available_gpus - print(f"Warning: Using GPU {device_id} for TP rank {tp_rank} (not enough GPUs)") - tp_client_process = Process( - target=run_tp_client_process, - args=(0, tp_rank, device_id, scheduler_server.get_server_port(), model_config, gpu_kv_layout), - daemon=True - ) - tp_client_process.start() - tp_client_processes.append(tp_client_process) - print(f"Started TP client process for rank {tp_rank} on device {device_id}") - - print("Waiting for all TP clients to register...") - - time.sleep(5) - - # Now we can directly use scheduler_server without network communication - # Example: Create some test data (following benchmark_kvmanager.py pattern) - batch_size = 4 - seq_len = 128 - print("\n=== Generating test data ===") - # Generate separate sequences for each request (correct approach) - batch_token_ids = [] - batch_slot_mappings = [] - batch_token_masks = [] - for i in range(batch_size): - # Each sequence is independent (seq_len,) shape - token_ids = torch.randint(0, 1000, (seq_len,)) - slot_mapping = torch.arange(i * seq_len, (i + 1) * seq_len) - token_mask = torch.ones(seq_len, dtype=torch.bool) - - batch_token_ids.append(token_ids) - batch_slot_mappings.append(slot_mapping) - batch_token_masks.append(token_mask) - - print(f"Generated {batch_size} sequences, each with {seq_len} tokens") - - print("\n=== Executing 
PUT Operations ===") - # PUT operations - each sequence processed separately - start_time = time.time() - put_task_ids = [] - for i in range(batch_size): - task_id = scheduler_server.put_async( - token_ids=batch_token_ids[i], - slot_mapping=batch_slot_mappings[i], - token_mask=batch_token_masks[i] - ) - if task_id: - put_task_ids.append(task_id) - print(f"PUT task {task_id} created for sequence {i}") - put_time = (time.time() - start_time) * 1000 - print(f"Created {len(put_task_ids)} PUT tasks, time: {put_time:.2f}ms") - time.sleep(2) - print("\n=== Executing GET Operations ===") - # GET operations - each sequence processed separately - start_time = time.time() - get_task_ids = [] - for i in range(batch_size): - task_id = scheduler_server.get_async( - token_ids=batch_token_ids[i], - slot_mapping=batch_slot_mappings[i], - token_mask=batch_token_masks[i] - ) - if task_id: - get_task_ids.append(task_id) - print(f"GET task {task_id} created for sequence {i}") - - get_time = (time.time() - start_time) * 1000 - print(f"Created {len(get_task_ids)} GET tasks, time: {get_time:.2f}ms") - - print("\n=== Waiting for All Tasks to Complete ===") - # Wait for all tasks to complete - can wait for multiple tasks at once - all_task_ids = put_task_ids + get_task_ids - if all_task_ids: - start_time = time.time() - masks = scheduler_server.wait(all_task_ids) - wait_time = (time.time() - start_time) * 1000 - print(f"All {len(all_task_ids)} tasks completed, time: {wait_time:.2f}ms") - # Analyze results - if masks: - total_tokens = 0 - for task_id, mask in masks.items(): - if mask is not None: - tokens = mask.sum().item() if hasattr(mask, 'sum') else len(mask) - total_tokens += tokens - print(f"Task {task_id}: {tokens} tokens processed") - print("\n=== Trying Non-blocking Wait ===") - # Create a few more tasks and try non-blocking wait - extra_task_ids = [] - for i in range(2): - task_id = scheduler_server.put_async( - token_ids=batch_token_ids[i][:5], # Use first 5 tokens - 
slot_mapping=batch_slot_mappings[i][:5], - token_mask=batch_token_masks[i][:5] - ) - if task_id: - extra_task_ids.append(task_id) - if extra_task_ids: - # Immediately try to wait (might not be completed yet) - masks = scheduler_server.try_wait(extra_task_ids) - if masks: - print(f"Tasks {extra_task_ids} completed immediately") - else: - print(f"Tasks {extra_task_ids} not ready yet, will wait...") - masks = scheduler_server.wait(extra_task_ids) - print(f"Tasks {extra_task_ids} completed after wait") - - print("\n✅ All operations completed successfully!") - - - # Clean up resources - print("\n=== Shutting down SchedulerServer ===") - scheduler_server.shutdown() - print("SchedulerServer has been shut down") - # Terminate TP client processes - print("Terminating TP client processes...") - for i, process in enumerate(tp_client_processes): - process.terminate() - process.join(timeout=2) - if process.is_alive(): - process.kill() - print(f"TP client process {i} terminated") - - -if __name__ == "__main__": - main() diff --git a/examples/vllm_adaption_legacy/flexkv_vllm_0_10_0.patch b/examples/vllm_adaption_legacy/flexkv_vllm_0_10_0.patch index f6349a0ac7..53e1b2e17b 100644 --- a/examples/vllm_adaption_legacy/flexkv_vllm_0_10_0.patch +++ b/examples/vllm_adaption_legacy/flexkv_vllm_0_10_0.patch @@ -7,11 +7,11 @@ index c7229dbb8..d2325fd3a 100644 from dataclasses import dataclass, field from typing import Optional, Union +import asyncio - + import aiohttp import huggingface_hub.constants @@ -23,10 +24,10 @@ AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) - + @dataclass class RequestFuncInput: - prompt: str @@ -27,7 +27,7 @@ index c7229dbb8..d2325fd3a 100644 @@ -555,6 +556,107 @@ async def async_request_openai_audio( pbar.update(1) return output - + +async def async_request_openai_chat_completions_multiturns( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, @@ -66,7 +66,7 @@ index c7229dbb8..d2325fd3a 100644 + for turn_id, prompt in 
enumerate(request_func_input.prompt): + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len[turn_id] -+ ++ + payload["messages"].append({"role": "user", "content": prompt}) + payload["max_tokens"] = request_func_input.output_len[turn_id] + @@ -121,15 +121,15 @@ index c7229dbb8..d2325fd3a 100644 + output.error = "".join(traceback.format_exception(*exc_info)) + break + payload["messages"].append({"role": "assistant", "content": generated_text}) -+ ++ + output_list.append(output) + if turn_id != len(request_func_input.prompt) - 1: + await asyncio.sleep(turn_interval_time) -+ ++ + if pbar: + pbar.update(1) + return output_list - + def get_model(pretrained_model_name_or_path: str) -> str: if os.getenv("VLLM_USE_MODELSCOPE", "False").lower() == "true": @@ -619,6 +721,7 @@ ASYNC_REQUEST_FUNCS = { @@ -138,7 +138,7 @@ index c7229dbb8..d2325fd3a 100644 "llama.cpp": async_request_openai_completions, + "openai-chat-multiturns": async_request_openai_chat_completions_multiturns, } - + OPENAI_COMPATIBLE_BACKENDS = [ diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index 1ad6cef7a..9178528d0 100644 @@ -147,7 +147,7 @@ index 1ad6cef7a..9178528d0 100644 @@ -49,9 +49,9 @@ class SampleRequest: Represents a single inference request for benchmarking. 
""" - + - prompt: Union[str, Any] - prompt_len: int - expected_output_len: int @@ -156,17 +156,17 @@ index 1ad6cef7a..9178528d0 100644 + expected_output_len: Union[int, list[int]] multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None lora_request: Optional[LoRARequest] = None - + @@ -617,6 +617,108 @@ class SonnetDataset(BenchmarkDataset): ) return samples - -+ + ++ +# ----------------------------------------------------------------------------- +# ShareGPT Multiturn Dataset Implementation +# ----------------------------------------------------------------------------- -+ -+ ++ ++ +class ShareGPTMultiTurnsDataset(BenchmarkDataset): + def __init__(self, min_num_turns: int = 2, **kwargs) -> None: + super().__init__(**kwargs) @@ -191,7 +191,7 @@ index 1ad6cef7a..9178528d0 100644 + self.data = new_data + random.seed(self.random_seed) + random.shuffle(self.data) -+ ++ + def sample( + self, + tokenizer: PreTrainedTokenizerBase, @@ -205,7 +205,7 @@ index 1ad6cef7a..9178528d0 100644 + for entry in self.data: + if len(samples) >= num_requests: + break -+ ++ + prompt_list = [d["value"] for d in entry["conversations"][::2]] + completion_list = [d["value"] for d in entry["conversations"][1::2]] + # prompt, completion = ( @@ -215,8 +215,8 @@ index 1ad6cef7a..9178528d0 100644 + + lora_request, tokenizer = self.get_random_lora_request( + tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path) -+ -+ ++ ++ + prompt_ids_list = [] + completion_ids_list = [] + prompt_len_list = [] @@ -247,12 +247,12 @@ index 1ad6cef7a..9178528d0 100644 + new_output_len_list.append(new_output_len) + history_len += prompt_len + history_len += new_output_len -+ ++ + if turn_id <= 0: + continue -+ ++ + prompt_list = prompt_list[:turn_id+1] -+ ++ + samples.append( + SampleRequest( + prompt=prompt_list, @@ -262,8 +262,8 @@ index 1ad6cef7a..9178528d0 100644 + )) + self.maybe_oversample_requests(samples, num_requests) + return samples -+ - ++ + # 
----------------------------------------------------------------------------- # BurstGPT Dataset Implementation diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py @@ -289,7 +289,7 @@ index c597fb106..74e157927 100644 latency_minus_ttft = outputs[i].latency - outputs[i].ttft @@ -278,6 +279,9 @@ async def benchmark( ) - + test_output = await request_func(request_func_input=test_input) + if backend == "openai-chat-multiturns": + print("test_output ", test_output) @@ -303,7 +303,7 @@ index c597fb106..74e157927 100644 outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) + if backend == "openai-chat-multiturns": + outputs = [o for sub_o in outputs for o in sub_o] - + if profile: print("Stopping profiler...") @@ -748,6 +754,15 @@ def main(args: argparse.Namespace): @@ -540,7 +540,7 @@ index 000000000..6ff17dfca + +def main(): + # Create an LLM without prefix caching as a baseline. -+ regular_llm = LLM(model="facebook/opt-125m", ++ regular_llm = LLM(model="facebook/opt-125m", + enable_prefix_caching=False, + gpu_memory_utilization=0.4) + @@ -662,11 +662,11 @@ index 000000000..478683fa9 + dtype=flexkv_config.dtype, + tp_size=flexkv_config.tp_size, + ) -+ ++ + logger.info(f"start init FlexKVDPClient to {self.server_recv_port}") + self.dp_client = KVDPClient(self.server_recv_port, self.model_config) + logger.info(f"finish init FlexKVDPClient") -+ ++ + def put_async( + self, + token_ids: torch.Tensor, @@ -675,7 +675,7 @@ index 000000000..478683fa9 + ) -> int: + " return task_id " + return self.dp_client.put_async(token_ids, slot_mapping, token_mask) -+ ++ + def get_async( + self, + token_ids: torch.Tensor, @@ -684,15 +684,15 @@ index 000000000..478683fa9 + ) -> int: + " return task_id " + return self.dp_client.get_async(token_ids, slot_mapping, token_mask) -+ ++ + def wait( -+ self, ++ self, + wait_task_ids: list[int], + ) -> dict[int, torch.Tensor]: + return self.dp_client.wait(wait_task_ids) -+ ++ + def try_wait( -+ self, ++ self, + 
wait_task_ids: list[int], + ) -> dict[int, Optional[torch.Tensor]]: + # print("--------------------------------") @@ -706,8 +706,8 @@ index 000000000..478683fa9 + import traceback + traceback.print_exc() + return {} -+ -+ ++ ++ +class FlexKVTPClient: + def __init__( + self, @@ -722,7 +722,7 @@ index 000000000..478683fa9 + self.tp_client = KVTPClient(flexkv_config.server_recv_port, dp_client_id, device_id, tp_rank) + logger.info(f"finish init FlexKVTPClient") + gpu_layout = KVCacheLayout( -+ type=KVCacheLayoutType.LAYERWISE, ++ type=KVCacheLayoutType.LAYERFIRST, + num_layer=flexkv_config.num_layers, + num_block=flexkv_config.num_blocks, + tokens_per_block=flexkv_config.block_size, @@ -763,17 +763,17 @@ index 000000000..f2724e712 + dtype: torch.dtype = None + use_mla: bool = False + tp_size: int = 1 -+ ++ + @classmethod + def from_env(cls) -> 'FlexKVConfig': + enable_flexkv = (os.getenv('ENABLE_FLEXKV', "false").lower() == "true") + server_recv_port = os.getenv('FLEXKV_SERVER_RECV_PORT', "") -+ ++ + return cls(enable_flexkv=enable_flexkv, + server_recv_port=server_recv_port) -+ ++ + def post_init( -+ self, ++ self, + kv_cache_config: KVCacheConfig, + tp_size: int + ): @@ -794,12 +794,12 @@ index 69aaf4390..fe426f420 100644 @@ -21,7 +21,7 @@ VLLM_LOGGING_CONFIG_PATH = envs.VLLM_LOGGING_CONFIG_PATH VLLM_LOGGING_LEVEL = envs.VLLM_LOGGING_LEVEL VLLM_LOGGING_PREFIX = envs.VLLM_LOGGING_PREFIX - + -_FORMAT = (f"{VLLM_LOGGING_PREFIX}%(levelname)s %(asctime)s " +_FORMAT = (f"{VLLM_LOGGING_PREFIX}%(levelname)s %(asctime)s.%(msecs)03d " "[%(filename)s:%(lineno)d] %(message)s") _DATE_FORMAT = "%m-%d %H:%M:%S" - + diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 5b0218640..aa590eb6f 100644 --- a/vllm/v1/core/kv_cache_utils.py @@ -812,12 +812,12 @@ index 5b0218640..aa590eb6f 100644 # A deque of (requests, queries, hits) for the most recent requests. 
- self.query_queue: deque[tuple[int, int, int]] = deque() + self.query_queue: deque[tuple[int, int, int, int]] = deque() - + def observe(self, stats: PrefixCacheStats): """Observe the prefix caching for a set of requests. @@ -108,14 +109,15 @@ class PrefixCachingMetrics: self.reset() - + # Update the metrics. - self.query_queue.append((stats.requests, stats.queries, stats.hits)) + self.query_queue.append((stats.requests, stats.queries, stats.hits, stats.flexkv_hits)) @@ -825,7 +825,7 @@ index 5b0218640..aa590eb6f 100644 self.aggregated_query_total += stats.queries self.aggregated_query_hit += stats.hits + self.aggregated_query_flexkv_hit += stats.flexkv_hits - + # Remove the oldest stats if the number of requests exceeds. if self.aggregated_requests > self.max_recent_requests: - old_requests, old_queries, old_hits = self.query_queue.popleft() @@ -839,28 +839,28 @@ index 5b0218640..aa590eb6f 100644 self.aggregated_query_hit = 0 + self.aggregated_query_flexkv_hit = 0 self.query_queue.clear() - + @property @@ -133,6 +136,13 @@ class PrefixCachingMetrics: if self.aggregated_query_total == 0: return 0.0 return self.aggregated_query_hit / self.aggregated_query_total -+ ++ + @property + def flexkv_hit_rate(self) -> float: + """Calculate the hit rate for the past N requests.""" + if self.aggregated_query_total == 0: + return 0.0 + return self.aggregated_query_flexkv_hit / self.aggregated_query_total - - + + @dataclass diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 446f98034..b465c4cf1 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -5,6 +5,7 @@ from __future__ import annotations - + import itertools import time +import torch @@ -874,13 +874,13 @@ index 446f98034..b465c4cf1 100644 +# flexkv +from vllm.utils import cdiv +from vllm.distributed.flexkv_extension.config import FlexKVConfig - + logger = init_logger(__name__) - + @@ -162,6 +166,23 @@ class Scheduler(SchedulerInterface): ) self.use_pp = 
self.parallel_config.pipeline_parallel_size > 1 - + + # flexkv + self.enable_flexkv = False + self.flexkv_client = None @@ -904,14 +904,14 @@ index 446f98034..b465c4cf1 100644 @@ -174,6 +195,13 @@ class Scheduler(SchedulerInterface): # chunked prefills, prefix caching, speculative decoding, # and the "jump decoding" optimization in the future. - + + # flexkv + if self.enable_flexkv: + # aviod busy loop + if self.get_num_unfinished_requests() == 0: + time.sleep(0.01) + self.check_offload_kv_tasks() -+ ++ scheduled_new_reqs: list[Request] = [] scheduled_resumed_reqs: list[Request] = [] scheduled_running_reqs: list[Request] = [] @@ -919,7 +919,7 @@ index 446f98034..b465c4cf1 100644 if new_blocks is None: # The request cannot be scheduled. break -+ ++ + if self.enable_flexkv and num_new_tokens > self.block_size and request.status == RequestStatus.WAITING: + # don't match the last block + num_new_blocks_to_get = cdiv(num_new_tokens, self.block_size)-1 @@ -940,7 +940,7 @@ index 446f98034..b465c4cf1 100644 + self.flexkv_timer[request.request_id] = {} + self.flexkv_timer[request.request_id]['get_async_start'] = t_async_get_start + self.flexkv_timer[request.request_id]['get_async_return'] = t_async_get_return - + # KVTransfer: the connector uses this info to determine # if a load is needed. 
Note that @@ -505,6 +554,31 @@ class Scheduler(SchedulerInterface): @@ -948,7 +948,7 @@ index 446f98034..b465c4cf1 100644 self.encoder_cache_manager.allocate(request, i) encoder_budget = new_encoder_budget + # batch wait -+ ++ + # batch wait + if self.enable_flexkv: + if len(self.load_kv_tasks) != 0: @@ -967,18 +967,18 @@ index 446f98034..b465c4cf1 100644 + f"[FlexKV] req: {request.request_id}, task: {task_id}, " + f"get {match_length} tokens cost {(t_async_get_end-t_get_async_start)*1000:.2f} ms, " + f"get_async() api cost {(t_get_async_return-t_get_async_start)*1000:.2f} ms") -+ ++ + token_budget += match_length + num_scheduled_tokens[request.request_id] -= match_length + request.num_computed_tokens += match_length + self.kv_cache_manager.prefix_cache_stats.flexkv_hits += (match_length//self.block_size) - + # Put back any skipped requests at the head of the waiting queue if skipped_waiting_requests: @@ -1016,11 +1090,49 @@ class Scheduler(SchedulerInterface): if self.finished_req_ids_dict is not None: self.finished_req_ids_dict[request.client_index].add(request_id) - + - if not delay_free_blocks: - self._free_blocks(request) + # flexkv: offload BEFORE freeing blocks to preserve req_to_blocks info @@ -990,9 +990,9 @@ index 446f98034..b465c4cf1 100644 + # else: + # self._free_block(request) + - + return kv_xfer_params - + + def _free_block(self, request: Request) -> None: + self.kv_cache_manager.free(request) + self.kv_cache_manager.free_block_hashes(request) @@ -1004,20 +1004,20 @@ index 446f98034..b465c4cf1 100644 + req_blocks = self.kv_cache_manager.coordinator.single_type_managers[0].req_to_blocks.get(request.request_id, []) + req_token_ids = torch.tensor(request.all_token_ids[:-1]) + req_block_ids = torch.tensor([block.block_id for block in req_blocks]) -+ ++ + # Debug information for empty req_blocks + # if len(req_blocks) == 0: + # print(f"WARNING: Empty req_blocks for request {request.request_id}") + # print(f" request.all_token_ids length: 
{len(request.all_token_ids)}") + # print(f" req_token_ids length: {len(req_token_ids)}") + # print(f" req_to_blocks keys: {list(self.kv_cache_manager.coordinator.single_type_managers[0].req_to_blocks.keys())}") -+ ++ + slot_mapping = req_block_ids.repeat_interleave(self.block_size)[:len(req_token_ids)] * self.block_size -+ ++ + # Additional debug info + # print(f"FlexKV _offload_kv: req_id={request.request_id}, " + # f"blocks={len(req_blocks)}, tokens={len(req_token_ids)}, slots={len(slot_mapping)}") -+ ++ + self.flexkv_timer[request.request_id] = {} + self.flexkv_timer[request.request_id]["put_async_start"] = time.monotonic() + task_id = self.flexkv_client.put_async(token_ids=req_token_ids, slot_mapping=slot_mapping) @@ -1032,7 +1032,7 @@ index 446f98034..b465c4cf1 100644 num_accepted_tokens=num_accepted_tokens) return spec_decoding_stats - -+ ++ + def check_offload_kv_tasks(self): + if len(self.offload_kv_tasks) == 0: + return @@ -1051,8 +1051,8 @@ index 446f98034..b465c4cf1 100644 + f"[FlexKV] req: {request.request_id}, task: {task_id}, " + f"put {sum(task_result).item()} tokens cost {(t_async_put_end-t_put_async_start)*1000:.2f} ms, " + f"put_async() api cost {(t_put_async_return-t_put_async_start)*1000:.2f} ms") -+ self._free_block(request) -+ ++ self._free_block(request) ++ def shutdown(self) -> None: if self.kv_event_publisher: self.kv_event_publisher.shutdown() @@ -1063,16 +1063,16 @@ index 7779b559c..2d17908ea 100644 @@ -46,6 +46,8 @@ from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder from vllm.v1.structured_output import StructuredOutputManager from vllm.version import __version__ as VLLM_VERSION - + +from vllm.distributed.flexkv_extension.config import FlexKVConfig + logger = init_logger(__name__) - + POLLING_TIMEOUT_S = 2.5 @@ -118,6 +120,8 @@ class EngineCore: log_stats=self.log_stats, ) - + + self.init_flexkv(vllm_config, kv_cache_config) + # Setup MM Input Mapper. 
@@ -1081,11 +1081,11 @@ index 7779b559c..2d17908ea 100644 @@ -194,6 +198,23 @@ class EngineCore: "warmup model) took %.2f seconds"), elapsed) return num_gpu_blocks, num_cpu_blocks, scheduler_kv_cache_config - + + + def init_flexkv( -+ self, -+ taco_llm_config: VllmConfig, ++ self, ++ taco_llm_config: VllmConfig, + kv_cache_config: KVCacheConfig + ): + self.scheduler: V1Scheduler @@ -1098,7 +1098,7 @@ index 7779b559c..2d17908ea 100644 + ) + dp_client_id = self.scheduler.init_flexkv(flexkv_config) + self.model_executor.init_flexkv(flexkv_config, dp_client_id) -+ ++ def add_request(self, request: EngineCoreRequest): """Add request to the scheduler.""" if pooling_params := request.pooling_params: @@ -1113,16 +1113,16 @@ index 50b9634a4..3d7bdd4c8 100644 - +from vllm.distributed.flexkv_extension.config import FlexKVConfig FailureCallback = Callable[[], None] - - + + @@ -88,6 +88,10 @@ class Executor(ExecutorBase): args=(scheduler_output, )) return output[0] - + + def init_flexkv(self, flexkv_config: FlexKVConfig, dp_client_id: int): + self.collective_rpc("init_flexkv", + args=(flexkv_config, dp_client_id, )) -+ ++ @property def max_concurrent_batches(self) -> int: return 1 @@ -1147,7 +1147,7 @@ index 7f2556bab..e7fb79486 100644 + self.prefix_caching_metrics.flexkv_hit_rate * 100, ) self.spec_decoding_logging.log(log_fn=log_fn) - + diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 1eb10ccb6..1073aa571 100644 --- a/vllm/v1/metrics/stats.py @@ -1159,7 +1159,7 @@ index 1eb10ccb6..1073aa571 100644 - + # flexkv + flexkv_hits: int = 0 - + @dataclass class SchedulerStats: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py @@ -1190,20 +1190,20 @@ index 522946351..31a3bed13 100644 @@ -33,6 +33,10 @@ from vllm.v1.utils import report_usage_stats from vllm.v1.worker.gpu_model_runner import GPUModelRunner from vllm.v1.worker.worker_base import WorkerBase - + +# flexkv +from vllm.distributed.flexkv_extension.config import 
FlexKVConfig +from vllm.distributed.flexkv_extension.client import FlexKVTPClient + logger = init_logger(__name__) - + if TYPE_CHECKING: @@ -556,6 +560,23 @@ class Worker(WorkerBase): max_size=max_size, ) - + + def init_flexkv( -+ self, ++ self, + flexkv_config: FlexKVConfig, + dp_client_id: int, + ) -> None: diff --git a/examples/vllm_adaption_legacy/flexkv_vllm_0_8_4.patch b/examples/vllm_adaption_legacy/flexkv_vllm_0_8_4.patch index 5310d53c6a..6d9f0a9fc4 100644 --- a/examples/vllm_adaption_legacy/flexkv_vllm_0_8_4.patch +++ b/examples/vllm_adaption_legacy/flexkv_vllm_0_8_4.patch @@ -7,11 +7,11 @@ index 287d500a8..7e87f0446 100644 from dataclasses import dataclass, field from typing import Optional, Union +import asyncio - + import aiohttp import huggingface_hub.constants @@ -22,10 +23,10 @@ AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) - + @dataclass class RequestFuncInput: - prompt: str @@ -26,8 +26,8 @@ index 287d500a8..7e87f0446 100644 logprobs: Optional[int] = None @@ -436,6 +437,109 @@ async def async_request_openai_chat_completions( return output - - + + +async def async_request_openai_chat_completions_multiturns( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, @@ -66,7 +66,7 @@ index 287d500a8..7e87f0446 100644 + for turn_id, prompt in enumerate(request_func_input.prompt): + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len[turn_id] -+ ++ + payload["messages"].append({"role": "user", "content": prompt}) + payload["max_tokens"] = request_func_input.output_len[turn_id] + @@ -121,11 +121,11 @@ index 287d500a8..7e87f0446 100644 + output.error = "".join(traceback.format_exception(*exc_info)) + break + payload["messages"].append({"role": "assistant", "content": generated_text}) -+ ++ + output_list.append(output) + if turn_id != len(request_func_input.prompt) - 1: + await asyncio.sleep(turn_interval_time) -+ ++ + if pbar: + pbar.update(1) + return output_list @@ -140,7 +140,7 @@ index 
287d500a8..7e87f0446 100644 "sglang": async_request_openai_completions, + "openai-chat-multiturns": async_request_openai_chat_completions_multiturns, } - + OPENAI_COMPATIBLE_BACKENDS = [ diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index 63f174275..561a40421 100644 @@ -149,7 +149,7 @@ index 63f174275..561a40421 100644 @@ -50,9 +50,9 @@ class SampleRequest: Represents a single inference request for benchmarking. """ - + - prompt: Union[str, Any] - prompt_len: int - expected_output_len: int @@ -158,17 +158,17 @@ index 63f174275..561a40421 100644 + expected_output_len: Union[int, list[int]] multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None lora_request: Optional[LoRARequest] = None - + @@ -507,6 +507,108 @@ class SonnetDataset(BenchmarkDataset): )) return samples - -+ + ++ +# ----------------------------------------------------------------------------- +# ShareGPT Multiturn Dataset Implementation +# ----------------------------------------------------------------------------- -+ -+ ++ ++ +class ShareGPTMultiTurnsDataset(BenchmarkDataset): + def __init__(self, min_num_turns: int = 2, **kwargs) -> None: + super().__init__(**kwargs) @@ -193,7 +193,7 @@ index 63f174275..561a40421 100644 + self.data = new_data + random.seed(self.random_seed) + random.shuffle(self.data) -+ ++ + def sample( + self, + tokenizer: PreTrainedTokenizerBase, @@ -207,7 +207,7 @@ index 63f174275..561a40421 100644 + for entry in self.data: + if len(samples) >= num_requests: + break -+ ++ + prompt_list = [d["value"] for d in entry["conversations"][::2]] + completion_list = [d["value"] for d in entry["conversations"][1::2]] + # prompt, completion = ( @@ -217,8 +217,8 @@ index 63f174275..561a40421 100644 + + lora_request, tokenizer = self.get_random_lora_request( + tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path) -+ -+ ++ ++ + prompt_ids_list = [] + completion_ids_list = [] + prompt_len_list = [] @@ -249,12 +249,12 @@ index 
63f174275..561a40421 100644 + new_output_len_list.append(new_output_len) + history_len += prompt_len + history_len += new_output_len -+ ++ + if turn_id <= 0: + continue -+ ++ + prompt_list = prompt_list[:turn_id+1] -+ ++ + samples.append( + SampleRequest( + prompt=prompt_list, @@ -264,8 +264,8 @@ index 63f174275..561a40421 100644 + )) + self.maybe_oversample_requests(samples, num_requests) + return samples -+ - ++ + # ----------------------------------------------------------------------------- # BurstGPT Dataset Implementation diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py @@ -279,7 +279,7 @@ index b5bd840d8..7e670eb64 100644 - VisionArenaDataset) + VisionArenaDataset, ShareGPTMultiTurnsDataset) from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json - + MILLISECONDS_TO_SECONDS_CONVERSION = 1000 @@ -166,7 +166,7 @@ def calculate_metrics( tokenizer(outputs[i].generated_text, @@ -292,7 +292,7 @@ index b5bd840d8..7e670eb64 100644 latency_minus_ttft = outputs[i].latency - outputs[i].ttft @@ -293,6 +293,8 @@ async def benchmark( ) - + test_output = await request_func(request_func_input=test_input) + if backend == "openai-chat-multiturns": + test_output = test_output[-1] @@ -305,7 +305,7 @@ index b5bd840d8..7e670eb64 100644 outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) + if backend == "openai-chat-multiturns": + outputs = [o for sub_o in outputs for o in sub_o] - + if profile: print("Stopping profiler...") @@ -636,6 +640,15 @@ def main(args: argparse.Namespace): @@ -478,7 +478,7 @@ index 000000000..6ff17dfca + +def main(): + # Create an LLM without prefix caching as a baseline. 
-+ regular_llm = LLM(model="facebook/opt-125m", ++ regular_llm = LLM(model="facebook/opt-125m", + enable_prefix_caching=False, + gpu_memory_utilization=0.4) + @@ -600,11 +600,11 @@ index 000000000..11b7ff1c3 + dtype=flexkv_config.dtype, + tp_size=flexkv_config.tp_size, + ) -+ ++ + logger.info(f"start init FlexKVDPClient to {self.server_recv_port}") + self.dp_client = KVDPClient(self.server_recv_port, self.model_config) + logger.info(f"finish init FlexKVDPClient") -+ ++ + def put_async( + self, + token_ids: torch.Tensor, @@ -613,7 +613,7 @@ index 000000000..11b7ff1c3 + ) -> int: + " return task_id " + return self.dp_client.put_async(token_ids, slot_mapping, token_mask) -+ ++ + def get_async( + self, + token_ids: torch.Tensor, @@ -622,20 +622,20 @@ index 000000000..11b7ff1c3 + ) -> int: + " return task_id " + return self.dp_client.get_async(token_ids, slot_mapping, token_mask) -+ ++ + def wait( -+ self, ++ self, + wait_task_ids: list[int], + ) -> dict[int, torch.Tensor]: + return self.dp_client.wait(wait_task_ids) -+ ++ + def try_wait( -+ self, ++ self, + wait_task_ids: list[int], + ) -> dict[int, Optional[torch.Tensor]]: + return self.dp_client.try_wait(wait_task_ids) -+ -+ ++ ++ +class FlexKVTPClient: + def __init__( + self, @@ -650,7 +650,7 @@ index 000000000..11b7ff1c3 + self.tp_client = KVTPClient(flexkv_config.server_recv_port, dp_client_id, device_id, tp_rank) + logger.info(f"finish init FlexKVTPClient") + gpu_layout = KVCacheLayout( -+ type=KVCacheLayoutType.LAYERWISE, ++ type=KVCacheLayoutType.LAYERFIRST, + num_layer=flexkv_config.num_layers, + num_block=flexkv_config.num_blocks, + tokens_per_block=flexkv_config.block_size, @@ -691,17 +691,17 @@ index 000000000..f2724e712 + dtype: torch.dtype = None + use_mla: bool = False + tp_size: int = 1 -+ ++ + @classmethod + def from_env(cls) -> 'FlexKVConfig': + enable_flexkv = (os.getenv('ENABLE_FLEXKV', "false").lower() == "true") + server_recv_port = os.getenv('FLEXKV_SERVER_RECV_PORT', "") -+ ++ + return 
cls(enable_flexkv=enable_flexkv, + server_recv_port=server_recv_port) -+ ++ + def post_init( -+ self, ++ self, + kv_cache_config: KVCacheConfig, + tp_size: int + ): @@ -722,12 +722,12 @@ index 2b0b9da2d..7f377af6d 100644 @@ -19,7 +19,7 @@ VLLM_LOGGING_CONFIG_PATH = envs.VLLM_LOGGING_CONFIG_PATH VLLM_LOGGING_LEVEL = envs.VLLM_LOGGING_LEVEL VLLM_LOGGING_PREFIX = envs.VLLM_LOGGING_PREFIX - + -_FORMAT = (f"{VLLM_LOGGING_PREFIX}%(levelname)s %(asctime)s " +_FORMAT = (f"{VLLM_LOGGING_PREFIX}%(levelname)s %(asctime)s.%(msecs)03d " "[%(filename)s:%(lineno)d] %(message)s") _DATE_FORMAT = "%m-%d %H:%M:%S" - + diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index bd0e01d04..f4cadfba8 100644 --- a/vllm/v1/core/kv_cache_utils.py @@ -740,12 +740,12 @@ index bd0e01d04..f4cadfba8 100644 # A deque of (requests, queries, hits) for the most recent requests. - self.query_queue: deque[tuple[int, int, int]] = deque() + self.query_queue: deque[tuple[int, int, int, int]] = deque() - + def observe(self, stats: PrefixCacheStats): """Observe the prefix caching for a set of requests. @@ -81,23 +82,26 @@ class PrefixCachingMetrics: self.reset() - + # Update the metrics. - self.query_queue.append((stats.requests, stats.queries, stats.hits)) + self.query_queue.append((stats.requests, stats.queries, stats.hits, stats.flexkv_hits)) @@ -753,7 +753,7 @@ index bd0e01d04..f4cadfba8 100644 self.aggregated_query_total += stats.queries self.aggregated_query_hit += stats.hits + self.aggregated_query_flexkv_hit += stats.flexkv_hits - + # Remove the oldest stats if the number of requests exceeds. 
if self.aggregated_requests > self.interval: - old_requests, old_queries, old_hits = self.query_queue.popleft() @@ -762,7 +762,7 @@ index bd0e01d04..f4cadfba8 100644 self.aggregated_query_total -= old_queries self.aggregated_query_hit -= old_hits + self.aggregated_query_flexkv_hit -= old_flexkv_hits - + def reset(self): """Reset the metrics.""" self.aggregated_requests = 0 @@ -770,23 +770,23 @@ index bd0e01d04..f4cadfba8 100644 self.aggregated_query_hit = 0 + self.aggregated_query_flexkv_hit = 0 self.query_queue.clear() - + @property @@ -106,6 +110,15 @@ class PrefixCachingMetrics: if self.aggregated_query_total == 0: return 0.0 return self.aggregated_query_hit / self.aggregated_query_total -+ ++ + @property + def flexkv_hit_rate(self) -> float: + """Calculate the hit rate for the past N requests.""" + if self.aggregated_query_total == 0: + return 0.0 + return self.aggregated_query_flexkv_hit / self.aggregated_query_total -+ + - - ++ + + @dataclass diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index a81574875..e808e7537 100644 @@ -794,7 +794,7 @@ index a81574875..e808e7537 100644 +++ b/vllm/v1/core/sched/scheduler.py @@ -3,6 +3,7 @@ from __future__ import annotations - + import time +import torch from collections import deque @@ -803,19 +803,19 @@ index a81574875..e808e7537 100644 @@ -27,6 +28,10 @@ from vllm.v1.request import Request, RequestStatus from vllm.v1.spec_decode.metrics import SpecDecodingStats from vllm.v1.structured_output import StructuredOutputManager - + +# flexkv +from vllm.utils import cdiv +from vllm.distributed.flexkv_extension.config import FlexKVConfig + logger = init_logger(__name__) - - + + @@ -118,6 +123,23 @@ class Scheduler(SchedulerInterface): if speculative_config and speculative_config.method == "eagle": self.num_lookahead_tokens = \ speculative_config.num_speculative_tokens -+ ++ + # flexkv + self.enable_flexkv = False + self.flexkv_client = None @@ -832,13 +832,13 @@ index a81574875..e808e7537 100644 
+ from vllm.distributed.flexkv_extension.client import FlexKVDPClient + self.flexkv_client = FlexKVDPClient(flexkv_config) + return self.flexkv_client.dp_client.dp_client_id - + def schedule(self) -> SchedulerOutput: # NOTE(woosuk) on the scheduling algorithm: @@ -131,6 +153,13 @@ class Scheduler(SchedulerInterface): # chunked prefills, prefix caching, speculative decoding, # and the "jump decoding" optimization in the future. - + + # flexkv + if self.enable_flexkv: + # aviod busy loop @@ -865,10 +865,10 @@ index a81574875..e808e7537 100644 @@ -335,6 +364,29 @@ class Scheduler(SchedulerInterface): # The request cannot be scheduled. break - + + # flexkv + if self.enable_flexkv and num_new_tokens > self.block_size and request.status == RequestStatus.WAITING: -+ ++ + # don't match the last block + num_new_blocks_to_get = cdiv(num_new_tokens, self.block_size)-1 + num_new_tokens_to_match = num_new_blocks_to_get*self.block_size @@ -895,7 +895,7 @@ index a81574875..e808e7537 100644 @@ -372,6 +424,29 @@ class Scheduler(SchedulerInterface): self.encoder_cache_manager.allocate(request, i) encoder_budget = new_encoder_budget - + + # batch wait + if self.enable_flexkv: + if len(self.load_kv_tasks) != 0: @@ -912,7 +912,7 @@ index a81574875..e808e7537 100644 + f"[FlexKV] req: {request.request_id}, task: {task_id}, " + f"get {match_length} tokens cost {(t_async_get_end-t_get_async_start)*1000:.2f} ms, " + f"get_async() api cost {(t_get_async_return-t_get_async_start)*1000:.2f} ms") -+ ++ + token_budget += match_length + num_scheduled_tokens[request.request_id] -= match_length + request.num_computed_tokens += match_length @@ -923,7 +923,7 @@ index a81574875..e808e7537 100644 if skipped_waiting_requests: self.waiting.extendleft(skipped_waiting_requests) @@ -730,18 +805,36 @@ class Scheduler(SchedulerInterface): - + def _free_request(self, request: Request) -> None: assert request.is_finished() - self.kv_cache_manager.free(request) @@ -932,7 +932,7 @@ index a81574875..e808e7537 
100644 self._cached_reqs_data.pop(request.request_id, None) del self.requests[request.request_id] self.finished_req_ids.add(request.request_id) - + + if self.enable_flexkv: + self._offload_kv(request) + else: @@ -955,11 +955,11 @@ index a81574875..e808e7537 100644 + def get_num_unfinished_requests(self) -> int: return len(self.waiting) + len(self.running) - + def has_finished_requests(self) -> bool: - return len(self.finished_req_ids) > 0 + return len(self.finished_req_ids) > 0 or len(self.offload_kv_tasks) - + def get_num_unscheduled_requests(self) -> int: """Number of requests that are not being processed by the executor.""" @@ -777,3 +870,23 @@ class Scheduler(SchedulerInterface): @@ -984,8 +984,8 @@ index a81574875..e808e7537 100644 + f"[FlexKV] req: {request.request_id}, task: {task_id}, " + f"put {sum(task_result).item()} tokens cost {(t_async_put_end-t_put_async_start)*1000:.2f} ms, " + f"put_async() api cost {(t_put_async_return-t_put_async_start)*1000:.2f} ms") -+ self._free_block(request) -+ ++ self._free_block(request) ++ diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index f642e5100..3ce609b50 100644 --- a/vllm/v1/engine/core.py @@ -993,16 +993,16 @@ index f642e5100..3ce609b50 100644 @@ -40,6 +40,8 @@ from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder from vllm.v1.structured_output import StructuredOutputManager from vllm.version import __version__ as VLLM_VERSION - + +from vllm.distributed.flexkv_extension.config import FlexKVConfig + logger = init_logger(__name__) - + POLLING_TIMEOUT_S = 2.5 @@ -105,6 +107,8 @@ class EngineCore: log_stats=self.log_stats, ) - + + self.init_flexkv(vllm_config, kv_cache_config) + # Setup MM Input Mapper. 
@@ -1011,10 +1011,10 @@ index f642e5100..3ce609b50 100644 @@ -164,6 +168,22 @@ class EngineCore: "warmup model) took %.2f seconds"), elapsed) return num_gpu_blocks, num_cpu_blocks, scheduler_kv_cache_config - + + def init_flexkv( -+ self, -+ taco_llm_config: VllmConfig, ++ self, ++ taco_llm_config: VllmConfig, + kv_cache_config: KVCacheConfig + ): + self.scheduler: V1Scheduler @@ -1030,7 +1030,7 @@ index f642e5100..3ce609b50 100644 + def add_request(self, request: EngineCoreRequest): """Add request to the scheduler.""" - + diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py index e3a4cd98c..dd009a3a4 100644 --- a/vllm/v1/executor/abstract.py @@ -1040,13 +1040,13 @@ index e3a4cd98c..dd009a3a4 100644 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import ModelRunnerOutput +from vllm.distributed.flexkv_extension.config import FlexKVConfig - - + + class Executor(ExecutorBase): @@ -78,6 +79,11 @@ class Executor(ExecutorBase): args=(scheduler_output, )) return output[0] - + + + def init_flexkv(self, flexkv_config: FlexKVConfig, dp_client_id: int): + self.collective_rpc("init_flexkv", @@ -1075,7 +1075,7 @@ index 3959be40b..69c5b59a1 100644 self.prefix_caching_metrics.hit_rate * 100, + self.prefix_caching_metrics.flexkv_hit_rate * 100, ) - + if scheduler_stats.spec_decoding_stats is not None: diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index fd9492648..8915c4f78 100644 @@ -1087,8 +1087,8 @@ index fd9492648..8915c4f78 100644 hits: int = 0 + # flexkv + flexkv_hits: int = 0 - - + + @dataclass diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 2972e0ffb..6bb8fa9ff 100644 @@ -1106,22 +1106,22 @@ index 2972e0ffb..6bb8fa9ff 100644 @@ -25,6 +25,10 @@ from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.worker.gpu_model_runner import GPUModelRunner from vllm.v1.worker.worker_base import WorkerBase - + +# flexkv +from vllm.distributed.flexkv_extension.config import 
FlexKVConfig +from vllm.distributed.flexkv_extension.client import FlexKVTPClient + logger = init_logger(__name__) - + if TYPE_CHECKING: @@ -282,7 +286,23 @@ class Worker(WorkerBase): pattern=pattern, max_size=max_size, ) - -+ ++ + def init_flexkv( -+ self, ++ self, + flexkv_config: FlexKVConfig, + dp_client_id: int, + ) -> None: @@ -1136,6 +1136,6 @@ index 2972e0ffb..6bb8fa9ff 100644 + device_id=self.device.index, + gpu_blocks=self.model_runner.kv_caches, + kv_shape=kv_shape) - + def init_worker_distributed_environment( parallel_config: ParallelConfig, diff --git a/flexkv/cache/cache_engine.py b/flexkv/cache/cache_engine.py index 8340cc476f..3f7fb71217 100644 --- a/flexkv/cache/cache_engine.py +++ b/flexkv/cache/cache_engine.py @@ -29,7 +29,7 @@ from flexkv.cache.radixtree import RadixTreeIndex, RadixNode, MatchResult from flexkv.cache.transfer_pattern import add_virtal_op_for_mutiple_finished_ops from flexkv.common.block import SequenceMeta -from flexkv.common.config import CacheConfig, ModelConfig +from flexkv.common.config import CacheConfig, ModelConfig, GLOBAL_CONFIG_FROM_ENV from flexkv.common.exceptions import InvalidConfigError, NotEnoughSpaceError from flexkv.common.transfer import ( DeviceType, TransferOpGraph, TransferOp, TransferType @@ -53,7 +53,7 @@ def __init__(self, num_total_blocks: int, tokens_per_block: int, evict_ratio: float, - hit_reward_seconds: int): + hit_reward_seconds: int = 0): if not isinstance(device_type, DeviceType): raise InvalidConfigError(f"Unknown device type: {device_type}") if num_total_blocks <= 0: @@ -152,7 +152,7 @@ def __init__(self, num_total_blocks: int, tokens_per_block: int, evict_ratio: float, - hit_reward_seconds: int): + hit_reward_seconds: int = 0): if not isinstance(device_type, DeviceType): raise InvalidConfigError(f"Unknown device type: {device_type}") if num_total_blocks <= 0: @@ -238,61 +238,65 @@ def __init__(self, cache_config: CacheConfig, model_config: ModelConfig): self.remote_cache_engine = None 
self.gds_cache_engine = None + self.index_accel = GLOBAL_CONFIG_FROM_ENV.index_accel self.cache_engines = {} + self.evict_ratio = GLOBAL_CONFIG_FROM_ENV.evict_ratio + self.hit_reward_seconds = GLOBAL_CONFIG_FROM_ENV.hit_reward_seconds + if cache_config.enable_cpu: - if cache_config.index_accel: + if self.index_accel: self.cpu_cache_engine = CacheEngineAccel(DeviceType.CPU, cache_config.num_cpu_blocks, cache_config.tokens_per_block, - cache_config.evict_ratio, - cache_config.hit_reward_seconds) + self.evict_ratio, + self.hit_reward_seconds) else: self.cpu_cache_engine = CacheEngine(DeviceType.CPU, cache_config.num_cpu_blocks, cache_config.tokens_per_block, - cache_config.evict_ratio, - cache_config.hit_reward_seconds) + self.evict_ratio, + self.hit_reward_seconds) self.cache_engines[DeviceType.CPU] = self.cpu_cache_engine if cache_config.enable_ssd: - if cache_config.index_accel: + if self.index_accel: self.ssd_cache_engine = CacheEngineAccel(DeviceType.SSD, cache_config.num_ssd_blocks, cache_config.tokens_per_block, - cache_config.evict_ratio, - cache_config.hit_reward_seconds) + self.evict_ratio, + self.hit_reward_seconds) else: self.ssd_cache_engine = CacheEngine(DeviceType.SSD, cache_config.num_ssd_blocks, cache_config.tokens_per_block, - cache_config.evict_ratio, - cache_config.hit_reward_seconds) + self.evict_ratio, + self.hit_reward_seconds) self.cache_engines[DeviceType.SSD] = self.ssd_cache_engine if cache_config.enable_remote: - if cache_config.index_accel: + if self.index_accel: self.remote_cache_engine = CacheEngineAccel(DeviceType.REMOTE, cache_config.num_remote_blocks, cache_config.tokens_per_block, - cache_config.evict_ratio, - cache_config.hit_reward_seconds) + self.evict_ratio, + self.hit_reward_seconds) else: self.remote_cache_engine = CacheEngine(DeviceType.REMOTE, cache_config.num_remote_blocks, cache_config.tokens_per_block, - cache_config.evict_ratio, - cache_config.hit_reward_seconds) + self.evict_ratio, + self.hit_reward_seconds) 
self.cache_engines[DeviceType.REMOTE] = self.remote_cache_engine if cache_config.enable_gds: - if cache_config.index_accel: + if self.index_accel: self.gds_cache_engine = CacheEngineAccel(DeviceType.GDS, cache_config.num_gds_blocks, cache_config.tokens_per_block, - cache_config.evict_ratio) + self.evict_ratio) else: self.gds_cache_engine = CacheEngine(DeviceType.GDS, cache_config.num_gds_blocks, cache_config.tokens_per_block, - cache_config.evict_ratio) + self.evict_ratio) self.cache_engines[DeviceType.GDS] = self.gds_cache_engine self._empty_get_return: Callable[[int], Tuple[TransferOpGraph, List[int], Dict, Dict, Dict, int]] = \ @@ -587,7 +591,7 @@ def _get_impl_local(self, assert self.cache_config.enable_cpu assert self.cpu_cache_engine is not None - if self.cache_config.index_accel: + if self.index_accel: cpu_matched_result, ssd_matched_result = self.match_local_accel(sequence_meta) else: cpu_matched_result, ssd_matched_result = self.match_local(sequence_meta) @@ -637,7 +641,8 @@ def _get_impl_local(self, ) transfer_graph.add_transfer_op(op_gds_transfer) finished_ops_ids.append(op_gds_transfer.op_id) - op_node_to_ready[op_gds_transfer.op_id] = (DeviceType.GDS, ssd_node_to_unlock, ssd_node_to_unlock.size()) + op_node_to_ready[op_gds_transfer.op_id] = \ + (DeviceType.GDS, ssd_node_to_unlock, ssd_node_to_unlock.size()) else: fragment2_cpu_blocks = self.cpu_cache_engine.take( num_required_blocks=fragment2_num_blocks, @@ -681,7 +686,8 @@ def _get_impl_local(self, graph_id = transfer_graph.graph_id, transfer_type = TransferType.H2D, src_block_ids = fragment12_cpu_blocks if not self.cache_config.enable_gds else fragment1_cpu_blocks, - dst_block_ids = fragment12_gpu_blocks if not self.cache_config.enable_gds else fragment12_gpu_blocks[:fragment1_num_blocks], + dst_block_ids = fragment12_gpu_blocks if not self.cache_config.enable_gds \ + else fragment12_gpu_blocks[:fragment1_num_blocks], layer_id = 0, layer_granularity = layer_num ) @@ -803,7 +809,7 @@ def 
_put_impl_global(self, assert self.cpu_cache_engine is not None assert self.remote_cache_engine is not None - if self.cache_config.index_accel: + if self.index_accel: cpu_matched_result, ssd_matched_result, remote_matched_result = self.match_all_accel(sequence_meta) else: cpu_matched_result, ssd_matched_result, remote_matched_result = self.match_all(sequence_meta) @@ -957,7 +963,7 @@ def _put_impl_local(self, assert self.cpu_cache_engine is not None # assert self.ssd_cache_engine is not None - if self.cache_config.index_accel: + if self.index_accel: cpu_matched_result, ssd_matched_result = self.match_local_accel(sequence_meta) else: cpu_matched_result, ssd_matched_result = self.match_local(sequence_meta) @@ -981,7 +987,7 @@ def _put_impl_local(self, protected_node = cpu_matched_result.last_node, strict=False ) - + # Determine which disk cache to use (GDS or SSD) disk_cache_engine = None if self.cache_config.enable_gds: @@ -997,7 +1003,7 @@ def _put_impl_local(self, ) else: fragment2_ssd_blocks = np.array([], dtype=np.int64) - + if len(fragment12_cpu_blocks) < fragment12_num_blocks or \ len(fragment2_ssd_blocks) < fragment2_num_blocks: self.cpu_cache_engine.recycle(fragment12_cpu_blocks) @@ -1122,7 +1128,7 @@ def match_local_accel(self, sequence_meta: SequenceMeta) -> Tuple[MatchResultAcc ssd_matched_result = self.gds_cache_engine.match(sequence_meta) return cpu_matched_result, ssd_matched_result - + @nvtx.annotate("Match Prefix", color="yellow") def match_local(self, sequence_meta: SequenceMeta) -> Tuple[MatchResult, MatchResult]: cpu_matched_result = MatchResult() @@ -1135,7 +1141,7 @@ def match_local(self, sequence_meta: SequenceMeta) -> Tuple[MatchResult, MatchRe ssd_matched_result = self.gds_cache_engine.match(sequence_meta) return cpu_matched_result, ssd_matched_result - + @nvtx.annotate("Match All Prefix accel", color="yellow") def match_all_accel(self, sequence_meta: SequenceMeta) -> Tuple[MatchResultAccel, MatchResultAccel, MatchResultAccel]: diff --git 
a/flexkv/common/config.py b/flexkv/common/config.py index ad0114618c..c79c1e5773 100644 --- a/flexkv/common/config.py +++ b/flexkv/common/config.py @@ -1,20 +1,22 @@ from dataclasses import dataclass from enum import Enum from typing import Optional, List, Union, Dict, Any +from argparse import Namespace +import os +import copy import torch from flexkv.common.storage import KVCacheLayout, KVCacheLayoutType - +from flexkv.common.debug import flexkv_logger @dataclass class ModelConfig: - num_layers: int - num_kv_heads: int - head_size: int + num_layers: int = 0 + num_kv_heads: int = 0 + head_size: int = 0 use_mla: bool = False dtype: torch.dtype = torch.bfloat16 - max_req_tokens = 163840 # parallel configs tp_size: int = 1 @@ -32,15 +34,6 @@ class CacheConfig: enable_ssd: bool = False enable_remote: bool = False enable_gds: bool = False - index_accel: bool = False - hit_reward_seconds: int = 0 - - # kv cache layout configs - gpu_kv_layout_type: KVCacheLayoutType = KVCacheLayoutType.LAYERWISE - cpu_kv_layout_type: KVCacheLayoutType = KVCacheLayoutType.BLOCKWISE - ssd_kv_layout_type: KVCacheLayoutType = KVCacheLayoutType.BLOCKWISE - remote_kv_layout_type: KVCacheLayoutType = KVCacheLayoutType.BLOCKWISE - gds_kv_layout_type: KVCacheLayoutType = KVCacheLayoutType.BLOCKWISE # mempool capacity configs num_cpu_blocks: int = 1000000 @@ -48,22 +41,13 @@ class CacheConfig: num_gds_blocks: int = 10000000 num_remote_blocks: Optional[int] = None - # CPU-GPU transfer configs - use_ce_transfer_h2d: bool = False - use_ce_transfer_d2h: bool = False - transfer_sms_h2d: int = 8 - transfer_sms_d2h: int = 8 - # ssd cache configs - max_blocks_per_file: int = 32000 # -1 means no limit ssd_cache_dir: Optional[Union[str, List[str]]] = None - ssd_cache_iouring_entries: int = 512 - ssd_cache_iouring_flags: int = 1 # gds cache configs gds_cache_dir: Optional[Union[str, List[str]]] = None - # remote cache configs + # remote cache configs for cfs remote_cache_size_mode: str = "file_size" # 
file_size or block_num remote_file_size: Optional[int] = None remote_file_num: Optional[int] = None @@ -71,23 +55,147 @@ class CacheConfig: remote_cache_path: Optional[Union[str, List[str]]] = None remote_config_custom: Optional[Dict[str, Any]] = None - # Trace configs - enable_trace: bool = True - trace_file_path: str = "./flexkv_trace.log" - trace_max_file_size_mb: int = 100 - trace_max_files: int = 5 - trace_flush_interval_ms: int = 1000 +GLOBAL_CONFIG_FROM_ENV: Namespace = Namespace( + server_client_mode=bool(int(os.getenv('FLEXKV_SERVER_CLIENT_MODE', 0))), + server_recv_port=os.getenv('FLEXKV_SERVER_RECV_PORT', 'ipc:///tmp/flexkv_server'), + + index_accel=bool(int(os.getenv('FLEXKV_INDEX_ACCEL', 1))), + cpu_layout_type=KVCacheLayoutType(os.getenv('FLEXKV_CPU_LAYOUT', 'BLOCKFIRST').upper()), + ssd_layout_type=KVCacheLayoutType(os.getenv('FLEXKV_SSD_LAYOUT', 'BLOCKFIRST').upper()), + remote_layout_type=KVCacheLayoutType(os.getenv('FLEXKV_REMOTE_LAYOUT', 'BLOCKFIRST').upper()), + gds_layout_type=KVCacheLayoutType(os.getenv('FLEXKV_GDS_LAYOUT', 'BLOCKFIRST').upper()), + + use_ce_transfer_h2d=bool(int(os.getenv('FLEXKV_USE_CE_TRANSFER_H2D', 0))), + use_ce_transfer_d2h=bool(int(os.getenv('FLEXKV_USE_CE_TRANSFER_D2H', 0))), + transfer_sms_h2d=int(os.getenv('FLEXKV_TRANSFER_SMS_H2D', 8)), + transfer_sms_d2h=int(os.getenv('FLEXKV_TRANSFER_SMS_D2H', 8)), + + iouring_entries=int(os.getenv('FLEXKV_IORING_ENTRIES', 512)), + iouring_flags=int(os.getenv('FLEXKV_IORING_FLAGS', 0)), - #evict ratio - evict_ratio: float = 0.0 + max_file_size_gb=float(os.getenv('FLEXKV_MAX_FILE_SIZE_GB', -1)), # -1 means no limit + + evict_ratio=float(os.getenv('FLEXKV_EVICT_RATIO', 0.05)), + hit_reward_seconds=int(os.getenv('FLEXKV_HIT_REWARD_SECONDS', 0)), + + enable_trace=bool(int(os.getenv('FLEXKV_ENABLE_TRACE', 0))), + trace_file_path=os.getenv('FLEXKV_TRACE_FILE_PATH', './flexkv_trace.log'), + trace_max_file_size_mb=int(os.getenv('FLEXKV_TRACE_MAX_FILE_SIZE_MB', 100)), + 
trace_max_files=int(os.getenv('FLEXKV_TRACE_MAX_FILES', 5)), + trace_flush_interval_ms=int(os.getenv('FLEXKV_TRACE_FLUSH_INTERVAL_MS', 1000)), +) + +@dataclass +class UserConfig: + cpu_cache_gb: int = 16 + ssd_cache_gb: int = 0 # 0 means disable ssd + ssd_cache_dir: Union[str, List[str]] = "./ssd_cache" + enable_gds: bool = False def __post_init__(self): - layout_fields = ['gpu_kv_layout_type', - 'cpu_kv_layout_type', - 'ssd_kv_layout_type', - 'remote_kv_layout_type', - 'gds_kv_layout_type'] - for field in layout_fields: - value = getattr(self, field) - if isinstance(value, str): - setattr(self, field, KVCacheLayoutType[value.upper()]) + if self.cpu_cache_gb <= 0: + raise ValueError(f"Invalid cpu_cache_gb: {self.cpu_cache_gb}") + if self.ssd_cache_gb < 0: + raise ValueError(f"Invalid ssd_cache_gb: {self.ssd_cache_gb}") + if self.ssd_cache_gb > 0 and self.ssd_cache_gb <= self.cpu_cache_gb: + raise ValueError(f"Invalid ssd_cache_gb: {self.ssd_cache_gb}, " + f"must be greater than cpu_cache_gb: {self.cpu_cache_gb}.") + +def parse_path_list(path_str: str) -> List[str]: + paths = [p.strip() for p in path_str.split(';') if p.strip()] + return paths + +def load_user_config_from_file(config_file: str) -> UserConfig: + import json + import yaml + from dataclasses import fields + + # read json config file or yaml config file + if config_file.endswith('.json'): + with open(config_file) as f: + config = json.load(f) + elif config_file.endswith('.yaml'): + with open(config_file) as f: + config = yaml.safe_load(f) + else: + raise ValueError(f"Unsupported config file extension: {config_file}") + + if 'ssd_cache_dir' in config: + config['ssd_cache_dir'] = parse_path_list(config['ssd_cache_dir']) + + defined_fields = {f.name for f in fields(UserConfig)} + known_config = {k: v for k, v in config.items() if k in defined_fields} + extra_config = {k: v for k, v in config.items() if k not in defined_fields} + + user_config = UserConfig(**known_config) + + for key, value in 
extra_config.items(): + setattr(user_config, f"override_{key}", value) + + return user_config + +def load_user_config_from_env() -> UserConfig: + return UserConfig( + cpu_cache_gb=int(os.getenv('FLEXKV_CPU_CACHE_GB', 16)), + ssd_cache_gb=int(os.getenv('FLEXKV_SSD_CACHE_GB', 0)), + ssd_cache_dir=parse_path_list(os.getenv('FLEXKV_SSD_CACHE_DIR', "./flexkv_ssd")), + enable_gds=bool(int(os.getenv('FLEXKV_ENABLE_GDS', 0))), + ) + +def convert_to_block_num(size_in_GB: float, block_size_in_bytes: int) -> int: + return int(size_in_GB * 1024 * 1024 * 1024 / block_size_in_bytes) + +def update_default_config_from_user_config(model_config: ModelConfig, + cache_config: CacheConfig, + user_config: UserConfig) -> None: + block_size_in_bytes = model_config.token_size_in_bytes * cache_config.tokens_per_block + + assert user_config.cpu_cache_gb > 0 + assert user_config.ssd_cache_gb >= 0 + + cache_config.num_cpu_blocks = convert_to_block_num(user_config.cpu_cache_gb, block_size_in_bytes) + cache_config.num_ssd_blocks = convert_to_block_num(user_config.ssd_cache_gb, block_size_in_bytes) + + cache_config.ssd_cache_dir = user_config.ssd_cache_dir + cache_config.enable_ssd = user_config.ssd_cache_gb > 0 + cache_config.enable_gds = user_config.enable_gds + + if cache_config.num_ssd_blocks % len(cache_config.ssd_cache_dir) != 0: + cache_config.num_ssd_blocks = \ + cache_config.num_ssd_blocks // len(cache_config.ssd_cache_dir) * len(cache_config.ssd_cache_dir) + flexkv_logger.warning(f"num_ssd_blocks is not a multiple of num_ssd_devices, " + f"adjust num_ssd_blocks to {cache_config.num_ssd_blocks}") + + global_config_attrs = set(vars(GLOBAL_CONFIG_FROM_ENV).keys()) + for attr_name in dir(user_config): + if attr_name.startswith('override_'): + global_attr_name = attr_name[9:] # len('override_') = 9 + if global_attr_name in global_config_attrs: + attr_value = getattr(user_config, attr_name) + original_value = getattr(GLOBAL_CONFIG_FROM_ENV, global_attr_name) + + original_type = 
type(original_value) + + try: + if original_type is bool: + if isinstance(attr_value, str): + attr_value = attr_value.lower() in ('true', '1', 'yes') + else: + attr_value = bool(int(attr_value)) + elif issubclass(original_type, Enum): # KVCacheLayoutType + if isinstance(attr_value, str): + attr_value = original_type(attr_value.upper()) + elif not isinstance(attr_value, original_type): + attr_value = original_type(attr_value) + else: + attr_value = original_type(attr_value) + except (ValueError, TypeError) as e: + raise ValueError(f"Cannot convert config value '{attr_value}' to type {original_type.__name__} " + f"for config '{global_attr_name}': {e}") from e + + setattr(GLOBAL_CONFIG_FROM_ENV, global_attr_name, attr_value) + flexkv_logger.info(f"Override environment variable: {'FLEXKV_' + global_attr_name.upper()} " + f"to {attr_value} from config file.") + else: + raise ValueError(f"Unknown config name: {global_attr_name} in config file, " + f"available config names: {global_config_attrs}") diff --git a/flexkv/common/storage.py b/flexkv/common/storage.py index 32587dbc76..53ab859c2d 100644 --- a/flexkv/common/storage.py +++ b/flexkv/common/storage.py @@ -14,11 +14,11 @@ class AccessHandleType(Enum): TENSOR_HANDLE = auto() # single tensor handle or tensor handle list GDS_MANAGER = auto() -# NOTE: currently, we assume that the layout type of GPU should always be layerwise -# and the layout type of CPU, SSD, remote should be the same, either laywise or blockwise +# NOTE: currently, we assume that the layout type of GPU should always be LAYERFIRST +# and the layout type of CPU, SSD, remote should be the same, either laywise or BLOCKFIRST class KVCacheLayoutType(Enum): - LAYERWISE = "LAYERWISE" - BLOCKWISE = "BLOCKWISE" + LAYERFIRST = "LAYERFIRST" + BLOCKFIRST = "BLOCKFIRST" @dataclass class KVCacheLayout: @@ -59,14 +59,14 @@ def __post_init__(self) -> None: def _compute_kv_shape(self) -> None: if self._kv_shape is None: - if self.type == KVCacheLayoutType.LAYERWISE: # 
for layerwise transfer + if self.type == KVCacheLayoutType.LAYERFIRST: # for Layerwise transfer self._kv_shape = torch.Size([self.num_layer, self._kv_dim, self.num_block, self.tokens_per_block, self.num_head, self.head_size]) - elif self.type == KVCacheLayoutType.BLOCKWISE: + elif self.type == KVCacheLayoutType.BLOCKFIRST: self._kv_shape = torch.Size([self.num_block, self.num_layer, self._kv_dim, @@ -126,25 +126,25 @@ def get_chunk_size(self) -> int: return self.tokens_per_block * self.num_head * self.head_size def get_layer_stride(self) -> int: - if self.type == KVCacheLayoutType.LAYERWISE: + if self.type == KVCacheLayoutType.LAYERFIRST: return self.kv_shape[1:].numel() - elif self.type == KVCacheLayoutType.BLOCKWISE: + elif self.type == KVCacheLayoutType.BLOCKFIRST: return self.kv_shape[2:].numel() else: raise ValueError(f"Invalid KVCacheLayoutType: {self.type}") def get_block_stride(self) -> int: - if self.type == KVCacheLayoutType.LAYERWISE: + if self.type == KVCacheLayoutType.LAYERFIRST: return self.kv_shape[3:].numel() - elif self.type == KVCacheLayoutType.BLOCKWISE: + elif self.type == KVCacheLayoutType.BLOCKFIRST: return self.kv_shape[1:].numel() else: raise ValueError(f"Invalid KVCacheLayoutType: {self.type}") def get_kv_stride(self) -> int: - if self.type == KVCacheLayoutType.LAYERWISE: + if self.type == KVCacheLayoutType.LAYERFIRST: return self.kv_shape[2:].numel() - elif self.type == KVCacheLayoutType.BLOCKWISE: + elif self.type == KVCacheLayoutType.BLOCKFIRST: return self.kv_shape[3:].numel() else: raise ValueError(f"Invalid KVCacheLayoutType: {self.type}") diff --git a/flexkv/common/tracer.py b/flexkv/common/tracer.py index 765fed0a9f..7efb107244 100644 --- a/flexkv/common/tracer.py +++ b/flexkv/common/tracer.py @@ -3,25 +3,25 @@ import threading import time from datetime import datetime -from typing import Any, Dict, Optional, List, Union +from typing import Any, Optional, List, Union import torch import numpy as np -from flexkv.common.config import 
CacheConfig +from flexkv.common.config import GLOBAL_CONFIG_FROM_ENV class FlexKVTracer: """FlexKV Tracer class for recording operations in JSON format""" - def __init__(self, cache_config: CacheConfig): - self.enabled = cache_config.enable_trace + def __init__(self): + self.enabled = GLOBAL_CONFIG_FROM_ENV.enable_trace if not self.enabled: return - print(f"FlexKVTracer enabled, trace_file_path: {cache_config.trace_file_path}") - self.trace_file_path = cache_config.trace_file_path - self.max_file_size_mb = cache_config.trace_max_file_size_mb - self.max_files = cache_config.trace_max_files - self.flush_interval_ms = cache_config.trace_flush_interval_ms + print(f"FlexKVTracer enabled, trace_file_path: {GLOBAL_CONFIG_FROM_ENV.trace_file_path}") + self.trace_file_path = GLOBAL_CONFIG_FROM_ENV.trace_file_path + self.max_file_size_mb = GLOBAL_CONFIG_FROM_ENV.trace_max_file_size_mb + self.max_files = GLOBAL_CONFIG_FROM_ENV.trace_max_files + self.flush_interval_ms = GLOBAL_CONFIG_FROM_ENV.trace_flush_interval_ms # Thread-safe file writing self._lock = threading.Lock() @@ -116,25 +116,40 @@ def trace_config(self, model_config, cache_config, gpu_layout=None): "enable_cpu": cache_config.enable_cpu, "enable_ssd": cache_config.enable_ssd, "enable_remote": cache_config.enable_remote, - "gpu_kv_layout_type": str(cache_config.gpu_kv_layout_type), - "cpu_kv_layout_type": str(cache_config.cpu_kv_layout_type), - "ssd_kv_layout_type": str(cache_config.ssd_kv_layout_type), - "remote_kv_layout_type": str(cache_config.remote_kv_layout_type), "enable_gds": cache_config.enable_gds, - "remote_cache_size_mode": cache_config.remote_cache_size_mode, "num_cpu_blocks": cache_config.num_cpu_blocks, "num_ssd_blocks": cache_config.num_ssd_blocks, + "num_gds_blocks": cache_config.num_gds_blocks, "num_remote_blocks": cache_config.num_remote_blocks, + "ssd_cache_dir": cache_config.ssd_cache_dir, + "gds_cache_dir": cache_config.gds_cache_dir, + "remote_cache_size_mode": 
cache_config.remote_cache_size_mode, "remote_file_size": cache_config.remote_file_size, "remote_file_num": cache_config.remote_file_num, "remote_file_prefix": cache_config.remote_file_prefix, - "ssd_cache_dir": cache_config.ssd_cache_dir, - "ssd_cache_iouring_entries": cache_config.ssd_cache_iouring_entries, - "ssd_cache_iouring_flags": cache_config.ssd_cache_iouring_flags, - "gds_cache_dir": cache_config.gds_cache_dir, "remote_cache_path": cache_config.remote_cache_path, "remote_config_custom": cache_config.remote_config_custom, - "evict_ratio": cache_config.evict_ratio, + } + + # Convert GLOBAL_CONFIG_FROM_ENV to dict + from flexkv.common.config import GLOBAL_CONFIG_FROM_ENV + global_config_dict = { + "server_client_mode": GLOBAL_CONFIG_FROM_ENV.server_client_mode, + "server_recv_port": GLOBAL_CONFIG_FROM_ENV.server_recv_port, + "index_accel": GLOBAL_CONFIG_FROM_ENV.index_accel, + "cpu_layout_type": str(GLOBAL_CONFIG_FROM_ENV.cpu_layout_type), + "ssd_layout_type": str(GLOBAL_CONFIG_FROM_ENV.ssd_layout_type), + "remote_layout_type": str(GLOBAL_CONFIG_FROM_ENV.remote_layout_type), + "gds_layout_type": str(GLOBAL_CONFIG_FROM_ENV.gds_layout_type), + "use_ce_transfer_h2d": GLOBAL_CONFIG_FROM_ENV.use_ce_transfer_h2d, + "use_ce_transfer_d2h": GLOBAL_CONFIG_FROM_ENV.use_ce_transfer_d2h, + "transfer_sms_h2d": GLOBAL_CONFIG_FROM_ENV.transfer_sms_h2d, + "transfer_sms_d2h": GLOBAL_CONFIG_FROM_ENV.transfer_sms_d2h, + "iouring_entries": GLOBAL_CONFIG_FROM_ENV.iouring_entries, + "iouring_flags": GLOBAL_CONFIG_FROM_ENV.iouring_flags, + "max_file_size_gb": GLOBAL_CONFIG_FROM_ENV.max_file_size_gb, + "evict_ratio": GLOBAL_CONFIG_FROM_ENV.evict_ratio, + # Note: trace-related configs are excluded as they should not affect replay } # Convert gpu_layout to dict if provided @@ -157,6 +172,7 @@ def trace_config(self, model_config, cache_config, gpu_layout=None): "data": { "model_config": model_config_dict, "cache_config": cache_config_dict, + "global_config": global_config_dict, 
"gpu_layout": gpu_layout_dict, } } @@ -174,9 +190,9 @@ def trace_config(self, model_config, cache_config, gpu_layout=None): def trace_request(self, request_type: str, request_id: int, - token_ids: torch.Tensor, - slot_mapping: torch.Tensor, - token_mask: Optional[torch.Tensor] = None, + token_ids: Union[torch.Tensor, np.ndarray], + slot_mapping: Union[torch.Tensor, np.ndarray], + token_mask: Optional[Union[torch.Tensor, np.ndarray]] = None, layer_granularity: int = -1, dp_id: int = 0, **kwargs): @@ -186,7 +202,7 @@ def trace_request(self, timestamp = datetime.now().isoformat() - # Convert tensors to lists for JSON serialization + # Convert tensors/arrays to lists for JSON serialization data = { "request_type": request_type, "request_id": request_id, @@ -221,6 +237,8 @@ def trace_request(self, def trace_wait_request(self, wait_type: str, task_ids: Union[int, List[int]], + timeout: Optional[float] = None, + completely: Optional[bool] = None, layer_group_id: Optional[int] = None): """Record a wait operation""" if not self.enabled: @@ -237,6 +255,8 @@ def trace_wait_request(self, data = { "wait_type": wait_type, "task_ids": task_ids_list, + "timeout": timeout, + "completely": completely, "layer_group_id": layer_group_id, } record = { @@ -256,6 +276,45 @@ def trace_wait_request(self, if (current_time - self._last_flush_time) * 1000 >= self.flush_interval_ms: self._flush_buffer() + def trace_launch_tasks(self, + task_ids: List[int], + slot_mappings: List[Union[torch.Tensor, np.ndarray]]): + """Record a launch_tasks operation""" + if not self.enabled: + return + + timestamp = datetime.now().isoformat() + + # Convert slot_mappings to lists + slot_mappings_list = [] + slot_mappings_shapes = [] + for slot_mapping in slot_mappings: + slot_mappings_list.append(self._convert_tensor_to_list(slot_mapping)) + slot_mappings_shapes.append(list(slot_mapping.shape)) + + data = { + "task_ids": task_ids, + "slot_mappings": slot_mappings_list, + "slot_mappings_shapes": 
slot_mappings_shapes, + } + + record = { + "timestamp": timestamp, + "event_type": "launch_tasks", + "component": "KVManager", + "data": data + } + + json_record = json.dumps(record, ensure_ascii=False, separators=(',', ':')) + + with self._lock: + self._buffer.append(json_record) + + # Check if we need to flush + current_time = time.time() + if (current_time - self._last_flush_time) * 1000 >= self.flush_interval_ms: + self._flush_buffer() + def flush(self): """Manually flush all buffered records""" if not self.enabled: diff --git a/flexkv/integration/config.py b/flexkv/integration/config.py index b437657c3e..12d4d8ef14 100644 --- a/flexkv/integration/config.py +++ b/flexkv/integration/config.py @@ -7,6 +7,7 @@ from dataclasses import dataclass, field from flexkv.common.debug import flexkv_logger +from flexkv.common.config import * if TYPE_CHECKING: from vllm.v1.kv_cache_interface import KVCacheConfig, FullAttentionSpec @@ -17,53 +18,53 @@ @dataclass class FlexKVConfig: + enable_flexkv: bool = True + #base config - server_recv_port: str - + server_recv_port: str = "" + # cache config - cache_config: dict = field(default_factory=dict) - + cache_config: CacheConfig = field(default_factory=CacheConfig) + # model config - block_size: int = None - num_layers: int = None - num_kv_heads: int = None - head_size: int = None - dtype: torch.dtype = None - use_mla: bool = False - tp_size: int = 1 - dp_size: int = 1 - # log config - num_log_interval_requests: int = 200 - + model_config: ModelConfig = field(default_factory=ModelConfig) + + # user config + user_config: UserConfig = field(default_factory=UserConfig) + + def __post_init__(self): + if self.server_recv_port == "": + self.server_recv_port = GLOBAL_CONFIG_FROM_ENV.server_recv_port + update_default_config_from_user_config(self.model_config, self.cache_config, self.user_config) + @classmethod def from_env(cls) -> 'FlexKVConfig': + enable_flexkv = bool(int(os.getenv('ENABLE_FLEXKV', 1))) config_file_path = 
os.getenv('FLEXKV_CONFIG_PATH', None) - logger.info(f"{config_file_path=}") if config_file_path is None: - return cls(enable_flexkv=False, - server_recv_port="") - - assert config_file_path.endswith(".json"), "flexkv config must be a json file." - - with open(config_file_path, 'r') as f: - config_dict: dict = json.load(f) - logger.info(f"FlexKV Config Dict: {config_dict}") - - return cls( - server_recv_port=config_dict.get("server_recv_port", f"ipc:///tmp/flexkv_test"), - cache_config=config_dict.get("cache_config", {}), - num_log_interval_requests=config_dict.get("num_log_interval_requests", 200), - ) - + logger.info("No flexkv config file provided, please set FLEXKV_CONFIG_PATH environment variable.") + logger.info("Loading flexkv config from environment variables.") + user_config = load_user_config_from_env() + return cls(enable_flexkv=enable_flexkv, + user_config=user_config) + else: + logger.info(f"Loading flexkv config from file: {config_file_path}") + user_config = load_user_config_from_file(config_file_path) + return cls(enable_flexkv=enable_flexkv, + user_config=user_config) + def post_init_from_vllm_config( - self, + self, vllm_config: "VllmConfig", ): - self.num_layers = vllm_config.model_config.get_num_layers(vllm_config.parallel_config) - self.block_size = vllm_config.cache_config.block_size - self.num_kv_heads = vllm_config.model_config.get_total_num_kv_heads() - self.head_size = vllm_config.model_config.get_head_size() - self.dtype = vllm_config.model_config.dtype - self.use_mla = vllm_config.model_config.is_deepseek_mla - self.tp_size = vllm_config.parallel_config.tensor_parallel_size - self.dp_size = vllm_config.parallel_config.data_parallel_size \ No newline at end of file + self.cache_config.tokens_per_block = vllm_config.cache_config.block_size + + self.model_config.num_layers = vllm_config.model_config.get_num_layers(vllm_config.parallel_config) + self.model_config.num_kv_heads = vllm_config.model_config.get_total_num_kv_heads() + 
self.model_config.head_size = vllm_config.model_config.get_head_size() + self.model_config.dtype = vllm_config.model_config.dtype + self.model_config.use_mla = vllm_config.model_config.is_deepseek_mla + self.model_config.tp_size = vllm_config.parallel_config.tensor_parallel_size + self.model_config.dp_size = vllm_config.parallel_config.data_parallel_size + + self.__post_init__() diff --git a/flexkv/integration/vllm/vllm_v1_adapter.py b/flexkv/integration/vllm/vllm_v1_adapter.py index 7bec7141fd..cdeec495d5 100644 --- a/flexkv/integration/vllm/vllm_v1_adapter.py +++ b/flexkv/integration/vllm/vllm_v1_adapter.py @@ -10,7 +10,6 @@ from flexkv.kvmanager import KVManager from flexkv.server.client import KVTPClient from flexkv.common.storage import KVCacheLayout, KVCacheLayoutType -from flexkv.common.config import ModelConfig, CacheConfig from flexkv.common.request import KVResponseStatus from flexkv.common.debug import flexkv_logger from flexkv.integration.stats import FlexKVStats @@ -119,29 +118,15 @@ def __init__( dp_rank: int = 0, ): logger.info(f"Start init FlexKVSchedulerConnector with {flexkv_config}") - self.flexkv_config = flexkv_config self.server_recv_port = flexkv_config.server_recv_port - self.tp_size = flexkv_config.tp_size - self.dp_size = flexkv_config.dp_size - self.block_size = flexkv_config.block_size - self.model_config = ModelConfig( - num_layers=flexkv_config.num_layers, - num_kv_heads=flexkv_config.num_kv_heads, - head_size=flexkv_config.head_size, - use_mla=flexkv_config.use_mla, - dtype=flexkv_config.dtype, - tp_size=flexkv_config.tp_size, - dp_size=flexkv_config.dp_size, - ) - if "tokens_per_block" in flexkv_config.cache_config: - assert flexkv_config.cache_config.pop("tokens_per_block") == flexkv_config.block_size - self.cache_config = CacheConfig( - tokens_per_block=flexkv_config.block_size, - **flexkv_config.cache_config, - ) + self.tp_size = flexkv_config.model_config.tp_size + self.dp_size = flexkv_config.model_config.dp_size + 
self.block_size = flexkv_config.cache_config.tokens_per_block + self.model_config = flexkv_config.model_config + self.cache_config = flexkv_config.cache_config self.flexkv_manager = KVManager(model_config=self.model_config, cache_config=self.cache_config, - gpu_register_port=flexkv_config.server_recv_port, + server_recv_port=flexkv_config.server_recv_port, dp_client_id=dp_rank) self.flexkv_manager.start() # self.dp_client = KVDPClient(self.server_recv_port, self.model_config) @@ -155,7 +140,7 @@ def __init__( self.tasks_to_launch: dict[int, FlexKVTask] = {} self.tasks_to_cancel: dict[int, FlexKVTask] = {} - self.flexkv_stats = FlexKVStats(flexkv_config.num_log_interval_requests) + self.flexkv_stats = FlexKVStats(os.getenv('FLEXKV_NUM_LOG_INTERVAL_REQUESTS', 200)) while not self.is_ready(): logger.info("Waiting for flexkv init...") @@ -532,9 +517,10 @@ def __init__( flexkv_config: FlexKVConfig, dp_client_id: int, ): - current_device_id = torch.cuda.current_device() + dp_client_id * flexkv_config.tp_size + current_device_id = torch.cuda.current_device() + dp_client_id * flexkv_config.model_config.tp_size self.flexkv_config = flexkv_config - logger.info(f"Start init FlexKVWorkerConnector to {flexkv_config.server_recv_port}, dp_client_id: {dp_client_id}") + logger.info(f"Start init FlexKVWorkerConnector to {flexkv_config.server_recv_port}, \ + dp_client_id: {dp_client_id}") self.tp_client = KVTPClient(flexkv_config.server_recv_port, dp_client_id, current_device_id) logger.info("Finish init FlexKVWorkerConnector") @@ -542,7 +528,7 @@ def register_to_server(self, kv_caches: dict[str, torch.Tensor]): logger.info("Start register kv_caches") gpu_blocks = list(kv_caches.values()) num_layer = len(kv_caches) - if self.flexkv_config.use_mla: + if self.flexkv_config.model_config.use_mla: assert gpu_blocks[0].ndim == 3, ( f"expect kv cached tensor has 3 dim but get shape={gpu_blocks[0].shape}.") num_blocks = gpu_blocks[0].shape[0] @@ -557,13 +543,13 @@ def 
register_to_server(self, kv_caches: dict[str, torch.Tensor]): num_kv_heads = gpu_blocks[0].shape[3] head_size = gpu_blocks[0].shape[4] gpu_layout = KVCacheLayout( - type=KVCacheLayoutType.LAYERWISE, + type=KVCacheLayoutType.LAYERFIRST, num_layer=num_layer, num_block=num_blocks, tokens_per_block=block_size, num_head=num_kv_heads, head_size=head_size, - is_mla=self.flexkv_config.use_mla, + is_mla=self.flexkv_config.model_config.use_mla, ) self.tp_client.register_to_server(gpu_blocks, gpu_layout) logger.info("Finish register kv_caches") diff --git a/flexkv/kvmanager.py b/flexkv/kvmanager.py index cac6355da7..cbf795648b 100644 --- a/flexkv/kvmanager.py +++ b/flexkv/kvmanager.py @@ -22,7 +22,7 @@ from flexkv.server.client import KVDPClient from flexkv.server.server import KVServer, DPClient from flexkv.kvtask import KVTaskEngine, KVResponse -from flexkv.common.config import ModelConfig, CacheConfig +from flexkv.common.config import ModelConfig, CacheConfig, GLOBAL_CONFIG_FROM_ENV from flexkv.common.debug import flexkv_logger @@ -30,16 +30,21 @@ class KVManager: def __init__(self, model_config: ModelConfig, cache_config: CacheConfig, - gpu_register_port: Optional[str] = None, - server_recv_port: Optional[str] = None, - dp_client_id: int = 0): + dp_client_id: int = 0, + server_recv_port: str = ""): flexkv_logger.info(f"{model_config = }") flexkv_logger.info(f"{cache_config = }") + flexkv_logger.info(f"{GLOBAL_CONFIG_FROM_ENV = }") self.model_config = model_config self.cache_config = cache_config - self.gpu_register_port = gpu_register_port if gpu_register_port is not None else "ipc:///tmp/flexkv_test_gpu_register" - self.server_recv_port = server_recv_port if server_recv_port is not None else "ipc:///tmp/flexkv_test_server" - self.server_client_mode = model_config.dp_size > 1 + + if server_recv_port != "": + self.server_recv_port = server_recv_port + else: + self.server_recv_port = GLOBAL_CONFIG_FROM_ENV.server_recv_port + self.gpu_register_port = self.server_recv_port + 
"_gpu_register" + + self.server_client_mode = model_config.dp_size > 1 or GLOBAL_CONFIG_FROM_ENV.server_client_mode self.dp_client_id = dp_client_id flexkv_logger.info(f"server_client_mode: {self.server_client_mode}") if self.server_client_mode: @@ -50,17 +55,17 @@ def __init__(self, # Example: inherit_env = False # to not inherit parent env self.server_handle = KVServer.create_server(model_config=model_config, cache_config=cache_config, - gpu_register_port=gpu_register_port, + gpu_register_port=self.gpu_register_port, server_recv_port=self.server_recv_port, inherit_env=False) - + else: self.server_handle = None self.dp_client = KVDPClient(self.server_recv_port, self.model_config, dp_client_id) else: self.server_handle = None - self.kv_task_engine = KVTaskEngine(model_config, cache_config, gpu_register_port) - + self.kv_task_engine = KVTaskEngine(model_config, cache_config, self.gpu_register_port) + @property def dpclient_id(self) -> int: return self.dp_client_id diff --git a/flexkv/kvtask.py b/flexkv/kvtask.py index c62ea8ed33..735e452a47 100644 --- a/flexkv/kvtask.py +++ b/flexkv/kvtask.py @@ -375,7 +375,9 @@ def __init__(self, gpu_register_port: Optional[str] = None, ): super().__init__(model_config, cache_config, gpu_register_port) - self.tracer = FlexKVTracer(cache_config) + self.tracer = FlexKVTracer() + # trace config + self.tracer.trace_config(model_config, cache_config, gpu_layout=None) def get_async(self, token_ids: np.ndarray, @@ -391,6 +393,16 @@ def get_async(self, layer_granularity=layer_granularity, dp_id=dp_id, task_id=task_id) + # trace get request + self.tracer.trace_request( + request_type="GET", + request_id=task_id, + token_ids=token_ids, + slot_mapping=slot_mapping, + token_mask=token_mask, + layer_granularity=layer_granularity, + dp_id=dp_id + ) self._launch_task(task_id) return task_id, return_mask @@ -406,6 +418,16 @@ def put_async(self, token_mask=token_mask, dp_id=dp_id, task_id=task_id) + # trace put request + self.tracer.trace_request( 
+ request_type="PUT", + request_id=task_id, + token_ids=token_ids, + slot_mapping=slot_mapping, + token_mask=token_mask, + layer_granularity=-1, # put has no layer_granularity parameter + dp_id=dp_id + ) self._launch_task(task_id) return task_id, return_mask @@ -464,6 +486,13 @@ def try_wait(self, task_ids: Union[int, List[int]]) -> Dict[int, KVResponse]: if isinstance(task_ids, int): task_ids = [task_ids] nvtx.mark(f"try_wait task_ids: {task_ids}") + # trace try_wait request + self.tracer.trace_wait_request( + wait_type="try_wait", + task_ids=task_ids, + timeout=None, # try_wait doesn't have explicit timeout + completely=False + ) return_responses = self._wait_impl(task_ids, completely=False, only_return_finished=True) @@ -476,6 +505,13 @@ def wait(self, if isinstance(task_ids, int): task_ids = [task_ids] nvtx.push_range(f"wait task_ids: {task_ids}", color=get_nvtx_default_color()) + # trace wait request + self.tracer.trace_wait_request( + wait_type="wait", + task_ids=task_ids, + timeout=timeout, + completely=completely + ) return_responses = self._wait_impl(task_ids, timeout, completely=completely) nvtx.pop_range() return return_responses @@ -489,13 +525,24 @@ def get_match(self, if token_mask is None: token_mask = np.ones_like(token_ids, dtype=bool) fake_slot_mapping = np.zeros_like(token_ids[token_mask]) - return self._get_match_impl(token_ids, - fake_slot_mapping, - is_fake_slot_mapping=True, - token_mask=token_mask, - layer_granularity=layer_granularity, - dp_id=dp_id, - task_id=task_id) + result_task_id, return_mask = self._get_match_impl(token_ids, + fake_slot_mapping, + is_fake_slot_mapping=True, + token_mask=token_mask, + layer_granularity=layer_granularity, + dp_id=dp_id, + task_id=task_id) + # trace get match request + self.tracer.trace_request( + request_type="GET_MATCH", + request_id=result_task_id, + token_ids=token_ids, + slot_mapping=fake_slot_mapping, + token_mask=token_mask, + layer_granularity=layer_granularity, + dp_id=dp_id + ) + return 
result_task_id, return_mask def _get_match_impl(self, token_ids: np.ndarray, @@ -529,12 +576,23 @@ def put_match(self, dp_id: int = 0, task_id: int = -1) -> Tuple[int, np.ndarray]: fake_slot_mapping = np.zeros_like(token_ids) - return self._put_match_impl(token_ids, - fake_slot_mapping, - is_fake_slot_mapping=True, - token_mask=token_mask, - dp_id=dp_id, - task_id=task_id) + result_task_id, return_mask = self._put_match_impl(token_ids, + fake_slot_mapping, + is_fake_slot_mapping=True, + token_mask=token_mask, + dp_id=dp_id, + task_id=task_id) + # trace put match request + self.tracer.trace_request( + request_type="PUT_MATCH", + request_id=result_task_id, + token_ids=token_ids, + slot_mapping=fake_slot_mapping, + token_mask=token_mask, + layer_granularity=-1, # put has no layer_granularity parameter + dp_id=dp_id + ) + return result_task_id, return_mask def _put_match_impl(self, token_ids: np.ndarray, @@ -562,6 +620,8 @@ def launch_tasks(self, task_ids: List[int], slot_mappings: List[np.ndarray]) -> None: assert isinstance(slot_mappings[0], np.ndarray) + # trace launch tasks + self.tracer.trace_launch_tasks(task_ids, slot_mappings) self.set_slot_mappings(task_ids, slot_mappings) for task_id in task_ids: self._launch_task(task_id) diff --git a/flexkv/server/server.py b/flexkv/server/server.py index 5e678b6a59..95b2abc101 100644 --- a/flexkv/server/server.py +++ b/flexkv/server/server.py @@ -174,14 +174,14 @@ def create_server(cls, # Set spawn method for CUDA compatibility with contextlib.suppress(RuntimeError): mp.set_start_method("spawn") - + # Prepare environment variables for child process if child_env is not None or not inherit_env: # Use subprocess for better environment control import subprocess import pickle import sys - + # Prepare environment if inherit_env: env = os.environ.copy() @@ -189,10 +189,10 @@ def create_server(cls, env.update(child_env) else: env = child_env or {} - + # Serialize arguments args_data = pickle.dumps((model_config, cache_config, 
gpu_register_port, server_recv_port)) - + # Start subprocess flexkv_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) server_script = textwrap.dedent(f''' @@ -209,7 +209,7 @@ def create_server(cls, process = subprocess.Popen([ sys.executable, '-c', server_script ], env=env) - + flexkv_logger.info(f"KVServer subprocess started, PID: {process.pid}") return KVServerHandle(process) else: @@ -389,7 +389,7 @@ def __del__(self) -> None: tokens_per_block = 4 gpu_kv_layout = KVCacheLayout( - type=KVCacheLayoutType.LAYERWISE, + type=KVCacheLayoutType.LAYERFIRST, num_layer=num_layers, num_block=num_gpu_blocks, tokens_per_block=tokens_per_block, diff --git a/flexkv/storage/allocator.py b/flexkv/storage/allocator.py index 73310f4cfa..479f8a4bcc 100644 --- a/flexkv/storage/allocator.py +++ b/flexkv/storage/allocator.py @@ -136,9 +136,9 @@ def allocate(cls, **kwargs: Any) -> StorageHandle: cache_dir = kwargs.get("cache_dir") file_prefix = kwargs.get("file_prefix", "flexkv_ssd_cache") - cfg_max_blocks_per_file = kwargs.get("max_blocks_per_file", -1) - if cfg_max_blocks_per_file == -1: - cfg_max_blocks_per_file = int(1e9) + cfg_max_file_size_gb = kwargs.get("max_file_size_gb", -1) + cfg_max_blocks_per_file = int(1e9) + if cache_dir is None: raise ValueError("cache_dir is required for SSD allocator") if isinstance(cache_dir, str): @@ -159,6 +159,10 @@ def allocate(cls, total_blocks_per_device = layout.num_block // num_ssd_devices block_size = layout.get_elements_per_block() * dtype.itemsize + if cfg_max_file_size_gb != -1: + cfg_max_blocks_per_file = int(cfg_max_file_size_gb * 1024 * 1024 * 1024 // block_size) + + fsys_max_blocks_per_file = cls.get_file_size_limit(cache_dir[0]) // block_size num_blocks_per_file = min(fsys_max_blocks_per_file, cfg_max_blocks_per_file) @@ -173,7 +177,10 @@ def allocate(cls, with open(file_path, "wb+", buffering=0) as file: cls._create_file(file, real_file_size) ssd_files[i].append(file_path) - + total_num_files = num_files_per_device 
* num_ssd_devices + real_total_size = total_num_files * real_file_size + flexkv_logger.info(f"SSD allocator create total {total_num_files} files in {cache_dir}, " + f"each file has {real_file_size/1024/1024/1024:.2f} GB, total size {real_total_size/1024/1024/1024:.2f} GB") return StorageHandle( handle_type=AccessHandleType.FILE, data=ssd_files, diff --git a/flexkv/storage/storage_engine.py b/flexkv/storage/storage_engine.py index 82d4073917..44f98e1356 100644 --- a/flexkv/storage/storage_engine.py +++ b/flexkv/storage/storage_engine.py @@ -4,7 +4,7 @@ import torch -from flexkv.common.config import ModelConfig, CacheConfig +from flexkv.common.config import ModelConfig, CacheConfig, GLOBAL_CONFIG_FROM_ENV from flexkv.common.memory_handle import TensorSharedHandle from flexkv.common.storage import StorageHandle, KVCacheLayout, KVCacheLayoutType from flexkv.common.transfer import DeviceType @@ -19,11 +19,9 @@ def __init__(self, self._storage_handles: Dict[Tuple[DeviceType, int], StorageHandle] = {} self._model_config = model_config self._cache_config = cache_config - if not self._cache_config.gpu_kv_layout_type == KVCacheLayoutType.LAYERWISE: - raise ValueError("Only layerwise layout is supported for GPU") if self._cache_config.enable_cpu: self._cpu_layout: Optional[KVCacheLayout] = KVCacheLayout( - type=self._cache_config.cpu_kv_layout_type, + type=GLOBAL_CONFIG_FROM_ENV.cpu_layout_type, num_layer=self._model_config.num_layers, num_block=self._cache_config.num_cpu_blocks, tokens_per_block=self._cache_config.tokens_per_block, @@ -37,10 +35,10 @@ def __init__(self, dtype=self._model_config.dtype, ) if self._cache_config.enable_ssd: - if not self._cache_config.ssd_kv_layout_type == self._cpu_layout.type: + if not GLOBAL_CONFIG_FROM_ENV.ssd_layout_type == self._cpu_layout.type: raise ValueError(f"SSD layout type must be the same as CPU layout type: {self._cpu_layout.type}") self._ssd_layout: Optional[KVCacheLayout] = KVCacheLayout( - 
type=self._cache_config.ssd_kv_layout_type, + type=GLOBAL_CONFIG_FROM_ENV.ssd_layout_type, num_layer=self._model_config.num_layers, num_block=self._cache_config.num_ssd_blocks, tokens_per_block=self._cache_config.tokens_per_block, @@ -53,13 +51,13 @@ def __init__(self, layout=self._ssd_layout, dtype=self._model_config.dtype, cache_dir=self._cache_config.ssd_cache_dir, - max_blocks_per_file=self._cache_config.max_blocks_per_file + max_file_size_gb=GLOBAL_CONFIG_FROM_ENV.max_file_size_gb ) if self._cache_config.enable_remote: - if not self._cache_config.remote_kv_layout_type == self._cpu_layout.type: + if not GLOBAL_CONFIG_FROM_ENV.remote_layout_type == self._cpu_layout.type: raise ValueError(f"Remote layout type must be the same as CPU layout type: {self._cpu_layout.type}") self._remote_layout: Optional[KVCacheLayout] = KVCacheLayout( - type=self._cache_config.remote_kv_layout_type, + type=GLOBAL_CONFIG_FROM_ENV.remote_layout_type, num_layer=self._model_config.num_layers, num_block=self._cache_config.num_remote_blocks, tokens_per_block=self._cache_config.tokens_per_block, @@ -76,11 +74,11 @@ def __init__(self, ) if self._cache_config.enable_gds: # GDS should follow similar constraints as CPU/SSD/Remote - if not self._cache_config.gds_kv_layout_type == self._cpu_layout.type: + if not GLOBAL_CONFIG_FROM_ENV.gds_layout_type == self._cpu_layout.type: raise ValueError(f"GDS layout type must be the same as CPU layout type: {self._cpu_layout.type}") - + self._gds_layout: Optional[KVCacheLayout] = KVCacheLayout( - type=self._cache_config.gds_kv_layout_type, + type=GLOBAL_CONFIG_FROM_ENV.gds_layout_type, num_layer=self._model_config.num_layers, num_block=self._cache_config.num_gds_blocks, tokens_per_block=self._cache_config.tokens_per_block, @@ -93,7 +91,7 @@ def __init__(self, layout=self._gds_layout, dtype=self._model_config.dtype, gds_cache_dir=self._cache_config.gds_cache_dir, - max_blocks_per_file=self._cache_config.max_blocks_per_file + 
max_file_size_gb=GLOBAL_CONFIG_FROM_ENV.max_file_size_gb ) def register_gpu_blocks(self, @@ -175,7 +173,7 @@ def allocate(self, ) elif device_type == DeviceType.SSD: cache_dir = kwargs.get('cache_dir') - max_blocks_per_file = kwargs.get('max_blocks_per_file', -1) + max_file_size_gb = kwargs.get('max_file_size_gb', -1) if raw_data is not None: assert isinstance(raw_data, str) or \ (isinstance(raw_data, list) and all(isinstance(x, str) for x in raw_data)), \ @@ -193,7 +191,7 @@ def allocate(self, dtype=dtype, cache_dir=cache_dir, file_prefix="flexkv_ssd_cache", - max_blocks_per_file=max_blocks_per_file + max_file_size_gb=max_file_size_gb ) elif device_type == DeviceType.REMOTE: file_path = kwargs.get('file_path') @@ -224,13 +222,13 @@ def allocate(self, ) elif device_type == DeviceType.GDS: gds_cache_dir = kwargs.get('gds_cache_dir') - max_blocks_per_file = kwargs.get('max_blocks_per_file', -1) - + max_file_size_gb = kwargs.get('max_file_size_gb', -1) + allocator = GDSAllocator( layout=layout, dtype=dtype, gds_cache_dir=gds_cache_dir, - max_blocks_per_file=max_blocks_per_file + max_file_size_gb=max_file_size_gb ) storage_handle = allocator.get_accessible_handle() else: diff --git a/flexkv/transfer/transfer_engine.py b/flexkv/transfer/transfer_engine.py index 08676183d9..1b7959fce9 100644 --- a/flexkv/transfer/transfer_engine.py +++ b/flexkv/transfer/transfer_engine.py @@ -38,7 +38,7 @@ GDSTransferWorker, tpGDSTransferWorker, ) -from flexkv.common.config import CacheConfig, ModelConfig +from flexkv.common.config import CacheConfig, ModelConfig, GLOBAL_CONFIG_FROM_ENV from flexkv.common.ring_buffer import SharedOpPool @@ -90,7 +90,7 @@ def __init__(self, self._remote_handle = remote_handle self._cache_config = cache_config - self.pin_buffer = SharedOpPool(2048, self.model_config.max_req_tokens // self.cache_config.tokens_per_block) + self.pin_buffer = SharedOpPool(2048, self.cache_config.num_cpu_blocks) self.op_id_to_nvtx_range: Dict[int, str] = {} @@ -118,10 +118,10 
@@ def _init_workers(self) -> None: cpu_kv_layout=self._cpu_handle.kv_layout, dtype=self.gpu_handles[i].dtype, gpu_device_id=i, - use_ce_transfer_h2d=self.cache_config.use_ce_transfer_h2d, - use_ce_transfer_d2h=self.cache_config.use_ce_transfer_d2h, - transfer_sms_h2d=self.cache_config.transfer_sms_h2d, - transfer_sms_d2h=self.cache_config.transfer_sms_d2h, + use_ce_transfer_h2d=GLOBAL_CONFIG_FROM_ENV.use_ce_transfer_h2d, + use_ce_transfer_d2h=GLOBAL_CONFIG_FROM_ENV.use_ce_transfer_d2h, + transfer_sms_h2d=GLOBAL_CONFIG_FROM_ENV.transfer_sms_h2d, + transfer_sms_d2h=GLOBAL_CONFIG_FROM_ENV.transfer_sms_d2h, ) for i in range(self.dp_size) ] @@ -140,10 +140,10 @@ def _init_workers(self) -> None: dtype=self.gpu_handles[i].dtype, tp_group_size=self.tp_size, dp_group_id=i, - use_ce_transfer_h2d=self.cache_config.use_ce_transfer_h2d, - use_ce_transfer_d2h=self.cache_config.use_ce_transfer_d2h, - transfer_sms_h2d=self.cache_config.transfer_sms_h2d, - transfer_sms_d2h=self.cache_config.transfer_sms_d2h, + use_ce_transfer_h2d=GLOBAL_CONFIG_FROM_ENV.use_ce_transfer_h2d, + use_ce_transfer_d2h=GLOBAL_CONFIG_FROM_ENV.use_ce_transfer_d2h, + transfer_sms_h2d=GLOBAL_CONFIG_FROM_ENV.transfer_sms_h2d, + transfer_sms_d2h=GLOBAL_CONFIG_FROM_ENV.transfer_sms_d2h, ) for i in range(self.dp_size) ] diff --git a/flexkv/transfer/worker.py b/flexkv/transfer/worker.py index 254e0b4e28..f01d8631af 100644 --- a/flexkv/transfer/worker.py +++ b/flexkv/transfer/worker.py @@ -23,7 +23,7 @@ from flexkv.common.storage import KVCacheLayout, KVCacheLayoutType from flexkv.common.transfer import TransferOp, TransferType, PartitionBlockType from flexkv.common.transfer import get_nvtx_range_color -from flexkv.common.config import CacheConfig +from flexkv.common.config import CacheConfig, GLOBAL_CONFIG_FROM_ENV try: from flexkv.c_ext import transfer_kv_blocks_remote @@ -596,8 +596,8 @@ def __init__(self, self.ssd_layer_stride_in_bytes = ssd_kv_layout_per_file.get_layer_stride() * self.dtype.itemsize try: - 
self.ioctx = c_ext.SSDIOCTX(ssd_files, len(ssd_files), cache_config.ssd_cache_iouring_entries, - cache_config.ssd_cache_iouring_flags) + self.ioctx = c_ext.SSDIOCTX(ssd_files, len(ssd_files), GLOBAL_CONFIG_FROM_ENV.iouring_entries, + GLOBAL_CONFIG_FROM_ENV.iouring_flags) except Exception as e: flexkv_logger.error(f"Error setting ssd ioctx: {e}\n") raise RuntimeError("SSD Worker init failed") from e @@ -1084,10 +1084,10 @@ def __init__( self.gds_kv_stride_in_bytes = gds_kv_layout.get_kv_stride() * self.dtype.itemsize self.gds_block_stride_in_bytes = gds_kv_layout.get_block_stride() * self.dtype.itemsize - if not gpu_kv_layout.type == KVCacheLayoutType.LAYERWISE: - raise ValueError("Only layerwise layout is supported for GPU") - if not gds_kv_layout.type == KVCacheLayoutType.LAYERWISE: - raise ValueError("Only layerwise layout is supported for GDS") + if not gpu_kv_layout.type == KVCacheLayoutType.LAYERFIRST: + raise ValueError("Only LAYERFIRST layout is supported for GPU") + if not gds_kv_layout.type == KVCacheLayoutType.LAYERFIRST: + raise ValueError("Only LAYERFIRST layout is supported for GDS") # Create TP GDS Transfer Thread Group self.tp_gds_transfer_thread_group = TPGDSTransferThreadGroup( diff --git a/pyproject.toml b/pyproject.toml index db54deba99..01021be59a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,8 @@ ignore = [ "SIM108", # Logging statement uses f-string "G004", + # Use X | None for type annotations. 
+ "UP045", ] [tool.mypy] diff --git a/tests/replay_from_tracer.py b/tests/replay_from_tracer.py index c4c3440f43..75f8c6c8e7 100644 --- a/tests/replay_from_tracer.py +++ b/tests/replay_from_tracer.py @@ -20,11 +20,14 @@ import time from typing import Dict, List, Optional, Any, Tuple import torch +import zmq from flexkv.common.config import CacheConfig, ModelConfig from flexkv.common.storage import KVCacheLayout, KVCacheLayoutType from flexkv.common.memory_handle import TensorSharedHandle from flexkv.kvtask import KVTaskEngine +from flexkv.server.request import RegisterTPClientRequest +from flexkv.server.utils import get_zmq_socket class FlexKVReplayEngine: @@ -79,8 +82,32 @@ def parse_config_event(self, event: Dict[str, Any]): data = event['data'] model_config_data = data['model_config'] cache_config_data = data['cache_config'] + global_config_data = data.get('global_config', {}) gpu_layout_data = data.get('gpu_layout') + # Restore GLOBAL_CONFIG_FROM_ENV from trace + if global_config_data: + self.log("Restoring global config from trace...") + from flexkv.common.config import GLOBAL_CONFIG_FROM_ENV + + # Restore layout types + if 'cpu_layout_type' in global_config_data: + GLOBAL_CONFIG_FROM_ENV.cpu_layout_type = self._parse_layout_type(global_config_data['cpu_layout_type']) + if 'ssd_layout_type' in global_config_data: + GLOBAL_CONFIG_FROM_ENV.ssd_layout_type = self._parse_layout_type(global_config_data['ssd_layout_type']) + if 'remote_layout_type' in global_config_data: + GLOBAL_CONFIG_FROM_ENV.remote_layout_type = self._parse_layout_type(global_config_data['remote_layout_type']) + if 'gds_layout_type' in global_config_data: + GLOBAL_CONFIG_FROM_ENV.gds_layout_type = self._parse_layout_type(global_config_data['gds_layout_type']) + + # Restore other configs + for key in ['server_client_mode', 'index_accel', 'use_ce_transfer_h2d', 'use_ce_transfer_d2h', + 'transfer_sms_h2d', 'transfer_sms_d2h', 'iouring_entries', 'iouring_flags', + 'max_file_size_gb', 'evict_ratio', 
'server_recv_port']: + if key in global_config_data: + setattr(GLOBAL_CONFIG_FROM_ENV, key, global_config_data[key]) + self.log(f" Restored {key} = {global_config_data[key]}") + # Recreate model_config dtype_str = model_config_data['dtype'] if dtype_str == "torch.float16": @@ -108,24 +135,19 @@ def parse_config_event(self, event: Dict[str, Any]): enable_cpu=cache_config_data['enable_cpu'], enable_ssd=cache_config_data['enable_ssd'], enable_remote=cache_config_data['enable_remote'], - gpu_kv_layout_type=self._parse_layout_type(cache_config_data['gpu_kv_layout_type']), - cpu_kv_layout_type=self._parse_layout_type(cache_config_data['cpu_kv_layout_type']), - ssd_kv_layout_type=self._parse_layout_type(cache_config_data['ssd_kv_layout_type']), - remote_kv_layout_type=self._parse_layout_type(cache_config_data['remote_kv_layout_type']), enable_gds=cache_config_data['enable_gds'], - remote_cache_size_mode=cache_config_data['remote_cache_size_mode'], num_cpu_blocks=cache_config_data['num_cpu_blocks'], num_ssd_blocks=cache_config_data['num_ssd_blocks'], + num_gds_blocks=cache_config_data['num_gds_blocks'], num_remote_blocks=cache_config_data['num_remote_blocks'], + ssd_cache_dir=cache_config_data['ssd_cache_dir'], + gds_cache_dir=cache_config_data['gds_cache_dir'], + remote_cache_size_mode=cache_config_data['remote_cache_size_mode'], remote_file_size=cache_config_data['remote_file_size'], remote_file_num=cache_config_data['remote_file_num'], remote_file_prefix=cache_config_data['remote_file_prefix'], - ssd_cache_dir=cache_config_data['ssd_cache_dir'], - ssd_cache_iouring_entries=cache_config_data['ssd_cache_iouring_entries'], - ssd_cache_iouring_flags=cache_config_data['ssd_cache_iouring_flags'], remote_cache_path=cache_config_data['remote_cache_path'], remote_config_custom=cache_config_data['remote_config_custom'], - enable_trace=False, # Disable trace for replay ) # Recreate gpu_layout if available @@ -139,8 +161,7 @@ def parse_config_event(self, event: Dict[str, Any]): 
head_size=8,#gpu_layout_data['head_size'], #for local test is_mla=gpu_layout_data['is_mla'], ) - - self.gpu_blocks_num = self.gpu_layout.num_block + self.gpu_blocks_num = self.gpu_layout.num_block self.log(f"Model config: {self.model_config}") self.log(f"Cache config loaded {self.cache_config}") @@ -149,12 +170,12 @@ def parse_config_event(self, event: Dict[str, Any]): def _parse_layout_type(self, layout_type_str: str) -> KVCacheLayoutType: """Parse layout type string to enum""" - if "LAYERWISE" in layout_type_str: - return KVCacheLayoutType.LAYERWISE - elif "BLOCKWISE" in layout_type_str: - return KVCacheLayoutType.BLOCKWISE + if "LAYERFIRST" in layout_type_str: + return KVCacheLayoutType.LAYERFIRST + elif "BLOCKFIRST" in layout_type_str: + return KVCacheLayoutType.BLOCKFIRST else: - return KVCacheLayoutType.LAYERWISE # default + return KVCacheLayoutType.LAYERFIRST # default def create_gpu_blocks(self): """Create GPU blocks for testing (similar to test code)""" @@ -186,6 +207,46 @@ def create_gpu_blocks(self): self.log(f"Created GPU blocks for {total_gpus} GPUs with {self.gpu_blocks_num} blocks each") + def register_gpu_blocks_to_kvmanager(self, gpu_register_port: str): + """Register GPU blocks to KVManager via socket""" + self.log("Registering GPU blocks via socket...") + + total_gpus = self.model_config.tp_size * self.model_config.dp_size + + # Create zmq socket to send GPU blocks + context = zmq.Context(2) + send_socket = get_zmq_socket( + context, zmq.SocketType.PUSH, gpu_register_port, False + ) + + # Register each GPU's blocks + for gpu_id in range(total_gpus): + # Convert torch tensors to TensorSharedHandle + handles = [] + for layer_tensor in self.gpu_blocks[gpu_id]: + handle = TensorSharedHandle(layer_tensor, gpu_id) + handles.append(handle) + + # Create registration request + register_req = RegisterTPClientRequest( + dp_client_id=gpu_id // self.model_config.tp_size, # DP client ID + device_id=gpu_id, + handles=handles, + gpu_layout=self.gpu_layout + ) + 
+ # Send registration request + send_socket.send_pyobj(register_req) + self.log(f"Registered GPU {gpu_id} blocks") + + # Wait a bit to ensure all registration requests are sent + time.sleep(0.1) + + # Close socket + send_socket.close() + context.term() + self.log("GPU blocks registration completed") + def create_kvmanager(self,): """Create and initialize KVManager""" self.log("Creating KVManager...") @@ -193,7 +254,7 @@ def create_kvmanager(self,): if not self.gpu_layout: # Create default GPU layout if not provided in trace self.gpu_layout = KVCacheLayout( - type=KVCacheLayoutType.LAYERWISE, + type=KVCacheLayoutType.LAYERFIRST, num_layer=self.model_config.num_layers, num_block=self.gpu_blocks_num, # default number of blocks tokens_per_block=self.cache_config.tokens_per_block, @@ -202,30 +263,42 @@ def create_kvmanager(self,): is_mla=self.model_config.use_mla ) - # Create KVManager + # Create KVTaskEngine with gpu_register_port + import tempfile + gpu_register_port = f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}" + self.kvmanager = KVTaskEngine( model_config=self.model_config, cache_config=self.cache_config, - gpu_layout=self.gpu_layout, - gpu_blocks=self.gpu_blocks + gpu_register_port=gpu_register_port ) - # Start KVManager - if self.kvmanager.is_ready(): - self.kvmanager.start() - self.log("KVManager started successfully") - else: - raise RuntimeError("KVManager is not ready") + # Start KVManager first so it can listen for registration requests + self.kvmanager.start() + + # Register GPU blocks via socket after KVManager is started + self.register_gpu_blocks_to_kvmanager(gpu_register_port) + + # Wait for KVManager to be ready + max_wait_time = 30 # seconds + start_time = time.time() + while not self.kvmanager.is_ready(): + if time.time() - start_time > max_wait_time: + raise RuntimeError("KVManager failed to become ready within timeout") + time.sleep(0.1) + + self.log("KVManager started successfully") def replay_request_event(self, event: Dict[str, 
Any]) -> int: - """Replay a request event (GET or PUT)""" + """Replay a request event (GET, PUT, GET_MATCH, PUT_MATCH)""" data = event['data'] request_type = data['request_type'] - # Convert lists back to tensors - token_ids = torch.tensor(data['token_ids'], dtype=torch.long) - slot_mapping = torch.tensor(data['slot_mapping'], dtype=torch.long) - token_mask = torch.tensor(data['token_mask'], dtype=torch.bool) if data['token_mask'] else None + # Convert lists back to numpy arrays (KVTaskEngine uses numpy, not torch) + import numpy as np + token_ids = np.array(data['token_ids'], dtype=np.int64) + slot_mapping = np.array(data['slot_mapping'], dtype=np.int64) + token_mask = np.array(data['token_mask'], dtype=bool) if data['token_mask'] else None layer_granularity = data.get('layer_granularity', -1) dp_id = data.get('dp_id', 0) @@ -234,8 +307,9 @@ def replay_request_event(self, event: Dict[str, Any]) -> int: if request_type == "GET": print(f"🔍🔍🔍GET token_ids: {token_ids[:128]}") print(f"request_id: {data['request_id']}, request_type: {request_type}, " - f"input length: {len(token_ids)}, true in mask: {token_mask.sum()}") - task_id = self.kvmanager.get_async( + f"input length: {len(token_ids)}, true in mask: {token_mask.sum() if token_mask is not None else 'N/A'}") + # get_async return (task_id, return_mask) + task_id, return_mask = self.kvmanager.get_async( token_ids=token_ids, slot_mapping=slot_mapping, token_mask=token_mask, @@ -245,45 +319,108 @@ def replay_request_event(self, event: Dict[str, Any]) -> int: elif request_type == "PUT": print(f"✅✅✅PUT token_ids: {token_ids[:128]}") print(f"request_id: {data['request_id']}, request_type: {request_type}, " - f"input length: {len(token_ids)}, true in mask: {token_mask.sum()}") - task_id = self.kvmanager.put_async( + f"input length: {len(token_ids)}, true in mask: {token_mask.sum() if token_mask is not None else 'N/A'}") + # put_async return (task_id, return_mask) + task_id, return_mask = self.kvmanager.put_async( 
token_ids=token_ids, slot_mapping=slot_mapping, token_mask=token_mask, dp_id=dp_id ) + elif request_type == "GET_MATCH": + print(f"🔍📝GET_MATCH token_ids: {token_ids[:128]}") + print(f"request_id: {data['request_id']}, request_type: {request_type}, " + f"input length: {len(token_ids)}, true in mask: {token_mask.sum() if token_mask is not None else 'N/A'}") + # get_match return (task_id, return_mask) + task_id, return_mask = self.kvmanager.get_match( + token_ids=token_ids, + token_mask=token_mask, + layer_granularity=layer_granularity, + dp_id=dp_id + ) + elif request_type == "PUT_MATCH": + print(f"✅📝PUT_MATCH token_ids: {token_ids[:128]}") + print(f"request_id: {data['request_id']}, request_type: {request_type}, " + f"input length: {len(token_ids)}, true in mask: {token_mask.sum() if token_mask is not None else 'N/A'}") + # put_match return (task_id, return_mask) + task_id, return_mask = self.kvmanager.put_match( + token_ids=token_ids, + token_mask=token_mask, + dp_id=dp_id + ) else: raise ValueError(f"Unknown request type: {request_type}") return task_id + def replay_launch_tasks_event(self, event: Dict[str, Any]): + """Replay a launch_tasks event""" + data = event['data'] + task_ids = data['task_ids'] + slot_mappings_list = data['slot_mappings'] + + self.log(f"🚀🚀🚀Replaying launch_tasks for task_ids: {task_ids}") + + try: + # Convert lists back to numpy arrays + import numpy as np + slot_mappings = [np.array(sm, dtype=np.int64) for sm in slot_mappings_list] + + print(f"Launching {len(task_ids)} tasks with slot_mappings") + + # Call launch_tasks + self.kvmanager.launch_tasks(task_ids, slot_mappings) + + self.log(f"launch_tasks completed successfully for {len(task_ids)} tasks") + + except Exception as e: + self.log(f"Warning: launch_tasks operation failed: {e}") + import traceback + traceback.print_exc() + def replay_wait_event(self, event: Dict[str, Any]): """Replay a wait event""" data = event['data'] wait_type = data['wait_type'] task_ids = data['task_ids'] + 
timeout = data.get('timeout', 20.0) # default timeout + completely = data.get('completely', False) # default completely layer_group_id = data.get('layer_group_id') - self.log(f"⏰⏰⏰Replaying {wait_type} for task_ids: {task_ids}") + self.log(f"⏰⏰⏰Replaying {wait_type} for task_ids: {task_ids}, timeout: {timeout}, completely: {completely}") try: + # wait and try_wait return Dict[int, KVResponse] if wait_type == "wait": - result = self.kvmanager.wait(task_ids) - elif wait_type == "wait_for_graph_finished": - result = self.kvmanager.wait_for_graph_finished(task_ids) + result = self.kvmanager.wait(task_ids, timeout=timeout, completely=completely) elif wait_type == "try_wait": result = self.kvmanager.try_wait(task_ids) else: raise ValueError(f"Unknown wait type: {wait_type}") + + # process result: result is Dict[int, KVResponse] successed_elements = [] + statuses = [] for task_id in task_ids: - successed_elements.append(result[task_id].sum().item()) - print(f"wait result: task ids: {task_ids}, successed elements num: {successed_elements}") + if task_id in result: + # return_mask in KVResponse may be None + if result[task_id].return_mask is not None: + successed_elements.append(result[task_id].return_mask.sum()) + else: + successed_elements.append(0) + statuses.append(result[task_id].status.name if hasattr(result[task_id], 'status') else "SUCCESS") + else: + successed_elements.append(0) + statuses.append("NOT_FOUND") + + print(f"✅ {wait_type} result: task_ids={task_ids}, successed_elements={successed_elements}, statuses={statuses}") self.log(f"Wait completed successfully for {wait_type}") return result except Exception as e: self.log(f"Warning: Wait operation failed: {e}") + import traceback + traceback.print_exc() return None def replay_all_events(self): @@ -293,8 +430,10 @@ def replay_all_events(self): config_events = [e for e in self.events if e['event_type'] == 'config'] request_events = [e for e in self.events if e['event_type'] == 'request'] wait_events = [e for e in 
self.events if e['event_type'] == 'wait'] + launch_tasks_events = [e for e in self.events if e['event_type'] == 'launch_tasks'] - self.log(f"Found {len(config_events)} config, {len(request_events)} request, {len(wait_events)} wait events") + self.log(f"Found {len(config_events)} config, {len(request_events)} request, " + f"{len(wait_events)} wait, {len(launch_tasks_events)} launch_tasks events") # Parse configuration first if config_events: @@ -307,7 +446,7 @@ def replay_all_events(self): self.create_gpu_blocks() self.create_kvmanager() # Replay all non-config events in timestamp order - other_events = request_events + wait_events + other_events = request_events + wait_events + launch_tasks_events other_events.sort(key=lambda e: e['timestamp']) request_id_mapping = {} # Map original request_id to replayed task_id @@ -320,6 +459,22 @@ def replay_all_events(self): request_id_mapping[original_request_id] = replayed_task_id self.log(f"Mapped original request_id {original_request_id} to task_id {replayed_task_id}") + elif event_type == 'launch_tasks': + # Map original task_ids to replayed task_ids + original_task_ids = event['data']['task_ids'] + mapped_task_ids = [] + for orig_id in original_task_ids: + if orig_id in request_id_mapping: + mapped_task_ids.append(request_id_mapping[orig_id]) + else: + self.log(f"Warning: Cannot find mapping for task_id {orig_id}") + mapped_task_ids.append(orig_id) # Use original if not found + + # Update event data with mapped task_ids + event['data']['task_ids'] = mapped_task_ids + self.replay_launch_tasks_event(event) + print("launch_tasks done") + elif event_type == 'wait': # Map original task_ids to replayed task_ids original_task_ids = event['data']['task_ids'] diff --git a/tests/test_kvmanager.py b/tests/test_kvmanager.py index 473cf0741c..b1364c8932 100644 --- a/tests/test_kvmanager.py +++ b/tests/test_kvmanager.py @@ -7,7 +7,7 @@ import multiprocessing as mp from multiprocessing import Process, Pipe -from flexkv.common.config 
import ModelConfig, CacheConfig +from flexkv.common.config import ModelConfig, CacheConfig, GLOBAL_CONFIG_FROM_ENV from flexkv.common.storage import KVCacheLayout, KVCacheLayoutType from flexkv.common.request import KVResponseStatus from flexkv.kvtask import KVTaskEngine @@ -24,12 +24,12 @@ create_gpu_kv_layout, GPUKVCacheVerifier ) -def run_tp_client(dp_client_id, - tp_rank, - server_recv_port, - model_config, - cache_config, - num_gpu_blocks, +def run_tp_client(dp_client_id, + tp_rank, + server_recv_port, + model_config, + cache_config, + num_gpu_blocks, child_conn, gpu_layout_type): """Run tp_client process""" @@ -96,28 +96,21 @@ def shutdown_tp_client(tp_client_processes): {'tp_size': 4, 'dp_size': 1, 'use_mla': True}, ], indirect=True) @pytest.mark.parametrize("cache_config", [ - {'enable_cpu': True, 'enable_ssd': False, 'enable_remote': False, 'num_cpu_blocks': 1024}, - {'enable_cpu': True, 'enable_ssd': True, 'enable_remote': False,}, - {'enable_cpu': True, 'enable_ssd': True, 'enable_remote': False, 'ssd_cache_iouring_entries': 512}, - {'enable_cpu': True, 'enable_ssd': True, 'enable_remote': True, 'num_ssd_blocks': 256, 'num_remote_blocks': 512}, - {'enable_cpu': True, 'enable_ssd': True, 'enable_remote': True, - 'num_ssd_blocks': 256, 'num_remote_blocks': 512, 'ssd_cache_iouring_entries': 512}, + {'enable_cpu': True, 'enable_ssd': False, 'num_cpu_blocks': 1024}, + {'enable_cpu': True, 'enable_ssd': True, 'num_cpu_blocks': 1024, 'num_ssd_blocks': 2048}, # GDS test configs - {'enable_cpu': True, 'enable_gds': True, 'enable_ssd': False, 'enable_remote': False, 'num_gds_blocks': 512, 'gds_cache_dir': ["./gdstest"]}, + {'enable_cpu': True, 'enable_gds': True, 'enable_ssd': False, \ + 'num_gds_blocks': 512, 'gds_cache_dir': ["./gdstest"]}, ], indirect=True) @pytest.mark.parametrize("test_config", [ {'num_gpu_blocks': 512, 'requests_per_block': 16, 'initial_write_ratio': 0.4}, ], indirect=True) -@pytest.mark.parametrize("flex_kv_layout_type", [ - 
KVCacheLayoutType.LAYERWISE, - KVCacheLayoutType.BLOCKWISE, -]) @pytest.mark.parametrize("gpu_layout_type", [ 0, 1, 2, ]) -def test_kvmanager(model_config, cache_config, test_config, flex_kv_layout_type, gpu_layout_type): +def test_kvmanager(model_config, cache_config, test_config, gpu_layout_type): tp_size = model_config.tp_size dp_size = model_config.dp_size @@ -131,11 +124,6 @@ def test_kvmanager(model_config, cache_config, test_config, flex_kv_layout_type, enable_remote = cache_config.enable_remote enable_gds = cache_config.enable_gds - cache_config.cpu_kv_layout_type = flex_kv_layout_type - cache_config.ssd_kv_layout_type = flex_kv_layout_type - cache_config.remote_kv_layout_type = flex_kv_layout_type - cache_config.gds_kv_layout_type = flex_kv_layout_type - num_gpu_blocks = test_config["num_gpu_blocks"] block_per_request = test_config['requests_per_block'] initial_write_ratio = test_config['initial_write_ratio'] @@ -155,10 +143,7 @@ def test_kvmanager(model_config, cache_config, test_config, flex_kv_layout_type, #note that for now only dp_size=1 is supported pytest.skip("skip because server-client mode is not ready for dp_size > 1") - import uuid - gpu_register_port = f"ipc:///tmp/flexkv_gpu_{uuid.uuid4().hex[:8]}" - server_recv_port = f"ipc:///tmp/flexkv_srv_{uuid.uuid4().hex[:8]}" - kvmanager = KVManager(model_config, cache_config, gpu_register_port, server_recv_port) + kvmanager = KVManager(model_config, cache_config) kvmanager.start() # Create pipes for each tp_client to send GPU blocks back @@ -172,7 +157,8 @@ def test_kvmanager(model_config, cache_config, test_config, flex_kv_layout_type, tp_client_process = mp_ctx.Process( target=run_tp_client, - args=(0, tp_rank, gpu_register_port, model_config, cache_config, num_gpu_blocks + tp_rank, child_conn, gpu_layout_type), + args=(0, tp_rank, kvmanager.gpu_register_port, model_config, cache_config, \ + num_gpu_blocks + tp_rank, child_conn, gpu_layout_type), daemon=True ) 
tp_client_processes.append(tp_client_process) diff --git a/tests/test_utils.py b/tests/test_utils.py index 0297c0e5cb..b682d46ff1 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -31,27 +31,8 @@ 'enable_remote': False, 'num_cpu_blocks': 128, 'num_ssd_blocks': 512, - 'num_remote_blocks': 512, # Aligned with ssd_blocks - 'remote_cache_size_mode': "block_num", - 'remote_file_size': (1024*1024*1024), - 'remote_file_num': 16, - 'remote_file_prefix': "remote_cache", 'enable_gds': False, - 'enable_trace': False, 'ssd_cache_dir': ["./ssd_cache", "./ssd_cache2/"], - 'ssd_cache_iouring_entries': 512, - 'ssd_cache_iouring_flags': 1, - 'remote_cache_path': ["remote_cache1", "remote_cache2"], - 'remote_config_custom': { - "pcfs_fsid": "f_l91fz6", - "pcfs_port": 31, - "pcfs_ip": "172.21.16.177", - "pcfs_parent_nodeid": 144115188075855883 # Using transfer engine value for consistency - }, - 'use_ce_transfer_h2d': False, - 'use_ce_transfer_d2h': False, - 'transfer_sms_h2d': 8, - 'transfer_sms_d2h': 8, } DEFAULT_TEST_CONFIG = { @@ -121,9 +102,9 @@ def create_gpu_kv_layout(model_config, cache_config, num_gpu_blocks, gpu_layout_ tokens_per_block = cache_config.tokens_per_block if gpu_layout_type == 0 or gpu_layout_type == 2: - layout_type = KVCacheLayoutType.LAYERWISE + layout_type = KVCacheLayoutType.LAYERFIRST elif gpu_layout_type == 1: - layout_type = KVCacheLayoutType.BLOCKWISE + layout_type = KVCacheLayoutType.BLOCKFIRST else: raise ValueError(f"Invalid GPU layout type: {gpu_layout_type}") tpgroup_gpu_kv_layout = KVCacheLayout( @@ -151,7 +132,7 @@ def generate_gpu_blocks_with_ground_truth(model_config, cache_config, test_confi num_gpu_blocks = test_config["num_gpu_blocks"] tpgroup_gpu_kv_layout = KVCacheLayout( - type=KVCacheLayoutType.LAYERWISE, + type=KVCacheLayoutType.LAYERFIRST, num_layer=num_layers, num_block=num_gpu_blocks, tokens_per_block=tokens_per_block, @@ -242,220 +223,6 @@ def skip_if_no_cuda(): if torch.cuda.device_count() == 0: pytest.skip("No CUDA 
devices available") -# Server-Client mode support functions -class KVManagerServerClient: - """Server-Client wrapper for KVManager that manages server, tp_client, and dp_client processes""" - - def __init__(self, model_config, cache_config, gpu_kv_layout, gpu_blocks): - import tempfile - from flexkv.server.client import KVDPClient, KVTPClient - from flexkv.server.server import KVServer - - self.model_config = model_config - self.cache_config = cache_config - self.gpu_kv_layout = gpu_kv_layout - self.gpu_blocks = gpu_blocks - - # Create temporary IPC port for communication - self.server_recv_port = f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}" - - # Extract basic config parameters for server process - server_config = { - 'num_layers': model_config.num_layers, - 'num_kv_heads': model_config.num_kv_heads, - 'head_size': model_config.head_size, - 'use_mla': model_config.use_mla, - 'tp_size': model_config.tp_size, - 'dp_size': model_config.dp_size, - 'dtype': str(model_config.dtype), - 'tokens_per_block': cache_config.tokens_per_block, - 'enable_cpu': cache_config.enable_cpu, - 'enable_ssd': cache_config.enable_ssd, - 'enable_remote': cache_config.enable_remote, - 'num_cpu_blocks': cache_config.num_cpu_blocks, - 'num_ssd_blocks': cache_config.num_ssd_blocks, - 'ssd_cache_dir': cache_config.ssd_cache_dir if hasattr(cache_config, 'ssd_cache_dir') else ["./ssd_cache"], - } - - # Start server process - self.server_process = Process( - target=self._run_server, - args=(self.server_recv_port, server_config), - daemon=False - ) - self.server_process.start() - - # Wait for server to start - time.sleep(5) - - # Initialize dp_client - self.dp_client = KVDPClient(self.server_recv_port, model_config) - print("dp_client started") - - # Start tp_client processes - self.tp_client_processes = [] - for tp_rank in range(model_config.tp_size): - device_id = tp_rank + self.dp_client.dp_client_id * model_config.tp_size - # Extract only the necessary basic types for tp_client - 
tp_client_process = Process( - target=KVManagerServerClient._run_tp_client, - args=(self.dp_client.dp_client_id, tp_rank, device_id, self.server_recv_port, - model_config.num_layers, str(model_config.dtype), - list(gpu_kv_layout.kv_shape[1:]), model_config.use_mla), - daemon=True - ) - tp_client_process.start() - self.tp_client_processes.append(tp_client_process) - - # Wait for tp clients to register - time.sleep(5) - - self._server_client_mode = True - - def _run_server(self, server_recv_port, server_config): - """Run server process""" - from flexkv.server.server import KVServer - from flexkv.common.config import ModelConfig, CacheConfig - - # Recreate config objects from basic parameters - model_config = ModelConfig( - num_layers=server_config['num_layers'], - num_kv_heads=server_config['num_kv_heads'], - head_size=server_config['head_size'], - use_mla=server_config['use_mla'], - tp_size=server_config['tp_size'], - dp_size=server_config['dp_size'], - dtype=torch.float16 if server_config['dtype'] == 'torch.float16' else torch.float32 - ) - - cache_config = CacheConfig( - tokens_per_block=server_config['tokens_per_block'], - enable_cpu=server_config['enable_cpu'], - enable_ssd=server_config['enable_ssd'], - enable_remote=server_config['enable_remote'], - num_cpu_blocks=server_config['num_cpu_blocks'], - num_ssd_blocks=server_config['num_ssd_blocks'], - ssd_cache_dir=server_config['ssd_cache_dir'] - ) - print("starting server ... 
...") - kvserver = KVServer(model_config, cache_config, server_recv_port) - kvserver.run() - print("server started") - - @staticmethod - def _run_tp_client(dp_client_id, tp_rank, device_id, server_recv_port, num_layers, dtype_str, kv_shape, is_mla): - """Run tp_client process""" - from flexkv.server.client import KVTPClient - from flexkv.common.storage import KVCacheLayout, KVCacheLayoutType - - tp_client = KVTPClient(server_recv_port, dp_client_id, device_id) - # Convert dtype string back to torch dtype - if dtype_str == "torch.float16": - dtype = torch.float16 - elif dtype_str == "torch.float32": - dtype = torch.float32 - else: - dtype = torch.float16 # default - - # Create GPU blocks for this tp_rank in the tp_client process - gpu_blocks_for_tp = [] - for layer_id in range(num_layers): - gpu_blocks_for_tp.append( - torch.rand(size=tuple(kv_shape), dtype=dtype).cuda(device_id) - ) - - # Create a simple layout for registration - gpu_kv_layout = KVCacheLayout( - type=KVCacheLayoutType.LAYERWISE, - num_layer=num_layers, - num_block=kv_shape[1], # Assuming this is the block dimension - tokens_per_block=kv_shape[2], # Assuming this is the tokens_per_block dimension - num_head=kv_shape[3], # Assuming this is the num_head dimension - head_size=kv_shape[4], # Assuming this is the head_size dimension - is_mla=is_mla - ) - print("registering to server ... 
...") - tp_client.register_to_server(gpu_blocks_for_tp, gpu_kv_layout) - print("registered to server") - # Keep the process running - while True: - time.sleep(1) - - def is_ready(self): - """Check if the server-client system is ready""" - return self.server_process.is_alive() and all(p.is_alive() for p in self.tp_client_processes) - - def start(self): - """Start the server-client system (already started in __init__)""" - return True - - def put_async(self, token_ids, slot_mapping, dp_id): - """Put data to the server-client system""" - return self.dp_client.put_async(token_ids, slot_mapping, token_mask=None) - - def get_async(self, token_ids, slot_mapping, layer_granularity, dp_id): - """Get data from the server-client system""" - return self.dp_client.get_async(token_ids, slot_mapping, token_mask=None) - - def wait_for_graph_finished(self, request): - """Wait for graph to finish""" - masks = self.dp_client.wait(request) - time.sleep(0.2) - return masks - - def wait(self, request_ids): - """Wait for requests to complete""" - masks = self.dp_client.wait(request_ids) - return masks - - def shutdown(self): - """Shutdown all processes""" - print("Shutting down KVManagerServerClient...") - - # First, try to gracefully shutdown the server by sending a shutdown signal - try: - # Send a shutdown request to the server - self.dp_client.shutdown() - print("Sent shutdown request to server") - - # Wait a bit for graceful shutdown - time.sleep(3) - except Exception as e: - print(f"Error sending shutdown request: {e}") - - # Terminate tp_client processes - print("Terminating tp_client processes...") - for tp_process in self.tp_client_processes: - if tp_process.is_alive(): - tp_process.terminate() - tp_process.join(timeout=5) - if tp_process.is_alive(): - print(f"Force killing tp_client process {tp_process.pid}") - tp_process.kill() - tp_process.join(timeout=2) - - # Terminate server process - print("Terminating server process...") - if self.server_process.is_alive(): - 
self.server_process.terminate() - self.server_process.join(timeout=10) - if self.server_process.is_alive(): - print(f"Force killing server process {self.server_process.pid}") - self.server_process.kill() - self.server_process.join(timeout=5) - - # Clean up temporary file - import os - if hasattr(self, 'server_recv_port') and self.server_recv_port.startswith('ipc://'): - temp_file = self.server_recv_port[6:] # Remove 'ipc://' prefix - try: - if os.path.exists(temp_file): - os.unlink(temp_file) - print(f"Cleaned up temporary file: {temp_file}") - except Exception as e: - print(f"Error cleaning up temporary file: {e}") - - print("KVManagerServerClient shutdown complete") class GPUKVCacheVerifier: def __init__(self, @@ -537,10 +304,12 @@ def fill_gpu_blocks(self, token_ids, block_ids): actual_head_id) # GPU tensor dim:[kv_dim, num_block, tokens_per_block, num_head, head_size] if self.gpu_layout_type == 0: - # gpu_layout_type 0: [num_layer][kv_dim, num_block, tokens_per_block, num_head, head_size] + # gpu_layout_type 0: + # [num_layer][kv_dim, num_block, tokens_per_block, num_head, head_size] gpu_tensor[kv_id, block_id, :, head_id, :] = hash_value elif self.gpu_layout_type == 1: - # gpu_layout_type 1: [tp_id][0][num_block, num_layer, kv_dim, tokens_per_block, num_head, head_size] + # gpu_layout_type 1: + # [tp_id][0][num_block, num_layer, kv_dim, tokens_per_block, num_head, head_size] # Need to get the first (and only) tensor from the list gpu_tensor[block_id, layer_id, kv_id, :, head_id, :] = hash_value elif self.gpu_layout_type == 2: @@ -580,10 +349,12 @@ def verify_kv_blocks(self, token_ids, block_ids)->bool: token_ids[start_token_idx:end_token_idx], actual_head_id) if self.gpu_layout_type == 0: - # gpu_layout_type 0: [num_layer][kv_dim, num_block, tokens_per_block, num_head, head_size] + # gpu_layout_type 0: + # [num_layer][kv_dim, num_block, tokens_per_block, num_head, head_size] actual_values = gpu_tensor[kv_id, block_id, :, head_id, :] elif self.gpu_layout_type 
== 1: - # gpu_layout_type 1: [tp_id][0][num_block, num_layer, kv_dim, tokens_per_block, num_head, head_size] + # gpu_layout_type 1: + # [tp_id][0][num_block, num_layer, kv_dim, tokens_per_block, num_head, head_size] # Need to get the first (and only) tensor from the list actual_values = gpu_tensor[block_id, layer_id, kv_id, :, head_id, :] elif self.gpu_layout_type == 2: @@ -620,7 +391,7 @@ def gpu_blocks_worker_process(conn, model_config, cache_config, gpu_kv_layout): # Create GPU blocks in subprocess gpu_blocks = [] for layer_id in range(model_config.num_layers): - # LAYERWISE format: [kv_dim, num_block, tokens_per_block, num_head, head_size] + # LAYERFIRST format: [kv_dim, num_block, tokens_per_block, num_head, head_size] kv_dim = 2 if not model_config.use_mla else 1 gpu_tensor = torch.zeros( kv_dim, @@ -678,7 +449,7 @@ def example_usage_gpu_kv_cache_verifier(): # Create GPU KV layout gpu_kv_layout = KVCacheLayout( - type=KVCacheLayoutType.LAYERWISE, + type=KVCacheLayoutType.LAYERFIRST, num_layer=model_config.num_layers, num_block=64, # Assume 64 blocks tokens_per_block=cache_config.tokens_per_block, @@ -690,7 +461,7 @@ def example_usage_gpu_kv_cache_verifier(): # Create mock GPU blocks gpu_blocks = [] for layer_id in range(model_config.num_layers): - # LAYERWISE format: [kv_dim, num_block, tokens_per_block, num_head, head_size] + # LAYERFIRST format: [kv_dim, num_block, tokens_per_block, num_head, head_size] kv_dim = 2 if not model_config.use_mla else 1 gpu_tensor = torch.zeros( kv_dim,