diff --git a/benchmarks/benchmark_cache_engine.py b/benchmarks/benchmark_cache_engine.py index a75a2008b7..d0072c402f 100644 --- a/benchmarks/benchmark_cache_engine.py +++ b/benchmarks/benchmark_cache_engine.py @@ -31,7 +31,7 @@ def main(args): num_put_requests = 0 request_id = 0 for req in reqs: - fake_slot_mapping = torch.arange(req.token_mask[req.token_mask].sum(), dtype=torch.int64) + fake_slot_mapping = torch.arange(req.token_mask[req.token_mask].sum(), dtype=torch.int64).numpy() local_vars = { 'cache_engine': cache_engine, 'req': req, @@ -41,23 +41,25 @@ def main(args): if req.request_type == "get": num_get_requests += 1 if not args.only_put: - profiler.runctx('graph, return_mask, transfer_call_back, finished_ops_ids = ' + profiler.runctx('graph, return_mask, transfer_call_back, op_callback_dict, finished_ops_ids = ' 'cache_engine.get(request_id, req.token_ids, req.token_mask, ' 'fake_slot_mapping, -1, -1)', globals(), local_vars) else: - graph, return_mask, transfer_call_back, finished_ops_ids = \ + graph, return_mask, transfer_call_back, op_callback_dict, finished_ops_ids = \ cache_engine.get(request_id, req.token_ids, req.token_mask, fake_slot_mapping, -1, -1) local_vars.update({ 'graph': graph, 'return_mask': return_mask, 'transfer_call_back': transfer_call_back, + 'op_callback_dict': op_callback_dict, 'finished_ops_ids': finished_ops_ids }) profiler.runctx('transfer_call_back()', globals(), local_vars) return_mask = local_vars['return_mask'] + op_callback_dict = local_vars['op_callback_dict'] cache_hit_ratio = return_mask.sum() / req.token_mask.sum() cache_hit_ratio_list.append(cache_hit_ratio) flexkv_logger.info(f"need get {req.token_mask.sum()} tokens, " @@ -66,16 +68,17 @@ def main(args): elif req.request_type == "put": num_put_requests += 1 if not args.only_get: - profiler.runctx('graph, return_mask, transfer_call_back, finished_ops_ids = ' + profiler.runctx('graph, return_mask, transfer_call_back, op_callback_dict, finished_ops_ids = ' 
'cache_engine.put(request_id, req.token_ids, req.token_mask, fake_slot_mapping)', globals(), local_vars) else: - graph, return_mask, transfer_call_back, finished_ops_ids = \ + graph, return_mask, transfer_call_back, op_callback_dict, finished_ops_ids = \ cache_engine.put(request_id, req.token_ids, req.token_mask, fake_slot_mapping) local_vars.update({ 'graph': graph, 'return_mask': return_mask, 'transfer_call_back': transfer_call_back, + 'op_callback_dict': op_callback_dict, 'finished_ops_ids': finished_ops_ids }) @@ -105,7 +108,7 @@ def parse_args(): parser = ArgumentParser() parser.add_argument("--config", type=str, - default="./benchmarks/example_config.json") + default="./benchmarks/example_config.yml") parser.add_argument("--only-get", action="store_true") parser.add_argument("--only-put", action="store_true") parser.add_argument("--num-users", type=int, default=20) diff --git a/benchmarks/benchmark_single_batch.py b/benchmarks/benchmark_single_batch.py index 397030becd..91b7c3b948 100644 --- a/benchmarks/benchmark_single_batch.py +++ b/benchmarks/benchmark_single_batch.py @@ -7,7 +7,7 @@ import torch from flexkv.server.client import KVTPClient -from flexkv.common.storage import KVCacheLayout +from flexkv.common.storage import KVCacheLayout, KVCacheLayoutType from flexkv.common.debug import flexkv_logger from flexkv.common.config import ModelConfig, CacheConfig from utils import load_config @@ -33,7 +33,7 @@ def run_tp_client(dp_client_id, tp_rank, server_recv_port, model_config, cache_c num_gpu_blocks = cache_config.num_gpu_blocks gpu_kv_layout = KVCacheLayout( - type=cache_config.gpu_kv_layout_type, + type=KVCacheLayoutType.LAYERFIRST, num_layer=model_config.num_layers, num_block=num_gpu_blocks, tokens_per_block=cache_config.tokens_per_block, @@ -66,13 +66,12 @@ def shutdown_tp_client(tp_client_processes): def benchmark_flexkv(model_config: ModelConfig, cache_config: CacheConfig, benchmark_config: BenchmarkConfig, - gpu_register_port: str, - 
server_recv_port: str): + ): if model_config.tp_size * model_config.dp_size > torch.cuda.device_count(): raise ValueError(f"tp_size {model_config.tp_size} * dp_size {model_config.dp_size} is greater than " f"the number of available GPUs {torch.cuda.device_count()}") print(f"{benchmark_config = }") - kvmanager = KVManager(model_config, cache_config, gpu_register_port, server_recv_port) + kvmanager = KVManager(model_config, cache_config) kvmanager.start() tp_client_processes = [] @@ -85,7 +84,7 @@ def benchmark_flexkv(model_config: ModelConfig, for tp_rank in range(model_config.tp_size): tp_client_process = Process( target=run_tp_client, - args=(0, tp_rank, gpu_register_port, + args=(0, tp_rank, kvmanager.gpu_register_port, model_config, cache_config), daemon=True ) @@ -161,7 +160,7 @@ def benchmark_flexkv(model_config: ModelConfig, def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument("--config", type=str, default="benchmarks/example_config.json") + parser.add_argument("--config", type=str, default="benchmarks/example_config.yml") # benchmark config parser.add_argument("--num-layers", type=int, default=-1) parser.add_argument("--batch-size", type=int, default=1) @@ -184,8 +183,5 @@ def parse_args(): # pad sequence length to divisible by tokens_per_block benchmark_config.sequence_length = \ ((benchmark_config.sequence_length - 1) // cache_config.tokens_per_block + 1) * cache_config.tokens_per_block - import uuid - gpu_register_port = f"ipc:///tmp/flexkv_gpu_{uuid.uuid4().hex[:8]}" - server_recv_port = f"ipc:///tmp/flexkv_srv_{uuid.uuid4().hex[:8]}" - benchmark_flexkv(model_config, cache_config, benchmark_config, gpu_register_port, server_recv_port) + benchmark_flexkv(model_config, cache_config, benchmark_config) diff --git a/benchmarks/benchmark_workers.py b/benchmarks/benchmark_workers.py index f1694ccd52..009d6cec69 100644 --- a/benchmarks/benchmark_workers.py +++ b/benchmarks/benchmark_workers.py @@ -13,9 +13,9 @@ from flexkv.transfer.worker 
import GPUCPUTransferWorker, CPUSSDDiskTransferWorker, WorkerHandle, tpGPUCPUTransferWorker from flexkv.storage.allocator import CPUAllocator, GPUAllocator, SSDAllocator from flexkv.common.storage import KVCacheLayoutType, KVCacheLayout -from flexkv.common.config import ModelConfig, CacheConfig +from flexkv.common.config import ModelConfig, CacheConfig, GLOBAL_CONFIG_FROM_ENV from flexkv.common.debug import flexkv_logger - +from utils import load_config # flexkv_logger.set_level("OFF") @@ -32,30 +32,32 @@ class BenchmarkConfig: def make_configs(args: dict) -> Tuple[ModelConfig, CacheConfig, BenchmarkConfig]: config_file = args.config try: - with open(config_file) as f: - config = json.load(f) - model_config = ModelConfig(**config["ModelConfig"]) - model_config.dtype = eval(f"torch.{model_config.dtype}") - cache_config = CacheConfig(**config["CacheConfig"]) - cache_config.num_gpu_blocks = args.num_blocks - bench_config = BenchmarkConfig() - bench_config.transfer_type = TransferType(args.transfer_type) - bench_config.num_layers_to_transfer = args.num_layers - bench_config.num_blocks_to_transfer = args.num_blocks - bench_config.shuffle_ids = args.shuffle_ids - bench_config.warmup_round = args.warmup_round - bench_config.benchmark_round = args.benchmark_round - bench_config.bidirectional = args.bi - return model_config, cache_config, bench_config + model_config, cache_config = load_config(config_file) + if args.transfer_type == "H2D" or args.transfer_type == "D2H": + cache_config.enable_ssd = False + elif args.transfer_type == "H2DISK" or args.transfer_type == "DISK2H": + assert cache_config.enable_ssd, "SSD cache must be enabled for DISK2H or H2DISK benchmark" + bench_config = BenchmarkConfig( + transfer_type=TransferType(args.transfer_type), + num_layers_to_transfer=args.num_layers, + num_blocks_to_transfer=args.num_blocks, + shuffle_ids=args.shuffle_ids, + warmup_round=args.warmup_round, + benchmark_round=args.benchmark_round, + bidirectional=args.bi + ) + 
cache_config.num_ssd_blocks = max(cache_config.num_ssd_blocks, bench_config.num_blocks_to_transfer) + return model_config, cache_config, bench_config except Exception as e: raise ValueError(f"Failed to load config file {config_file}: {e}") from None def create_cpu_gpu_worker( model_config: ModelConfig, - cache_config: CacheConfig) -> Tuple[WorkerHandle, mp.Queue]: + cache_config: CacheConfig, + num_gpu_blocks: int) -> Tuple[WorkerHandle, mp.Queue]: mp.set_start_method('spawn', force=True) cpu_layout = KVCacheLayout( - type=KVCacheLayoutType(cache_config.cpu_kv_layout_type), + type=GLOBAL_CONFIG_FROM_ENV.cpu_layout_type, num_layer=model_config.num_layers, num_block=cache_config.num_cpu_blocks, tokens_per_block=cache_config.tokens_per_block, @@ -63,9 +65,9 @@ def create_cpu_gpu_worker( head_size=model_config.head_size, ) gpu_layout = KVCacheLayout( - type=KVCacheLayoutType.LAYERWISE, + type=KVCacheLayoutType.LAYERFIRST, num_layer=model_config.num_layers, - num_block=cache_config.num_gpu_blocks, + num_block=num_gpu_blocks, tokens_per_block=cache_config.tokens_per_block, num_head=model_config.num_kv_heads, head_size=model_config.head_size, @@ -132,7 +134,7 @@ def create_cpu_ssd_worker( cache_config: CacheConfig) -> Tuple[WorkerHandle, mp.Queue]: mp.set_start_method('spawn', force=True) cpu_layout = KVCacheLayout( - type=KVCacheLayoutType(cache_config.cpu_kv_layout_type), + type=GLOBAL_CONFIG_FROM_ENV.cpu_layout_type, num_layer=model_config.num_layers, num_block=cache_config.num_cpu_blocks, tokens_per_block=cache_config.tokens_per_block, @@ -140,7 +142,7 @@ def create_cpu_ssd_worker( head_size=model_config.head_size, ) ssd_layout = KVCacheLayout( - type=KVCacheLayoutType(cache_config.ssd_kv_layout_type), + type=GLOBAL_CONFIG_FROM_ENV.ssd_layout_type, num_layer=model_config.num_layers, num_block=cache_config.num_ssd_blocks, tokens_per_block=cache_config.tokens_per_block, @@ -157,6 +159,7 @@ def create_cpu_ssd_worker( dtype=model_config.dtype, 
num_chunks=model_config.num_layers, cache_dir=cache_config.ssd_cache_dir, + max_file_size_gb=GLOBAL_CONFIG_FROM_ENV.max_file_size_gb, ) finished_ops_queue = mp.Queue() # Create a shared memory buffer for transfer operations @@ -216,7 +219,7 @@ def bench_worker(args): bidirectional = bench_config.bidirectional if transfer_type == TransferType.H2D or transfer_type == TransferType.D2H: - worker_handle, finished_ops_queue = create_cpu_gpu_worker(model_config, cache_config) + worker_handle, finished_ops_queue = create_cpu_gpu_worker(model_config, cache_config, num_blocks_to_transfer) elif transfer_type == TransferType.H2DISK or transfer_type == TransferType.DISK2H: worker_handle, finished_ops_queue = create_cpu_ssd_worker(model_config, cache_config) else: @@ -325,7 +328,7 @@ def parse_args(): default=16) parser.add_argument("--config", type=str, - default="./benchmarks/example_config.json") + default="./benchmarks/example_config.yml") parser.add_argument("--shuffle-ids", action="store_true") parser.add_argument("--warmup-round", diff --git a/benchmarks/example_config.json b/benchmarks/example_config.json deleted file mode 100644 index 0aea8e5e3f..0000000000 --- a/benchmarks/example_config.json +++ /dev/null @@ -1,46 +0,0 @@ -{ - "ModelConfig": { - "num_layers": 64, - "num_kv_heads": 8, - "head_size": 128, - "dtype": "bfloat16", - "use_mla": false, - "tp_size": 1, - "dp_size": 1 - }, - "CacheConfig": { - "enable_cpu": true, - "enable_ssd": true, - "enable_remote": false, - "tokens_per_block": 16, - "enable_gds": false, - "gpu_kv_layout_type": "LAYERWISE", - "cpu_kv_layout_type": "BLOCKWISE", - "ssd_kv_layout_type": "BLOCKWISE", - "remote_kv_layout_type": "BLOCKWISE", - "num_cpu_blocks": 2048, - "num_ssd_blocks": 4096, - "num_remote_blocks": null, - "use_ce_transfer_h2d": false, - "use_ce_transfer_d2h": false, - "transfer_sms_h2d": 8, - "transfer_sms_d2h": 8, - "max_blocks_per_file": 32000, - "ssd_cache_dir": "./ssd_cache1/", - "ssd_cache_iouring_entries": 512, - 
"ssd_cache_iouring_flags": 1, - "remote_cache_size_mode": "file_size", - "remote_file_size": null, - "remote_file_num": null, - "remote_file_prefix": null, - "remote_cache_path": null, - "remote_config_custom": null, - "enable_trace": false, - "trace_file_path": "./flexkv_trace.log", - "trace_max_file_size_mb": 100, - "trace_max_files": 5, - "trace_flush_interval_ms": 1000, - "evict_ratio": 0.05, - "index_accel": true - } -} diff --git a/benchmarks/example_config.yml b/benchmarks/example_config.yml new file mode 100644 index 0000000000..a59827c4b7 --- /dev/null +++ b/benchmarks/example_config.yml @@ -0,0 +1,13 @@ +num_layers: 64 +num_kv_heads: 8 +head_size: 128 +dtype: bfloat16 +use_mla: false +tp_size: 1 +dp_size: 1 +tokens_per_block: 16 + +cpu_cache_gb: 8 +ssd_cache_gb: 16 +ssd_cache_dir: ./ssd_cache1/;./ssd_cache2/ +enable_gds: false diff --git a/benchmarks/utils.py b/benchmarks/utils.py index cdde8bddcf..1ebabc402e 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -1,14 +1,15 @@ import asyncio -import json import random import time from dataclasses import dataclass, field from typing import Optional, List, Tuple, Any +import yaml import torch +import numpy as np from tqdm import tqdm -from flexkv.common.config import ModelConfig, CacheConfig +from flexkv.common.config import * from flexkv.common.storage import KVCacheLayoutType @@ -17,9 +18,9 @@ class KVRequest: user_id: int turn_id: int request_type: str # "get" or "put" - token_ids: torch.Tensor - token_mask: torch.Tensor - slot_mapping: Optional[torch.Tensor] = None + token_ids: np.ndarray + token_mask: np.ndarray + slot_mapping: Optional[np.ndarray] = None request_id: int = field(init=False) _request_id_counter: int = field(init=False, default=0) @@ -28,6 +29,12 @@ def __post_init__(self): self.request_id = KVRequest._request_id_counter KVRequest._request_id_counter += 1 + if isinstance(self.token_ids, torch.Tensor): + self.token_ids = self.token_ids.numpy().astype(np.int64) + if 
isinstance(self.token_mask, torch.Tensor): + self.token_mask = self.token_mask.numpy().astype(np.int64) + if isinstance(self.slot_mapping, torch.Tensor): + self.slot_mapping = self.slot_mapping.numpy().astype(np.int64) def generate_random_multiturn(num_user_requests: int, num_turns: int, @@ -88,27 +95,32 @@ def generate_random_multiturn(num_user_requests: int, def load_config(config_path: str) -> Tuple[ModelConfig, CacheConfig]: with open(config_path) as f: - config = json.load(f) - if "ModelConfig" not in config: - print("ModelConfig not found in config, using default values") - config["ModelConfig"] = {} - if "CacheConfig" not in config: - print("CacheConfig not found in config, using default values") - config["CacheConfig"] = {} - if "dtype" in config["ModelConfig"]: - config["ModelConfig"]["dtype"] = eval(f"torch.{config['ModelConfig']['dtype']}") - if "gpu_kv_layout_type" in config["CacheConfig"]: - config["CacheConfig"]["gpu_kv_layout_type"] = \ - KVCacheLayoutType(config["CacheConfig"]["gpu_kv_layout_type"]) - if "cpu_kv_layout_type" in config["CacheConfig"]: - config["CacheConfig"]["cpu_kv_layout_type"] = \ - KVCacheLayoutType(config["CacheConfig"]["cpu_kv_layout_type"]) - if "ssd_kv_layout_type" in config["CacheConfig"]: - config["CacheConfig"]["ssd_kv_layout_type"] = \ - KVCacheLayoutType(config["CacheConfig"]["ssd_kv_layout_type"]) - if "remote_kv_layout_type" in config["CacheConfig"]: - config["CacheConfig"]["remote_kv_layout_type"] = \ - KVCacheLayoutType(config["CacheConfig"]["remote_kv_layout_type"]) - model_config = ModelConfig(**config["ModelConfig"]) - cache_config = CacheConfig(**config["CacheConfig"]) + config = yaml.load(f, Loader=yaml.SafeLoader) + print(config) + model_config = ModelConfig() + cache_config = CacheConfig() + user_config = UserConfig() + model_config.num_layers = config["num_layers"] + model_config.num_kv_heads = config["num_kv_heads"] + model_config.head_size = config["head_size"] + model_config.dtype = 
eval(f"torch.{config['dtype']}") + model_config.use_mla = config["use_mla"] + model_config.tp_size = config["tp_size"] + model_config.dp_size = config["dp_size"] + cache_config.tokens_per_block = config["tokens_per_block"] + + if "cpu_cache_gb" in config: + user_config.cpu_cache_gb = config["cpu_cache_gb"] + if "ssd_cache_gb" in config: + user_config.ssd_cache_gb = config["ssd_cache_gb"] + if "ssd_cache_dir" in config: + user_config.ssd_cache_dir = parse_path_list(config["ssd_cache_dir"]) + if "enable_gds" in config: + user_config.enable_gds = config["enable_gds"] + update_default_config_from_user_config(model_config, cache_config, user_config) return model_config, cache_config + +if __name__ == "__main__": + model_config, cache_config = load_config("./benchmarks/example_config.yml") + print(model_config) + print(cache_config) diff --git a/docs/dynamo_integration/README_en.md b/docs/dynamo_integration/README_en.md index f456004c3c..cb62e5cba5 100644 --- a/docs/dynamo_integration/README_en.md +++ b/docs/dynamo_integration/README_en.md @@ -32,7 +32,7 @@ cd FlexKV && ./build.sh ```bash # Navigate to vLLM directory -cd /opt/vllm +cd /opt/vllm # apply patch git apply /your/path/to/FlexKV/examples/vllm_adaption/vllm_0_10_1_1-flexkv-connector.patch ``` @@ -82,48 +82,29 @@ python -m dynamo.frontend --router-mode kv --http-port 8000 & # Define number of worker nodes NUM_WORKERS=4 -# When using multiple workers, ensure FlexKV ports are different to avoid hanging at flexkv init -# Adjust num_cpu_blocks and num_ssd_blocks values according to your server configuration -for i in $(seq 0 $((NUM_WORKERS-1))); do - cat < ./flexkv_config_${i}.json -{ - "enable_flexkv": true, - "server_recv_port": "ipc:///tmp/flexkv_${i}_test", - "cache_config": { - "enable_cpu": true, - "enable_ssd": false, - "enable_remote": false, - "enable_gds": false, - "enable_trace": false, - "ssd_cache_iouring_entries": 512, - "tokens_per_block": 64, - "num_cpu_blocks": 10240, - "num_ssd_blocks": 256000, - 
"ssd_cache_dir": "/data/flexkv_ssd/", - "evict_ratio": 0.05, - "index_accel": true - - }, - "num_log_interval_requests": 200 -} -EOF -done - +# Configure FlexKV using environment variables, disabling config file +unset FLEXKV_CONFIG_PATH +# Adjust CPU and SSD space sizes according to your server configuration +export FLEXKV_CPU_CACHE_GB=32 +export FLEXKV_SSD_CACHE_GB=128 +export FLEXKV_SSD_CACHE_DIR="/data/flexkv_ssd/" # Use a loop to start worker nodes for i in $(seq 0 $((NUM_WORKERS-1))); do # Calculate GPU device IDs GPU_START=$((i*2)) GPU_END=$((i*2+1)) - + if [ $i -lt $((NUM_WORKERS-1)) ]; then - FLEXKV_CONFIG_PATH="./flexkv_config_${i}.json" CUDA_VISIBLE_DEVICES=${GPU_START},${GPU_END} python3 -m dynamo.vllm --model deepseek-ai/DeepSeek-R1-Distill-Llama-70B --tensor_parallel_size 2 --block-size 64 --gpu-memory-utilization 0.9 --max-model-len 100310 & + # When using multiple workers, ensure FlexKV ports are different to avoid hanging at flexkv init + # Set FlexKV port via the `FLEXKV_SERVER_RECV_PORT` environment variable + FLEXKV_SERVER_RECV_PORT="ipc:///tmp/flexkv_server_${i}" CUDA_VISIBLE_DEVICES=${GPU_START},${GPU_END} python3 -m dynamo.vllm --model deepseek-ai/DeepSeek-R1-Distill-Llama-70B --tensor_parallel_size 2 --block-size 64 --gpu-memory-utilization 0.9 --max-model-len 100310 & else - FLEXKV_CONFIG_PATH="./flexkv_config_${i}.json" CUDA_VISIBLE_DEVICES=${GPU_START},${GPU_END} python3 -m dynamo.vllm --model deepseek-ai/DeepSeek-R1-Distill-Llama-70B --tensor_parallel_size 2 --block-size 64 --gpu-memory-utilization 0.9 --max-model-len 100310 + FLEXKV_SERVER_RECV_PORT="ipc:///tmp/flexkv_server_${i}" CUDA_VISIBLE_DEVICES=${GPU_START},${GPU_END} python3 -m dynamo.vllm --model deepseek-ai/DeepSeek-R1-Distill-Llama-70B --tensor_parallel_size 2 --block-size 64 --gpu-memory-utilization 0.9 --max-model-len 100310 fi done ``` -> Note: The `flexkv_config.json` configuration is provided as a simple example only. 
For full parameter options, please refer to [`docs/flexkv_config_reference/README_en.md`](../../docs/flexkv_config_reference/README_en.md) +> Note: You can configure using YAML or JSON files. The above configuration is provided as a simple example only. For full parameter options, please refer to [`docs/flexkv_config_reference/README_en.md`](../../docs/flexkv_config_reference/README_en.md) ### Verification @@ -152,4 +133,4 @@ genai-perf can send requests according to the timestamps in the trace file and c ```bash genai-perf profile --model deepseek-ai/DeepSeek-R1-Distill-Llama-70B --tokenizer deepseek-ai/DeepSeek-R1-Distill-Llama-70B --endpoint-type chat --endpoint /v1/chat/completions --streaming --url http://localhost:8000 --input-file payload:mooncake_trace_1_6.jsonl --random-seed 100 -v -H 'Authorization: Bearer NOT USED' -H 'Accept: text/event-stream' -- --stability-percentage 99 -``` \ No newline at end of file +``` diff --git a/docs/dynamo_integration/README_zh.md b/docs/dynamo_integration/README_zh.md index 53a1b49f93..70349fa6c0 100644 --- a/docs/dynamo_integration/README_zh.md +++ b/docs/dynamo_integration/README_zh.md @@ -32,7 +32,7 @@ cd FlexKV && ./build.sh ```bash # 进入 vLLM 目录 -cd /opt/vllm +cd /opt/vllm # apply patch git apply /your/path/to/FlexKV/examples/vllm_adaption/vllm_0_10_1_1-flexkv-connector.patch ``` @@ -83,48 +83,29 @@ python -m dynamo.frontend --router-mode kv --http-port 8000 & # 定义工作节点数量 NUM_WORKERS=4 -# 多个worker时注意FlexKV的端口应不同,否则会卡在flexkv init这一步 -# 请根据服务器的配置,调整num_cpu_blocks和num_ssd_blocks的数值 -for i in $(seq 0 $((NUM_WORKERS-1))); do - cat < ./flexkv_config_${i}.json -{ - "enable_flexkv": true, - "server_recv_port": "ipc:///tmp/flexkv_${i}_test", - "cache_config": { - "enable_cpu": true, - "enable_ssd": false, - "enable_remote": false, - "enable_gds": false, - "enable_trace": false, - "ssd_cache_iouring_entries": 512, - "tokens_per_block": 64, - "num_cpu_blocks": 10240, - "num_ssd_blocks": 256000, - "ssd_cache_dir": 
"/data/flexkv_ssd/", - "evict_ratio": 0.05, - "index_accel": true - - }, - "num_log_interval_requests": 200 -} -EOF -done - +# 使用环境变量配置Flexkv,禁用配置文件 +unset FLEXKV_CONFIG_PATH +# 请根据服务器的配置,调整CPU和SSD的空间大小 +export FLEXKV_CPU_CACHE_GB=32 +export FLEXKV_SSD_CACHE_GB=128 +export FLEXKV_SSD_CACHE_DIR="/data/flexkv_ssd/" # 使用for循环启动工作节点 for i in $(seq 0 $((NUM_WORKERS-1))); do # 计算GPU设备ID GPU_START=$((i*2)) GPU_END=$((i*2+1)) - + if [ $i -lt $((NUM_WORKERS-1)) ]; then - FLEXKV_CONFIG_PATH="./flexkv_config_${i}.json" CUDA_VISIBLE_DEVICES=${GPU_START},${GPU_END} python3 -m dynamo.vllm --model deepseek-ai/DeepSeek-R1-Distill-Llama-70B --tensor_parallel_size 2 --block-size 64 --gpu-memory-utilization 0.9 --max-model-len 100310 & + # 多个worker时注意Flexkv的端口应不同,否则会卡在flexkv init这一步 + # 通过环境变量 `FLEXKV_SERVER_RECV_PORT` 设置Flexkv的端口 + FLEXKV_SERVER_RECV_PORT="ipc:///tmp/flexkv_server_${i}" CUDA_VISIBLE_DEVICES=${GPU_START},${GPU_END} python3 -m dynamo.vllm --model deepseek-ai/DeepSeek-R1-Distill-Llama-70B --tensor_parallel_size 2 --block-size 64 --gpu-memory-utilization 0.9 --max-model-len 100310 & else - FLEXKV_CONFIG_PATH="./flexkv_config_${i}.json" CUDA_VISIBLE_DEVICES=${GPU_START},${GPU_END} python3 -m dynamo.vllm --model deepseek-ai/DeepSeek-R1-Distill-Llama-70B --tensor_parallel_size 2 --block-size 64 --gpu-memory-utilization 0.9 --max-model-len 100310 + FLEXKV_SERVER_RECV_PORT="ipc:///tmp/flexkv_server_${i}" CUDA_VISIBLE_DEVICES=${GPU_START},${GPU_END} python3 -m dynamo.vllm --model deepseek-ai/DeepSeek-R1-Distill-Llama-70B --tensor_parallel_size 2 --block-size 64 --gpu-memory-utilization 0.9 --max-model-len 100310 fi done ``` -> 注:`flexkv_config.json`配置仅为简单示例,选项请参考[`docs/flexkv_config_reference/README_zh.md`](../../docs/flexkv_config_reference/README_zh.md) +> 注:可使用 YAML 或 JSON 文件配置,上述配置仅为简单示例,更多选项请参考[`docs/flexkv_config_reference/README_zh.md`](../../docs/flexkv_config_reference/README_zh.md) ### 验证 @@ -152,4 +133,4 @@ genai-perf可以根据trace文件里的时间戳来发送请求,统计LLM服 ```bash genai-perf 
profile --model deepseek-ai/DeepSeek-R1-Distill-Llama-70B --tokenizer deepseek-ai/DeepSeek-R1-Distill-Llama-70B --endpoint-type chat --endpoint /v1/chat/completions --streaming --url http://localhost:8000 --input-file payload:mooncake_trace_1_6.jsonl --random-seed 100 -v -H 'Authorization: Bearer NOT USED' -H 'Accept: text/event-stream' -- --stability-percentage 99 -``` \ No newline at end of file +``` diff --git a/docs/flexkv_config_reference/README_en.md b/docs/flexkv_config_reference/README_en.md index c20116dead..a416656dfc 100644 --- a/docs/flexkv_config_reference/README_en.md +++ b/docs/flexkv_config_reference/README_en.md @@ -1,147 +1,160 @@ # FlexKV Configuration Guide -This guide explains how to configure and use the FlexKV online serving configuration file (`flexkv_config.json`), including the meaning of all parameters, recommended values, and typical usage scenarios. +This guide provides detailed instructions on how to configure and use FlexKV's online service configuration file (`flexkv_config.json`), covering the meaning of all parameters, recommended values, and typical usage scenarios. --- -## Recommended Configuration +## Basic Configuration Options -Below is a production-grade recommended configuration that balances performance and stability: +### 1. Configuration via Config File +If the `FLEXKV_CONFIG_PATH` environment variable is set, the configuration file specified by this variable will be used with priority. Both yml and json file formats are supported. 
+ +Below is a recommended configuration example that enables both CPU and SSD cache layers: + +YML configuration: +```yml +cpu_cache_gb: 32 +ssd_cache_gb: 1024 +ssd_cache_dir: /data/flexkv_ssd/ +enable_gds: false +``` +Or using JSON configuration: ```json { - "enable_flexkv": true, - "server_recv_port": "ipc:///tmp/flexkv_test", - "cache_config": { - "enable_cpu": true, - "enable_ssd": true, - "enable_remote": false, - "enable_gds": false, - "enable_trace": false, - "ssd_cache_iouring_entries": 512, - "tokens_per_block": 64, - "num_cpu_blocks": 233000, - "num_ssd_blocks": 4096000, - "ssd_cache_dir": "/data/flexkv_ssd/", - "evict_ratio": 0.05, - "index_accel": true - }, - "num_log_interval_requests": 2000 + "cpu_cache_gb": 32, + "ssd_cache_gb": 1024, + "ssd_cache_dir": "/data/flexkv_ssd/", + "enable_gds": false } ``` -- `num_cpu_blocks` and `num_ssd_blocks` represent the total number of blocks in CPU memory and SSD respectively. These values must be configured according to your machine specs and model size. See [Cache Capacity Configuration](#cache-capacity-config) for calculation details. -- `ssd_cache_dir` specifies the directory where SSD-stored KV cache files are saved. +- `cpu_cache_gb`: CPU cache layer capacity in GB, must not exceed physical memory. +- `ssd_cache_gb`: SSD cache layer capacity in GB. Recommended to be greater than `cpu_cache_gb` and a multiple of `FLEXKV_MAX_FILE_SIZE_GB`. Set to 0 if only using CPU cache (SSD cache will not be enabled). +- `ssd_cache_dir`: Directory where SSD cache data is stored. If multiple SSDs are available, separate multiple mount paths with semicolons `;`. For example, `ssd_cache_dir: /data0/flexkv_ssd/;/data1/flexkv_ssd/` to improve bandwidth. +- `enable_gds`: Whether to enable GPU Direct Storage (GDS). If hardware and drivers support it, enabling this can improve SSD to GPU data throughput. Disabled by default. --- -## Configuration File Structure Overview +### 2. 
Configuration via Environment Variables -The FlexKV configuration file is a JSON file, primarily consisting of three parts: +If the `FLEXKV_CONFIG_PATH` environment variable is not set, configuration can be done through the following environment variables. -- `enable_flexkv`: Whether to enable FlexKV (must be set to `true` to take effect). -- `server_recv_port`: The IPC port on which the FlexKV service listens. -- `cache_config`: The core cache configuration object, containing all cache behavior parameters. -- `num_log_interval_requests`: Log statistics interval (outputs performance log every N requests). +> Note: If `FLEXKV_CONFIG_PATH` is set, the configuration file specified by `FLEXKV_CONFIG_PATH` will take priority, and the following environment variables will be ignored. ---- +| Environment Variable | Type | Default | Description | +|----------------------|------|---------|-------------| +| `FLEXKV_CPU_CACHE_GB` | int | 16 | CPU cache layer capacity in GB, must not exceed physical memory | +| `FLEXKV_SSD_CACHE_GB` | int | 0 | SSD cache layer capacity in GB. Recommended to be greater than `FLEXKV_CPU_CACHE_GB` and a multiple of `FLEXKV_MAX_FILE_SIZE_GB`. Set to 0 if only using CPU cache (SSD cache will not be enabled) | +| `FLEXKV_SSD_CACHE_DIR` | str | "./flexkv_ssd" | Directory where SSD cache data is stored. If multiple SSDs are available, separate multiple mount paths with semicolons `;`. For example, `"/data0/flexkv_ssd/;/data1/flexkv_ssd/"` to improve bandwidth | +| `FLEXKV_ENABLE_GDS` | bool | 0 | Whether to enable GPU Direct Storage (GDS). If hardware and drivers support it, enabling this can improve SSD to GPU data throughput. 
Disabled by default, set to 1 to enable | -## Complete `cache_config` Parameter Reference (from [`flexkv/common/config.py`](../../flexkv/common/config.py)) +--- -### Basic Configuration +## Advanced Configuration Options +Advanced configuration is mainly for users who need fine-tuned performance optimization or custom special requirements. It is recommended for users with some understanding of FlexKV. +All advanced configurations support configuration via environment variables or yml/json configuration files. In case of conflicts with multiple configuration levels, the final priority order is: **Configuration file > Environment variables > Built-in default parameters**. +If setting in a configuration file, remove the `FLEXKV_` prefix and convert everything to lowercase. For example, setting `server_client_mode: 1` in a yml file will override the value of the `FLEXKV_SERVER_CLIENT_MODE` environment variable. +Some configurations can only be set through environment variables. -| Parameter Name | Type | Default | Description | -|----------------|------|---------|-------------| -| `tokens_per_block` | int | 16 | Number of tokens per KV block. Must match the `block_size` used in the acceleration framework (e.g., vLLM). | -| `enable_cpu` | bool | true | Whether to enable CPU memory as a cache layer. Strongly recommended to enable. | -| `enable_ssd` | bool | false | Whether to enable SSD as a cache layer. Recommended if NVMe SSD is available. | -| `enable_remote` | bool | false | Whether to enable remote cache (e.g., scalable cloud storage). Requires remote cache engine and custom implementation. | -| `enable_gds` | bool | false | Whether to use GPU Direct Storage (GDS) to accelerate SSD I/O. Not currently supported. | -| `index_accel` | bool | false | Whether to enable C++ RadixTree. Recommended to enable. 
| +### Enable/Disable FLEXKV ---- +> Note: This configuration can only be set through environment variables -### KV Cache Layout Types (Generally No Need to Modify) +| Environment Variable | Type | Default | Description | +|---------------------|------|---------|-------------| +| `ENABLE_FLEXKV` | bool | 1 | 0-Disable FLEXKV, 1-Enable FLEXKV | -| Parameter Name | Type | Default | Description | -|----------------|------|---------|-------------| -| `gpu_kv_layout_type` | enum | LAYERWISE | Organization of KV cache on GPU (layer-wise or block-wise). Must match vLLM’s layout (currently `LAYERWISE`). | -| `cpu_kv_layout_type` | enum | BLOCKWISE | Organization on CPU. Recommended to use `BLOCKWISE`. Does not need to match vLLM. | -| `ssd_kv_layout_type` | enum | BLOCKWISE | Organization on SSD. Recommended to use `BLOCKWISE`. Does not need to match vLLM. | -| `remote_kv_layout_type` | enum | BLOCKWISE | Organization for remote cache. Must be defined according to remote backend’s layout. | -> Note: Do not modify layout types unless you have specific performance requirements. --- -### Cache Capacity Configuration +### Server Mode Configuration + +| Environment Variable | Type | Default | Description | +|---------------------|------|---------|-------------| +| `FLEXKV_SERVER_CLIENT_MODE` | bool | 0 | `server_client_mode`: Whether to force enable server-client mode | +| `FLEXKV_SERVER_RECV_PORT` | str | "ipc:///tmp/flexkv_server" | `server_recv_port`: Server receive port configuration | -| Parameter Name | Type | Default | Description | -|----------------|------|---------|-------------| -| `num_cpu_blocks` | int | 1000000 | Number of blocks allocated in CPU memory. Adjust based on available RAM. | -| `num_ssd_blocks` | int | 10000000 | Number of blocks allocated on SSD. | -| `num_remote_blocks` | int \| None | None | Number of blocks allocated in remote cache. | +--- -> Note: Block size in all cache levels (CPU/SSD/Remote) matches the GPU block size. 
Estimate cache capacities based on GPU KV cache memory usage and block count. +### KV Cache Layout Types -> Note: `block_size = num_layer * _kv_dim * tokens_per_block * num_head * head_size * dtype_size`. +| Environment Variable | Type | Default | Description | +|---------------------|------|---------|-------------| +| `FLEXKV_CPU_LAYOUT` | str | BLOCKFIRST | CPU storage layout, options: `LAYERFIRST` and `BLOCKFIRST`, recommended to use `BLOCKFIRST` | +| `FLEXKV_SSD_LAYOUT` | str | BLOCKFIRST | SSD storage layout, options: `LAYERFIRST` and `BLOCKFIRST`, recommended to use `BLOCKFIRST` | +| `FLEXKV_REMOTE_LAYOUT` | str | BLOCKFIRST | REMOTE storage layout, options: `LAYERFIRST` and `BLOCKFIRST`, recommended to use `BLOCKFIRST` | +| `FLEXKV_GDS_LAYOUT` | str | BLOCKFIRST | GDS storage layout, options: `LAYERFIRST` and `BLOCKFIRST`, recommended to use `BLOCKFIRST` | --- ### CPU-GPU Transfer Optimization -| Parameter Name | Type | Default | Description | -|----------------|------|---------|-------------| -| `use_ce_transfer_h2d` | bool | false | Whether to use CUDA Copy Engine for Host→Device transfers. Reduces SM usage but may slightly reduce bandwidth. Real-world difference is minimal. | -| `use_ce_transfer_d2h` | bool | false | Whether to use CUDA Copy Engine for Device→Host transfers. | -| `transfer_sms_h2d` | int | 8 | Number of SMs (Streaming Multiprocessors) allocated for H2D transfers. | -| `transfer_sms_d2h` | int | 8 | Number of SMs allocated for D2H transfers. | +| Environment Variable | Type | Default | Description | +|---------------------|------|---------|-------------| +| `FLEXKV_USE_CE_TRANSFER_H2D` | bool | 0 | Whether to use cudaMemcpyAsync for Host→Device transfers. Can avoid occupying SM, but transfer speed will be reduced | +| `FLEXKV_USE_CE_TRANSFER_D2H` | bool | 0 | Whether to use cudaMemcpyAsync for Device→Host transfers. 
Can avoid occupying SM, but transfer speed will be reduced | +| `FLEXKV_TRANSFER_SMS_H2D` | int | 8 | Number of streaming multiprocessors used for H2D transfer, only effective when `FLEXKV_USE_CE_TRANSFER_H2D` is 0 | +| `FLEXKV_TRANSFER_SMS_D2H` | int | 8 | Number of streaming multiprocessors used for D2H transfer, only effective when `FLEXKV_USE_CE_TRANSFER_D2H` is 0 | --- -### SSD Cache Configuration +### SSD I/O Optimization -| Parameter Name | Type | Default | Description | -|----------------|------|---------|-------------| -| `max_blocks_per_file` | int | 32000 | Maximum number of blocks per SSD file. `-1` means unlimited. | -| `ssd_cache_dir` | str \| List[str] | None | **Required.** Path to SSD cache directory, e.g., `"/data/flexkv_ssd/"`. | -| `ssd_cache_iouring_entries` | int | 0 | io_uring queue depth. Recommended: `512` for significantly improved concurrent I/O performance. | -| `ssd_cache_iouring_flags` | int | 0 | io_uring flags. Recommended: `1`.| +> Note: Setting `iouring_entries` to 0 disables iouring. Not recommended to set to 0. + +| Environment Variable | Type | Default | Description | +|---------------------|------|---------|-------------| +| `FLEXKV_MAX_FILE_SIZE_GB` | float | -1 | Maximum size of a single SSD file, -1 means unlimited | +| `FLEXKV_IORING_ENTRIES` | int | 512 | io_uring queue depth. Recommended to set to `512` to improve concurrent I/O performance | +| `FLEXKV_IORING_FLAGS` | int | 0 | io_uring flags, default is 0 | -> Note: To maximize bandwidth across multiple SSDs, bind each SSD to a separate directory and specify them as a list: -> `"ssd_cache_dir": ["/data0/flexkv_ssd/", "/data1/flexkv_ssd/"]`. -> KV blocks will be evenly distributed across all SSDs. -> Note: Setting `ssd_cache_iouring_entries` to `0` disables io_uring. Not recommended. 
--- -### Remote Cache Configuration (Skip if not enabled) +### Multi-Node TP + +> Note: These configurations can only be set through environment variables + +| Environment Variable | Type | Default | Description | +|---------------------|------|---------|-------------| +| `FLEXKV_MASTER_HOST` | str | "localhost" | Master node IP for multi-node TP | +| `FLEXKV_MASTER_PORTS` | str | "5556,5557,5558" | Master node ports for multi-node TP. Uses three ports, separated by commas | -| Parameter Name | Type | Default | Description | -|----------------|------|---------|-------------| -| `remote_cache_size_mode` | str | "file_size" | Allocate remote cache space by file size or block count. | -| `remote_file_size` | int \| None | None | Size (in bytes) of each remote file. | -| `remote_file_num` | int \| None | None | Number of remote files. | -| `remote_file_prefix` | str \| None | None | Prefix for remote file names. | -| `remote_cache_path` | str \| List[str] | None | Remote cache path (e.g., Redis URL, S3 path). | -| `remote_config_custom` | dict \| None | None | Custom remote cache configurations (e.g., timeout, authentication). | --- -### Tracing and Logging +### Logging Configuration + +> Note: These configurations can only be set through environment variables + +| Environment Variable | Type | Default | Description | +|---------------------|------|---------|-------------| +| `FLEXKV_LOGGING_PREFIX` | str | "FLEXKV" | Logging prefix | +| `FLEXKV_LOG_LEVEL` | str | "INFO" | Log output level, options: "DEBUG" "INFO" "WARNING" "ERROR" "CRITICAL" "OFF" | +| `FLEXKV_NUM_LOG_INTERVAL_REQUESTS` | int | 200 | Log output interval request count | + + + +--- + +### Tracing and Debugging + +| Environment Variable | Type | Default | Description | +|---------------------|------|---------|-------------| +| `FLEXKV_ENABLE_TRACE` | bool | 0 | Whether to enable performance tracing. 
Recommended to disable (`0`) in production to reduce overhead | +| `FLEXKV_TRACE_FILE_PATH` | str | "./flexkv_trace.log" | Trace log file path | +| `FLEXKV_TRACE_MAX_FILE_SIZE_MB` | int | 100 | Maximum size (MB) per trace log file | +| `FLEXKV_TRACE_MAX_FILES` | int | 5 | Maximum number of trace log files to retain | +| `FLEXKV_TRACE_FLUSH_INTERVAL_MS` | int | 1000 | Trace log flush interval (milliseconds) | -| Parameter Name | Type | Default | Description | -|----------------|------|---------|-------------| -| `enable_trace` | bool | true | Whether to enable performance tracing. Disable (`false`) in production to reduce overhead. | -| `trace_file_path` | str | "./flexkv_trace.log" | Path to trace log file. | -| `trace_max_file_size_mb` | int | 100 | Maximum size (MB) per trace log file. | -| `trace_max_files` | int | 5 | Maximum number of trace log files to retain. | -| `trace_flush_interval_ms` | int | 1000 | Trace log flush interval (milliseconds). | --- -### Cache Eviction Policy +### Control Plane Optimization -| Parameter Name | Type | Default | Description | -|----------------|------|---------|-------------| -| `evict_ratio` | float | 0.0 | Ratio of blocks to proactively evict from CPU/SSD per eviction cycle. `0.0` = evict only the minimal necessary blocks (more eviction cycles may impact performance). Recommended: `0.05` (evict 5% of least recently used blocks per cycle). | +| Environment Variable | Type | Default | Description | +|---------------------|------|---------|-------------| +| `FLEXKV_INDEX_ACCEL` | bool | 1 | 0-Enable Python version RadixTree implementation, 1-Enable C++ version RadixTree implementation | +| `FLEXKV_EVICT_RATIO` | float | 0.05 | CPU and SSD eviction ratio for proactive eviction per cycle (0.0 = only evict the minimal necessary blocks). 
Recommended to keep at `0.05`, i.e., evict 5% of least recently used blocks per cycle | diff --git a/docs/flexkv_config_reference/README_zh.md b/docs/flexkv_config_reference/README_zh.md index d1f7a3a279..34821c5dd4 100644 --- a/docs/flexkv_config_reference/README_zh.md +++ b/docs/flexkv_config_reference/README_zh.md @@ -4,142 +4,152 @@ --- -## 推荐配置方案 +## 基础配置选项 -以下是一个兼顾性能与稳定性的生产级推荐配置: +### 一、通过文件配置 +如果设置了环境变量 `FLEXKV_CONFIG_PATH`,将优先使用该变量指定的配置文件。支持yml和json两种文件类型。 + +以下是一个同时开启 CPU 和 SSD 缓存层的推荐配置示例: + +yml配置: +```yml +cpu_cache_gb: 32 +ssd_cache_gb: 1024 +ssd_cache_dir: /data/flexkv_ssd/ +enable_gds: false +``` +或使用json配置: ```json { - "enable_flexkv": true, - "server_recv_port": "ipc:///tmp/flexkv_test", - "cache_config": { - "enable_cpu": true, - "enable_ssd": true, - "enable_remote": false, - "enable_gds": false, - "enable_trace": false, - "ssd_cache_iouring_entries": 512, - "tokens_per_block": 64, - "num_cpu_blocks": 233000, - "num_ssd_blocks": 4096000, - "ssd_cache_dir": "/data/flexkv_ssd/", - "evict_ratio": 0.05, - "index_accel": true - }, - "num_log_interval_requests": 2000 + "cpu_cache_gb": 32, + "ssd_cache_gb": 1024, + "ssd_cache_dir": "/data/flexkv_ssd/", + "enable_gds": false } ``` -- 其中的`num_cpu_blocks`和`num_ssd_blocks`分别代表内存和SSD中block的总数量,需要根据实际机器配置和模型来配置,具体计算方式见下文[缓存容量配置](#cache-capacity-config) -- `ssd_cache_dir`为ssd中KVCache存放的文件目录 +- `cpu_cache_gb`:CPU 缓存层容量,单位为 GB,不能超过物理内存。 +- `ssd_cache_gb`:SSD 缓存层容量,单位为 GB。建议大于 `cpu_cache_gb`并为`FLEXKV_MAX_FILE_SIZE_GB`的整数倍,若仅用CPU缓存则设为 0(此时不启用 SSD 缓存)。 +- `ssd_cache_dir`:SSD 缓存数据的存放目录。若有多块 SSD,可通过分号 `;` 分隔多个挂载路径。例如 `ssd_cache_dir: /data0/flexkv_ssd/;/data1/flexkv_ssd/`,以提升带宽。 +- `enable_gds`:是否启用 GPU Direct Storage(GDS)。如硬件和驱动支持,开启后可提升 SSD 到 GPU 的数据吞吐能力。默认关闭。 --- -## 配置文件结构概览 +### 二、通过环境变量配置 -FlexKV 的配置文件是一个 JSON 文件,主要包含三个部分: +如果未设置 `FLEXKV_CONFIG_PATH`环境变量,则可通过以下环境变量进行配置。 -- `enable_flexkv`: 是否启用 FlexKV 功能(必须设为 `true` 才生效) -- `server_recv_port`: FlexKV 服务监听的 IPC 端口 -- `cache_config`: 核心缓存配置对象,包含所有缓存行为参数 -- 
`num_log_interval_requests`: 日志统计间隔(每处理 N 个请求输出一次性能日志) +> 注:如果设置了`FLEXKV_CONFIG_PATH`,将优先使用`FLEXKV_CONFIG_PATH`指定的配置文件,以下环境变量将被忽略。 + +| 环境变量 | 类型 | 默认值 | 说明 | +|----------------------|-------|-------------|----------------------------------------------------------------------------------------------------------------| +| `FLEXKV_CPU_CACHE_GB` | int | 16 | CPU 缓存层容量,单位为 GB,不能超过物理内存 +| `FLEXKV_SSD_CACHE_GB` | int | 0 | SSD 缓存层容量,单位为 GB。建议设置大于 `FLEXKV_CPU_CACHE_GB`并为`FLEXKV_MAX_FILE_SIZE_GB`的整数倍,若仅用CPU缓存则设为 0(此时不启用 SSD 缓存) | +| `FLEXKV_SSD_CACHE_DIR` | str | "./flexkv_ssd" | SSD 缓存数据的存放目录。若有多块 SSD,可通过分号 `;` 分隔多个挂载路径。例如 `"/data0/flexkv_ssd/;/data1/flexkv_ssd/"`,以提升带宽 | +| `FLEXKV_ENABLE_GDS` | bool | 0 | 是否启用 GPU Direct Storage(GDS)。如硬件和驱动支持,开启后可提升 SSD 到 GPU 的数据吞吐能力。默认关闭,开启请设为 1 | --- -## cache_config完整参数详解(来自 [`flexkv/common/config.py`](../../flexkv/common/config.py)) +## 高级配置选项 +高级配置主要针对需要精细化性能优化或自定义特殊需求的用户,建议对 FlexKV 具备一定理解的用户使用。 +所有高级配置均支持通过环境变量或 yml/json 配置文件进行设置,如有多级配置冲突,最终生效顺序为:**配置文件 > 环境变量 > 默认内置参数**。 +如果在配置文件中设置,请去除`FLEXKV_`前缀并全部转换为小写,例如在yml文件中设置`server_client_mode: 1`将会覆盖`FLEXKV_SERVER_CLIENT_MODE`环境变量的值。 +部分配置只能通过环境变量设置。 + +### 启用/禁用FLEXKV -### 基础配置 +> 注:该配置只能通过环境变量设置 -| 参数名 | 类型 | 默认值 | 说明 | +| 环境变量 | 类型 | 默认值 | 说明 | |--------|------|--------|------| -| `tokens_per_block` | int | 16 | 每个 KV Block 包含的 token 数量。需要与加速框架(如vLLM)中`block_size`保持一致 | -| `enable_cpu` | bool | true | 是否启用 CPU 内存作为缓存层。强烈建议开启。 | -| `enable_ssd` | bool | false | 是否启用 SSD 作为缓存层。如配备 NVMe SSD,建议开启。 | -| `enable_remote` | bool | false | 是否启用远程缓存(如可扩展云存储等)。需要配合远程缓存和自定义的远程缓存引擎使用 | -| `enable_gds` | bool | false | 是否使用 GPU Direct Storage(GDS)加速 SSD 读写。目前暂不支持。 | -| `index_accel` | bool | false | 是否启用C++ RadixTree。推荐开启。 | +| `ENABLE_FLEXKV` | bool | 1 | 0-禁用FLEXKV,1-启用FLEXKV | --- -### KV 缓存布局类型(一般无需修改) +### 服务器模式配置 -| 参数名 | 类型 | 默认值 | 说明 | +| 环境变量 | 类型 | 默认值 | 说明 | |--------|------|--------|------| -| `gpu_kv_layout_type` | enum | LAYERWISE | GPU 上 KV Cache 
的组织方式(按层或按块)。目前vLLM在GPU组织方式为`LAYERWISE`,因此FlexKV的`gpu_kv_layout_type`须与vLLM保持一致 | -| `cpu_kv_layout_type` | enum | BLOCKWISE | CPU 上按块组织, 推荐使用`BLOCKWISE`,不需要与vLLM保持一致 | -| `ssd_kv_layout_type` | enum | BLOCKWISE | SSD 上按块组织, 推荐使用`BLOCKWISE`,不需要与vLLM保持一致 | -| `remote_kv_layout_type` | enum | BLOCKWISE | 远程缓存按块组织, 需要按照remote组织形式定义 | - -> 注:除非有特殊性能需求,否则不建议修改布局类型。 +| `FLEXKV_SERVER_CLIENT_MODE` | bool | 0 | `server_client_mode`: 是否强制启用服务器-客户端模式 | +| `FLEXKV_SERVER_RECV_PORT` | str | "ipc:///tmp/flexkv_server" | `server_recv_port`: 服务器接收端口配置 | --- -### 缓存容量配置 +### KV 缓存布局类型 -| 参数名 | 类型 | 默认值 | 说明 | +| 环境变量 | 类型 | 默认值 | 说明 | |--------|------|--------|------| -| `num_cpu_blocks` | int | 1000000 | CPU 缓存块数。根据内存大小调整。| -| `num_ssd_blocks` | int | 10000000 | SSD 缓存块数。| -| `num_remote_blocks` | int \| None | None | 远程缓存块数。| - -> 注:FlexKV里的各级缓存的block大小与GPU中的block大小保持一致,可以参考GPU的KVCache显存大小与block数量估算各级缓存中的block数量。 - -> 注:block_size = num_layer * _kv_dim * tokens_per_block * num_head * self.head_size * torch_dtype.size()。 +| `FLEXKV_CPU_LAYOUT` | str | BLOCKFIRST | CPU 存储布局,可选`LAYERFIRST`和`BLOCKFIRST`, 推荐使用`BLOCKFIRST` | +| `FLEXKV_SSD_LAYOUT` | str | BLOCKFIRST | SSD 存储布局,可选`LAYERFIRST`和`BLOCKFIRST`, 推荐使用`BLOCKFIRST` | +| `FLEXKV_REMOTE_LAYOUT` | str | BLOCKFIRST | REMOTE 存储布局,可选`LAYERFIRST`和`BLOCKFIRST`, 推荐使用`BLOCKFIRST` | +| `FLEXKV_GDS_LAYOUT` | str | BLOCKFIRST | GDS 存储布局,可选`LAYERFIRST`和`BLOCKFIRST`, 推荐使用`BLOCKFIRST` | --- ### CPU-GPU 传输优化 -| 参数名 | 类型 | 默认值 | 说明 | +| 环境变量 | 类型 | 默认值 | 说明 | |--------|------|--------|------| -| `use_ce_transfer_h2d` | bool | false | 是否使用 cuda copy engine 优化 Host→Device 传输,使用CE可以减少GPU SM在传输上的使用,但是传输速度会降低,实际测试差距不大 | -| `use_ce_transfer_d2h` | bool | false | 是否使用 cuda copy engine 优化 Device→Host 传输 | -| `transfer_sms_h2d` | int | 8 | H2D 传输使用的流处理器数量 | -| `transfer_sms_d2h` | int | 8 | D2H 传输使用的流处理器数量 | +| `FLEXKV_USE_CE_TRANSFER_H2D` | bool | 0 | 是否使用 cudaMemcpyAsync 实现 Host→Device 传输,可以避免占用 SM,但是传输速度会降低 | +| `FLEXKV_USE_CE_TRANSFER_D2H` | bool | 0 
| 是否使用 cudaMemcpyAsync 实现 Device→Host 传输,可以避免占用 SM,但是传输速度会降低 | +| `FLEXKV_TRANSFER_SMS_H2D` | int | 8 | H2D 传输使用的流处理器数量,仅在`FLEXKV_USE_CE_TRANSFER_H2D`为0时生效 | +| `FLEXKV_TRANSFER_SMS_D2H` | int | 8 | D2H 传输使用的流处理器数量,仅在`FLEXKV_USE_CE_TRANSFER_D2H`为0时生效 | --- -### SSD 缓存配置 +### SSD I/O优化 + +> 注:`iouring_entries`设置为0即禁用iouring,不推荐设置为0。 -| 参数名 | 类型 | 默认值 | 说明 | +| 环境变量 | 类型 | 默认值 | 说明 | |--------|------|--------|------| -| `max_blocks_per_file` | int | 32000 | 单个 SSD 文件最多包含的 block 数。-1 表示无限制 | -| `ssd_cache_dir` | str \| List[str] | None | SSD 缓存目录路径,**必须设置**,如 `"/data/flexkv_ssd/"` | -| `ssd_cache_iouring_entries` | int | 0 | io_uring 队列深度,推荐设为 `512` 以提升并发 IO 性能,实测比不使用iouring提升较大,推荐使用512 | -| `ssd_cache_iouring_flags` | int | 1 | io_uring 标志位,推荐设置为 1。| +| `FLEXKV_MAX_FILE_SIZE_GB` | float | -1 | 单个 SSD 文件的最大大小,-1表示不限 | +| `FLEXKV_IORING_ENTRIES` | int | 512 | io_uring 队列深度,推荐设为 `512` 以提升并发 IO 性能 | +| `FLEXKV_IORING_FLAGS` | int | 0 | io_uring 标志位,默认为 0| -> 注:为了充分利用多块SSD的带宽上限,可以将多块SSD绑定至不同目录,并使用如 `"ssd cache dir": ["/data0/flexkv_ssd/", "/data1/flexkv_ssd/"]`方式初始化,SSD KVCache会均匀分布在所有SSD中,充分利用多个SSD带宽。 -> 注:`ssd_cache_iouring_entries`设置为0即不适用iouring,不推荐设置为0 --- -### 远程缓存配置(不启用时无需配置) +### 多节点TP -| 参数名 | 类型 | 默认值 | 说明 | +> 注:这些配置只能通过环境变量设置 + +| 环境变量 | 类型 | 默认值 | 说明 | +|--------|------|--------|------| +| `FLEXKV_MASTER_HOST` | str | "localhost" | 多节点TP的主节点IP | +| `FLEXKV_MASTER_PORTS` | str | "5556,5557,5558" | 多节点TP的主节点端口。使用三个端口,用逗号分隔 | + +--- + +### 日志配置 + +> 注:这些配置只能通过环境变量设置 + +| 环境变量 | 类型 | 默认值 | 说明 | |--------|------|--------|------| -| `remote_cache_size_mode` | str | "file_size" | 按文件大小或块数分配远程缓存空间 | -| `remote_file_size` | int \| None | None | 单个远程文件大小(字节) | -| `remote_file_num` | int \| None | None | 远程文件数量 | -| `remote_file_prefix` | str \| None | None | 远程文件名前缀 | -| `remote_cache_path` | str \| List[str] | None | 远程缓存路径(如 Redis URL、S3 路径等) | -| `remote_config_custom` | dict \| None | None | 自定义远程缓存配置(如超时、认证等) | +| `FLEXKV_LOGGING_PREFIX` | str | "FLEXKV" | 日志前缀 | 
+| `FLEXKV_LOG_LEVEL` | str | "INFO" | 日志输出等级,可选:"DEBUG" "INFO" "WARNING" "ERROR" "CRITICAL" "OFF" | +| `FLEXKV_NUM_LOG_INTERVAL_REQUESTS` | int | 200 | 日志输出间隔请求数 | --- -### 追踪与日志 +### 追踪和调试 -| 参数名 | 类型 | 默认值 | 说明 | +| 环境变量 | 类型 | 默认值 | 说明 | |--------|------|--------|------| -| `enable_trace` | bool | true | 是否启用性能追踪。生产环境建议关闭(`false`)以减少开销 | -| `trace_file_path` | str | "./flexkv_trace.log" | 追踪日志路径 | -| `trace_max_file_size_mb` | int | 100 | 单个追踪文件最大大小(MB) | -| `trace_max_files` | int | 5 | 最多保留的追踪文件数 | -| `trace_flush_interval_ms` | int | 1000 | 追踪日志刷新间隔(毫秒) | +| `FLEXKV_ENABLE_TRACE` | bool | 0 | 是否启用性能追踪。生产环境建议关闭(`0`)以减少开销 | +| `FLEXKV_TRACE_FILE_PATH` | str | "./flexkv_trace.log" | 追踪日志路径 | +| `FLEXKV_TRACE_MAX_FILE_SIZE_MB` | int | 100 | 单个追踪文件最大大小(MB) | +| `FLEXKV_TRACE_MAX_FILES` | int | 5 | 最多保留的追踪文件数 | +| `FLEXKV_TRACE_FLUSH_INTERVAL_MS` | int | 1000 | 追踪日志刷新间隔(毫秒) | + --- -### 缓存淘汰策略 +### 控制面优化 -| 参数名 | 类型 | 默认值 | 说明 | +| 环境变量 | 类型 | 默认值 | 说明 | |--------|------|--------|------| -| `evict_ratio` | float | 0.0 | cpu,ssd一次evict主动淘汰比例(0.0 = 只淘汰最小的必要的block数量,较多的淘汰次数会影响性能)。建议保持 `0.05`,即每一次淘汰5%的最久未使用的block | +| `FLEXKV_INDEX_ACCEL` | bool | 1 | 0-启用Python版本RadixTree实现,1-启用C++版本RadixTree实现 | +| `FLEXKV_EVICT_RATIO` | float | 0.05 | cpu,ssd一次evict主动淘汰比例(0.0 = 只淘汰最小的必要的block数)。建议保持 `0.05`,即每一次淘汰5%的最久未使用的block | diff --git a/docs/gds/README_en.md b/docs/gds/README_en.md index e6dfa9e732..5bc4155fe0 100644 --- a/docs/gds/README_en.md +++ b/docs/gds/README_en.md @@ -109,17 +109,13 @@ docker run -itd \ ### 2.2 Configure FlexKV to Use GDS `export FLEXKV_ENABLE_GDS=1` to compile -Configuration example after compilation `config.json`: +Configuration example after compilation `config.yaml`: -```json -{ - "cache_config": { - "enable_ssd": False, - "enable_gds": True, - "num_gds_blocks": 10000000, - "gds_cache_dir": ["./gdstest"] - }, -} +```yaml +cpu_cache_gb: 32 +ssd_cache_gb: 1024 +ssd_cache_dir: /data/flexkv_ssd/ +enable_gds: true ``` --- diff --git 
a/docs/gds/README_zh.md b/docs/gds/README_zh.md index 2bd7a45950..50e57aa9b3 100644 --- a/docs/gds/README_zh.md +++ b/docs/gds/README_zh.md @@ -109,17 +109,13 @@ docker run -itd \ `export FLEXKV_ENABLE_GDS=1` 进行编译 -编译后config例子 `config.json`: +编译后config例子 `config.yaml`: -```json -{ - "cache_config": { - "enable_ssd": False, - "enable_gds": True, - "num_gds_blocks": 10000000, - "gds_cache_dir": ["./gdstest"] - }, -} +```yaml +cpu_cache_gb: 32 +ssd_cache_gb: 1024 +ssd_cache_dir: /data/flexkv_ssd/ +enable_gds: true ``` --- diff --git a/docs/vllm_adapter/README_en.md b/docs/vllm_adapter/README_en.md index 972cade803..c9ca1b7697 100644 --- a/docs/vllm_adapter/README_en.md +++ b/docs/vllm_adapter/README_en.md @@ -15,9 +15,32 @@ This change involves significant API adjustments. Therefore, please note: ### Supported Versions - FlexKV >= `1.0.0` -- vLLM versions >= `0.8.5` can generally follow this version for adaptation +- vLLM versions >= `0.8.5` can generally follow the example code for adaptation -### Example +### Configuration + +#### Example 1: CPU Offloading Only +Use 32GB of CPU memory as secondary cache. +```bash +unset FLEXKV_CONFIG_PATH +export FLEXKV_CPU_CACHE_GB=32 +``` +#### Example 2: SSD Offloading +Use 32GB of CPU memory and 1TB of SSD storage as secondary and tertiary cache respectively. (Assume the machine has two SSDs mounted at /data0 and /data1 respectively.) +```bash +# generate config +cat <<EOF > ./flexkv_config.yml +cpu_cache_gb: 32 +ssd_cache_gb: 1024 +ssd_cache_dir: /data0/flexkv_ssd/;/data1/flexkv_ssd/ +enable_gds: false +EOF +export FLEXKV_CONFIG_PATH="./flexkv_config.yml" +``` + +> Note: The `flexkv_config.yml` configuration is provided as a simple example only. For full parameter options, please refer to [`docs/flexkv_config_reference/README_en.md`](../../docs/flexkv_config_reference/README_en.md) + +### Running We provide an adaptation example based on **vLLM 0.10.1.1**: 1. 
apply patch @@ -34,19 +57,6 @@ python examples/offline_inference/prefix_caching_flexkv.py 3. online serving ```bash -# generate config -cat <<EOF > ./flexkv_config.json -{ - "server_recv_port": "ipc:///tmp/flexkv_test", - "cache_config": { - "enable_cpu": true, - "num_cpu_blocks": 10240, - }, - "num_log_interval_requests": 200 -} -EOF -export FLEXKV_CONFIG_PATH="./flexkv_config.json" - VLLM_USE_V1=1 python -m vllm.entrypoints.cli.main serve Qwen3/Qwen3-32B \ --tensor-parallel-size 8 \ --trust-remote-code \ @@ -63,14 +73,30 @@ VLLM_USE_V1=1 python -m vllm.entrypoints.cli.main serve Qwen3/Qwen3-32B \ ``` -> Note: The `flexkv_config.json` configuration is provided as a simple example only. For full parameter options, please refer to [`docs/flexkv_config_reference/README_en.md`](../../docs/flexkv_config_reference/README_en.md) - ## Legacy Version (<= 0.1.0) – Not Recommended for Current Use ### Supported Versions - FlexKV <= `0.1.0` -### Example +### Configuration + +Legacy version configuration: +```bash +# generate config +cat <<EOF > ./flexkv_config.json +{ + "server_recv_port": "ipc:///tmp/flexkv_test", + "cache_config": { + "enable_cpu": true, + "num_cpu_blocks": 10240 + }, + "num_log_interval_requests": 200 +} +EOF +export FLEXKV_CONFIG_PATH="./flexkv_config.json" +``` + +### Running Apply the patch `examples/vllm_adaption_legacy/flexkv_vllm_0_8_4.patch` to vLLM 0.8.4, then start FlexKV, vLLM, and the benchmark script: ```bash diff --git a/docs/vllm_adapter/README_zh.md b/docs/vllm_adapter/README_zh.md index bb9b51c292..52e3199755 100644 --- a/docs/vllm_adapter/README_zh.md +++ b/docs/vllm_adapter/README_zh.md @@ -16,7 +16,30 @@ - FlexKV >= `1.0.0` - vLLM 原则上>= `0.8.5`版本均可参考示例代码进行修改 -### 示例 +### 配置 + +#### 示例一:仅启用CPU卸载 +使用32GB的CPU内存作为二级缓存。 +```bash +unset FLEXKV_CONFIG_PATH +export FLEXKV_CPU_CACHE_GB=32 +``` +#### 示例二:启用SSD卸载 +使用32GB的CPU内存和1T的SSD存储分别作为二级和三级缓存。(假设机器有两个SSD,并分别挂载在/data0和/data1两个路径上。) +```bash +# generate config +cat <<EOF > ./flexkv_config.yml +cpu_cache_gb: 32 
+ssd_cache_gb: 1024 +ssd_cache_dir: /data0/flexkv_ssd/;/data1/flexkv_ssd/ +enable_gds: false +EOF +export FLEXKV_CONFIG_PATH="./flexkv_config.yml" +``` + +> 注:`flexkv_config.yml`配置仅为简单示例,选项请参考[`docs/flexkv_config_reference/README_zh.md`](../../docs/flexkv_config_reference/README_zh.md) + +### 运行 我们提供了基于 **vLLM 0.10.1.1** 的适配示例: 1. apply patch @@ -33,19 +56,6 @@ python examples/offline_inference/prefix_caching_flexkv.py 3. online serving ```bash -# generate config -cat <<EOF > ./flexkv_config.json -{ - "server_recv_port": "ipc:///tmp/flexkv_test", - "cache_config": { - "enable_cpu": true, - "num_cpu_blocks": 10240, - }, - "num_log_interval_requests": 200 -} -EOF -export FLEXKV_CONFIG_PATH="./flexkv_config.json" - VLLM_USE_V1=1 python -m vllm.entrypoints.cli.main serve Qwen3/Qwen3-32B \ --tensor-parallel-size 8 \ --trust-remote-code \ @@ -62,14 +72,30 @@ VLLM_USE_V1=1 python -m vllm.entrypoints.cli.main serve Qwen3/Qwen3-32B \ ``` -> 注:`flexkv_config.json`配置仅为简单示例,选项请参考[`docs/flexkv_config_reference/README_zh.md`](../../docs/flexkv_config_reference/README_zh.md) - ## Legacy版本(<= 0.1.0),目前的版本尽量不要使用 ### 适用版本 - FlexKV <= `0.1.0` -### 示例 +### 配置 + +旧版本配置方式如下 +```bash +# generate config +cat <<EOF > ./flexkv_config.json +{ + "server_recv_port": "ipc:///tmp/flexkv_test", + "cache_config": { + "enable_cpu": true, + "num_cpu_blocks": 10240 + }, + "num_log_interval_requests": 200 +} +EOF +export FLEXKV_CONFIG_PATH="./flexkv_config.json" +``` + +### 运行 在 vLLM 0.8.4 版本中应用patch `examples/vllm_adaption_legacy/flexkv_vllm_0_8_4.patch`,分别启动 FlexKV、vLLM 和测试脚本: ```bash diff --git a/examples/.gitkeep b/examples/.gitkeep deleted file mode 100755 index e69de29bb2..0000000000 diff --git a/examples/run_server.py b/examples/run_server.py deleted file mode 100644 index 4016f446c4..0000000000 --- a/examples/run_server.py +++ /dev/null @@ -1,97 +0,0 @@ -import argparse - -from transformers import AutoConfig, PretrainedConfig - -from flexkv.common.config import CacheConfig, ModelConfig -from 
flexkv.common.debug import flexkv_logger -from flexkv.common.storage import KVCacheLayout, KVCacheLayoutType -from flexkv.server.server import KVServer - - - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser() - - # NAME - parser.add_argument("--enable-cpu", - action=argparse.BooleanOptionalAction, - default=True) - parser.add_argument("--enable-ssd", - action=argparse.BooleanOptionalAction, - default=False,) - parser.add_argument("--enable-remote", - action=argparse.BooleanOptionalAction, - default=False,) - parser.add_argument("--model-path", type=str, help="model path", default="") - parser.add_argument("--tp-size", type=int, default=1) - parser.add_argument("--dp-size", type=int, default=1) - parser.add_argument("--block-size", type=int, default=16) - parser.add_argument("--num-cpu-blocks", type=int, default=8192) - parser.add_argument("--num-ssd-blocks", type=int, default=8192) - parser.add_argument("--num-remote-blocks", type=int, default=8192) - parser.add_argument("--server-recv-port", type=str, default=None) - parser.add_argument("--remote-cache-size-mode", type=str, default="block_num") - parser.add_argument( - "--ssd-cache-dir", - type=str, - nargs='+', - default=[], - help="SSD cache file paths (multiple paths supported, separated by spaces)" - ) - parser.add_argument( - "--remote-cache-path", - type=str, - nargs='+', - default=[], - help="remote cache paths (multiple paths supported, separated by spaces)" - ) - - args = parser.parse_args() - return args - - -if __name__ == "__main__": - args = parse_args() - hf_config = AutoConfig.from_pretrained(args.model_path) - - num_layers=hf_config.num_hidden_layers - if hasattr(hf_config, 'num_key_value_heads'): - num_kv_heads=hf_config.num_key_value_heads - elif hasattr(hf_config, 'num_attention_heads'): - num_kv_heads=hf_config.num_attention_heads - else: - raise NotImplementedError - head_size=(hf_config.head_dim if hasattr(hf_config, 'head_dim') - else 
hf_config.hidden_size//hf_config.num_attention_heads) - use_mla=hf_config.architectures[0].startswith("Deepseek") - - # TODO: different model config may have different attribute name - model_config = ModelConfig( - num_layers=num_layers, - num_kv_heads=num_kv_heads, - head_size=head_size, - use_mla=use_mla, - tp_size=args.tp_size, - dp_size=args.dp_size, - dtype=hf_config.torch_dtype - ) - - cache_config = CacheConfig( - enable_cpu=args.enable_cpu, - enable_ssd=args.enable_ssd, - enable_remote=args.enable_remote, - enable_gds=False, - enable_trace=False, - ssd_cache_iouring_entries=512, - tokens_per_block=args.block_size, - num_cpu_blocks=args.num_cpu_blocks, - num_ssd_blocks=args.num_ssd_blocks, - num_remote_blocks=args.num_remote_blocks, - ssd_cache_dir=args.ssd_cache_dir, - remote_cache_size_mode=args.remote_cache_size_mode, - remote_cache_path=args.remote_cache_path, - ) - - kvserver = KVServer(model_config, cache_config, args.server_recv_port) - kvserver.run() diff --git a/examples/scheduler_server_example.py b/examples/scheduler_server_example.py deleted file mode 100644 index 9dfea43355..0000000000 --- a/examples/scheduler_server_example.py +++ /dev/null @@ -1,246 +0,0 @@ -#!/usr/bin/env python3 -""" -SchedulerServer Usage Example - -Demonstrates how to use the new SchedulerServer to replace the original KVServer + KVDPClient mode -""" - -import torch -import time -from multiprocessing import Process -from flexkv.common.config import ModelConfig, CacheConfig -from flexkv.common.storage import KVCacheLayout, KVCacheLayoutType -from flexkv.server.scheduler_server import SchedulerServer - - -def run_tp_client_process(dp_client_id, tp_rank, device_id, server_recv_port, model_config, gpu_kv_layout): - """Run TP client process""" - from flexkv.server.client import KVTPClient - - print(f"Starting TP client: dp_client_id={dp_client_id}, tp_rank={tp_rank}, device_id={device_id}") - - try: - # Set CUDA device for this process - if torch.cuda.is_available(): - 
torch.cuda.set_device(device_id) - # Initialize CUDA context - torch.cuda.init() - # Clear cache - torch.cuda.empty_cache() - - tp_client = KVTPClient(server_recv_port, dp_client_id, device_id) - - # Create GPU blocks for this TP client - gpu_blocks = [] - for layer_id in range(model_config.num_layers): - kv_dim = 2 if not model_config.use_mla else 1 - kv_tensor = torch.zeros( - size=(kv_dim, gpu_kv_layout.num_block, gpu_kv_layout.tokens_per_block, - model_config.num_kv_heads // model_config.tp_size, - model_config.head_size), - dtype=model_config.dtype, - device=f"cuda:{device_id}" - ) - gpu_blocks.append(kv_tensor) - - print(f"TP client {tp_rank} registering to server...") - # Register to server - tp_client.register_to_server(gpu_blocks, gpu_kv_layout) - print(f"TP client {tp_rank} registered to server") - - # Keep TP client running - while True: - time.sleep(1) - except Exception as e: - print(f"TP client {tp_rank} error: {e}") - import traceback - traceback.print_exc() - raise - - -def main(): - # Configuration parameters - num_layers = 32 - num_kv_heads = 8 - head_size = 128 - num_cpu_blocks = 300 - num_gpu_blocks = 30 - tp_size = 1 - tokens_per_block = 4 - - # Create model and cache configuration - model_config = ModelConfig( - num_layers=num_layers, - num_kv_heads=num_kv_heads, - head_size=head_size, - use_mla=False, - tp_size=tp_size, - dtype=torch.float16 - ) - - cache_config = CacheConfig( - enable_cpu=True, - enable_ssd=False, - enable_remote=False, - enable_gds=False, - tokens_per_block=tokens_per_block, - num_cpu_blocks=num_cpu_blocks, - ) - - # Create GPU KV layout - gpu_kv_layout = KVCacheLayout( - type=KVCacheLayoutType.LAYERWISE, - num_layer=num_layers, - num_block=num_gpu_blocks, - tokens_per_block=tokens_per_block, - num_head=num_kv_heads // tp_size, - head_size=head_size, - is_mla=False - ) - - # Create SchedulerServer (integrates server and dpclient functionality) - scheduler_server = SchedulerServer( - model_config=model_config, - 
cache_config=cache_config, - server_recv_port="ipc:///tmp/scheduler_server_example" # TPClient connects to this port - ) - - # Start background server thread to handle TPClient registration - scheduler_server.start_server_thread() - - print("SchedulerServer started!") - print(f"TPClient can connect to: {scheduler_server.get_server_port()}") - print("Starting TP client processes...") - - # Start TP client processes - tp_client_processes = [] - for tp_rank in range(tp_size): - device_id = tp_rank # Use TP rank as device ID - # Check available GPUs - available_gpus = torch.cuda.device_count() - if device_id >= available_gpus: - device_id = device_id % available_gpus - print(f"Warning: Using GPU {device_id} for TP rank {tp_rank} (not enough GPUs)") - tp_client_process = Process( - target=run_tp_client_process, - args=(0, tp_rank, device_id, scheduler_server.get_server_port(), model_config, gpu_kv_layout), - daemon=True - ) - tp_client_process.start() - tp_client_processes.append(tp_client_process) - print(f"Started TP client process for rank {tp_rank} on device {device_id}") - - print("Waiting for all TP clients to register...") - - time.sleep(5) - - # Now we can directly use scheduler_server without network communication - # Example: Create some test data (following benchmark_kvmanager.py pattern) - batch_size = 4 - seq_len = 128 - print("\n=== Generating test data ===") - # Generate separate sequences for each request (correct approach) - batch_token_ids = [] - batch_slot_mappings = [] - batch_token_masks = [] - for i in range(batch_size): - # Each sequence is independent (seq_len,) shape - token_ids = torch.randint(0, 1000, (seq_len,)) - slot_mapping = torch.arange(i * seq_len, (i + 1) * seq_len) - token_mask = torch.ones(seq_len, dtype=torch.bool) - - batch_token_ids.append(token_ids) - batch_slot_mappings.append(slot_mapping) - batch_token_masks.append(token_mask) - - print(f"Generated {batch_size} sequences, each with {seq_len} tokens") - - print("\n=== Executing 
PUT Operations ===") - # PUT operations - each sequence processed separately - start_time = time.time() - put_task_ids = [] - for i in range(batch_size): - task_id = scheduler_server.put_async( - token_ids=batch_token_ids[i], - slot_mapping=batch_slot_mappings[i], - token_mask=batch_token_masks[i] - ) - if task_id: - put_task_ids.append(task_id) - print(f"PUT task {task_id} created for sequence {i}") - put_time = (time.time() - start_time) * 1000 - print(f"Created {len(put_task_ids)} PUT tasks, time: {put_time:.2f}ms") - time.sleep(2) - print("\n=== Executing GET Operations ===") - # GET operations - each sequence processed separately - start_time = time.time() - get_task_ids = [] - for i in range(batch_size): - task_id = scheduler_server.get_async( - token_ids=batch_token_ids[i], - slot_mapping=batch_slot_mappings[i], - token_mask=batch_token_masks[i] - ) - if task_id: - get_task_ids.append(task_id) - print(f"GET task {task_id} created for sequence {i}") - - get_time = (time.time() - start_time) * 1000 - print(f"Created {len(get_task_ids)} GET tasks, time: {get_time:.2f}ms") - - print("\n=== Waiting for All Tasks to Complete ===") - # Wait for all tasks to complete - can wait for multiple tasks at once - all_task_ids = put_task_ids + get_task_ids - if all_task_ids: - start_time = time.time() - masks = scheduler_server.wait(all_task_ids) - wait_time = (time.time() - start_time) * 1000 - print(f"All {len(all_task_ids)} tasks completed, time: {wait_time:.2f}ms") - # Analyze results - if masks: - total_tokens = 0 - for task_id, mask in masks.items(): - if mask is not None: - tokens = mask.sum().item() if hasattr(mask, 'sum') else len(mask) - total_tokens += tokens - print(f"Task {task_id}: {tokens} tokens processed") - print("\n=== Trying Non-blocking Wait ===") - # Create a few more tasks and try non-blocking wait - extra_task_ids = [] - for i in range(2): - task_id = scheduler_server.put_async( - token_ids=batch_token_ids[i][:5], # Use first 5 tokens - 
slot_mapping=batch_slot_mappings[i][:5], - token_mask=batch_token_masks[i][:5] - ) - if task_id: - extra_task_ids.append(task_id) - if extra_task_ids: - # Immediately try to wait (might not be completed yet) - masks = scheduler_server.try_wait(extra_task_ids) - if masks: - print(f"Tasks {extra_task_ids} completed immediately") - else: - print(f"Tasks {extra_task_ids} not ready yet, will wait...") - masks = scheduler_server.wait(extra_task_ids) - print(f"Tasks {extra_task_ids} completed after wait") - - print("\n✅ All operations completed successfully!") - - - # Clean up resources - print("\n=== Shutting down SchedulerServer ===") - scheduler_server.shutdown() - print("SchedulerServer has been shut down") - # Terminate TP client processes - print("Terminating TP client processes...") - for i, process in enumerate(tp_client_processes): - process.terminate() - process.join(timeout=2) - if process.is_alive(): - process.kill() - print(f"TP client process {i} terminated") - - -if __name__ == "__main__": - main() diff --git a/examples/vllm_adaption_legacy/flexkv_vllm_0_10_0.patch b/examples/vllm_adaption_legacy/flexkv_vllm_0_10_0.patch index f6349a0ac7..53e1b2e17b 100644 --- a/examples/vllm_adaption_legacy/flexkv_vllm_0_10_0.patch +++ b/examples/vllm_adaption_legacy/flexkv_vllm_0_10_0.patch @@ -7,11 +7,11 @@ index c7229dbb8..d2325fd3a 100644 from dataclasses import dataclass, field from typing import Optional, Union +import asyncio - + import aiohttp import huggingface_hub.constants @@ -23,10 +24,10 @@ AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) - + @dataclass class RequestFuncInput: - prompt: str @@ -27,7 +27,7 @@ index c7229dbb8..d2325fd3a 100644 @@ -555,6 +556,107 @@ async def async_request_openai_audio( pbar.update(1) return output - + +async def async_request_openai_chat_completions_multiturns( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, @@ -66,7 +66,7 @@ index c7229dbb8..d2325fd3a 100644 + for turn_id, prompt in 
enumerate(request_func_input.prompt): + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len[turn_id] -+ ++ + payload["messages"].append({"role": "user", "content": prompt}) + payload["max_tokens"] = request_func_input.output_len[turn_id] + @@ -121,15 +121,15 @@ index c7229dbb8..d2325fd3a 100644 + output.error = "".join(traceback.format_exception(*exc_info)) + break + payload["messages"].append({"role": "assistant", "content": generated_text}) -+ ++ + output_list.append(output) + if turn_id != len(request_func_input.prompt) - 1: + await asyncio.sleep(turn_interval_time) -+ ++ + if pbar: + pbar.update(1) + return output_list - + def get_model(pretrained_model_name_or_path: str) -> str: if os.getenv("VLLM_USE_MODELSCOPE", "False").lower() == "true": @@ -619,6 +721,7 @@ ASYNC_REQUEST_FUNCS = { @@ -138,7 +138,7 @@ index c7229dbb8..d2325fd3a 100644 "llama.cpp": async_request_openai_completions, + "openai-chat-multiturns": async_request_openai_chat_completions_multiturns, } - + OPENAI_COMPATIBLE_BACKENDS = [ diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index 1ad6cef7a..9178528d0 100644 @@ -147,7 +147,7 @@ index 1ad6cef7a..9178528d0 100644 @@ -49,9 +49,9 @@ class SampleRequest: Represents a single inference request for benchmarking. 
""" - + - prompt: Union[str, Any] - prompt_len: int - expected_output_len: int @@ -156,17 +156,17 @@ index 1ad6cef7a..9178528d0 100644 + expected_output_len: Union[int, list[int]] multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None lora_request: Optional[LoRARequest] = None - + @@ -617,6 +617,108 @@ class SonnetDataset(BenchmarkDataset): ) return samples - -+ + ++ +# ----------------------------------------------------------------------------- +# ShareGPT Multiturn Dataset Implementation +# ----------------------------------------------------------------------------- -+ -+ ++ ++ +class ShareGPTMultiTurnsDataset(BenchmarkDataset): + def __init__(self, min_num_turns: int = 2, **kwargs) -> None: + super().__init__(**kwargs) @@ -191,7 +191,7 @@ index 1ad6cef7a..9178528d0 100644 + self.data = new_data + random.seed(self.random_seed) + random.shuffle(self.data) -+ ++ + def sample( + self, + tokenizer: PreTrainedTokenizerBase, @@ -205,7 +205,7 @@ index 1ad6cef7a..9178528d0 100644 + for entry in self.data: + if len(samples) >= num_requests: + break -+ ++ + prompt_list = [d["value"] for d in entry["conversations"][::2]] + completion_list = [d["value"] for d in entry["conversations"][1::2]] + # prompt, completion = ( @@ -215,8 +215,8 @@ index 1ad6cef7a..9178528d0 100644 + + lora_request, tokenizer = self.get_random_lora_request( + tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path) -+ -+ ++ ++ + prompt_ids_list = [] + completion_ids_list = [] + prompt_len_list = [] @@ -247,12 +247,12 @@ index 1ad6cef7a..9178528d0 100644 + new_output_len_list.append(new_output_len) + history_len += prompt_len + history_len += new_output_len -+ ++ + if turn_id <= 0: + continue -+ ++ + prompt_list = prompt_list[:turn_id+1] -+ ++ + samples.append( + SampleRequest( + prompt=prompt_list, @@ -262,8 +262,8 @@ index 1ad6cef7a..9178528d0 100644 + )) + self.maybe_oversample_requests(samples, num_requests) + return samples -+ - ++ + # 
----------------------------------------------------------------------------- # BurstGPT Dataset Implementation diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py @@ -289,7 +289,7 @@ index c597fb106..74e157927 100644 latency_minus_ttft = outputs[i].latency - outputs[i].ttft @@ -278,6 +279,9 @@ async def benchmark( ) - + test_output = await request_func(request_func_input=test_input) + if backend == "openai-chat-multiturns": + print("test_output ", test_output) @@ -303,7 +303,7 @@ index c597fb106..74e157927 100644 outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) + if backend == "openai-chat-multiturns": + outputs = [o for sub_o in outputs for o in sub_o] - + if profile: print("Stopping profiler...") @@ -748,6 +754,15 @@ def main(args: argparse.Namespace): @@ -540,7 +540,7 @@ index 000000000..6ff17dfca + +def main(): + # Create an LLM without prefix caching as a baseline. -+ regular_llm = LLM(model="facebook/opt-125m", ++ regular_llm = LLM(model="facebook/opt-125m", + enable_prefix_caching=False, + gpu_memory_utilization=0.4) + @@ -662,11 +662,11 @@ index 000000000..478683fa9 + dtype=flexkv_config.dtype, + tp_size=flexkv_config.tp_size, + ) -+ ++ + logger.info(f"start init FlexKVDPClient to {self.server_recv_port}") + self.dp_client = KVDPClient(self.server_recv_port, self.model_config) + logger.info(f"finish init FlexKVDPClient") -+ ++ + def put_async( + self, + token_ids: torch.Tensor, @@ -675,7 +675,7 @@ index 000000000..478683fa9 + ) -> int: + " return task_id " + return self.dp_client.put_async(token_ids, slot_mapping, token_mask) -+ ++ + def get_async( + self, + token_ids: torch.Tensor, @@ -684,15 +684,15 @@ index 000000000..478683fa9 + ) -> int: + " return task_id " + return self.dp_client.get_async(token_ids, slot_mapping, token_mask) -+ ++ + def wait( -+ self, ++ self, + wait_task_ids: list[int], + ) -> dict[int, torch.Tensor]: + return self.dp_client.wait(wait_task_ids) -+ ++ + def try_wait( -+ self, ++ self, + 
wait_task_ids: list[int], + ) -> dict[int, Optional[torch.Tensor]]: + # print("--------------------------------") @@ -706,8 +706,8 @@ index 000000000..478683fa9 + import traceback + traceback.print_exc() + return {} -+ -+ ++ ++ +class FlexKVTPClient: + def __init__( + self, @@ -722,7 +722,7 @@ index 000000000..478683fa9 + self.tp_client = KVTPClient(flexkv_config.server_recv_port, dp_client_id, device_id, tp_rank) + logger.info(f"finish init FlexKVTPClient") + gpu_layout = KVCacheLayout( -+ type=KVCacheLayoutType.LAYERWISE, ++ type=KVCacheLayoutType.LAYERFIRST, + num_layer=flexkv_config.num_layers, + num_block=flexkv_config.num_blocks, + tokens_per_block=flexkv_config.block_size, @@ -763,17 +763,17 @@ index 000000000..f2724e712 + dtype: torch.dtype = None + use_mla: bool = False + tp_size: int = 1 -+ ++ + @classmethod + def from_env(cls) -> 'FlexKVConfig': + enable_flexkv = (os.getenv('ENABLE_FLEXKV', "false").lower() == "true") + server_recv_port = os.getenv('FLEXKV_SERVER_RECV_PORT', "") -+ ++ + return cls(enable_flexkv=enable_flexkv, + server_recv_port=server_recv_port) -+ ++ + def post_init( -+ self, ++ self, + kv_cache_config: KVCacheConfig, + tp_size: int + ): @@ -794,12 +794,12 @@ index 69aaf4390..fe426f420 100644 @@ -21,7 +21,7 @@ VLLM_LOGGING_CONFIG_PATH = envs.VLLM_LOGGING_CONFIG_PATH VLLM_LOGGING_LEVEL = envs.VLLM_LOGGING_LEVEL VLLM_LOGGING_PREFIX = envs.VLLM_LOGGING_PREFIX - + -_FORMAT = (f"{VLLM_LOGGING_PREFIX}%(levelname)s %(asctime)s " +_FORMAT = (f"{VLLM_LOGGING_PREFIX}%(levelname)s %(asctime)s.%(msecs)03d " "[%(filename)s:%(lineno)d] %(message)s") _DATE_FORMAT = "%m-%d %H:%M:%S" - + diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 5b0218640..aa590eb6f 100644 --- a/vllm/v1/core/kv_cache_utils.py @@ -812,12 +812,12 @@ index 5b0218640..aa590eb6f 100644 # A deque of (requests, queries, hits) for the most recent requests. 
- self.query_queue: deque[tuple[int, int, int]] = deque() + self.query_queue: deque[tuple[int, int, int, int]] = deque() - + def observe(self, stats: PrefixCacheStats): """Observe the prefix caching for a set of requests. @@ -108,14 +109,15 @@ class PrefixCachingMetrics: self.reset() - + # Update the metrics. - self.query_queue.append((stats.requests, stats.queries, stats.hits)) + self.query_queue.append((stats.requests, stats.queries, stats.hits, stats.flexkv_hits)) @@ -825,7 +825,7 @@ index 5b0218640..aa590eb6f 100644 self.aggregated_query_total += stats.queries self.aggregated_query_hit += stats.hits + self.aggregated_query_flexkv_hit += stats.flexkv_hits - + # Remove the oldest stats if the number of requests exceeds. if self.aggregated_requests > self.max_recent_requests: - old_requests, old_queries, old_hits = self.query_queue.popleft() @@ -839,28 +839,28 @@ index 5b0218640..aa590eb6f 100644 self.aggregated_query_hit = 0 + self.aggregated_query_flexkv_hit = 0 self.query_queue.clear() - + @property @@ -133,6 +136,13 @@ class PrefixCachingMetrics: if self.aggregated_query_total == 0: return 0.0 return self.aggregated_query_hit / self.aggregated_query_total -+ ++ + @property + def flexkv_hit_rate(self) -> float: + """Calculate the hit rate for the past N requests.""" + if self.aggregated_query_total == 0: + return 0.0 + return self.aggregated_query_flexkv_hit / self.aggregated_query_total - - + + @dataclass diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 446f98034..b465c4cf1 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -5,6 +5,7 @@ from __future__ import annotations - + import itertools import time +import torch @@ -874,13 +874,13 @@ index 446f98034..b465c4cf1 100644 +# flexkv +from vllm.utils import cdiv +from vllm.distributed.flexkv_extension.config import FlexKVConfig - + logger = init_logger(__name__) - + @@ -162,6 +166,23 @@ class Scheduler(SchedulerInterface): ) self.use_pp = 
self.parallel_config.pipeline_parallel_size > 1 - + + # flexkv + self.enable_flexkv = False + self.flexkv_client = None @@ -904,14 +904,14 @@ index 446f98034..b465c4cf1 100644 @@ -174,6 +195,13 @@ class Scheduler(SchedulerInterface): # chunked prefills, prefix caching, speculative decoding, # and the "jump decoding" optimization in the future. - + + # flexkv + if self.enable_flexkv: + # aviod busy loop + if self.get_num_unfinished_requests() == 0: + time.sleep(0.01) + self.check_offload_kv_tasks() -+ ++ scheduled_new_reqs: list[Request] = [] scheduled_resumed_reqs: list[Request] = [] scheduled_running_reqs: list[Request] = [] @@ -919,7 +919,7 @@ index 446f98034..b465c4cf1 100644 if new_blocks is None: # The request cannot be scheduled. break -+ ++ + if self.enable_flexkv and num_new_tokens > self.block_size and request.status == RequestStatus.WAITING: + # don't match the last block + num_new_blocks_to_get = cdiv(num_new_tokens, self.block_size)-1 @@ -940,7 +940,7 @@ index 446f98034..b465c4cf1 100644 + self.flexkv_timer[request.request_id] = {} + self.flexkv_timer[request.request_id]['get_async_start'] = t_async_get_start + self.flexkv_timer[request.request_id]['get_async_return'] = t_async_get_return - + # KVTransfer: the connector uses this info to determine # if a load is needed. 
Note that @@ -505,6 +554,31 @@ class Scheduler(SchedulerInterface): @@ -948,7 +948,7 @@ index 446f98034..b465c4cf1 100644 self.encoder_cache_manager.allocate(request, i) encoder_budget = new_encoder_budget + # batch wait -+ ++ + # batch wait + if self.enable_flexkv: + if len(self.load_kv_tasks) != 0: @@ -967,18 +967,18 @@ index 446f98034..b465c4cf1 100644 + f"[FlexKV] req: {request.request_id}, task: {task_id}, " + f"get {match_length} tokens cost {(t_async_get_end-t_get_async_start)*1000:.2f} ms, " + f"get_async() api cost {(t_get_async_return-t_get_async_start)*1000:.2f} ms") -+ ++ + token_budget += match_length + num_scheduled_tokens[request.request_id] -= match_length + request.num_computed_tokens += match_length + self.kv_cache_manager.prefix_cache_stats.flexkv_hits += (match_length//self.block_size) - + # Put back any skipped requests at the head of the waiting queue if skipped_waiting_requests: @@ -1016,11 +1090,49 @@ class Scheduler(SchedulerInterface): if self.finished_req_ids_dict is not None: self.finished_req_ids_dict[request.client_index].add(request_id) - + - if not delay_free_blocks: - self._free_blocks(request) + # flexkv: offload BEFORE freeing blocks to preserve req_to_blocks info @@ -990,9 +990,9 @@ index 446f98034..b465c4cf1 100644 + # else: + # self._free_block(request) + - + return kv_xfer_params - + + def _free_block(self, request: Request) -> None: + self.kv_cache_manager.free(request) + self.kv_cache_manager.free_block_hashes(request) @@ -1004,20 +1004,20 @@ index 446f98034..b465c4cf1 100644 + req_blocks = self.kv_cache_manager.coordinator.single_type_managers[0].req_to_blocks.get(request.request_id, []) + req_token_ids = torch.tensor(request.all_token_ids[:-1]) + req_block_ids = torch.tensor([block.block_id for block in req_blocks]) -+ ++ + # Debug information for empty req_blocks + # if len(req_blocks) == 0: + # print(f"WARNING: Empty req_blocks for request {request.request_id}") + # print(f" request.all_token_ids length: 
{len(request.all_token_ids)}") + # print(f" req_token_ids length: {len(req_token_ids)}") + # print(f" req_to_blocks keys: {list(self.kv_cache_manager.coordinator.single_type_managers[0].req_to_blocks.keys())}") -+ ++ + slot_mapping = req_block_ids.repeat_interleave(self.block_size)[:len(req_token_ids)] * self.block_size -+ ++ + # Additional debug info + # print(f"FlexKV _offload_kv: req_id={request.request_id}, " + # f"blocks={len(req_blocks)}, tokens={len(req_token_ids)}, slots={len(slot_mapping)}") -+ ++ + self.flexkv_timer[request.request_id] = {} + self.flexkv_timer[request.request_id]["put_async_start"] = time.monotonic() + task_id = self.flexkv_client.put_async(token_ids=req_token_ids, slot_mapping=slot_mapping) @@ -1032,7 +1032,7 @@ index 446f98034..b465c4cf1 100644 num_accepted_tokens=num_accepted_tokens) return spec_decoding_stats - -+ ++ + def check_offload_kv_tasks(self): + if len(self.offload_kv_tasks) == 0: + return @@ -1051,8 +1051,8 @@ index 446f98034..b465c4cf1 100644 + f"[FlexKV] req: {request.request_id}, task: {task_id}, " + f"put {sum(task_result).item()} tokens cost {(t_async_put_end-t_put_async_start)*1000:.2f} ms, " + f"put_async() api cost {(t_put_async_return-t_put_async_start)*1000:.2f} ms") -+ self._free_block(request) -+ ++ self._free_block(request) ++ def shutdown(self) -> None: if self.kv_event_publisher: self.kv_event_publisher.shutdown() @@ -1063,16 +1063,16 @@ index 7779b559c..2d17908ea 100644 @@ -46,6 +46,8 @@ from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder from vllm.v1.structured_output import StructuredOutputManager from vllm.version import __version__ as VLLM_VERSION - + +from vllm.distributed.flexkv_extension.config import FlexKVConfig + logger = init_logger(__name__) - + POLLING_TIMEOUT_S = 2.5 @@ -118,6 +120,8 @@ class EngineCore: log_stats=self.log_stats, ) - + + self.init_flexkv(vllm_config, kv_cache_config) + # Setup MM Input Mapper. 
@@ -1081,11 +1081,11 @@ index 7779b559c..2d17908ea 100644 @@ -194,6 +198,23 @@ class EngineCore: "warmup model) took %.2f seconds"), elapsed) return num_gpu_blocks, num_cpu_blocks, scheduler_kv_cache_config - + + + def init_flexkv( -+ self, -+ taco_llm_config: VllmConfig, ++ self, ++ taco_llm_config: VllmConfig, + kv_cache_config: KVCacheConfig + ): + self.scheduler: V1Scheduler @@ -1098,7 +1098,7 @@ index 7779b559c..2d17908ea 100644 + ) + dp_client_id = self.scheduler.init_flexkv(flexkv_config) + self.model_executor.init_flexkv(flexkv_config, dp_client_id) -+ ++ def add_request(self, request: EngineCoreRequest): """Add request to the scheduler.""" if pooling_params := request.pooling_params: @@ -1113,16 +1113,16 @@ index 50b9634a4..3d7bdd4c8 100644 - +from vllm.distributed.flexkv_extension.config import FlexKVConfig FailureCallback = Callable[[], None] - - + + @@ -88,6 +88,10 @@ class Executor(ExecutorBase): args=(scheduler_output, )) return output[0] - + + def init_flexkv(self, flexkv_config: FlexKVConfig, dp_client_id: int): + self.collective_rpc("init_flexkv", + args=(flexkv_config, dp_client_id, )) -+ ++ @property def max_concurrent_batches(self) -> int: return 1 @@ -1147,7 +1147,7 @@ index 7f2556bab..e7fb79486 100644 + self.prefix_caching_metrics.flexkv_hit_rate * 100, ) self.spec_decoding_logging.log(log_fn=log_fn) - + diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 1eb10ccb6..1073aa571 100644 --- a/vllm/v1/metrics/stats.py @@ -1159,7 +1159,7 @@ index 1eb10ccb6..1073aa571 100644 - + # flexkv + flexkv_hits: int = 0 - + @dataclass class SchedulerStats: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py @@ -1190,20 +1190,20 @@ index 522946351..31a3bed13 100644 @@ -33,6 +33,10 @@ from vllm.v1.utils import report_usage_stats from vllm.v1.worker.gpu_model_runner import GPUModelRunner from vllm.v1.worker.worker_base import WorkerBase - + +# flexkv +from vllm.distributed.flexkv_extension.config import 
FlexKVConfig +from vllm.distributed.flexkv_extension.client import FlexKVTPClient + logger = init_logger(__name__) - + if TYPE_CHECKING: @@ -556,6 +560,23 @@ class Worker(WorkerBase): max_size=max_size, ) - + + def init_flexkv( -+ self, ++ self, + flexkv_config: FlexKVConfig, + dp_client_id: int, + ) -> None: diff --git a/examples/vllm_adaption_legacy/flexkv_vllm_0_8_4.patch b/examples/vllm_adaption_legacy/flexkv_vllm_0_8_4.patch index 5310d53c6a..6d9f0a9fc4 100644 --- a/examples/vllm_adaption_legacy/flexkv_vllm_0_8_4.patch +++ b/examples/vllm_adaption_legacy/flexkv_vllm_0_8_4.patch @@ -7,11 +7,11 @@ index 287d500a8..7e87f0446 100644 from dataclasses import dataclass, field from typing import Optional, Union +import asyncio - + import aiohttp import huggingface_hub.constants @@ -22,10 +23,10 @@ AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) - + @dataclass class RequestFuncInput: - prompt: str @@ -26,8 +26,8 @@ index 287d500a8..7e87f0446 100644 logprobs: Optional[int] = None @@ -436,6 +437,109 @@ async def async_request_openai_chat_completions( return output - - + + +async def async_request_openai_chat_completions_multiturns( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, @@ -66,7 +66,7 @@ index 287d500a8..7e87f0446 100644 + for turn_id, prompt in enumerate(request_func_input.prompt): + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len[turn_id] -+ ++ + payload["messages"].append({"role": "user", "content": prompt}) + payload["max_tokens"] = request_func_input.output_len[turn_id] + @@ -121,11 +121,11 @@ index 287d500a8..7e87f0446 100644 + output.error = "".join(traceback.format_exception(*exc_info)) + break + payload["messages"].append({"role": "assistant", "content": generated_text}) -+ ++ + output_list.append(output) + if turn_id != len(request_func_input.prompt) - 1: + await asyncio.sleep(turn_interval_time) -+ ++ + if pbar: + pbar.update(1) + return output_list @@ -140,7 +140,7 @@ index 
287d500a8..7e87f0446 100644 "sglang": async_request_openai_completions, + "openai-chat-multiturns": async_request_openai_chat_completions_multiturns, } - + OPENAI_COMPATIBLE_BACKENDS = [ diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index 63f174275..561a40421 100644 @@ -149,7 +149,7 @@ index 63f174275..561a40421 100644 @@ -50,9 +50,9 @@ class SampleRequest: Represents a single inference request for benchmarking. """ - + - prompt: Union[str, Any] - prompt_len: int - expected_output_len: int @@ -158,17 +158,17 @@ index 63f174275..561a40421 100644 + expected_output_len: Union[int, list[int]] multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None lora_request: Optional[LoRARequest] = None - + @@ -507,6 +507,108 @@ class SonnetDataset(BenchmarkDataset): )) return samples - -+ + ++ +# ----------------------------------------------------------------------------- +# ShareGPT Multiturn Dataset Implementation +# ----------------------------------------------------------------------------- -+ -+ ++ ++ +class ShareGPTMultiTurnsDataset(BenchmarkDataset): + def __init__(self, min_num_turns: int = 2, **kwargs) -> None: + super().__init__(**kwargs) @@ -193,7 +193,7 @@ index 63f174275..561a40421 100644 + self.data = new_data + random.seed(self.random_seed) + random.shuffle(self.data) -+ ++ + def sample( + self, + tokenizer: PreTrainedTokenizerBase, @@ -207,7 +207,7 @@ index 63f174275..561a40421 100644 + for entry in self.data: + if len(samples) >= num_requests: + break -+ ++ + prompt_list = [d["value"] for d in entry["conversations"][::2]] + completion_list = [d["value"] for d in entry["conversations"][1::2]] + # prompt, completion = ( @@ -217,8 +217,8 @@ index 63f174275..561a40421 100644 + + lora_request, tokenizer = self.get_random_lora_request( + tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path) -+ -+ ++ ++ + prompt_ids_list = [] + completion_ids_list = [] + prompt_len_list = [] @@ -249,12 +249,12 @@ index 
63f174275..561a40421 100644 + new_output_len_list.append(new_output_len) + history_len += prompt_len + history_len += new_output_len -+ ++ + if turn_id <= 0: + continue -+ ++ + prompt_list = prompt_list[:turn_id+1] -+ ++ + samples.append( + SampleRequest( + prompt=prompt_list, @@ -264,8 +264,8 @@ index 63f174275..561a40421 100644 + )) + self.maybe_oversample_requests(samples, num_requests) + return samples -+ - ++ + # ----------------------------------------------------------------------------- # BurstGPT Dataset Implementation diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py @@ -279,7 +279,7 @@ index b5bd840d8..7e670eb64 100644 - VisionArenaDataset) + VisionArenaDataset, ShareGPTMultiTurnsDataset) from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json - + MILLISECONDS_TO_SECONDS_CONVERSION = 1000 @@ -166,7 +166,7 @@ def calculate_metrics( tokenizer(outputs[i].generated_text, @@ -292,7 +292,7 @@ index b5bd840d8..7e670eb64 100644 latency_minus_ttft = outputs[i].latency - outputs[i].ttft @@ -293,6 +293,8 @@ async def benchmark( ) - + test_output = await request_func(request_func_input=test_input) + if backend == "openai-chat-multiturns": + test_output = test_output[-1] @@ -305,7 +305,7 @@ index b5bd840d8..7e670eb64 100644 outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) + if backend == "openai-chat-multiturns": + outputs = [o for sub_o in outputs for o in sub_o] - + if profile: print("Stopping profiler...") @@ -636,6 +640,15 @@ def main(args: argparse.Namespace): @@ -478,7 +478,7 @@ index 000000000..6ff17dfca + +def main(): + # Create an LLM without prefix caching as a baseline. 
-+ regular_llm = LLM(model="facebook/opt-125m", ++ regular_llm = LLM(model="facebook/opt-125m", + enable_prefix_caching=False, + gpu_memory_utilization=0.4) + @@ -600,11 +600,11 @@ index 000000000..11b7ff1c3 + dtype=flexkv_config.dtype, + tp_size=flexkv_config.tp_size, + ) -+ ++ + logger.info(f"start init FlexKVDPClient to {self.server_recv_port}") + self.dp_client = KVDPClient(self.server_recv_port, self.model_config) + logger.info(f"finish init FlexKVDPClient") -+ ++ + def put_async( + self, + token_ids: torch.Tensor, @@ -613,7 +613,7 @@ index 000000000..11b7ff1c3 + ) -> int: + " return task_id " + return self.dp_client.put_async(token_ids, slot_mapping, token_mask) -+ ++ + def get_async( + self, + token_ids: torch.Tensor, @@ -622,20 +622,20 @@ index 000000000..11b7ff1c3 + ) -> int: + " return task_id " + return self.dp_client.get_async(token_ids, slot_mapping, token_mask) -+ ++ + def wait( -+ self, ++ self, + wait_task_ids: list[int], + ) -> dict[int, torch.Tensor]: + return self.dp_client.wait(wait_task_ids) -+ ++ + def try_wait( -+ self, ++ self, + wait_task_ids: list[int], + ) -> dict[int, Optional[torch.Tensor]]: + return self.dp_client.try_wait(wait_task_ids) -+ -+ ++ ++ +class FlexKVTPClient: + def __init__( + self, @@ -650,7 +650,7 @@ index 000000000..11b7ff1c3 + self.tp_client = KVTPClient(flexkv_config.server_recv_port, dp_client_id, device_id, tp_rank) + logger.info(f"finish init FlexKVTPClient") + gpu_layout = KVCacheLayout( -+ type=KVCacheLayoutType.LAYERWISE, ++ type=KVCacheLayoutType.LAYERFIRST, + num_layer=flexkv_config.num_layers, + num_block=flexkv_config.num_blocks, + tokens_per_block=flexkv_config.block_size, @@ -691,17 +691,17 @@ index 000000000..f2724e712 + dtype: torch.dtype = None + use_mla: bool = False + tp_size: int = 1 -+ ++ + @classmethod + def from_env(cls) -> 'FlexKVConfig': + enable_flexkv = (os.getenv('ENABLE_FLEXKV', "false").lower() == "true") + server_recv_port = os.getenv('FLEXKV_SERVER_RECV_PORT', "") -+ ++ + return 
cls(enable_flexkv=enable_flexkv, + server_recv_port=server_recv_port) -+ ++ + def post_init( -+ self, ++ self, + kv_cache_config: KVCacheConfig, + tp_size: int + ): @@ -722,12 +722,12 @@ index 2b0b9da2d..7f377af6d 100644 @@ -19,7 +19,7 @@ VLLM_LOGGING_CONFIG_PATH = envs.VLLM_LOGGING_CONFIG_PATH VLLM_LOGGING_LEVEL = envs.VLLM_LOGGING_LEVEL VLLM_LOGGING_PREFIX = envs.VLLM_LOGGING_PREFIX - + -_FORMAT = (f"{VLLM_LOGGING_PREFIX}%(levelname)s %(asctime)s " +_FORMAT = (f"{VLLM_LOGGING_PREFIX}%(levelname)s %(asctime)s.%(msecs)03d " "[%(filename)s:%(lineno)d] %(message)s") _DATE_FORMAT = "%m-%d %H:%M:%S" - + diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index bd0e01d04..f4cadfba8 100644 --- a/vllm/v1/core/kv_cache_utils.py @@ -740,12 +740,12 @@ index bd0e01d04..f4cadfba8 100644 # A deque of (requests, queries, hits) for the most recent requests. - self.query_queue: deque[tuple[int, int, int]] = deque() + self.query_queue: deque[tuple[int, int, int, int]] = deque() - + def observe(self, stats: PrefixCacheStats): """Observe the prefix caching for a set of requests. @@ -81,23 +82,26 @@ class PrefixCachingMetrics: self.reset() - + # Update the metrics. - self.query_queue.append((stats.requests, stats.queries, stats.hits)) + self.query_queue.append((stats.requests, stats.queries, stats.hits, stats.flexkv_hits)) @@ -753,7 +753,7 @@ index bd0e01d04..f4cadfba8 100644 self.aggregated_query_total += stats.queries self.aggregated_query_hit += stats.hits + self.aggregated_query_flexkv_hit += stats.flexkv_hits - + # Remove the oldest stats if the number of requests exceeds. 
if self.aggregated_requests > self.interval: - old_requests, old_queries, old_hits = self.query_queue.popleft() @@ -762,7 +762,7 @@ index bd0e01d04..f4cadfba8 100644 self.aggregated_query_total -= old_queries self.aggregated_query_hit -= old_hits + self.aggregated_query_flexkv_hit -= old_flexkv_hits - + def reset(self): """Reset the metrics.""" self.aggregated_requests = 0 @@ -770,23 +770,23 @@ index bd0e01d04..f4cadfba8 100644 self.aggregated_query_hit = 0 + self.aggregated_query_flexkv_hit = 0 self.query_queue.clear() - + @property @@ -106,6 +110,15 @@ class PrefixCachingMetrics: if self.aggregated_query_total == 0: return 0.0 return self.aggregated_query_hit / self.aggregated_query_total -+ ++ + @property + def flexkv_hit_rate(self) -> float: + """Calculate the hit rate for the past N requests.""" + if self.aggregated_query_total == 0: + return 0.0 + return self.aggregated_query_flexkv_hit / self.aggregated_query_total -+ + - - ++ + + @dataclass diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index a81574875..e808e7537 100644 @@ -794,7 +794,7 @@ index a81574875..e808e7537 100644 +++ b/vllm/v1/core/sched/scheduler.py @@ -3,6 +3,7 @@ from __future__ import annotations - + import time +import torch from collections import deque @@ -803,19 +803,19 @@ index a81574875..e808e7537 100644 @@ -27,6 +28,10 @@ from vllm.v1.request import Request, RequestStatus from vllm.v1.spec_decode.metrics import SpecDecodingStats from vllm.v1.structured_output import StructuredOutputManager - + +# flexkv +from vllm.utils import cdiv +from vllm.distributed.flexkv_extension.config import FlexKVConfig + logger = init_logger(__name__) - - + + @@ -118,6 +123,23 @@ class Scheduler(SchedulerInterface): if speculative_config and speculative_config.method == "eagle": self.num_lookahead_tokens = \ speculative_config.num_speculative_tokens -+ ++ + # flexkv + self.enable_flexkv = False + self.flexkv_client = None @@ -832,13 +832,13 @@ index a81574875..e808e7537 100644 
+ from vllm.distributed.flexkv_extension.client import FlexKVDPClient + self.flexkv_client = FlexKVDPClient(flexkv_config) + return self.flexkv_client.dp_client.dp_client_id - + def schedule(self) -> SchedulerOutput: # NOTE(woosuk) on the scheduling algorithm: @@ -131,6 +153,13 @@ class Scheduler(SchedulerInterface): # chunked prefills, prefix caching, speculative decoding, # and the "jump decoding" optimization in the future. - + + # flexkv + if self.enable_flexkv: + # aviod busy loop @@ -865,10 +865,10 @@ index a81574875..e808e7537 100644 @@ -335,6 +364,29 @@ class Scheduler(SchedulerInterface): # The request cannot be scheduled. break - + + # flexkv + if self.enable_flexkv and num_new_tokens > self.block_size and request.status == RequestStatus.WAITING: -+ ++ + # don't match the last block + num_new_blocks_to_get = cdiv(num_new_tokens, self.block_size)-1 + num_new_tokens_to_match = num_new_blocks_to_get*self.block_size @@ -895,7 +895,7 @@ index a81574875..e808e7537 100644 @@ -372,6 +424,29 @@ class Scheduler(SchedulerInterface): self.encoder_cache_manager.allocate(request, i) encoder_budget = new_encoder_budget - + + # batch wait + if self.enable_flexkv: + if len(self.load_kv_tasks) != 0: @@ -912,7 +912,7 @@ index a81574875..e808e7537 100644 + f"[FlexKV] req: {request.request_id}, task: {task_id}, " + f"get {match_length} tokens cost {(t_async_get_end-t_get_async_start)*1000:.2f} ms, " + f"get_async() api cost {(t_get_async_return-t_get_async_start)*1000:.2f} ms") -+ ++ + token_budget += match_length + num_scheduled_tokens[request.request_id] -= match_length + request.num_computed_tokens += match_length @@ -923,7 +923,7 @@ index a81574875..e808e7537 100644 if skipped_waiting_requests: self.waiting.extendleft(skipped_waiting_requests) @@ -730,18 +805,36 @@ class Scheduler(SchedulerInterface): - + def _free_request(self, request: Request) -> None: assert request.is_finished() - self.kv_cache_manager.free(request) @@ -932,7 +932,7 @@ index a81574875..e808e7537 
100644 self._cached_reqs_data.pop(request.request_id, None) del self.requests[request.request_id] self.finished_req_ids.add(request.request_id) - + + if self.enable_flexkv: + self._offload_kv(request) + else: @@ -955,11 +955,11 @@ index a81574875..e808e7537 100644 + def get_num_unfinished_requests(self) -> int: return len(self.waiting) + len(self.running) - + def has_finished_requests(self) -> bool: - return len(self.finished_req_ids) > 0 + return len(self.finished_req_ids) > 0 or len(self.offload_kv_tasks) - + def get_num_unscheduled_requests(self) -> int: """Number of requests that are not being processed by the executor.""" @@ -777,3 +870,23 @@ class Scheduler(SchedulerInterface): @@ -984,8 +984,8 @@ index a81574875..e808e7537 100644 + f"[FlexKV] req: {request.request_id}, task: {task_id}, " + f"put {sum(task_result).item()} tokens cost {(t_async_put_end-t_put_async_start)*1000:.2f} ms, " + f"put_async() api cost {(t_put_async_return-t_put_async_start)*1000:.2f} ms") -+ self._free_block(request) -+ ++ self._free_block(request) ++ diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index f642e5100..3ce609b50 100644 --- a/vllm/v1/engine/core.py @@ -993,16 +993,16 @@ index f642e5100..3ce609b50 100644 @@ -40,6 +40,8 @@ from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder from vllm.v1.structured_output import StructuredOutputManager from vllm.version import __version__ as VLLM_VERSION - + +from vllm.distributed.flexkv_extension.config import FlexKVConfig + logger = init_logger(__name__) - + POLLING_TIMEOUT_S = 2.5 @@ -105,6 +107,8 @@ class EngineCore: log_stats=self.log_stats, ) - + + self.init_flexkv(vllm_config, kv_cache_config) + # Setup MM Input Mapper. 
@@ -1011,10 +1011,10 @@ index f642e5100..3ce609b50 100644 @@ -164,6 +168,22 @@ class EngineCore: "warmup model) took %.2f seconds"), elapsed) return num_gpu_blocks, num_cpu_blocks, scheduler_kv_cache_config - + + def init_flexkv( -+ self, -+ taco_llm_config: VllmConfig, ++ self, ++ taco_llm_config: VllmConfig, + kv_cache_config: KVCacheConfig + ): + self.scheduler: V1Scheduler @@ -1030,7 +1030,7 @@ index f642e5100..3ce609b50 100644 + def add_request(self, request: EngineCoreRequest): """Add request to the scheduler.""" - + diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py index e3a4cd98c..dd009a3a4 100644 --- a/vllm/v1/executor/abstract.py @@ -1040,13 +1040,13 @@ index e3a4cd98c..dd009a3a4 100644 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import ModelRunnerOutput +from vllm.distributed.flexkv_extension.config import FlexKVConfig - - + + class Executor(ExecutorBase): @@ -78,6 +79,11 @@ class Executor(ExecutorBase): args=(scheduler_output, )) return output[0] - + + + def init_flexkv(self, flexkv_config: FlexKVConfig, dp_client_id: int): + self.collective_rpc("init_flexkv", @@ -1075,7 +1075,7 @@ index 3959be40b..69c5b59a1 100644 self.prefix_caching_metrics.hit_rate * 100, + self.prefix_caching_metrics.flexkv_hit_rate * 100, ) - + if scheduler_stats.spec_decoding_stats is not None: diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index fd9492648..8915c4f78 100644 @@ -1087,8 +1087,8 @@ index fd9492648..8915c4f78 100644 hits: int = 0 + # flexkv + flexkv_hits: int = 0 - - + + @dataclass diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 2972e0ffb..6bb8fa9ff 100644 @@ -1106,22 +1106,22 @@ index 2972e0ffb..6bb8fa9ff 100644 @@ -25,6 +25,10 @@ from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.worker.gpu_model_runner import GPUModelRunner from vllm.v1.worker.worker_base import WorkerBase - + +# flexkv +from vllm.distributed.flexkv_extension.config import 
FlexKVConfig +from vllm.distributed.flexkv_extension.client import FlexKVTPClient + logger = init_logger(__name__) - + if TYPE_CHECKING: @@ -282,7 +286,23 @@ class Worker(WorkerBase): pattern=pattern, max_size=max_size, ) - -+ ++ + def init_flexkv( -+ self, ++ self, + flexkv_config: FlexKVConfig, + dp_client_id: int, + ) -> None: @@ -1136,6 +1136,6 @@ index 2972e0ffb..6bb8fa9ff 100644 + device_id=self.device.index, + gpu_blocks=self.model_runner.kv_caches, + kv_shape=kv_shape) - + def init_worker_distributed_environment( parallel_config: ParallelConfig, diff --git a/flexkv/cache/cache_engine.py b/flexkv/cache/cache_engine.py index 8340cc476f..3f7fb71217 100644 --- a/flexkv/cache/cache_engine.py +++ b/flexkv/cache/cache_engine.py @@ -29,7 +29,7 @@ from flexkv.cache.radixtree import RadixTreeIndex, RadixNode, MatchResult from flexkv.cache.transfer_pattern import add_virtal_op_for_mutiple_finished_ops from flexkv.common.block import SequenceMeta -from flexkv.common.config import CacheConfig, ModelConfig +from flexkv.common.config import CacheConfig, ModelConfig, GLOBAL_CONFIG_FROM_ENV from flexkv.common.exceptions import InvalidConfigError, NotEnoughSpaceError from flexkv.common.transfer import ( DeviceType, TransferOpGraph, TransferOp, TransferType @@ -53,7 +53,7 @@ def __init__(self, num_total_blocks: int, tokens_per_block: int, evict_ratio: float, - hit_reward_seconds: int): + hit_reward_seconds: int = 0): if not isinstance(device_type, DeviceType): raise InvalidConfigError(f"Unknown device type: {device_type}") if num_total_blocks <= 0: @@ -152,7 +152,7 @@ def __init__(self, num_total_blocks: int, tokens_per_block: int, evict_ratio: float, - hit_reward_seconds: int): + hit_reward_seconds: int = 0): if not isinstance(device_type, DeviceType): raise InvalidConfigError(f"Unknown device type: {device_type}") if num_total_blocks <= 0: @@ -238,61 +238,65 @@ def __init__(self, cache_config: CacheConfig, model_config: ModelConfig): self.remote_cache_engine = None 
self.gds_cache_engine = None + self.index_accel = GLOBAL_CONFIG_FROM_ENV.index_accel self.cache_engines = {} + self.evict_ratio = GLOBAL_CONFIG_FROM_ENV.evict_ratio + self.hit_reward_seconds = GLOBAL_CONFIG_FROM_ENV.hit_reward_seconds + if cache_config.enable_cpu: - if cache_config.index_accel: + if self.index_accel: self.cpu_cache_engine = CacheEngineAccel(DeviceType.CPU, cache_config.num_cpu_blocks, cache_config.tokens_per_block, - cache_config.evict_ratio, - cache_config.hit_reward_seconds) + self.evict_ratio, + self.hit_reward_seconds) else: self.cpu_cache_engine = CacheEngine(DeviceType.CPU, cache_config.num_cpu_blocks, cache_config.tokens_per_block, - cache_config.evict_ratio, - cache_config.hit_reward_seconds) + self.evict_ratio, + self.hit_reward_seconds) self.cache_engines[DeviceType.CPU] = self.cpu_cache_engine if cache_config.enable_ssd: - if cache_config.index_accel: + if self.index_accel: self.ssd_cache_engine = CacheEngineAccel(DeviceType.SSD, cache_config.num_ssd_blocks, cache_config.tokens_per_block, - cache_config.evict_ratio, - cache_config.hit_reward_seconds) + self.evict_ratio, + self.hit_reward_seconds) else: self.ssd_cache_engine = CacheEngine(DeviceType.SSD, cache_config.num_ssd_blocks, cache_config.tokens_per_block, - cache_config.evict_ratio, - cache_config.hit_reward_seconds) + self.evict_ratio, + self.hit_reward_seconds) self.cache_engines[DeviceType.SSD] = self.ssd_cache_engine if cache_config.enable_remote: - if cache_config.index_accel: + if self.index_accel: self.remote_cache_engine = CacheEngineAccel(DeviceType.REMOTE, cache_config.num_remote_blocks, cache_config.tokens_per_block, - cache_config.evict_ratio, - cache_config.hit_reward_seconds) + self.evict_ratio, + self.hit_reward_seconds) else: self.remote_cache_engine = CacheEngine(DeviceType.REMOTE, cache_config.num_remote_blocks, cache_config.tokens_per_block, - cache_config.evict_ratio, - cache_config.hit_reward_seconds) + self.evict_ratio, + self.hit_reward_seconds) 
self.cache_engines[DeviceType.REMOTE] = self.remote_cache_engine if cache_config.enable_gds: - if cache_config.index_accel: + if self.index_accel: self.gds_cache_engine = CacheEngineAccel(DeviceType.GDS, cache_config.num_gds_blocks, cache_config.tokens_per_block, - cache_config.evict_ratio) + self.evict_ratio) else: self.gds_cache_engine = CacheEngine(DeviceType.GDS, cache_config.num_gds_blocks, cache_config.tokens_per_block, - cache_config.evict_ratio) + self.evict_ratio) self.cache_engines[DeviceType.GDS] = self.gds_cache_engine self._empty_get_return: Callable[[int], Tuple[TransferOpGraph, List[int], Dict, Dict, Dict, int]] = \ @@ -587,7 +591,7 @@ def _get_impl_local(self, assert self.cache_config.enable_cpu assert self.cpu_cache_engine is not None - if self.cache_config.index_accel: + if self.index_accel: cpu_matched_result, ssd_matched_result = self.match_local_accel(sequence_meta) else: cpu_matched_result, ssd_matched_result = self.match_local(sequence_meta) @@ -637,7 +641,8 @@ def _get_impl_local(self, ) transfer_graph.add_transfer_op(op_gds_transfer) finished_ops_ids.append(op_gds_transfer.op_id) - op_node_to_ready[op_gds_transfer.op_id] = (DeviceType.GDS, ssd_node_to_unlock, ssd_node_to_unlock.size()) + op_node_to_ready[op_gds_transfer.op_id] = \ + (DeviceType.GDS, ssd_node_to_unlock, ssd_node_to_unlock.size()) else: fragment2_cpu_blocks = self.cpu_cache_engine.take( num_required_blocks=fragment2_num_blocks, @@ -681,7 +686,8 @@ def _get_impl_local(self, graph_id = transfer_graph.graph_id, transfer_type = TransferType.H2D, src_block_ids = fragment12_cpu_blocks if not self.cache_config.enable_gds else fragment1_cpu_blocks, - dst_block_ids = fragment12_gpu_blocks if not self.cache_config.enable_gds else fragment12_gpu_blocks[:fragment1_num_blocks], + dst_block_ids = fragment12_gpu_blocks if not self.cache_config.enable_gds \ + else fragment12_gpu_blocks[:fragment1_num_blocks], layer_id = 0, layer_granularity = layer_num ) @@ -803,7 +809,7 @@ def 
_put_impl_global(self, assert self.cpu_cache_engine is not None assert self.remote_cache_engine is not None - if self.cache_config.index_accel: + if self.index_accel: cpu_matched_result, ssd_matched_result, remote_matched_result = self.match_all_accel(sequence_meta) else: cpu_matched_result, ssd_matched_result, remote_matched_result = self.match_all(sequence_meta) @@ -957,7 +963,7 @@ def _put_impl_local(self, assert self.cpu_cache_engine is not None # assert self.ssd_cache_engine is not None - if self.cache_config.index_accel: + if self.index_accel: cpu_matched_result, ssd_matched_result = self.match_local_accel(sequence_meta) else: cpu_matched_result, ssd_matched_result = self.match_local(sequence_meta) @@ -981,7 +987,7 @@ def _put_impl_local(self, protected_node = cpu_matched_result.last_node, strict=False ) - + # Determine which disk cache to use (GDS or SSD) disk_cache_engine = None if self.cache_config.enable_gds: @@ -997,7 +1003,7 @@ def _put_impl_local(self, ) else: fragment2_ssd_blocks = np.array([], dtype=np.int64) - + if len(fragment12_cpu_blocks) < fragment12_num_blocks or \ len(fragment2_ssd_blocks) < fragment2_num_blocks: self.cpu_cache_engine.recycle(fragment12_cpu_blocks) @@ -1122,7 +1128,7 @@ def match_local_accel(self, sequence_meta: SequenceMeta) -> Tuple[MatchResultAcc ssd_matched_result = self.gds_cache_engine.match(sequence_meta) return cpu_matched_result, ssd_matched_result - + @nvtx.annotate("Match Prefix", color="yellow") def match_local(self, sequence_meta: SequenceMeta) -> Tuple[MatchResult, MatchResult]: cpu_matched_result = MatchResult() @@ -1135,7 +1141,7 @@ def match_local(self, sequence_meta: SequenceMeta) -> Tuple[MatchResult, MatchRe ssd_matched_result = self.gds_cache_engine.match(sequence_meta) return cpu_matched_result, ssd_matched_result - + @nvtx.annotate("Match All Prefix accel", color="yellow") def match_all_accel(self, sequence_meta: SequenceMeta) -> Tuple[MatchResultAccel, MatchResultAccel, MatchResultAccel]: diff --git 
a/flexkv/common/config.py b/flexkv/common/config.py index ad0114618c..c79c1e5773 100644 --- a/flexkv/common/config.py +++ b/flexkv/common/config.py @@ -1,20 +1,22 @@ from dataclasses import dataclass from enum import Enum from typing import Optional, List, Union, Dict, Any +from argparse import Namespace +import os +import copy import torch from flexkv.common.storage import KVCacheLayout, KVCacheLayoutType - +from flexkv.common.debug import flexkv_logger @dataclass class ModelConfig: - num_layers: int - num_kv_heads: int - head_size: int + num_layers: int = 0 + num_kv_heads: int = 0 + head_size: int = 0 use_mla: bool = False dtype: torch.dtype = torch.bfloat16 - max_req_tokens = 163840 # parallel configs tp_size: int = 1 @@ -32,15 +34,6 @@ class CacheConfig: enable_ssd: bool = False enable_remote: bool = False enable_gds: bool = False - index_accel: bool = False - hit_reward_seconds: int = 0 - - # kv cache layout configs - gpu_kv_layout_type: KVCacheLayoutType = KVCacheLayoutType.LAYERWISE - cpu_kv_layout_type: KVCacheLayoutType = KVCacheLayoutType.BLOCKWISE - ssd_kv_layout_type: KVCacheLayoutType = KVCacheLayoutType.BLOCKWISE - remote_kv_layout_type: KVCacheLayoutType = KVCacheLayoutType.BLOCKWISE - gds_kv_layout_type: KVCacheLayoutType = KVCacheLayoutType.BLOCKWISE # mempool capacity configs num_cpu_blocks: int = 1000000 @@ -48,22 +41,13 @@ class CacheConfig: num_gds_blocks: int = 10000000 num_remote_blocks: Optional[int] = None - # CPU-GPU transfer configs - use_ce_transfer_h2d: bool = False - use_ce_transfer_d2h: bool = False - transfer_sms_h2d: int = 8 - transfer_sms_d2h: int = 8 - # ssd cache configs - max_blocks_per_file: int = 32000 # -1 means no limit ssd_cache_dir: Optional[Union[str, List[str]]] = None - ssd_cache_iouring_entries: int = 512 - ssd_cache_iouring_flags: int = 1 # gds cache configs gds_cache_dir: Optional[Union[str, List[str]]] = None - # remote cache configs + # remote cache configs for cfs remote_cache_size_mode: str = "file_size" # 
file_size or block_num remote_file_size: Optional[int] = None remote_file_num: Optional[int] = None @@ -71,23 +55,147 @@ class CacheConfig: remote_cache_path: Optional[Union[str, List[str]]] = None remote_config_custom: Optional[Dict[str, Any]] = None - # Trace configs - enable_trace: bool = True - trace_file_path: str = "./flexkv_trace.log" - trace_max_file_size_mb: int = 100 - trace_max_files: int = 5 - trace_flush_interval_ms: int = 1000 +GLOBAL_CONFIG_FROM_ENV: Namespace = Namespace( + server_client_mode=bool(int(os.getenv('FLEXKV_SERVER_CLIENT_MODE', 0))), + server_recv_port=os.getenv('FLEXKV_SERVER_RECV_PORT', 'ipc:///tmp/flexkv_server'), + + index_accel=bool(int(os.getenv('FLEXKV_INDEX_ACCEL', 1))), + cpu_layout_type=KVCacheLayoutType(os.getenv('FLEXKV_CPU_LAYOUT', 'BLOCKFIRST').upper()), + ssd_layout_type=KVCacheLayoutType(os.getenv('FLEXKV_SSD_LAYOUT', 'BLOCKFIRST').upper()), + remote_layout_type=KVCacheLayoutType(os.getenv('FLEXKV_REMOTE_LAYOUT', 'BLOCKFIRST').upper()), + gds_layout_type=KVCacheLayoutType(os.getenv('FLEXKV_GDS_LAYOUT', 'BLOCKFIRST').upper()), + + use_ce_transfer_h2d=bool(int(os.getenv('FLEXKV_USE_CE_TRANSFER_H2D', 0))), + use_ce_transfer_d2h=bool(int(os.getenv('FLEXKV_USE_CE_TRANSFER_D2H', 0))), + transfer_sms_h2d=int(os.getenv('FLEXKV_TRANSFER_SMS_H2D', 8)), + transfer_sms_d2h=int(os.getenv('FLEXKV_TRANSFER_SMS_D2H', 8)), + + iouring_entries=int(os.getenv('FLEXKV_IORING_ENTRIES', 512)), + iouring_flags=int(os.getenv('FLEXKV_IORING_FLAGS', 0)), - #evict ratio - evict_ratio: float = 0.0 + max_file_size_gb=float(os.getenv('FLEXKV_MAX_FILE_SIZE_GB', -1)), # -1 means no limit + + evict_ratio=float(os.getenv('FLEXKV_EVICT_RATIO', 0.05)), + hit_reward_seconds=int(os.getenv('FLEXKV_HIT_REWARD_SECONDS', 0)), + + enable_trace=bool(int(os.getenv('FLEXKV_ENABLE_TRACE', 0))), + trace_file_path=os.getenv('FLEXKV_TRACE_FILE_PATH', './flexkv_trace.log'), + trace_max_file_size_mb=int(os.getenv('FLEXKV_TRACE_MAX_FILE_SIZE_MB', 100)), + 
trace_max_files=int(os.getenv('FLEXKV_TRACE_MAX_FILES', 5)), + trace_flush_interval_ms=int(os.getenv('FLEXKV_TRACE_FLUSH_INTERVAL_MS', 1000)), +) + +@dataclass +class UserConfig: + cpu_cache_gb: int = 16 + ssd_cache_gb: int = 0 # 0 means disable ssd + ssd_cache_dir: Union[str, List[str]] = "./ssd_cache" + enable_gds: bool = False def __post_init__(self): - layout_fields = ['gpu_kv_layout_type', - 'cpu_kv_layout_type', - 'ssd_kv_layout_type', - 'remote_kv_layout_type', - 'gds_kv_layout_type'] - for field in layout_fields: - value = getattr(self, field) - if isinstance(value, str): - setattr(self, field, KVCacheLayoutType[value.upper()]) + if self.cpu_cache_gb <= 0: + raise ValueError(f"Invalid cpu_cache_gb: {self.cpu_cache_gb}") + if self.ssd_cache_gb < 0: + raise ValueError(f"Invalid ssd_cache_gb: {self.ssd_cache_gb}") + if self.ssd_cache_gb > 0 and self.ssd_cache_gb <= self.cpu_cache_gb: + raise ValueError(f"Invalid ssd_cache_gb: {self.ssd_cache_gb}, " + f"must be greater than cpu_cache_gb: {self.cpu_cache_gb}.") + +def parse_path_list(path_str: str) -> List[str]: + paths = [p.strip() for p in path_str.split(';') if p.strip()] + return paths + +def load_user_config_from_file(config_file: str) -> UserConfig: + import json + import yaml + from dataclasses import fields + + # read json config file or yaml config file + if config_file.endswith('.json'): + with open(config_file) as f: + config = json.load(f) + elif config_file.endswith('.yaml'): + with open(config_file) as f: + config = yaml.safe_load(f) + else: + raise ValueError(f"Unsupported config file extension: {config_file}") + + if 'ssd_cache_dir' in config: + config['ssd_cache_dir'] = parse_path_list(config['ssd_cache_dir']) + + defined_fields = {f.name for f in fields(UserConfig)} + known_config = {k: v for k, v in config.items() if k in defined_fields} + extra_config = {k: v for k, v in config.items() if k not in defined_fields} + + user_config = UserConfig(**known_config) + + for key, value in 
extra_config.items(): + setattr(user_config, f"override_{key}", value) + + return user_config + +def load_user_config_from_env() -> UserConfig: + return UserConfig( + cpu_cache_gb=int(os.getenv('FLEXKV_CPU_CACHE_GB', 16)), + ssd_cache_gb=int(os.getenv('FLEXKV_SSD_CACHE_GB', 0)), + ssd_cache_dir=parse_path_list(os.getenv('FLEXKV_SSD_CACHE_DIR', "./flexkv_ssd")), + enable_gds=bool(int(os.getenv('FLEXKV_ENABLE_GDS', 0))), + ) + +def convert_to_block_num(size_in_GB: float, block_size_in_bytes: int) -> int: + return int(size_in_GB * 1024 * 1024 * 1024 / block_size_in_bytes) + +def update_default_config_from_user_config(model_config: ModelConfig, + cache_config: CacheConfig, + user_config: UserConfig) -> None: + block_size_in_bytes = model_config.token_size_in_bytes * cache_config.tokens_per_block + + assert user_config.cpu_cache_gb > 0 + assert user_config.ssd_cache_gb >= 0 + + cache_config.num_cpu_blocks = convert_to_block_num(user_config.cpu_cache_gb, block_size_in_bytes) + cache_config.num_ssd_blocks = convert_to_block_num(user_config.ssd_cache_gb, block_size_in_bytes) + + cache_config.ssd_cache_dir = user_config.ssd_cache_dir + cache_config.enable_ssd = user_config.ssd_cache_gb > 0 + cache_config.enable_gds = user_config.enable_gds + + if cache_config.num_ssd_blocks % len(cache_config.ssd_cache_dir) != 0: + cache_config.num_ssd_blocks = \ + cache_config.num_ssd_blocks // len(cache_config.ssd_cache_dir) * len(cache_config.ssd_cache_dir) + flexkv_logger.warning(f"num_ssd_blocks is not a multiple of num_ssd_devices, " + f"adjust num_ssd_blocks to {cache_config.num_ssd_blocks}") + + global_config_attrs = set(vars(GLOBAL_CONFIG_FROM_ENV).keys()) + for attr_name in dir(user_config): + if attr_name.startswith('override_'): + global_attr_name = attr_name[9:] # len('override_') = 9 + if global_attr_name in global_config_attrs: + attr_value = getattr(user_config, attr_name) + original_value = getattr(GLOBAL_CONFIG_FROM_ENV, global_attr_name) + + original_type = 
type(original_value) + + try: + if original_type is bool: + if isinstance(attr_value, str): + attr_value = attr_value.lower() in ('true', '1', 'yes') + else: + attr_value = bool(int(attr_value)) + elif issubclass(original_type, Enum): # KVCacheLayoutType + if isinstance(attr_value, str): + attr_value = original_type(attr_value.upper()) + elif not isinstance(attr_value, original_type): + attr_value = original_type(attr_value) + else: + attr_value = original_type(attr_value) + except (ValueError, TypeError) as e: + raise ValueError(f"Cannot convert config value '{attr_value}' to type {original_type.__name__} " + f"for config '{global_attr_name}': {e}") from e + + setattr(GLOBAL_CONFIG_FROM_ENV, global_attr_name, attr_value) + flexkv_logger.info(f"Override environment variable: {'FLEXKV_' + global_attr_name.upper()} " + f"to {attr_value} from config file.") + else: + raise ValueError(f"Unknown config name: {global_attr_name} in config file, " + f"available config names: {global_config_attrs}") diff --git a/flexkv/common/storage.py b/flexkv/common/storage.py index 32587dbc76..53ab859c2d 100644 --- a/flexkv/common/storage.py +++ b/flexkv/common/storage.py @@ -14,11 +14,11 @@ class AccessHandleType(Enum): TENSOR_HANDLE = auto() # single tensor handle or tensor handle list GDS_MANAGER = auto() -# NOTE: currently, we assume that the layout type of GPU should always be layerwise -# and the layout type of CPU, SSD, remote should be the same, either laywise or blockwise +# NOTE: currently, we assume that the layout type of GPU should always be LAYERFIRST +# and the layout type of CPU, SSD, remote should be the same, either laywise or BLOCKFIRST class KVCacheLayoutType(Enum): - LAYERWISE = "LAYERWISE" - BLOCKWISE = "BLOCKWISE" + LAYERFIRST = "LAYERFIRST" + BLOCKFIRST = "BLOCKFIRST" @dataclass class KVCacheLayout: @@ -59,14 +59,14 @@ def __post_init__(self) -> None: def _compute_kv_shape(self) -> None: if self._kv_shape is None: - if self.type == KVCacheLayoutType.LAYERWISE: # 
for layerwise transfer + if self.type == KVCacheLayoutType.LAYERFIRST: # for Layerwise transfer self._kv_shape = torch.Size([self.num_layer, self._kv_dim, self.num_block, self.tokens_per_block, self.num_head, self.head_size]) - elif self.type == KVCacheLayoutType.BLOCKWISE: + elif self.type == KVCacheLayoutType.BLOCKFIRST: self._kv_shape = torch.Size([self.num_block, self.num_layer, self._kv_dim, @@ -126,25 +126,25 @@ def get_chunk_size(self) -> int: return self.tokens_per_block * self.num_head * self.head_size def get_layer_stride(self) -> int: - if self.type == KVCacheLayoutType.LAYERWISE: + if self.type == KVCacheLayoutType.LAYERFIRST: return self.kv_shape[1:].numel() - elif self.type == KVCacheLayoutType.BLOCKWISE: + elif self.type == KVCacheLayoutType.BLOCKFIRST: return self.kv_shape[2:].numel() else: raise ValueError(f"Invalid KVCacheLayoutType: {self.type}") def get_block_stride(self) -> int: - if self.type == KVCacheLayoutType.LAYERWISE: + if self.type == KVCacheLayoutType.LAYERFIRST: return self.kv_shape[3:].numel() - elif self.type == KVCacheLayoutType.BLOCKWISE: + elif self.type == KVCacheLayoutType.BLOCKFIRST: return self.kv_shape[1:].numel() else: raise ValueError(f"Invalid KVCacheLayoutType: {self.type}") def get_kv_stride(self) -> int: - if self.type == KVCacheLayoutType.LAYERWISE: + if self.type == KVCacheLayoutType.LAYERFIRST: return self.kv_shape[2:].numel() - elif self.type == KVCacheLayoutType.BLOCKWISE: + elif self.type == KVCacheLayoutType.BLOCKFIRST: return self.kv_shape[3:].numel() else: raise ValueError(f"Invalid KVCacheLayoutType: {self.type}") diff --git a/flexkv/common/tracer.py b/flexkv/common/tracer.py index 765fed0a9f..7efb107244 100644 --- a/flexkv/common/tracer.py +++ b/flexkv/common/tracer.py @@ -3,25 +3,25 @@ import threading import time from datetime import datetime -from typing import Any, Dict, Optional, List, Union +from typing import Any, Optional, List, Union import torch import numpy as np -from flexkv.common.config import 
CacheConfig +from flexkv.common.config import GLOBAL_CONFIG_FROM_ENV class FlexKVTracer: """FlexKV Tracer class for recording operations in JSON format""" - def __init__(self, cache_config: CacheConfig): - self.enabled = cache_config.enable_trace + def __init__(self): + self.enabled = GLOBAL_CONFIG_FROM_ENV.enable_trace if not self.enabled: return - print(f"FlexKVTracer enabled, trace_file_path: {cache_config.trace_file_path}") - self.trace_file_path = cache_config.trace_file_path - self.max_file_size_mb = cache_config.trace_max_file_size_mb - self.max_files = cache_config.trace_max_files - self.flush_interval_ms = cache_config.trace_flush_interval_ms + print(f"FlexKVTracer enabled, trace_file_path: {GLOBAL_CONFIG_FROM_ENV.trace_file_path}") + self.trace_file_path = GLOBAL_CONFIG_FROM_ENV.trace_file_path + self.max_file_size_mb = GLOBAL_CONFIG_FROM_ENV.trace_max_file_size_mb + self.max_files = GLOBAL_CONFIG_FROM_ENV.trace_max_files + self.flush_interval_ms = GLOBAL_CONFIG_FROM_ENV.trace_flush_interval_ms # Thread-safe file writing self._lock = threading.Lock() @@ -116,25 +116,40 @@ def trace_config(self, model_config, cache_config, gpu_layout=None): "enable_cpu": cache_config.enable_cpu, "enable_ssd": cache_config.enable_ssd, "enable_remote": cache_config.enable_remote, - "gpu_kv_layout_type": str(cache_config.gpu_kv_layout_type), - "cpu_kv_layout_type": str(cache_config.cpu_kv_layout_type), - "ssd_kv_layout_type": str(cache_config.ssd_kv_layout_type), - "remote_kv_layout_type": str(cache_config.remote_kv_layout_type), "enable_gds": cache_config.enable_gds, - "remote_cache_size_mode": cache_config.remote_cache_size_mode, "num_cpu_blocks": cache_config.num_cpu_blocks, "num_ssd_blocks": cache_config.num_ssd_blocks, + "num_gds_blocks": cache_config.num_gds_blocks, "num_remote_blocks": cache_config.num_remote_blocks, + "ssd_cache_dir": cache_config.ssd_cache_dir, + "gds_cache_dir": cache_config.gds_cache_dir, + "remote_cache_size_mode": 
cache_config.remote_cache_size_mode, "remote_file_size": cache_config.remote_file_size, "remote_file_num": cache_config.remote_file_num, "remote_file_prefix": cache_config.remote_file_prefix, - "ssd_cache_dir": cache_config.ssd_cache_dir, - "ssd_cache_iouring_entries": cache_config.ssd_cache_iouring_entries, - "ssd_cache_iouring_flags": cache_config.ssd_cache_iouring_flags, - "gds_cache_dir": cache_config.gds_cache_dir, "remote_cache_path": cache_config.remote_cache_path, "remote_config_custom": cache_config.remote_config_custom, - "evict_ratio": cache_config.evict_ratio, + } + + # Convert GLOBAL_CONFIG_FROM_ENV to dict + from flexkv.common.config import GLOBAL_CONFIG_FROM_ENV + global_config_dict = { + "server_client_mode": GLOBAL_CONFIG_FROM_ENV.server_client_mode, + "server_recv_port": GLOBAL_CONFIG_FROM_ENV.server_recv_port, + "index_accel": GLOBAL_CONFIG_FROM_ENV.index_accel, + "cpu_layout_type": str(GLOBAL_CONFIG_FROM_ENV.cpu_layout_type), + "ssd_layout_type": str(GLOBAL_CONFIG_FROM_ENV.ssd_layout_type), + "remote_layout_type": str(GLOBAL_CONFIG_FROM_ENV.remote_layout_type), + "gds_layout_type": str(GLOBAL_CONFIG_FROM_ENV.gds_layout_type), + "use_ce_transfer_h2d": GLOBAL_CONFIG_FROM_ENV.use_ce_transfer_h2d, + "use_ce_transfer_d2h": GLOBAL_CONFIG_FROM_ENV.use_ce_transfer_d2h, + "transfer_sms_h2d": GLOBAL_CONFIG_FROM_ENV.transfer_sms_h2d, + "transfer_sms_d2h": GLOBAL_CONFIG_FROM_ENV.transfer_sms_d2h, + "iouring_entries": GLOBAL_CONFIG_FROM_ENV.iouring_entries, + "iouring_flags": GLOBAL_CONFIG_FROM_ENV.iouring_flags, + "max_file_size_gb": GLOBAL_CONFIG_FROM_ENV.max_file_size_gb, + "evict_ratio": GLOBAL_CONFIG_FROM_ENV.evict_ratio, + # Note: trace-related configs are excluded as they should not affect replay } # Convert gpu_layout to dict if provided @@ -157,6 +172,7 @@ def trace_config(self, model_config, cache_config, gpu_layout=None): "data": { "model_config": model_config_dict, "cache_config": cache_config_dict, + "global_config": global_config_dict, 
"gpu_layout": gpu_layout_dict, } } @@ -174,9 +190,9 @@ def trace_config(self, model_config, cache_config, gpu_layout=None): def trace_request(self, request_type: str, request_id: int, - token_ids: torch.Tensor, - slot_mapping: torch.Tensor, - token_mask: Optional[torch.Tensor] = None, + token_ids: Union[torch.Tensor, np.ndarray], + slot_mapping: Union[torch.Tensor, np.ndarray], + token_mask: Optional[Union[torch.Tensor, np.ndarray]] = None, layer_granularity: int = -1, dp_id: int = 0, **kwargs): @@ -186,7 +202,7 @@ def trace_request(self, timestamp = datetime.now().isoformat() - # Convert tensors to lists for JSON serialization + # Convert tensors/arrays to lists for JSON serialization data = { "request_type": request_type, "request_id": request_id, @@ -221,6 +237,8 @@ def trace_request(self, def trace_wait_request(self, wait_type: str, task_ids: Union[int, List[int]], + timeout: Optional[float] = None, + completely: Optional[bool] = None, layer_group_id: Optional[int] = None): """Record a wait operation""" if not self.enabled: @@ -237,6 +255,8 @@ def trace_wait_request(self, data = { "wait_type": wait_type, "task_ids": task_ids_list, + "timeout": timeout, + "completely": completely, "layer_group_id": layer_group_id, } record = { @@ -256,6 +276,45 @@ def trace_wait_request(self, if (current_time - self._last_flush_time) * 1000 >= self.flush_interval_ms: self._flush_buffer() + def trace_launch_tasks(self, + task_ids: List[int], + slot_mappings: List[Union[torch.Tensor, np.ndarray]]): + """Record a launch_tasks operation""" + if not self.enabled: + return + + timestamp = datetime.now().isoformat() + + # Convert slot_mappings to lists + slot_mappings_list = [] + slot_mappings_shapes = [] + for slot_mapping in slot_mappings: + slot_mappings_list.append(self._convert_tensor_to_list(slot_mapping)) + slot_mappings_shapes.append(list(slot_mapping.shape)) + + data = { + "task_ids": task_ids, + "slot_mappings": slot_mappings_list, + "slot_mappings_shapes": 
slot_mappings_shapes, + } + + record = { + "timestamp": timestamp, + "event_type": "launch_tasks", + "component": "KVManager", + "data": data + } + + json_record = json.dumps(record, ensure_ascii=False, separators=(',', ':')) + + with self._lock: + self._buffer.append(json_record) + + # Check if we need to flush + current_time = time.time() + if (current_time - self._last_flush_time) * 1000 >= self.flush_interval_ms: + self._flush_buffer() + def flush(self): """Manually flush all buffered records""" if not self.enabled: diff --git a/flexkv/integration/config.py b/flexkv/integration/config.py index b437657c3e..12d4d8ef14 100644 --- a/flexkv/integration/config.py +++ b/flexkv/integration/config.py @@ -7,6 +7,7 @@ from dataclasses import dataclass, field from flexkv.common.debug import flexkv_logger +from flexkv.common.config import * if TYPE_CHECKING: from vllm.v1.kv_cache_interface import KVCacheConfig, FullAttentionSpec @@ -17,53 +18,53 @@ @dataclass class FlexKVConfig: + enable_flexkv: bool = True + #base config - server_recv_port: str - + server_recv_port: str = "" + # cache config - cache_config: dict = field(default_factory=dict) - + cache_config: CacheConfig = field(default_factory=CacheConfig) + # model config - block_size: int = None - num_layers: int = None - num_kv_heads: int = None - head_size: int = None - dtype: torch.dtype = None - use_mla: bool = False - tp_size: int = 1 - dp_size: int = 1 - # log config - num_log_interval_requests: int = 200 - + model_config: ModelConfig = field(default_factory=ModelConfig) + + # user config + user_config: UserConfig = field(default_factory=UserConfig) + + def __post_init__(self): + if self.server_recv_port == "": + self.server_recv_port = GLOBAL_CONFIG_FROM_ENV.server_recv_port + update_default_config_from_user_config(self.model_config, self.cache_config, self.user_config) + @classmethod def from_env(cls) -> 'FlexKVConfig': + enable_flexkv = bool(int(os.getenv('ENABLE_FLEXKV', 1))) config_file_path = 
os.getenv('FLEXKV_CONFIG_PATH', None) - logger.info(f"{config_file_path=}") if config_file_path is None: - return cls(enable_flexkv=False, - server_recv_port="") - - assert config_file_path.endswith(".json"), "flexkv config must be a json file." - - with open(config_file_path, 'r') as f: - config_dict: dict = json.load(f) - logger.info(f"FlexKV Config Dict: {config_dict}") - - return cls( - server_recv_port=config_dict.get("server_recv_port", f"ipc:///tmp/flexkv_test"), - cache_config=config_dict.get("cache_config", {}), - num_log_interval_requests=config_dict.get("num_log_interval_requests", 200), - ) - + logger.info("No flexkv config file provided, please set FLEXKV_CONFIG_PATH environment variable.") + logger.info("Loading flexkv config from environment variables.") + user_config = load_user_config_from_env() + return cls(enable_flexkv=enable_flexkv, + user_config=user_config) + else: + logger.info(f"Loading flexkv config from file: {config_file_path}") + user_config = load_user_config_from_file(config_file_path) + return cls(enable_flexkv=enable_flexkv, + user_config=user_config) + def post_init_from_vllm_config( - self, + self, vllm_config: "VllmConfig", ): - self.num_layers = vllm_config.model_config.get_num_layers(vllm_config.parallel_config) - self.block_size = vllm_config.cache_config.block_size - self.num_kv_heads = vllm_config.model_config.get_total_num_kv_heads() - self.head_size = vllm_config.model_config.get_head_size() - self.dtype = vllm_config.model_config.dtype - self.use_mla = vllm_config.model_config.is_deepseek_mla - self.tp_size = vllm_config.parallel_config.tensor_parallel_size - self.dp_size = vllm_config.parallel_config.data_parallel_size \ No newline at end of file + self.cache_config.tokens_per_block = vllm_config.cache_config.block_size + + self.model_config.num_layers = vllm_config.model_config.get_num_layers(vllm_config.parallel_config) + self.model_config.num_kv_heads = vllm_config.model_config.get_total_num_kv_heads() + 
self.model_config.head_size = vllm_config.model_config.get_head_size() + self.model_config.dtype = vllm_config.model_config.dtype + self.model_config.use_mla = vllm_config.model_config.is_deepseek_mla + self.model_config.tp_size = vllm_config.parallel_config.tensor_parallel_size + self.model_config.dp_size = vllm_config.parallel_config.data_parallel_size + + self.__post_init__() diff --git a/flexkv/integration/vllm/vllm_v1_adapter.py b/flexkv/integration/vllm/vllm_v1_adapter.py index 7bec7141fd..cdeec495d5 100644 --- a/flexkv/integration/vllm/vllm_v1_adapter.py +++ b/flexkv/integration/vllm/vllm_v1_adapter.py @@ -10,7 +10,6 @@ from flexkv.kvmanager import KVManager from flexkv.server.client import KVTPClient from flexkv.common.storage import KVCacheLayout, KVCacheLayoutType -from flexkv.common.config import ModelConfig, CacheConfig from flexkv.common.request import KVResponseStatus from flexkv.common.debug import flexkv_logger from flexkv.integration.stats import FlexKVStats @@ -119,29 +118,15 @@ def __init__( dp_rank: int = 0, ): logger.info(f"Start init FlexKVSchedulerConnector with {flexkv_config}") - self.flexkv_config = flexkv_config self.server_recv_port = flexkv_config.server_recv_port - self.tp_size = flexkv_config.tp_size - self.dp_size = flexkv_config.dp_size - self.block_size = flexkv_config.block_size - self.model_config = ModelConfig( - num_layers=flexkv_config.num_layers, - num_kv_heads=flexkv_config.num_kv_heads, - head_size=flexkv_config.head_size, - use_mla=flexkv_config.use_mla, - dtype=flexkv_config.dtype, - tp_size=flexkv_config.tp_size, - dp_size=flexkv_config.dp_size, - ) - if "tokens_per_block" in flexkv_config.cache_config: - assert flexkv_config.cache_config.pop("tokens_per_block") == flexkv_config.block_size - self.cache_config = CacheConfig( - tokens_per_block=flexkv_config.block_size, - **flexkv_config.cache_config, - ) + self.tp_size = flexkv_config.model_config.tp_size + self.dp_size = flexkv_config.model_config.dp_size + 
self.block_size = flexkv_config.cache_config.tokens_per_block + self.model_config = flexkv_config.model_config + self.cache_config = flexkv_config.cache_config self.flexkv_manager = KVManager(model_config=self.model_config, cache_config=self.cache_config, - gpu_register_port=flexkv_config.server_recv_port, + server_recv_port=flexkv_config.server_recv_port, dp_client_id=dp_rank) self.flexkv_manager.start() # self.dp_client = KVDPClient(self.server_recv_port, self.model_config) @@ -155,7 +140,7 @@ def __init__( self.tasks_to_launch: dict[int, FlexKVTask] = {} self.tasks_to_cancel: dict[int, FlexKVTask] = {} - self.flexkv_stats = FlexKVStats(flexkv_config.num_log_interval_requests) + self.flexkv_stats = FlexKVStats(os.getenv('FLEXKV_NUM_LOG_INTERVAL_REQUESTS', 200)) while not self.is_ready(): logger.info("Waiting for flexkv init...") @@ -532,9 +517,10 @@ def __init__( flexkv_config: FlexKVConfig, dp_client_id: int, ): - current_device_id = torch.cuda.current_device() + dp_client_id * flexkv_config.tp_size + current_device_id = torch.cuda.current_device() + dp_client_id * flexkv_config.model_config.tp_size self.flexkv_config = flexkv_config - logger.info(f"Start init FlexKVWorkerConnector to {flexkv_config.server_recv_port}, dp_client_id: {dp_client_id}") + logger.info(f"Start init FlexKVWorkerConnector to {flexkv_config.server_recv_port}, \ + dp_client_id: {dp_client_id}") self.tp_client = KVTPClient(flexkv_config.server_recv_port, dp_client_id, current_device_id) logger.info("Finish init FlexKVWorkerConnector") @@ -542,7 +528,7 @@ def register_to_server(self, kv_caches: dict[str, torch.Tensor]): logger.info("Start register kv_caches") gpu_blocks = list(kv_caches.values()) num_layer = len(kv_caches) - if self.flexkv_config.use_mla: + if self.flexkv_config.model_config.use_mla: assert gpu_blocks[0].ndim == 3, ( f"expect kv cached tensor has 3 dim but get shape={gpu_blocks[0].shape}.") num_blocks = gpu_blocks[0].shape[0] @@ -557,13 +543,13 @@ def 
register_to_server(self, kv_caches: dict[str, torch.Tensor]): num_kv_heads = gpu_blocks[0].shape[3] head_size = gpu_blocks[0].shape[4] gpu_layout = KVCacheLayout( - type=KVCacheLayoutType.LAYERWISE, + type=KVCacheLayoutType.LAYERFIRST, num_layer=num_layer, num_block=num_blocks, tokens_per_block=block_size, num_head=num_kv_heads, head_size=head_size, - is_mla=self.flexkv_config.use_mla, + is_mla=self.flexkv_config.model_config.use_mla, ) self.tp_client.register_to_server(gpu_blocks, gpu_layout) logger.info("Finish register kv_caches") diff --git a/flexkv/kvmanager.py b/flexkv/kvmanager.py index cac6355da7..cbf795648b 100644 --- a/flexkv/kvmanager.py +++ b/flexkv/kvmanager.py @@ -22,7 +22,7 @@ from flexkv.server.client import KVDPClient from flexkv.server.server import KVServer, DPClient from flexkv.kvtask import KVTaskEngine, KVResponse -from flexkv.common.config import ModelConfig, CacheConfig +from flexkv.common.config import ModelConfig, CacheConfig, GLOBAL_CONFIG_FROM_ENV from flexkv.common.debug import flexkv_logger @@ -30,16 +30,21 @@ class KVManager: def __init__(self, model_config: ModelConfig, cache_config: CacheConfig, - gpu_register_port: Optional[str] = None, - server_recv_port: Optional[str] = None, - dp_client_id: int = 0): + dp_client_id: int = 0, + server_recv_port: str = ""): flexkv_logger.info(f"{model_config = }") flexkv_logger.info(f"{cache_config = }") + flexkv_logger.info(f"{GLOBAL_CONFIG_FROM_ENV = }") self.model_config = model_config self.cache_config = cache_config - self.gpu_register_port = gpu_register_port if gpu_register_port is not None else "ipc:///tmp/flexkv_test_gpu_register" - self.server_recv_port = server_recv_port if server_recv_port is not None else "ipc:///tmp/flexkv_test_server" - self.server_client_mode = model_config.dp_size > 1 + + if server_recv_port != "": + self.server_recv_port = server_recv_port + else: + self.server_recv_port = GLOBAL_CONFIG_FROM_ENV.server_recv_port + self.gpu_register_port = self.server_recv_port + 
"_gpu_register" + + self.server_client_mode = model_config.dp_size > 1 or GLOBAL_CONFIG_FROM_ENV.server_client_mode self.dp_client_id = dp_client_id flexkv_logger.info(f"server_client_mode: {self.server_client_mode}") if self.server_client_mode: @@ -50,17 +55,17 @@ def __init__(self, # Example: inherit_env = False # to not inherit parent env self.server_handle = KVServer.create_server(model_config=model_config, cache_config=cache_config, - gpu_register_port=gpu_register_port, + gpu_register_port=self.gpu_register_port, server_recv_port=self.server_recv_port, inherit_env=False) - + else: self.server_handle = None self.dp_client = KVDPClient(self.server_recv_port, self.model_config, dp_client_id) else: self.server_handle = None - self.kv_task_engine = KVTaskEngine(model_config, cache_config, gpu_register_port) - + self.kv_task_engine = KVTaskEngine(model_config, cache_config, self.gpu_register_port) + @property def dpclient_id(self) -> int: return self.dp_client_id diff --git a/flexkv/kvtask.py b/flexkv/kvtask.py index c62ea8ed33..735e452a47 100644 --- a/flexkv/kvtask.py +++ b/flexkv/kvtask.py @@ -375,7 +375,9 @@ def __init__(self, gpu_register_port: Optional[str] = None, ): super().__init__(model_config, cache_config, gpu_register_port) - self.tracer = FlexKVTracer(cache_config) + self.tracer = FlexKVTracer() + # trace config + self.tracer.trace_config(model_config, cache_config, gpu_layout=None) def get_async(self, token_ids: np.ndarray, @@ -391,6 +393,16 @@ def get_async(self, layer_granularity=layer_granularity, dp_id=dp_id, task_id=task_id) + # trace get request + self.tracer.trace_request( + request_type="GET", + request_id=task_id, + token_ids=token_ids, + slot_mapping=slot_mapping, + token_mask=token_mask, + layer_granularity=layer_granularity, + dp_id=dp_id + ) self._launch_task(task_id) return task_id, return_mask @@ -406,6 +418,16 @@ def put_async(self, token_mask=token_mask, dp_id=dp_id, task_id=task_id) + # trace put request + self.tracer.trace_request( 
+ request_type="PUT", + request_id=task_id, + token_ids=token_ids, + slot_mapping=slot_mapping, + token_mask=token_mask, + layer_granularity=-1, # put has no layer_granularity parameter + dp_id=dp_id + ) self._launch_task(task_id) return task_id, return_mask @@ -464,6 +486,13 @@ def try_wait(self, task_ids: Union[int, List[int]]) -> Dict[int, KVResponse]: if isinstance(task_ids, int): task_ids = [task_ids] nvtx.mark(f"try_wait task_ids: {task_ids}") + # trace try_wait request + self.tracer.trace_wait_request( + wait_type="try_wait", + task_ids=task_ids, + timeout=None, # try_wait doesn't have explicit timeout + completely=False + ) return_responses = self._wait_impl(task_ids, completely=False, only_return_finished=True) @@ -476,6 +505,13 @@ def wait(self, if isinstance(task_ids, int): task_ids = [task_ids] nvtx.push_range(f"wait task_ids: {task_ids}", color=get_nvtx_default_color()) + # trace wait request + self.tracer.trace_wait_request( + wait_type="wait", + task_ids=task_ids, + timeout=timeout, + completely=completely + ) return_responses = self._wait_impl(task_ids, timeout, completely=completely) nvtx.pop_range() return return_responses @@ -489,13 +525,24 @@ def get_match(self, if token_mask is None: token_mask = np.ones_like(token_ids, dtype=bool) fake_slot_mapping = np.zeros_like(token_ids[token_mask]) - return self._get_match_impl(token_ids, - fake_slot_mapping, - is_fake_slot_mapping=True, - token_mask=token_mask, - layer_granularity=layer_granularity, - dp_id=dp_id, - task_id=task_id) + result_task_id, return_mask = self._get_match_impl(token_ids, + fake_slot_mapping, + is_fake_slot_mapping=True, + token_mask=token_mask, + layer_granularity=layer_granularity, + dp_id=dp_id, + task_id=task_id) + # trace get match request + self.tracer.trace_request( + request_type="GET_MATCH", + request_id=result_task_id, + token_ids=token_ids, + slot_mapping=fake_slot_mapping, + token_mask=token_mask, + layer_granularity=layer_granularity, + dp_id=dp_id + ) + return 
result_task_id, return_mask def _get_match_impl(self, token_ids: np.ndarray, @@ -529,12 +576,23 @@ def put_match(self, dp_id: int = 0, task_id: int = -1) -> Tuple[int, np.ndarray]: fake_slot_mapping = np.zeros_like(token_ids) - return self._put_match_impl(token_ids, - fake_slot_mapping, - is_fake_slot_mapping=True, - token_mask=token_mask, - dp_id=dp_id, - task_id=task_id) + result_task_id, return_mask = self._put_match_impl(token_ids, + fake_slot_mapping, + is_fake_slot_mapping=True, + token_mask=token_mask, + dp_id=dp_id, + task_id=task_id) + # trace put match request + self.tracer.trace_request( + request_type="PUT_MATCH", + request_id=result_task_id, + token_ids=token_ids, + slot_mapping=fake_slot_mapping, + token_mask=token_mask, + layer_granularity=-1, # put has no layer_granularity parameter + dp_id=dp_id + ) + return result_task_id, return_mask def _put_match_impl(self, token_ids: np.ndarray, @@ -562,6 +620,8 @@ def launch_tasks(self, task_ids: List[int], slot_mappings: List[np.ndarray]) -> None: assert isinstance(slot_mappings[0], np.ndarray) + # trace launch tasks + self.tracer.trace_launch_tasks(task_ids, slot_mappings) self.set_slot_mappings(task_ids, slot_mappings) for task_id in task_ids: self._launch_task(task_id) diff --git a/flexkv/server/server.py b/flexkv/server/server.py index 5e678b6a59..95b2abc101 100644 --- a/flexkv/server/server.py +++ b/flexkv/server/server.py @@ -174,14 +174,14 @@ def create_server(cls, # Set spawn method for CUDA compatibility with contextlib.suppress(RuntimeError): mp.set_start_method("spawn") - + # Prepare environment variables for child process if child_env is not None or not inherit_env: # Use subprocess for better environment control import subprocess import pickle import sys - + # Prepare environment if inherit_env: env = os.environ.copy() @@ -189,10 +189,10 @@ def create_server(cls, env.update(child_env) else: env = child_env or {} - + # Serialize arguments args_data = pickle.dumps((model_config, cache_config, 
gpu_register_port, server_recv_port)) - + # Start subprocess flexkv_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) server_script = textwrap.dedent(f''' @@ -209,7 +209,7 @@ def create_server(cls, process = subprocess.Popen([ sys.executable, '-c', server_script ], env=env) - + flexkv_logger.info(f"KVServer subprocess started, PID: {process.pid}") return KVServerHandle(process) else: @@ -389,7 +389,7 @@ def __del__(self) -> None: tokens_per_block = 4 gpu_kv_layout = KVCacheLayout( - type=KVCacheLayoutType.LAYERWISE, + type=KVCacheLayoutType.LAYERFIRST, num_layer=num_layers, num_block=num_gpu_blocks, tokens_per_block=tokens_per_block, diff --git a/flexkv/storage/allocator.py b/flexkv/storage/allocator.py index 73310f4cfa..479f8a4bcc 100644 --- a/flexkv/storage/allocator.py +++ b/flexkv/storage/allocator.py @@ -136,9 +136,9 @@ def allocate(cls, **kwargs: Any) -> StorageHandle: cache_dir = kwargs.get("cache_dir") file_prefix = kwargs.get("file_prefix", "flexkv_ssd_cache") - cfg_max_blocks_per_file = kwargs.get("max_blocks_per_file", -1) - if cfg_max_blocks_per_file == -1: - cfg_max_blocks_per_file = int(1e9) + cfg_max_file_size_gb = kwargs.get("max_file_size_gb", -1) + cfg_max_blocks_per_file = int(1e9) + if cache_dir is None: raise ValueError("cache_dir is required for SSD allocator") if isinstance(cache_dir, str): @@ -159,6 +159,10 @@ def allocate(cls, total_blocks_per_device = layout.num_block // num_ssd_devices block_size = layout.get_elements_per_block() * dtype.itemsize + if cfg_max_file_size_gb != -1: + cfg_max_blocks_per_file = int(cfg_max_file_size_gb * 1024 * 1024 * 1024 // block_size) + + fsys_max_blocks_per_file = cls.get_file_size_limit(cache_dir[0]) // block_size num_blocks_per_file = min(fsys_max_blocks_per_file, cfg_max_blocks_per_file) @@ -173,7 +177,10 @@ def allocate(cls, with open(file_path, "wb+", buffering=0) as file: cls._create_file(file, real_file_size) ssd_files[i].append(file_path) - + total_num_files = num_files_per_device 
* num_ssd_devices + real_total_size = total_num_files * real_file_size + flexkv_logger.info(f"SSD allocator create total {total_num_files} files in {cache_dir}, " + f"each file has {real_file_size/1024/1024/1024:.2f} GB, total size {real_total_size/1024/1024/1024:.2f} GB") return StorageHandle( handle_type=AccessHandleType.FILE, data=ssd_files, diff --git a/flexkv/storage/storage_engine.py b/flexkv/storage/storage_engine.py index 82d4073917..44f98e1356 100644 --- a/flexkv/storage/storage_engine.py +++ b/flexkv/storage/storage_engine.py @@ -4,7 +4,7 @@ import torch -from flexkv.common.config import ModelConfig, CacheConfig +from flexkv.common.config import ModelConfig, CacheConfig, GLOBAL_CONFIG_FROM_ENV from flexkv.common.memory_handle import TensorSharedHandle from flexkv.common.storage import StorageHandle, KVCacheLayout, KVCacheLayoutType from flexkv.common.transfer import DeviceType @@ -19,11 +19,9 @@ def __init__(self, self._storage_handles: Dict[Tuple[DeviceType, int], StorageHandle] = {} self._model_config = model_config self._cache_config = cache_config - if not self._cache_config.gpu_kv_layout_type == KVCacheLayoutType.LAYERWISE: - raise ValueError("Only layerwise layout is supported for GPU") if self._cache_config.enable_cpu: self._cpu_layout: Optional[KVCacheLayout] = KVCacheLayout( - type=self._cache_config.cpu_kv_layout_type, + type=GLOBAL_CONFIG_FROM_ENV.cpu_layout_type, num_layer=self._model_config.num_layers, num_block=self._cache_config.num_cpu_blocks, tokens_per_block=self._cache_config.tokens_per_block, @@ -37,10 +35,10 @@ def __init__(self, dtype=self._model_config.dtype, ) if self._cache_config.enable_ssd: - if not self._cache_config.ssd_kv_layout_type == self._cpu_layout.type: + if not GLOBAL_CONFIG_FROM_ENV.ssd_layout_type == self._cpu_layout.type: raise ValueError(f"SSD layout type must be the same as CPU layout type: {self._cpu_layout.type}") self._ssd_layout: Optional[KVCacheLayout] = KVCacheLayout( - 
type=self._cache_config.ssd_kv_layout_type, + type=GLOBAL_CONFIG_FROM_ENV.ssd_layout_type, num_layer=self._model_config.num_layers, num_block=self._cache_config.num_ssd_blocks, tokens_per_block=self._cache_config.tokens_per_block, @@ -53,13 +51,13 @@ def __init__(self, layout=self._ssd_layout, dtype=self._model_config.dtype, cache_dir=self._cache_config.ssd_cache_dir, - max_blocks_per_file=self._cache_config.max_blocks_per_file + max_file_size_gb=GLOBAL_CONFIG_FROM_ENV.max_file_size_gb ) if self._cache_config.enable_remote: - if not self._cache_config.remote_kv_layout_type == self._cpu_layout.type: + if not GLOBAL_CONFIG_FROM_ENV.remote_layout_type == self._cpu_layout.type: raise ValueError(f"Remote layout type must be the same as CPU layout type: {self._cpu_layout.type}") self._remote_layout: Optional[KVCacheLayout] = KVCacheLayout( - type=self._cache_config.remote_kv_layout_type, + type=GLOBAL_CONFIG_FROM_ENV.remote_layout_type, num_layer=self._model_config.num_layers, num_block=self._cache_config.num_remote_blocks, tokens_per_block=self._cache_config.tokens_per_block, @@ -76,11 +74,11 @@ def __init__(self, ) if self._cache_config.enable_gds: # GDS should follow similar constraints as CPU/SSD/Remote - if not self._cache_config.gds_kv_layout_type == self._cpu_layout.type: + if not GLOBAL_CONFIG_FROM_ENV.gds_layout_type == self._cpu_layout.type: raise ValueError(f"GDS layout type must be the same as CPU layout type: {self._cpu_layout.type}") - + self._gds_layout: Optional[KVCacheLayout] = KVCacheLayout( - type=self._cache_config.gds_kv_layout_type, + type=GLOBAL_CONFIG_FROM_ENV.gds_layout_type, num_layer=self._model_config.num_layers, num_block=self._cache_config.num_gds_blocks, tokens_per_block=self._cache_config.tokens_per_block, @@ -93,7 +91,7 @@ def __init__(self, layout=self._gds_layout, dtype=self._model_config.dtype, gds_cache_dir=self._cache_config.gds_cache_dir, - max_blocks_per_file=self._cache_config.max_blocks_per_file + 
max_file_size_gb=GLOBAL_CONFIG_FROM_ENV.max_file_size_gb ) def register_gpu_blocks(self, @@ -175,7 +173,7 @@ def allocate(self, ) elif device_type == DeviceType.SSD: cache_dir = kwargs.get('cache_dir') - max_blocks_per_file = kwargs.get('max_blocks_per_file', -1) + max_file_size_gb = kwargs.get('max_file_size_gb', -1) if raw_data is not None: assert isinstance(raw_data, str) or \ (isinstance(raw_data, list) and all(isinstance(x, str) for x in raw_data)), \ @@ -193,7 +191,7 @@ def allocate(self, dtype=dtype, cache_dir=cache_dir, file_prefix="flexkv_ssd_cache", - max_blocks_per_file=max_blocks_per_file + max_file_size_gb=max_file_size_gb ) elif device_type == DeviceType.REMOTE: file_path = kwargs.get('file_path') @@ -224,13 +222,13 @@ def allocate(self, ) elif device_type == DeviceType.GDS: gds_cache_dir = kwargs.get('gds_cache_dir') - max_blocks_per_file = kwargs.get('max_blocks_per_file', -1) - + max_file_size_gb = kwargs.get('max_file_size_gb', -1) + allocator = GDSAllocator( layout=layout, dtype=dtype, gds_cache_dir=gds_cache_dir, - max_blocks_per_file=max_blocks_per_file + max_file_size_gb=max_file_size_gb ) storage_handle = allocator.get_accessible_handle() else: diff --git a/flexkv/transfer/transfer_engine.py b/flexkv/transfer/transfer_engine.py index 08676183d9..1b7959fce9 100644 --- a/flexkv/transfer/transfer_engine.py +++ b/flexkv/transfer/transfer_engine.py @@ -38,7 +38,7 @@ GDSTransferWorker, tpGDSTransferWorker, ) -from flexkv.common.config import CacheConfig, ModelConfig +from flexkv.common.config import CacheConfig, ModelConfig, GLOBAL_CONFIG_FROM_ENV from flexkv.common.ring_buffer import SharedOpPool @@ -90,7 +90,7 @@ def __init__(self, self._remote_handle = remote_handle self._cache_config = cache_config - self.pin_buffer = SharedOpPool(2048, self.model_config.max_req_tokens // self.cache_config.tokens_per_block) + self.pin_buffer = SharedOpPool(2048, self.cache_config.num_cpu_blocks) self.op_id_to_nvtx_range: Dict[int, str] = {} @@ -118,10 +118,10 
@@ def _init_workers(self) -> None: cpu_kv_layout=self._cpu_handle.kv_layout, dtype=self.gpu_handles[i].dtype, gpu_device_id=i, - use_ce_transfer_h2d=self.cache_config.use_ce_transfer_h2d, - use_ce_transfer_d2h=self.cache_config.use_ce_transfer_d2h, - transfer_sms_h2d=self.cache_config.transfer_sms_h2d, - transfer_sms_d2h=self.cache_config.transfer_sms_d2h, + use_ce_transfer_h2d=GLOBAL_CONFIG_FROM_ENV.use_ce_transfer_h2d, + use_ce_transfer_d2h=GLOBAL_CONFIG_FROM_ENV.use_ce_transfer_d2h, + transfer_sms_h2d=GLOBAL_CONFIG_FROM_ENV.transfer_sms_h2d, + transfer_sms_d2h=GLOBAL_CONFIG_FROM_ENV.transfer_sms_d2h, ) for i in range(self.dp_size) ] @@ -140,10 +140,10 @@ def _init_workers(self) -> None: dtype=self.gpu_handles[i].dtype, tp_group_size=self.tp_size, dp_group_id=i, - use_ce_transfer_h2d=self.cache_config.use_ce_transfer_h2d, - use_ce_transfer_d2h=self.cache_config.use_ce_transfer_d2h, - transfer_sms_h2d=self.cache_config.transfer_sms_h2d, - transfer_sms_d2h=self.cache_config.transfer_sms_d2h, + use_ce_transfer_h2d=GLOBAL_CONFIG_FROM_ENV.use_ce_transfer_h2d, + use_ce_transfer_d2h=GLOBAL_CONFIG_FROM_ENV.use_ce_transfer_d2h, + transfer_sms_h2d=GLOBAL_CONFIG_FROM_ENV.transfer_sms_h2d, + transfer_sms_d2h=GLOBAL_CONFIG_FROM_ENV.transfer_sms_d2h, ) for i in range(self.dp_size) ] diff --git a/flexkv/transfer/worker.py b/flexkv/transfer/worker.py index 254e0b4e28..f01d8631af 100644 --- a/flexkv/transfer/worker.py +++ b/flexkv/transfer/worker.py @@ -23,7 +23,7 @@ from flexkv.common.storage import KVCacheLayout, KVCacheLayoutType from flexkv.common.transfer import TransferOp, TransferType, PartitionBlockType from flexkv.common.transfer import get_nvtx_range_color -from flexkv.common.config import CacheConfig +from flexkv.common.config import CacheConfig, GLOBAL_CONFIG_FROM_ENV try: from flexkv.c_ext import transfer_kv_blocks_remote @@ -596,8 +596,8 @@ def __init__(self, self.ssd_layer_stride_in_bytes = ssd_kv_layout_per_file.get_layer_stride() * self.dtype.itemsize try: - 
self.ioctx = c_ext.SSDIOCTX(ssd_files, len(ssd_files), cache_config.ssd_cache_iouring_entries, - cache_config.ssd_cache_iouring_flags) + self.ioctx = c_ext.SSDIOCTX(ssd_files, len(ssd_files), GLOBAL_CONFIG_FROM_ENV.iouring_entries, + GLOBAL_CONFIG_FROM_ENV.iouring_flags) except Exception as e: flexkv_logger.error(f"Error setting ssd ioctx: {e}\n") raise RuntimeError("SSD Worker init failed") from e @@ -1084,10 +1084,10 @@ def __init__( self.gds_kv_stride_in_bytes = gds_kv_layout.get_kv_stride() * self.dtype.itemsize self.gds_block_stride_in_bytes = gds_kv_layout.get_block_stride() * self.dtype.itemsize - if not gpu_kv_layout.type == KVCacheLayoutType.LAYERWISE: - raise ValueError("Only layerwise layout is supported for GPU") - if not gds_kv_layout.type == KVCacheLayoutType.LAYERWISE: - raise ValueError("Only layerwise layout is supported for GDS") + if not gpu_kv_layout.type == KVCacheLayoutType.LAYERFIRST: + raise ValueError("Only LAYERFIRST layout is supported for GPU") + if not gds_kv_layout.type == KVCacheLayoutType.LAYERFIRST: + raise ValueError("Only LAYERFIRST layout is supported for GDS") # Create TP GDS Transfer Thread Group self.tp_gds_transfer_thread_group = TPGDSTransferThreadGroup( diff --git a/pyproject.toml b/pyproject.toml index db54deba99..01021be59a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,8 @@ ignore = [ "SIM108", # Logging statement uses f-string "G004", + # Use X | None for type annotations. 
+ "UP045", ] [tool.mypy] diff --git a/tests/replay_from_tracer.py b/tests/replay_from_tracer.py index c4c3440f43..75f8c6c8e7 100644 --- a/tests/replay_from_tracer.py +++ b/tests/replay_from_tracer.py @@ -20,11 +20,14 @@ import time from typing import Dict, List, Optional, Any, Tuple import torch +import zmq from flexkv.common.config import CacheConfig, ModelConfig from flexkv.common.storage import KVCacheLayout, KVCacheLayoutType from flexkv.common.memory_handle import TensorSharedHandle from flexkv.kvtask import KVTaskEngine +from flexkv.server.request import RegisterTPClientRequest +from flexkv.server.utils import get_zmq_socket class FlexKVReplayEngine: @@ -79,8 +82,32 @@ def parse_config_event(self, event: Dict[str, Any]): data = event['data'] model_config_data = data['model_config'] cache_config_data = data['cache_config'] + global_config_data = data.get('global_config', {}) gpu_layout_data = data.get('gpu_layout') + # Restore GLOBAL_CONFIG_FROM_ENV from trace + if global_config_data: + self.log("Restoring global config from trace...") + from flexkv.common.config import GLOBAL_CONFIG_FROM_ENV + + # Restore layout types + if 'cpu_layout_type' in global_config_data: + GLOBAL_CONFIG_FROM_ENV.cpu_layout_type = self._parse_layout_type(global_config_data['cpu_layout_type']) + if 'ssd_layout_type' in global_config_data: + GLOBAL_CONFIG_FROM_ENV.ssd_layout_type = self._parse_layout_type(global_config_data['ssd_layout_type']) + if 'remote_layout_type' in global_config_data: + GLOBAL_CONFIG_FROM_ENV.remote_layout_type = self._parse_layout_type(global_config_data['remote_layout_type']) + if 'gds_layout_type' in global_config_data: + GLOBAL_CONFIG_FROM_ENV.gds_layout_type = self._parse_layout_type(global_config_data['gds_layout_type']) + + # Restore other configs + for key in ['server_client_mode', 'index_accel', 'use_ce_transfer_h2d', 'use_ce_transfer_d2h', + 'transfer_sms_h2d', 'transfer_sms_d2h', 'iouring_entries', 'iouring_flags', + 'max_file_size_gb', 'evict_ratio', 
'server_recv_port']: + if key in global_config_data: + setattr(GLOBAL_CONFIG_FROM_ENV, key, global_config_data[key]) + self.log(f" Restored {key} = {global_config_data[key]}") + # Recreate model_config dtype_str = model_config_data['dtype'] if dtype_str == "torch.float16": @@ -108,24 +135,19 @@ def parse_config_event(self, event: Dict[str, Any]): enable_cpu=cache_config_data['enable_cpu'], enable_ssd=cache_config_data['enable_ssd'], enable_remote=cache_config_data['enable_remote'], - gpu_kv_layout_type=self._parse_layout_type(cache_config_data['gpu_kv_layout_type']), - cpu_kv_layout_type=self._parse_layout_type(cache_config_data['cpu_kv_layout_type']), - ssd_kv_layout_type=self._parse_layout_type(cache_config_data['ssd_kv_layout_type']), - remote_kv_layout_type=self._parse_layout_type(cache_config_data['remote_kv_layout_type']), enable_gds=cache_config_data['enable_gds'], - remote_cache_size_mode=cache_config_data['remote_cache_size_mode'], num_cpu_blocks=cache_config_data['num_cpu_blocks'], num_ssd_blocks=cache_config_data['num_ssd_blocks'], + num_gds_blocks=cache_config_data['num_gds_blocks'], num_remote_blocks=cache_config_data['num_remote_blocks'], + ssd_cache_dir=cache_config_data['ssd_cache_dir'], + gds_cache_dir=cache_config_data['gds_cache_dir'], + remote_cache_size_mode=cache_config_data['remote_cache_size_mode'], remote_file_size=cache_config_data['remote_file_size'], remote_file_num=cache_config_data['remote_file_num'], remote_file_prefix=cache_config_data['remote_file_prefix'], - ssd_cache_dir=cache_config_data['ssd_cache_dir'], - ssd_cache_iouring_entries=cache_config_data['ssd_cache_iouring_entries'], - ssd_cache_iouring_flags=cache_config_data['ssd_cache_iouring_flags'], remote_cache_path=cache_config_data['remote_cache_path'], remote_config_custom=cache_config_data['remote_config_custom'], - enable_trace=False, # Disable trace for replay ) # Recreate gpu_layout if available @@ -139,8 +161,7 @@ def parse_config_event(self, event: Dict[str, Any]): 
head_size=8,#gpu_layout_data['head_size'], #for local test is_mla=gpu_layout_data['is_mla'], ) - - self.gpu_blocks_num = self.gpu_layout.num_block + self.gpu_blocks_num = self.gpu_layout.num_block self.log(f"Model config: {self.model_config}") self.log(f"Cache config loaded {self.cache_config}") @@ -149,12 +170,12 @@ def parse_config_event(self, event: Dict[str, Any]): def _parse_layout_type(self, layout_type_str: str) -> KVCacheLayoutType: """Parse layout type string to enum""" - if "LAYERWISE" in layout_type_str: - return KVCacheLayoutType.LAYERWISE - elif "BLOCKWISE" in layout_type_str: - return KVCacheLayoutType.BLOCKWISE + if "LAYERFIRST" in layout_type_str: + return KVCacheLayoutType.LAYERFIRST + elif "BLOCKFIRST" in layout_type_str: + return KVCacheLayoutType.BLOCKFIRST else: - return KVCacheLayoutType.LAYERWISE # default + return KVCacheLayoutType.LAYERFIRST # default def create_gpu_blocks(self): """Create GPU blocks for testing (similar to test code)""" @@ -186,6 +207,46 @@ def create_gpu_blocks(self): self.log(f"Created GPU blocks for {total_gpus} GPUs with {self.gpu_blocks_num} blocks each") + def register_gpu_blocks_to_kvmanager(self, gpu_register_port: str): + """Register GPU blocks to KVManager via socket""" + self.log("Registering GPU blocks via socket...") + + total_gpus = self.model_config.tp_size * self.model_config.dp_size + + # Create zmq socket to send GPU blocks + context = zmq.Context(2) + send_socket = get_zmq_socket( + context, zmq.SocketType.PUSH, gpu_register_port, False + ) + + # Register each GPU's blocks + for gpu_id in range(total_gpus): + # Convert torch tensors to TensorSharedHandle + handles = [] + for layer_tensor in self.gpu_blocks[gpu_id]: + handle = TensorSharedHandle(layer_tensor, gpu_id) + handles.append(handle) + + # Create registration request + register_req = RegisterTPClientRequest( + dp_client_id=gpu_id // self.model_config.tp_size, # DP client ID + device_id=gpu_id, + handles=handles, + gpu_layout=self.gpu_layout + ) + 
+ # Send registration request + send_socket.send_pyobj(register_req) + self.log(f"Registered GPU {gpu_id} blocks") + + # Wait a bit to ensure all registration requests are sent + time.sleep(0.1) + + # Close socket + send_socket.close() + context.term() + self.log("GPU blocks registration completed") + def create_kvmanager(self,): """Create and initialize KVManager""" self.log("Creating KVManager...") @@ -193,7 +254,7 @@ def create_kvmanager(self,): if not self.gpu_layout: # Create default GPU layout if not provided in trace self.gpu_layout = KVCacheLayout( - type=KVCacheLayoutType.LAYERWISE, + type=KVCacheLayoutType.LAYERFIRST, num_layer=self.model_config.num_layers, num_block=self.gpu_blocks_num, # default number of blocks tokens_per_block=self.cache_config.tokens_per_block, @@ -202,30 +263,42 @@ def create_kvmanager(self,): is_mla=self.model_config.use_mla ) - # Create KVManager + # Create KVTaskEngine with gpu_register_port + import tempfile + gpu_register_port = f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}" + self.kvmanager = KVTaskEngine( model_config=self.model_config, cache_config=self.cache_config, - gpu_layout=self.gpu_layout, - gpu_blocks=self.gpu_blocks + gpu_register_port=gpu_register_port ) - # Start KVManager - if self.kvmanager.is_ready(): - self.kvmanager.start() - self.log("KVManager started successfully") - else: - raise RuntimeError("KVManager is not ready") + # Start KVManager first so it can listen for registration requests + self.kvmanager.start() + + # Register GPU blocks via socket after KVManager is started + self.register_gpu_blocks_to_kvmanager(gpu_register_port) + + # Wait for KVManager to be ready + max_wait_time = 30 # seconds + start_time = time.time() + while not self.kvmanager.is_ready(): + if time.time() - start_time > max_wait_time: + raise RuntimeError("KVManager failed to become ready within timeout") + time.sleep(0.1) + + self.log("KVManager started successfully") def replay_request_event(self, event: Dict[str, 
Any]) -> int: - """Replay a request event (GET or PUT)""" + """Replay a request event (GET, PUT, GET_MATCH, PUT_MATCH)""" data = event['data'] request_type = data['request_type'] - # Convert lists back to tensors - token_ids = torch.tensor(data['token_ids'], dtype=torch.long) - slot_mapping = torch.tensor(data['slot_mapping'], dtype=torch.long) - token_mask = torch.tensor(data['token_mask'], dtype=torch.bool) if data['token_mask'] else None + # Convert lists back to numpy arrays (KVTaskEngine uses numpy, not torch) + import numpy as np + token_ids = np.array(data['token_ids'], dtype=np.int64) + slot_mapping = np.array(data['slot_mapping'], dtype=np.int64) + token_mask = np.array(data['token_mask'], dtype=bool) if data['token_mask'] else None layer_granularity = data.get('layer_granularity', -1) dp_id = data.get('dp_id', 0) @@ -234,8 +307,9 @@ def replay_request_event(self, event: Dict[str, Any]) -> int: if request_type == "GET": print(f"🔍🔍🔍GET token_ids: {token_ids[:128]}") print(f"request_id: {data['request_id']}, request_type: {request_type}, " - f"input length: {len(token_ids)}, true in mask: {token_mask.sum()}") - task_id = self.kvmanager.get_async( + f"input length: {len(token_ids)}, true in mask: {token_mask.sum() if token_mask is not None else 'N/A'}") + # get_async return (task_id, return_mask) + task_id, return_mask = self.kvmanager.get_async( token_ids=token_ids, slot_mapping=slot_mapping, token_mask=token_mask, @@ -245,45 +319,108 @@ def replay_request_event(self, event: Dict[str, Any]) -> int: elif request_type == "PUT": print(f"✅✅✅PUT token_ids: {token_ids[:128]}") print(f"request_id: {data['request_id']}, request_type: {request_type}, " - f"input length: {len(token_ids)}, true in mask: {token_mask.sum()}") - task_id = self.kvmanager.put_async( + f"input length: {len(token_ids)}, true in mask: {token_mask.sum() if token_mask is not None else 'N/A'}") + # put_async return (task_id, return_mask) + task_id, return_mask = self.kvmanager.put_async( 
token_ids=token_ids, slot_mapping=slot_mapping, token_mask=token_mask, dp_id=dp_id ) + elif request_type == "GET_MATCH": + print(f"🔍📝GET_MATCH token_ids: {token_ids[:128]}") + print(f"request_id: {data['request_id']}, request_type: {request_type}, " + f"input length: {len(token_ids)}, true in mask: {token_mask.sum() if token_mask is not None else 'N/A'}") + # get_match return (task_id, return_mask) + task_id, return_mask = self.kvmanager.get_match( + token_ids=token_ids, + token_mask=token_mask, + layer_granularity=layer_granularity, + dp_id=dp_id + ) + elif request_type == "PUT_MATCH": + print(f"✅📝PUT_MATCH token_ids: {token_ids[:128]}") + print(f"request_id: {data['request_id']}, request_type: {request_type}, " + f"input length: {len(token_ids)}, true in mask: {token_mask.sum() if token_mask is not None else 'N/A'}") + # put_match return (task_id, return_mask) + task_id, return_mask = self.kvmanager.put_match( + token_ids=token_ids, + token_mask=token_mask, + dp_id=dp_id + ) else: raise ValueError(f"Unknown request type: {request_type}") return task_id + def replay_launch_tasks_event(self, event: Dict[str, Any]): + """Replay a launch_tasks event""" + data = event['data'] + task_ids = data['task_ids'] + slot_mappings_list = data['slot_mappings'] + + self.log(f"🚀🚀🚀Replaying launch_tasks for task_ids: {task_ids}") + + try: + # Convert lists back to numpy arrays + import numpy as np + slot_mappings = [np.array(sm, dtype=np.int64) for sm in slot_mappings_list] + + print(f"Launching {len(task_ids)} tasks with slot_mappings") + + # Call launch_tasks + self.kvmanager.launch_tasks(task_ids, slot_mappings) + + self.log(f"launch_tasks completed successfully for {len(task_ids)} tasks") + + except Exception as e: + self.log(f"Warning: launch_tasks operation failed: {e}") + import traceback + traceback.print_exc() + def replay_wait_event(self, event: Dict[str, Any]): """Replay a wait event""" data = event['data'] wait_type = data['wait_type'] task_ids = data['task_ids'] + 
timeout = data.get('timeout', 20.0) # default timeout + completely = data.get('completely', False) # default completely layer_group_id = data.get('layer_group_id') - self.log(f"⏰⏰⏰Replaying {wait_type} for task_ids: {task_ids}") + self.log(f"⏰⏰⏰Replaying {wait_type} for task_ids: {task_ids}, timeout: {timeout}, completely: {completely}") try: + # wait and try_wait return Dict[int, KVResponse] if wait_type == "wait": - result = self.kvmanager.wait(task_ids) - elif wait_type == "wait_for_graph_finished": - result = self.kvmanager.wait_for_graph_finished(task_ids) + result = self.kvmanager.wait(task_ids, timeout=timeout, completely=completely) elif wait_type == "try_wait": result = self.kvmanager.try_wait(task_ids) else: raise ValueError(f"Unknown wait type: {wait_type}") + + # process result: result is Dict[int, KVResponse] successed_elements = [] + statuses = [] for task_id in task_ids: - successed_elements.append(result[task_id].sum().item()) - print(f"wait result: task ids: {task_ids}, successed elements num: {successed_elements}") + if task_id in result: + # return_mask in KVResponse may be None + if result[task_id].return_mask is not None: + successed_elements.append(result[task_id].return_mask.sum()) + else: + successed_elements.append(0) + statuses.append(result[task_id].status.name if hasattr(result[task_id], 'status') else "SUCCESS") + else: + successed_elements.append(0) + statuses.append("NOT_FOUND") + + print(f"✅ {wait_type} result: task_ids={task_ids}, successed_elements={successed_elements}, statuses={statuses}") self.log(f"Wait completed successfully for {wait_type}") return result except Exception as e: self.log(f"Warning: Wait operation failed: {e}") + import traceback + traceback.print_exc() return None def replay_all_events(self): @@ -293,8 +430,10 @@ def replay_all_events(self): config_events = [e for e in self.events if e['event_type'] == 'config'] request_events = [e for e in self.events if e['event_type'] == 'request'] wait_events = [e for e in 
self.events if e['event_type'] == 'wait'] + launch_tasks_events = [e for e in self.events if e['event_type'] == 'launch_tasks'] - self.log(f"Found {len(config_events)} config, {len(request_events)} request, {len(wait_events)} wait events") + self.log(f"Found {len(config_events)} config, {len(request_events)} request, " + f"{len(wait_events)} wait, {len(launch_tasks_events)} launch_tasks events") # Parse configuration first if config_events: @@ -307,7 +446,7 @@ def replay_all_events(self): self.create_gpu_blocks() self.create_kvmanager() # Replay all non-config events in timestamp order - other_events = request_events + wait_events + other_events = request_events + wait_events + launch_tasks_events other_events.sort(key=lambda e: e['timestamp']) request_id_mapping = {} # Map original request_id to replayed task_id @@ -320,6 +459,22 @@ def replay_all_events(self): request_id_mapping[original_request_id] = replayed_task_id self.log(f"Mapped original request_id {original_request_id} to task_id {replayed_task_id}") + elif event_type == 'launch_tasks': + # Map original task_ids to replayed task_ids + original_task_ids = event['data']['task_ids'] + mapped_task_ids = [] + for orig_id in original_task_ids: + if orig_id in request_id_mapping: + mapped_task_ids.append(request_id_mapping[orig_id]) + else: + self.log(f"Warning: Cannot find mapping for task_id {orig_id}") + mapped_task_ids.append(orig_id) # Use original if not found + + # Update event data with mapped task_ids + event['data']['task_ids'] = mapped_task_ids + self.replay_launch_tasks_event(event) + print("launch_tasks done") + elif event_type == 'wait': # Map original task_ids to replayed task_ids original_task_ids = event['data']['task_ids'] diff --git a/tests/test_kvmanager.py b/tests/test_kvmanager.py index 473cf0741c..b1364c8932 100644 --- a/tests/test_kvmanager.py +++ b/tests/test_kvmanager.py @@ -7,7 +7,7 @@ import multiprocessing as mp from multiprocessing import Process, Pipe -from flexkv.common.config 
import ModelConfig, CacheConfig +from flexkv.common.config import ModelConfig, CacheConfig, GLOBAL_CONFIG_FROM_ENV from flexkv.common.storage import KVCacheLayout, KVCacheLayoutType from flexkv.common.request import KVResponseStatus from flexkv.kvtask import KVTaskEngine @@ -24,12 +24,12 @@ create_gpu_kv_layout, GPUKVCacheVerifier ) -def run_tp_client(dp_client_id, - tp_rank, - server_recv_port, - model_config, - cache_config, - num_gpu_blocks, +def run_tp_client(dp_client_id, + tp_rank, + server_recv_port, + model_config, + cache_config, + num_gpu_blocks, child_conn, gpu_layout_type): """Run tp_client process""" @@ -96,28 +96,21 @@ def shutdown_tp_client(tp_client_processes): {'tp_size': 4, 'dp_size': 1, 'use_mla': True}, ], indirect=True) @pytest.mark.parametrize("cache_config", [ - {'enable_cpu': True, 'enable_ssd': False, 'enable_remote': False, 'num_cpu_blocks': 1024}, - {'enable_cpu': True, 'enable_ssd': True, 'enable_remote': False,}, - {'enable_cpu': True, 'enable_ssd': True, 'enable_remote': False, 'ssd_cache_iouring_entries': 512}, - {'enable_cpu': True, 'enable_ssd': True, 'enable_remote': True, 'num_ssd_blocks': 256, 'num_remote_blocks': 512}, - {'enable_cpu': True, 'enable_ssd': True, 'enable_remote': True, - 'num_ssd_blocks': 256, 'num_remote_blocks': 512, 'ssd_cache_iouring_entries': 512}, + {'enable_cpu': True, 'enable_ssd': False, 'num_cpu_blocks': 1024}, + {'enable_cpu': True, 'enable_ssd': True, 'num_cpu_blocks': 1024, 'num_ssd_blocks': 2048}, # GDS test configs - {'enable_cpu': True, 'enable_gds': True, 'enable_ssd': False, 'enable_remote': False, 'num_gds_blocks': 512, 'gds_cache_dir': ["./gdstest"]}, + {'enable_cpu': True, 'enable_gds': True, 'enable_ssd': False, \ + 'num_gds_blocks': 512, 'gds_cache_dir': ["./gdstest"]}, ], indirect=True) @pytest.mark.parametrize("test_config", [ {'num_gpu_blocks': 512, 'requests_per_block': 16, 'initial_write_ratio': 0.4}, ], indirect=True) -@pytest.mark.parametrize("flex_kv_layout_type", [ - 
KVCacheLayoutType.LAYERWISE, - KVCacheLayoutType.BLOCKWISE, -]) @pytest.mark.parametrize("gpu_layout_type", [ 0, 1, 2, ]) -def test_kvmanager(model_config, cache_config, test_config, flex_kv_layout_type, gpu_layout_type): +def test_kvmanager(model_config, cache_config, test_config, gpu_layout_type): tp_size = model_config.tp_size dp_size = model_config.dp_size @@ -131,11 +124,6 @@ def test_kvmanager(model_config, cache_config, test_config, flex_kv_layout_type, enable_remote = cache_config.enable_remote enable_gds = cache_config.enable_gds - cache_config.cpu_kv_layout_type = flex_kv_layout_type - cache_config.ssd_kv_layout_type = flex_kv_layout_type - cache_config.remote_kv_layout_type = flex_kv_layout_type - cache_config.gds_kv_layout_type = flex_kv_layout_type - num_gpu_blocks = test_config["num_gpu_blocks"] block_per_request = test_config['requests_per_block'] initial_write_ratio = test_config['initial_write_ratio'] @@ -155,10 +143,7 @@ def test_kvmanager(model_config, cache_config, test_config, flex_kv_layout_type, #note that for now only dp_size=1 is supported pytest.skip("skip because server-client mode is not ready for dp_size > 1") - import uuid - gpu_register_port = f"ipc:///tmp/flexkv_gpu_{uuid.uuid4().hex[:8]}" - server_recv_port = f"ipc:///tmp/flexkv_srv_{uuid.uuid4().hex[:8]}" - kvmanager = KVManager(model_config, cache_config, gpu_register_port, server_recv_port) + kvmanager = KVManager(model_config, cache_config) kvmanager.start() # Create pipes for each tp_client to send GPU blocks back @@ -172,7 +157,8 @@ def test_kvmanager(model_config, cache_config, test_config, flex_kv_layout_type, tp_client_process = mp_ctx.Process( target=run_tp_client, - args=(0, tp_rank, gpu_register_port, model_config, cache_config, num_gpu_blocks + tp_rank, child_conn, gpu_layout_type), + args=(0, tp_rank, kvmanager.gpu_register_port, model_config, cache_config, \ + num_gpu_blocks + tp_rank, child_conn, gpu_layout_type), daemon=True ) 
tp_client_processes.append(tp_client_process) diff --git a/tests/test_utils.py b/tests/test_utils.py index 0297c0e5cb..b682d46ff1 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -31,27 +31,8 @@ 'enable_remote': False, 'num_cpu_blocks': 128, 'num_ssd_blocks': 512, - 'num_remote_blocks': 512, # Aligned with ssd_blocks - 'remote_cache_size_mode': "block_num", - 'remote_file_size': (1024*1024*1024), - 'remote_file_num': 16, - 'remote_file_prefix': "remote_cache", 'enable_gds': False, - 'enable_trace': False, 'ssd_cache_dir': ["./ssd_cache", "./ssd_cache2/"], - 'ssd_cache_iouring_entries': 512, - 'ssd_cache_iouring_flags': 1, - 'remote_cache_path': ["remote_cache1", "remote_cache2"], - 'remote_config_custom': { - "pcfs_fsid": "f_l91fz6", - "pcfs_port": 31, - "pcfs_ip": "172.21.16.177", - "pcfs_parent_nodeid": 144115188075855883 # Using transfer engine value for consistency - }, - 'use_ce_transfer_h2d': False, - 'use_ce_transfer_d2h': False, - 'transfer_sms_h2d': 8, - 'transfer_sms_d2h': 8, } DEFAULT_TEST_CONFIG = { @@ -121,9 +102,9 @@ def create_gpu_kv_layout(model_config, cache_config, num_gpu_blocks, gpu_layout_ tokens_per_block = cache_config.tokens_per_block if gpu_layout_type == 0 or gpu_layout_type == 2: - layout_type = KVCacheLayoutType.LAYERWISE + layout_type = KVCacheLayoutType.LAYERFIRST elif gpu_layout_type == 1: - layout_type = KVCacheLayoutType.BLOCKWISE + layout_type = KVCacheLayoutType.BLOCKFIRST else: raise ValueError(f"Invalid GPU layout type: {gpu_layout_type}") tpgroup_gpu_kv_layout = KVCacheLayout( @@ -151,7 +132,7 @@ def generate_gpu_blocks_with_ground_truth(model_config, cache_config, test_confi num_gpu_blocks = test_config["num_gpu_blocks"] tpgroup_gpu_kv_layout = KVCacheLayout( - type=KVCacheLayoutType.LAYERWISE, + type=KVCacheLayoutType.LAYERFIRST, num_layer=num_layers, num_block=num_gpu_blocks, tokens_per_block=tokens_per_block, @@ -242,220 +223,6 @@ def skip_if_no_cuda(): if torch.cuda.device_count() == 0: pytest.skip("No CUDA 
devices available") -# Server-Client mode support functions -class KVManagerServerClient: - """Server-Client wrapper for KVManager that manages server, tp_client, and dp_client processes""" - - def __init__(self, model_config, cache_config, gpu_kv_layout, gpu_blocks): - import tempfile - from flexkv.server.client import KVDPClient, KVTPClient - from flexkv.server.server import KVServer - - self.model_config = model_config - self.cache_config = cache_config - self.gpu_kv_layout = gpu_kv_layout - self.gpu_blocks = gpu_blocks - - # Create temporary IPC port for communication - self.server_recv_port = f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}" - - # Extract basic config parameters for server process - server_config = { - 'num_layers': model_config.num_layers, - 'num_kv_heads': model_config.num_kv_heads, - 'head_size': model_config.head_size, - 'use_mla': model_config.use_mla, - 'tp_size': model_config.tp_size, - 'dp_size': model_config.dp_size, - 'dtype': str(model_config.dtype), - 'tokens_per_block': cache_config.tokens_per_block, - 'enable_cpu': cache_config.enable_cpu, - 'enable_ssd': cache_config.enable_ssd, - 'enable_remote': cache_config.enable_remote, - 'num_cpu_blocks': cache_config.num_cpu_blocks, - 'num_ssd_blocks': cache_config.num_ssd_blocks, - 'ssd_cache_dir': cache_config.ssd_cache_dir if hasattr(cache_config, 'ssd_cache_dir') else ["./ssd_cache"], - } - - # Start server process - self.server_process = Process( - target=self._run_server, - args=(self.server_recv_port, server_config), - daemon=False - ) - self.server_process.start() - - # Wait for server to start - time.sleep(5) - - # Initialize dp_client - self.dp_client = KVDPClient(self.server_recv_port, model_config) - print("dp_client started") - - # Start tp_client processes - self.tp_client_processes = [] - for tp_rank in range(model_config.tp_size): - device_id = tp_rank + self.dp_client.dp_client_id * model_config.tp_size - # Extract only the necessary basic types for tp_client - 
tp_client_process = Process( - target=KVManagerServerClient._run_tp_client, - args=(self.dp_client.dp_client_id, tp_rank, device_id, self.server_recv_port, - model_config.num_layers, str(model_config.dtype), - list(gpu_kv_layout.kv_shape[1:]), model_config.use_mla), - daemon=True - ) - tp_client_process.start() - self.tp_client_processes.append(tp_client_process) - - # Wait for tp clients to register - time.sleep(5) - - self._server_client_mode = True - - def _run_server(self, server_recv_port, server_config): - """Run server process""" - from flexkv.server.server import KVServer - from flexkv.common.config import ModelConfig, CacheConfig - - # Recreate config objects from basic parameters - model_config = ModelConfig( - num_layers=server_config['num_layers'], - num_kv_heads=server_config['num_kv_heads'], - head_size=server_config['head_size'], - use_mla=server_config['use_mla'], - tp_size=server_config['tp_size'], - dp_size=server_config['dp_size'], - dtype=torch.float16 if server_config['dtype'] == 'torch.float16' else torch.float32 - ) - - cache_config = CacheConfig( - tokens_per_block=server_config['tokens_per_block'], - enable_cpu=server_config['enable_cpu'], - enable_ssd=server_config['enable_ssd'], - enable_remote=server_config['enable_remote'], - num_cpu_blocks=server_config['num_cpu_blocks'], - num_ssd_blocks=server_config['num_ssd_blocks'], - ssd_cache_dir=server_config['ssd_cache_dir'] - ) - print("starting server ... 
...") - kvserver = KVServer(model_config, cache_config, server_recv_port) - kvserver.run() - print("server started") - - @staticmethod - def _run_tp_client(dp_client_id, tp_rank, device_id, server_recv_port, num_layers, dtype_str, kv_shape, is_mla): - """Run tp_client process""" - from flexkv.server.client import KVTPClient - from flexkv.common.storage import KVCacheLayout, KVCacheLayoutType - - tp_client = KVTPClient(server_recv_port, dp_client_id, device_id) - # Convert dtype string back to torch dtype - if dtype_str == "torch.float16": - dtype = torch.float16 - elif dtype_str == "torch.float32": - dtype = torch.float32 - else: - dtype = torch.float16 # default - - # Create GPU blocks for this tp_rank in the tp_client process - gpu_blocks_for_tp = [] - for layer_id in range(num_layers): - gpu_blocks_for_tp.append( - torch.rand(size=tuple(kv_shape), dtype=dtype).cuda(device_id) - ) - - # Create a simple layout for registration - gpu_kv_layout = KVCacheLayout( - type=KVCacheLayoutType.LAYERWISE, - num_layer=num_layers, - num_block=kv_shape[1], # Assuming this is the block dimension - tokens_per_block=kv_shape[2], # Assuming this is the tokens_per_block dimension - num_head=kv_shape[3], # Assuming this is the num_head dimension - head_size=kv_shape[4], # Assuming this is the head_size dimension - is_mla=is_mla - ) - print("registering to server ... 
...") - tp_client.register_to_server(gpu_blocks_for_tp, gpu_kv_layout) - print("registered to server") - # Keep the process running - while True: - time.sleep(1) - - def is_ready(self): - """Check if the server-client system is ready""" - return self.server_process.is_alive() and all(p.is_alive() for p in self.tp_client_processes) - - def start(self): - """Start the server-client system (already started in __init__)""" - return True - - def put_async(self, token_ids, slot_mapping, dp_id): - """Put data to the server-client system""" - return self.dp_client.put_async(token_ids, slot_mapping, token_mask=None) - - def get_async(self, token_ids, slot_mapping, layer_granularity, dp_id): - """Get data from the server-client system""" - return self.dp_client.get_async(token_ids, slot_mapping, token_mask=None) - - def wait_for_graph_finished(self, request): - """Wait for graph to finish""" - masks = self.dp_client.wait(request) - time.sleep(0.2) - return masks - - def wait(self, request_ids): - """Wait for requests to complete""" - masks = self.dp_client.wait(request_ids) - return masks - - def shutdown(self): - """Shutdown all processes""" - print("Shutting down KVManagerServerClient...") - - # First, try to gracefully shutdown the server by sending a shutdown signal - try: - # Send a shutdown request to the server - self.dp_client.shutdown() - print("Sent shutdown request to server") - - # Wait a bit for graceful shutdown - time.sleep(3) - except Exception as e: - print(f"Error sending shutdown request: {e}") - - # Terminate tp_client processes - print("Terminating tp_client processes...") - for tp_process in self.tp_client_processes: - if tp_process.is_alive(): - tp_process.terminate() - tp_process.join(timeout=5) - if tp_process.is_alive(): - print(f"Force killing tp_client process {tp_process.pid}") - tp_process.kill() - tp_process.join(timeout=2) - - # Terminate server process - print("Terminating server process...") - if self.server_process.is_alive(): - 
self.server_process.terminate() - self.server_process.join(timeout=10) - if self.server_process.is_alive(): - print(f"Force killing server process {self.server_process.pid}") - self.server_process.kill() - self.server_process.join(timeout=5) - - # Clean up temporary file - import os - if hasattr(self, 'server_recv_port') and self.server_recv_port.startswith('ipc://'): - temp_file = self.server_recv_port[6:] # Remove 'ipc://' prefix - try: - if os.path.exists(temp_file): - os.unlink(temp_file) - print(f"Cleaned up temporary file: {temp_file}") - except Exception as e: - print(f"Error cleaning up temporary file: {e}") - - print("KVManagerServerClient shutdown complete") class GPUKVCacheVerifier: def __init__(self, @@ -537,10 +304,12 @@ def fill_gpu_blocks(self, token_ids, block_ids): actual_head_id) # GPU tensor dim:[kv_dim, num_block, tokens_per_block, num_head, head_size] if self.gpu_layout_type == 0: - # gpu_layout_type 0: [num_layer][kv_dim, num_block, tokens_per_block, num_head, head_size] + # gpu_layout_type 0: + # [num_layer][kv_dim, num_block, tokens_per_block, num_head, head_size] gpu_tensor[kv_id, block_id, :, head_id, :] = hash_value elif self.gpu_layout_type == 1: - # gpu_layout_type 1: [tp_id][0][num_block, num_layer, kv_dim, tokens_per_block, num_head, head_size] + # gpu_layout_type 1: + # [tp_id][0][num_block, num_layer, kv_dim, tokens_per_block, num_head, head_size] # Need to get the first (and only) tensor from the list gpu_tensor[block_id, layer_id, kv_id, :, head_id, :] = hash_value elif self.gpu_layout_type == 2: @@ -580,10 +349,12 @@ def verify_kv_blocks(self, token_ids, block_ids)->bool: token_ids[start_token_idx:end_token_idx], actual_head_id) if self.gpu_layout_type == 0: - # gpu_layout_type 0: [num_layer][kv_dim, num_block, tokens_per_block, num_head, head_size] + # gpu_layout_type 0: + # [num_layer][kv_dim, num_block, tokens_per_block, num_head, head_size] actual_values = gpu_tensor[kv_id, block_id, :, head_id, :] elif self.gpu_layout_type 
== 1: - # gpu_layout_type 1: [tp_id][0][num_block, num_layer, kv_dim, tokens_per_block, num_head, head_size] + # gpu_layout_type 1: + # [tp_id][0][num_block, num_layer, kv_dim, tokens_per_block, num_head, head_size] # Need to get the first (and only) tensor from the list actual_values = gpu_tensor[block_id, layer_id, kv_id, :, head_id, :] elif self.gpu_layout_type == 2: @@ -620,7 +391,7 @@ def gpu_blocks_worker_process(conn, model_config, cache_config, gpu_kv_layout): # Create GPU blocks in subprocess gpu_blocks = [] for layer_id in range(model_config.num_layers): - # LAYERWISE format: [kv_dim, num_block, tokens_per_block, num_head, head_size] + # LAYERFIRST format: [kv_dim, num_block, tokens_per_block, num_head, head_size] kv_dim = 2 if not model_config.use_mla else 1 gpu_tensor = torch.zeros( kv_dim, @@ -678,7 +449,7 @@ def example_usage_gpu_kv_cache_verifier(): # Create GPU KV layout gpu_kv_layout = KVCacheLayout( - type=KVCacheLayoutType.LAYERWISE, + type=KVCacheLayoutType.LAYERFIRST, num_layer=model_config.num_layers, num_block=64, # Assume 64 blocks tokens_per_block=cache_config.tokens_per_block, @@ -690,7 +461,7 @@ def example_usage_gpu_kv_cache_verifier(): # Create mock GPU blocks gpu_blocks = [] for layer_id in range(model_config.num_layers): - # LAYERWISE format: [kv_dim, num_block, tokens_per_block, num_head, head_size] + # LAYERFIRST format: [kv_dim, num_block, tokens_per_block, num_head, head_size] kv_dim = 2 if not model_config.use_mla else 1 gpu_tensor = torch.zeros( kv_dim,