-
-
Notifications
You must be signed in to change notification settings - Fork 11.6k
[P/D][V1] KV Connector API V1 #15960
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 19 commits
6a12481
34bea75
20ef2ac
430e402
300ddac
519dd3e
7e0695b
553f416
b18bd8f
55d1b5b
c50e620
b22fe38
da257aa
9751e0b
2b77bcd
5cbd434
4c6a93e
d8ec5a6
fcd2dc9
1f9c252
7350244
1586d58
e2ecc14
5accb53
31d807e
a73721a
00df670
4ebcc3e
da019df
90e8c53
8b3f606
0163070
de1e487
48c2eb2
e72e5e4
7833645
1881aa5
eca7a49
b0629bd
7766ca5
7b64acb
b1310fd
689379e
62e1421
5145566
20decdf
fc58dd5
25c9592
40e5d81
e64f745
74af233
7c31e29
7f57f3c
05349a5
8e1eadc
54e1491
9c4159c
1d8415d
406d6bf
3a24897
c6c4368
5dff6e9
4afa50e
09be260
3f7844d
d44f699
329f2e7
72041ca
f9f87f2
33f6e60
db28310
3701b5d
deb1323
be789bf
a3e5762
a03d707
f696000
1d85e63
521ed14
6709943
44ea156
8180101
c3a2cc6
5273e24
913325f
75c24d3
17a3618
b4bd117
01caf61
d8549cb
b362ef1
485b22e
78d523e
4c38138
7af6ce2
1ad993b
3a08dda
e49874d
dd7969a
e1f130e
611b782
f6b8bff
9609115
7ce3bd6
c3f38d7
6dfda44
81d008a
6d35884
79fe730
ad18a3b
edefdff
c1a1169
1b8ec0b
ff4b98f
17b61fb
ac0660d
ecfb4ea
abdddf0
8695d96
7b5ba2c
6be9cf9
5363ed0
247195d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,36 @@ | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
|
|
||
| from vllm import LLM, SamplingParams | ||
| from vllm.config import KVTransferConfig | ||
|
|
||
| # Read prompts from output.txt | ||
| prompts = [] | ||
| try: | ||
| with open("output.txt") as f: | ||
| for line in f: | ||
| prompts.append(line.strip()) | ||
| print(f"Loaded {len(prompts)} prompts from output.txt") | ||
| except FileNotFoundError: | ||
| print("Error: output.txt file not found") | ||
| exit(-1) | ||
|
|
||
| sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) | ||
|
|
||
| llm = LLM( | ||
| model="meta-llama/Llama-3.1-8B-Instruct", | ||
| enforce_eager=True, | ||
| gpu_memory_utilization=0.8, | ||
| kv_transfer_config=KVTransferConfig.from_cli( | ||
| '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both",' | ||
| '"kv_connector_extra_config": {"shared_storage_path": "local_storage"}}' | ||
| )) #, max_model_len=2048, max_num_batched_tokens=2048) | ||
|
|
||
| # 2ND generation (decode instance): prompts already extended by the prefill run | ||
| outputs = llm.generate(prompts, sampling_params) | ||
|
|
||
| new_prompts = [] | ||
|
||
| for output in outputs: | ||
| prompt = output.prompt | ||
| generated_text = output.outputs[0].text | ||
| new_prompts.append(prompt + generated_text) | ||
| print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,43 @@ | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
|
|
||
| from vllm import LLM, SamplingParams | ||
| from vllm.config import KVTransferConfig | ||
|
|
||
| context = "Hi " * 1000 | ||
| context2 = "Hey " * 500 | ||
| prompts = [ | ||
| context + "Hello, my name is", | ||
| context + "The capital of France is", | ||
| context2 + "Your name is", | ||
| context2 + "The capital of China is", | ||
| ] | ||
|
|
||
| sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) | ||
|
|
||
| llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", | ||
| enforce_eager=True, | ||
| gpu_memory_utilization=0.8, | ||
| kv_transfer_config=KVTransferConfig.from_cli( | ||
| '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both", ' | ||
| '"kv_connector_extra_config": ' | ||
| '{"shared_storage_path": "local_storage"}}') | ||
robertgshaw2-redhat marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| ) #, max_model_len=2048, max_num_batched_tokens=2048) | ||
|
|
||
| # 1ST generation (prefill instance) | ||
| outputs = llm.generate( | ||
| prompts, | ||
| sampling_params, | ||
| ) | ||
|
|
||
| new_prompts = [] | ||
| for output in outputs: | ||
| prompt = output.prompt | ||
| generated_text = output.outputs[0].text | ||
| new_prompts.append(prompt + generated_text) | ||
| print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") | ||
|
|
||
| # Write new_prompts to output.txt | ||
| with open("output.txt", "w") as f: | ||
| for prompt in new_prompts: | ||
| f.write(prompt + "\n") | ||
| print(f"Saved {len(new_prompts)} prompts to output.txt") | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| rm -rf local_storage/ | ||
| rm output.txt | ||
|
|
||
| VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 prefill_example.py | ||
| VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 decode_example.py |
| Original file line number | Diff line number | Diff line change | ||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -10,6 +10,8 @@ | |||||||||||||||||
| from vllm.attention import AttentionType | ||||||||||||||||||
| from vllm.attention.selector import backend_name_to_enum, get_attn_backend | ||||||||||||||||||
| from vllm.config import CacheConfig, get_current_vllm_config | ||||||||||||||||||
| from vllm.distributed import (get_kv_transfer_group, has_kv_transfer_group, | ||||||||||||||||||
| is_v1_kv_transfer_group) | ||||||||||||||||||
| from vllm.forward_context import ForwardContext, get_forward_context | ||||||||||||||||||
| from vllm.model_executor.layers.linear import UnquantizedLinearMethod | ||||||||||||||||||
| from vllm.model_executor.layers.quantization.base_config import ( | ||||||||||||||||||
|
|
@@ -179,6 +181,11 @@ def forward( | |||||||||||||||||
| context using | ||||||||||||||||||
| `vllm.forward_context.get_forward_context().attn_metadata`. | ||||||||||||||||||
| """ | ||||||||||||||||||
|
|
||||||||||||||||||
| # KVConnector: wait for the (possibly async) load of kvs from the connector | ||||||||||||||||||
| # into this layer's KV cache before running attention. | ||||||||||||||||||
|
||||||||||||||||||
| wait_for_kv_layer_from_connector(self.layer_name) | ||||||||||||||||||
|
|
||||||||||||||||||
| if self.calculate_kv_scales: | ||||||||||||||||||
| attn_metadata = get_forward_context().attn_metadata | ||||||||||||||||||
| if attn_metadata.enable_kv_scales_calculation: | ||||||||||||||||||
|
|
@@ -217,18 +224,24 @@ def forward( | |||||||||||||||||
| else: | ||||||||||||||||||
| torch.ops.vllm.unified_attention_with_output( | ||||||||||||||||||
| query, key, value, output, self.layer_name) | ||||||||||||||||||
| return output.view(-1, hidden_size) | ||||||||||||||||||
| output = output.view(-1, hidden_size) | ||||||||||||||||||
| else: | ||||||||||||||||||
| if self.use_direct_call: | ||||||||||||||||||
| forward_context = get_forward_context() | ||||||||||||||||||
| attn_metadata = forward_context.attn_metadata | ||||||||||||||||||
| self_kv_cache = self.kv_cache[forward_context.virtual_engine] | ||||||||||||||||||
| return self.impl.forward(self, query, key, value, | ||||||||||||||||||
| self_kv_cache, attn_metadata) | ||||||||||||||||||
| output = self.impl.forward(self, query, key, value, | ||||||||||||||||||
| self_kv_cache, attn_metadata) | ||||||||||||||||||
| else: | ||||||||||||||||||
| return torch.ops.vllm.unified_attention( | ||||||||||||||||||
| output = torch.ops.vllm.unified_attention( | ||||||||||||||||||
| query, key, value, self.layer_name) | ||||||||||||||||||
|
|
||||||||||||||||||
| # KVConnector: start saving kvs to the connector. | ||||||||||||||||||
| # NOTE: forward_context completion will block until | ||||||||||||||||||
| # this operation is completed. | ||||||||||||||||||
| maybe_save_kv_layer_to_connector(self.layer_name, self.kv_cache) | ||||||||||||||||||
| return output | ||||||||||||||||||
|
|
||||||||||||||||||
| def calc_kv_scales(self, query, key, value): | ||||||||||||||||||
| self._q_scale.copy_(torch.abs(query).max() / self.q_range) | ||||||||||||||||||
| self._k_scale.copy_(torch.abs(key).max() / self.k_range) | ||||||||||||||||||
|
|
@@ -329,6 +342,38 @@ def forward( | |||||||||||||||||
| return out.reshape(bsz, q_len, -1) | ||||||||||||||||||
|
|
||||||||||||||||||
|
|
||||||||||||||||||
| def wait_for_kv_layer_from_connector(layer_name: str): | ||||||||||||||||||
| if not has_kv_transfer_group() or not is_v1_kv_transfer_group(): | ||||||||||||||||||
| return | ||||||||||||||||||
|
|
||||||||||||||||||
| connector = get_kv_transfer_group() | ||||||||||||||||||
|
|
||||||||||||||||||
| forward_context: ForwardContext = get_forward_context() | ||||||||||||||||||
| attn_metadata = forward_context.attn_metadata | ||||||||||||||||||
| if attn_metadata is None: | ||||||||||||||||||
| return | ||||||||||||||||||
robertgshaw2-redhat marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||||||||||
|
|
||||||||||||||||||
| connector.wait_for_layer_load(layer_name) | ||||||||||||||||||
|
|
||||||||||||||||||
|
|
||||||||||||||||||
| def maybe_save_kv_layer_to_connector( | ||||||||||||||||||
| layer_name: str, | ||||||||||||||||||
| kv_cache: List[torch.Tensor], | ||||||||||||||||||
| ): | ||||||||||||||||||
| if not has_kv_transfer_group() or not is_v1_kv_transfer_group(): | ||||||||||||||||||
| return | ||||||||||||||||||
|
|
||||||||||||||||||
| connector = get_kv_transfer_group() | ||||||||||||||||||
|
|
||||||||||||||||||
| forward_context: ForwardContext = get_forward_context() | ||||||||||||||||||
| attn_metadata = forward_context.attn_metadata | ||||||||||||||||||
| if attn_metadata is None: | ||||||||||||||||||
| return | ||||||||||||||||||
|
|
||||||||||||||||||
| kv_cache_layer = kv_cache[forward_context.virtual_engine] | ||||||||||||||||||
| connector.save_kv_layer(layer_name, kv_cache_layer, attn_metadata) | ||||||||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @ApostaC, @robertgshaw2-redhat It seems that this function is what prevents using the KVConnector without
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Did you run into an issue? I tried with
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Well, it's a silent error. This is what I see:
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @robertgshaw2-redhat Is it related to the "no_compile_layers" issue? I'll take a look at this
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In order to take advantage of piecewise CUDA graphs, ref: Lines 3389 to 3396 in e1a2c69
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm making progress on this but still stuck. |
||||||||||||||||||
|
|
||||||||||||||||||
|
|
||||||||||||||||||
| def unified_attention( | ||||||||||||||||||
| query: torch.Tensor, | ||||||||||||||||||
| key: torch.Tensor, | ||||||||||||||||||
|
|
||||||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # yapf: disable | ||
| from vllm.distributed.kv_transfer.kv_connector.v1.base import ( | ||
| KVConnectorBase_V1, KVConnectorRole) | ||
|
|
||
| # yapf: enable | ||
|
|
||
| __all__ = [ | ||
| "KVConnectorRole", | ||
| "KVConnectorBase_V1", | ||
| ] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@robertgshaw2-redhat Just realized there is a typo in this folder's name
disaggrated-prefill-v1->disaggregated-prefill-v1