From 6a12481ae9ebba7b59c113da2e31e32fb749e51b Mon Sep 17 00:00:00 2001 From: ApostaC Date: Wed, 2 Apr 2025 19:08:07 +0000 Subject: [PATCH 001/116] Fixing DCO issue and format checker issue Co-authored-by: KuntaiDu Co-authored-by: YaoJiayi <1200040070@link.cuhk.edu.cn> Signed-off-by: ApostaC --- .../disaggrated-prefill-v1/decode_example.py | 36 ++ .../disaggrated-prefill-v1/prefill_example.py | 42 ++ .../disaggrated-prefill-v1/run.sh | 4 + requirements/test.txt | 22 +- vllm/attention/layer.py | 31 +- .../kv_transfer/kv_connector/factory.py | 53 ++- .../kv_transfer/kv_connector/v1/__init__.py | 11 + .../kv_transfer/kv_connector/v1/base.py | 186 +++++++++ .../v1/shared_storage_connector.py | 358 ++++++++++++++++++ ...ransfer_agent.py => kv_connector_agent.py} | 17 +- vllm/distributed/parallel_state.py | 35 +- vllm/engine/arg_utils.py | 6 - vllm/forward_context.py | 17 + vllm/v1/core/kv_cache_manager.py | 14 +- vllm/v1/core/sched/output.py | 5 + vllm/v1/core/sched/scheduler.py | 29 +- vllm/v1/engine/core.py | 1 + vllm/v1/worker/gpu_model_runner.py | 8 + vllm/v1/worker/gpu_worker.py | 12 +- 19 files changed, 844 insertions(+), 43 deletions(-) create mode 100644 examples/offline_inference/disaggrated-prefill-v1/decode_example.py create mode 100644 examples/offline_inference/disaggrated-prefill-v1/prefill_example.py create mode 100644 examples/offline_inference/disaggrated-prefill-v1/run.sh create mode 100644 vllm/distributed/kv_transfer/kv_connector/v1/__init__.py create mode 100644 vllm/distributed/kv_transfer/kv_connector/v1/base.py create mode 100644 vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py rename vllm/distributed/kv_transfer/{kv_transfer_agent.py => kv_connector_agent.py} (79%) diff --git a/examples/offline_inference/disaggrated-prefill-v1/decode_example.py b/examples/offline_inference/disaggrated-prefill-v1/decode_example.py new file mode 100644 index 000000000000..57fa8395ab92 --- /dev/null +++ 
b/examples/offline_inference/disaggrated-prefill-v1/decode_example.py @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: Apache-2.0 + +from vllm import LLM, SamplingParams +from vllm.config import KVTransferConfig + +# Read prompts from output.txt +prompts = [] +try: + with open("output.txt") as f: + for line in f: + prompts.append(line.strip()) + print(f"Loaded {len(prompts)} prompts from output.txt") +except FileNotFoundError: + print("Error: output.txt file not found") + exit(-1) + +sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) + +llm = LLM( + model="meta-llama/llama-3.1-8b-instruct", + enforce_eager=True, + gpu_memory_utilization=0.8, + kv_transfer_config=KVTransferConfig.from_cli( + '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both",' + '"kv_connector_extra_config": {"shared_storage_path": "local_storage"}}' + )) #, max_model_len=2048, max_num_batched_tokens=2048) + +# 1ST generation (prefill instance) +outputs = llm.generate(prompts, sampling_params) + +new_prompts = [] +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + new_prompts.append(prompt + generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py b/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py new file mode 100644 index 000000000000..4456921c2fed --- /dev/null +++ b/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: Apache-2.0 + +from vllm import LLM, SamplingParams +from vllm.config import KVTransferConfig + +context = "Hi " * 1000 +context2 = "Hey " * 500 +prompts = [ + context + "Hello, my name is", + context + "The capital of France is", + context2 + "Your name is", + context2 + "The capital of China is", +] + +sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) + +llm = 
LLM(model="meta-llama/llama-3.1-8b-instruct", + enforce_eager=True, + gpu_memory_utilization=0.8, + kv_transfer_config=KVTransferConfig.from_cli( + '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both", ' + '"kv_extra_config": {"shared_storage_path": "local_storage"}}') + ) #, max_model_len=2048, max_num_batched_tokens=2048) + +# 1ST generation (prefill instance) +outputs = llm.generate( + prompts, + sampling_params, +) + +new_prompts = [] +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + new_prompts.append(prompt + generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +# Write new_prompts to output.txt +with open("output.txt", "w") as f: + for prompt in new_prompts: + f.write(prompt + "\n") +print(f"Saved {len(new_prompts)} prompts to output.txt") diff --git a/examples/offline_inference/disaggrated-prefill-v1/run.sh b/examples/offline_inference/disaggrated-prefill-v1/run.sh new file mode 100644 index 000000000000..694793b78be1 --- /dev/null +++ b/examples/offline_inference/disaggrated-prefill-v1/run.sh @@ -0,0 +1,4 @@ +find /tmp -iname "*attn.pt" 2>/dev/null | cut -d'/' -f1,2,3 | uniq | xargs rm -r + +VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=1 python3 prefill_example.py +VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=1 python3 decode_example.py \ No newline at end of file diff --git a/requirements/test.txt b/requirements/test.txt index 236b8be32805..8dde94f313c8 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -23,6 +23,10 @@ anyio==4.6.2.post1 # via httpx argcomplete==3.5.1 # via datamodel-code-generator +async-timeout==5.0.1 + # via + # aiohttp + # redis attrs==24.2.0 # via # aiohttp @@ -117,6 +121,10 @@ encodec==0.1.1 # via vocos evaluate==0.4.3 # via lm-eval +exceptiongroup==1.2.2 + # via + # anyio + # pytest fastparquet==2024.11.0 # via genai-perf fastrlock==0.8.2 @@ -556,9 +564,7 @@ sentence-transformers==3.2.1 sentencepiece==0.2.0 # via 
mistral-common setuptools==75.8.0 - # via - # pytablewriter - # torch + # via pytablewriter shellingham==1.5.4 # via typer six==1.16.0 @@ -605,6 +611,12 @@ timm==1.0.11 # via -r requirements/test.in tokenizers==0.21.0 # via transformers +toml==0.10.2 + # via datamodel-code-generator +tomli==2.2.1 + # via + # black + # pytest torch==2.6.0 # via # -r requirements/test.in @@ -670,12 +682,16 @@ typer==0.15.2 # via fastsafetensors typing-extensions==4.12.2 # via + # anyio + # black # huggingface-hub # librosa # mistral-common + # multidict # pqdm # pydantic # pydantic-core + # rich # torch # typer tzdata==2024.2 diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index dbf4723ee1bd..ca7d062d3d20 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -10,6 +10,7 @@ from vllm.attention import AttentionType from vllm.attention.selector import backend_name_to_enum, get_attn_backend from vllm.config import CacheConfig, get_current_vllm_config +from vllm.distributed import get_kv_transfer_group from vllm.forward_context import ForwardContext, get_forward_context from vllm.model_executor.layers.linear import UnquantizedLinearMethod from vllm.model_executor.layers.quantization.base_config import ( @@ -179,6 +180,7 @@ def forward( context using `vllm.forward_context.get_forward_context().attn_metadata`. 
""" + get_kv_transfer_group().wait_for_layer_load(self.layer_name) if self.calculate_kv_scales: attn_metadata = get_forward_context().attn_metadata if attn_metadata.enable_kv_scales_calculation: @@ -214,20 +216,26 @@ def forward( self_kv_cache, attn_metadata, output=output) + save_kv_layer_to_connector(self.layer_name, self.kv_cache) else: torch.ops.vllm.unified_attention_with_output( query, key, value, output, self.layer_name) + save_kv_layer_to_connector(self.layer_name, self.kv_cache) return output.view(-1, hidden_size) else: if self.use_direct_call: forward_context = get_forward_context() attn_metadata = forward_context.attn_metadata self_kv_cache = self.kv_cache[forward_context.virtual_engine] - return self.impl.forward(self, query, key, value, - self_kv_cache, attn_metadata) + output = self.impl.forward(self, query, key, value, + self_kv_cache, attn_metadata) + save_kv_layer_to_connector(self.layer_name, self.kv_cache) + return output else: - return torch.ops.vllm.unified_attention( + output = torch.ops.vllm.unified_attention( query, key, value, self.layer_name) + save_kv_layer_to_connector(self.layer_name, self.kv_cache) + return output def calc_kv_scales(self, query, key, value): self._q_scale.copy_(torch.abs(query).max() / self.q_range) @@ -329,6 +337,23 @@ def forward( return out.reshape(bsz, q_len, -1) +def save_kv_layer_to_connector( + layer_name: str, + kv_cache: List[torch.Tensor], +): + forward_context: ForwardContext = get_forward_context() + attn_metadata = forward_context.attn_metadata + if attn_metadata is None: + return + + connector = get_kv_transfer_group() + if connector is None: + return + + kv_cache_layer = kv_cache[forward_context.virtual_engine] + connector.save_kv_layer(layer_name, kv_cache_layer, attn_metadata) + + def unified_attention( query: torch.Tensor, key: torch.Tensor, diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index e37ce6dc75b0..fac30324471d 100644 
--- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -1,16 +1,28 @@ # SPDX-License-Identifier: Apache-2.0 import importlib -from typing import TYPE_CHECKING, Callable, Dict, Type +from typing import TYPE_CHECKING, Callable, Dict, Optional, Type, Union + +import vllm.envs as envs +# NOTE(Kuntai): We prefer not to directly the classes with "_V1" suffix. +# This makes it easier for us to deprecate code in v0 (which will happen soon). +# yapf: disable +from vllm.distributed.kv_transfer.kv_connector.v1 import (KVConnectorBase_V1, + KVConnectorRole) +# yapf: enable +from vllm.logger import init_logger from .base import KVConnectorBase if TYPE_CHECKING: from vllm.config import VllmConfig +logger = init_logger(__name__) + class KVConnectorFactory: - _registry: Dict[str, Callable[[], Type[KVConnectorBase]]] = {} + _registry: Dict[str, Callable[[], Type[Union[KVConnectorBase, + KVConnectorBase_V1]]]] = {} @classmethod def register_connector(cls, name: str, module_path: str, @@ -19,21 +31,41 @@ def register_connector(cls, name: str, module_path: str, if name in cls._registry: raise ValueError(f"Connector '{name}' is already registered.") - def loader() -> Type[KVConnectorBase]: + def loader() -> Type[Union[KVConnectorBase, KVConnectorBase_V1]]: module = importlib.import_module(module_path) return getattr(module, class_name) cls._registry[name] = loader @classmethod - def create_connector(cls, rank: int, local_rank: int, - config: "VllmConfig") -> KVConnectorBase: + def create_connector( + cls, rank: Optional[int], local_rank: Optional[int], + config: "VllmConfig", role: KVConnectorRole + ) -> Union[KVConnectorBase, KVConnectorBase_V1]: connector_name = config.kv_transfer_config.kv_connector if connector_name not in cls._registry: raise ValueError(f"Unsupported connector type: {connector_name}") - connector_cls = cls._registry[connector_name]() - return connector_cls(rank, local_rank, config) + if 
envs.VLLM_USE_V1: + # NOTE(Kuntai): v1 connector is explicitly separated into two roles. + # Scheduler connector: + # - Co-colate with scheduler process + # - Should only be used inside the Scheduler class + # Worker connector: + # - Co-locate with worker process + # - Should only be used inside the forward context & attention layer + # We build these two connectors separately to enforce strict + # separation + connector_cls_v1 = cls._registry[connector_name]() + assert issubclass(connector_cls_v1, KVConnectorBase_V1) + logger.info("Creating v1 connector with name: %s", connector_name) + return connector_cls_v1(rank, local_rank, config, role) + else: + assert rank is not None + assert local_rank is not None + connector_cls = cls._registry[connector_name]() + assert issubclass(connector_cls, KVConnectorBase) + return connector_cls(rank, local_rank, config) # Register various connectors here. @@ -57,4 +89,9 @@ def create_connector(cls, rank: int, local_rank: int, KVConnectorFactory.register_connector( "MooncakeStoreConnector", "vllm.distributed.kv_transfer.kv_connector.mooncake_store_connector", - "MooncakeStoreConnector") \ No newline at end of file + "MooncakeStoreConnector") + +KVConnectorFactory.register_connector( + "SharedStorageConnector", + "vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector", + "SharedStorageConnector") diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py b/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py new file mode 100644 index 000000000000..ddad33e27fc4 --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: Apache-2.0 +# yapf: disable +from vllm.distributed.kv_transfer.kv_connector.v1.base import ( + KVConnectorBase_V1, KVConnectorRole) + +# yapf: enable + +__all__ = [ + "KVConnectorRole", + "KVConnectorBase_V1", +] diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py 
b/vllm/distributed/kv_transfer/kv_connector/v1/base.py new file mode 100644 index 000000000000..efc3593140dd --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -0,0 +1,186 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +KVConnectorBase_V1 Class for Distributed KV Cache & Hidden State +communication in vLLM v1 + +The class provides the following primitives: +""" + +import enum +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import TYPE_CHECKING, Optional + +import torch + +from vllm.v1.core.sched.output import SchedulerOutput + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionMetadata + from vllm.config import VllmConfig + from vllm.forward_context import ForwardContext + from vllm.v1.core.kv_cache_manager import KVCacheManager + from vllm.v1.core.kv_cache_utils import KVCacheBlock + from vllm.v1.request import Request + + +class KVConnectorRole(enum.Enum): + # Connector running in the scheduler process + SCHEDULER = 0 + + # Connector running in the worker process + WORKER = 1 + + +@dataclass +class KVConnectorMetadata: + pass + + +class KVConnectorBase_V1(ABC): + + def __init__(self, rank: Optional[int], local_rank: Optional[int], + config: "VllmConfig", role: KVConnectorRole): + self._connector_metada = KVConnectorMetadata() + self._rank = rank + self._local_rank = local_rank + self._config = config + self._role = role + + @property + def role(self) -> KVConnectorRole: + return self._role + + def bind_connector_metadata( + self, connector_metadata: KVConnectorMetadata) -> None: + """Set the connector metadata from the scheduler. + + This function should be called by the model runner every time + before the model execution. The metadata will be used for runtime + KV cache loading and saving. + + Args: + connector_metadata (dict): the connector metadata. 
+ """ + self._connector_metada = connector_metadata + + def clear_connector_metadata(self) -> None: + """Clear the connector metadata. + + This function should be called by the model runner every time + after the model execution. + """ + self._connector_metada = KVConnectorMetadata() + + def _get_connector_metadata(self) -> KVConnectorMetadata: + """Get the connector metadata. + + This function should only be called inside the connector. + + Returns: + ConnectorMetadata: the connector metadata. + """ + return self._connector_metada + + # ============================== + # Worker-side methods + # ============================== + + @abstractmethod + def start_load_kv(self, forward_context: "ForwardContext", + **kwargs) -> None: + """Start loading the KV cache from the connector buffer to vLLM's + paged KV buffer. + + Args: + forward_context (ForwardContext): the forward context. + **kwargs: additional arguments for the load operation + + Note: + The number of elements in kv_caches and layer_names should be + the same. + + """ + pass + + @abstractmethod + def wait_for_layer_load(self, layer_name: str) -> None: + """Blocking until the KV for a specific layer is loaded into vLLM's + paged buffer. + + This interface will be useful for layer-by-layer pipelining. + + Args: + layer_name: the name of that layer + """ + pass + + @abstractmethod + def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, + attn_metadata: "AttentionMetadata", **kwargs) -> None: + """Start saving the a layer of KV cache from vLLM's paged buffer + to the connector. + + Args: + layer_name (str): the name of the layer. + kv_layer (torch.Tensor): the paged KV buffer of the current + layer in vLLM. + attn_metadata (AttentionMetadata): the attention metadata. + **kwargs: additional arguments for the save operation. + """ + pass + + @abstractmethod + def wait_for_save(self): + """Block until all the save operations is done. 
+ + This prevents vLLM overwrites the paged KV buffer before + saving is done. + """ + pass + + # ============================== + # Scheduler-side methods + # ============================== + @abstractmethod + def get_external_prefix_cache_blocks( + self, + request: "Request", + computed_blocks: list["KVCacheBlock"], + num_computed_tokens: int, + kv_cache_manager: "KVCacheManager", + ) -> list["KVCacheBlock"]: + """Get the external prefix cache blocks from the connector. + + This function may change the state of the connector, which will be + used by `attach_connector_meta` later. + + This function will also allocate/free the blocks dynamically when + there is remote cache hit. + + Args: + request (Request): the request object. + computed_blocks (list[KVCacheBlock]): the 'local' computed blocks. + num_computed_tokens (int): the number of 'local' computed tokens. + kv_cache_manager (KVCacheManager): the KV cache manager to + allocate/free the blocks if needed. + + Returns: + The updated list of the computed blocks (appended with the remote + cached blocks) + """ + pass + + @abstractmethod + def attach_connector_meta( + self, scheduler_output: SchedulerOutput) -> SchedulerOutput: + """Attach the connector metadata to the request object. + + This function should NOT modify other fields in the scheduler_output + except the `connector_metadata` field. + Also, calling this function will reset the state of the connector. + + Args: + scheduler_output (SchedulerOutput): the scheduler output object. 
+ """ + pass diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py new file mode 100644 index 000000000000..4ba80b2ef4d4 --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -0,0 +1,358 @@ +# SPDX-License-Identifier: Apache-2.0 +import hashlib +import os +from dataclasses import dataclass +from typing import TYPE_CHECKING, Optional + +import torch + +from vllm.config import VllmConfig +from vllm.distributed.kv_transfer.kv_connector.v1.base import ( + KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) +from vllm.logger import init_logger +from vllm.v1.core.sched.output import SchedulerOutput + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionMetadata + from vllm.forward_context import ForwardContext + from vllm.v1.core.kv_cache_manager import KVCacheManager + from vllm.v1.core.kv_cache_utils import KVCacheBlock + from vllm.v1.request import Request + +logger = init_logger(__name__) + + +@dataclass +class ReqMeta: + # Request tokens + token_ids: torch.Tensor + # Slot mappings, should have the same length as token_ids + slot_mapping: torch.Tensor + # Is store or load + is_store: bool + + ## Blocks allocated by the scheduler (no-longer needed) + #block_ids: torch.Tensor + + @staticmethod + def from_request(request: "Request", block_size: int, + is_store: bool) -> "ReqMeta": + valid_num_tokens = align_to_block_size(len(request.prompt_token_ids), + block_size) + token_ids = torch.tensor(request.prompt_token_ids)[:valid_num_tokens] + block_ids = torch.tensor(request.block_ids) + num_blocks = block_ids.shape[0] + block_offsets = torch.arange(0, block_size) + slot_mapping = block_offsets.reshape((1, block_size)) + \ + block_ids.reshape((num_blocks, 1)) * block_size + slot_mapping = slot_mapping.flatten()[:valid_num_tokens] + return ReqMeta( + token_ids=token_ids, + slot_mapping=slot_mapping, + 
is_store=is_store, + ) + + +@dataclass +class SharedStorageConnectorMetadata(KVConnectorMetadata): + requests: list[ReqMeta] + + def __init__(self): + self.requests = [] + + def add_request( + self, + request: "Request", + block_size: int, + is_store: bool, + ) -> None: + self.requests.append( + ReqMeta.from_request(request, block_size, is_store)) + + +class SharedStorageConnector(KVConnectorBase_V1): + # NOTE: This is just a simple debug implementation of the KV connector. + # It save / load the KV cache to / from the disk. + # It does extra work which will overwrite the existing prefix-cache in GPU + # - to remove the overhead, need to add some "mask" in the ReqMeta class + + def __init__(self, rank: Optional[int], local_rank: Optional[int], + config: "VllmConfig", role: KVConnectorRole): + super().__init__( + rank=rank, + local_rank=local_rank, + config=config, + role=role, + ) + self._block_size = config.cache_config.block_size + self._requests_need_load: list[str] = [] + self._storage_path = config.kv_transfer_config.get_from_extra_config( + "shared_storage_path", "/tmp") + logger.info(config.kv_transfer_config) + logger.info("Shared storage path is %s", self._storage_path) + + def start_load_kv(self, forward_context: "ForwardContext", + **kwargs) -> None: + """Start loading the KV cache from the connector buffer to vLLM's + paged KV buffer. + + Args: + forward_context (ForwardContext): the forward context. + **kwargs: additional arguments for the load operation + + Note: + The number of elements in kv_caches and layer_names should be + the same. + """ + + def inject_kv_into_layer( + dst_kv_cache_layer: torch.Tensor, + src_kv_cache: torch.Tensor, + slot_mapping: torch.Tensor, + ) -> None: + """Inject the KV cache into the layer. + + Args: + dst_kv_cache_layer (torch.Tensor): the destination KV cache + layer. In shape [2, num_pages, page_size, xxx]. + src_kv_cache (torch.Tensor): the source KV cache. In shape + [2, num_tokens, xxx]. 
+ slot_mapping (torch.Tensor): the slot mapping. In shape + [num_tokens]. + """ + dst_kv_cache_layer_shape = dst_kv_cache_layer.shape + num_pages = dst_kv_cache_layer_shape[1] + page_size = dst_kv_cache_layer_shape[2] + dst_kv_cache_layer = dst_kv_cache_layer.reshape( + 2, num_pages * page_size, -1) + dst_kv_cache_layer[:, slot_mapping, ...] = src_kv_cache + dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape) + + logger.info("Start loading KV cache from the connector") + # Get the metadata + metadata: KVConnectorMetadata = \ + self._get_connector_metadata() + assert isinstance(metadata, SharedStorageConnectorMetadata) + + if metadata is None: + logger.warning( + "In connector.start_load_kv, but the connector metadata is None" + ) + return + + attn_metadata = forward_context.attn_metadata + if attn_metadata is None: + logger.warning( + "In connector.start_load_kv, but the attn_metadata is None") + return + + # Load the KV for each layer + for layer_name in forward_context.no_compile_layers: + attn_layer = forward_context.no_compile_layers[layer_name] + kv_cache_layer = attn_layer.kv_cache[ + forward_context.virtual_engine] + + for request in metadata.requests: + if request.is_store: + continue + filename = self.generate_filename_debug( + layer_name, request.token_ids) + kv_cache = torch.load(filename).cuda( + ) # TODO: may need to handle the device here + inject_kv_into_layer(kv_cache_layer, kv_cache, + request.slot_mapping) + + def wait_for_layer_load(self, layer_name: str) -> None: + """Blocking until the KV for a specific layer is loaded into vLLM's + paged buffer. + + This interface will be useful for layer-by-layer pipelining. + + Args: + layer_name: the name of that layer + """ + return + + def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, + attn_metadata: "AttentionMetadata", **kwargs) -> None: + """Start saving the a layer of KV cache from vLLM's paged buffer + to the connector. + + Args: + layer_name (str): the name of the layer. 
+ kv_layer (torch.Tensor): the paged KV buffer of the current + layer in vLLM. + attn_metadata (AttentionMetadata): the attention metadata. + **kwargs: additional arguments for the save operation. + """ + + def extract_kv_cache_from_layer( + layer: torch.Tensor, + slot_mapping: torch.Tensor, + ) -> torch.Tensor: + """Extract the KV cache from the layer. + + Assume the shape of the layer is (2, num_pages, page_size, xxx). + """ + num_pages, page_size = layer.shape[1], layer.shape[2] + return layer.reshape(2, num_pages * page_size, -1)[:, slot_mapping, + ...] + + connector_metadata = self._get_connector_metadata() + assert isinstance(connector_metadata, SharedStorageConnectorMetadata) + for request in connector_metadata.requests: + if request.is_store: + filename = self.generate_filename_debug( + layer_name, request.token_ids) + kv_cache = extract_kv_cache_from_layer(kv_layer, + request.slot_mapping) + torch.save(kv_cache.cpu().detach(), filename) + + def wait_for_save(self): + return + + def get_external_prefix_cache_blocks( + self, + request: "Request", + computed_blocks: list["KVCacheBlock"], + num_computed_tokens: int, + kv_cache_manager: "KVCacheManager", + ) -> list["KVCacheBlock"]: + """Get the external prefix cache blocks from the connector. + + This function may change the state of the connector, which will be + used by `attach_connector_meta` later. + + Args: + request (Request): the request object. + computed_blocks (list[KVCacheBlock]): the 'local' computed blocks. + num_computed_tokens (int): the number of 'local' computed tokens. + kv_cache_manager (KVCacheManager): the KV cache manager to + allocate/free the blocks if needed. 
+ + Returns: + The updated list of the computed blocks (appended with the remote + cached blocks) + """ + # NOTE: in this debug implementation, we assume that the prompt is + # cached_prompt + newly_generated_single_token + # Therefore, we use prompt_token_ids[:-1] to determine the folder name + + # NOTE: in current v1 scheduler, the num_computed_tokens is aligned + # with the block granularity. And it expects the returned blocks and + # num_computed_tokens to also be aligned with the block granularity. + if not self.found_match_for_request(request): + return computed_blocks + + # Now, first num_tokens_to_check tokens are hit, we need to prepare + # the metadata for the worker connector to correctly load the KV + + logger.info("Hit the cache! Allocate new blocks!") + num_tokens_to_check = align_to_block_size( + len(request.prompt_token_ids) - 1, self._block_size) + need_to_allocate = num_tokens_to_check - num_computed_tokens + if need_to_allocate > 0: + # HACK: We don't want the scheduler see the blocks are allocated + # and associated with the current request. Instead, we want the + # scheduler find that the blocks are already allocated and they + # are associated with some other requests (i.e., the case of + # prefix caching. + + # HACK: KVCacheManager.allocate_slots will pre-allocate a few + # blocks, which will cause problems in the later allocations. + # We should make sure the pre allocation does not happen. + old_req_id = request.request_id + request.request_id = "temp-req-id-for-connector" + allocated_blocks = kv_cache_manager.allocate_slots( + request, need_to_allocate, computed_blocks, preallocate=False) + request.request_id = old_req_id + kv_cache_manager.req_to_blocks.pop("temp-req-id-for-connector") + kv_cache_manager.num_cached_block.pop("temp-req-id-for-connector") + + num_expected_blocks = need_to_allocate // self._block_size + if len(allocated_blocks) > num_expected_blocks: + logger.error("Detected pre-allocated blocks in the connector!" 
+ "This should not happen!") + allocated_blocks = allocated_blocks[:num_expected_blocks] + + self._requests_need_load.append(request.request_id) + return computed_blocks + allocated_blocks + else: + return computed_blocks + + def attach_connector_meta( + self, scheduler_output: SchedulerOutput) -> SchedulerOutput: + """Attach the connector metadata to the request object. + + This function should NOT modify other fields in the scheduler_output + except the `connector_metadata` field. + Also, calling this function will reset the state of the connector. + + Args: + scheduler_output (SchedulerOutput): the scheduler output object. + """ + meta = SharedStorageConnectorMetadata() + for request in scheduler_output.scheduled_new_reqs: + # T^T, why there is both req_id and request_id???? + if request.req_id in self._requests_need_load: + meta.add_request(request, self._block_size, is_store=False) + else: + # NOTE: here, we set the store and load being exclusive, + # but in LMCache use case, a single request can have both + # store and load status + if not self.found_match_for_request(request): + meta.add_request(request, self._block_size, is_store=True) + scheduler_output.connector_metadata = meta + + self._requests_need_load.clear() + return scheduler_output + + # ============================== + # Helper functions + # ============================== + + def found_match_for_request( + self, + request: "Request", + ) -> bool: + """Check if the cache is hit for the request. + """ + num_tokens_to_check = align_to_block_size( + len(request.prompt_token_ids) - 1, self._block_size) + foldername = self.generate_foldername_debug(torch.tensor( + request.prompt_token_ids)[:num_tokens_to_check], + create_folder=False) + return os.path.exists(foldername) + + def generate_foldername_debug( + self, + input_ids: torch.Tensor, + create_folder=False, + ) -> str: + """Generate a folder name based on the hash of the bytes of the input + ids. 
+ """ + input_ids_bytes = input_ids.numpy().tobytes() + input_ids_hash = hashlib.md5(input_ids_bytes).hexdigest() + foldername = os.path.join(self._storage_path, input_ids_hash) + if create_folder: + os.makedirs(foldername, exist_ok=True) + return foldername + + def generate_filename_debug( + self, + layer_name: str, + input_ids: torch.Tensor, + ) -> str: + """Generate a file name based on the layer name and the hash + of the bytes of the input ids. + """ + foldername = self.generate_foldername_debug(input_ids, + create_folder=True) + return os.path.join(foldername, f"{layer_name}.pt") + + +def align_to_block_size(num_tokens: int, block_size) -> int: + """Align the number of tokens to the block size. + """ + return (num_tokens - 1) // block_size * block_size diff --git a/vllm/distributed/kv_transfer/kv_transfer_agent.py b/vllm/distributed/kv_transfer/kv_connector_agent.py similarity index 79% rename from vllm/distributed/kv_transfer/kv_transfer_agent.py rename to vllm/distributed/kv_transfer/kv_connector_agent.py index 1e80e0bd7de8..a3b53d2184fe 100644 --- a/vllm/distributed/kv_transfer/kv_transfer_agent.py +++ b/vllm/distributed/kv_transfer/kv_connector_agent.py @@ -13,17 +13,25 @@ import torch +from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase from vllm.distributed.kv_transfer.kv_connector.factory import ( KVConnectorFactory) +# yapf: disable +from vllm.distributed.kv_transfer.kv_connector.v1 import ( + KVConnectorRole as KVConnectorRole_V1) +# yapf: enable from vllm.logger import init_logger from vllm.sequence import IntermediateTensors logger = init_logger(__name__) -class KVTransferAgent: +class KVConnectorAgent: """ A class designated for distributed KV transfer + + This class currently only wraps one KV connector. But in the future, it may + wrap multiple connectors to support more use cases. Target use cases: 1. Disaggregated prefill @@ -47,7 +55,7 @@ def __init__( "TransferAgent should only be used when kv_connector is set." 
self.connector = KVConnectorFactory.create_connector( - rank, local_rank, config) + rank, local_rank, config, KVConnectorRole_V1.WORKER) def send_kv_caches_and_hidden_states( self, @@ -57,12 +65,13 @@ def send_kv_caches_and_hidden_states( hidden_or_intermediate_states: Union[torch.Tensor, IntermediateTensors], ) -> None: - + assert isinstance(self.connector, KVConnectorBase) self.connector.send_kv_caches_and_hidden_states( model_executable, model_input, kv_caches, hidden_or_intermediate_states) def close(self) -> None: + assert isinstance(self.connector, KVConnectorBase) self.connector.close() def recv_kv_caches_and_hidden_states( @@ -71,6 +80,6 @@ def recv_kv_caches_and_hidden_states( kv_caches: List[torch.Tensor] ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool, "ModelInputForGPUWithSamplingMetadata"]: - + assert isinstance(self.connector, KVConnectorBase) return self.connector.recv_kv_caches_and_hidden_states( model_executable, model_input, kv_caches) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 514851694837..a5e42889d5f5 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -37,7 +37,6 @@ import torch.distributed from torch.distributed import Backend, ProcessGroup -import vllm.distributed.kv_transfer.kv_transfer_agent as kv_transfer import vllm.envs as envs from vllm.distributed.device_communicators.base_device_communicator import ( DeviceCommunicatorBase) @@ -48,6 +47,8 @@ if TYPE_CHECKING: from vllm.config import VllmConfig + from vllm.distributed.kv_transfer.kv_connector.kv_connector_agent import ( + KVConnectorAgent) @dataclass @@ -767,13 +768,13 @@ def get_pp_group() -> GroupCoordinator: # kept for backward compatibility get_pipeline_model_parallel_group = get_pp_group -_KV_TRANSFER: Optional[kv_transfer.KVTransferAgent] = None +_KV_CONNECTOR_AGENT: Optional["KVConnectorAgent"] = None -def get_kv_transfer_group() -> kv_transfer.KVTransferAgent: - assert 
_KV_TRANSFER is not None, ( +def get_kv_transfer_group() -> "KVConnectorAgent": + assert _KV_CONNECTOR_AGENT is not None, ( "disaggregated KV cache transfer parallel group is not initialized") - return _KV_TRANSFER + return _KV_CONNECTOR_AGENT @contextmanager @@ -962,19 +963,31 @@ def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None: Initialize KV cache transfer parallel group. """ - global _KV_TRANSFER + global _KV_CONNECTOR_AGENT if vllm_config.kv_transfer_config is None: return if all([ vllm_config.kv_transfer_config.is_kv_transfer_instance, - _KV_TRANSFER is None + _KV_CONNECTOR_AGENT is None ]): - _KV_TRANSFER = kv_transfer.KVTransferAgent( - rank=get_world_group().rank, - local_rank=get_world_group().local_rank, - config=vllm_config) + from vllm.distributed.kv_transfer.kv_connector.factory import ( + KVConnectorFactory) + from vllm.distributed.kv_transfer.kv_connector.v1 import ( + KVConnectorRole as KVConnectorRole_V1) + + kwargs = { + "rank": get_world_group().rank, + "local_rank": get_world_group().local_rank, + "config": vllm_config, + } + if envs.VLLM_USE_V1: + # NOTE(Kuntai): + # Parallel state is initialized in v1 worker, + # so this connector is for sure worker connector. + kwargs["role"] = KVConnectorRole_V1.WORKER + _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(**kwargs) def ensure_model_parallel_initialized( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 88723d9f5b74..39c992986ab3 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1487,12 +1487,6 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: recommend_to_remove=False) return False - # No Disaggregated Prefill so far. - if self.kv_transfer_config != EngineArgs.kv_transfer_config: - _raise_or_fallback(feature_name="--kv-transfer-config", - recommend_to_remove=False) - return False - # No FlashInfer or XFormers so far. 
V1_BACKENDS = [ "FLASH_ATTN_VLLM_V1", "FLASH_ATTN", "PALLAS", "PALLAS_VLLM_V1", diff --git a/vllm/forward_context.py b/vllm/forward_context.py index e195a03c5cac..83491249b180 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -11,6 +11,10 @@ import vllm.envs as envs from vllm.config import VllmConfig +from vllm.distributed import get_kv_transfer_group +# yapf: disable +from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1 +# yapf: enable from vllm.logger import init_logger if TYPE_CHECKING: @@ -40,6 +44,9 @@ class ForwardContext: virtual_engine: int # set dynamically for each forward pass # set dynamically for each forward pass dp_metadata: Optional[DPMetadata] = None + # KV cache connector + # NOTE(Kuntai): only v1 connector works with ForwardContext for now + kv_connector: Optional[KVConnectorBase_V1] = None _forward_context: Optional[ForwardContext] = None @@ -98,6 +105,11 @@ def set_forward_context(attn_metadata: Any, virtual_engine=virtual_engine, attn_metadata=attn_metadata, dp_metadata=dp_metadata) + + if attn_metadata is not None: + kv_connector = get_kv_transfer_group() + kv_connector.start_load_kv(_forward_context) + try: yield finally: @@ -133,4 +145,9 @@ def set_forward_context(attn_metadata: Any, logger.info(("Batchsize forward time stats " "(batchsize, count, median_time(ms)): %s"), forward_stats) + + # Waiting for the save operation to finish + if _forward_context.kv_connector is not None: + _forward_context.kv_connector.wait_for_save() + _forward_context = prev_context diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index c0f7715209d1..3500b8629aff 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -27,6 +27,7 @@ def __init__( caching_hash_algo: str = "builtin", num_preallocate_tokens: int = 64, log_stats: bool = False, + connector=None, ) -> None: assert len(kv_cache_config.kv_cache_groups) == 1, ( "KVCacheManager does not support hybrid 
models with more than 1 " @@ -80,6 +81,7 @@ def __init__( # data for reempted ones. self.num_cached_block: dict[str, int] = {} self.prefix_cache_stats = PrefixCacheStats() + self.connector = connector @property def usage(self) -> float: @@ -147,6 +149,10 @@ def get_computed_blocks( # Add back the last block hash if it was removed. block_hashes.append(last_block_hash) + computed_blocks = self.connector.get_external_prefix_cache_blocks( + request, computed_blocks, + len(computed_blocks) * self.block_size, self) + self.prefix_cache_stats.queries += len(block_hashes) self.prefix_cache_stats.hits += len(computed_blocks) @@ -163,7 +169,8 @@ def allocate_slots( self, request: Request, num_tokens: int, - new_computed_blocks: Optional[list[KVCacheBlock]] = None + new_computed_blocks: Optional[list[KVCacheBlock]] = None, + preallocate=True, ) -> Optional[list[KVCacheBlock]]: """Add slots for a request with new tokens to append. @@ -173,6 +180,7 @@ def allocate_slots( not include the tokens that have already been computed. new_computed_blocks: A list of new computed blocks just hitting the prefix caching. + preallocate: Whether to preallocate blocks for the request. Blocks layout: ----------------------------------------------------------------------- @@ -245,8 +253,10 @@ def allocate_slots( else: # Get new blocks from the free block pool considering # preallocated blocks. + num_preallocate_blocks =\ + self.num_preallocate_blocks if preallocate else 0 num_new_blocks = min( - num_new_blocks + self.num_preallocate_blocks, + num_new_blocks + num_preallocate_blocks, self.block_pool.get_num_free_blocks(), # Should not exceed the maximum number of blocks per request. 
# This is especially because the block table has the shape diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index dc0d2d59fea7..d7d2a9132106 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -9,6 +9,8 @@ import numpy as np import numpy.typing as npt + from vllm.distributed.kv_transfer.kv_connector.v1.base import ( + KVConnectorMetadata) from vllm.lora.request import LoRARequest from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange from vllm.sampling_params import SamplingParams @@ -121,3 +123,6 @@ class SchedulerOutput: structured_output_request_ids: dict[str, int] # the bitmask for the whole batch grammar_bitmask: Optional[npt.NDArray[np.int32]] + + # the connector metadata + connector_metadata: Optional[KVConnectorMetadata] = None diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index a0865c8fd845..eca7953a8d41 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -7,7 +7,8 @@ from collections.abc import Iterable from typing import Optional, Union -from vllm.config import CacheConfig, LoRAConfig, ModelConfig, SchedulerConfig +from vllm.config import (CacheConfig, LoRAConfig, ModelConfig, SchedulerConfig, + VllmConfig) from vllm.logger import init_logger from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager, @@ -33,6 +34,7 @@ class Scheduler(SchedulerInterface): def __init__( self, + vllm_config: VllmConfig, scheduler_config: SchedulerConfig, model_config: ModelConfig, cache_config: CacheConfig, @@ -43,6 +45,7 @@ def __init__( include_finished_set: bool = False, log_stats: bool = False, ) -> None: + self.vllm_config = vllm_config self.scheduler_config = scheduler_config self.cache_config = cache_config self.lora_config = lora_config @@ -62,13 +65,28 @@ def __init__( self.scheduler_config.max_num_batched_tokens self.max_model_len = 
self.scheduler_config.max_model_len + # create connector + from vllm.distributed.kv_transfer.kv_connector.factory import ( + KVConnectorFactory) + from vllm.distributed.kv_transfer.kv_connector.v1 import ( + KVConnectorRole as KVConnectorRole_V1) + self.connector = KVConnectorFactory.create_connector( + rank=None, + local_rank=None, + config=self.vllm_config, + role=KVConnectorRole_V1.SCHEDULER) + + num_gpu_blocks = cache_config.num_gpu_blocks + assert isinstance(num_gpu_blocks, int) and num_gpu_blocks > 0 + # Create the KV cache manager. self.kv_cache_manager = KVCacheManager( kv_cache_config=kv_cache_config, max_model_len=self.max_model_len, enable_caching=cache_config.enable_prefix_caching, caching_hash_algo=self.cache_config.prefix_caching_hash_algo, - log_stats=self.log_stats) + log_stats=self.log_stats, + connector=self.connector) self.block_size = self.cache_config.block_size # req_id -> Request @@ -416,6 +434,7 @@ def schedule(self) -> SchedulerOutput: resumed_from_preemption=False, ) for req in scheduled_running_reqs ] + scheduler_output = SchedulerOutput( scheduled_new_reqs=new_reqs_data, scheduled_cached_reqs=resumed_reqs_data + running_reqs_data, @@ -434,6 +453,12 @@ def schedule(self) -> SchedulerOutput: grammar_bitmask=grammar_bitmask, ) + # NOTE(Kuntai): this function is designed for multiple purposes: + # 1. Plan the KV cache store + # 2. Wrap up all the KV cache load / save ops into an opaque object + # 3. Clear the internal states of the connector + self.connector.attach_connector_meta(scheduler_output) + # Advance the number of computed tokens for the request AFTER # the request is scheduled. # 1. 
The scheduler_output of the current step has to include the diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 19c7799b59b7..5a8bc45c45db 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -93,6 +93,7 @@ def __init__( vllm_config.scheduler_config.scheduler_cls) self.scheduler: SchedulerInterface = Scheduler( + vllm_config=vllm_config, scheduler_config=vllm_config.scheduler_config, model_config=vllm_config.model_config, cache_config=vllm_config.cache_config, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 513806332efe..bc8358d00988 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -13,6 +13,7 @@ from vllm.attention import AttentionType, get_attn_backend from vllm.attention.layer import Attention from vllm.config import CompilationLevel, VllmConfig +from vllm.distributed import get_kv_transfer_group from vllm.distributed.parallel_state import get_pp_group, graph_capture from vllm.forward_context import set_forward_context from vllm.logger import init_logger @@ -1042,6 +1043,10 @@ def execute_model( for k, v in self.intermediate_tensors.items() }) + # Update the connector's state with the metadata in scheduler output. + get_kv_transfer_group().bind_connector_metadata( + scheduler_output.connector_metadata) + # Run the decoder. # Use persistent buffers for CUDA graphs. 
with set_forward_context(attn_metadata, self.vllm_config): @@ -1059,6 +1064,9 @@ def execute_model( sample_hidden_states = hidden_states[logits_indices] logits = self.model.compute_logits(sample_hidden_states, None) + # Clear connector's state + get_kv_transfer_group().clear_connector_metadata() + # Apply structured output bitmasks if present if scheduler_output.grammar_bitmask is not None: self.apply_grammar_bitmask(scheduler_output, logits) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 191443683fa0..d6d7725bf8e9 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -9,9 +9,10 @@ import torch.nn as nn import vllm.envs as envs -from vllm.config import ParallelConfig, VllmConfig +from vllm.config import VllmConfig from vllm.device_allocator.cumem import CuMemAllocator -from vllm.distributed import (ensure_model_parallel_initialized, +from vllm.distributed import (ensure_kv_transfer_initialized, + ensure_model_parallel_initialized, init_distributed_environment, set_custom_all_reduce) from vllm.distributed.parallel_state import get_pp_group @@ -110,7 +111,7 @@ def init_device(self): raise RuntimeError( f"Not support device type: {self.device_config.device}") # Initialize the distributed environment. - init_worker_distributed_environment(self.parallel_config, self.rank, + init_worker_distributed_environment(self.vllm_config, self.rank, self.distributed_init_method, self.local_rank) # Set random seed. 
@@ -271,12 +272,13 @@ def check_health(self) -> None: def init_worker_distributed_environment( - parallel_config: ParallelConfig, + vllm_config: VllmConfig, rank: int, distributed_init_method: Optional[str] = None, local_rank: int = -1, ) -> None: """Initialize the distributed environment.""" + parallel_config = vllm_config.parallel_config set_custom_all_reduce(not parallel_config.disable_custom_all_reduce) init_distributed_environment(parallel_config.world_size, rank, @@ -285,6 +287,8 @@ def init_worker_distributed_environment( ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) + ensure_kv_transfer_initialized(vllm_config) + def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype): # Check if the GPU supports the dtype. From 34bea7519989b562be10a1ffec334342a54f0a83 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Wed, 2 Apr 2025 19:21:18 +0000 Subject: [PATCH 002/116] fixing pre-commit conflicts Signed-off-by: ApostaC --- requirements/test.txt | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/requirements/test.txt b/requirements/test.txt index 8dde94f313c8..236b8be32805 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -23,10 +23,6 @@ anyio==4.6.2.post1 # via httpx argcomplete==3.5.1 # via datamodel-code-generator -async-timeout==5.0.1 - # via - # aiohttp - # redis attrs==24.2.0 # via # aiohttp @@ -121,10 +117,6 @@ encodec==0.1.1 # via vocos evaluate==0.4.3 # via lm-eval -exceptiongroup==1.2.2 - # via - # anyio - # pytest fastparquet==2024.11.0 # via genai-perf fastrlock==0.8.2 @@ -564,7 +556,9 @@ sentence-transformers==3.2.1 sentencepiece==0.2.0 # via mistral-common setuptools==75.8.0 - # via pytablewriter + # via + # pytablewriter + # torch shellingham==1.5.4 # via typer six==1.16.0 @@ -611,12 +605,6 @@ timm==1.0.11 # via -r requirements/test.in tokenizers==0.21.0 # via transformers -toml==0.10.2 - # via datamodel-code-generator -tomli==2.2.1 - # 
via - # black - # pytest torch==2.6.0 # via # -r requirements/test.in @@ -682,16 +670,12 @@ typer==0.15.2 # via fastsafetensors typing-extensions==4.12.2 # via - # anyio - # black # huggingface-hub # librosa # mistral-common - # multidict # pqdm # pydantic # pydantic-core - # rich # torch # typer tzdata==2024.2 From 20ef2ac5715021c4d75fe3109056a5fe15d96bc3 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Wed, 2 Apr 2025 20:09:16 +0000 Subject: [PATCH 003/116] [fix] fix the runtime error when no kv cache config is provided Signed-off-by: ApostaC --- .../disaggrated-prefill-v1/run.sh | 4 +-- vllm/attention/layer.py | 28 +++++++++++-------- vllm/distributed/parallel_state.py | 4 +++ vllm/forward_context.py | 5 ++-- vllm/v1/core/kv_cache_manager.py | 9 ++++-- vllm/v1/core/sched/scheduler.py | 24 +++++++++------- vllm/v1/worker/gpu_model_runner.py | 10 ++++--- 7 files changed, 52 insertions(+), 32 deletions(-) diff --git a/examples/offline_inference/disaggrated-prefill-v1/run.sh b/examples/offline_inference/disaggrated-prefill-v1/run.sh index 694793b78be1..e74e01277734 100644 --- a/examples/offline_inference/disaggrated-prefill-v1/run.sh +++ b/examples/offline_inference/disaggrated-prefill-v1/run.sh @@ -1,4 +1,4 @@ -find /tmp -iname "*attn.pt" 2>/dev/null | cut -d'/' -f1,2,3 | uniq | xargs rm -r +rm -rf local_storage/ VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=1 python3 prefill_example.py -VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=1 python3 decode_example.py \ No newline at end of file +VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=1 python3 decode_example.py diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index ca7d062d3d20..d2cf34665896 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -10,7 +10,7 @@ from vllm.attention import AttentionType from vllm.attention.selector import backend_name_to_enum, get_attn_backend from vllm.config import CacheConfig, get_current_vllm_config -from vllm.distributed import 
get_kv_transfer_group +from vllm.distributed import get_kv_transfer_group, has_kv_transfer_group from vllm.forward_context import ForwardContext, get_forward_context from vllm.model_executor.layers.linear import UnquantizedLinearMethod from vllm.model_executor.layers.quantization.base_config import ( @@ -180,7 +180,8 @@ def forward( context using `vllm.forward_context.get_forward_context().attn_metadata`. """ - get_kv_transfer_group().wait_for_layer_load(self.layer_name) + if has_kv_transfer_group(): + get_kv_transfer_group().wait_for_layer_load(self.layer_name) if self.calculate_kv_scales: attn_metadata = get_forward_context().attn_metadata if attn_metadata.enable_kv_scales_calculation: @@ -216,11 +217,13 @@ def forward( self_kv_cache, attn_metadata, output=output) - save_kv_layer_to_connector(self.layer_name, self.kv_cache) + maybe_save_kv_layer_to_connector(self.layer_name, + self.kv_cache) else: torch.ops.vllm.unified_attention_with_output( query, key, value, output, self.layer_name) - save_kv_layer_to_connector(self.layer_name, self.kv_cache) + maybe_save_kv_layer_to_connector(self.layer_name, + self.kv_cache) return output.view(-1, hidden_size) else: if self.use_direct_call: @@ -229,12 +232,14 @@ def forward( self_kv_cache = self.kv_cache[forward_context.virtual_engine] output = self.impl.forward(self, query, key, value, self_kv_cache, attn_metadata) - save_kv_layer_to_connector(self.layer_name, self.kv_cache) + maybe_save_kv_layer_to_connector(self.layer_name, + self.kv_cache) return output else: output = torch.ops.vllm.unified_attention( query, key, value, self.layer_name) - save_kv_layer_to_connector(self.layer_name, self.kv_cache) + maybe_save_kv_layer_to_connector(self.layer_name, + self.kv_cache) return output def calc_kv_scales(self, query, key, value): @@ -337,17 +342,18 @@ def forward( return out.reshape(bsz, q_len, -1) -def save_kv_layer_to_connector( +def maybe_save_kv_layer_to_connector( layer_name: str, kv_cache: List[torch.Tensor], ): - 
forward_context: ForwardContext = get_forward_context() - attn_metadata = forward_context.attn_metadata - if attn_metadata is None: + if not has_kv_transfer_group(): return connector = get_kv_transfer_group() - if connector is None: + + forward_context: ForwardContext = get_forward_context() + attn_metadata = forward_context.attn_metadata + if attn_metadata is None: return kv_cache_layer = kv_cache[forward_context.virtual_engine] diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index a5e42889d5f5..636c4dff28d2 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -777,6 +777,10 @@ def get_kv_transfer_group() -> "KVConnectorAgent": return _KV_CONNECTOR_AGENT +def has_kv_transfer_group() -> bool: + return _KV_CONNECTOR_AGENT is not None + + @contextmanager def graph_capture(device: torch.device): """ diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 83491249b180..b8e8ecbbe84e 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -11,7 +11,7 @@ import vllm.envs as envs from vllm.config import VllmConfig -from vllm.distributed import get_kv_transfer_group +from vllm.distributed import get_kv_transfer_group, has_kv_transfer_group # yapf: disable from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1 # yapf: enable @@ -106,9 +106,10 @@ def set_forward_context(attn_metadata: Any, attn_metadata=attn_metadata, dp_metadata=dp_metadata) - if attn_metadata is not None: + if has_kv_transfer_group() and attn_metadata is not None: kv_connector = get_kv_transfer_group() kv_connector.start_load_kv(_forward_context) + _forward_context.kv_connector = kv_connector try: yield diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 3500b8629aff..a155d54404ab 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -149,9 +149,12 @@ def get_computed_blocks( # Add back the last block hash if it was 
removed. block_hashes.append(last_block_hash) - computed_blocks = self.connector.get_external_prefix_cache_blocks( - request, computed_blocks, - len(computed_blocks) * self.block_size, self) + # Check the remote cache for the external prefix cache blocks. + if self.connector is not None: + computed_blocks =\ + self.connector.get_external_prefix_cache_blocks( + request, computed_blocks, + len(computed_blocks) * self.block_size, self) self.prefix_cache_stats.queries += len(block_hashes) self.prefix_cache_stats.hits += len(computed_blocks) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index eca7953a8d41..74b41035eaac 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -66,15 +66,18 @@ def __init__( self.max_model_len = self.scheduler_config.max_model_len # create connector - from vllm.distributed.kv_transfer.kv_connector.factory import ( - KVConnectorFactory) - from vllm.distributed.kv_transfer.kv_connector.v1 import ( - KVConnectorRole as KVConnectorRole_V1) - self.connector = KVConnectorFactory.create_connector( - rank=None, - local_rank=None, - config=self.vllm_config, - role=KVConnectorRole_V1.SCHEDULER) + if self.vllm_config.kv_transfer_config is not None: + from vllm.distributed.kv_transfer.kv_connector.factory import ( + KVConnectorFactory) + from vllm.distributed.kv_transfer.kv_connector.v1 import ( + KVConnectorRole as KVConnectorRole_V1) + self.connector = KVConnectorFactory.create_connector( + rank=None, + local_rank=None, + config=self.vllm_config, + role=KVConnectorRole_V1.SCHEDULER) + else: + self.connector = None num_gpu_blocks = cache_config.num_gpu_blocks assert isinstance(num_gpu_blocks, int) and num_gpu_blocks > 0 @@ -457,7 +460,8 @@ def schedule(self) -> SchedulerOutput: # 1. Plan the KV cache store # 2. Wrap up all the KV cache load / save ops into an opaque object # 3. 
Clear the internal states of the connector - self.connector.attach_connector_meta(scheduler_output) + if self.connector is not None: + self.connector.attach_connector_meta(scheduler_output) # Advance the number of computed tokens for the request AFTER # the request is scheduled. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index bc8358d00988..3e466c1ff7f8 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -13,7 +13,7 @@ from vllm.attention import AttentionType, get_attn_backend from vllm.attention.layer import Attention from vllm.config import CompilationLevel, VllmConfig -from vllm.distributed import get_kv_transfer_group +from vllm.distributed import get_kv_transfer_group, has_kv_transfer_group from vllm.distributed.parallel_state import get_pp_group, graph_capture from vllm.forward_context import set_forward_context from vllm.logger import init_logger @@ -1044,8 +1044,9 @@ def execute_model( }) # Update the connector's state with the metadata in scheduler output. - get_kv_transfer_group().bind_connector_metadata( - scheduler_output.connector_metadata) + if has_kv_transfer_group(): + get_kv_transfer_group().bind_connector_metadata( + scheduler_output.connector_metadata) # Run the decoder. # Use persistent buffers for CUDA graphs. 
@@ -1065,7 +1066,8 @@ def execute_model( logits = self.model.compute_logits(sample_hidden_states, None) # Clear connector's state - get_kv_transfer_group().clear_connector_metadata() + if has_kv_transfer_group(): + get_kv_transfer_group().clear_connector_metadata() # Apply structured output bitmasks if present if scheduler_output.grammar_bitmask is not None: From 430e402a818c2f9aa78193ed25389b8e90ade408 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Sun, 6 Apr 2025 19:13:52 +0000 Subject: [PATCH 004/116] [fix] compatibility with v0 and address review comments Signed-off-by: ApostaC --- .../disaggrated-prefill-v1/prefill_example.py | 3 +- .../disaggrated-prefill-v1/run.sh | 4 +- vllm/attention/layer.py | 7 ++-- .../v1/shared_storage_connector.py | 28 ++++++++------ vllm/distributed/parallel_state.py | 37 +++++++++++++++---- vllm/forward_context.py | 13 ++++--- vllm/v1/core/kv_cache_manager.py | 16 +++++--- vllm/v1/core/sched/output.py | 2 +- vllm/v1/core/sched/scheduler.py | 24 +++++------- vllm/v1/engine/core.py | 4 -- vllm/v1/worker/gpu_model_runner.py | 2 +- 11 files changed, 84 insertions(+), 56 deletions(-) diff --git a/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py b/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py index 4456921c2fed..5f43b2870a71 100644 --- a/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py +++ b/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py @@ -19,7 +19,8 @@ gpu_memory_utilization=0.8, kv_transfer_config=KVTransferConfig.from_cli( '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both", ' - '"kv_extra_config": {"shared_storage_path": "local_storage"}}') + '"kv_connector_extra_config": ' + '{"shared_storage_path": "local_storage"}}') ) #, max_model_len=2048, max_num_batched_tokens=2048) # 1ST generation (prefill instance) diff --git a/examples/offline_inference/disaggrated-prefill-v1/run.sh b/examples/offline_inference/disaggrated-prefill-v1/run.sh index 
e74e01277734..4c08f9920d53 100644 --- a/examples/offline_inference/disaggrated-prefill-v1/run.sh +++ b/examples/offline_inference/disaggrated-prefill-v1/run.sh @@ -1,4 +1,4 @@ rm -rf local_storage/ -VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=1 python3 prefill_example.py -VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=1 python3 decode_example.py +VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 prefill_example.py +VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 decode_example.py diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index d2cf34665896..0722b780febe 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -10,7 +10,8 @@ from vllm.attention import AttentionType from vllm.attention.selector import backend_name_to_enum, get_attn_backend from vllm.config import CacheConfig, get_current_vllm_config -from vllm.distributed import get_kv_transfer_group, has_kv_transfer_group +from vllm.distributed import (get_kv_transfer_group, has_kv_transfer_group, + is_v1_kv_transfer_group) from vllm.forward_context import ForwardContext, get_forward_context from vllm.model_executor.layers.linear import UnquantizedLinearMethod from vllm.model_executor.layers.quantization.base_config import ( @@ -180,7 +181,7 @@ def forward( context using `vllm.forward_context.get_forward_context().attn_metadata`. 
""" - if has_kv_transfer_group(): + if has_kv_transfer_group() and is_v1_kv_transfer_group(): get_kv_transfer_group().wait_for_layer_load(self.layer_name) if self.calculate_kv_scales: attn_metadata = get_forward_context().attn_metadata @@ -346,7 +347,7 @@ def maybe_save_kv_layer_to_connector( layer_name: str, kv_cache: List[torch.Tensor], ): - if not has_kv_transfer_group(): + if not has_kv_transfer_group() or not is_v1_kv_transfer_group(): return connector = get_kv_transfer_group() diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index 4ba80b2ef4d4..4fb7a2607b62 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -128,7 +128,6 @@ def inject_kv_into_layer( dst_kv_cache_layer[:, slot_mapping, ...] = src_kv_cache dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape) - logger.info("Start loading KV cache from the connector") # Get the metadata metadata: KVConnectorMetadata = \ self._get_connector_metadata() @@ -146,15 +145,17 @@ def inject_kv_into_layer( "In connector.start_load_kv, but the attn_metadata is None") return - # Load the KV for each layer - for layer_name in forward_context.no_compile_layers: - attn_layer = forward_context.no_compile_layers[layer_name] - kv_cache_layer = attn_layer.kv_cache[ - forward_context.virtual_engine] + # Load the KV for each request each layer + for request in metadata.requests: + if request.is_store: + continue + logger.info("Inject KV cache of %d tokens to the paged memory", + len(request.slot_mapping)) + for layer_name in forward_context.no_compile_layers: + attn_layer = forward_context.no_compile_layers[layer_name] + kv_cache_layer = attn_layer.kv_cache[\ + forward_context.virtual_engine] - for request in metadata.requests: - if request.is_store: - continue filename = self.generate_filename_debug( 
layer_name, request.token_ids) kv_cache = torch.load(filename).cuda( @@ -264,7 +265,10 @@ def get_external_prefix_cache_blocks( old_req_id = request.request_id request.request_id = "temp-req-id-for-connector" allocated_blocks = kv_cache_manager.allocate_slots( - request, need_to_allocate, computed_blocks, preallocate=False) + request, + need_to_allocate, + computed_blocks, + skip_preallocate=True) request.request_id = old_req_id kv_cache_manager.req_to_blocks.pop("temp-req-id-for-connector") kv_cache_manager.num_cached_block.pop("temp-req-id-for-connector") @@ -285,7 +289,7 @@ def attach_connector_meta( """Attach the connector metadata to the request object. This function should NOT modify other fields in the scheduler_output - except the `connector_metadata` field. + except the `kv_connector_metadata` field. Also, calling this function will reset the state of the connector. Args: @@ -302,7 +306,7 @@ def attach_connector_meta( # store and load status if not self.found_match_for_request(request): meta.add_request(request, self._block_size, is_store=True) - scheduler_output.connector_metadata = meta + scheduler_output.kv_connector_metadata = meta self._requests_need_load.clear() return scheduler_output diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 636c4dff28d2..81d6c0d24e79 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -40,6 +40,8 @@ import vllm.envs as envs from vllm.distributed.device_communicators.base_device_communicator import ( DeviceCommunicatorBase) +from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase +from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1 from vllm.distributed.utils import StatelessProcessGroup from vllm.logger import init_logger from vllm.utils import (direct_register_custom_op, resolve_obj_by_qualname, @@ -47,8 +49,6 @@ if TYPE_CHECKING: from vllm.config import VllmConfig - from 
vllm.distributed.kv_transfer.kv_connector.kv_connector_agent import ( - KVConnectorAgent) @dataclass @@ -768,10 +768,10 @@ def get_pp_group() -> GroupCoordinator: # kept for backward compatibility get_pipeline_model_parallel_group = get_pp_group -_KV_CONNECTOR_AGENT: Optional["KVConnectorAgent"] = None +_KV_CONNECTOR_AGENT: Union[KVConnectorBase, KVConnectorBase_V1, None] = None -def get_kv_transfer_group() -> "KVConnectorAgent": +def get_kv_transfer_group() -> Union[KVConnectorBase, KVConnectorBase_V1]: assert _KV_CONNECTOR_AGENT is not None, ( "disaggregated KV cache transfer parallel group is not initialized") return _KV_CONNECTOR_AGENT @@ -781,6 +781,30 @@ def has_kv_transfer_group() -> bool: return _KV_CONNECTOR_AGENT is not None +def is_v1_kv_transfer_group( + connector: Union[KVConnectorBase_V1, KVConnectorBase, + None] = None) -> bool: + """Check if the KV connector is the v1 connector. + If the argument is None, it will check the global KV connector + + Args: + connector: The KV connector to check. If None, it will check the + global KV connector. + + Note: + This function will no-longer be needed after the v1 KV connector + becomes the default. + """ + if connector is None: + connector = _KV_CONNECTOR_AGENT + + if connector is None: + # Global KV connector is not set + return False + + return isinstance(connector, KVConnectorBase_V1) + + @contextmanager def graph_capture(device: torch.device): """ @@ -985,12 +1009,11 @@ def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None: "rank": get_world_group().rank, "local_rank": get_world_group().local_rank, "config": vllm_config, - } - if envs.VLLM_USE_V1: # NOTE(Kuntai): # Parallel state is initialized in v1 worker, # so this connector is for sure worker connector. 
- kwargs["role"] = KVConnectorRole_V1.WORKER + "role": KVConnectorRole_V1.WORKER, + } _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(**kwargs) diff --git a/vllm/forward_context.py b/vllm/forward_context.py index b8e8ecbbe84e..173e34e2604b 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -11,7 +11,8 @@ import vllm.envs as envs from vllm.config import VllmConfig -from vllm.distributed import get_kv_transfer_group, has_kv_transfer_group +from vllm.distributed import (get_kv_transfer_group, has_kv_transfer_group, + is_v1_kv_transfer_group) # yapf: disable from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1 # yapf: enable @@ -106,10 +107,10 @@ def set_forward_context(attn_metadata: Any, attn_metadata=attn_metadata, dp_metadata=dp_metadata) - if has_kv_transfer_group() and attn_metadata is not None: + if has_kv_transfer_group() and attn_metadata is not None and \ + is_v1_kv_transfer_group(): kv_connector = get_kv_transfer_group() kv_connector.start_load_kv(_forward_context) - _forward_context.kv_connector = kv_connector try: yield @@ -148,7 +149,9 @@ def set_forward_context(attn_metadata: Any, forward_stats) # Waiting for the save operation to finish - if _forward_context.kv_connector is not None: - _forward_context.kv_connector.wait_for_save() + if has_kv_transfer_group() and attn_metadata is not None and \ + is_v1_kv_transfer_group(): + kv_connector = get_kv_transfer_group() + kv_connector.wait_for_save() _forward_context = prev_context diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index a155d54404ab..e858b9cc350e 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -2,7 +2,7 @@ from collections import defaultdict from collections.abc import Iterable -from typing import Optional +from typing import TYPE_CHECKING, Optional from vllm.logger import init_logger from vllm.utils import cdiv, sha256 @@ -14,6 +14,9 @@ from vllm.v1.metrics.stats import 
PrefixCacheStats from vllm.v1.request import Request, RequestStatus +if TYPE_CHECKING: + from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1 + logger = init_logger(__name__) @@ -27,7 +30,7 @@ def __init__( caching_hash_algo: str = "builtin", num_preallocate_tokens: int = 64, log_stats: bool = False, - connector=None, + connector: "KVConnectorBase_V1" = None, ) -> None: assert len(kv_cache_config.kv_cache_groups) == 1, ( "KVCacheManager does not support hybrid models with more than 1 " @@ -173,7 +176,7 @@ def allocate_slots( request: Request, num_tokens: int, new_computed_blocks: Optional[list[KVCacheBlock]] = None, - preallocate=True, + skip_preallocate: bool = False, ) -> Optional[list[KVCacheBlock]]: """Add slots for a request with new tokens to append. @@ -183,7 +186,8 @@ def allocate_slots( not include the tokens that have already been computed. new_computed_blocks: A list of new computed blocks just hitting the prefix caching. - preallocate: Whether to preallocate blocks for the request. + skip_preallocate: Whether to skip preallocating blocks for + the request. Blocks layout: ----------------------------------------------------------------------- @@ -256,8 +260,8 @@ def allocate_slots( else: # Get new blocks from the free block pool considering # preallocated blocks. 
- num_preallocate_blocks =\ - self.num_preallocate_blocks if preallocate else 0 + num_preallocate_blocks = self.num_preallocate_blocks \ + if not skip_preallocate else 0 num_new_blocks = min( num_new_blocks + num_preallocate_blocks, self.block_pool.get_num_free_blocks(), diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index d7d2a9132106..27ceb8cc0402 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -125,4 +125,4 @@ class SchedulerOutput: grammar_bitmask: Optional[npt.NDArray[np.int32]] # the connector metadata - connector_metadata: Optional[KVConnectorMetadata] = None + kv_connector_metadata: Optional[KVConnectorMetadata] = None diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 74b41035eaac..6fb18c436868 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -7,8 +7,7 @@ from collections.abc import Iterable from typing import Optional, Union -from vllm.config import (CacheConfig, LoRAConfig, ModelConfig, SchedulerConfig, - VllmConfig) +from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager, @@ -35,10 +34,6 @@ class Scheduler(SchedulerInterface): def __init__( self, vllm_config: VllmConfig, - scheduler_config: SchedulerConfig, - model_config: ModelConfig, - cache_config: CacheConfig, - lora_config: Optional[LoRAConfig], kv_cache_config: KVCacheConfig, structured_output_manager: StructuredOutputManager, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, @@ -46,9 +41,9 @@ def __init__( log_stats: bool = False, ) -> None: self.vllm_config = vllm_config - self.scheduler_config = scheduler_config - self.cache_config = cache_config - self.lora_config = lora_config + self.scheduler_config = vllm_config.scheduler_config + self.cache_config = vllm_config.cache_config + self.lora_config = 
vllm_config.lora_config self.kv_cache_config = kv_cache_config self.log_stats = log_stats self.structured_output_manager = structured_output_manager @@ -79,14 +74,14 @@ def __init__( else: self.connector = None - num_gpu_blocks = cache_config.num_gpu_blocks + num_gpu_blocks = self.cache_config.num_gpu_blocks assert isinstance(num_gpu_blocks, int) and num_gpu_blocks > 0 # Create the KV cache manager. self.kv_cache_manager = KVCacheManager( kv_cache_config=kv_cache_config, max_model_len=self.max_model_len, - enable_caching=cache_config.enable_prefix_caching, + enable_caching=self.cache_config.enable_prefix_caching, caching_hash_algo=self.cache_config.prefix_caching_hash_algo, log_stats=self.log_stats, connector=self.connector) @@ -118,8 +113,8 @@ def __init__( # This can be changed when we make encoder cache for embedding caching # across requests. encoder_compute_budget, encoder_cache_size = compute_encoder_budget( - model_config=model_config, - scheduler_config=scheduler_config, + model_config=vllm_config.model_config, + scheduler_config=vllm_config.scheduler_config, mm_registry=mm_registry, ) @@ -461,7 +456,8 @@ def schedule(self) -> SchedulerOutput: # 2. Wrap up all the KV cache load / save ops into an opaque object # 3. Clear the internal states of the connector if self.connector is not None: - self.connector.attach_connector_meta(scheduler_output) + scheduler_output = self.connector.attach_connector_meta( + scheduler_output) # Advance the number of computed tokens for the request AFTER # the request is scheduled. 
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 5a8bc45c45db..0b86e563e0a8 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -94,10 +94,6 @@ def __init__( self.scheduler: SchedulerInterface = Scheduler( vllm_config=vllm_config, - scheduler_config=vllm_config.scheduler_config, - model_config=vllm_config.model_config, - cache_config=vllm_config.cache_config, - lora_config=vllm_config.lora_config, kv_cache_config=kv_cache_config, structured_output_manager=self.structured_output_manager, include_finished_set=vllm_config.parallel_config.data_parallel_size diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 3e466c1ff7f8..2746e0f733e1 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1046,7 +1046,7 @@ def execute_model( # Update the connector's state with the metadata in scheduler output. if has_kv_transfer_group(): get_kv_transfer_group().bind_connector_metadata( - scheduler_output.connector_metadata) + scheduler_output.kv_connector_metadata) # Run the decoder. # Use persistent buffers for CUDA graphs. From 300ddac1710a95c48a229e0320c97309849f498b Mon Sep 17 00:00:00 2001 From: ApostaC Date: Sun, 6 Apr 2025 19:29:23 +0000 Subject: [PATCH 005/116] [fix] format checker issue and [disable] connector during profile run Signed-off-by: ApostaC --- vllm/attention/layer.py | 17 +++++++++++++++-- vllm/forward_context.py | 2 ++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 0722b780febe..6d86084e2b6e 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -181,8 +181,7 @@ def forward( context using `vllm.forward_context.get_forward_context().attn_metadata`. 
""" - if has_kv_transfer_group() and is_v1_kv_transfer_group(): - get_kv_transfer_group().wait_for_layer_load(self.layer_name) + wait_for_kv_layer_from_connector(self.layer_name) if self.calculate_kv_scales: attn_metadata = get_forward_context().attn_metadata if attn_metadata.enable_kv_scales_calculation: @@ -343,6 +342,20 @@ def forward( return out.reshape(bsz, q_len, -1) +def wait_for_kv_layer_from_connector(layer_name: str): + if not has_kv_transfer_group() or not is_v1_kv_transfer_group(): + return + + connector = get_kv_transfer_group() + + forward_context: ForwardContext = get_forward_context() + attn_metadata = forward_context.attn_metadata + if attn_metadata is None: + return + + connector.wait_for_layer_load(layer_name) + + def maybe_save_kv_layer_to_connector( layer_name: str, kv_cache: List[torch.Tensor], diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 173e34e2604b..2b1e30de0e6a 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -110,6 +110,7 @@ def set_forward_context(attn_metadata: Any, if has_kv_transfer_group() and attn_metadata is not None and \ is_v1_kv_transfer_group(): kv_connector = get_kv_transfer_group() + assert isinstance(kv_connector, KVConnectorBase_V1) kv_connector.start_load_kv(_forward_context) try: @@ -152,6 +153,7 @@ def set_forward_context(attn_metadata: Any, if has_kv_transfer_group() and attn_metadata is not None and \ is_v1_kv_transfer_group(): kv_connector = get_kv_transfer_group() + assert isinstance(kv_connector, KVConnectorBase_V1) kv_connector.wait_for_save() _forward_context = prev_context From 7e0695bf64416c8d90db418e39b989e7cd0366d8 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 7 Apr 2025 23:25:43 +0000 Subject: [PATCH 006/116] updated to remove torch.load Signed-off-by: rshaw@neuralmagic.com --- .../disaggrated-prefill-v1/decode_example.py | 2 +- .../disaggrated-prefill-v1/prefill_example.py | 2 +- .../kv_connector/v1/shared_storage_connector.py | 10 ++++++---- 3 
files changed, 8 insertions(+), 6 deletions(-) diff --git a/examples/offline_inference/disaggrated-prefill-v1/decode_example.py b/examples/offline_inference/disaggrated-prefill-v1/decode_example.py index 57fa8395ab92..8fd47707a8bb 100644 --- a/examples/offline_inference/disaggrated-prefill-v1/decode_example.py +++ b/examples/offline_inference/disaggrated-prefill-v1/decode_example.py @@ -17,7 +17,7 @@ sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) llm = LLM( - model="meta-llama/llama-3.1-8b-instruct", + model="meta-llama/Llama-3.1-8B-Instruct", enforce_eager=True, gpu_memory_utilization=0.8, kv_transfer_config=KVTransferConfig.from_cli( diff --git a/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py b/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py index 5f43b2870a71..9f5481e2c1ea 100644 --- a/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py +++ b/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py @@ -14,7 +14,7 @@ sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) -llm = LLM(model="meta-llama/llama-3.1-8b-instruct", +llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enforce_eager=True, gpu_memory_utilization=0.8, kv_transfer_config=KVTransferConfig.from_cli( diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index 4fb7a2607b62..ce74e717ac6a 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -4,6 +4,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Optional +import safetensors import torch from vllm.config import VllmConfig @@ -71,7 +72,7 @@ def add_request( class SharedStorageConnector(KVConnectorBase_V1): - # NOTE: This is just a simple debug implementation of the KV connector. 
+ # NOTE: This is Simple debug implementation of the KV connector. # It save / load the KV cache to / from the disk. # It does extra work which will overwrite the existing prefix-cache in GPU # - to remove the overhead, need to add some "mask" in the ReqMeta class @@ -158,7 +159,7 @@ def inject_kv_into_layer( filename = self.generate_filename_debug( layer_name, request.token_ids) - kv_cache = torch.load(filename).cuda( + kv_cache = safetensors.torch.load_file(filename)["kv_cache"].cuda( ) # TODO: may need to handle the device here inject_kv_into_layer(kv_cache_layer, kv_cache, request.slot_mapping) @@ -207,7 +208,8 @@ def extract_kv_cache_from_layer( layer_name, request.token_ids) kv_cache = extract_kv_cache_from_layer(kv_layer, request.slot_mapping) - torch.save(kv_cache.cpu().detach(), filename) + tensors = {"kv_cache": kv_cache.cpu().detach()} + safetensors.torch.save_file(tensors, filename) def wait_for_save(self): return @@ -353,7 +355,7 @@ def generate_filename_debug( """ foldername = self.generate_foldername_debug(input_ids, create_folder=True) - return os.path.join(foldername, f"{layer_name}.pt") + return os.path.join(foldername, f"{layer_name}.safetensors") def align_to_block_size(num_tokens: int, block_size) -> int: From 553f41616cac698aefca5103798fdc72720a7f4b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 7 Apr 2025 23:26:49 +0000 Subject: [PATCH 007/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/attention/layer.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 6d86084e2b6e..c4a99054aedf 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -217,14 +217,10 @@ def forward( self_kv_cache, attn_metadata, output=output) - maybe_save_kv_layer_to_connector(self.layer_name, - self.kv_cache) else: torch.ops.vllm.unified_attention_with_output( query, key, value, output, self.layer_name) - 
maybe_save_kv_layer_to_connector(self.layer_name, - self.kv_cache) - return output.view(-1, hidden_size) + output = output.view(-1, hidden_size) else: if self.use_direct_call: forward_context = get_forward_context() @@ -232,15 +228,13 @@ def forward( self_kv_cache = self.kv_cache[forward_context.virtual_engine] output = self.impl.forward(self, query, key, value, self_kv_cache, attn_metadata) - maybe_save_kv_layer_to_connector(self.layer_name, - self.kv_cache) - return output else: output = torch.ops.vllm.unified_attention( query, key, value, self.layer_name) - maybe_save_kv_layer_to_connector(self.layer_name, - self.kv_cache) - return output + + maybe_save_kv_layer_to_connector(self.layer_name, + self.kv_cache) + return output def calc_kv_scales(self, query, key, value): self._q_scale.copy_(torch.abs(query).max() / self.q_range) From 55d1b5b549fb9f58365220a5fd3a0775160903ca Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 7 Apr 2025 23:38:31 +0000 Subject: [PATCH 008/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/attention/layer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index c4a99054aedf..7f6afbbc84fa 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -231,7 +231,7 @@ def forward( else: output = torch.ops.vllm.unified_attention( query, key, value, self.layer_name) - + maybe_save_kv_layer_to_connector(self.layer_name, self.kv_cache) return output From c50e620a14e5bbf61f78eea2a92f120e54f60b32 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 7 Apr 2025 23:39:32 +0000 Subject: [PATCH 009/116] fixed typo Signed-off-by: rshaw@neuralmagic.com --- .../kv_transfer/kv_connector/v1/base.py | 14 +++++++------- .../kv_connector/v1/shared_storage_connector.py | 8 ++++---- vllm/v1/core/sched/output.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 6 +++--- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git 
a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index efc3593140dd..94c9a0c46bc9 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -41,7 +41,7 @@ class KVConnectorBase_V1(ABC): def __init__(self, rank: Optional[int], local_rank: Optional[int], config: "VllmConfig", role: KVConnectorRole): - self._connector_metada = KVConnectorMetadata() + self._connector_metadata = KVConnectorMetadata() self._rank = rank self._local_rank = local_rank self._config = config @@ -51,7 +51,7 @@ def __init__(self, rank: Optional[int], local_rank: Optional[int], def role(self) -> KVConnectorRole: return self._role - def bind_connector_metadata( + def bind_connector_metadatata( self, connector_metadata: KVConnectorMetadata) -> None: """Set the connector metadata from the scheduler. @@ -62,17 +62,17 @@ def bind_connector_metadata( Args: connector_metadata (dict): the connector metadata. """ - self._connector_metada = connector_metadata + self._connector_metadata = connector_metadata - def clear_connector_metadata(self) -> None: + def clear_connector_metadatata(self) -> None: """Clear the connector metadata. This function should be called by the model runner every time after the model execution. """ - self._connector_metada = KVConnectorMetadata() + self._connector_metadata = KVConnectorMetadata() - def _get_connector_metadata(self) -> KVConnectorMetadata: + def _get_connector_metadatata(self) -> KVConnectorMetadata: """Get the connector metadata. This function should only be called inside the connector. @@ -80,7 +80,7 @@ def _get_connector_metadata(self) -> KVConnectorMetadata: Returns: ConnectorMetadata: the connector metadata. 
""" - return self._connector_metada + return self._connector_metadata # ============================== # Worker-side methods diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index ce74e717ac6a..1b96dee5591c 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -131,7 +131,7 @@ def inject_kv_into_layer( # Get the metadata metadata: KVConnectorMetadata = \ - self._get_connector_metadata() + self._get_connector_metadatata() assert isinstance(metadata, SharedStorageConnectorMetadata) if metadata is None: @@ -200,7 +200,7 @@ def extract_kv_cache_from_layer( return layer.reshape(2, num_pages * page_size, -1)[:, slot_mapping, ...] - connector_metadata = self._get_connector_metadata() + connector_metadata = self._get_connector_metadatata() assert isinstance(connector_metadata, SharedStorageConnectorMetadata) for request in connector_metadata.requests: if request.is_store: @@ -291,7 +291,7 @@ def attach_connector_meta( """Attach the connector metadata to the request object. This function should NOT modify other fields in the scheduler_output - except the `kv_connector_metadata` field. + except the `kv_connector_metadatata` field. Also, calling this function will reset the state of the connector. 
Args: @@ -308,7 +308,7 @@ def attach_connector_meta( # store and load status if not self.found_match_for_request(request): meta.add_request(request, self._block_size, is_store=True) - scheduler_output.kv_connector_metadata = meta + scheduler_output.kv_connector_metadatata = meta self._requests_need_load.clear() return scheduler_output diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index 27ceb8cc0402..27441e730d4d 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -125,4 +125,4 @@ class SchedulerOutput: grammar_bitmask: Optional[npt.NDArray[np.int32]] # the connector metadata - kv_connector_metadata: Optional[KVConnectorMetadata] = None + kv_connector_metadatata: Optional[KVConnectorMetadata] = None diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9d9e59a9367f..6359f65d0a0a 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1046,8 +1046,8 @@ def execute_model( # Update the connector's state with the metadata in scheduler output. if has_kv_transfer_group(): - get_kv_transfer_group().bind_connector_metadata( - scheduler_output.kv_connector_metadata) + get_kv_transfer_group().bind_connector_metadatata( + scheduler_output.kv_connector_metadatata) # Run the decoder. # Use persistent buffers for CUDA graphs. 
@@ -1068,7 +1068,7 @@ def execute_model( # Clear connector's state if has_kv_transfer_group(): - get_kv_transfer_group().clear_connector_metadata() + get_kv_transfer_group().clear_connector_metadatata() # Apply structured output bitmasks if present if scheduler_output.grammar_bitmask is not None: From b22fe38eec1b5a25d4038f92b54cb024947a6335 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 7 Apr 2025 23:40:40 +0000 Subject: [PATCH 010/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/distributed/kv_transfer/kv_connector/v1/base.py | 6 +++--- .../kv_connector/v1/shared_storage_connector.py | 8 ++++---- vllm/v1/core/sched/output.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 6 +++--- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 94c9a0c46bc9..39dda39c3d0c 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -51,7 +51,7 @@ def __init__(self, rank: Optional[int], local_rank: Optional[int], def role(self) -> KVConnectorRole: return self._role - def bind_connector_metadatata( + def bind_connector_metadata( self, connector_metadata: KVConnectorMetadata) -> None: """Set the connector metadata from the scheduler. @@ -64,7 +64,7 @@ def bind_connector_metadatata( """ self._connector_metadata = connector_metadata - def clear_connector_metadatata(self) -> None: + def clear_connector_metadata(self) -> None: """Clear the connector metadata. This function should be called by the model runner every time @@ -72,7 +72,7 @@ def clear_connector_metadatata(self) -> None: """ self._connector_metadata = KVConnectorMetadata() - def _get_connector_metadatata(self) -> KVConnectorMetadata: + def _get_connector_metadata(self) -> KVConnectorMetadata: """Get the connector metadata. This function should only be called inside the connector. 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index 1b96dee5591c..ce74e717ac6a 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -131,7 +131,7 @@ def inject_kv_into_layer( # Get the metadata metadata: KVConnectorMetadata = \ - self._get_connector_metadatata() + self._get_connector_metadata() assert isinstance(metadata, SharedStorageConnectorMetadata) if metadata is None: @@ -200,7 +200,7 @@ def extract_kv_cache_from_layer( return layer.reshape(2, num_pages * page_size, -1)[:, slot_mapping, ...] - connector_metadata = self._get_connector_metadatata() + connector_metadata = self._get_connector_metadata() assert isinstance(connector_metadata, SharedStorageConnectorMetadata) for request in connector_metadata.requests: if request.is_store: @@ -291,7 +291,7 @@ def attach_connector_meta( """Attach the connector metadata to the request object. This function should NOT modify other fields in the scheduler_output - except the `kv_connector_metadatata` field. + except the `kv_connector_metadata` field. Also, calling this function will reset the state of the connector. 
Args: @@ -308,7 +308,7 @@ def attach_connector_meta( # store and load status if not self.found_match_for_request(request): meta.add_request(request, self._block_size, is_store=True) - scheduler_output.kv_connector_metadatata = meta + scheduler_output.kv_connector_metadata = meta self._requests_need_load.clear() return scheduler_output diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index 27441e730d4d..27ceb8cc0402 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -125,4 +125,4 @@ class SchedulerOutput: grammar_bitmask: Optional[npt.NDArray[np.int32]] # the connector metadata - kv_connector_metadatata: Optional[KVConnectorMetadata] = None + kv_connector_metadata: Optional[KVConnectorMetadata] = None diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 6359f65d0a0a..9d9e59a9367f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1046,8 +1046,8 @@ def execute_model( # Update the connector's state with the metadata in scheduler output. if has_kv_transfer_group(): - get_kv_transfer_group().bind_connector_metadatata( - scheduler_output.kv_connector_metadatata) + get_kv_transfer_group().bind_connector_metadata( + scheduler_output.kv_connector_metadata) # Run the decoder. # Use persistent buffers for CUDA graphs. 
@@ -1068,7 +1068,7 @@ def execute_model( # Clear connector's state if has_kv_transfer_group(): - get_kv_transfer_group().clear_connector_metadatata() + get_kv_transfer_group().clear_connector_metadata() # Apply structured output bitmasks if present if scheduler_output.grammar_bitmask is not None: From da257aacd8a5bc380889b9c260266fa689fa5c3b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 7 Apr 2025 23:51:33 +0000 Subject: [PATCH 011/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/forward_context.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 2b1e30de0e6a..061f71be6f75 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -107,6 +107,8 @@ def set_forward_context(attn_metadata: Any, attn_metadata=attn_metadata, dp_metadata=dp_metadata) + # KVConnector: trigger (possibly async) load before forward. + # Each attn layer will block until the reading is complete. if has_kv_transfer_group() and attn_metadata is not None and \ is_v1_kv_transfer_group(): kv_connector = get_kv_transfer_group() @@ -149,7 +151,8 @@ def set_forward_context(attn_metadata: Any, "(batchsize, count, median_time(ms)): %s"), forward_stats) - # Waiting for the save operation to finish + # KVConnector: each attn layer triggers (possibly async) save. + # Ensure all those operations complete before forward() is done. 
if has_kv_transfer_group() and attn_metadata is not None and \ is_v1_kv_transfer_group(): kv_connector = get_kv_transfer_group() From 9751e0b9ac999d1edbba0c5ecf241bdcb82216b9 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 8 Apr 2025 00:04:16 +0000 Subject: [PATCH 012/116] update comments Signed-off-by: rshaw@neuralmagic.com --- .../kv_transfer/kv_connector/v1/base.py | 36 ++++++++++++------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 39dda39c3d0c..e6dc281f06e7 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -89,8 +89,10 @@ def _get_connector_metadata(self) -> KVConnectorMetadata: @abstractmethod def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None: - """Start loading the KV cache from the connector buffer to vLLM's - paged KV buffer. + """ + Start loading the KV cache from the connector buffer to vLLM's + paged KV buffer. This is called from the forward context before + the forward pass to enable async loading during model execution. Args: forward_context (ForwardContext): the forward context. @@ -105,8 +107,10 @@ def start_load_kv(self, forward_context: "ForwardContext", @abstractmethod def wait_for_layer_load(self, layer_name: str) -> None: - """Blocking until the KV for a specific layer is loaded into vLLM's - paged buffer. + """ + Block until the KV for a specific layer is loaded into vLLM's + paged buffer. This is called from within attention layer to ensure + async copying from start_load_kv is complete. This interface will be useful for layer-by-layer pipelining. 
@@ -118,8 +122,10 @@ def wait_for_layer_load(self, layer_name: str) -> None: @abstractmethod def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, attn_metadata: "AttentionMetadata", **kwargs) -> None: - """Start saving the a layer of KV cache from vLLM's paged buffer - to the connector. + """ + Start saving the a layer of KV cache from vLLM's paged buffer + to the connector. This is called from within attention layer to + enable async copying during execution. Args: layer_name (str): the name of the layer. @@ -132,10 +138,12 @@ def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, @abstractmethod def wait_for_save(self): - """Block until all the save operations is done. + """ + Block until all the save operations is done. This is called + as the forward context exits to ensure that the async saving + from save_kv_layer is complete before finishing the forward. - This prevents vLLM overwrites the paged KV buffer before - saving is done. + This prevents overwrites of paged KV buffer before saving done. """ pass @@ -150,10 +158,11 @@ def get_external_prefix_cache_blocks( num_computed_tokens: int, kv_cache_manager: "KVCacheManager", ) -> list["KVCacheBlock"]: - """Get the external prefix cache blocks from the connector. + """ + Get the external prefix cache blocks from the connector. - This function may change the state of the connector, which will be - used by `attach_connector_meta` later. + This function may change the state of the connector, which will + be used by `attach_connector_meta` later. This function will also allocate/free the blocks dynamically when there is remote cache hit. @@ -174,7 +183,8 @@ def get_external_prefix_cache_blocks( @abstractmethod def attach_connector_meta( self, scheduler_output: SchedulerOutput) -> SchedulerOutput: - """Attach the connector metadata to the request object. + """ + Attach the connector metadata to the request object. 
This function should NOT modify other fields in the scheduler_output except the `connector_metadata` field. From 2b77bcdd23c16690f1531e1be174b16b6f9fcb65 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 8 Apr 2025 00:47:23 +0000 Subject: [PATCH 013/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/attention/layer.py | 7 +++++++ vllm/v1/core/sched/scheduler.py | 1 - 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 7f6afbbc84fa..b355a167abe2 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -181,7 +181,11 @@ def forward( context using `vllm.forward_context.get_forward_context().attn_metadata`. """ + + # KVConnector: start async saving kvs to connector + # to the layers KV cache before running attention. wait_for_kv_layer_from_connector(self.layer_name) + if self.calculate_kv_scales: attn_metadata = get_forward_context().attn_metadata if attn_metadata.enable_kv_scales_calculation: @@ -232,6 +236,9 @@ def forward( output = torch.ops.vllm.unified_attention( query, key, value, self.layer_name) + # KVConnector: start saving kvs to the connector. + # NOTE: forward_context completion will block until + # this operation is completed. 
maybe_save_kv_layer_to_connector(self.layer_name, self.kv_cache) return output diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index e0235f8f1329..dc22e89a3436 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -432,7 +432,6 @@ def schedule(self) -> SchedulerOutput: resumed_from_preemption=False, ) for req in scheduled_running_reqs ] - scheduler_output = SchedulerOutput( scheduled_new_reqs=new_reqs_data, scheduled_cached_reqs=resumed_reqs_data + running_reqs_data, From 5cbd434e0343b8d924f19363ffa47ecfbabce3f9 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 8 Apr 2025 01:45:20 +0000 Subject: [PATCH 014/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/forward_context.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 061f71be6f75..c7ae7f13380e 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -45,9 +45,6 @@ class ForwardContext: virtual_engine: int # set dynamically for each forward pass # set dynamically for each forward pass dp_metadata: Optional[DPMetadata] = None - # KV cache connector - # NOTE(Kuntai): only v1 connector works with ForwardContext for now - kv_connector: Optional[KVConnectorBase_V1] = None _forward_context: Optional[ForwardContext] = None From 4c6a93eee402553560e277d2967e667c7832e495 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 8 Apr 2025 01:54:40 +0000 Subject: [PATCH 015/116] comment Signed-off-by: rshaw@neuralmagic.com --- vllm/distributed/parallel_state.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index b9206b99c6ff..981c5258211a 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -771,6 +771,8 @@ def get_pp_group() -> GroupCoordinator: # kept for backward compatibility get_pipeline_model_parallel_group = get_pp_group +# TODO: once 
we deprecate V0 KV transer, we can move this to +# be a non-global object. _KV_CONNECTOR_AGENT: Union[KVConnectorBase, KVConnectorBase_V1, None] = None From d8ec5a6f2bb8ef999ae893f6b3ce98ab9ed4f74f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 8 Apr 2025 02:47:29 +0000 Subject: [PATCH 016/116] updared Signed-off-by: rshaw@neuralmagic.com --- vllm/attention/layer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index b355a167abe2..7a9eb3939798 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -239,8 +239,7 @@ def forward( # KVConnector: start saving kvs to the connector. # NOTE: forward_context completion will block until # this operation is completed. - maybe_save_kv_layer_to_connector(self.layer_name, - self.kv_cache) + maybe_save_kv_layer_to_connector(self.layer_name, self.kv_cache) return output def calc_kv_scales(self, query, key, value): From fcd2dc954f842050e6323c0d795dee53e16bf813 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 8 Apr 2025 02:58:40 +0000 Subject: [PATCH 017/116] updated Signed-off-by: rshaw@neuralmagic.com --- examples/offline_inference/disaggrated-prefill-v1/run.sh | 1 + .../kv_transfer/kv_connector/v1/shared_storage_connector.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/offline_inference/disaggrated-prefill-v1/run.sh b/examples/offline_inference/disaggrated-prefill-v1/run.sh index 4c08f9920d53..0ebf45a1586a 100644 --- a/examples/offline_inference/disaggrated-prefill-v1/run.sh +++ b/examples/offline_inference/disaggrated-prefill-v1/run.sh @@ -1,4 +1,5 @@ rm -rf local_storage/ +rm output.txt VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 prefill_example.py VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 decode_example.py diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py 
b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index ce74e717ac6a..9fc4c8b57b0f 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -159,8 +159,8 @@ def inject_kv_into_layer( filename = self.generate_filename_debug( layer_name, request.token_ids) - kv_cache = safetensors.torch.load_file(filename)["kv_cache"].cuda( - ) # TODO: may need to handle the device here + kv_cache = safetensors.torch.load_file( + filename)["kv_cache"].cuda() inject_kv_into_layer(kv_cache_layer, kv_cache, request.slot_mapping) From 1f9c2527b0ca16f1558a923ac90a5f437c73ef7a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 8 Apr 2025 03:14:14 +0000 Subject: [PATCH 018/116] format Signed-off-by: rshaw@neuralmagic.com --- tests/disaggregated/__init__.py | 0 tests/disaggregated/test_simple_storage.py | 95 ++++++++++++++++++++++ 2 files changed, 95 insertions(+) create mode 100644 tests/disaggregated/__init__.py create mode 100644 tests/disaggregated/test_simple_storage.py diff --git a/tests/disaggregated/__init__.py b/tests/disaggregated/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/disaggregated/test_simple_storage.py b/tests/disaggregated/test_simple_storage.py new file mode 100644 index 000000000000..f692e98e8804 --- /dev/null +++ b/tests/disaggregated/test_simple_storage.py @@ -0,0 +1,95 @@ +# SPDX-License-Identifier: Apache-2.0 + +import os +import shutil + +import pytest + +from vllm import LLM, SamplingParams +from vllm.config import KVTransferConfig + + +@pytest.fixture(scope="function", autouse=True) +def cleanup(): + yield + if os.path.exists("output.txt"): + os.remove("output.txt") + if os.path.isdir("local_storage"): + shutil.rmtree("local_storage") + + +def test_integration(): + + sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) + + llm = LLM( + 
model="meta-llama/Llama-3.1-8B-Instruct", + enforce_eager=True, + gpu_memory_utilization=0.8, + kv_transfer_config=KVTransferConfig.from_cli( + '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both", ' + '"kv_connector_extra_config": ' + '{"shared_storage_path": "local_storage"}}')) + + context = "Hi " * 1000 + context2 = "Hey " * 500 + prompts = [ + context + "Hello, my name is", + context + "The capital of France is", + context2 + "Your name is", + context2 + "The capital of China is", + ] + + # 1ST generation (prefill instance) + outputs = llm.generate( + prompts, + sampling_params, + ) + + new_prompts = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + new_prompts.append(prompt + generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + # Write new_prompts to output.txt + with open("output.txt", "w") as f: + for prompt in new_prompts: + f.write(prompt + "\n") + print(f"Saved {len(new_prompts)} prompts to output.txt") + + del llm + + # Read prompts from output.txt + prompts = [] + try: + with open("output.txt") as f: + for line in f: + prompts.append(line.strip()) + print(f"Loaded {len(prompts)} prompts from output.txt") + except FileNotFoundError: + print("Error: output.txt file not found") + exit(-1) + + sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) + + decode_llm = LLM( + model="meta-llama/Llama-3.1-8B-Instruct", + enforce_eager=True, + gpu_memory_utilization=0.8, + kv_transfer_config=KVTransferConfig.from_cli( + '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both",' + '"kv_connector_extra_config": {"shared_storage_path": "local_storage"}}' # noqa: E501 + )) + + # 2nd generation (decode instance) + outputs = decode_llm.generate(prompts, sampling_params) + + new_prompts = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + new_prompts.append(prompt + generated_text) + print(f"Prompt: {prompt!r}, 
Generated text: {generated_text!r}") + assert len(generated_text) > 5 From 7350244a3a8e8b5fb402b1eea62b06b3541739d3 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Tue, 8 Apr 2025 03:35:52 +0000 Subject: [PATCH 019/116] [fix] typo to pass format checker Signed-off-by: ApostaC --- vllm/distributed/parallel_state.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 981c5258211a..40b10784a0e4 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -771,7 +771,7 @@ def get_pp_group() -> GroupCoordinator: # kept for backward compatibility get_pipeline_model_parallel_group = get_pp_group -# TODO: once we deprecate V0 KV transer, we can move this to +# TODO: once we deprecate V0 KV transfer, we can move this to # be a non-global object. _KV_CONNECTOR_AGENT: Union[KVConnectorBase, KVConnectorBase_V1, None] = None From 1586d58b486ad01840b82149ab8efd4c80c96806 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 8 Apr 2025 11:59:30 +0000 Subject: [PATCH 020/116] updated Signed-off-by: rshaw@neuralmagic.com --- .../kv_transfer/kv_connector/v1/shared_storage_connector.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index 9fc4c8b57b0f..db44c8bf9d53 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -188,7 +188,7 @@ def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, **kwargs: additional arguments for the save operation. 
""" - def extract_kv_cache_from_layer( + def extract_kv_from_layer( layer: torch.Tensor, slot_mapping: torch.Tensor, ) -> torch.Tensor: @@ -206,8 +206,8 @@ def extract_kv_cache_from_layer( if request.is_store: filename = self.generate_filename_debug( layer_name, request.token_ids) - kv_cache = extract_kv_cache_from_layer(kv_layer, - request.slot_mapping) + kv_cache = extract_kv_from_layer(kv_layer, + request.slot_mapping) tensors = {"kv_cache": kv_cache.cpu().detach()} safetensors.torch.save_file(tensors, filename) From 5accb536121b41dcb8bb0ad3d6febe36fd7c632b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 8 Apr 2025 16:00:29 +0000 Subject: [PATCH 021/116] stash Signed-off-by: rshaw@neuralmagic.com --- .../disaggrated-prefill-v1/decode_example.py | 2 +- .../disaggrated-prefill-v1/prefill_example.py | 2 +- examples/offline_inference/disaggrated-prefill-v1/run.sh | 4 ++-- vllm/attention/layer.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/offline_inference/disaggrated-prefill-v1/decode_example.py b/examples/offline_inference/disaggrated-prefill-v1/decode_example.py index 8fd47707a8bb..104103d2e1ce 100644 --- a/examples/offline_inference/disaggrated-prefill-v1/decode_example.py +++ b/examples/offline_inference/disaggrated-prefill-v1/decode_example.py @@ -18,7 +18,7 @@ llm = LLM( model="meta-llama/Llama-3.1-8B-Instruct", - enforce_eager=True, + enforce_eager=False, gpu_memory_utilization=0.8, kv_transfer_config=KVTransferConfig.from_cli( '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both",' diff --git a/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py b/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py index 9f5481e2c1ea..69c4e9c3a74f 100644 --- a/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py +++ b/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py @@ -15,7 +15,7 @@ sampling_params = SamplingParams(temperature=0, top_p=0.95, 
max_tokens=1) llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", - enforce_eager=True, + enforce_eager=False, gpu_memory_utilization=0.8, kv_transfer_config=KVTransferConfig.from_cli( '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both", ' diff --git a/examples/offline_inference/disaggrated-prefill-v1/run.sh b/examples/offline_inference/disaggrated-prefill-v1/run.sh index 0ebf45a1586a..e0acecadc51a 100644 --- a/examples/offline_inference/disaggrated-prefill-v1/run.sh +++ b/examples/offline_inference/disaggrated-prefill-v1/run.sh @@ -1,5 +1,5 @@ rm -rf local_storage/ rm output.txt -VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 prefill_example.py -VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 decode_example.py +VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=6 python3 prefill_example.py +VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=6 python3 decode_example.py diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 7a9eb3939798..33c10430bf22 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -361,8 +361,8 @@ def maybe_save_kv_layer_to_connector( kv_cache: List[torch.Tensor], ): if not has_kv_transfer_group() or not is_v1_kv_transfer_group(): + print("WE ARE HERE") return - connector = get_kv_transfer_group() forward_context: ForwardContext = get_forward_context() From 31d807e759cd8f3562159d9a53e88b2c6f424b1a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 8 Apr 2025 20:58:28 +0000 Subject: [PATCH 022/116] stash Signed-off-by: rshaw@neuralmagic.com --- .../disaggrated-prefill-v1/prefill_example.py | 2 +- vllm/attention/layer.py | 21 +++++++------ .../v1/shared_storage_connector.py | 30 ++++++++++++++++--- 3 files changed, 37 insertions(+), 16 deletions(-) diff --git a/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py b/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py index 69c4e9c3a74f..9f5481e2c1ea 100644 --- 
a/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py +++ b/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py @@ -15,7 +15,7 @@ sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", - enforce_eager=False, + enforce_eager=True, gpu_memory_utilization=0.8, kv_transfer_config=KVTransferConfig.from_cli( '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both", ' diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 33c10430bf22..e5ef38efaa2e 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -181,11 +181,6 @@ def forward( context using `vllm.forward_context.get_forward_context().attn_metadata`. """ - - # KVConnector: start async saving kvs to connector - # to the layers KV cache before running attention. - wait_for_kv_layer_from_connector(self.layer_name) - if self.calculate_kv_scales: attn_metadata = get_forward_context().attn_metadata if attn_metadata.enable_kv_scales_calculation: @@ -236,10 +231,6 @@ def forward( output = torch.ops.vllm.unified_attention( query, key, value, self.layer_name) - # KVConnector: start saving kvs to the connector. - # NOTE: forward_context completion will block until - # this operation is completed. 
- maybe_save_kv_layer_to_connector(self.layer_name, self.kv_cache) return output def calc_kv_scales(self, query, key, value): @@ -361,7 +352,6 @@ def maybe_save_kv_layer_to_connector( kv_cache: List[torch.Tensor], ): if not has_kv_transfer_group() or not is_v1_kv_transfer_group(): - print("WE ARE HERE") return connector = get_kv_transfer_group() @@ -380,11 +370,17 @@ def unified_attention( value: torch.Tensor, layer_name: str, ) -> torch.Tensor: + # wait_for_kv_layer_from_connector(layer_name) + forward_context: ForwardContext = get_forward_context() attn_metadata = forward_context.attn_metadata self = forward_context.no_compile_layers[layer_name] kv_cache = self.kv_cache[forward_context.virtual_engine] - return self.impl.forward(self, query, key, value, kv_cache, attn_metadata) + output = self.impl.forward(self, query, key, value, kv_cache, + attn_metadata) + + maybe_save_kv_layer_to_connector(layer_name, kv_cache) + return output def unified_attention_fake( @@ -412,6 +408,7 @@ def unified_attention_with_output( output: torch.Tensor, layer_name: str, ) -> None: + # wait_for_kv_layer_from_connector(layer_name) forward_context: ForwardContext = get_forward_context() attn_metadata = forward_context.attn_metadata self = forward_context.no_compile_layers[layer_name] @@ -424,6 +421,8 @@ def unified_attention_with_output( attn_metadata, output=output) + maybe_save_kv_layer_to_connector(layer_name, kv_cache) + def unified_attention_with_output_fake( query: torch.Tensor, diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index db44c8bf9d53..1951e6c5b5db 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -197,8 +197,12 @@ def extract_kv_from_layer( Assume the shape of the layer is (2, num_pages, page_size, xxx). 
""" num_pages, page_size = layer.shape[1], layer.shape[2] - return layer.reshape(2, num_pages * page_size, -1)[:, slot_mapping, - ...] + reshaped = layer.reshape(2, num_pages * page_size, -1) + print(f"{layer.shape=}") + print(f"{reshaped.shape=}") + print(f"{slot_mapping}") + + return reshaped[:, slot_mapping, ...] connector_metadata = self._get_connector_metadata() assert isinstance(connector_metadata, SharedStorageConnectorMetadata) @@ -208,8 +212,8 @@ def extract_kv_from_layer( layer_name, request.token_ids) kv_cache = extract_kv_from_layer(kv_layer, request.slot_mapping) - tensors = {"kv_cache": kv_cache.cpu().detach()} - safetensors.torch.save_file(tensors, filename) + assert False + # torch.ops.save_lib.save_safetensors(kv_cache, filename) def wait_for_save(self): return @@ -362,3 +366,21 @@ def align_to_block_size(num_tokens: int, block_size) -> int: """Align the number of tokens to the block size. """ return (num_tokens - 1) // block_size * block_size + + +# Register a custom library and print operator +import torch +from torch.library import Library, impl + +lib = Library("save_lib", "DEF") +lib.define("save_safetensors(Tensor kv_cache, str filename) -> ()") + + +@impl(lib, "save_safetensors", "CompositeExplicitAutograd") +def save_safetensors(kv_cache, filename): + # tensors = {"kv_cache": kv_cache.detach().cpu()} + # kv_cache = kv_cache.cpu() + # tensors = {"kv_cache": kv_cache} + # safetensors.torch.save_file(tensors, filename) + a = torch.empty(10) + return From a73721ab9ca16e3091dc6d6690c62faaa1e6af3d Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 9 Apr 2025 13:19:17 +0000 Subject: [PATCH 023/116] updated Signed-off-by: rshaw@neuralmagic.com --- .../disaggrated-prefill-v1/run.sh | 4 +-- vllm/attention/layer.py | 3 +- .../v1/shared_storage_connector.py | 31 ++++--------------- 3 files changed, 10 insertions(+), 28 deletions(-) diff --git a/examples/offline_inference/disaggrated-prefill-v1/run.sh 
b/examples/offline_inference/disaggrated-prefill-v1/run.sh index e0acecadc51a..07a57eb09403 100644 --- a/examples/offline_inference/disaggrated-prefill-v1/run.sh +++ b/examples/offline_inference/disaggrated-prefill-v1/run.sh @@ -1,5 +1,5 @@ rm -rf local_storage/ rm output.txt -VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=6 python3 prefill_example.py -VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=6 python3 decode_example.py +VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=5 python3 prefill_example.py +VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=5 python3 decode_example.py diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index e5ef38efaa2e..2cff01ebeed4 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -353,6 +353,7 @@ def maybe_save_kv_layer_to_connector( ): if not has_kv_transfer_group() or not is_v1_kv_transfer_group(): return + connector = get_kv_transfer_group() forward_context: ForwardContext = get_forward_context() @@ -370,7 +371,7 @@ def unified_attention( value: torch.Tensor, layer_name: str, ) -> torch.Tensor: - # wait_for_kv_layer_from_connector(layer_name) + wait_for_kv_layer_from_connector(layer_name) forward_context: ForwardContext = get_forward_context() attn_metadata = forward_context.attn_metadata diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index 1951e6c5b5db..ef3246f7e8f8 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -197,12 +197,11 @@ def extract_kv_from_layer( Assume the shape of the layer is (2, num_pages, page_size, xxx). 
""" num_pages, page_size = layer.shape[1], layer.shape[2] - reshaped = layer.reshape(2, num_pages * page_size, -1) print(f"{layer.shape=}") - print(f"{reshaped.shape=}") - print(f"{slot_mapping}") - - return reshaped[:, slot_mapping, ...] + print(f"{layer.reshape(2, num_pages * page_size, -1)=}") + print(f"{slot_mapping.shape=}") + return layer.reshape(2, num_pages * page_size, -1)[:, slot_mapping, + ...] connector_metadata = self._get_connector_metadata() assert isinstance(connector_metadata, SharedStorageConnectorMetadata) @@ -212,8 +211,8 @@ def extract_kv_from_layer( layer_name, request.token_ids) kv_cache = extract_kv_from_layer(kv_layer, request.slot_mapping) - assert False - # torch.ops.save_lib.save_safetensors(kv_cache, filename) + tensors = {"kv_cache": kv_cache.detach().cpu()} + safetensors.torch.save_file(tensors, filename) def wait_for_save(self): return @@ -366,21 +365,3 @@ def align_to_block_size(num_tokens: int, block_size) -> int: """Align the number of tokens to the block size. 
""" return (num_tokens - 1) // block_size * block_size - - -# Register a custom library and print operator -import torch -from torch.library import Library, impl - -lib = Library("save_lib", "DEF") -lib.define("save_safetensors(Tensor kv_cache, str filename) -> ()") - - -@impl(lib, "save_safetensors", "CompositeExplicitAutograd") -def save_safetensors(kv_cache, filename): - # tensors = {"kv_cache": kv_cache.detach().cpu()} - # kv_cache = kv_cache.cpu() - # tensors = {"kv_cache": kv_cache} - # safetensors.torch.save_file(tensors, filename) - a = torch.empty(10) - return From 00df670ccb3f0ff749016a8f73117f60c9e794e1 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 9 Apr 2025 13:20:14 +0000 Subject: [PATCH 024/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/attention/layer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 2cff01ebeed4..bb09a763bc08 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -409,7 +409,7 @@ def unified_attention_with_output( output: torch.Tensor, layer_name: str, ) -> None: - # wait_for_kv_layer_from_connector(layer_name) + wait_for_kv_layer_from_connector(layer_name) forward_context: ForwardContext = get_forward_context() attn_metadata = forward_context.attn_metadata self = forward_context.no_compile_layers[layer_name] From 4ebcc3ea21092aa9f2fca507e6770f9164268f5e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 9 Apr 2025 13:44:41 +0000 Subject: [PATCH 025/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/attention/layer.py | 3 +-- .../kv_transfer/kv_connector/v1/shared_storage_connector.py | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index bb09a763bc08..f75e4e459e91 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -349,7 +349,7 @@ def wait_for_kv_layer_from_connector(layer_name: str): def 
maybe_save_kv_layer_to_connector( layer_name: str, - kv_cache: List[torch.Tensor], + kv_cache_layer: List[torch.Tensor], ): if not has_kv_transfer_group() or not is_v1_kv_transfer_group(): return @@ -361,7 +361,6 @@ def maybe_save_kv_layer_to_connector( if attn_metadata is None: return - kv_cache_layer = kv_cache[forward_context.virtual_engine] connector.save_kv_layer(layer_name, kv_cache_layer, attn_metadata) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index ef3246f7e8f8..3624d84c9e7a 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -196,10 +196,10 @@ def extract_kv_from_layer( Assume the shape of the layer is (2, num_pages, page_size, xxx). """ + # TODO(rob): make this compatible with MLA. + + assert layer.shape[0] == 2 num_pages, page_size = layer.shape[1], layer.shape[2] - print(f"{layer.shape=}") - print(f"{layer.reshape(2, num_pages * page_size, -1)=}") - print(f"{slot_mapping.shape=}") return layer.reshape(2, num_pages * page_size, -1)[:, slot_mapping, ...] 
From da019df61e5be0ad86eb03bdf3aeb6601f7d99a5 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 9 Apr 2025 14:23:47 +0000 Subject: [PATCH 026/116] updated Signed-off-by: rshaw@neuralmagic.com --- .../offline_inference/disaggrated-prefill-v1/prefill_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py b/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py index 9f5481e2c1ea..69c4e9c3a74f 100644 --- a/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py +++ b/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py @@ -15,7 +15,7 @@ sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", - enforce_eager=True, + enforce_eager=False, gpu_memory_utilization=0.8, kv_transfer_config=KVTransferConfig.from_cli( '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both", ' From 90e8c53fd4557090e27d4178baed38168f04e648 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 9 Apr 2025 14:29:17 +0000 Subject: [PATCH 027/116] updated Signed-off-by: rshaw@neuralmagic.com --- .../offline_inference/disaggrated-prefill-v1/prefill_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py b/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py index 69c4e9c3a74f..1e1780d6dc7a 100644 --- a/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py +++ b/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py @@ -14,7 +14,7 @@ sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) -llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", +llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", enforce_eager=False, gpu_memory_utilization=0.8, kv_transfer_config=KVTransferConfig.from_cli( From 
8b3f606e849c308f2a02bfee6427e7af866a7f28 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 9 Apr 2025 14:29:26 +0000 Subject: [PATCH 028/116] updated Signed-off-by: rshaw@neuralmagic.com --- .../offline_inference/disaggrated-prefill-v1/decode_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/offline_inference/disaggrated-prefill-v1/decode_example.py b/examples/offline_inference/disaggrated-prefill-v1/decode_example.py index 104103d2e1ce..00e5f343003a 100644 --- a/examples/offline_inference/disaggrated-prefill-v1/decode_example.py +++ b/examples/offline_inference/disaggrated-prefill-v1/decode_example.py @@ -17,7 +17,7 @@ sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) llm = LLM( - model="meta-llama/Llama-3.1-8B-Instruct", + model="meta-llama/Llama-3.2-1B-Instruct", enforce_eager=False, gpu_memory_utilization=0.8, kv_transfer_config=KVTransferConfig.from_cli( From de1e487d9037af9e94c1462419d495ea0a59ff2a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 9 Apr 2025 15:01:14 +0000 Subject: [PATCH 029/116] fix nit Signed-off-by: rshaw@neuralmagic.com --- .../offline_inference/disaggrated-prefill-v1/decode_example.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/offline_inference/disaggrated-prefill-v1/decode_example.py b/examples/offline_inference/disaggrated-prefill-v1/decode_example.py index 00e5f343003a..760a70553101 100644 --- a/examples/offline_inference/disaggrated-prefill-v1/decode_example.py +++ b/examples/offline_inference/disaggrated-prefill-v1/decode_example.py @@ -28,9 +28,7 @@ # 1ST generation (prefill instance) outputs = llm.generate(prompts, sampling_params) -new_prompts = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text - new_prompts.append(prompt + generated_text) print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") From 48c2eb29561cfc1642132be76e5f9387951a9ad1 Mon Sep 17 00:00:00 2001 From: 
"rshaw@neuralmagic.com" Date: Wed, 9 Apr 2025 15:07:37 +0000 Subject: [PATCH 030/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/distributed/kv_transfer/kv_connector/factory.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index fac30324471d..fe90ed20d3d4 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -4,12 +4,8 @@ from typing import TYPE_CHECKING, Callable, Dict, Optional, Type, Union import vllm.envs as envs -# NOTE(Kuntai): We prefer not to directly the classes with "_V1" suffix. -# This makes it easier for us to deprecate code in v0 (which will happen soon). -# yapf: disable from vllm.distributed.kv_transfer.kv_connector.v1 import (KVConnectorBase_V1, KVConnectorRole) -# yapf: enable from vllm.logger import init_logger from .base import KVConnectorBase From e72e5e446061c6f15e2fd409bf3447f64a0d3656 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 9 Apr 2025 15:08:34 +0000 Subject: [PATCH 031/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/distributed/kv_transfer/kv_connector/v1/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py b/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py index ddad33e27fc4..a017b140e090 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py @@ -1,10 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -# yapf: disable from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, KVConnectorRole) -# yapf: enable - __all__ = [ "KVConnectorRole", "KVConnectorBase_V1", From 78336452e159d6e38bea160534d7132d75eab2b6 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 9 Apr 2025 15:11:18 +0000 Subject: [PATCH 032/116] updared 
Signed-off-by: rshaw@neuralmagic.com --- vllm/distributed/kv_transfer/kv_connector_agent.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector_agent.py b/vllm/distributed/kv_transfer/kv_connector_agent.py index a3b53d2184fe..96ef2fdd4127 100644 --- a/vllm/distributed/kv_transfer/kv_connector_agent.py +++ b/vllm/distributed/kv_transfer/kv_connector_agent.py @@ -16,10 +16,7 @@ from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase from vllm.distributed.kv_transfer.kv_connector.factory import ( KVConnectorFactory) -# yapf: disable -from vllm.distributed.kv_transfer.kv_connector.v1 import ( - KVConnectorRole as KVConnectorRole_V1) -# yapf: enable +from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorRole from vllm.logger import init_logger from vllm.sequence import IntermediateTensors @@ -55,7 +52,7 @@ def __init__( "TransferAgent should only be used when kv_connector is set." self.connector = KVConnectorFactory.create_connector( - rank, local_rank, config, KVConnectorRole_V1.WORKER) + rank, local_rank, config, KVConnectorRole.WORKER) def send_kv_caches_and_hidden_states( self, From 1881aa5f337fd9c3afcf0cbabc6a3371d7e6a679 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 9 Apr 2025 15:24:55 +0000 Subject: [PATCH 033/116] updated Signed-off-by: rshaw@neuralmagic.com --- .../kv_transfer/kv_connector/factory.py | 23 +++++++++++++++---- .../kv_transfer/kv_connector/v1/base.py | 9 +++----- .../v1/shared_storage_connector.py | 19 ++++++--------- .../kv_transfer/kv_connector_agent.py | 5 ++-- 4 files changed, 31 insertions(+), 25 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index fe90ed20d3d4..657aa24fa954 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -34,10 +34,25 @@ def loader() -> 
Type[Union[KVConnectorBase, KVConnectorBase_V1]]: cls._registry[name] = loader @classmethod - def create_connector( - cls, rank: Optional[int], local_rank: Optional[int], - config: "VllmConfig", role: KVConnectorRole - ) -> Union[KVConnectorBase, KVConnectorBase_V1]: + def create_connector_v1( + cls, + config: "VllmConfig", + role: KVConnectorRole, + ) -> KVConnectorBase_V1: + if not envs.VLLM_USE_V1: + raise ValueError("Attempting to initialize a V1 Connector, " + f"but found {envs.VLLM_USE_V1=}") + + connector_name = config.kv_transfer_config.kv_connector + connector_cls = cls._registry[connector_name]() + assert issubclass(connector_cls, KVConnectorBase_V1) + logger.info("Creating v1 connector with name: %s", connector_name) + return connector_cls(config, role) + + @classmethod + def create_connector_v0(cls, rank: Optional[int], + local_rank: Optional[int], config: "VllmConfig", + role: KVConnectorRole) -> KVConnectorBase: connector_name = config.kv_transfer_config.kv_connector if connector_name not in cls._registry: raise ValueError(f"Unsupported connector type: {connector_name}") diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index e6dc281f06e7..490b4f613f61 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -9,7 +9,7 @@ import enum from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING import torch @@ -39,12 +39,9 @@ class KVConnectorMetadata: class KVConnectorBase_V1(ABC): - def __init__(self, rank: Optional[int], local_rank: Optional[int], - config: "VllmConfig", role: KVConnectorRole): + def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): self._connector_metadata = KVConnectorMetadata() - self._rank = rank - self._local_rank = local_rank - self._config = config + self._vllm_config = vllm_config 
self._role = role @property diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index 3624d84c9e7a..c850bc29af1a 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -2,7 +2,7 @@ import hashlib import os from dataclasses import dataclass -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING import safetensors import torch @@ -77,19 +77,14 @@ class SharedStorageConnector(KVConnectorBase_V1): # It does extra work which will overwrite the existing prefix-cache in GPU # - to remove the overhead, need to add some "mask" in the ReqMeta class - def __init__(self, rank: Optional[int], local_rank: Optional[int], - config: "VllmConfig", role: KVConnectorRole): - super().__init__( - rank=rank, - local_rank=local_rank, - config=config, - role=role, - ) - self._block_size = config.cache_config.block_size + def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): + super().__init__(vllm_config=vllm_config, role=role) + self._block_size = vllm_config.cache_config.block_size self._requests_need_load: list[str] = [] - self._storage_path = config.kv_transfer_config.get_from_extra_config( + transfer_config = vllm_config.kv_transfer_config + self._storage_path = transfer_config.get_from_extra_config( "shared_storage_path", "/tmp") - logger.info(config.kv_transfer_config) + logger.info(vllm_config.kv_transfer_config) logger.info("Shared storage path is %s", self._storage_path) def start_load_kv(self, forward_context: "ForwardContext", diff --git a/vllm/distributed/kv_transfer/kv_connector_agent.py b/vllm/distributed/kv_transfer/kv_connector_agent.py index 96ef2fdd4127..c6082e1a18e7 100644 --- a/vllm/distributed/kv_transfer/kv_connector_agent.py +++ b/vllm/distributed/kv_transfer/kv_connector_agent.py @@ -16,7 +16,6 @@ from 
vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase from vllm.distributed.kv_transfer.kv_connector.factory import ( KVConnectorFactory) -from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorRole from vllm.logger import init_logger from vllm.sequence import IntermediateTensors @@ -51,8 +50,8 @@ def __init__( assert self.config.kv_transfer_config.is_kv_transfer_instance, "KV"\ "TransferAgent should only be used when kv_connector is set." - self.connector = KVConnectorFactory.create_connector( - rank, local_rank, config, KVConnectorRole.WORKER) + self.connector = KVConnectorFactory.create_connector_v0( + rank, local_rank, config) def send_kv_caches_and_hidden_states( self, From eca7a4996c90ce5d4aafa5649711818152b82c8c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 9 Apr 2025 15:25:24 +0000 Subject: [PATCH 034/116] cleaning Signed-off-by: rshaw@neuralmagic.com --- vllm/distributed/kv_transfer/kv_connector_agent.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector_agent.py b/vllm/distributed/kv_transfer/kv_connector_agent.py index c6082e1a18e7..f12136f0e591 100644 --- a/vllm/distributed/kv_transfer/kv_connector_agent.py +++ b/vllm/distributed/kv_transfer/kv_connector_agent.py @@ -25,9 +25,6 @@ class KVConnectorAgent: """ A class designated for distributed KV transfer - - This class currently only wraps one KV connector. But in the future, it may - wrap multiple connectors to support more use cases. Target use cases: 1. 
Disaggregated prefill From b0629bddf3a84684797a03205a127886a5a6ed6b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 9 Apr 2025 15:27:58 +0000 Subject: [PATCH 035/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/attention/layer.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index f75e4e459e91..4e21e02f64bd 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -219,20 +219,18 @@ def forward( else: torch.ops.vllm.unified_attention_with_output( query, key, value, output, self.layer_name) - output = output.view(-1, hidden_size) + return output.view(-1, hidden_size) else: if self.use_direct_call: forward_context = get_forward_context() attn_metadata = forward_context.attn_metadata self_kv_cache = self.kv_cache[forward_context.virtual_engine] - output = self.impl.forward(self, query, key, value, - self_kv_cache, attn_metadata) + return self.impl.forward(self, query, key, value, + self_kv_cache, attn_metadata) else: - output = torch.ops.vllm.unified_attention( + return torch.ops.vllm.unified_attention( query, key, value, self.layer_name) - return output - def calc_kv_scales(self, query, key, value): self._q_scale.copy_(torch.abs(query).max() / self.q_range) self._k_scale.copy_(torch.abs(key).max() / self.k_range) From 7766ca51bdf90006768e5e080e854c690613da80 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 9 Apr 2025 15:31:33 +0000 Subject: [PATCH 036/116] updated Signed-off-by: rshaw@neuralmagic.com --- .../kv_transfer/kv_connector/factory.py | 44 +++++++++---------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index 657aa24fa954..25e5dc6ef6aa 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -1,7 +1,7 @@ # 
SPDX-License-Identifier: Apache-2.0 import importlib -from typing import TYPE_CHECKING, Callable, Dict, Optional, Type, Union +from typing import TYPE_CHECKING, Callable, Dict, Type, Union import vllm.envs as envs from vllm.distributed.kv_transfer.kv_connector.v1 import (KVConnectorBase_V1, @@ -47,36 +47,32 @@ def create_connector_v1( connector_cls = cls._registry[connector_name]() assert issubclass(connector_cls, KVConnectorBase_V1) logger.info("Creating v1 connector with name: %s", connector_name) + # NOTE(Kuntai): v1 connector is explicitly separated into two roles. + # Scheduler connector: + # - Co-colate with scheduler process + # - Should only be used inside the Scheduler class + # Worker connector: + # - Co-locate with worker process + # - Should only be used inside the forward context & attention layer + # We build separately to enforce strict separation return connector_cls(config, role) @classmethod - def create_connector_v0(cls, rank: Optional[int], - local_rank: Optional[int], config: "VllmConfig", - role: KVConnectorRole) -> KVConnectorBase: + def create_connector_v0(cls, rank: int, local_rank: int, + config: "VllmConfig") -> KVConnectorBase: + if envs.VLLM_USE_V1: + raise ValueError("Attempting to initialize a V0 Connector, " + f"but found {envs.VLLM_USE_V1=}") + connector_name = config.kv_transfer_config.kv_connector if connector_name not in cls._registry: raise ValueError(f"Unsupported connector type: {connector_name}") - if envs.VLLM_USE_V1: - # NOTE(Kuntai): v1 connector is explicitly separated into two roles. 
- # Scheduler connector: - # - Co-colate with scheduler process - # - Should only be used inside the Scheduler class - # Worker connector: - # - Co-locate with worker process - # - Should only be used inside the forward context & attention layer - # We build these two connectors separately to enforce strict - # separation - connector_cls_v1 = cls._registry[connector_name]() - assert issubclass(connector_cls_v1, KVConnectorBase_V1) - logger.info("Creating v1 connector with name: %s", connector_name) - return connector_cls_v1(rank, local_rank, config, role) - else: - assert rank is not None - assert local_rank is not None - connector_cls = cls._registry[connector_name]() - assert issubclass(connector_cls, KVConnectorBase) - return connector_cls(rank, local_rank, config) + assert rank is not None + assert local_rank is not None + connector_cls = cls._registry[connector_name]() + assert issubclass(connector_cls, KVConnectorBase) + return connector_cls(rank, local_rank, config) # Register various connectors here. 
From 7b64acbc4c8f3ef9c6a975b3ed246799be8e5e7f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 9 Apr 2025 15:32:47 +0000 Subject: [PATCH 037/116] clean up code Signed-off-by: rshaw@neuralmagic.com --- .../kv_transfer/kv_connector/factory.py | 32 +++++++++---------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index 25e5dc6ef6aa..07c52e4cabd2 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -33,6 +33,21 @@ def loader() -> Type[Union[KVConnectorBase, KVConnectorBase_V1]]: cls._registry[name] = loader + @classmethod + def create_connector_v0(cls, rank: int, local_rank: int, + config: "VllmConfig") -> KVConnectorBase: + if envs.VLLM_USE_V1: + raise ValueError("Attempting to initialize a V0 Connector, " + f"but found {envs.VLLM_USE_V1=}") + + connector_name = config.kv_transfer_config.kv_connector + if connector_name not in cls._registry: + raise ValueError(f"Unsupported connector type: {connector_name}") + + connector_cls = cls._registry[connector_name]() + assert issubclass(connector_cls, KVConnectorBase) + return connector_cls(rank, local_rank, config) + @classmethod def create_connector_v1( cls, @@ -57,23 +72,6 @@ def create_connector_v1( # We build separately to enforce strict separation return connector_cls(config, role) - @classmethod - def create_connector_v0(cls, rank: int, local_rank: int, - config: "VllmConfig") -> KVConnectorBase: - if envs.VLLM_USE_V1: - raise ValueError("Attempting to initialize a V0 Connector, " - f"but found {envs.VLLM_USE_V1=}") - - connector_name = config.kv_transfer_config.kv_connector - if connector_name not in cls._registry: - raise ValueError(f"Unsupported connector type: {connector_name}") - - assert rank is not None - assert local_rank is not None - connector_cls = cls._registry[connector_name]() - assert 
issubclass(connector_cls, KVConnectorBase) - return connector_cls(rank, local_rank, config) - # Register various connectors here. # The registration should not be done in each individual file, as we want to From b1310fd1a9e4a69857196f340297e8c6e852cc23 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 9 Apr 2025 15:36:37 +0000 Subject: [PATCH 038/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/sched/scheduler.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index dc22e89a3436..af90053c1f1d 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -8,6 +8,9 @@ from typing import Optional, Union from vllm.config import VllmConfig +from vllm.distributed.kv_transfer.kv_connector.factory import ( + KVConnectorFactory) +from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorRole from vllm.logger import init_logger from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager, @@ -62,15 +65,8 @@ def __init__( # create connector if self.vllm_config.kv_transfer_config is not None: - from vllm.distributed.kv_transfer.kv_connector.factory import ( - KVConnectorFactory) - from vllm.distributed.kv_transfer.kv_connector.v1 import ( - KVConnectorRole as KVConnectorRole_V1) self.connector = KVConnectorFactory.create_connector( - rank=None, - local_rank=None, - config=self.vllm_config, - role=KVConnectorRole_V1.SCHEDULER) + config=self.vllm_config, role=KVConnectorRole.SCHEDULER) else: self.connector = None From 689379e5658b84583b36164cc216ca92d111164e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 9 Apr 2025 15:39:40 +0000 Subject: [PATCH 039/116] updaed Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/sched/scheduler.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git 
a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index af90053c1f1d..909ecb4105b4 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -63,7 +63,9 @@ def __init__( self.scheduler_config.max_num_batched_tokens self.max_model_len = self.scheduler_config.max_model_len - # create connector + # Create KVConnector for the Scheduler. Note that each Worker + # will have a corresponding KVConnector with Role=WORKER. + # KV Connector pushes/pull of remote KVs for P/D and offloading. if self.vllm_config.kv_transfer_config is not None: self.connector = KVConnectorFactory.create_connector( config=self.vllm_config, role=KVConnectorRole.SCHEDULER) From 62e14218eb8a2cb2e2bd60406fe8df8eec6b8586 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 9 Apr 2025 15:47:40 +0000 Subject: [PATCH 040/116] updated Signed-off-by: rshaw@neuralmagic.com --- .../kv_transfer/kv_connector/v1/shared_storage_connector.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index c850bc29af1a..e5c1dac659eb 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -191,8 +191,7 @@ def extract_kv_from_layer( Assume the shape of the layer is (2, num_pages, page_size, xxx). """ - # TODO(rob): make this compatible with MLA. - + # TODO: make this compatible with MLA. assert layer.shape[0] == 2 num_pages, page_size = layer.shape[1], layer.shape[2] return layer.reshape(2, num_pages * page_size, -1)[:, slot_mapping, @@ -297,7 +296,6 @@ def attach_connector_meta( """ meta = SharedStorageConnectorMetadata() for request in scheduler_output.scheduled_new_reqs: - # T^T, why there is both req_id and request_id???? 
if request.req_id in self._requests_need_load: meta.add_request(request, self._block_size, is_store=False) else: From 5145566eec9ab5b869896826946808afec082997 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 9 Apr 2025 15:52:03 +0000 Subject: [PATCH 041/116] updated Signed-off-by: rshaw@neuralmagic.com --- .../kv_transfer/kv_connector/v1/base.py | 11 +++++------ .../kv_connector/v1/shared_storage_connector.py | 14 ++++++-------- vllm/v1/core/sched/scheduler.py | 4 ++-- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 490b4f613f61..0a56fb94f96e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -159,7 +159,7 @@ def get_external_prefix_cache_blocks( Get the external prefix cache blocks from the connector. This function may change the state of the connector, which will - be used by `attach_connector_meta` later. + be used by `build_connector_meta` later. This function will also allocate/free the blocks dynamically when there is remote cache hit. @@ -178,13 +178,12 @@ def get_external_prefix_cache_blocks( pass @abstractmethod - def attach_connector_meta( - self, scheduler_output: SchedulerOutput) -> SchedulerOutput: + def build_connector_meta( + self, scheduler_output: SchedulerOutput) -> KVConnectorMetadata: """ - Attach the connector metadata to the request object. + Build the connector metadata for this step. - This function should NOT modify other fields in the scheduler_output - except the `connector_metadata` field. + This function should NOT modify fields in the scheduler_output. Also, calling this function will reset the state of the connector. 
Args: diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index e5c1dac659eb..592cef243b14 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -221,7 +221,7 @@ def get_external_prefix_cache_blocks( """Get the external prefix cache blocks from the connector. This function may change the state of the connector, which will be - used by `attach_connector_meta` later. + used by `build_connector_meta` later. Args: request (Request): the request object. @@ -283,12 +283,11 @@ def get_external_prefix_cache_blocks( else: return computed_blocks - def attach_connector_meta( - self, scheduler_output: SchedulerOutput) -> SchedulerOutput: - """Attach the connector metadata to the request object. + def build_connector_meta( + self, scheduler_output: SchedulerOutput) -> KVConnectorMetadata: + """Build the connector metadata for this step. - This function should NOT modify other fields in the scheduler_output - except the `kv_connector_metadata` field. + This function should NOT modify any fields in the scheduler_output. Also, calling this function will reset the state of the connector. Args: @@ -304,10 +303,9 @@ def attach_connector_meta( # store and load status if not self.found_match_for_request(request): meta.add_request(request, self._block_size, is_store=True) - scheduler_output.kv_connector_metadata = meta self._requests_need_load.clear() - return scheduler_output + return meta # ============================== # Helper functions diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 909ecb4105b4..527b729e0900 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -453,8 +453,8 @@ def schedule(self) -> SchedulerOutput: # 2. Wrap up all the KV cache load / save ops into an opaque object # 3. 
Clear the internal states of the connector if self.connector is not None: - scheduler_output = self.connector.attach_connector_meta( - scheduler_output) + meta = self.connector.build_connector_meta(scheduler_output) + scheduler_output.kv_connector_metadata = meta # Advance the number of computed tokens for the request AFTER # the request is scheduled. From 20decdf2cf571a33e5a7d3e096b5bfbd4cd1d2bc Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 9 Apr 2025 16:06:15 +0000 Subject: [PATCH 042/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/distributed/kv_transfer_state.py | 74 +++++++++++++++++++++++++++ vllm/distributed/parallel_state.py | 73 +------------------------- 2 files changed, 75 insertions(+), 72 deletions(-) create mode 100644 vllm/distributed/kv_transfer_state.py diff --git a/vllm/distributed/kv_transfer_state.py b/vllm/distributed/kv_transfer_state.py new file mode 100644 index 000000000000..e6645829d26e --- /dev/null +++ b/vllm/distributed/kv_transfer_state.py @@ -0,0 +1,74 @@ +# SPDX-License-Identifier: Apache-2.0 +from typing import TYPE_CHECKING, Union + +from vllm import envs +from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase +from vllm.distributed.kv_transfer.kv_connector.factory import ( + KVConnectorFactory) +from vllm.distributed.kv_transfer.kv_connector.v1 import (KVConnectorBase_V1, + KVConnectorRole) +from vllm.distributed.parallel_state import get_world_group + +if TYPE_CHECKING: + from vllm.config import VllmConfig + +_KV_CONNECTOR_AGENT: Union[KVConnectorBase, KVConnectorBase_V1, None] = None + + +def get_kv_transfer_group() -> Union[KVConnectorBase, KVConnectorBase_V1]: + assert _KV_CONNECTOR_AGENT is not None, ( + "disaggregated KV cache transfer parallel group is not initialized") + return _KV_CONNECTOR_AGENT + + +def has_kv_transfer_group() -> bool: + return _KV_CONNECTOR_AGENT is not None + + +def is_v1_kv_transfer_group( + connector: Union[KVConnectorBase_V1, KVConnectorBase, + 
None] = None) -> bool: + """Check if the KV connector is the v1 connector. + If the argument is None, it will check the global KV connector + + Args: + connector: The KV connector to check. If None, it will check the + global KV connector. + + Note: + This function will no-longer be needed after the v1 KV connector + becomes the default. + """ + if connector is None: + connector = _KV_CONNECTOR_AGENT + + if connector is None: + return False + + return isinstance(connector, KVConnectorBase_V1) + + +def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None: + """ + Initialize KV cache transfer parallel group. + """ + + global _KV_CONNECTOR_AGENT + + if vllm_config.kv_transfer_config is None: + return + + if all([ + vllm_config.kv_transfer_config.is_kv_transfer_instance, + _KV_CONNECTOR_AGENT is None + ]): + + if envs.VLLM_USE_V1: + _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector_v1( + config=vllm_config, role=KVConnectorRole.WORKER) + else: + _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector_v0( + rank=get_world_group().rank, + local_rank=get_world_group().local_rank, + config=vllm_config, + ) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 40b10784a0e4..0218104e0447 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -40,15 +40,13 @@ import vllm.envs as envs from vllm.distributed.device_communicators.base_device_communicator import ( DeviceCommunicatorBase) -from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase -from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1 from vllm.distributed.utils import StatelessProcessGroup from vllm.logger import init_logger from vllm.utils import (direct_register_custom_op, resolve_obj_by_qualname, supports_custom_op) if TYPE_CHECKING: - from vllm.config import VllmConfig + pass @dataclass @@ -771,44 +769,6 @@ def get_pp_group() -> GroupCoordinator: # kept for backward 
compatibility get_pipeline_model_parallel_group = get_pp_group -# TODO: once we deprecate V0 KV transfer, we can move this to -# be a non-global object. -_KV_CONNECTOR_AGENT: Union[KVConnectorBase, KVConnectorBase_V1, None] = None - - -def get_kv_transfer_group() -> Union[KVConnectorBase, KVConnectorBase_V1]: - assert _KV_CONNECTOR_AGENT is not None, ( - "disaggregated KV cache transfer parallel group is not initialized") - return _KV_CONNECTOR_AGENT - - -def has_kv_transfer_group() -> bool: - return _KV_CONNECTOR_AGENT is not None - - -def is_v1_kv_transfer_group( - connector: Union[KVConnectorBase_V1, KVConnectorBase, - None] = None) -> bool: - """Check if the KV connector is the v1 connector. - If the argument is None, it will check the global KV connector - - Args: - connector: The KV connector to check. If None, it will check the - global KV connector. - - Note: - This function will no-longer be needed after the v1 KV connector - becomes the default. - """ - if connector is None: - connector = _KV_CONNECTOR_AGENT - - if connector is None: - # Global KV connector is not set - return False - - return isinstance(connector, KVConnectorBase_V1) - @contextmanager def graph_capture(device: torch.device): @@ -991,37 +951,6 @@ def initialize_model_parallel( _DP.rank_in_group, _PP.rank_in_group, _TP.rank_in_group) -def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None: - """ - Initialize KV cache transfer parallel group. 
- """ - - global _KV_CONNECTOR_AGENT - - if vllm_config.kv_transfer_config is None: - return - - if all([ - vllm_config.kv_transfer_config.is_kv_transfer_instance, - _KV_CONNECTOR_AGENT is None - ]): - from vllm.distributed.kv_transfer.kv_connector.factory import ( - KVConnectorFactory) - from vllm.distributed.kv_transfer.kv_connector.v1 import ( - KVConnectorRole as KVConnectorRole_V1) - - kwargs = { - "rank": get_world_group().rank, - "local_rank": get_world_group().local_rank, - "config": vllm_config, - # NOTE(Kuntai): - # Parallel state is initialized in v1 worker, - # so this connector is for sure worker connector. - "role": KVConnectorRole_V1.WORKER, - } - _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(**kwargs) - - def ensure_model_parallel_initialized( tensor_model_parallel_size: int, pipeline_model_parallel_size: int, From fc58dd5c7b4a62cead89fe4d321c21272a6e3938 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 9 Apr 2025 16:13:39 +0000 Subject: [PATCH 043/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/attention/layer.py | 5 +++-- vllm/distributed/kv_transfer/__init__.py | 9 +++++++++ vllm/distributed/{ => kv_transfer}/kv_transfer_state.py | 0 vllm/forward_context.py | 5 +++-- 4 files changed, 15 insertions(+), 4 deletions(-) rename vllm/distributed/{ => kv_transfer}/kv_transfer_state.py (100%) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 4e21e02f64bd..c4d7ec8bf8c3 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -10,8 +10,9 @@ from vllm.attention import AttentionType from vllm.attention.selector import backend_name_to_enum, get_attn_backend from vllm.config import CacheConfig, get_current_vllm_config -from vllm.distributed import (get_kv_transfer_group, has_kv_transfer_group, - is_v1_kv_transfer_group) +from vllm.distributed.kv_transfer_state import (get_kv_transfer_group, + has_kv_transfer_group, + is_v1_kv_transfer_group) from vllm.forward_context import ForwardContext, 
get_forward_context from vllm.model_executor.layers.linear import UnquantizedLinearMethod from vllm.model_executor.layers.quantization.base_config import ( diff --git a/vllm/distributed/kv_transfer/__init__.py b/vllm/distributed/kv_transfer/__init__.py index e69de29bb2d1..4aa6ab4823c0 100644 --- a/vllm/distributed/kv_transfer/__init__.py +++ b/vllm/distributed/kv_transfer/__init__.py @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: Apache-2.0 +from vllm.distributed.kv_transfer.kv_transfer_state import ( + ensure_kv_transfer_initialized, get_kv_transfer_group, + has_kv_transfer_group, is_v1_kv_transfer_group) + +__all__ = [ + "get_kv_transfer_group", "has_kv_transfer_group", + "is_v1_kv_transfer_group", "ensure_kv_transfer_initialized" +] diff --git a/vllm/distributed/kv_transfer_state.py b/vllm/distributed/kv_transfer/kv_transfer_state.py similarity index 100% rename from vllm/distributed/kv_transfer_state.py rename to vllm/distributed/kv_transfer/kv_transfer_state.py diff --git a/vllm/forward_context.py b/vllm/forward_context.py index c7ae7f13380e..4a399bc6c248 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -11,8 +11,9 @@ import vllm.envs as envs from vllm.config import VllmConfig -from vllm.distributed import (get_kv_transfer_group, has_kv_transfer_group, - is_v1_kv_transfer_group) +from vllm.distributed.kv_transfer import (get_kv_transfer_group, + has_kv_transfer_group, + is_v1_kv_transfer_group) # yapf: disable from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1 # yapf: enable From 25c95920c71d92ee4c467bffc7bebfd5a1f76e43 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 9 Apr 2025 16:20:41 +0000 Subject: [PATCH 044/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/distributed/parallel_state.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 0218104e0447..ef7dbc57169e 100644 --- 
a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -29,8 +29,7 @@ from contextlib import contextmanager, nullcontext from dataclasses import dataclass from multiprocessing import shared_memory -from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, - Union) +from typing import Any, Callable, Dict, List, Optional, Tuple, Union from unittest.mock import patch import torch @@ -45,9 +44,6 @@ from vllm.utils import (direct_register_custom_op, resolve_obj_by_qualname, supports_custom_op) -if TYPE_CHECKING: - pass - @dataclass class GraphCaptureContext: From 40e5d81797ecbef733dda63149b89e5061c46217 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 9 Apr 2025 16:25:04 +0000 Subject: [PATCH 045/116] refactor Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/worker/gpu_worker.py | 4 ++-- vllm/worker/worker.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 59f407e5751b..3a29f8d0deef 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -11,10 +11,10 @@ import vllm.envs as envs from vllm.config import VllmConfig from vllm.device_allocator.cumem import CuMemAllocator -from vllm.distributed import (ensure_kv_transfer_initialized, - ensure_model_parallel_initialized, +from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment, set_custom_all_reduce) +from vllm.distributed.kv_transfer import ensure_kv_transfer_initialized from vllm.distributed.parallel_state import get_pp_group from vllm.logger import init_logger from vllm.lora.request import LoRARequest diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index d59f20f49996..9ea003bec5e0 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -10,10 +10,10 @@ import vllm.envs as envs from vllm.config import VllmConfig from vllm.device_allocator.cumem import CuMemAllocator -from vllm.distributed 
import (ensure_kv_transfer_initialized, - ensure_model_parallel_initialized, +from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment, set_custom_all_reduce) +from vllm.distributed.kv_transfer import ensure_kv_transfer_initialized from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor import set_random_seed From e64f7451d80b759c2fe4a108fb4780ad3745eb9a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 9 Apr 2025 16:28:51 +0000 Subject: [PATCH 046/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/attention/layer.py | 6 +++--- vllm/v1/worker/gpu_model_runner.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index c4d7ec8bf8c3..68452f4c03b0 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -10,9 +10,9 @@ from vllm.attention import AttentionType from vllm.attention.selector import backend_name_to_enum, get_attn_backend from vllm.config import CacheConfig, get_current_vllm_config -from vllm.distributed.kv_transfer_state import (get_kv_transfer_group, - has_kv_transfer_group, - is_v1_kv_transfer_group) +from vllm.distributed.kv_transfer import (get_kv_transfer_group, + has_kv_transfer_group, + is_v1_kv_transfer_group) from vllm.forward_context import ForwardContext, get_forward_context from vllm.model_executor.layers.linear import UnquantizedLinearMethod from vllm.model_executor.layers.quantization.base_config import ( diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9d9e59a9367f..57f1351e7d06 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -13,7 +13,8 @@ from vllm.attention import AttentionType, get_attn_backend from vllm.attention.layer import Attention from vllm.config import CompilationLevel, VllmConfig -from vllm.distributed import get_kv_transfer_group, has_kv_transfer_group +from 
vllm.distributed.kv_transfer import (get_kv_transfer_group, + has_kv_transfer_group) from vllm.distributed.parallel_state import get_pp_group, graph_capture from vllm.forward_context import set_forward_context from vllm.logger import init_logger From 74af2332c238cc3ce62a6ce984ddf239be47dbf4 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 9 Apr 2025 16:44:01 +0000 Subject: [PATCH 047/116] done with nits Signed-off-by: rshaw@neuralmagic.com --- vllm/forward_context.py | 2 -- vllm/worker/model_runner.py | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 4a399bc6c248..34ec0441f69a 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -14,9 +14,7 @@ from vllm.distributed.kv_transfer import (get_kv_transfer_group, has_kv_transfer_group, is_v1_kv_transfer_group) -# yapf: disable from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1 -# yapf: enable from vllm.logger import init_logger if TYPE_CHECKING: diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 86e6d9752013..36f8437989b1 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -23,7 +23,8 @@ from vllm.attention.backends.utils import CommonAttentionState from vllm.config import CompilationLevel, VllmConfig from vllm.core.scheduler import SchedulerOutputs -from vllm.distributed import get_kv_transfer_group, get_pp_group +from vllm.distributed import get_pp_group +from vllm.distributed.kv_transfer import get_kv_transfer_group from vllm.distributed.parallel_state import (get_tensor_model_parallel_rank, graph_capture) from vllm.forward_context import get_forward_context, set_forward_context From 7c31e29199e734459356e88e612612d45032b59d Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 9 Apr 2025 17:03:55 +0000 Subject: [PATCH 048/116] nits Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/kv_cache_manager.py | 2 +- 
vllm/v1/core/sched/output.py | 2 +- vllm/v1/core/sched/scheduler.py | 5 ++--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 0f85d8a9dc4a..352bac87c0ec 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -30,7 +30,7 @@ def __init__( caching_hash_algo: str = "builtin", num_preallocate_tokens: int = 64, log_stats: bool = False, - connector: "KVConnectorBase_V1" = None, + connector: Optional["KVConnectorBase_V1"] = None, ) -> None: assert len(kv_cache_config.kv_cache_groups) == 1, ( "KVCacheManager does not support hybrid models with more than 1 " diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index 27ceb8cc0402..1d3f1f41f8fb 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -124,5 +124,5 @@ class SchedulerOutput: # the bitmask for the whole batch grammar_bitmask: Optional[npt.NDArray[np.int32]] - # the connector metadata + # KV Cache Connector metadata. kv_connector_metadata: Optional[KVConnectorMetadata] = None diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 527b729e0900..9d8b0945c483 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -66,11 +66,10 @@ def __init__( # Create KVConnector for the Scheduler. Note that each Worker # will have a corresponding KVConnector with Role=WORKER. # KV Connector pushes/pull of remote KVs for P/D and offloading. 
+ self.connector = None if self.vllm_config.kv_transfer_config is not None: - self.connector = KVConnectorFactory.create_connector( + self.connector = KVConnectorFactory.create_connector_v1( config=self.vllm_config, role=KVConnectorRole.SCHEDULER) - else: - self.connector = None num_gpu_blocks = self.cache_config.num_gpu_blocks assert isinstance(num_gpu_blocks, int) and num_gpu_blocks > 0 From 7f57f3c90b0931e07e7b5f3deb43d8e489693846 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 9 Apr 2025 17:13:31 +0000 Subject: [PATCH 049/116] update lifecycle Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/worker/gpu_model_runner.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 57f1351e7d06..7980690dc7db 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -979,6 +979,11 @@ def execute_model( scheduler_output: "SchedulerOutput", intermediate_tensors: Optional[IntermediateTensors] = None, ) -> Union[ModelRunnerOutput, torch.Tensor]: + # Update KVConnector with the KVConnector metadata forward(). + if has_kv_transfer_group(): + get_kv_transfer_group().bind_connector_metadata( + scheduler_output.kv_connector_metadata) + self._update_states(scheduler_output) if not scheduler_output.total_num_scheduled_tokens: # Return empty ModelRunnerOuptut if there's no work to do. @@ -1045,11 +1050,6 @@ def execute_model( for k, v in self.intermediate_tensors.items() }) - # Update the connector's state with the metadata in scheduler output. - if has_kv_transfer_group(): - get_kv_transfer_group().bind_connector_metadata( - scheduler_output.kv_connector_metadata) - # Run the decoder. # Use persistent buffers for CUDA graphs. 
with set_forward_context(attn_metadata, self.vllm_config): @@ -1067,10 +1067,6 @@ def execute_model( sample_hidden_states = hidden_states[logits_indices] logits = self.model.compute_logits(sample_hidden_states, None) - # Clear connector's state - if has_kv_transfer_group(): - get_kv_transfer_group().clear_connector_metadata() - # Apply structured output bitmasks if present if scheduler_output.grammar_bitmask is not None: self.apply_grammar_bitmask(scheduler_output, logits) @@ -1224,6 +1220,10 @@ def execute_model( # in the next step. del draft_probs + # Clear KVConnector state after all KVs are generated. + if has_kv_transfer_group(): + get_kv_transfer_group().clear_connector_metadata() + return ModelRunnerOutput( req_ids=self.input_batch.req_ids, req_id_to_index=self.input_batch.req_id_to_index, From 05349a57318f5472c4c1071fbce2e645aebc822a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 9 Apr 2025 22:10:50 +0000 Subject: [PATCH 050/116] updated Signed-off-by: rshaw@neuralmagic.com --- .../kv_transfer/kv_connector/v1/shared_storage_connector.py | 3 ++- vllm/v1/core/kv_cache_manager.py | 6 +++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index 592cef243b14..b130286e3b1d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -267,7 +267,8 @@ def get_external_prefix_cache_blocks( request, need_to_allocate, computed_blocks, - skip_preallocate=True) + skip_preallocate=True, + skip_inc_ref_count=True) request.request_id = old_req_id kv_cache_manager.req_to_blocks.pop("temp-req-id-for-connector") kv_cache_manager.num_cached_block.pop("temp-req-id-for-connector") diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 352bac87c0ec..dcc62c1476ef 
100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -178,6 +178,7 @@ def allocate_slots( num_tokens: int, new_computed_blocks: Optional[list[KVCacheBlock]] = None, skip_preallocate: bool = False, + skip_inc_ref_count: bool = False, ) -> Optional[list[KVCacheBlock]]: """Add slots for a request with new tokens to append. @@ -189,6 +190,9 @@ def allocate_slots( prefix caching. skip_preallocate: Whether to skip preallocating blocks for the request. + skip_preallocate: Whether to skip incrementing the ref count. This + is useful for the KVConnector to allocate blocks which will be + filled by the remote KVs for a single model step(). Blocks layout: ----------------------------------------------------------------------- @@ -242,7 +246,7 @@ def allocate_slots( return None # Touch the computed blocks to make sure they won't be evicted. - if self.enable_caching: + if self.enable_caching and not skip_inc_ref_count: self.block_pool.touch(new_computed_blocks) else: assert not new_computed_blocks, ( From 8e1eadca4bb2c72a812c398cf7f06b8d8ea1376c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 10 Apr 2025 20:26:37 +0000 Subject: [PATCH 051/116] updated Signed-off-by: rshaw@neuralmagic.com --- .../kv_transfer/kv_connector/v1/base.py | 39 ++++---- .../v1/shared_storage_connector.py | 99 +++++++------------ vllm/v1/core/kv_cache_manager.py | 80 +++++++++++---- vllm/v1/core/sched/scheduler.py | 19 ++++ 4 files changed, 133 insertions(+), 104 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 0a56fb94f96e..20aaae119c45 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -19,8 +19,6 @@ from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig from vllm.forward_context import ForwardContext - from vllm.v1.core.kv_cache_manager 
import KVCacheManager - from vllm.v1.core.kv_cache_utils import KVCacheBlock from vllm.v1.request import Request @@ -148,32 +146,33 @@ def wait_for_save(self): # Scheduler-side methods # ============================== @abstractmethod - def get_external_prefix_cache_blocks( + def get_num_matched_tokens( self, request: "Request", - computed_blocks: list["KVCacheBlock"], num_computed_tokens: int, - kv_cache_manager: "KVCacheManager", - ) -> list["KVCacheBlock"]: + ) -> int: """ - Get the external prefix cache blocks from the connector. - - This function may change the state of the connector, which will - be used by `build_connector_meta` later. - - This function will also allocate/free the blocks dynamically when - there is remote cache hit. - + Check for external KV cache hit. + Args: request (Request): the request object. - computed_blocks (list[KVCacheBlock]): the 'local' computed blocks. - num_computed_tokens (int): the number of 'local' computed tokens. - kv_cache_manager (KVCacheManager): the KV cache manager to - allocate/free the blocks if needed. + num_computed_tokens (int): the number of locally + computed tokens for this request Returns: - The updated list of the computed blocks (appended with the remote - cached blocks) + the number of tokens that can be loaded from the + external KV cache beyond what is already computed. + """ + pass + + @abstractmethod + def update_state_after_alloc(self, request: Request, + num_allocated_blocks: int): + """ + Update KVConnector state after temporary buffer alloc. + + For SharedStorageConnector, update _request_needs_load + if the CacheManager this allocated blocks for us. 
""" pass diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index b130286e3b1d..ef6252b522e1 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -16,8 +16,6 @@ if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata from vllm.forward_context import ForwardContext - from vllm.v1.core.kv_cache_manager import KVCacheManager - from vllm.v1.core.kv_cache_utils import KVCacheBlock from vllm.v1.request import Request logger = init_logger(__name__) @@ -152,7 +150,7 @@ def inject_kv_into_layer( kv_cache_layer = attn_layer.kv_cache[\ forward_context.virtual_engine] - filename = self.generate_filename_debug( + filename = self._generate_filename_debug( layer_name, request.token_ids) kv_cache = safetensors.torch.load_file( filename)["kv_cache"].cuda() @@ -201,7 +199,7 @@ def extract_kv_from_layer( assert isinstance(connector_metadata, SharedStorageConnectorMetadata) for request in connector_metadata.requests: if request.is_store: - filename = self.generate_filename_debug( + filename = self._generate_filename_debug( layer_name, request.token_ids) kv_cache = extract_kv_from_layer(kv_layer, request.slot_mapping) @@ -211,29 +209,18 @@ def extract_kv_from_layer( def wait_for_save(self): return - def get_external_prefix_cache_blocks( + def get_num_matched_tokens( self, request: "Request", - computed_blocks: list["KVCacheBlock"], num_computed_tokens: int, - kv_cache_manager: "KVCacheManager", - ) -> list["KVCacheBlock"]: - """Get the external prefix cache blocks from the connector. - - This function may change the state of the connector, which will be - used by `build_connector_meta` later. - - Args: - request (Request): the request object. - computed_blocks (list[KVCacheBlock]): the 'local' computed blocks. 
- num_computed_tokens (int): the number of 'local' computed tokens. - kv_cache_manager (KVCacheManager): the KV cache manager to - allocate/free the blocks if needed. - - Returns: - The updated list of the computed blocks (appended with the remote - cached blocks) + ) -> int: """ + Check for external KV cache hit. + + Returns the number of tokens that can be loaded from the + external KV cache beyond what is already computed. + """ + # NOTE: in this debug implementation, we assume that the prompt is # cached_prompt + newly_generated_single_token # Therefore, we use prompt_token_ids[:-1] to determine the folder name @@ -241,48 +228,28 @@ def get_external_prefix_cache_blocks( # NOTE: in current v1 scheduler, the num_computed_tokens is aligned # with the block granularity. And it expects the returned blocks and # num_computed_tokens to also be aligned with the block granularity. - if not self.found_match_for_request(request): - return computed_blocks + if not self._found_match_for_request(request): + return 0 + + logger.info("External Cache Hit!") # Now, first num_tokens_to_check tokens are hit, we need to prepare # the metadata for the worker connector to correctly load the KV - - logger.info("Hit the cache! Allocate new blocks!") num_tokens_to_check = align_to_block_size( len(request.prompt_token_ids) - 1, self._block_size) - need_to_allocate = num_tokens_to_check - num_computed_tokens - if need_to_allocate > 0: - # HACK: We don't want the scheduler see the blocks are allocated - # and associated with the current request. Instead, we want the - # scheduler find that the blocks are already allocated and they - # are associated with some other requests (i.e., the case of - # prefix caching. - - # HACK: KVCacheManager.allocate_slots will pre-allocate a few - # blocks, which will cause problems in the later allocations. - # We should make sure the pre allocation does not happen. 
- old_req_id = request.request_id - request.request_id = "temp-req-id-for-connector" - allocated_blocks = kv_cache_manager.allocate_slots( - request, - need_to_allocate, - computed_blocks, - skip_preallocate=True, - skip_inc_ref_count=True) - request.request_id = old_req_id - kv_cache_manager.req_to_blocks.pop("temp-req-id-for-connector") - kv_cache_manager.num_cached_block.pop("temp-req-id-for-connector") - - num_expected_blocks = need_to_allocate // self._block_size - if len(allocated_blocks) > num_expected_blocks: - logger.error("Detected pre-allocated blocks in the connector!" - "This should not happen!") - allocated_blocks = allocated_blocks[:num_expected_blocks] + return num_tokens_to_check - num_computed_tokens + + def update_state_after_alloc(self, request: Request, + num_allocated_blocks: int): + """ + Update KVConnector state after temporary buffer alloc. + + For SharedStorageConnector, update _request_needs_load + if the CacheManager this allocated blocks for us. + """ + if num_allocated_blocks > 0: self._requests_need_load.append(request.request_id) - return computed_blocks + allocated_blocks - else: - return computed_blocks def build_connector_meta( self, scheduler_output: SchedulerOutput) -> KVConnectorMetadata: @@ -302,7 +269,7 @@ def build_connector_meta( # NOTE: here, we set the store and load being exclusive, # but in LMCache use case, a single request can have both # store and load status - if not self.found_match_for_request(request): + if not self._found_match_for_request(request): meta.add_request(request, self._block_size, is_store=True) self._requests_need_load.clear() @@ -312,7 +279,7 @@ def build_connector_meta( # Helper functions # ============================== - def found_match_for_request( + def _found_match_for_request( self, request: "Request", ) -> bool: @@ -320,12 +287,12 @@ def found_match_for_request( """ num_tokens_to_check = align_to_block_size( len(request.prompt_token_ids) - 1, self._block_size) - foldername = 
self.generate_foldername_debug(torch.tensor( + foldername = self._generate_foldername_debug(torch.tensor( request.prompt_token_ids)[:num_tokens_to_check], - create_folder=False) + create_folder=False) return os.path.exists(foldername) - def generate_foldername_debug( + def _generate_foldername_debug( self, input_ids: torch.Tensor, create_folder=False, @@ -340,7 +307,7 @@ def generate_foldername_debug( os.makedirs(foldername, exist_ok=True) return foldername - def generate_filename_debug( + def _generate_filename_debug( self, layer_name: str, input_ids: torch.Tensor, @@ -348,8 +315,8 @@ def generate_filename_debug( """Generate a file name based on the layer name and the hash of the bytes of the input ids. """ - foldername = self.generate_foldername_debug(input_ids, - create_folder=True) + foldername = self._generate_foldername_debug(input_ids, + create_folder=True) return os.path.join(foldername, f"{layer_name}.safetensors") diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index dcc62c1476ef..b84a414328e0 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -84,7 +84,11 @@ def __init__( # data for reempted ones. self.num_cached_block: dict[str, int] = {} self.prefix_cache_stats = PrefixCacheStats() - self.connector = connector + + # KVConnector: buffer reqs for KVConnector. We write + # the external KVs to the "buffer" req and leverage + # prefix caching to share with the "real" req + self.kv_connector_buffer_reqs: list[Request] = [] @property def usage(self) -> float: @@ -159,13 +163,6 @@ def get_computed_blocks( # we shouldn't modify it directly. block_hashes.append(last_block_hash) - # Check the remote cache for the external prefix cache blocks. 
- if self.connector is not None: - computed_blocks =\ - self.connector.get_external_prefix_cache_blocks( - request, computed_blocks, - len(computed_blocks) * self.block_size, self) - # NOTE(woosuk): Since incomplete blocks are not eligible for # sharing, `num_computed_tokens` is always a multiple of # `block_size`. @@ -178,7 +175,6 @@ def allocate_slots( num_tokens: int, new_computed_blocks: Optional[list[KVCacheBlock]] = None, skip_preallocate: bool = False, - skip_inc_ref_count: bool = False, ) -> Optional[list[KVCacheBlock]]: """Add slots for a request with new tokens to append. @@ -188,11 +184,7 @@ def allocate_slots( not include the tokens that have already been computed. new_computed_blocks: A list of new computed blocks just hitting the prefix caching. - skip_preallocate: Whether to skip preallocating blocks for - the request. - skip_preallocate: Whether to skip incrementing the ref count. This - is useful for the KVConnector to allocate blocks which will be - filled by the remote KVs for a single model step(). + skip_preallocate: Whether to skip preallocating blocks. Blocks layout: ----------------------------------------------------------------------- @@ -246,12 +238,11 @@ def allocate_slots( return None # Touch the computed blocks to make sure they won't be evicted. - if self.enable_caching and not skip_inc_ref_count: + if self.enable_caching: self.block_pool.touch(new_computed_blocks) else: - assert not new_computed_blocks, ( - "Computed blocks should be empty when " - "prefix caching is disabled") + assert not new_computed_blocks, "Computed blocks should "\ + "be empty when prefix caching is disabled" # Append the new computed blocks to the request blocks until now to # avoid the case where the new blocks cannot be allocated. @@ -396,3 +387,56 @@ def free_block_hashes(self, request: Request) -> None: is finished, not when it is preempted. 
""" self.req_to_block_hashes.pop(request.request_id, None) + + def alloc_and_get_external_blocks( + self, + request: "Request", + computed_blocks: list["KVCacheBlock"], + num_computed_tokens: int, + kv_connector: KVConnectorBase_V1, + ) -> tuple[list["KVCacheBlock"], int]: + + # Check for cache hit. + need_to_allocate = kv_connector.get_num_matched_tokens( + request, num_computed_tokens) + num_allocated_blocks = 0 + + # Cache hit: allocate buffer. + if need_to_allocate > 0: + # HACK: We don't want the scheduler see the blocks are allocated + # and associated with the current request. Instead, we want the + # scheduler find that the blocks are already allocated and they + # are associated with some other requests (i.e., the case of + # prefix caching. + + old_req_id = request.request_id + request.request_id = f"{old_req_id}-buf-for-kv-connector" + allocated_blocks = self.allocate_slots( + request, + need_to_allocate, + computed_blocks, + skip_preallocate=True, + ) + request.request_id = old_req_id + + num_expected_blocks = need_to_allocate // self.block_size + num_allocated_blocks = len( + allocated_blocks) if allocated_blocks else 0 + assert num_allocated_blocks <= num_expected_blocks, ""\ + "Detected pre-allocated blocks in the connector! "\ + "This should not happen!" + + # Update internal state. In case of: + # * SharedStorageConnector: add req_id to _requests_need_load + # so that we know to load this requests KVs later. 
+ kv_connector.update_state_after_alloc(request, num_allocated_blocks) + num_computed_blocks = len(computed_blocks) * self.block_size + return computed_blocks, num_computed_blocks + + def free_buffer_requests(self) -> None: + """Free buffer requests for the KV connector.""" + + for buffer_req in self.kv_connector_buffer_reqs: + self.free(buffer_req) + self.free_block_hashes(buffer_req) + self.kv_connector_buffer_reqs.clear() diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 9d8b0945c483..6bd44098c560 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -308,6 +308,20 @@ def schedule(self) -> SchedulerOutput: # Get already-cached tokens. computed_blocks, num_computed_tokens = \ self.kv_cache_manager.get_computed_blocks(request) + + # KVConnector: get blocks externally-cached tokens. + # Internally, this allocates a "buffer" req with a prompt + # corresponding to externally cached tokens. In alloc_slots + # below, we will compute a cache hit and thus skip the + # computation for externally cached tokens. + # NOTE: since this allocates temporary buffer requests, + # we must call kv_cache_manager.free_buffer_requests() below. + if self.connector is not None: + computed_blocks, num_computed_tokens = \ + self.kv_cache_manager.alloc_and_get_external_blocks( + request, computed_blocks, + num_computed_tokens, self.connector) + # Number of tokens to be scheduled. # We use `request.num_tokens` instead of # `request.num_prompt_tokens` to consider the resumed requests, @@ -467,6 +481,11 @@ def schedule(self) -> SchedulerOutput: for req_id, num_scheduled_token in num_scheduled_tokens.items(): self.requests[req_id].num_computed_tokens += num_scheduled_token + # KVConnector: once we have allocated the buffer blocks to the + # "real" requests (via prefix caching), free the tmp buffer reqs. 
+ if self.connector is not None: + self.kv_cache_manager.free_buffer_requests() + self.finished_req_ids = set() return scheduler_output From 54e1491e0afa0e1482f48f0a821a22a53c0453cf Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 10 Apr 2025 20:31:35 +0000 Subject: [PATCH 052/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/distributed/kv_transfer/kv_connector/v1/base.py | 2 +- .../kv_transfer/kv_connector/v1/shared_storage_connector.py | 2 +- vllm/v1/core/kv_cache_manager.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 20aaae119c45..df21ca137d9a 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -166,7 +166,7 @@ def get_num_matched_tokens( pass @abstractmethod - def update_state_after_alloc(self, request: Request, + def update_state_after_alloc(self, request: "Request", num_allocated_blocks: int): """ Update KVConnector state after temporary buffer alloc. diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index ef6252b522e1..111f350b6e1c 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -240,7 +240,7 @@ def get_num_matched_tokens( return num_tokens_to_check - num_computed_tokens - def update_state_after_alloc(self, request: Request, + def update_state_after_alloc(self, request: "Request", num_allocated_blocks: int): """ Update KVConnector state after temporary buffer alloc. 
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index b84a414328e0..19b073fd644c 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -393,7 +393,7 @@ def alloc_and_get_external_blocks( request: "Request", computed_blocks: list["KVCacheBlock"], num_computed_tokens: int, - kv_connector: KVConnectorBase_V1, + kv_connector: "KVConnectorBase_V1", ) -> tuple[list["KVCacheBlock"], int]: # Check for cache hit. From 9c4159cb47bc79932585c25840b19ba08fc1b782 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 10 Apr 2025 21:41:20 +0000 Subject: [PATCH 053/116] updated Signed-off-by: rshaw@neuralmagic.com --- .../offline_inference/disaggrated-prefill-v1/decode_example.py | 2 +- .../offline_inference/disaggrated-prefill-v1/prefill_example.py | 2 +- vllm/v1/core/kv_cache_manager.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/offline_inference/disaggrated-prefill-v1/decode_example.py b/examples/offline_inference/disaggrated-prefill-v1/decode_example.py index 760a70553101..615d02d5f3cf 100644 --- a/examples/offline_inference/disaggrated-prefill-v1/decode_example.py +++ b/examples/offline_inference/disaggrated-prefill-v1/decode_example.py @@ -18,7 +18,7 @@ llm = LLM( model="meta-llama/Llama-3.2-1B-Instruct", - enforce_eager=False, + enforce_eager=True, gpu_memory_utilization=0.8, kv_transfer_config=KVTransferConfig.from_cli( '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both",' diff --git a/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py b/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py index 1e1780d6dc7a..f7cbf6557d54 100644 --- a/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py +++ b/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py @@ -15,7 +15,7 @@ sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", - 
enforce_eager=False, + enforce_eager=True, gpu_memory_utilization=0.8, kv_transfer_config=KVTransferConfig.from_cli( '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both", ' diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 19b073fd644c..a960a2eda37d 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -425,6 +425,7 @@ def alloc_and_get_external_blocks( assert num_allocated_blocks <= num_expected_blocks, ""\ "Detected pre-allocated blocks in the connector! "\ "This should not happen!" + computed_blocks = computed_blocks + (allocated_blocks or []) # Update internal state. In case of: # * SharedStorageConnector: add req_id to _requests_need_load From 1d8415d095461651f2efc2debf7c1c762e098c45 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 10 Apr 2025 21:59:54 +0000 Subject: [PATCH 054/116] rename Signed-off-by: rshaw@neuralmagic.com --- .../disaggrated-prefill-v1/decode_example.py | 34 --------------- .../disaggrated-prefill-v1/prefill_example.py | 43 ------------------- .../disaggrated-prefill-v1/run.sh | 5 --- 3 files changed, 82 deletions(-) delete mode 100644 examples/offline_inference/disaggrated-prefill-v1/decode_example.py delete mode 100644 examples/offline_inference/disaggrated-prefill-v1/prefill_example.py delete mode 100644 examples/offline_inference/disaggrated-prefill-v1/run.sh diff --git a/examples/offline_inference/disaggrated-prefill-v1/decode_example.py b/examples/offline_inference/disaggrated-prefill-v1/decode_example.py deleted file mode 100644 index 615d02d5f3cf..000000000000 --- a/examples/offline_inference/disaggrated-prefill-v1/decode_example.py +++ /dev/null @@ -1,34 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -from vllm import LLM, SamplingParams -from vllm.config import KVTransferConfig - -# Read prompts from output.txt -prompts = [] -try: - with open("output.txt") as f: - for line in f: - prompts.append(line.strip()) - print(f"Loaded 
{len(prompts)} prompts from output.txt") -except FileNotFoundError: - print("Error: output.txt file not found") - exit(-1) - -sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) - -llm = LLM( - model="meta-llama/Llama-3.2-1B-Instruct", - enforce_eager=True, - gpu_memory_utilization=0.8, - kv_transfer_config=KVTransferConfig.from_cli( - '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both",' - '"kv_connector_extra_config": {"shared_storage_path": "local_storage"}}' - )) #, max_model_len=2048, max_num_batched_tokens=2048) - -# 1ST generation (prefill instance) -outputs = llm.generate(prompts, sampling_params) - -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py b/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py deleted file mode 100644 index f7cbf6557d54..000000000000 --- a/examples/offline_inference/disaggrated-prefill-v1/prefill_example.py +++ /dev/null @@ -1,43 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -from vllm import LLM, SamplingParams -from vllm.config import KVTransferConfig - -context = "Hi " * 1000 -context2 = "Hey " * 500 -prompts = [ - context + "Hello, my name is", - context + "The capital of France is", - context2 + "Your name is", - context2 + "The capital of China is", -] - -sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) - -llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", - enforce_eager=True, - gpu_memory_utilization=0.8, - kv_transfer_config=KVTransferConfig.from_cli( - '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both", ' - '"kv_connector_extra_config": ' - '{"shared_storage_path": "local_storage"}}') - ) #, max_model_len=2048, max_num_batched_tokens=2048) - -# 1ST generation (prefill instance) -outputs = llm.generate( - prompts, - sampling_params, -) - 
-new_prompts = [] -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - new_prompts.append(prompt + generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -# Write new_prompts to output.txt -with open("output.txt", "w") as f: - for prompt in new_prompts: - f.write(prompt + "\n") -print(f"Saved {len(new_prompts)} prompts to output.txt") diff --git a/examples/offline_inference/disaggrated-prefill-v1/run.sh b/examples/offline_inference/disaggrated-prefill-v1/run.sh deleted file mode 100644 index 07a57eb09403..000000000000 --- a/examples/offline_inference/disaggrated-prefill-v1/run.sh +++ /dev/null @@ -1,5 +0,0 @@ -rm -rf local_storage/ -rm output.txt - -VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=5 python3 prefill_example.py -VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=5 python3 decode_example.py From 406d6bfa59a2ff448791ad34974682a5587388a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Delacourt?= <54138269+Flechman@users.noreply.github.com> Date: Fri, 11 Apr 2025 00:47:40 +0200 Subject: [PATCH 055/116] Add MLA support for v1 disagg connector (#6) Signed-off-by: remi --- .../v1/shared_storage_connector.py | 39 +++++++++++++------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index 111f350b6e1c..c4542768a7a2 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -11,6 +11,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) from vllm.logger import init_logger +from vllm.v1.attention.backends.mla.common import MLACommonMetadata from vllm.v1.core.sched.output import SchedulerOutput if TYPE_CHECKING: @@ -98,6 +99,7 @@ def 
start_load_kv(self, forward_context: "ForwardContext", The number of elements in kv_caches and layer_names should be the same. """ + attn_metadata = forward_context.attn_metadata def inject_kv_into_layer( dst_kv_cache_layer: torch.Tensor, @@ -108,19 +110,29 @@ def inject_kv_into_layer( Args: dst_kv_cache_layer (torch.Tensor): the destination KV cache - layer. In shape [2, num_pages, page_size, xxx]. + layer. In shape [2, num_pages, page_size, xxx] if not + using MLA, [num_pages, page_size, xxx] otherwise. src_kv_cache (torch.Tensor): the source KV cache. In shape - [2, num_tokens, xxx]. + [2, num_tokens, xxx] if not using MLA, [num_tokens, xxx] + otherwise. slot_mapping (torch.Tensor): the slot mapping. In shape [num_tokens]. """ dst_kv_cache_layer_shape = dst_kv_cache_layer.shape - num_pages = dst_kv_cache_layer_shape[1] - page_size = dst_kv_cache_layer_shape[2] - dst_kv_cache_layer = dst_kv_cache_layer.reshape( - 2, num_pages * page_size, -1) - dst_kv_cache_layer[:, slot_mapping, ...] = src_kv_cache - dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape) + if isinstance(attn_metadata, MLACommonMetadata): + num_pages = dst_kv_cache_layer_shape[0] + page_size = dst_kv_cache_layer_shape[1] + dst_kv_cache_layer = dst_kv_cache_layer.reshape( + num_pages * page_size, -1) + dst_kv_cache_layer[slot_mapping, ...] = src_kv_cache + dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape) + else: + num_pages = dst_kv_cache_layer_shape[1] + page_size = dst_kv_cache_layer_shape[2] + dst_kv_cache_layer = dst_kv_cache_layer.reshape( + 2, num_pages * page_size, -1) + dst_kv_cache_layer[:, slot_mapping, ...] 
= src_kv_cache + dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape) # Get the metadata metadata: KVConnectorMetadata = \ @@ -170,7 +182,7 @@ def wait_for_layer_load(self, layer_name: str) -> None: def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, attn_metadata: "AttentionMetadata", **kwargs) -> None: - """Start saving the a layer of KV cache from vLLM's paged buffer + """Start saving the KV cache of the layer from vLLM's paged buffer to the connector. Args: @@ -187,10 +199,13 @@ def extract_kv_from_layer( ) -> torch.Tensor: """Extract the KV cache from the layer. - Assume the shape of the layer is (2, num_pages, page_size, xxx). + Assume the shape of the layer is (2, num_pages, page_size, xxx) + if MLA is not used, and (num_pages, page_size, xxx) otherwise. """ - # TODO: make this compatible with MLA. - assert layer.shape[0] == 2 + if isinstance(attn_metadata, MLACommonMetadata): + num_pages, page_size = layer.shape[0], layer.shape[1] + return layer.reshape(num_pages * page_size, -1)[slot_mapping, + ...] num_pages, page_size = layer.shape[1], layer.shape[2] return layer.reshape(2, num_pages * page_size, -1)[:, slot_mapping, ...] From 3a248977553b78e2e6731474245a421c35acfd4b Mon Sep 17 00:00:00 2001 From: ApostaC Date: Thu, 10 Apr 2025 18:31:47 -0700 Subject: [PATCH 056/116] [Fix] memory leak problem by proper clean up Signed-off-by: ApostaC --- vllm/v1/core/kv_cache_manager.py | 59 ++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 18 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index a960a2eda37d..e828f37aafbc 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -88,7 +88,7 @@ def __init__( # KVConnector: buffer reqs for KVConnector. 
We write # the external KVs to the "buffer" req and leverage # prefix caching to share with the "real" req - self.kv_connector_buffer_reqs: list[Request] = [] + self.kv_connector_buffer_req_ids: list[str] = [] @property def usage(self) -> float: @@ -299,16 +299,13 @@ def allocate_slots( request.request_id] = num_full_blocks_after_append return new_blocks - def free(self, request: Request) -> None: + def _free_by_request_id(self, request_id: str) -> None: """Free the blocks allocated for the request. - When caching is enabled, we free the blocks in reverse order so that - the tail blocks are evicted first. Args: - request: The request to free the blocks. + request_id: The request ID to free the blocks. """ - # Default to [] in case a request is freed (aborted) before alloc. - blocks = self.req_to_blocks.pop(request.request_id, []) + blocks = self.req_to_blocks.pop(request_id, []) ordered_blocks: Iterable[KVCacheBlock] = blocks if self.enable_caching: # Free blocks in reverse order so that the tail blocks are @@ -316,7 +313,17 @@ def free(self, request: Request) -> None: ordered_blocks = reversed(blocks) self.block_pool.free_blocks(ordered_blocks) - self.num_cached_block.pop(request.request_id, None) + self.num_cached_block.pop(request_id, None) + + def free(self, request: Request) -> None: + """Free the blocks allocated for the request. + When caching is enabled, we free the blocks in reverse order so that + the tail blocks are evicted first. + + Args: + request: The request to free the blocks. + """ + self._free_by_request_id(request.request_id) def reset_prefix_cache(self) -> bool: """Reset prefix cache. This function may be used in RLHF @@ -380,13 +387,21 @@ def get_num_common_prefix_blocks( break return num_common_blocks + def _free_block_hashes_by_request_id(self, request_id: str) -> None: + """Free the block hashes allocated for the request. + + Args: + request_id: The request ID to free the block hashes. 
+ """ + self.req_to_block_hashes.pop(request_id, None) + def free_block_hashes(self, request: Request) -> None: """Discard the block hashes for the request. NOTE: Unlike `free`, this method should be called only when the request is finished, not when it is preempted. """ - self.req_to_block_hashes.pop(request.request_id, None) + self._free_block_hashes_by_request_id(request.request_id) def alloc_and_get_external_blocks( self, @@ -417,15 +432,23 @@ def alloc_and_get_external_blocks( computed_blocks, skip_preallocate=True, ) + self.kv_connector_buffer_req_ids.append(request.request_id) request.request_id = old_req_id + if allocated_blocks is None: + allocated_blocks = [] + + # Avoid over-allocating num_expected_blocks = need_to_allocate // self.block_size - num_allocated_blocks = len( - allocated_blocks) if allocated_blocks else 0 - assert num_allocated_blocks <= num_expected_blocks, ""\ - "Detected pre-allocated blocks in the connector! "\ - "This should not happen!" + allocated_blocks = allocated_blocks[:num_expected_blocks] + + # Back-off one block if the external KV is for all tokens + if (len(allocated_blocks) + len(computed_blocks)) \ + * self.block_size >= len(request.prompt_token_ids): + allocated_blocks = allocated_blocks[:-1] + computed_blocks = computed_blocks + (allocated_blocks or []) + num_allocated_blocks = len(allocated_blocks) # Update internal state. 
In case of: # * SharedStorageConnector: add req_id to _requests_need_load @@ -437,7 +460,7 @@ def alloc_and_get_external_blocks( def free_buffer_requests(self) -> None: """Free buffer requests for the KV connector.""" - for buffer_req in self.kv_connector_buffer_reqs: - self.free(buffer_req) - self.free_block_hashes(buffer_req) - self.kv_connector_buffer_reqs.clear() + for request_id in self.kv_connector_buffer_req_ids: + self._free_by_request_id(request_id) + self._free_block_hashes_by_request_id(request_id) + self.kv_connector_buffer_req_ids.clear() From c6c436876631f71d8ac9138f75a2d98abe29b73c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 11 Apr 2025 15:32:26 +0000 Subject: [PATCH 057/116] fixed test failures Signed-off-by: rshaw@neuralmagic.com --- tests/v1/core/test_scheduler.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 21a1cbf540ae..5227308ed86e 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -79,13 +79,10 @@ def create_scheduler( ) cache_config.num_gpu_blocks = 10000 return Scheduler( - scheduler_config, - model_config, - cache_config, - lora_config=None, + vllm_config, kv_cache_config=kv_cache_config, - log_stats=True, structured_output_manager=StructuredOutputManager(vllm_config), + log_stats=True, ) From 4afa50e82c6f09f1c4476e2d5bb401f53033813c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 13 Apr 2025 20:28:43 +0000 Subject: [PATCH 058/116] stash Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/core.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 3d685e3fe58a..f1e8b1e0f967 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -95,7 +95,6 @@ def __init__( self.scheduler: SchedulerInterface = Scheduler( vllm_config=vllm_config, kv_cache_config=kv_cache_config, - 
speculative_config=vllm_config.speculative_config, structured_output_manager=self.structured_output_manager, include_finished_set=vllm_config.parallel_config.data_parallel_size > 1, From 09be260912334334e8249aa794c85d8f5c0ace5f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 14 Apr 2025 01:10:02 +0000 Subject: [PATCH 059/116] clean up typing Signed-off-by: rshaw@neuralmagic.com --- vllm/distributed/kv_transfer/__init__.py | 9 ++++++++- vllm/distributed/kv_transfer/kv_connector/factory.py | 8 ++++---- vllm/distributed/kv_transfer/kv_transfer_state.py | 11 +++++------ 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/vllm/distributed/kv_transfer/__init__.py b/vllm/distributed/kv_transfer/__init__.py index 4aa6ab4823c0..f62b403eaf4a 100644 --- a/vllm/distributed/kv_transfer/__init__.py +++ b/vllm/distributed/kv_transfer/__init__.py @@ -1,9 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 +from typing import Union + +from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase +from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1 from vllm.distributed.kv_transfer.kv_transfer_state import ( ensure_kv_transfer_initialized, get_kv_transfer_group, has_kv_transfer_group, is_v1_kv_transfer_group) +KVConnectorBaseType = Union[KVConnectorBase, KVConnectorBase_V1] + __all__ = [ "get_kv_transfer_group", "has_kv_transfer_group", - "is_v1_kv_transfer_group", "ensure_kv_transfer_initialized" + "is_v1_kv_transfer_group", "ensure_kv_transfer_initialized", + "KVConnectorBaseType" ] diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index 07c52e4cabd2..586482f1d089 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -1,9 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 import importlib -from typing import TYPE_CHECKING, Callable, Dict, Type, Union +from typing import 
TYPE_CHECKING, Callable, Dict, Type import vllm.envs as envs +from vllm.distributed.kv_transfer import KVConnectorBaseType from vllm.distributed.kv_transfer.kv_connector.v1 import (KVConnectorBase_V1, KVConnectorRole) from vllm.logger import init_logger @@ -17,8 +18,7 @@ class KVConnectorFactory: - _registry: Dict[str, Callable[[], Type[Union[KVConnectorBase, - KVConnectorBase_V1]]]] = {} + _registry: Dict[str, Callable[[], Type[KVConnectorBaseType]]] = {} @classmethod def register_connector(cls, name: str, module_path: str, @@ -27,7 +27,7 @@ def register_connector(cls, name: str, module_path: str, if name in cls._registry: raise ValueError(f"Connector '{name}' is already registered.") - def loader() -> Type[Union[KVConnectorBase, KVConnectorBase_V1]]: + def loader() -> Type[KVConnectorBaseType]: module = importlib.import_module(module_path) return getattr(module, class_name) diff --git a/vllm/distributed/kv_transfer/kv_transfer_state.py b/vllm/distributed/kv_transfer/kv_transfer_state.py index e6645829d26e..16f71b95c377 100644 --- a/vllm/distributed/kv_transfer/kv_transfer_state.py +++ b/vllm/distributed/kv_transfer/kv_transfer_state.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import TYPE_CHECKING, Union +from typing import TYPE_CHECKING, Optional from vllm import envs -from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase +from vllm.distributed.kv_transfer import KVConnectorBaseType from vllm.distributed.kv_transfer.kv_connector.factory import ( KVConnectorFactory) from vllm.distributed.kv_transfer.kv_connector.v1 import (KVConnectorBase_V1, @@ -12,10 +12,10 @@ if TYPE_CHECKING: from vllm.config import VllmConfig -_KV_CONNECTOR_AGENT: Union[KVConnectorBase, KVConnectorBase_V1, None] = None +_KV_CONNECTOR_AGENT: Optional[KVConnectorBaseType] = None -def get_kv_transfer_group() -> Union[KVConnectorBase, KVConnectorBase_V1]: +def get_kv_transfer_group() -> KVConnectorBaseType: assert _KV_CONNECTOR_AGENT is not None, ( 
"disaggregated KV cache transfer parallel group is not initialized") return _KV_CONNECTOR_AGENT @@ -26,8 +26,7 @@ def has_kv_transfer_group() -> bool: def is_v1_kv_transfer_group( - connector: Union[KVConnectorBase_V1, KVConnectorBase, - None] = None) -> bool: + connector: Optional[KVConnectorBaseType] = None) -> bool: """Check if the KV connector is the v1 connector. If the argument is None, it will check the global KV connector From 3f7844d894c9c3072f0a8940665097b6fabe1225 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 14 Apr 2025 01:17:42 +0000 Subject: [PATCH 060/116] cleanup nits Signed-off-by: rshaw@neuralmagic.com --- .../kv_transfer/kv_connector/v1/base.py | 5 +++-- .../v1/shared_storage_connector.py | 20 +++++++++++-------- .../kv_transfer/kv_connector_agent.py | 7 +++---- vllm/v1/core/kv_cache_manager.py | 4 ++-- 4 files changed, 20 insertions(+), 16 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index df21ca137d9a..854eb01a5c3c 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -146,13 +146,14 @@ def wait_for_save(self): # Scheduler-side methods # ============================== @abstractmethod - def get_num_matched_tokens( + def get_num_new_matched_tokens( self, request: "Request", num_computed_tokens: int, ) -> int: """ - Check for external KV cache hit. + Get number of new tokens that can be loaded from the + external KV cache beyond the num_computed_tokens. Args: request (Request): the request object. 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index c4542768a7a2..aff6dbe9314f 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -31,13 +31,10 @@ class ReqMeta: # Is store or load is_store: bool - ## Blocks allocated by the scheduler (no-longer needed) - #block_ids: torch.Tensor - @staticmethod def from_request(request: "Request", block_size: int, is_store: bool) -> "ReqMeta": - valid_num_tokens = align_to_block_size(len(request.prompt_token_ids), + valid_num_tokens = align_to_block_size(request.num_prompt_tokens, block_size) token_ids = torch.tensor(request.prompt_token_ids)[:valid_num_tokens] block_ids = torch.tensor(request.block_ids) @@ -224,16 +221,23 @@ def extract_kv_from_layer( def wait_for_save(self): return - def get_num_matched_tokens( + def get_num_new_matched_tokens( self, request: "Request", num_computed_tokens: int, ) -> int: """ - Check for external KV cache hit. + Get number of new tokens that can be loaded from the + external KV cache beyond the num_computed_tokens. - Returns the number of tokens that can be loaded from the - external KV cache beyond what is already computed. + Args: + request (Request): the request object. + num_computed_tokens (int): the number of locally + computed tokens for this request + + Returns: + the number of tokens that can be loaded from the + external KV cache beyond what is already computed. 
""" # NOTE: in this debug implementation, we assume that the prompt is diff --git a/vllm/distributed/kv_transfer/kv_connector_agent.py b/vllm/distributed/kv_transfer/kv_connector_agent.py index f12136f0e591..9a300fc7cb9d 100644 --- a/vllm/distributed/kv_transfer/kv_connector_agent.py +++ b/vllm/distributed/kv_transfer/kv_connector_agent.py @@ -13,7 +13,6 @@ import torch -from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase from vllm.distributed.kv_transfer.kv_connector.factory import ( KVConnectorFactory) from vllm.logger import init_logger @@ -58,13 +57,13 @@ def send_kv_caches_and_hidden_states( hidden_or_intermediate_states: Union[torch.Tensor, IntermediateTensors], ) -> None: - assert isinstance(self.connector, KVConnectorBase) + self.connector.send_kv_caches_and_hidden_states( model_executable, model_input, kv_caches, hidden_or_intermediate_states) def close(self) -> None: - assert isinstance(self.connector, KVConnectorBase) + self.connector.close() def recv_kv_caches_and_hidden_states( @@ -73,6 +72,6 @@ def recv_kv_caches_and_hidden_states( kv_caches: List[torch.Tensor] ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool, "ModelInputForGPUWithSamplingMetadata"]: - assert isinstance(self.connector, KVConnectorBase) + return self.connector.recv_kv_caches_and_hidden_states( model_executable, model_input, kv_caches) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index ec6a7dfddc3c..0f169f085f2e 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -417,8 +417,8 @@ def alloc_and_get_external_blocks( kv_connector: "KVConnectorBase_V1", ) -> tuple[list["KVCacheBlock"], int]: - # Check for cache hit. - need_to_allocate = kv_connector.get_num_matched_tokens( + # Check for cache hit beyond the num_computed_tokens. 
+ need_to_allocate = kv_connector.get_num_new_matched_tokens( request, num_computed_tokens) num_allocated_blocks = 0 From d44f6994705db734852ace801e62fb6fe5a4e41d Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 14 Apr 2025 01:26:02 +0000 Subject: [PATCH 061/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/kv_cache_manager.py | 13 +++++++------ vllm/v1/core/sched/scheduler.py | 4 ++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 0f169f085f2e..f1bcae4fde3a 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -409,7 +409,7 @@ def free_block_hashes(self, request: Request) -> None: """ self._free_block_hashes_by_request_id(request.request_id) - def alloc_and_get_external_blocks( + def alloc_and_append_external_blocks( self, request: "Request", computed_blocks: list["KVCacheBlock"], @@ -449,19 +449,20 @@ def alloc_and_get_external_blocks( allocated_blocks = allocated_blocks[:num_expected_blocks] # Back-off one block if the external KV is for all tokens - if (len(allocated_blocks) + len(computed_blocks)) \ - * self.block_size >= len(request.prompt_token_ids): + if ((len(allocated_blocks) + len(computed_blocks)) * + self.block_size >= request.num_prompt_tokens): allocated_blocks = allocated_blocks[:-1] - computed_blocks = computed_blocks + (allocated_blocks or []) + if allocated_blocks: + computed_blocks = computed_blocks + allocated_blocks num_allocated_blocks = len(allocated_blocks) # Update internal state. In case of: # * SharedStorageConnector: add req_id to _requests_need_load # so that we know to load this requests KVs later. 
kv_connector.update_state_after_alloc(request, num_allocated_blocks) - num_computed_blocks = len(computed_blocks) * self.block_size - return computed_blocks, num_computed_blocks + num_computed_tokens = len(computed_blocks) * self.block_size + return computed_blocks, num_computed_tokens def free_buffer_requests(self) -> None: """Free buffer requests for the KV connector.""" diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 4b4281125a15..2109d2c13909 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -72,7 +72,7 @@ def __init__( config=self.vllm_config, role=KVConnectorRole.SCHEDULER) num_gpu_blocks = self.cache_config.num_gpu_blocks - assert isinstance(num_gpu_blocks, int) and num_gpu_blocks > 0 + assert num_gpu_blocks is not None and num_gpu_blocks > 0 # Create the KV cache manager. self.kv_cache_manager = KVCacheManager( @@ -326,7 +326,7 @@ def schedule(self) -> SchedulerOutput: # we must call kv_cache_manager.free_buffer_requests() below. if self.connector is not None: computed_blocks, num_computed_tokens = \ - self.kv_cache_manager.alloc_and_get_external_blocks( + self.kv_cache_manager.alloc_and_append_external_blocks( request, computed_blocks, num_computed_tokens, self.connector) From 329f2e7d69f2db5cc47a3a0f119602c70c7929c0 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 14 Apr 2025 01:28:38 +0000 Subject: [PATCH 062/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/sched/scheduler.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 2109d2c13909..8f3228d85ab2 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -469,14 +469,18 @@ def schedule(self) -> SchedulerOutput: grammar_bitmask=grammar_bitmask, ) - # NOTE(Kuntai): this function is designed for multiple purposes: - # 1. Plan the KV cache store - # 2. 
Wrap up all the KV cache load / save ops into an opaque object - # 3. Clear the internal states of the connector if self.connector is not None: + # NOTE(Kuntai): this function is designed for multiple purposes: + # 1. Plan the KV cache store + # 2. Wrap up all the KV cache load / save ops into an opaque object + # 3. Clear the internal states of the connector meta = self.connector.build_connector_meta(scheduler_output) scheduler_output.kv_connector_metadata = meta + # KVConnector: once we have allocated the buffer blocks to the + # "real" requests (via prefix caching), free the tmp buffer reqs. + self.kv_cache_manager.free_buffer_requests() + # Advance the number of computed tokens for the request AFTER # the request is scheduled. # 1. The scheduler_output of the current step has to include the @@ -489,11 +493,6 @@ def schedule(self) -> SchedulerOutput: for req_id, num_scheduled_token in num_scheduled_tokens.items(): self.requests[req_id].num_computed_tokens += num_scheduled_token - # KVConnector: once we have allocated the buffer blocks to the - # "real" requests (via prefix caching), free the tmp buffer reqs. 
- if self.connector is not None: - self.kv_cache_manager.free_buffer_requests() - self.finished_req_ids = set() return scheduler_output From 72041ca98cb69c4404182ebe391546e9d43db623 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 14 Apr 2025 01:32:41 +0000 Subject: [PATCH 063/116] finish docstring Signed-off-by: rshaw@neuralmagic.com --- .../kv_transfer/kv_connector/v1/base.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 854eb01a5c3c..3c72536c54a7 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -4,6 +4,20 @@ communication in vLLM v1 The class provides the following primitives: + Scheduler-side: runs in the scheduler, binds metadata, which + is used by the worker-side to load/save KV cache. + get_num_new_matched_tokens() - get number of new tokens + that exist in the remote KV cache + update_state_after_alloc() - update KVConnector state after + temporary buffer alloc by the CacheManager. + + Worker-side: runs in each worker, loads/saves KV cache to/from + the Connector based on the metadata. 
+ start_load_kv() - starts loading all KVs (maybe async) + wait_for_layer_load() - blocks until layer i load is done + + save_kv_layer() - starts saving KV for layer i (maybe async) + wait_for_save() - blocks until all saves are done """ import enum From f9f87f278a49e1c89951bd368227319f1508eea0 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 14 Apr 2025 03:54:33 +0000 Subject: [PATCH 064/116] updated Signed-off-by: rshaw@neuralmagic.com --- .../kv_transfer/kv_connector/v1/base.py | 5 +- .../v1/shared_storage_connector.py | 6 +- .../kv_transfer/kv_transfer_state.py | 7 +- vllm/forward_context.py | 9 +- vllm/v1/core/kv_cache_manager.py | 250 +++++++++++------- vllm/v1/core/sched/scheduler.py | 27 +- 6 files changed, 175 insertions(+), 129 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 3c72536c54a7..1d814c73ccee 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -184,10 +184,7 @@ def get_num_new_matched_tokens( def update_state_after_alloc(self, request: "Request", num_allocated_blocks: int): """ - Update KVConnector state after temporary buffer alloc. - - For SharedStorageConnector, update _request_needs_load - if the CacheManager this allocated blocks for us. + Update KVConnector state after block allocation. """ pass diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index aff6dbe9314f..d27d6252867d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -262,10 +262,10 @@ def get_num_new_matched_tokens( def update_state_after_alloc(self, request: "Request", num_allocated_blocks: int): """ - Update KVConnector state after temporary buffer alloc. 
+ Update KVConnector state after block allocation. - For SharedStorageConnector, update _request_needs_load - if the CacheManager this allocated blocks for us. + If blocks were allocated, add to _requests_need_load, + such that we load the KVs in the next forward pass. """ if num_allocated_blocks > 0: self._requests_need_load.append(request.request_id) diff --git a/vllm/distributed/kv_transfer/kv_transfer_state.py b/vllm/distributed/kv_transfer/kv_transfer_state.py index 16f71b95c377..820723148517 100644 --- a/vllm/distributed/kv_transfer/kv_transfer_state.py +++ b/vllm/distributed/kv_transfer/kv_transfer_state.py @@ -57,11 +57,8 @@ def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None: if vllm_config.kv_transfer_config is None: return - if all([ - vllm_config.kv_transfer_config.is_kv_transfer_instance, - _KV_CONNECTOR_AGENT is None - ]): - + if (vllm_config.kv_transfer_config.is_kv_transfer_instance + and _KV_CONNECTOR_AGENT is None): if envs.VLLM_USE_V1: _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector_v1( config=vllm_config, role=KVConnectorRole.WORKER) diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 34ec0441f69a..06790d8ee2f8 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -105,8 +105,10 @@ def set_forward_context(attn_metadata: Any, # KVConnector: trigger (possibly async) load before forward. # Each attn layer will block until the reading is complete. - if has_kv_transfer_group() and attn_metadata is not None and \ - is_v1_kv_transfer_group(): + trigger_kv_transfer = (attn_metadata is not None + and has_kv_transfer_group() + and is_v1_kv_transfer_group()) + if trigger_kv_transfer: kv_connector = get_kv_transfer_group() assert isinstance(kv_connector, KVConnectorBase_V1) kv_connector.start_load_kv(_forward_context) @@ -149,8 +151,7 @@ def set_forward_context(attn_metadata: Any, # KVConnector: each attn layer triggers (possibly async) save. 
# Ensure all those operations complete before forward() is done. - if has_kv_transfer_group() and attn_metadata is not None and \ - is_v1_kv_transfer_group(): + if trigger_kv_transfer: kv_connector = get_kv_transfer_group() assert isinstance(kv_connector, KVConnectorBase_V1) kv_connector.wait_for_save() diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index f1bcae4fde3a..6e101642ba44 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -85,11 +85,6 @@ def __init__( self.num_cached_block: dict[str, int] = {} self.prefix_cache_stats = PrefixCacheStats() - # KVConnector: buffer reqs for KVConnector. We write - # the external KVs to the "buffer" req and leverage - # prefix caching to share with the "real" req - self.kv_connector_buffer_req_ids: list[str] = [] - @property def usage(self) -> float: """Get the KV cache usage. @@ -169,13 +164,59 @@ def get_computed_blocks( num_computed_tokens = len(computed_blocks) * self.block_size return computed_blocks, num_computed_tokens + def alloc_and_append_external_blocks( + self, + request: "Request", + computed_blocks: list["KVCacheBlock"], + num_computed_tokens: int, + kv_connector: "KVConnectorBase_V1", + ) -> tuple[list["KVCacheBlock"], int]: + """Get the external blocks for the request. + Note that the computed blocks must be full. + + Args: + request: The request to get the computed blocks. + num_tokens: The number of tokens to allocate. Note that this does + not include the tokens that have already been computed. + computed_blocks: List of computed blocks from prefix cache. + num_computed_tokens: Number of computed tokens. + + Returns: + A tuple containing: + - A list of blocks that are computed for the request. + - The number of computed tokens. + """ + + # Check for cache hit beyond the num_computed_tokens. 
+ num_tokens_needed = kv_connector.get_num_new_matched_tokens( + request, num_computed_tokens) + num_allocated_blocks = 0 + + # If cache hit, allocate slots for external KVs. + if num_tokens_needed > 0: + allocated_blocks = self._allocate_slots_for_external( + request=request, + num_tokens=num_tokens_needed, + computed_blocks=computed_blocks, + ) + + # Append to the new_computed_blocks. + if allocated_blocks: + computed_blocks = computed_blocks + allocated_blocks + num_allocated_blocks = len(allocated_blocks) + + # Update KVConnector state: + # * SharedStorageConnector: adds to _requests_need_load. + kv_connector.update_state_after_alloc(request, num_allocated_blocks) + num_computed_tokens = len(computed_blocks) * self.block_size + return computed_blocks, num_computed_tokens + def allocate_slots( self, request: Request, num_tokens: int, new_computed_blocks: Optional[list[KVCacheBlock]] = None, num_lookahead_tokens: int = 0, - skip_preallocate: bool = False, ) -> Optional[list[KVCacheBlock]]: """Add slots for a request with new tokens to append. @@ -183,8 +224,8 @@ def allocate_slots( request: The request to allocate slots. num_tokens: The number of tokens to allocate. Note that this does not include the tokens that have already been computed. - new_computed_blocks: A list of new computed blocks just hitting the - prefix caching. + new_computed_blocks: A list of new computed blocks just hitting + the prefix caching. Blocks layout: ----------------------------------------------------------------------- @@ -218,24 +259,15 @@ def allocate_slots( req_blocks, request.num_computed_tokens) self.block_pool.free_blocks(removed_blocks) - # The number of computed tokens is the number of computed tokens plus - # the new prefix caching hits + # The number of computed tokens is the number of computed tokens + # plus the new prefix caching hits. 
num_computed_tokens = (request.num_computed_tokens + len(new_computed_blocks) * self.block_size) - num_required_blocks = cdiv( - num_computed_tokens + num_tokens + num_lookahead_tokens, - self.block_size) - num_new_blocks = (num_required_blocks - len(req_blocks) - - len(new_computed_blocks)) - # If a computed block of a request is an eviction candidate (in the - # free queue and ref_cnt == 0), it cannot be counted as a free block - # when allocating this request. - num_evictable_computed_blocks = sum(1 for blk in new_computed_blocks - if blk.ref_cnt == 0) - if (num_new_blocks > self.block_pool.get_num_free_blocks() - - num_evictable_computed_blocks): - # Cannot allocate new blocks + # Get the number of incremental blocks to allocate. + num_incr_blocks = self._get_num_incremental_new_blocks( + num_tokens, req_blocks, num_computed_tokens, new_computed_blocks) + if num_incr_blocks <= 0: return None # Touch the computed blocks to make sure they won't be evicted. @@ -251,31 +283,27 @@ def allocate_slots( # Start to handle new blocks - if num_new_blocks <= 0: + if num_incr_blocks <= 0: # No new block is needed. new_blocks = [] else: # Get new blocks from the free block pool considering # preallocated blocks. - _num_preallocate_blocks = 0 if skip_preallocate else \ - self.num_preallocate_blocks num_preallocate_blocks = max( - 0, _num_preallocate_blocks - + 0, self.num_preallocate_blocks - num_lookahead_tokens // self.block_size) - num_preallocate_blocks = self.num_preallocate_blocks \ - if not skip_preallocate else 0 - num_new_blocks = min( - num_new_blocks + num_preallocate_blocks, + num_incr_blocks = min( + num_incr_blocks + num_preallocate_blocks, self.block_pool.get_num_free_blocks(), # Should not exceed the maximum number of blocks per request. # This is especially because the block table has the shape # [..., max_num_blocks_per_req]. 
self.max_num_blocks_per_req - len(req_blocks), ) - assert num_new_blocks > 0 + assert num_incr_blocks > 0 # Concatenate the computed block IDs and the new block IDs. - new_blocks = self.block_pool.get_new_blocks(num_new_blocks) + new_blocks = self.block_pool.get_new_blocks(num_incr_blocks) req_blocks.extend(new_blocks) if not self.enable_caching: @@ -305,6 +333,99 @@ def allocate_slots( request.request_id] = num_full_blocks_after_append return new_blocks + def _allocate_slots_for_external( + self, + request: Request, + num_tokens: int, + computed_blocks: Optional[list[KVCacheBlock]] = None, + ) -> list[KVCacheBlock]: + """Allocate for external blocks and append to new_computed_block: + + Args: + request: The request to allocate slots. + num_tokens: The number of tokens to allocate. Note that this does + not include the tokens that have already been computed. + computed_blocks: A list of computed blocks from prefix cache. + + Returns: + A list of new allocated blocks for the external blocks. + """ + # TODO(rob): validate this works well with sliding window + # and other specialized managers. + assert self.enable_caching, "Remote cache requires prefix caching." + + if num_tokens == 0: + raise ValueError("num_tokens must be greater than 0") + + computed_blocks = computed_blocks or [] + + # NOTE(rob): this will returns > [] if there is a preemption + # TODO(rob): handle case of preemption w/ remote KV as FUP. + req_blocks = self.req_to_blocks[request.request_id] + assert len(req_blocks) == 0 + + # The number of computed tokens is the number of computed tokens plus + # the new prefix caching hits + num_computed_tokens = (request.num_computed_tokens + + len(computed_blocks) * self.block_size) + + # Get the number of incremental blocks to allocate. + num_incr_blocks = self._get_num_incremental_new_blocks( + num_tokens, req_blocks, num_computed_tokens, computed_blocks) + if num_incr_blocks <= 0: + # TODO(rob): handle case with not enough external KVs in FUP. 
+ raise NotImplementedError( + "TODO: handle preemption with external KV cache") + return [] + + assert num_incr_blocks <= self.block_pool.get_num_free_blocks() + num_existing_blocks = len(req_blocks) + len(computed_blocks) + assert (num_incr_blocks + <= self.max_num_blocks_per_req - num_existing_blocks) + + # Return the new blocks. + return self.block_pool.get_new_blocks(num_incr_blocks) + + def _get_num_incremental_new_blocks( + self, + num_tokens: int, + req_blocks: list[KVCacheBlock], + num_computed_tokens: int, + new_computed_blocks: list[KVCacheBlock], + ) -> int: + """ + Get number of incremental blocks to allocate for the request. + + Args: + num_tokens: The number of tokens to allocate. Note that this does + not include the tokens that have already been computed. + req_blocks: The blocks corresponding to this request. + num_computed_tokens: The number of computed tokens for this request, + including req_blocks and new_computed_blocks. + new_computed_blocks: List of new computed blocks from prefix cache. + Returns: + If not enough free blocks: return 0 + Else: return the number of incremental blocks to allocate. + """ + + # Allocate blocks for the tokens beyond the prefix cache hit. + num_required_blocks = cdiv(num_computed_tokens + num_tokens, + self.block_size) + num_new_blocks = (num_required_blocks - len(req_blocks) - + len(new_computed_blocks)) + + # If a computed block of a request is an eviction candidate (in the + # free queue and ref_cnt == 0), it cannot be counted as a free block + # when allocating this request. + num_evictable_computed_blocks = sum(1 for blk in new_computed_blocks + if blk.ref_cnt == 0) + + # Return True if there are enough free blocks. + if (num_new_blocks > self.block_pool.get_num_free_blocks() - + num_evictable_computed_blocks): + return 0 + return num_new_blocks + def _free_by_request_id(self, request_id: str) -> None: """Free the blocks allocated for the request. 
@@ -408,66 +529,3 @@ def free_block_hashes(self, request: Request) -> None: is finished, not when it is preempted. """ self._free_block_hashes_by_request_id(request.request_id) - - def alloc_and_append_external_blocks( - self, - request: "Request", - computed_blocks: list["KVCacheBlock"], - num_computed_tokens: int, - kv_connector: "KVConnectorBase_V1", - ) -> tuple[list["KVCacheBlock"], int]: - - # Check for cache hit beyond the num_computed_tokens. - need_to_allocate = kv_connector.get_num_new_matched_tokens( - request, num_computed_tokens) - num_allocated_blocks = 0 - - # Cache hit: allocate buffer. - if need_to_allocate > 0: - # HACK: We don't want the scheduler see the blocks are allocated - # and associated with the current request. Instead, we want the - # scheduler find that the blocks are already allocated and they - # are associated with some other requests (i.e., the case of - # prefix caching. - - old_req_id = request.request_id - request.request_id = f"{old_req_id}-buf-for-kv-connector" - allocated_blocks = self.allocate_slots( - request, - need_to_allocate, - computed_blocks, - skip_preallocate=True, - ) - self.kv_connector_buffer_req_ids.append(request.request_id) - request.request_id = old_req_id - - if allocated_blocks is None: - allocated_blocks = [] - - # Avoid over-allocating - num_expected_blocks = need_to_allocate // self.block_size - allocated_blocks = allocated_blocks[:num_expected_blocks] - - # Back-off one block if the external KV is for all tokens - if ((len(allocated_blocks) + len(computed_blocks)) * - self.block_size >= request.num_prompt_tokens): - allocated_blocks = allocated_blocks[:-1] - - if allocated_blocks: - computed_blocks = computed_blocks + allocated_blocks - num_allocated_blocks = len(allocated_blocks) - - # Update internal state. In case of: - # * SharedStorageConnector: add req_id to _requests_need_load - # so that we know to load this requests KVs later. 
- kv_connector.update_state_after_alloc(request, num_allocated_blocks) - num_computed_tokens = len(computed_blocks) * self.block_size - return computed_blocks, num_computed_tokens - - def free_buffer_requests(self) -> None: - """Free buffer requests for the KV connector.""" - - for request_id in self.kv_connector_buffer_req_ids: - self._free_by_request_id(request_id) - self._free_block_hashes_by_request_id(request_id) - self.kv_connector_buffer_req_ids.clear() diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 8f3228d85ab2..5efddfcb710c 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -317,18 +317,15 @@ def schedule(self) -> SchedulerOutput: computed_blocks, num_computed_tokens = \ self.kv_cache_manager.get_computed_blocks(request) - # KVConnector: get blocks externally-cached tokens. - # Internally, this allocates a "buffer" req with a prompt - # corresponding to externally cached tokens. In alloc_slots - # below, we will compute a cache hit and thus skip the - # computation for externally cached tokens. - # NOTE: since this allocates temporary buffer requests, - # we must call kv_cache_manager.free_buffer_requests() below. + # Get externally-cached tokens if using a KVConnector. if self.connector is not None: computed_blocks, num_computed_tokens = \ self.kv_cache_manager.alloc_and_append_external_blocks( - request, computed_blocks, - num_computed_tokens, self.connector) + request=request, + computed_blocks=computed_blocks, + num_computed_tokens=num_computed_tokens, + kv_connector=self.connector, + ) # Number of tokens to be scheduled. # We use `request.num_tokens` instead of @@ -469,18 +466,14 @@ def schedule(self) -> SchedulerOutput: grammar_bitmask=grammar_bitmask, ) + # NOTE(Kuntai): this function is designed for multiple purposes: + # 1. Plan the KV cache store + # 2. Wrap up all the KV cache load / save ops into an opaque object + # 3. 
Clear the internal states of the connector if self.connector is not None: - # NOTE(Kuntai): this function is designed for multiple purposes: - # 1. Plan the KV cache store - # 2. Wrap up all the KV cache load / save ops into an opaque object - # 3. Clear the internal states of the connector meta = self.connector.build_connector_meta(scheduler_output) scheduler_output.kv_connector_metadata = meta - # KVConnector: once we have allocated the buffer blocks to the - # "real" requests (via prefix caching), free the tmp buffer reqs. - self.kv_cache_manager.free_buffer_requests() - # Advance the number of computed tokens for the request AFTER # the request is scheduled. # 1. The scheduler_output of the current step has to include the From 33f6e603b2ba0c6656f8188a2b75e7acc82cc18d Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 14 Apr 2025 04:04:09 +0000 Subject: [PATCH 065/116] make pr easier to read Signed-off-by: rshaw@neuralmagic.com --- vllm/distributed/kv_transfer/__init__.py | 5 ----- vllm/distributed/kv_transfer/kv_connector/base.py | 4 ++++ vllm/distributed/kv_transfer/kv_connector/factory.py | 2 +- vllm/distributed/kv_transfer/kv_transfer_state.py | 2 +- vllm/v1/core/kv_cache_manager.py | 4 ++-- 5 files changed, 8 insertions(+), 9 deletions(-) diff --git a/vllm/distributed/kv_transfer/__init__.py b/vllm/distributed/kv_transfer/__init__.py index f62b403eaf4a..ec07c6fe0d12 100644 --- a/vllm/distributed/kv_transfer/__init__.py +++ b/vllm/distributed/kv_transfer/__init__.py @@ -1,14 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Union -from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase -from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1 from vllm.distributed.kv_transfer.kv_transfer_state import ( ensure_kv_transfer_initialized, get_kv_transfer_group, has_kv_transfer_group, is_v1_kv_transfer_group) -KVConnectorBaseType = Union[KVConnectorBase, KVConnectorBase_V1] - __all__ = [ 
"get_kv_transfer_group", "has_kv_transfer_group", "is_v1_kv_transfer_group", "ensure_kv_transfer_initialized", diff --git a/vllm/distributed/kv_transfer/kv_connector/base.py b/vllm/distributed/kv_transfer/kv_connector/base.py index 57c764b481c2..0d1a3d40af41 100644 --- a/vllm/distributed/kv_transfer/kv_connector/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/base.py @@ -12,6 +12,7 @@ import torch +from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1 from vllm.sequence import IntermediateTensors if TYPE_CHECKING: @@ -121,3 +122,6 @@ def recv_kv_caches_and_hidden_states( """ raise NotImplementedError + + +KVConnectorBaseType = Union[KVConnectorBase, KVConnectorBase_V1] diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index 586482f1d089..6739e616a1e9 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -4,7 +4,7 @@ from typing import TYPE_CHECKING, Callable, Dict, Type import vllm.envs as envs -from vllm.distributed.kv_transfer import KVConnectorBaseType +from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBaseType from vllm.distributed.kv_transfer.kv_connector.v1 import (KVConnectorBase_V1, KVConnectorRole) from vllm.logger import init_logger diff --git a/vllm/distributed/kv_transfer/kv_transfer_state.py b/vllm/distributed/kv_transfer/kv_transfer_state.py index 820723148517..25d2f2cf5c6e 100644 --- a/vllm/distributed/kv_transfer/kv_transfer_state.py +++ b/vllm/distributed/kv_transfer/kv_transfer_state.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING, Optional from vllm import envs -from vllm.distributed.kv_transfer import KVConnectorBaseType +from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBaseType from vllm.distributed.kv_transfer.kv_connector.factory import ( KVConnectorFactory) from vllm.distributed.kv_transfer.kv_connector.v1 import 
(KVConnectorBase_V1, diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 6e101642ba44..d6c38fcef267 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -259,8 +259,8 @@ def allocate_slots( req_blocks, request.num_computed_tokens) self.block_pool.free_blocks(removed_blocks) - # The number of computed tokens is the number of computed tokens - # plus the new prefix caching hits. + # The number of computed tokens is the number of computed tokens plus + # the new prefix caching hits num_computed_tokens = (request.num_computed_tokens + len(new_computed_blocks) * self.block_size) From db28310cb4129fbd04194f452188fa5dbfa56cdf Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 14 Apr 2025 04:06:21 +0000 Subject: [PATCH 066/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/kv_cache_manager.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index d6c38fcef267..0b5aadecaf43 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -176,8 +176,6 @@ def alloc_and_append_external_blocks( Args: request: The request to get the computed blocks. - num_tokens: The number of tokens to allocate. Note that this does - not include the tokens that have already been computed. computed_blocks: List of computed blocks from prefix cache. num_computed_tokens: Number of computed tokens. 
From 3701b5d76c29c03be5541b0edc0b2d60b8d1f42f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 14 Apr 2025 04:16:31 +0000 Subject: [PATCH 067/116] stash Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/kv_cache_manager.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 0b5aadecaf43..6882aa9090d3 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -178,6 +178,7 @@ def alloc_and_append_external_blocks( request: The request to get the computed blocks. computed_blocks: List of computed blocks from prefix cache. num_computed_tokens: Number of computed tokens. + kv_connector: KVCache connector object. Returns: A tuple containing: From deb1323634a28439a9cdb36f7b38085e733e5abc Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 14 Apr 2025 04:26:08 +0000 Subject: [PATCH 068/116] type checking is wrong for ReqMeta Signed-off-by: rshaw@neuralmagic.com --- .../kv_transfer/kv_connector/v1/shared_storage_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index d27d6252867d..fd327154d336 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -34,7 +34,7 @@ class ReqMeta: @staticmethod def from_request(request: "Request", block_size: int, is_store: bool) -> "ReqMeta": - valid_num_tokens = align_to_block_size(request.num_prompt_tokens, + valid_num_tokens = align_to_block_size(len(request.prompt_token_ids), block_size) token_ids = torch.tensor(request.prompt_token_ids)[:valid_num_tokens] block_ids = torch.tensor(request.block_ids) From be789bf48c6d077370cab5c4e5e22852889a4186 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 14 Apr 2025 
04:33:19 +0000 Subject: [PATCH 069/116] add todo for the morning Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/kv_cache_manager.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 6882aa9090d3..1dcb4b813bb6 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -383,7 +383,11 @@ def _allocate_slots_for_external( <= self.max_num_blocks_per_req - num_existing_blocks) # Return the new blocks. - return self.block_pool.get_new_blocks(num_incr_blocks) + new_blocks = self.block_pool.get_new_blocks(num_incr_blocks) + + # TODO(rob): need to hash the blocks here. + + return new_blocks def _get_num_incremental_new_blocks( self, From a3e5762ce388ed56d12834bd8f22310739ef6541 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 14 Apr 2025 04:34:47 +0000 Subject: [PATCH 070/116] revert by id Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/kv_cache_manager.py | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 1dcb4b813bb6..9b8f988642b1 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -429,13 +429,15 @@ def _get_num_incremental_new_blocks( return 0 return num_new_blocks - def _free_by_request_id(self, request_id: str) -> None: + def free(self, request: Request) -> None: """Free the blocks allocated for the request. + When caching is enabled, we free the blocks in reverse order so that + the tail blocks are evicted first. Args: - request_id: The request ID to free the blocks. + request: The request to free the blocks.
""" - blocks = self.req_to_blocks.pop(request_id, []) + blocks = self.req_to_blocks.pop(request.request_id, []) ordered_blocks: Iterable[KVCacheBlock] = blocks if self.enable_caching: # Free blocks in reverse order so that the tail blocks are @@ -443,17 +445,7 @@ def _free_by_request_id(self, request_id: str) -> None: ordered_blocks = reversed(blocks) self.block_pool.free_blocks(ordered_blocks) - self.num_cached_block.pop(request_id, None) - - def free(self, request: Request) -> None: - """Free the blocks allocated for the request. - When caching is enabled, we free the blocks in reverse order so that - the tail blocks are evicted first. - - Args: - request: The request to free the blocks. - """ - self._free_by_request_id(request.request_id) + self.num_cached_block.pop(request.request_id, None) def reset_prefix_cache(self) -> bool: """Reset prefix cache. This function may be used in RLHF @@ -517,18 +509,10 @@ def get_num_common_prefix_blocks( break return num_common_blocks - def _free_block_hashes_by_request_id(self, request_id: str) -> None: - """Free the block hashes allocated for the request. - - Args: - request_id: The request ID to free the block hashes. - """ - self.req_to_block_hashes.pop(request_id, None) - def free_block_hashes(self, request: Request) -> None: """Discard the block hashes for the request. NOTE: Unlike `free`, this method should be called only when the request is finished, not when it is preempted. 
""" - self._free_block_hashes_by_request_id(request.request_id) + self.req_to_block_hashes.pop(request.request_id, None) From a03d707cca9a1a7181fc530c5862673d51bcd280 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 14 Apr 2025 04:38:43 +0000 Subject: [PATCH 071/116] revert by id Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/kv_cache_manager.py | 40 ++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 9b8f988642b1..6194d1e02078 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -225,6 +225,9 @@ def allocate_slots( not include the tokens that have already been computed. new_computed_blocks: A list of new computed blocks just hitting the prefix caching. + num_lookahead_tokens: The number of speculative tokens to allocate. + This is used by spec decode proposers with kv-cache such + as eagle. Blocks layout: ----------------------------------------------------------------------- @@ -264,9 +267,10 @@ def allocate_slots( len(new_computed_blocks) * self.block_size) # Get the number of incremental blocks to allocate. - num_incr_blocks = self._get_num_incremental_new_blocks( - num_tokens, req_blocks, num_computed_tokens, new_computed_blocks) - if num_incr_blocks <= 0: + num_new_blocks = self._get_num_new_blocks(num_tokens, req_blocks, + num_computed_tokens, + new_computed_blocks) + if num_new_blocks <= 0: return None # Touch the computed blocks to make sure they won't be evicted. @@ -282,7 +286,7 @@ def allocate_slots( # Start to handle new blocks - if num_incr_blocks <= 0: + if num_new_blocks <= 0: # No new block is needed.
new_blocks = [] else: @@ -291,18 +295,18 @@ def allocate_slots( num_preallocate_blocks = max( 0, self.num_preallocate_blocks - num_lookahead_tokens // self.block_size) - num_incr_blocks = min( - num_incr_blocks + num_preallocate_blocks, + num_new_blocks = min( + num_new_blocks + num_preallocate_blocks, self.block_pool.get_num_free_blocks(), # Should not exceed the maximum number of blocks per request. # This is especially because the block table has the shape # [..., max_num_blocks_per_req]. self.max_num_blocks_per_req - len(req_blocks), ) - assert num_incr_blocks > 0 + assert num_new_blocks > 0 # Concatenate the computed block IDs and the new block IDs. - new_blocks = self.block_pool.get_new_blocks(num_incr_blocks) + new_blocks = self.block_pool.get_new_blocks(num_new_blocks) req_blocks.extend(new_blocks) if not self.enable_caching: @@ -369,27 +373,29 @@ def _allocate_slots_for_external( len(computed_blocks) * self.block_size) # Get the number of incremental blocks to allocate. - num_incr_blocks = self._get_num_incremental_new_blocks( - num_tokens, req_blocks, num_computed_tokens, computed_blocks) - if num_incr_blocks <= 0: + num_new_blocks = self._get_num_new_blocks(num_tokens, req_blocks, + num_computed_tokens, + computed_blocks) + if num_new_blocks <= 0: # TODO(rob): handle case with not enough external KVs in FUP. raise NotImplementedError( "TODO: handle preemption with external KV cache") return [] - assert num_incr_blocks <= self.block_pool.get_num_free_blocks() + assert num_new_blocks <= self.block_pool.get_num_free_blocks() num_existing_blocks = len(req_blocks) + len(computed_blocks) - assert (num_incr_blocks + assert (num_new_blocks <= self.max_num_blocks_per_req - num_existing_blocks) # Return the new blocks. - new_blocks = self.block_pool.get_new_blocks(num_incr_blocks) + new_blocks = self.block_pool.get_new_blocks(num_new_blocks) - # TODO(rob): need to hash the blocks here. + # TODO(rob): need to hash the blocks here. 
The current impl + # is broken without this. return new_blocks - def _get_num_incremental_new_blocks( + def _get_num_new_blocks( self, num_tokens: int, req_blocks: list[KVCacheBlock], @@ -397,7 +403,7 @@ def _get_num_incremental_new_blocks( new_computed_blocks: list[KVCacheBlock], ) -> int: """ - Get number of incremental blocks to allocate for the request. + Get number of new blocks to allocate for the request. Args: num_tokens: The number of tokens to allocate. Note that this does From f6960000e48912a36927cb04cc1d5d4cf6f974c9 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 14 Apr 2025 04:40:10 +0000 Subject: [PATCH 072/116] revery by id Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/kv_cache_manager.py | 1 - vllm/v1/core/sched/scheduler.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 6194d1e02078..5cd22e599d62 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -30,7 +30,6 @@ def __init__( caching_hash_algo: str = "builtin", num_preallocate_tokens: int = 64, log_stats: bool = False, - connector: Optional["KVConnectorBase_V1"] = None, ) -> None: assert len(kv_cache_config.kv_cache_groups) == 1, ( "KVCacheManager does not support hybrid models with more than 1 " diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 5efddfcb710c..db78a2e19c33 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -81,7 +81,7 @@ def __init__( enable_caching=self.cache_config.enable_prefix_caching, caching_hash_algo=self.cache_config.prefix_caching_hash_algo, log_stats=self.log_stats, - connector=self.connector) + ) self.block_size = self.cache_config.block_size # req_id -> Request From 1d85e635ba61860bc4a28b6b82e097bf023f6f7c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 14 Apr 2025 04:41:04 +0000 Subject: [PATCH 073/116] readabilty Signed-off-by: 
rshaw@neuralmagic.com --- vllm/v1/core/kv_cache_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 5cd22e599d62..019b5ab663e6 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -222,8 +222,8 @@ def allocate_slots( request: The request to allocate slots. num_tokens: The number of tokens to allocate. Note that this does not include the tokens that have already been computed. - new_computed_blocks: A list of new computed blocks just hitting - the prefix caching. + new_computed_blocks: A list of new computed blocks just hitting the + prefix caching. num_lookahead_tokens: The number of speculative tokens to allocate. This is used by spec decode proposers with kv-cache such as eagle. From 521ed1421046c97452f360c432f8b1df51d13b25 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 14 Apr 2025 04:43:54 +0000 Subject: [PATCH 074/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/kv_cache_manager.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 019b5ab663e6..c7a0f06f2b53 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -442,6 +442,7 @@ def free(self, request: Request) -> None: Args: request: The request to free the blocks. """ + # Default to [] in case a request is freed (aborted) before alloc.
blocks = self.req_to_blocks.pop(request.request_id, []) ordered_blocks: Iterable[KVCacheBlock] = blocks if self.enable_caching: From 6709943f83cc1387869aa47a4707f6fea07df995 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 14 Apr 2025 04:46:11 +0000 Subject: [PATCH 075/116] nits Signed-off-by: rshaw@neuralmagic.com --- vllm/distributed/kv_transfer/kv_connector_agent.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector_agent.py b/vllm/distributed/kv_transfer/kv_connector_agent.py index 9a300fc7cb9d..8f42207c1ebf 100644 --- a/vllm/distributed/kv_transfer/kv_connector_agent.py +++ b/vllm/distributed/kv_transfer/kv_connector_agent.py @@ -63,7 +63,6 @@ def send_kv_caches_and_hidden_states( hidden_or_intermediate_states) def close(self) -> None: - self.connector.close() def recv_kv_caches_and_hidden_states( From 44ea15600687d55617ad629ddc95ce6b1d7b1717 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 14 Apr 2025 12:29:30 +0000 Subject: [PATCH 076/116] cleanup Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/kv_cache_manager.py | 1 + vllm/v1/core/sched/scheduler.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index c7a0f06f2b53..7145a44f5582 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -269,6 +269,7 @@ def allocate_slots( num_new_blocks = self._get_num_new_blocks(num_tokens, req_blocks, num_computed_tokens, new_computed_blocks) + if num_new_blocks <= 0: return None diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index db78a2e19c33..d1f875b90a88 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -80,8 +80,7 @@ def __init__( max_model_len=self.max_model_len, enable_caching=self.cache_config.enable_prefix_caching, caching_hash_algo=self.cache_config.prefix_caching_hash_algo, - 
log_stats=self.log_stats, - ) + log_stats=self.log_stats) self.block_size = self.cache_config.block_size # req_id -> Request From 818010139c5357e704d8a546525e03718ba7f78e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 14 Apr 2025 12:36:42 +0000 Subject: [PATCH 077/116] cleaning Signed-off-by: rshaw@neuralmagic.com --- tests/v1/core/test_scheduler.py | 4 ++-- vllm/v1/core/kv_cache_manager.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 007be7bbc76b..7d6b40d408c3 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -66,6 +66,7 @@ def create_scheduler( cache_dtype="auto", **kwargs_cache, ) + cache_config.num_gpu_blocks = 10000 vllm_config = VllmConfig( scheduler_config=scheduler_config, model_config=model_config, @@ -79,12 +80,11 @@ def create_scheduler( FullAttentionSpec(16, 1, 1, torch.float32, False)) ], ) - cache_config.num_gpu_blocks = 10000 return Scheduler( vllm_config, kv_cache_config=kv_cache_config, - structured_output_manager=StructuredOutputManager(vllm_config), log_stats=True, + structured_output_manager=StructuredOutputManager(vllm_config), ) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 7145a44f5582..f26e13f5687f 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -429,7 +429,7 @@ def _get_num_new_blocks( num_evictable_computed_blocks = sum(1 for blk in new_computed_blocks if blk.ref_cnt == 0) - # Return True if there are enough free blocks. + # Return 0 if not enough blocks. 
if (num_new_blocks > self.block_pool.get_num_free_blocks() - num_evictable_computed_blocks): return 0 From c3a2cc65cdfd4ae419aec031a89fa2c03b528c4f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 14 Apr 2025 13:06:24 +0000 Subject: [PATCH 078/116] fix bug Signed-off-by: rshaw@neuralmagic.com --- tests/v1/core/test_scheduler.py | 2 +- vllm/v1/core/kv_cache_manager.py | 20 ++++++++++++-------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 7d6b40d408c3..69cc77f64ff4 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -81,7 +81,7 @@ def create_scheduler( ], ) return Scheduler( - vllm_config, + vllm_config=vllm_config, kv_cache_config=kv_cache_config, log_stats=True, structured_output_manager=StructuredOutputManager(vllm_config), diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index f26e13f5687f..b6310f7e5e1c 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -269,8 +269,8 @@ def allocate_slots( num_new_blocks = self._get_num_new_blocks(num_tokens, req_blocks, num_computed_tokens, new_computed_blocks) - - if num_new_blocks <= 0: + if num_new_blocks is None: + # Cannot allocate new blocks return None # Touch the computed blocks to make sure they won't be evicted. @@ -303,7 +303,7 @@ def allocate_slots( # [..., max_num_blocks_per_req]. self.max_num_blocks_per_req - len(req_blocks), ) - assert num_new_blocks > 0 + assert num_new_blocks is not None and num_new_blocks > 0 # Concatenate the computed block IDs and the new block IDs. new_blocks = self.block_pool.get_new_blocks(num_new_blocks) @@ -376,12 +376,16 @@ def _allocate_slots_for_external( num_new_blocks = self._get_num_new_blocks(num_tokens, req_blocks, num_computed_tokens, computed_blocks) - if num_new_blocks <= 0: + if num_new_blocks is None: # TODO(rob): handle case with not enough external KVs in FUP. 
raise NotImplementedError( "TODO: handle preemption with external KV cache") return [] + if num_new_blocks <= 0: + # No new block is needed. + new_blocks = [] + assert num_new_blocks <= self.block_pool.get_num_free_blocks() num_existing_blocks = len(req_blocks) + len(computed_blocks) assert (num_new_blocks @@ -401,7 +405,7 @@ def _get_num_new_blocks( req_blocks: list[KVCacheBlock], num_computed_tokens: int, new_computed_blocks: list[KVCacheBlock], - ) -> int: + ) -> Optional[int]: """ Get number of new blocks to allocate for the request. @@ -413,7 +417,7 @@ def _get_num_new_blocks( including req_blocks and new_computed_blocks. new_computed_blocks: List of new computed blocks from prefix cache. Returns: - If not enough free blocks: return 0 + If not enough free blocks: returns None. Else: return the number of incremental blocks to allocate. """ @@ -429,10 +433,10 @@ def _get_num_new_blocks( num_evictable_computed_blocks = sum(1 for blk in new_computed_blocks if blk.ref_cnt == 0) - # Return 0 if not enough blocks. + # Return None if not enough blocks for the request. 
if (num_new_blocks > self.block_pool.get_num_free_blocks() - num_evictable_computed_blocks): - return 0 + return None return num_new_blocks def free(self, request: Request) -> None: From 5273e240e797cde1b08dde1fd59ba9cf1f7c2351 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 14 Apr 2025 13:09:10 +0000 Subject: [PATCH 079/116] updated Signed-off-by: rshaw@neuralmagic.com --- tests/v1/core/test_scheduler.py | 2 +- vllm/distributed/kv_transfer/kv_connector_agent.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 69cc77f64ff4..fa53b0de668d 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -66,7 +66,6 @@ def create_scheduler( cache_dtype="auto", **kwargs_cache, ) - cache_config.num_gpu_blocks = 10000 vllm_config = VllmConfig( scheduler_config=scheduler_config, model_config=model_config, @@ -80,6 +79,7 @@ def create_scheduler( FullAttentionSpec(16, 1, 1, torch.float32, False)) ], ) + cache_config.num_gpu_blocks = 10000 return Scheduler( vllm_config=vllm_config, kv_cache_config=kv_cache_config, diff --git a/vllm/distributed/kv_transfer/kv_connector_agent.py b/vllm/distributed/kv_transfer/kv_connector_agent.py index 8f42207c1ebf..9d7145098105 100644 --- a/vllm/distributed/kv_transfer/kv_connector_agent.py +++ b/vllm/distributed/kv_transfer/kv_connector_agent.py @@ -21,7 +21,7 @@ logger = init_logger(__name__) -class KVConnectorAgent: +class KVTransferAgent: """ A class designated for distributed KV transfer From 913325fd4396e3faa14db78bf418be51e1d70531 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 14 Apr 2025 13:11:10 +0000 Subject: [PATCH 080/116] update name Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/kv_cache_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index b6310f7e5e1c..12e318898848 100644 
--- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -192,7 +192,7 @@ def alloc_and_append_external_blocks( # If cache hit, allocate slots for external KVs. if num_tokens_needed > 0: - allocated_blocks = self._allocate_slots_for_external( + allocated_blocks = self._allocate_slots_for_connector( request=request, num_tokens=num_tokens_needed, computed_blocks=computed_blocks, @@ -336,7 +336,7 @@ def allocate_slots( request.request_id] = num_full_blocks_after_append return new_blocks - def _allocate_slots_for_external( + def _allocate_slots_for_connector( self, request: Request, num_tokens: int, From 75c24d3ba878a01166ce5e3062a7a5a53acfbd17 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 14 Apr 2025 13:13:13 +0000 Subject: [PATCH 081/116] cleanup Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/kv_cache_manager.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 12e318898848..7b287f436338 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -277,8 +277,9 @@ def allocate_slots( if self.enable_caching: self.block_pool.touch(new_computed_blocks) else: - assert not new_computed_blocks, "Computed blocks should "\ - "be empty when prefix caching is disabled" + assert not new_computed_blocks, ( + "Computed blocks should be empty when " + "prefix caching is disabled") # Append the new computed blocks to the request blocks until now to # avoid the case where the new blocks cannot be allocated. 
From 17a36180059b07fcc9a8437413c604004336d5fd Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 14 Apr 2025 15:40:51 +0000 Subject: [PATCH 082/116] updated Signed-off-by: rshaw@neuralmagic.com --- .../kv_transfer/kv_connector/v1/base.py | 6 +- vllm/v1/core/kv_cache_manager.py | 99 +++++++++++++++---- 2 files changed, 81 insertions(+), 24 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 1d814c73ccee..66ad8d82b120 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -99,9 +99,9 @@ def _get_connector_metadata(self) -> KVConnectorMetadata: def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None: """ - Start loading the KV cache from the connector buffer to vLLM's - paged KV buffer. This is called from the forward context before - the forward pass to enable async loading during model execution. + Start loading the KV cache from the connector to vLLM's paged + KV buffer. This is called from the forward context before the + forward pass to enable async loading during model execution. Args: forward_context (ForwardContext): the forward context. diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 7b287f436338..73af826c17db 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -201,6 +201,7 @@ def alloc_and_append_external_blocks( # Append to the new_computed_blocks. if allocated_blocks: computed_blocks = computed_blocks + allocated_blocks + assert allocated_blocks is not None num_allocated_blocks = len(allocated_blocks) # Update KVConnector state: @@ -266,9 +267,12 @@ def allocate_slots( len(new_computed_blocks) * self.block_size) # Get the number of incremental blocks to allocate. 
- num_new_blocks = self._get_num_new_blocks(num_tokens, req_blocks, - num_computed_tokens, - new_computed_blocks) + num_new_blocks = self._get_num_new_blocks( + num_tokens, + req_blocks, + num_computed_tokens, + new_computed_blocks, + num_lookahead_tokens=num_lookahead_tokens) if num_new_blocks is None: # Cannot allocate new blocks return None @@ -342,8 +346,8 @@ def _allocate_slots_for_connector( request: Request, num_tokens: int, computed_blocks: Optional[list[KVCacheBlock]] = None, - ) -> list[KVCacheBlock]: - """Allocate for external blocks and append to new_computed_block: + ) -> Optional[list[KVCacheBlock]]: + """Allocate for external blocks and append to new_computed_block Args: request: The request to allocate slots. @@ -363,9 +367,10 @@ def _allocate_slots_for_connector( computed_blocks = computed_blocks or [] - # NOTE(rob): this will returns > [] if there is a preemption - # TODO(rob): handle case of preemption w/ remote KV as FUP. + # NOTE(rob): req_to_blocks[req_id] will have items if the + # request is a resumed preemption. req_blocks = self.req_to_blocks[request.request_id] + # TODO(rob): handle case of preemption w/ remote KV as FUP. assert len(req_blocks) == 0 # The number of computed tokens is the number of computed tokens plus @@ -378,22 +383,20 @@ def _allocate_slots_for_connector( num_computed_tokens, computed_blocks) if num_new_blocks is None: - # TODO(rob): handle case with not enough external KVs in FUP. - raise NotImplementedError( - "TODO: handle preemption with external KV cache") - return [] + # Cannot allocate new blocks + return None if num_new_blocks <= 0: # No new block is needed. 
new_blocks = [] + else: + assert num_new_blocks <= self.block_pool.get_num_free_blocks() + num_existing_blocks = len(req_blocks) + len(computed_blocks) + assert (num_new_blocks + <= self.max_num_blocks_per_req - num_existing_blocks) - assert num_new_blocks <= self.block_pool.get_num_free_blocks() - num_existing_blocks = len(req_blocks) + len(computed_blocks) - assert (num_new_blocks - <= self.max_num_blocks_per_req - num_existing_blocks) - - # Return the new blocks. - new_blocks = self.block_pool.get_new_blocks(num_new_blocks) + # Get the new blocks. + new_blocks = self.block_pool.get_new_blocks(num_new_blocks) # TODO(rob): need to hash the blocks here. The current impl # is broken without this. @@ -406,6 +409,7 @@ def _get_num_new_blocks( req_blocks: list[KVCacheBlock], num_computed_tokens: int, new_computed_blocks: list[KVCacheBlock], + num_lookahead_tokens: int = 0, ) -> Optional[int]: """ Get number of new blocks to allocate for the request. @@ -413,18 +417,22 @@ def _get_num_new_blocks( Args: num_tokens: The number of tokens to allocate. Note that this does not include the tokens that have already been computed. - req_blocks: The blocks corresponding to this request. + reqblocks: The blocks corresponding to this request. num_computed_tokens: The number of computed tokens for this request, including req_blocks and new_computed_blocks. new_computed_blocks: List of new computed blocks from prefix cache. + num_lookahead_tokens: The number of speculative tokens to allocate. + This is used by spec decode proposers with kv-cache such + as eagle. Returns: If not enough free blocks: returns None. Else: return the number of incremental blocks to allocate. """ # Allocate blocks for the tokens beyond the prefix cache hit. 
- num_required_blocks = cdiv(num_computed_tokens + num_tokens, - self.block_size) + num_required_blocks = cdiv( + num_computed_tokens + num_tokens + num_lookahead_tokens, + self.block_size) num_new_blocks = (num_required_blocks - len(req_blocks) - len(new_computed_blocks)) @@ -440,6 +448,55 @@ def _get_num_new_blocks( return None return num_new_blocks + def _cache_blocks( + self, + request: Request, + req_blocks: list[KVCacheBlock], + num_computed_tokens: int, + num_tokens: int, + new_computed_blocks: list[KVCacheBlock], + ): + """ + Cache blocks in the Block Pool. + + Args: + request: The request to cache the blocks. + req_blocks: All blocks in the request. + block_hashes: Block hashes of the blocks in the request. Note that + this list may be shorter than the blocks list. In this case the + missed block hash will be computed in this function. + num_cached_blocks: The number of blocks that are already cached. + num_full_blocks: The number of blocks that are full and should + be cached after this function. + block_size: Number of tokens in each block. + hash_fn: The hash function to use for block hashes. + Returns: + If not enough free blocks: returns None. + Else: return the number of incremental blocks to allocate. + """ + # Use `new_computed_blocks` for a new request, and `num_cached_block` + # for a running request. + num_cached_blocks = self.num_cached_block.get(request.request_id, + len(new_computed_blocks)) + # Speculated tokens might be rejected in the future, so we do + # not cache any speculated tokens. We only cache blocks with + # generated (accepted) tokens. 
+ num_full_blocks_after_append = (num_computed_tokens + num_tokens - len( + request.spec_token_ids)) // self.block_size + + self.block_pool.cache_full_blocks( + request=request, + blocks=req_blocks, + block_hashes=self.req_to_block_hashes[request.request_id], + num_cached_blocks=num_cached_blocks, + num_full_blocks=num_full_blocks_after_append, + block_size=self.block_size, + hash_fn=self.caching_hash_fn, + ) + + self.num_cached_block[ + request.request_id] = num_full_blocks_after_append + def free(self, request: Request) -> None: """Free the blocks allocated for the request. When caching is enabled, we free the blocks in reverse order so that From b4bd1173f6b8277bd94d1fa3ac6026838f6e9272 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 14 Apr 2025 22:40:49 +0000 Subject: [PATCH 083/116] updated Signed-off-by: rshaw@neuralmagic.com --- .../kv_transfer/kv_connector/v1/base.py | 8 +- .../v1/shared_storage_connector.py | 4 +- vllm/v1/core/kv_cache_manager.py | 180 ++---------------- vllm/v1/core/sched/scheduler.py | 26 ++- 4 files changed, 43 insertions(+), 175 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 66ad8d82b120..373bd2a47b13 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -27,6 +27,7 @@ import torch +from vllm.logger import init_logger from vllm.v1.core.sched.output import SchedulerOutput if TYPE_CHECKING: @@ -35,6 +36,8 @@ from vllm.forward_context import ForwardContext from vllm.v1.request import Request +logger = init_logger(__name__) + class KVConnectorRole(enum.Enum): # Connector running in the scheduler process @@ -52,6 +55,9 @@ class KVConnectorMetadata: class KVConnectorBase_V1(ABC): def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): + logger.warning( + "Initializing KVConnectorBase_V1. 
This API is experimental and " + "subject to change in the future as we iterate the design.") self._connector_metadata = KVConnectorMetadata() self._vllm_config = vllm_config self._role = role @@ -182,7 +188,7 @@ def get_num_new_matched_tokens( @abstractmethod def update_state_after_alloc(self, request: "Request", - num_allocated_blocks: int): + num_external_tokens: int): """ Update KVConnector state after block allocation. """ diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index fd327154d336..b84df0eaad2d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -260,14 +260,14 @@ def get_num_new_matched_tokens( return num_tokens_to_check - num_computed_tokens def update_state_after_alloc(self, request: "Request", - num_allocated_blocks: int): + num_external_tokens: int): """ Update KVConnector state after block allocation. If blocks were allocated, add to _requests_need_load, such that we load the KVs in the next forward pass. 
""" - if num_allocated_blocks > 0: + if num_external_tokens > 0: self._requests_need_load.append(request.request_id) def build_connector_meta( diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 73af826c17db..e4b475ba6519 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -15,7 +15,7 @@ from vllm.v1.request import Request, RequestStatus if TYPE_CHECKING: - from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1 + pass logger = init_logger(__name__) @@ -163,59 +163,13 @@ def get_computed_blocks( num_computed_tokens = len(computed_blocks) * self.block_size return computed_blocks, num_computed_tokens - def alloc_and_append_external_blocks( - self, - request: "Request", - computed_blocks: list["KVCacheBlock"], - num_computed_tokens: int, - kv_connector: "KVConnectorBase_V1", - ) -> tuple[list["KVCacheBlock"], int]: - """Get the external blocks for the request. - Note that the computed blocks must be full. - - Args: - request: The request to get the computed blocks. - computed_blocks: List of computed blocks from prefix cache. - num_computed_tokens: Number of computed tokens. - kv_connector: KVCache connector object. - - Returns: - A tuple containing: - - A list of blocks that are computed for the request. - - The number of computed tokens. - """ - - # Check for cache hit beyond the num_computed_tokens. - num_tokens_needed = kv_connector.get_num_new_matched_tokens( - request, num_computed_tokens) - num_allocated_blocks = 0 - - # If cache hit, allocate slots for external KVs. - if num_tokens_needed > 0: - allocated_blocks = self._allocate_slots_for_connector( - request=request, - num_tokens=num_tokens_needed, - computed_blocks=computed_blocks, - ) - - # Append to the new_computed_blocks. 
- if allocated_blocks: - computed_blocks = computed_blocks + allocated_blocks - assert allocated_blocks is not None - num_allocated_blocks = len(allocated_blocks) - - # Update KVConnector state: - # * SharedStorageConnector: adds to _requests_need_load. - kv_connector.update_state_after_alloc(request, num_allocated_blocks) - num_computed_tokens = len(computed_blocks) * self.block_size - return computed_blocks, num_computed_tokens - def allocate_slots( self, request: Request, num_tokens: int, new_computed_blocks: Optional[list[KVCacheBlock]] = None, num_lookahead_tokens: int = 0, + num_external_tokens: int = 0, ) -> Optional[list[KVCacheBlock]]: """Add slots for a request with new tokens to append. @@ -228,6 +182,8 @@ def allocate_slots( num_lookahead_tokens: The number of speculative tokens to allocate. This is used by spec decode proposers with kv-cache such as eagle. + num_external_tokens: The number of external tokens to allocate. + This is used by KVConnector for remote KV cache. Blocks layout: ----------------------------------------------------------------------- @@ -266,14 +222,19 @@ def allocate_slots( num_computed_tokens = (request.num_computed_tokens + len(new_computed_blocks) * self.block_size) - # Get the number of incremental blocks to allocate. - num_new_blocks = self._get_num_new_blocks( - num_tokens, - req_blocks, - num_computed_tokens, - new_computed_blocks, - num_lookahead_tokens=num_lookahead_tokens) - if num_new_blocks is None: + num_required_blocks = cdiv( + num_computed_tokens + num_external_tokens + num_tokens + + num_lookahead_tokens, self.block_size) + num_new_blocks = (num_required_blocks - len(req_blocks) - + len(new_computed_blocks)) + + # If a computed block of a request is an eviction candidate (in the + # free queue and ref_cnt == 0), it cannot be counted as a free block + # when allocating this request. 
+ num_evictable_computed_blocks = sum(1 for blk in new_computed_blocks + if blk.ref_cnt == 0) + if (num_new_blocks > self.block_pool.get_num_free_blocks() - + num_evictable_computed_blocks): # Cannot allocate new blocks return None @@ -341,113 +302,6 @@ def allocate_slots( request.request_id] = num_full_blocks_after_append return new_blocks - def _allocate_slots_for_connector( - self, - request: Request, - num_tokens: int, - computed_blocks: Optional[list[KVCacheBlock]] = None, - ) -> Optional[list[KVCacheBlock]]: - """Allocate for external blocks and append to new_computed_block - - Args: - request: The request to allocate slots. - num_tokens: The number of tokens to allocate. Note that this does - not include the tokens that have already been computed. - computed_blocks: A list of computed blocks from prefix cache. - - Returns: - A list of new allocated blocks for the external blocks. - """ - # TODO(rob): validate this works well with sliding window - # and other specialized managers. - assert self.enable_caching, "Remote cache requires prefix caching." - - if num_tokens == 0: - raise ValueError("num_tokens must be greater than 0") - - computed_blocks = computed_blocks or [] - - # NOTE(rob): req_to_blocks[req_id] will have items if the - # request is a resumed preemption. - req_blocks = self.req_to_blocks[request.request_id] - # TODO(rob): handle case of preemption w/ remote KV as FUP. - assert len(req_blocks) == 0 - - # The number of computed tokens is the number of computed tokens plus - # the new prefix caching hits - num_computed_tokens = (request.num_computed_tokens + - len(computed_blocks) * self.block_size) - - # Get the number of incremental blocks to allocate. - num_new_blocks = self._get_num_new_blocks(num_tokens, req_blocks, - num_computed_tokens, - computed_blocks) - if num_new_blocks is None: - # Cannot allocate new blocks - return None - - if num_new_blocks <= 0: - # No new block is needed. 
- new_blocks = [] - else: - assert num_new_blocks <= self.block_pool.get_num_free_blocks() - num_existing_blocks = len(req_blocks) + len(computed_blocks) - assert (num_new_blocks - <= self.max_num_blocks_per_req - num_existing_blocks) - - # Get the new blocks. - new_blocks = self.block_pool.get_new_blocks(num_new_blocks) - - # TODO(rob): need to hash the blocks here. The current impl - # is broken without this. - - return new_blocks - - def _get_num_new_blocks( - self, - num_tokens: int, - req_blocks: list[KVCacheBlock], - num_computed_tokens: int, - new_computed_blocks: list[KVCacheBlock], - num_lookahead_tokens: int = 0, - ) -> Optional[int]: - """ - Get number of new blocks to allocate for the request. - - Args: - num_tokens: The number of tokens to allocate. Note that this does - not include the tokens that have already been computed. - reqblocks: The blocks corresponding to this request. - num_computed_tokens: The number of computed tokens for this request, - including req_blocks and new_computed_blocks. - new_computed_blocks: List of new computed blocks from prefix cache. - num_lookahead_tokens: The number of speculative tokens to allocate. - This is used by spec decode proposers with kv-cache such - as eagle. - Returns: - If not enough free blocks: returns None. - Else: return the number of incremental blocks to allocate. - """ - - # Allocate blocks for the tokens beyond the prefix cache hit. - num_required_blocks = cdiv( - num_computed_tokens + num_tokens + num_lookahead_tokens, - self.block_size) - num_new_blocks = (num_required_blocks - len(req_blocks) - - len(new_computed_blocks)) - - # If a computed block of a request is an eviction candidate (in the - # free queue and ref_cnt == 0), it cannot be counted as a free block - # when allocating this request. - num_evictable_computed_blocks = sum(1 for blk in new_computed_blocks - if blk.ref_cnt == 0) - - # Return None if not enough blocks for the request. 
- if (num_new_blocks > self.block_pool.get_num_free_blocks() - - num_evictable_computed_blocks): - return None - return num_new_blocks - def _cache_blocks( self, request: Request, diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index d1f875b90a88..26b8a8267195 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -317,20 +317,18 @@ def schedule(self) -> SchedulerOutput: self.kv_cache_manager.get_computed_blocks(request) # Get externally-cached tokens if using a KVConnector. + num_external_tokens = 0 if self.connector is not None: - computed_blocks, num_computed_tokens = \ - self.kv_cache_manager.alloc_and_append_external_blocks( - request=request, - computed_blocks=computed_blocks, - num_computed_tokens=num_computed_tokens, - kv_connector=self.connector, - ) + num_external_tokens = ( + self.connector.get_num_new_matched_tokens( + request, num_computed_tokens)) # Number of tokens to be scheduled. # We use `request.num_tokens` instead of # `request.num_prompt_tokens` to consider the resumed requests, # which have output tokens. - num_new_tokens = request.num_tokens - num_computed_tokens + num_new_tokens = (request.num_tokens - num_computed_tokens - + num_external_tokens) if (0 < self.scheduler_config.long_prefill_token_threshold < num_new_tokens): num_new_tokens = ( @@ -352,11 +350,21 @@ def schedule(self) -> SchedulerOutput: new_encoder_budget = encoder_budget new_blocks = self.kv_cache_manager.allocate_slots( - request, num_new_tokens, computed_blocks) + request=request, + num_tokens=num_new_tokens, + new_computed_blocks=computed_blocks, + num_external_tokens=num_external_tokens) if new_blocks is None: # The request cannot be scheduled. break + # KVConnector: update internal state after allocation. + # This information is used to determine if a load is + # needed for this request. 
+ if self.connector is not None: + self.connector.update_state_after_alloc( + request, num_external_tokens) + self.waiting.popleft() if request.use_structured_output: structured_output_request_ids[ From 01caf6124bd1dd5f2442dd9c1250bd97dfb2b210 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 14 Apr 2025 22:53:04 +0000 Subject: [PATCH 084/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/sched/scheduler.py | 37 ++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 26b8a8267195..8f4b6c47f2e2 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -189,8 +189,11 @@ def schedule(self) -> SchedulerOutput: if request.has_encoder_inputs: (encoder_inputs_to_schedule, num_new_tokens, new_encoder_budget) = self._try_schedule_encoder_inputs( - request, request.num_computed_tokens, num_new_tokens, - encoder_budget) + request=request, + num_computed_tokens=request.num_computed_tokens, + num_new_tokens=num_new_tokens, + num_external_tokens=0, + encoder_budget=encoder_budget) if num_new_tokens == 0: # The request cannot be scheduled because the encoder budget # or the encoder cache is exhausted. @@ -340,8 +343,11 @@ def schedule(self) -> SchedulerOutput: if request.has_encoder_inputs: (encoder_inputs_to_schedule, num_new_tokens, new_encoder_budget) = self._try_schedule_encoder_inputs( - request, num_computed_tokens, num_new_tokens, - encoder_budget) + request=request, + num_computed_tokens=request.num_computed_tokens, + num_new_tokens=num_new_tokens, + num_external_tokens=num_external_tokens, + encoder_budget=encoder_budget) if num_new_tokens == 0: # The request cannot be scheduled. 
break @@ -528,6 +534,7 @@ def _try_schedule_encoder_inputs( self, request: Request, num_computed_tokens: int, + num_external_tokens: int, num_new_tokens: int, encoder_budget: int, ) -> tuple[list[int], int, int]: @@ -538,7 +545,8 @@ def _try_schedule_encoder_inputs( An encoder input will be scheduled if: - Its output tokens overlap with the range of tokens being computed in this step, i.e., - [num_computed_tokens, num_computed_tokens + num_new_tokens). + [num_existing_tokens, num_existing_tokens + num_new_tokens). + where num_existing_tokens = num_computed_tokens + num_external_tokens - It is not already computed and stored in the encoder cache. - There is sufficient encoder token budget to process it. - The encoder cache has space to store it. @@ -549,6 +557,7 @@ def _try_schedule_encoder_inputs( """ encoder_inputs_to_schedule: list[int] = [] mm_positions = request.mm_positions + num_existing_tokens = num_computed_tokens + num_external_tokens assert mm_positions is not None assert len(mm_positions) > 0 for i, pos_info in enumerate(mm_positions): @@ -556,12 +565,12 @@ def _try_schedule_encoder_inputs( num_encoder_tokens = pos_info.length # The encoder output is needed if the two ranges overlap: - # [num_computed_tokens, num_computed_tokens + num_new_tokens) and + # [num_existing_tokens, num_existing_tokens + num_new_tokens) and # [start_pos, start_pos + num_encoder_tokens) - if start_pos >= num_computed_tokens + num_new_tokens: + if start_pos >= num_existing_tokens + num_new_tokens: # The encoder input is not needed in this step. break - if start_pos + num_encoder_tokens <= num_computed_tokens: + if start_pos + num_encoder_tokens <= num_existing_tokens: # The encoder input is already computed and stored # in the decoder's KV cache. continue @@ -574,10 +583,10 @@ def _try_schedule_encoder_inputs( # partially schedule a multimodal item. If the scheduled range would # only cover part of the mm input, roll back to before the mm item. 
if (self.scheduler_config.disable_chunked_mm_input - and num_computed_tokens < start_pos - and (num_computed_tokens + num_new_tokens) + and num_existing_tokens < start_pos + and (num_existing_tokens + num_new_tokens) < (start_pos + num_encoder_tokens)): - num_new_tokens = start_pos - num_computed_tokens + num_new_tokens = start_pos - num_existing_tokens break if (not self.encoder_cache_manager.can_allocate(request, i) @@ -586,12 +595,12 @@ def _try_schedule_encoder_inputs( # NOTE(woosuk): We assume that the encoder input tokens should # be processed altogether, as the encoder usually uses # bidirectional attention. - if num_computed_tokens < start_pos: + if num_existing_tokens < start_pos: # We only schedule the decoder tokens just before the # encoder input. - num_new_tokens = start_pos - num_computed_tokens + num_new_tokens = start_pos - num_existing_tokens else: - # Because of prefix caching, num_computed_tokens is greater + # Because of prefix caching, num_existing_tokens is greater # than start_pos even though its encoder input is not # available. In this case, we can't schedule any token for # the request in this step. From d8549cbb3b841d4f1822706e52533bdc915aacdc Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 14 Apr 2025 22:57:28 +0000 Subject: [PATCH 085/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/kv_cache_manager.py | 49 -------------------------------- 1 file changed, 49 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index e4b475ba6519..21d8555a385c 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -302,55 +302,6 @@ def allocate_slots( request.request_id] = num_full_blocks_after_append return new_blocks - def _cache_blocks( - self, - request: Request, - req_blocks: list[KVCacheBlock], - num_computed_tokens: int, - num_tokens: int, - new_computed_blocks: list[KVCacheBlock], - ): - """ - Cache blocks in the Block Pool. 
- - Args: - request: The request to cache the blocks. - req_blocks: All blocks in the request. - block_hashes: Block hashes of the blocks in the request. Note that - this list may be shorter than the blocks list. In this case the - missed block hash will be computed in this function. - num_cached_blocks: The number of blocks that are already cached. - num_full_blocks: The number of blocks that are full and should - be cached after this function. - block_size: Number of tokens in each block. - hash_fn: The hash function to use for block hashes. - Returns: - If not enough free blocks: returns None. - Else: return the number of incremental blocks to allocate. - """ - # Use `new_computed_blocks` for a new request, and `num_cached_block` - # for a running request. - num_cached_blocks = self.num_cached_block.get(request.request_id, - len(new_computed_blocks)) - # Speculated tokens might be rejected in the future, so we do - # not cache any speculated tokens. We only cache blocks with - # generated (accepted) tokens. - num_full_blocks_after_append = (num_computed_tokens + num_tokens - len( - request.spec_token_ids)) // self.block_size - - self.block_pool.cache_full_blocks( - request=request, - blocks=req_blocks, - block_hashes=self.req_to_block_hashes[request.request_id], - num_cached_blocks=num_cached_blocks, - num_full_blocks=num_full_blocks_after_append, - block_size=self.block_size, - hash_fn=self.caching_hash_fn, - ) - - self.num_cached_block[ - request.request_id] = num_full_blocks_after_append - def free(self, request: Request) -> None: """Free the blocks allocated for the request. 
When caching is enabled, we free the blocks in reverse order so that From b362ef1f1580753faf4d77a984d4385ea9350369 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 15 Apr 2025 01:31:28 +0000 Subject: [PATCH 086/116] trying to fix mm, added tests Signed-off-by: rshaw@neuralmagic.com --- tests/v1/core/test_scheduler.py | 107 +++++++++++++++++++++++++++++++- vllm/v1/core/sched/scheduler.py | 2 +- 2 files changed, 105 insertions(+), 4 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index fa53b0de668d..b2a2d0378ced 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -1,10 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 from typing import Optional +from unittest.mock import Mock import pytest import torch -from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig +from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig, + SchedulerConfig, VllmConfig) from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange from vllm.sampling_params import SamplingParams from vllm.v1.core.sched.output import SchedulerOutput @@ -25,6 +27,8 @@ def create_scheduler( enable_prefix_caching: Optional[bool] = None, long_prefill_token_threshold: int = 0, disable_chunked_mm_input: bool = False, + use_kv_connector: bool = False, + num_blocks: int = 10000, ) -> Scheduler: '''Create scheduler under test. 
@@ -66,20 +70,27 @@ def create_scheduler( cache_dtype="auto", **kwargs_cache, ) + kv_transfer_config = KVTransferConfig( + kv_connector="SharedStorageConnector", + kv_role="kv_both", + kv_connector_extra_config={"shared_storage_path": "local_storage"}, + ) if use_kv_connector else None + vllm_config = VllmConfig( scheduler_config=scheduler_config, model_config=model_config, cache_config=cache_config, + kv_transfer_config=kv_transfer_config, ) kv_cache_config = KVCacheConfig( - num_blocks=10000, # A large number of blocks to hold all requests + num_blocks=num_blocks, # A large number of blocks to hold all requests tensors={}, kv_cache_groups=[ KVCacheGroupSpec(['layer'], FullAttentionSpec(16, 1, 1, torch.float32, False)) ], ) - cache_config.num_gpu_blocks = 10000 + cache_config.num_gpu_blocks = num_blocks return Scheduler( vllm_config=vllm_config, kv_cache_config=kv_cache_config, @@ -758,3 +769,93 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected): stats = scheduler_stats.spec_decoding_stats assert stats.num_draft_tokens == expected[0] assert stats.num_accepted_tokens == expected[1] + + +def test_kv_connector_basic(): + """Test basic functionality of KVConnector.""" + + scheduler = create_scheduler(use_kv_connector=True) + NUM_TOTAL_BLOCKS = ( + scheduler.kv_cache_manager.block_pool.get_num_free_blocks()) + BLOCK_SIZE = scheduler.cache_config.block_size + + # Every request should match a single block. 
+ NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE + scheduler.connector.get_num_new_matched_tokens = Mock(name="method") + scheduler.connector.get_num_new_matched_tokens.return_value = ( + NUM_MATCHED_NEW_TOKENS) + + NUM_REQUESTS = 2 + NUM_TOKENS = BLOCK_SIZE * 2 + MAX_TOKENS = 3 + requests = create_requests(num_requests=NUM_REQUESTS, + num_tokens=NUM_TOKENS, + max_tokens=MAX_TOKENS) + req_ids = [] + req_to_index = {} + for i, request in enumerate(requests): + scheduler.add_request(request) + req_ids.append(request.request_id) + req_to_index[request.request_id] = i + + MODEL_RUNNER_OUTPUT = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_to_index, + sampled_token_ids=[[0]] * len(req_ids), + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + + # We should get an external prefix cache hit for every request. + output = scheduler.schedule() + assert len(output.kv_connector_metadata.requests) == NUM_REQUESTS + for _, num_scheduled_tokens in output.num_scheduled_tokens.items(): + # We should only schedule new tokens beyond this request. + assert num_scheduled_tokens == NUM_TOKENS - NUM_MATCHED_NEW_TOKENS + + # We should not have any connector metadata in running. + all_finished = False + _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT) + while not all_finished: + # Schedule + a few iterations until stopping. + output = scheduler.schedule() + assert len(output.kv_connector_metadata.requests) == 0 + ecos = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT) + all_done = True + for eco in ecos.outputs: + if eco.finish_reason is None: + all_done = False + all_finished = all_done + + # Confirm we have no leaks (all blocks are freed). + assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \ + == NUM_TOTAL_BLOCKS + + # Every request should match a single block. 
+ NUM_TOKENS_PREFIX = NUM_TOKENS + NUM_TOKENS = NUM_TOKENS_PREFIX * 2 + requests = create_requests(num_requests=NUM_REQUESTS, + num_tokens=NUM_TOKENS, + max_tokens=MAX_TOKENS) + + # We should get an external prefix cache hit for every request, + # but only for the i + output = scheduler.schedule() + assert len(output.kv_connector_metadata.requests) == NUM_REQUESTS + for _, num_scheduled_tokens in output.num_scheduled_tokens.items(): + # We should only schedule new tokens beyond this request. + assert num_scheduled_tokens == (NUM_TOKENS - NUM_TOKENS_PREFIX - + NUM_MATCHED_NEW_TOKENS) + + all_finished = False + while not all_finished: + # Schedule + a few iterations until stopping. + output = scheduler.schedule() + assert len(output.kv_connector_metadata.requests) == 0 + ecos = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT) + all_done = True + for eco in ecos.outputs: + if eco.finish_reason is None: + all_done = False + all_finished = all_done diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 8f4b6c47f2e2..cb439fdb19f2 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -344,7 +344,7 @@ def schedule(self) -> SchedulerOutput: (encoder_inputs_to_schedule, num_new_tokens, new_encoder_budget) = self._try_schedule_encoder_inputs( request=request, - num_computed_tokens=request.num_computed_tokens, + num_computed_tokens=num_computed_tokens, num_new_tokens=num_new_tokens, num_external_tokens=num_external_tokens, encoder_budget=encoder_budget) From 78d523ef1ff9756d10938e6669ce66335e47b7c6 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 15 Apr 2025 01:33:43 +0000 Subject: [PATCH 087/116] update comment Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/kv_cache_manager.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 21d8555a385c..29fa848eabcd 100644 --- 
a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -183,7 +183,9 @@ def allocate_slots( This is used by spec decode proposers with kv-cache such as eagle. num_external_tokens: The number of external tokens to allocate. - This is used by KVConnector for remote KV cache. + This is used by KVConnector for remote KV cache. KVConnector + injects external tokens into the blocks during execute_model. + Blocks layout: ----------------------------------------------------------------------- From 4c38138205ea7b20b3036a072bbe0b6f33d2fa56 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 15 Apr 2025 01:39:02 +0000 Subject: [PATCH 088/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/kv_cache_manager.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 29fa848eabcd..b8687484776d 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -184,8 +184,7 @@ def allocate_slots( as eagle. num_external_tokens: The number of external tokens to allocate. This is used by KVConnector for remote KV cache. KVConnector - injects external tokens into the blocks during execute_model. - + injects external tokens into the blocks during execute_model. 
Blocks layout: ----------------------------------------------------------------------- From 7af6ce22da3c3b711e2e965ff2826b4659c6cac5 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 15 Apr 2025 02:14:32 +0000 Subject: [PATCH 089/116] commit test improvements Signed-off-by: rshaw@neuralmagic.com --- tests/v1/core/test_scheduler.py | 48 +++++++++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index b2a2d0378ced..a7cb3f7fd40b 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -774,7 +774,11 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected): def test_kv_connector_basic(): """Test basic functionality of KVConnector.""" - scheduler = create_scheduler(use_kv_connector=True) + scheduler = create_scheduler( + enable_prefix_caching=True, + use_kv_connector=True, + ) + NUM_TOTAL_BLOCKS = ( scheduler.kv_cache_manager.block_pool.get_num_free_blocks()) BLOCK_SIZE = scheduler.cache_config.block_size @@ -801,19 +805,25 @@ def test_kv_connector_basic(): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[[0]] * len(req_ids), + sampled_token_ids=[[1000]] * len(req_ids), spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, ) - # We should get an external prefix cache hit for every request. + # We should get an external prefix cache hit. output = scheduler.schedule() assert len(output.kv_connector_metadata.requests) == NUM_REQUESTS for _, num_scheduled_tokens in output.num_scheduled_tokens.items(): # We should only schedule new tokens beyond this request. assert num_scheduled_tokens == NUM_TOKENS - NUM_MATCHED_NEW_TOKENS + # Make sure we actually touched all the blocks. 
+ BLOCKS_PER_REQ = (NUM_TOKENS / BLOCK_SIZE + + scheduler.kv_cache_manager.num_preallocate_blocks) + assert (scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == + NUM_TOTAL_BLOCKS - NUM_REQUESTS * BLOCKS_PER_REQ) + # We should not have any connector metadata in running. all_finished = False _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT) @@ -827,6 +837,7 @@ def test_kv_connector_basic(): if eco.finish_reason is None: all_done = False all_finished = all_done + output = scheduler.schedule() # Confirm we have no leaks (all blocks are freed). assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \ @@ -838,9 +849,24 @@ def test_kv_connector_basic(): requests = create_requests(num_requests=NUM_REQUESTS, num_tokens=NUM_TOKENS, max_tokens=MAX_TOKENS) + req_ids = [] + req_to_index = {} + for i, request in enumerate(requests): + scheduler.add_request(request) + req_ids.append(request.request_id) + req_to_index[request.request_id] = i + + MODEL_RUNNER_OUTPUT = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_to_index, + sampled_token_ids=[[1000]] * len(req_ids), + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) - # We should get an external prefix cache hit for every request, - # but only for the i + # We should get a local cache hit of NUM_TOKENS_PREFIX and + # a remote KV cache hit of NUM_MATCHED_NEW_TOKENS. output = scheduler.schedule() assert len(output.kv_connector_metadata.requests) == NUM_REQUESTS for _, num_scheduled_tokens in output.num_scheduled_tokens.items(): @@ -848,7 +874,14 @@ def test_kv_connector_basic(): assert num_scheduled_tokens == (NUM_TOKENS - NUM_TOKENS_PREFIX - NUM_MATCHED_NEW_TOKENS) + # Make sure we actually touched all the blocks. 
+ BLOCKS_PER_REQ = (NUM_TOKENS / BLOCK_SIZE + + scheduler.kv_cache_manager.num_preallocate_blocks) + assert (scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == + NUM_TOTAL_BLOCKS - NUM_REQUESTS * BLOCKS_PER_REQ) + all_finished = False + _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT) while not all_finished: # Schedule + a few iterations until stopping. output = scheduler.schedule() @@ -859,3 +892,8 @@ def test_kv_connector_basic(): if eco.finish_reason is None: all_done = False all_finished = all_done + output = scheduler.schedule() + + # Confirm we have no leaks (all blocks are freed). + assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \ + == NUM_TOTAL_BLOCKS From 1ad993b752e4494459b80806b1d52155da0d6a48 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 15 Apr 2025 02:16:57 +0000 Subject: [PATCH 090/116] remove disaggregated tests Signed-off-by: rshaw@neuralmagic.com --- tests/disaggregated/__init__.py | 0 tests/disaggregated/test_simple_storage.py | 95 ---------------------- 2 files changed, 95 deletions(-) delete mode 100644 tests/disaggregated/__init__.py delete mode 100644 tests/disaggregated/test_simple_storage.py diff --git a/tests/disaggregated/__init__.py b/tests/disaggregated/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/disaggregated/test_simple_storage.py b/tests/disaggregated/test_simple_storage.py deleted file mode 100644 index f692e98e8804..000000000000 --- a/tests/disaggregated/test_simple_storage.py +++ /dev/null @@ -1,95 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -import os -import shutil - -import pytest - -from vllm import LLM, SamplingParams -from vllm.config import KVTransferConfig - - -@pytest.fixture(scope="function", autouse=True) -def cleanup(): - yield - if os.path.exists("output.txt"): - os.remove("output.txt") - if os.path.isdir("local_storage"): - shutil.rmtree("local_storage") - - -def test_integration(): - - sampling_params = 
SamplingParams(temperature=0, top_p=0.95, max_tokens=1) - - llm = LLM( - model="meta-llama/Llama-3.1-8B-Instruct", - enforce_eager=True, - gpu_memory_utilization=0.8, - kv_transfer_config=KVTransferConfig.from_cli( - '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both", ' - '"kv_connector_extra_config": ' - '{"shared_storage_path": "local_storage"}}')) - - context = "Hi " * 1000 - context2 = "Hey " * 500 - prompts = [ - context + "Hello, my name is", - context + "The capital of France is", - context2 + "Your name is", - context2 + "The capital of China is", - ] - - # 1ST generation (prefill instance) - outputs = llm.generate( - prompts, - sampling_params, - ) - - new_prompts = [] - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - new_prompts.append(prompt + generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - - # Write new_prompts to output.txt - with open("output.txt", "w") as f: - for prompt in new_prompts: - f.write(prompt + "\n") - print(f"Saved {len(new_prompts)} prompts to output.txt") - - del llm - - # Read prompts from output.txt - prompts = [] - try: - with open("output.txt") as f: - for line in f: - prompts.append(line.strip()) - print(f"Loaded {len(prompts)} prompts from output.txt") - except FileNotFoundError: - print("Error: output.txt file not found") - exit(-1) - - sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) - - decode_llm = LLM( - model="meta-llama/Llama-3.1-8B-Instruct", - enforce_eager=True, - gpu_memory_utilization=0.8, - kv_transfer_config=KVTransferConfig.from_cli( - '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both",' - '"kv_connector_extra_config": {"shared_storage_path": "local_storage"}}' # noqa: E501 - )) - - # 2nd generation (decode instance) - outputs = decode_llm.generate(prompts, sampling_params) - - new_prompts = [] - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - 
new_prompts.append(prompt + generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - assert len(generated_text) > 5 From 3a08ddac17cf8de4e919af762439f5a70b6a5136 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 15 Apr 2025 02:28:19 +0000 Subject: [PATCH 091/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/sched/scheduler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index cb439fdb19f2..d71cddc87c84 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -397,7 +397,8 @@ def schedule(self) -> SchedulerOutput: num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens request.status = RequestStatus.RUNNING - request.num_computed_tokens = num_computed_tokens + request.num_computed_tokens = (num_computed_tokens + + num_external_tokens) # Encoder-related. if encoder_inputs_to_schedule: From e49874d09c6222a8a2ddd8192bcaf578117d1b14 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 15 Apr 2025 02:28:57 +0000 Subject: [PATCH 092/116] update comment Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/sched/scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index d71cddc87c84..4f12bc346b0e 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -315,7 +315,7 @@ def schedule(self) -> SchedulerOutput: skipped_waiting_requests.appendleft(request) continue - # Get already-cached tokens. + # Get locally-cached tokens. 
computed_blocks, num_computed_tokens = \ self.kv_cache_manager.get_computed_blocks(request) From dd7969a9d72ec51bf279663e61cf9d88267fb9ed Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 15 Apr 2025 02:32:38 +0000 Subject: [PATCH 093/116] fix test case Signed-off-by: rshaw@neuralmagic.com --- tests/v1/core/test_scheduler.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index a7cb3f7fd40b..92bfeb3b79d9 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -830,6 +830,9 @@ def test_kv_connector_basic(): while not all_finished: # Schedule + a few iterations until stopping. output = scheduler.schedule() + for _, num_scheduled_tokens in output.num_scheduled_tokens.items(): + # We should be in the decode phase now. + assert num_scheduled_tokens == 1 assert len(output.kv_connector_metadata.requests) == 0 ecos = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT) all_done = True @@ -885,6 +888,9 @@ def test_kv_connector_basic(): while not all_finished: # Schedule + a few iterations until stopping. output = scheduler.schedule() + for _, num_scheduled_tokens in output.num_scheduled_tokens.items(): + # We should be in the decode phase now. 
+ assert num_scheduled_tokens == 1 assert len(output.kv_connector_metadata.requests) == 0 ecos = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT) all_done = True From e1f130e42c39870b4e31b1fa6a770c20dcb1c89c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 15 Apr 2025 03:06:21 +0000 Subject: [PATCH 094/116] improve test code quality Signed-off-by: rshaw@neuralmagic.com --- tests/v1/core/test_scheduler.py | 165 +++++++++++++++++++------------ vllm/v1/core/kv_cache_manager.py | 7 +- 2 files changed, 105 insertions(+), 67 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 92bfeb3b79d9..f19495844555 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -771,26 +771,92 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected): assert stats.num_accepted_tokens == expected[1] +def _assert_right_scheduler_output( + output: SchedulerOutput, + num_requests: int, + expected_num_scheduled_tokens: int, +): + # We should inject the kv_connector_metadata. + assert len(output.kv_connector_metadata.requests) == num_requests + + # Only num_tokens - matched_num_new_tokens should be scheduled. 
+ for _, num_scheduled_tokens in output.num_scheduled_tokens.items(): + assert num_scheduled_tokens == expected_num_scheduled_tokens + + +def _assert_right_kv_cache_manager( + scheduler: Scheduler, + req_ids: list[str], + num_tokens: int, + block_size: int, + num_requests: int, + num_total_blocks: int, +): + """Assert KV Cache Manager Is Right After Remote Cache Hit.""" + + EXPECTED_ACTUAL_BLOCKS = num_tokens // block_size + EXPECTED_TOTAL_BLOCKS = (EXPECTED_ACTUAL_BLOCKS + + scheduler.kv_cache_manager.num_preallocate_blocks) + for req_id in req_ids: + blocks = scheduler.kv_cache_manager.req_to_blocks[req_id] + hashes = scheduler.kv_cache_manager.req_to_block_hashes[req_id] + assert (scheduler.kv_cache_manager.num_cached_block[req_id] == + EXPECTED_ACTUAL_BLOCKS) + assert len(blocks) == EXPECTED_TOTAL_BLOCKS + assert len(hashes) == EXPECTED_ACTUAL_BLOCKS + + # Make sure we actually touched all the blocks. + BLOCKS_PER_REQ = (num_tokens / block_size + + scheduler.kv_cache_manager.num_preallocate_blocks) + assert (scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == + num_total_blocks - num_requests * BLOCKS_PER_REQ) + + +def _step_until_done( + scheduler: Scheduler, + output: SchedulerOutput, + model_runner_output: ModelRunnerOutput, +): + all_finished = False + _ = scheduler.update_from_output(output, model_runner_output) + while not all_finished: + # Schedule + a few iterations until stopping. + output = scheduler.schedule() + for _, num_scheduled_tokens in output.num_scheduled_tokens.items(): + # We should be in the decode phase now. + assert num_scheduled_tokens == 1 + assert len(output.kv_connector_metadata.requests) == 0 + ecos = scheduler.update_from_output(output, model_runner_output) + all_done = True + for eco in ecos.outputs: + if eco.finish_reason is None: + all_done = False + all_finished = all_done + _ = scheduler.schedule() + + def test_kv_connector_basic(): """Test basic functionality of KVConnector.""" + # Setup Scheduler. 
scheduler = create_scheduler( enable_prefix_caching=True, use_kv_connector=True, ) - NUM_TOTAL_BLOCKS = ( scheduler.kv_cache_manager.block_pool.get_num_free_blocks()) BLOCK_SIZE = scheduler.cache_config.block_size - # Every request should match a single block. - NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE + # Mock External Cache Hit. + NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE * 2 scheduler.connector.get_num_new_matched_tokens = Mock(name="method") scheduler.connector.get_num_new_matched_tokens.return_value = ( NUM_MATCHED_NEW_TOKENS) + ###################################################### + # FIRST SET OF REQUESTS - External Hit Only NUM_REQUESTS = 2 - NUM_TOKENS = BLOCK_SIZE * 2 + NUM_TOKENS = NUM_MATCHED_NEW_TOKENS * 2 MAX_TOKENS = 3 requests = create_requests(num_requests=NUM_REQUESTS, num_tokens=NUM_TOKENS, @@ -811,43 +877,30 @@ def test_kv_connector_basic(): prompt_logprobs_dict={}, ) - # We should get an external prefix cache hit. + # Ensure ScheduleOutput is correct. output = scheduler.schedule() - assert len(output.kv_connector_metadata.requests) == NUM_REQUESTS - for _, num_scheduled_tokens in output.num_scheduled_tokens.items(): - # We should only schedule new tokens beyond this request. - assert num_scheduled_tokens == NUM_TOKENS - NUM_MATCHED_NEW_TOKENS - - # Make sure we actually touched all the blocks. - BLOCKS_PER_REQ = (NUM_TOKENS / BLOCK_SIZE + - scheduler.kv_cache_manager.num_preallocate_blocks) - assert (scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == - NUM_TOTAL_BLOCKS - NUM_REQUESTS * BLOCKS_PER_REQ) + _assert_right_scheduler_output( + output=output, + num_requests=NUM_REQUESTS, + # Just the incremental tokens should be scheduled. + expected_num_scheduled_tokens=NUM_TOKENS - NUM_MATCHED_NEW_TOKENS, + ) - # We should not have any connector metadata in running. - all_finished = False - _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT) - while not all_finished: - # Schedule + a few iterations until stopping. 
- output = scheduler.schedule() - for _, num_scheduled_tokens in output.num_scheduled_tokens.items(): - # We should be in the decode phase now. - assert num_scheduled_tokens == 1 - assert len(output.kv_connector_metadata.requests) == 0 - ecos = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT) - all_done = True - for eco in ecos.outputs: - if eco.finish_reason is None: - all_done = False - all_finished = all_done - output = scheduler.schedule() + # Ensure KVCacheManager is correct. + _assert_right_kv_cache_manager(scheduler, req_ids, NUM_TOKENS, BLOCK_SIZE, + NUM_REQUESTS, NUM_TOTAL_BLOCKS) - # Confirm we have no leaks (all blocks are freed). + # Continue Generation until done. + _step_until_done(scheduler, output, MODEL_RUNNER_OUTPUT) + # Confirm we clean up the memory properly. assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \ == NUM_TOTAL_BLOCKS - # Every request should match a single block. + ###################################################### + # SECOND SET OF REQUESTS - Local And External Hit NUM_TOKENS_PREFIX = NUM_TOKENS + # We will get a local prefix cache hit for the first + # NUM_TOKENS_PREFIX tokens since they are used above. NUM_TOKENS = NUM_TOKENS_PREFIX * 2 requests = create_requests(num_requests=NUM_REQUESTS, num_tokens=NUM_TOKENS, @@ -871,35 +924,19 @@ def test_kv_connector_basic(): # We should get a local cache hit of NUM_TOKENS_PREFIX and # a remote KV cache hit of NUM_MATCHED_NEW_TOKENS. output = scheduler.schedule() - assert len(output.kv_connector_metadata.requests) == NUM_REQUESTS - for _, num_scheduled_tokens in output.num_scheduled_tokens.items(): - # We should only schedule new tokens beyond this request. - assert num_scheduled_tokens == (NUM_TOKENS - NUM_TOKENS_PREFIX - - NUM_MATCHED_NEW_TOKENS) - - # Make sure we actually touched all the blocks. 
- BLOCKS_PER_REQ = (NUM_TOKENS / BLOCK_SIZE + - scheduler.kv_cache_manager.num_preallocate_blocks) - assert (scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == - NUM_TOTAL_BLOCKS - NUM_REQUESTS * BLOCKS_PER_REQ) - - all_finished = False - _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT) - while not all_finished: - # Schedule + a few iterations until stopping. - output = scheduler.schedule() - for _, num_scheduled_tokens in output.num_scheduled_tokens.items(): - # We should be in the decode phase now. - assert num_scheduled_tokens == 1 - assert len(output.kv_connector_metadata.requests) == 0 - ecos = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT) - all_done = True - for eco in ecos.outputs: - if eco.finish_reason is None: - all_done = False - all_finished = all_done - output = scheduler.schedule() - - # Confirm we have no leaks (all blocks are freed). + _assert_right_scheduler_output( + output=output, + num_requests=NUM_REQUESTS, + # Just the incremental tokens after local + remote cache hit. + expected_num_scheduled_tokens=(NUM_TOKENS - NUM_TOKENS_PREFIX - + NUM_MATCHED_NEW_TOKENS)) + + # Ensure KVCacheManager is correct. + _assert_right_kv_cache_manager(scheduler, req_ids, NUM_TOKENS, BLOCK_SIZE, + NUM_REQUESTS, NUM_TOTAL_BLOCKS) + + # Continue Generation until done. + _step_until_done(scheduler, output, MODEL_RUNNER_OUTPUT) + # Confirm we clean up the memory properly. assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \ == NUM_TOTAL_BLOCKS diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index b8687484776d..9e3759773bde 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -283,11 +283,12 @@ def allocate_slots( # for a running request. 
num_cached_blocks = self.num_cached_block.get(request.request_id, len(new_computed_blocks)) - # Speculated tokens might be rejected in the future, so we does + # Speculated tokens might be rejected in the future, so we do # not cache any speculated tokens. We only cache blocks with # generated (accepted) tokens. - num_full_blocks_after_append = (num_computed_tokens + num_tokens - len( - request.spec_token_ids)) // self.block_size + num_full_blocks_after_append = ( + num_computed_tokens + num_external_tokens + num_tokens - + len(request.spec_token_ids)) // self.block_size self.block_pool.cache_full_blocks( request=request, From 611b782473da0dd784751ab71dfc362321d2809e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 15 Apr 2025 03:42:14 +0000 Subject: [PATCH 095/116] added better testing Signed-off-by: rshaw@neuralmagic.com --- tests/v1/core/test_scheduler.py | 87 ++++++++++++++++++++++++++++++-- vllm/v1/core/kv_cache_manager.py | 1 - 2 files changed, 84 insertions(+), 4 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index f19495844555..9434922f5420 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -29,6 +29,7 @@ def create_scheduler( disable_chunked_mm_input: bool = False, use_kv_connector: bool = False, num_blocks: int = 10000, + block_size: int = 16, ) -> Scheduler: '''Create scheduler under test. 
@@ -64,7 +65,7 @@ def create_scheduler( 'enable_prefix_caching': enable_prefix_caching }) cache_config = CacheConfig( - block_size=16, + block_size=block_size, gpu_memory_utilization=0.9, swap_space=0, cache_dtype="auto", @@ -87,7 +88,8 @@ def create_scheduler( tensors={}, kv_cache_groups=[ KVCacheGroupSpec(['layer'], - FullAttentionSpec(16, 1, 1, torch.float32, False)) + FullAttentionSpec(block_size, 1, 1, torch.float32, + False)) ], ) cache_config.num_gpu_blocks = num_blocks @@ -822,6 +824,7 @@ def _step_until_done( while not all_finished: # Schedule + a few iterations until stopping. output = scheduler.schedule() + assert len(scheduler.running) for _, num_scheduled_tokens in output.num_scheduled_tokens.items(): # We should be in the decode phase now. assert num_scheduled_tokens == 1 @@ -832,7 +835,6 @@ def _step_until_done( if eco.finish_reason is None: all_done = False all_finished = all_done - _ = scheduler.schedule() def test_kv_connector_basic(): @@ -892,6 +894,7 @@ def test_kv_connector_basic(): # Continue Generation until done. _step_until_done(scheduler, output, MODEL_RUNNER_OUTPUT) + _ = scheduler.schedule() # Confirm we clean up the memory properly. assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \ == NUM_TOTAL_BLOCKS @@ -937,6 +940,84 @@ def test_kv_connector_basic(): # Continue Generation until done. _step_until_done(scheduler, output, MODEL_RUNNER_OUTPUT) + _ = scheduler.schedule() # Confirm we clean up the memory properly. assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \ == NUM_TOTAL_BLOCKS + + +def test_kv_connector_unable_to_allocate(): + """ + Test KVConnector is able to handle unable to allocate (run out of + blocks during + """ + + # Setup Scheduler With Mock External Cache Hit. 
+ BLOCK_SIZE = 4 + NUM_BLOCKS = 10 + scheduler = create_scheduler( + enable_prefix_caching=True, + use_kv_connector=True, + block_size=BLOCK_SIZE, + num_blocks=NUM_BLOCKS, + ) + NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE * 2 + scheduler.connector.get_num_new_matched_tokens = Mock(name="method") + scheduler.connector.get_num_new_matched_tokens.return_value = ( + NUM_MATCHED_NEW_TOKENS) + + # Create two requests. The second request will not be able to + # allocate slots because it will not have enough blocks. + NUM_REQUESTS = 2 + NUM_TOKENS = (NUM_BLOCKS // 2 + 1) * BLOCK_SIZE + MAX_TOKENS = 2 + requests = create_requests(num_requests=NUM_REQUESTS, + num_tokens=NUM_TOKENS, + max_tokens=MAX_TOKENS) + req_ids = [] + req_to_index = {} + for i, request in enumerate(requests): + scheduler.add_request(request) + req_ids.append(request.request_id) + req_to_index[request.request_id] = i + + MODEL_RUNNER_OUTPUT = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_to_index, + sampled_token_ids=[[1000]] * len(req_ids), + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + + # Just one request should be running. + output = scheduler.schedule() + _assert_right_scheduler_output(output, + num_requests=1, + expected_num_scheduled_tokens=NUM_TOKENS - + NUM_MATCHED_NEW_TOKENS) + assert len(scheduler.running) == 1 + assert len(scheduler.waiting) == 1 + + # All memory should be freed, with one request waiting. + _step_until_done(scheduler, output, MODEL_RUNNER_OUTPUT) + assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \ + == NUM_BLOCKS - 1 + assert len(scheduler.running) == 0 + assert len(scheduler.waiting) == 1 + + # Just one request should be running. 
+ output = scheduler.schedule() + _assert_right_scheduler_output(output, + num_requests=1, + expected_num_scheduled_tokens=NUM_TOKENS - + NUM_MATCHED_NEW_TOKENS) + assert len(scheduler.running) == 1 + assert len(scheduler.waiting) == 0 + + # All memory should be freed, with no requests waiting / running. + _step_until_done(scheduler, output, MODEL_RUNNER_OUTPUT) + assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \ + == NUM_BLOCKS - 1 + assert len(scheduler.running) == 0 + assert len(scheduler.waiting) == 0 diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 9e3759773bde..ecf4feeff50f 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -203,7 +203,6 @@ def allocate_slots( """ if num_tokens == 0: raise ValueError("num_tokens must be greater than 0") - new_computed_blocks = new_computed_blocks or [] req_blocks = self.req_to_blocks[request.request_id] From f6b8bff094d25578b4efae78d906e72a425620e7 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 15 Apr 2025 03:45:00 +0000 Subject: [PATCH 096/116] update comments Signed-off-by: rshaw@neuralmagic.com --- tests/v1/core/test_scheduler.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 9434922f5420..56b1d5a2ab14 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -778,6 +778,8 @@ def _assert_right_scheduler_output( num_requests: int, expected_num_scheduled_tokens: int, ): + """Check if SchedulerOutput is correct after remote KV cache hit.""" + # We should inject the kv_connector_metadata. 
assert len(output.kv_connector_metadata.requests) == num_requests @@ -794,8 +796,9 @@ def _assert_right_kv_cache_manager( num_requests: int, num_total_blocks: int, ): - """Assert KV Cache Manager Is Right After Remote Cache Hit.""" + """Check whether KVCacheManager is correct after allocate.""" + # Make sure the request stats are right. EXPECTED_ACTUAL_BLOCKS = num_tokens // block_size EXPECTED_TOTAL_BLOCKS = (EXPECTED_ACTUAL_BLOCKS + scheduler.kv_cache_manager.num_preallocate_blocks) @@ -819,6 +822,8 @@ def _step_until_done( output: SchedulerOutput, model_runner_output: ModelRunnerOutput, ): + """Loop over schedule(), update_from_output() until finished.""" + all_finished = False _ = scheduler.update_from_output(output, model_runner_output) while not all_finished: @@ -838,7 +843,10 @@ def _step_until_done( def test_kv_connector_basic(): - """Test basic functionality of KVConnector.""" + """ + Test whether Scheduler with KVConnector schedules tokens, allocates + memory, and cleans up requests as expected under normal operation. + """ # Setup Scheduler. scheduler = create_scheduler( @@ -948,8 +956,8 @@ def test_kv_connector_basic(): def test_kv_connector_unable_to_allocate(): """ - Test KVConnector is able to handle unable to allocate (run out of - blocks during + Test whether scheduler with KVConnector is able to handle + unable to allocate (run out of blocks in allocate_slots(). """ # Setup Scheduler With Mock External Cache Hit. 
From 96091157d3089e9557e6d1a45428c04d20780fe3 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 15 Apr 2025 04:46:49 +0000 Subject: [PATCH 097/116] updated Signed-off-by: rshaw@neuralmagic.com --- tests/v1/core/test_scheduler.py | 131 ++++++++++++++++++ .../v1/shared_storage_connector.py | 19 ++- 2 files changed, 146 insertions(+), 4 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 56b1d5a2ab14..2ed9a9c95bb9 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -1029,3 +1029,134 @@ def test_kv_connector_unable_to_allocate(): == NUM_BLOCKS - 1 assert len(scheduler.running) == 0 assert len(scheduler.waiting) == 0 + + +def test_kv_connector_handles_preemption(): + """ + Test whether scheduler with KVConnector is able to handle + unable to allocate (run out of blocks in allocate_slots(). + """ + + # Setup Scheduler With Mock External Cache Hit. + BLOCK_SIZE = 2 + # NOTE: there is 1 null block, so this is 6 blocks. + NUM_BLOCKS = 7 + scheduler = create_scheduler( + enable_prefix_caching=True, + use_kv_connector=True, + block_size=BLOCK_SIZE, + num_blocks=NUM_BLOCKS, + ) + scheduler.kv_cache_manager.num_preallocate_blocks = 0 + + NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE + scheduler.connector.get_num_new_matched_tokens = Mock(name="method") + scheduler.connector.get_num_new_matched_tokens.return_value = ( + NUM_MATCHED_NEW_TOKENS) + + # Create two requests. + # Both can be scheduled at first, but the second request + # will be preempted and re-scheduled. 
+ NUM_REQUESTS = 2 + NUM_TOKENS = BLOCK_SIZE * 2 + 1 + MAX_TOKENS = BLOCK_SIZE * 2 + requests = create_requests(num_requests=NUM_REQUESTS, + num_tokens=NUM_TOKENS, + max_tokens=MAX_TOKENS) + req_ids = [] + req_to_index = {} + for i, request in enumerate(requests): + scheduler.add_request(request) + req_ids.append(request.request_id) + req_to_index[request.request_id] = i + + MODEL_RUNNER_OUTPUT = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_to_index, + sampled_token_ids=[[1000]] * len(req_ids), + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + + # All can be scheduled - 1st token. + output = scheduler.schedule() + _assert_right_scheduler_output( + output, + # 2 remote kv cache hits. + num_requests=2, + expected_num_scheduled_tokens=NUM_TOKENS - NUM_MATCHED_NEW_TOKENS) + assert len(scheduler.running) == 2 + _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT) + + # All can be scheduled - 2nd token. + output = scheduler.schedule() + _assert_right_scheduler_output( + output, + # no connector_metadata + num_requests=0, + expected_num_scheduled_tokens=1) + assert len(scheduler.running) == 2 + _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT) + + # This will generate a new block and cause a preemption - 3rd token. + output = scheduler.schedule() + _assert_right_scheduler_output( + output, + # no connector_metadata + num_requests=0, + expected_num_scheduled_tokens=1) + assert len(scheduler.running) == 1 + assert len(scheduler.waiting) == 1 + _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT) + assert len(scheduler.running) == 1 + assert len(scheduler.waiting) == 1 + + # Only 1 can be scheduled - 4th (and last token). 
+ output = scheduler.schedule() + _assert_right_scheduler_output( + output, + # no connector_metadata + num_requests=0, + expected_num_scheduled_tokens=1) + assert len(scheduler.waiting) == 1 + assert len(scheduler.running) == 1 + _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT) + assert len(scheduler.running) == 0 + assert len(scheduler.waiting) == 1 + # All memory should be freed since nothing is running. + assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \ + == NUM_BLOCKS - 1 + + # Restarts the preempted request - generate 3rd token. + # This will have a local and remote cache hit. + breakpoint() + output = scheduler.schedule() + breakpoint() + _assert_right_scheduler_output( + output, + # 1 remote kv_cache hit! + num_requests=1, + # Only 1 block was preempted and there is a single + # remote hit. So only single new token is scheduled. + expected_num_scheduled_tokens=1, + ) + assert len(scheduler.running) == 1 + assert len(scheduler.waiting) == 0 + _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT) + assert len(scheduler.running) == 1 + assert len(scheduler.waiting) == 0 + + # Only 1 can be scheduled - 4th (and last token). + output = scheduler.schedule() + _assert_right_scheduler_output( + output, + # no connector_metadata + num_requests=0, + expected_num_scheduled_tokens=1) + assert len(scheduler.running) == 1 + _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT) + assert len(scheduler.running) == 0 + # All memory should be freed since nothing is running. 
+ assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \ + == NUM_BLOCKS - 1 diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index b84df0eaad2d..15c049767982 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -12,7 +12,7 @@ KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) from vllm.logger import init_logger from vllm.v1.attention.backends.mla.common import MLACommonMetadata -from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.core.sched.output import NewRequestData, SchedulerOutput if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata @@ -32,7 +32,7 @@ class ReqMeta: is_store: bool @staticmethod - def from_request(request: "Request", block_size: int, + def from_request(request: NewRequestData, block_size: int, is_store: bool) -> "ReqMeta": valid_num_tokens = align_to_block_size(len(request.prompt_token_ids), block_size) @@ -59,7 +59,7 @@ def __init__(self): def add_request( self, - request: "Request", + request: NewRequestData, block_size: int, is_store: bool, ) -> None: @@ -281,7 +281,18 @@ def build_connector_meta( scheduler_output (SchedulerOutput): the scheduler output object. """ meta = SharedStorageConnectorMetadata() - for request in scheduler_output.scheduled_new_reqs: + + # If we have a rescheduled preempted request with a + # remote KV cache hit, it will be in scheduled_cached_reqs. 
+ # NOTE(rob): this requires a pass over all + if scheduler_output.scheduled_cached_reqs: + scheduled_reqs = (scheduler_output.scheduled_new_reqs + + scheduler_output.scheduled_cached_reqs) + else: + scheduled_reqs = scheduler_output.scheduled_new_reqs + + # Check if the regu + for request in scheduled_reqs: if request.req_id in self._requests_need_load: meta.add_request(request, self._block_size, is_store=False) else: From 7ce3bd68385969bd59d587ab631614047fee8277 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 15 Apr 2025 04:57:08 +0000 Subject: [PATCH 098/116] updated Signed-off-by: rshaw@neuralmagic.com --- tests/v1/core/test_scheduler.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 2ed9a9c95bb9..053ef2ae978a 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -1130,6 +1130,10 @@ def test_kv_connector_handles_preemption(): # Restarts the preempted request - generate 3rd token. # This will have a local and remote cache hit. 
+ + # FIXME(rob): this is currently broken because: + # https://vllm-dev.slack.com/archives/C08MSU8THEC/p1744692863711629 + breakpoint() output = scheduler.schedule() breakpoint() From c3f38d7e92be524930319470d9dfd334a8f9febd Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 15 Apr 2025 05:00:53 +0000 Subject: [PATCH 099/116] cleanup Signed-off-by: rshaw@neuralmagic.com --- .../kv_connector/v1/shared_storage_connector.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index 15c049767982..8847dbefcdf6 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -281,18 +281,7 @@ def build_connector_meta( scheduler_output (SchedulerOutput): the scheduler output object. """ meta = SharedStorageConnectorMetadata() - - # If we have a rescheduled preempted request with a - # remote KV cache hit, it will be in scheduled_cached_reqs. 
- # NOTE(rob): this requires a pass over all - if scheduler_output.scheduled_cached_reqs: - scheduled_reqs = (scheduler_output.scheduled_new_reqs + - scheduler_output.scheduled_cached_reqs) - else: - scheduled_reqs = scheduler_output.scheduled_new_reqs - - # Check if the regu - for request in scheduled_reqs: + for request in scheduler_output.scheduled_new_reqs: if request.req_id in self._requests_need_load: meta.add_request(request, self._block_size, is_store=False) else: From 6dfda4456720a7826f832afe2870c3e6628b0b82 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 15 Apr 2025 05:03:13 +0000 Subject: [PATCH 100/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/sched/scheduler.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 4f12bc346b0e..afe251c353ff 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -546,8 +546,8 @@ def _try_schedule_encoder_inputs( An encoder input will be scheduled if: - Its output tokens overlap with the range of tokens being computed in this step, i.e., - [num_existing_tokens, num_existing_tokens + num_new_tokens). - where num_existing_tokens = num_computed_tokens + num_external_tokens + [num_cached_tokens, num_cached_tokens + num_new_tokens), + where num_cached_tokens = num_computed_tokens + num_external_tokens - It is not already computed and stored in the encoder cache. - There is sufficient encoder token budget to process it. - The encoder cache has space to store it. 
@@ -558,7 +558,7 @@ def _try_schedule_encoder_inputs( """ encoder_inputs_to_schedule: list[int] = [] mm_positions = request.mm_positions - num_existing_tokens = num_computed_tokens + num_external_tokens + num_cached_tokens = num_computed_tokens + num_external_tokens assert mm_positions is not None assert len(mm_positions) > 0 for i, pos_info in enumerate(mm_positions): @@ -566,12 +566,12 @@ def _try_schedule_encoder_inputs( num_encoder_tokens = pos_info.length # The encoder output is needed if the two ranges overlap: - # [num_existing_tokens, num_existing_tokens + num_new_tokens) and + # [num_cached_tokens, num_cached_tokens + num_new_tokens) and # [start_pos, start_pos + num_encoder_tokens) - if start_pos >= num_existing_tokens + num_new_tokens: + if start_pos >= num_cached_tokens + num_new_tokens: # The encoder input is not needed in this step. break - if start_pos + num_encoder_tokens <= num_existing_tokens: + if start_pos + num_encoder_tokens <= num_cached_tokens: # The encoder input is already computed and stored # in the decoder's KV cache. continue @@ -584,10 +584,10 @@ def _try_schedule_encoder_inputs( # partially schedule a multimodal item. If the scheduled range would # only cover part of the mm input, roll back to before the mm item. if (self.scheduler_config.disable_chunked_mm_input - and num_existing_tokens < start_pos - and (num_existing_tokens + num_new_tokens) + and num_cached_tokens < start_pos + and (num_cached_tokens + num_new_tokens) < (start_pos + num_encoder_tokens)): - num_new_tokens = start_pos - num_existing_tokens + num_new_tokens = start_pos - num_cached_tokens break if (not self.encoder_cache_manager.can_allocate(request, i) @@ -596,12 +596,12 @@ def _try_schedule_encoder_inputs( # NOTE(woosuk): We assume that the encoder input tokens should # be processed altogether, as the encoder usually uses # bidirectional attention. 
- if num_existing_tokens < start_pos: + if num_cached_tokens < start_pos: # We only schedule the decoder tokens just before the # encoder input. - num_new_tokens = start_pos - num_existing_tokens + num_new_tokens = start_pos - num_cached_tokens else: - # Because of prefix caching, num_existing_tokens is greater + # Because of prefix caching, num_cached_tokens is greater # than start_pos even though its encoder input is not # available. In this case, we can't schedule any token for # the request in this step. From 81d008ab960f57ba79d01f765086c7798af17c0d Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 15 Apr 2025 05:11:21 +0000 Subject: [PATCH 101/116] cosmetic Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/sched/scheduler.py | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index afe251c353ff..57d7df9dd7e3 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -190,9 +190,8 @@ def schedule(self) -> SchedulerOutput: (encoder_inputs_to_schedule, num_new_tokens, new_encoder_budget) = self._try_schedule_encoder_inputs( request=request, - num_computed_tokens=request.num_computed_tokens, + num_total_computed_tokens=request.num_computed_tokens, num_new_tokens=num_new_tokens, - num_external_tokens=0, encoder_budget=encoder_budget) if num_new_tokens == 0: # The request cannot be scheduled because the encoder budget @@ -344,9 +343,9 @@ def schedule(self) -> SchedulerOutput: (encoder_inputs_to_schedule, num_new_tokens, new_encoder_budget) = self._try_schedule_encoder_inputs( request=request, - num_computed_tokens=num_computed_tokens, + num_total_computed_tokens=num_computed_tokens + + num_external_tokens, num_new_tokens=num_new_tokens, - num_external_tokens=num_external_tokens, encoder_budget=encoder_budget) if num_new_tokens == 0: # The request cannot be scheduled. 
@@ -534,8 +533,7 @@ def _make_cached_request_data( def _try_schedule_encoder_inputs( self, request: Request, - num_computed_tokens: int, - num_external_tokens: int, + num_total_computed_tokens: int, num_new_tokens: int, encoder_budget: int, ) -> tuple[list[int], int, int]: @@ -546,8 +544,8 @@ def _try_schedule_encoder_inputs( An encoder input will be scheduled if: - Its output tokens overlap with the range of tokens being computed in this step, i.e., - [num_cached_tokens, num_cached_tokens + num_new_tokens), - where num_cached_tokens = num_computed_tokens + num_external_tokens + [num_total_computed_tokens, num_total_computed_tokens + num_new_tokens), + num_total_computed_tokens = num_computed_tokens + num_external_tokens - It is not already computed and stored in the encoder cache. - There is sufficient encoder token budget to process it. - The encoder cache has space to store it. @@ -558,7 +556,6 @@ def _try_schedule_encoder_inputs( """ encoder_inputs_to_schedule: list[int] = [] mm_positions = request.mm_positions - num_cached_tokens = num_computed_tokens + num_external_tokens assert mm_positions is not None assert len(mm_positions) > 0 for i, pos_info in enumerate(mm_positions): @@ -568,10 +565,10 @@ def _try_schedule_encoder_inputs( # The encoder output is needed if the two ranges overlap: # [num_cached_tokens, num_cached_tokens + num_new_tokens) and # [start_pos, start_pos + num_encoder_tokens) - if start_pos >= num_cached_tokens + num_new_tokens: + if start_pos >= num_total_computed_tokens + num_new_tokens: # The encoder input is not needed in this step. break - if start_pos + num_encoder_tokens <= num_cached_tokens: + if start_pos + num_encoder_tokens <= num_total_computed_tokens: # The encoder input is already computed and stored # in the decoder's KV cache. continue @@ -584,10 +581,10 @@ def _try_schedule_encoder_inputs( # partially schedule a multimodal item. If the scheduled range would # only cover part of the mm input, roll back to before the mm item. 
if (self.scheduler_config.disable_chunked_mm_input - and num_cached_tokens < start_pos - and (num_cached_tokens + num_new_tokens) + and num_total_computed_tokens < start_pos + and (num_total_computed_tokens + num_new_tokens) < (start_pos + num_encoder_tokens)): - num_new_tokens = start_pos - num_cached_tokens + num_new_tokens = start_pos - num_total_computed_tokens break if (not self.encoder_cache_manager.can_allocate(request, i) @@ -596,13 +593,13 @@ def _try_schedule_encoder_inputs( # NOTE(woosuk): We assume that the encoder input tokens should # be processed altogether, as the encoder usually uses # bidirectional attention. - if num_cached_tokens < start_pos: + if num_total_computed_tokens < start_pos: # We only schedule the decoder tokens just before the # encoder input. - num_new_tokens = start_pos - num_cached_tokens + num_new_tokens = start_pos - num_total_computed_tokens else: - # Because of prefix caching, num_cached_tokens is greater - # than start_pos even though its encoder input is not + # Because of prefix caching, num_total_computed_tokens is + # greater than start_pos even though encoder input is not # available. In this case, we can't schedule any token for # the request in this step. 
num_new_tokens = 0 From 6d3588405f5930661e67b3b8c1a3e014ebd1c273 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 15 Apr 2025 05:16:15 +0000 Subject: [PATCH 102/116] clean up Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/sched/scheduler.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 57d7df9dd7e3..0a14e0f63401 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -189,10 +189,8 @@ def schedule(self) -> SchedulerOutput: if request.has_encoder_inputs: (encoder_inputs_to_schedule, num_new_tokens, new_encoder_budget) = self._try_schedule_encoder_inputs( - request=request, - num_total_computed_tokens=request.num_computed_tokens, - num_new_tokens=num_new_tokens, - encoder_budget=encoder_budget) + request, request.num_computed_tokens, num_new_tokens, + encoder_budget) if num_new_tokens == 0: # The request cannot be scheduled because the encoder budget # or the encoder cache is exhausted. @@ -321,16 +319,19 @@ def schedule(self) -> SchedulerOutput: # Get externally-cached tokens if using a KVConnector. num_external_tokens = 0 if self.connector is not None: - num_external_tokens = ( + num_external_tokens += ( self.connector.get_num_new_matched_tokens( request, num_computed_tokens)) + # Total computed blocks (local + external). + num_total_computed_tokens = (num_computed_tokens + + num_external_tokens) + # Number of tokens to be scheduled. # We use `request.num_tokens` instead of # `request.num_prompt_tokens` to consider the resumed requests, # which have output tokens. 
- num_new_tokens = (request.num_tokens - num_computed_tokens - - num_external_tokens) + num_new_tokens = request.num_tokens - num_total_computed_tokens if (0 < self.scheduler_config.long_prefill_token_threshold < num_new_tokens): num_new_tokens = ( @@ -342,11 +343,8 @@ def schedule(self) -> SchedulerOutput: if request.has_encoder_inputs: (encoder_inputs_to_schedule, num_new_tokens, new_encoder_budget) = self._try_schedule_encoder_inputs( - request=request, - num_total_computed_tokens=num_computed_tokens + - num_external_tokens, - num_new_tokens=num_new_tokens, - encoder_budget=encoder_budget) + request, num_total_computed_tokens, num_new_tokens, + encoder_budget) if num_new_tokens == 0: # The request cannot be scheduled. break @@ -396,8 +394,7 @@ def schedule(self) -> SchedulerOutput: num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens request.status = RequestStatus.RUNNING - request.num_computed_tokens = (num_computed_tokens + - num_external_tokens) + request.num_computed_tokens = num_total_computed_tokens # Encoder-related. 
if encoder_inputs_to_schedule: From 79fe730ccb594cb6b048783c7528eb4134d8766b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 15 Apr 2025 05:23:29 +0000 Subject: [PATCH 103/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/kv_cache_manager.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index ecf4feeff50f..048664c660d8 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -203,6 +203,7 @@ def allocate_slots( """ if num_tokens == 0: raise ValueError("num_tokens must be greater than 0") + new_computed_blocks = new_computed_blocks or [] req_blocks = self.req_to_blocks[request.request_id] @@ -219,12 +220,16 @@ def allocate_slots( # The number of computed tokens is the number of computed tokens plus # the new prefix caching hits - num_computed_tokens = (request.num_computed_tokens + - len(new_computed_blocks) * self.block_size) + num_total_computed_tokens = ( + request.num_computed_tokens + + len(new_computed_blocks) * self.block_size) + if num_external_tokens > 0: + num_total_computed_tokens += num_external_tokens + assert num_total_computed_tokens % self.block_size == 0 num_required_blocks = cdiv( - num_computed_tokens + num_external_tokens + num_tokens + - num_lookahead_tokens, self.block_size) + num_total_computed_tokens + num_tokens + num_lookahead_tokens, + self.block_size) num_new_blocks = (num_required_blocks - len(req_blocks) - len(new_computed_blocks)) @@ -282,11 +287,11 @@ def allocate_slots( # for a running request. num_cached_blocks = self.num_cached_block.get(request.request_id, len(new_computed_blocks)) - # Speculated tokens might be rejected in the future, so we do + # Speculated tokens might be rejected in the future, so we does # not cache any speculated tokens. We only cache blocks with # generated (accepted) tokens. 
num_full_blocks_after_append = ( - num_computed_tokens + num_external_tokens + num_tokens - + num_total_computed_tokens + num_tokens - len(request.spec_token_ids)) // self.block_size self.block_pool.cache_full_blocks( From ad18a3b440a7403cfc7e465f0d4604b8111a0fc3 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 15 Apr 2025 05:25:38 +0000 Subject: [PATCH 104/116] update nits Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/kv_cache_manager.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 048664c660d8..22fa6b74a3ad 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -2,7 +2,7 @@ from collections import defaultdict from collections.abc import Iterable -from typing import TYPE_CHECKING, Optional +from typing import Optional from vllm.logger import init_logger from vllm.utils import cdiv, sha256 @@ -14,9 +14,6 @@ from vllm.v1.metrics.stats import PrefixCacheStats from vllm.v1.request import Request, RequestStatus -if TYPE_CHECKING: - pass - logger = init_logger(__name__) From c1a1169741a8637d139b2a8ef1eb471524c3747c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 15 Apr 2025 22:35:14 +0000 Subject: [PATCH 105/116] updated Signed-off-by: rshaw@neuralmagic.com --- .../decode_example.py | 36 ++++++++++++++++ .../prefill_example.py | 43 +++++++++++++++++++ .../disaggregated-prefill-v1/run.sh | 5 +++ 3 files changed, 84 insertions(+) create mode 100644 examples/offline_inference/disaggregated-prefill-v1/decode_example.py create mode 100644 examples/offline_inference/disaggregated-prefill-v1/prefill_example.py create mode 100644 examples/offline_inference/disaggregated-prefill-v1/run.sh diff --git a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py new file mode 100644 index 000000000000..66efbc0c9dee --- /dev/null 
+++ b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: Apache-2.0 + +from vllm import LLM, SamplingParams +from vllm.config import KVTransferConfig + +# Read prompts from output.txt +prompts = [] +try: + with open("output.txt") as f: + for line in f: + prompts.append(line.strip()) + print(f"Loaded {len(prompts)} prompts from output.txt") +except FileNotFoundError: + print("Error: output.txt file not found") + exit(-1) + +sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) + +llm = LLM( + model="meta-llama/Llama-3.2-1B-Instruct", + enforce_eager=True, + gpu_memory_utilization=0.8, + max_num_batched_tokens=64, + max_num_seqs=16, + kv_transfer_config=KVTransferConfig.from_cli( + '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both",' + '"kv_connector_extra_config": {"shared_storage_path": "local_storage"}}' + )) #, max_model_len=2048, max_num_batched_tokens=2048) + +# 1ST generation (prefill instance) +outputs = llm.generate(prompts, sampling_params) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py new file mode 100644 index 000000000000..f7cbf6557d54 --- /dev/null +++ b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: Apache-2.0 + +from vllm import LLM, SamplingParams +from vllm.config import KVTransferConfig + +context = "Hi " * 1000 +context2 = "Hey " * 500 +prompts = [ + context + "Hello, my name is", + context + "The capital of France is", + context2 + "Your name is", + context2 + "The capital of China is", +] + +sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) + +llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", 
+ enforce_eager=True, + gpu_memory_utilization=0.8, + kv_transfer_config=KVTransferConfig.from_cli( + '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both", ' + '"kv_connector_extra_config": ' + '{"shared_storage_path": "local_storage"}}') + ) #, max_model_len=2048, max_num_batched_tokens=2048) + +# 1ST generation (prefill instance) +outputs = llm.generate( + prompts, + sampling_params, +) + +new_prompts = [] +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + new_prompts.append(prompt + generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +# Write new_prompts to output.txt +with open("output.txt", "w") as f: + for prompt in new_prompts: + f.write(prompt + "\n") +print(f"Saved {len(new_prompts)} prompts to output.txt") diff --git a/examples/offline_inference/disaggregated-prefill-v1/run.sh b/examples/offline_inference/disaggregated-prefill-v1/run.sh new file mode 100644 index 000000000000..0ebf45a1586a --- /dev/null +++ b/examples/offline_inference/disaggregated-prefill-v1/run.sh @@ -0,0 +1,5 @@ +rm -rf local_storage/ +rm output.txt + +VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 prefill_example.py +VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 decode_example.py From 1b8ec0bd15a93730d8f82e5cd2861400b3ba49f4 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 16 Apr 2025 02:02:44 +0000 Subject: [PATCH 106/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/kv_cache_manager.py | 4 +--- vllm/v1/core/sched/scheduler.py | 24 +++++++++++++----------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 22fa6b74a3ad..fe96d70829cd 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -220,9 +220,7 @@ def allocate_slots( num_total_computed_tokens = ( request.num_computed_tokens + len(new_computed_blocks) * 
self.block_size) - if num_external_tokens > 0: - num_total_computed_tokens += num_external_tokens - assert num_total_computed_tokens % self.block_size == 0 + num_total_computed_tokens += num_external_tokens num_required_blocks = cdiv( num_total_computed_tokens + num_tokens + num_lookahead_tokens, diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 0a14e0f63401..f08d7c437817 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -530,7 +530,7 @@ def _make_cached_request_data( def _try_schedule_encoder_inputs( self, request: Request, - num_total_computed_tokens: int, + num_computed_tokens: int, num_new_tokens: int, encoder_budget: int, ) -> tuple[list[int], int, int]: @@ -541,8 +541,7 @@ def _try_schedule_encoder_inputs( An encoder input will be scheduled if: - Its output tokens overlap with the range of tokens being computed in this step, i.e., - [num_total_computed_tokens, num_total_computed_tokens + num_new_tokens), - num_total_computed_tokens = num_computed_tokens + num_external_tokens + [num_computed_tokens, num_computed_tokens + num_new_tokens), - It is not already computed and stored in the encoder cache. - There is sufficient encoder token budget to process it. - The encoder cache has space to store it. @@ -550,6 +549,9 @@ def _try_schedule_encoder_inputs( If an encoder input cannot be scheduled due to cache or budget limitations, the method adjusts `num_new_tokens` to schedule only the decoder tokens up to just before the unschedulable encoder input. + + Note that num_computed_tokens includes both locally cached + blocks and externally cached blocks (via KVConnector). 
""" encoder_inputs_to_schedule: list[int] = [] mm_positions = request.mm_positions @@ -562,10 +564,10 @@ def _try_schedule_encoder_inputs( # The encoder output is needed if the two ranges overlap: # [num_cached_tokens, num_cached_tokens + num_new_tokens) and # [start_pos, start_pos + num_encoder_tokens) - if start_pos >= num_total_computed_tokens + num_new_tokens: + if start_pos >= num_computed_tokens + num_new_tokens: # The encoder input is not needed in this step. break - if start_pos + num_encoder_tokens <= num_total_computed_tokens: + if start_pos + num_encoder_tokens <= num_computed_tokens: # The encoder input is already computed and stored # in the decoder's KV cache. continue @@ -578,10 +580,10 @@ def _try_schedule_encoder_inputs( # partially schedule a multimodal item. If the scheduled range would # only cover part of the mm input, roll back to before the mm item. if (self.scheduler_config.disable_chunked_mm_input - and num_total_computed_tokens < start_pos - and (num_total_computed_tokens + num_new_tokens) + and num_computed_tokens < start_pos + and (num_computed_tokens + num_new_tokens) < (start_pos + num_encoder_tokens)): - num_new_tokens = start_pos - num_total_computed_tokens + num_new_tokens = start_pos - num_computed_tokens break if (not self.encoder_cache_manager.can_allocate(request, i) @@ -590,12 +592,12 @@ def _try_schedule_encoder_inputs( # NOTE(woosuk): We assume that the encoder input tokens should # be processed altogether, as the encoder usually uses # bidirectional attention. - if num_total_computed_tokens < start_pos: + if num_computed_tokens < start_pos: # We only schedule the decoder tokens just before the # encoder input. - num_new_tokens = start_pos - num_total_computed_tokens + num_new_tokens = start_pos - num_computed_tokens else: - # Because of prefix caching, num_total_computed_tokens is + # Because of prefix caching, num_computed_tokens is # greater than start_pos even though encoder input is not # available. 
In this case, we can't schedule any token for # the request in this step. From ff4b98f391381e4ad8f314f7eb8278ce6bfcb9f2 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 16 Apr 2025 02:10:31 +0000 Subject: [PATCH 107/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/kv_cache_manager.py | 24 +++++++++--------------- vllm/v1/core/sched/scheduler.py | 5 ++--- 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index fe96d70829cd..1f04582b8d12 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -166,22 +166,19 @@ def allocate_slots( num_tokens: int, new_computed_blocks: Optional[list[KVCacheBlock]] = None, num_lookahead_tokens: int = 0, - num_external_tokens: int = 0, ) -> Optional[list[KVCacheBlock]]: """Add slots for a request with new tokens to append. Args: request: The request to allocate slots. - num_tokens: The number of tokens to allocate. Note that this does - not include the tokens that have already been computed. + num_tokens: The number of tokens to allocate, including external + tokens. Note that this does not include tokens that have + already been computed locally (i.e. new_computed_blocks). new_computed_blocks: A list of new computed blocks just hitting the prefix caching. num_lookahead_tokens: The number of speculative tokens to allocate. This is used by spec decode proposers with kv-cache such - as eagle. - num_external_tokens: The number of external tokens to allocate. - This is used by KVConnector for remote KV cache. KVConnector - injects external tokens into the blocks during execute_model. + as eagle. 
Blocks layout: ----------------------------------------------------------------------- @@ -217,13 +214,11 @@ def allocate_slots( # The number of computed tokens is the number of computed tokens plus # the new prefix caching hits - num_total_computed_tokens = ( - request.num_computed_tokens + - len(new_computed_blocks) * self.block_size) - num_total_computed_tokens += num_external_tokens + num_computed_tokens = (request.num_computed_tokens + + len(new_computed_blocks) * self.block_size) num_required_blocks = cdiv( - num_total_computed_tokens + num_tokens + num_lookahead_tokens, + num_computed_tokens + num_tokens + num_lookahead_tokens, self.block_size) num_new_blocks = (num_required_blocks - len(req_blocks) - len(new_computed_blocks)) @@ -285,9 +280,8 @@ def allocate_slots( # Speculated tokens might be rejected in the future, so we does # not cache any speculated tokens. We only cache blocks with # generated (accepted) tokens. - num_full_blocks_after_append = ( - num_total_computed_tokens + num_tokens - - len(request.spec_token_ids)) // self.block_size + num_full_blocks_after_append = (num_computed_tokens + num_tokens - len( + request.spec_token_ids)) // self.block_size self.block_pool.cache_full_blocks( request=request, diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index f08d7c437817..838752169efc 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -354,9 +354,8 @@ def schedule(self) -> SchedulerOutput: new_blocks = self.kv_cache_manager.allocate_slots( request=request, - num_tokens=num_new_tokens, - new_computed_blocks=computed_blocks, - num_external_tokens=num_external_tokens) + num_tokens=num_new_tokens + num_external_tokens, + new_computed_blocks=computed_blocks) if new_blocks is None: # The request cannot be scheduled. 
break From 17b61fbc261f452ebe214b1fd6cfff402bf7151b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 16 Apr 2025 02:15:28 +0000 Subject: [PATCH 108/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/distributed/kv_transfer/kv_connector/v1/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 373bd2a47b13..95967d2ca919 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -138,7 +138,7 @@ def wait_for_layer_load(self, layer_name: str) -> None: def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, attn_metadata: "AttentionMetadata", **kwargs) -> None: """ - Start saving the a layer of KV cache from vLLM's paged buffer + Start saving a layer of KV cache from vLLM's paged buffer to the connector. This is called from within attention layer to enable async copying during execution. From ac0660de80b440bcb5e3a925b5c891cc5a2fd307 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 16 Apr 2025 02:17:39 +0000 Subject: [PATCH 109/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/kv_cache_manager.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 1f04582b8d12..6e5f969d72f1 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -178,7 +178,7 @@ def allocate_slots( prefix caching. num_lookahead_tokens: The number of speculative tokens to allocate. This is used by spec decode proposers with kv-cache such - as eagle. + as eagle. 
Blocks layout: ----------------------------------------------------------------------- @@ -216,7 +216,6 @@ def allocate_slots( # the new prefix caching hits num_computed_tokens = (request.num_computed_tokens + len(new_computed_blocks) * self.block_size) - num_required_blocks = cdiv( num_computed_tokens + num_tokens + num_lookahead_tokens, self.block_size) @@ -264,7 +263,7 @@ def allocate_slots( # [..., max_num_blocks_per_req]. self.max_num_blocks_per_req - len(req_blocks), ) - assert num_new_blocks is not None and num_new_blocks > 0 + assert num_new_blocks > 0 # Concatenate the computed block IDs and the new block IDs. new_blocks = self.block_pool.get_new_blocks(num_new_blocks) From ecfb4ea9e7089d6ecd99287142c79a03530731e3 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 16 Apr 2025 02:24:57 +0000 Subject: [PATCH 110/116] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/sched/scheduler.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 838752169efc..2f674eede31c 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -312,7 +312,7 @@ def schedule(self) -> SchedulerOutput: skipped_waiting_requests.appendleft(request) continue - # Get locally-cached tokens. + # Get already-cached tokens. computed_blocks, num_computed_tokens = \ self.kv_cache_manager.get_computed_blocks(request) @@ -323,15 +323,14 @@ def schedule(self) -> SchedulerOutput: self.connector.get_num_new_matched_tokens( request, num_computed_tokens)) - # Total computed blocks (local + external). - num_total_computed_tokens = (num_computed_tokens + - num_external_tokens) + # Total computed tokens (local + external). + num_computed_tokens += num_external_tokens # Number of tokens to be scheduled. # We use `request.num_tokens` instead of # `request.num_prompt_tokens` to consider the resumed requests, # which have output tokens. 
- num_new_tokens = request.num_tokens - num_total_computed_tokens + num_new_tokens = request.num_tokens - num_computed_tokens if (0 < self.scheduler_config.long_prefill_token_threshold < num_new_tokens): num_new_tokens = ( @@ -343,7 +342,7 @@ def schedule(self) -> SchedulerOutput: if request.has_encoder_inputs: (encoder_inputs_to_schedule, num_new_tokens, new_encoder_budget) = self._try_schedule_encoder_inputs( - request, num_total_computed_tokens, num_new_tokens, + request, num_computed_tokens, num_new_tokens, encoder_budget) if num_new_tokens == 0: # The request cannot be scheduled. @@ -393,7 +392,7 @@ def schedule(self) -> SchedulerOutput: num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens request.status = RequestStatus.RUNNING - request.num_computed_tokens = num_total_computed_tokens + request.num_computed_tokens = num_computed_tokens # Encoder-related. if encoder_inputs_to_schedule: From abdddf08404625b550f15b042e437357ebd731c7 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 16 Apr 2025 02:27:18 +0000 Subject: [PATCH 111/116] cleanup Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/sched/scheduler.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 2f674eede31c..5b477eb1a375 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -539,7 +539,7 @@ def _try_schedule_encoder_inputs( An encoder input will be scheduled if: - Its output tokens overlap with the range of tokens being computed in this step, i.e., - [num_computed_tokens, num_computed_tokens + num_new_tokens), + [num_computed_tokens, num_computed_tokens + num_new_tokens). - It is not already computed and stored in the encoder cache. - There is sufficient encoder token budget to process it. - The encoder cache has space to store it. 
@@ -560,7 +560,7 @@ def _try_schedule_encoder_inputs( num_encoder_tokens = pos_info.length # The encoder output is needed if the two ranges overlap: - # [num_cached_tokens, num_cached_tokens + num_new_tokens) and + # [num_computed_tokens, num_computed_tokens + num_new_tokens) and # [start_pos, start_pos + num_encoder_tokens) if start_pos >= num_computed_tokens + num_new_tokens: # The encoder input is not needed in this step. @@ -595,8 +595,8 @@ def _try_schedule_encoder_inputs( # encoder input. num_new_tokens = start_pos - num_computed_tokens else: - # Because of prefix caching, num_computed_tokens is - # greater than start_pos even though encoder input is not + # Because of prefix caching, num_computed_tokens is greater + # than start_pos even though its encoder input is not # available. In this case, we can't schedule any token for # the request in this step. num_new_tokens = 0 From 8695d963e86b638ed0cb1bb330618f8979bf3a51 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 16 Apr 2025 02:29:24 +0000 Subject: [PATCH 112/116] cleanup Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/core/sched/scheduler.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 5b477eb1a375..cda2951b2256 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -317,11 +317,10 @@ def schedule(self) -> SchedulerOutput: self.kv_cache_manager.get_computed_blocks(request) # Get externally-cached tokens if using a KVConnector. - num_external_tokens = 0 - if self.connector is not None: - num_external_tokens += ( - self.connector.get_num_new_matched_tokens( - request, num_computed_tokens)) + num_external_tokens = ( + 0 if self.connector is None else + self.connector.get_num_new_matched_tokens( + request, num_computed_tokens)) # Total computed tokens (local + external). 
num_computed_tokens += num_external_tokens @@ -352,9 +351,8 @@ def schedule(self) -> SchedulerOutput: new_encoder_budget = encoder_budget new_blocks = self.kv_cache_manager.allocate_slots( - request=request, - num_tokens=num_new_tokens + num_external_tokens, - new_computed_blocks=computed_blocks) + request, num_new_tokens + num_external_tokens, + computed_blocks) if new_blocks is None: # The request cannot be scheduled. break From 7b5ba2c353d9e5b99d11957b1e1a2f0117a255ea Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 16 Apr 2025 03:09:51 +0000 Subject: [PATCH 113/116] updated Signed-off-by: rshaw@neuralmagic.com --- .../kv_transfer/kv_connector/v1/base.py | 7 ++- .../v1/shared_storage_connector.py | 53 ++++++++++++++----- vllm/v1/core/sched/scheduler.py | 4 +- 3 files changed, 49 insertions(+), 15 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 95967d2ca919..d06ff8301df9 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -28,7 +28,7 @@ import torch from vllm.logger import init_logger -from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.core.sched.output import CachedRequestData, NewRequestData if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata @@ -196,7 +196,10 @@ def update_state_after_alloc(self, request: "Request", @abstractmethod def build_connector_meta( - self, scheduler_output: SchedulerOutput) -> KVConnectorMetadata: + self, + new_reqs_data: NewRequestData, + resumed_reqs_data: CachedRequestData, + ) -> KVConnectorMetadata: """ Build the connector metadata for this step. 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index 8847dbefcdf6..71b530895ae4 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -12,7 +12,7 @@ KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) from vllm.logger import init_logger from vllm.v1.attention.backends.mla.common import MLACommonMetadata -from vllm.v1.core.sched.output import NewRequestData, SchedulerOutput +from vllm.v1.core.sched.output import CachedRequestData, NewRequestData if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata @@ -32,8 +32,8 @@ class ReqMeta: is_store: bool @staticmethod - def from_request(request: NewRequestData, block_size: int, - is_store: bool) -> "ReqMeta": + def from_new_request(request: NewRequestData, block_size: int, + is_store: bool) -> "ReqMeta": valid_num_tokens = align_to_block_size(len(request.prompt_token_ids), block_size) token_ids = torch.tensor(request.prompt_token_ids)[:valid_num_tokens] @@ -57,14 +57,23 @@ class SharedStorageConnectorMetadata(KVConnectorMetadata): def __init__(self): self.requests = [] - def add_request( + def add_new_request( self, request: NewRequestData, block_size: int, is_store: bool, ) -> None: self.requests.append( - ReqMeta.from_request(request, block_size, is_store)) + ReqMeta.from_new_request(request, block_size, is_store)) + + def add_cached_request( + self, + request: CachedRequestData, + block_size: int, + is_store: bool, + ) -> None: + self.requests.append( + ReqMeta.from_cached_request(request, block_size, is_store)) class SharedStorageConnector(KVConnectorBase_V1): @@ -271,7 +280,10 @@ def update_state_after_alloc(self, request: "Request", self._requests_need_load.append(request.request_id) def build_connector_meta( - self, scheduler_output: SchedulerOutput) -> 
KVConnectorMetadata: + self, + new_reqs_data: NewRequestData, + resumed_reqs_data: CachedRequestData, + ) -> KVConnectorMetadata: """Build the connector metadata for this step. This function should NOT modify any fields in the scheduler_output. @@ -281,16 +293,33 @@ def build_connector_meta( scheduler_output (SchedulerOutput): the scheduler output object. """ meta = SharedStorageConnectorMetadata() - for request in scheduler_output.scheduled_new_reqs: - if request.req_id in self._requests_need_load: - meta.add_request(request, self._block_size, is_store=False) + + total_need_load = 0 + for new_req in new_reqs_data: + if new_req.req_id in self._requests_need_load: + meta.add_new_request(new_req, self._block_size, is_store=False) + total_need_load += 1 else: # NOTE: here, we set the store and load being exclusive, # but in LMCache use case, a single request can have both # store and load status - if not self._found_match_for_request(request): - meta.add_request(request, self._block_size, is_store=True) - + # NOTE(rob): for this debug implementation, we only cache + # the original prompt tokens. + if not self._found_match_for_request(new_req): + meta.add_new_request(new_req, + self._block_size, + is_store=True) + + # NOTE(rob): here we rely on the resumed requests being + # the first N requests in the list scheduled_cache_reqs. + for resumed_req in resumed_reqs_data: + if resumed_req.req_id in self._requests_need_load: + meta.add_cached_request(resumed_req, + self._block_size, + is_store=False) + total_need_load += 1 + + assert total_need_load == len(self._requests_need_load) self._requests_need_load.clear() return meta diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index cda2951b2256..8d7e1cb42d6a 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -477,7 +477,9 @@ def schedule(self) -> SchedulerOutput: # 2. Wrap up all the KV cache load / save ops into an opaque object # 3. 
Clear the internal states of the connector if self.connector is not None: - meta = self.connector.build_connector_meta(scheduler_output) + meta = self.connector.build_connector_meta(new_reqs_data, + resumed_reqs_data, + running_reqs_data) scheduler_output.kv_connector_metadata = meta # Advance the number of computed tokens for the request AFTER From 6be9cf90a1238394685c3412539acc5e9a77a626 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 17 Apr 2025 13:12:12 +0000 Subject: [PATCH 114/116] fixed preemption Signed-off-by: rshaw@neuralmagic.com --- tests/v1/core/test_scheduler.py | 6 -- .../kv_transfer/kv_connector/v1/base.py | 7 +- .../v1/shared_storage_connector.py | 84 ++++++++++--------- vllm/v1/core/sched/scheduler.py | 8 +- 4 files changed, 52 insertions(+), 53 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 053ef2ae978a..691ca59b062c 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -1130,13 +1130,7 @@ def test_kv_connector_handles_preemption(): # Restarts the preempted request - generate 3rd token. # This will have a local and remote cache hit. - - # FIXME(rob): this is currently broken because: - # https://vllm-dev.slack.com/archives/C08MSU8THEC/p1744692863711629 - - breakpoint() output = scheduler.schedule() - breakpoint() _assert_right_scheduler_output( output, # 1 remote kv_cache hit! 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index d06ff8301df9..95967d2ca919 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -28,7 +28,7 @@ import torch from vllm.logger import init_logger -from vllm.v1.core.sched.output import CachedRequestData, NewRequestData +from vllm.v1.core.sched.output import SchedulerOutput if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata @@ -196,10 +196,7 @@ def update_state_after_alloc(self, request: "Request", @abstractmethod def build_connector_meta( - self, - new_reqs_data: NewRequestData, - resumed_reqs_data: CachedRequestData, - ) -> KVConnectorMetadata: + self, scheduler_output: SchedulerOutput) -> KVConnectorMetadata: """ Build the connector metadata for this step. diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index 71b530895ae4..7cc12090ed35 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -12,7 +12,7 @@ KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) from vllm.logger import init_logger from vllm.v1.attention.backends.mla.common import MLACommonMetadata -from vllm.v1.core.sched.output import CachedRequestData, NewRequestData +from vllm.v1.core.sched.output import SchedulerOutput if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata @@ -32,12 +32,11 @@ class ReqMeta: is_store: bool @staticmethod - def from_new_request(request: NewRequestData, block_size: int, - is_store: bool) -> "ReqMeta": - valid_num_tokens = align_to_block_size(len(request.prompt_token_ids), - block_size) - token_ids = torch.tensor(request.prompt_token_ids)[:valid_num_tokens] - block_ids = 
torch.tensor(request.block_ids) + def make_meta(token_ids: list[int], block_ids: list[int], block_size: int, + is_store: bool) -> "ReqMeta": + valid_num_tokens = align_to_block_size(len(token_ids), block_size) + token_ids = torch.tensor(token_ids)[:valid_num_tokens] + block_ids = torch.tensor(block_ids) num_blocks = block_ids.shape[0] block_offsets = torch.arange(0, block_size) slot_mapping = block_offsets.reshape((1, block_size)) + \ @@ -57,23 +56,15 @@ class SharedStorageConnectorMetadata(KVConnectorMetadata): def __init__(self): self.requests = [] - def add_new_request( + def add_request( self, - request: NewRequestData, + token_ids: list[int], + block_ids: list[int], block_size: int, is_store: bool, ) -> None: self.requests.append( - ReqMeta.from_new_request(request, block_size, is_store)) - - def add_cached_request( - self, - request: CachedRequestData, - block_size: int, - is_store: bool, - ) -> None: - self.requests.append( - ReqMeta.from_cached_request(request, block_size, is_store)) + ReqMeta.make_meta(token_ids, block_ids, block_size, is_store)) class SharedStorageConnector(KVConnectorBase_V1): @@ -85,7 +76,7 @@ class SharedStorageConnector(KVConnectorBase_V1): def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): super().__init__(vllm_config=vllm_config, role=role) self._block_size = vllm_config.cache_config.block_size - self._requests_need_load: list[str] = [] + self._requests_need_load: dict[str, Request] = {} transfer_config = vllm_config.kv_transfer_config self._storage_path = transfer_config.get_from_extra_config( "shared_storage_path", "/tmp") @@ -277,12 +268,11 @@ def update_state_after_alloc(self, request: "Request", such that we load the KVs in the next forward pass. 
""" if num_external_tokens > 0: - self._requests_need_load.append(request.request_id) + self._requests_need_load[request.request_id] = request def build_connector_meta( self, - new_reqs_data: NewRequestData, - resumed_reqs_data: CachedRequestData, + scheduler_output: SchedulerOutput, ) -> KVConnectorMetadata: """Build the connector metadata for this step. @@ -295,28 +285,46 @@ def build_connector_meta( meta = SharedStorageConnectorMetadata() total_need_load = 0 - for new_req in new_reqs_data: + for new_req in scheduler_output.scheduled_new_reqs: if new_req.req_id in self._requests_need_load: - meta.add_new_request(new_req, self._block_size, is_store=False) + meta.add_request(token_ids=new_req.prompt_token_ids, + block_ids=new_req.block_ids, + block_size=self._block_size, + is_store=False) total_need_load += 1 else: # NOTE: here, we set the store and load being exclusive, - # but in LMCache use case, a single request can have both - # store and load status + # but a single request can have both store and load. # NOTE(rob): for this debug implementation, we only cache # the original prompt tokens. if not self._found_match_for_request(new_req): - meta.add_new_request(new_req, - self._block_size, - is_store=True) - - # NOTE(rob): here we rely on the resumed requests being - # the first N requests in the list scheduled_cache_reqs. - for resumed_req in resumed_reqs_data: - if resumed_req.req_id in self._requests_need_load: - meta.add_cached_request(resumed_req, - self._block_size, - is_store=False) + meta.add_request(token_ids=new_req.prompt_token_ids, + block_ids=new_req.block_ids, + block_size=self._block_size, + is_store=True) + + for cached_req in scheduler_output.scheduled_cached_reqs: + # NOTE(rob): here we rely on the resumed requests being + # the first N requests in the list scheduled_cache_reqs. 
+ if not cached_req.resumed_from_preemption: + break + if cached_req.req_id in self._requests_need_load: + # NOTE(rob): cached_req_data does not have the full + # list of token ids (only new tokens). So we look it + # up in the actual request object. + request = self._requests_need_load[cached_req.req_id] + total_tokens = (len(cached_req.new_token_ids) + + cached_req.num_computed_tokens) + token_ids = request.all_token_ids[:total_tokens] + + # NOTE(rob): For resumed req, new_block_ids is all + # of the block_ids for the request. + block_ids = cached_req.new_block_ids + + meta.add_request(token_ids=token_ids, + block_ids=block_ids, + block_size=self._block_size, + is_store=False) total_need_load += 1 assert total_need_load == len(self._requests_need_load) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 8d7e1cb42d6a..7e658d134cf7 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -362,7 +362,9 @@ def schedule(self) -> SchedulerOutput: # needed for this request. if self.connector is not None: self.connector.update_state_after_alloc( - request, num_external_tokens) + request, + num_external_tokens, + ) self.waiting.popleft() if request.use_structured_output: @@ -477,9 +479,7 @@ def schedule(self) -> SchedulerOutput: # 2. Wrap up all the KV cache load / save ops into an opaque object # 3. 
Clear the internal states of the connector if self.connector is not None: - meta = self.connector.build_connector_meta(new_reqs_data, - resumed_reqs_data, - running_reqs_data) + meta = self.connector.build_connector_meta(scheduler_output) scheduler_output.kv_connector_metadata = meta # Advance the number of computed tokens for the request AFTER From 5363ed0cd15cf0579ce069e9feb3e032a661272e Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Thu, 17 Apr 2025 06:16:20 -0700 Subject: [PATCH 115/116] Update vllm/distributed/kv_transfer/kv_connector/factory.py Co-authored-by: Tyler Michael Smith --- vllm/distributed/kv_transfer/kv_connector/factory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index 6739e616a1e9..665ea2f5ba01 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -64,7 +64,7 @@ def create_connector_v1( logger.info("Creating v1 connector with name: %s", connector_name) # NOTE(Kuntai): v1 connector is explicitly separated into two roles. 
# Scheduler connector: - # - Co-colate with scheduler process + # - Co-locate with scheduler process # - Should only be used inside the Scheduler class # Worker connector: # - Co-locate with worker process From 247195d4ac91334adf87ca6b9a940eec68f6f6cb Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 17 Apr 2025 14:26:19 +0000 Subject: [PATCH 116/116] fix pre-commit Signed-off-by: rshaw@neuralmagic.com --- .../kv_connector/v1/shared_storage_connector.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index 7cc12090ed35..1d2040784e6c 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -35,15 +35,15 @@ class ReqMeta: def make_meta(token_ids: list[int], block_ids: list[int], block_size: int, is_store: bool) -> "ReqMeta": valid_num_tokens = align_to_block_size(len(token_ids), block_size) - token_ids = torch.tensor(token_ids)[:valid_num_tokens] - block_ids = torch.tensor(block_ids) - num_blocks = block_ids.shape[0] + token_ids_tensor = torch.tensor(token_ids)[:valid_num_tokens] + block_ids_tensor = torch.tensor(block_ids) + num_blocks = block_ids_tensor.shape[0] block_offsets = torch.arange(0, block_size) slot_mapping = block_offsets.reshape((1, block_size)) + \ - block_ids.reshape((num_blocks, 1)) * block_size + block_ids_tensor.reshape((num_blocks, 1)) * block_size slot_mapping = slot_mapping.flatten()[:valid_num_tokens] return ReqMeta( - token_ids=token_ids, + token_ids=token_ids_tensor, slot_mapping=slot_mapping, is_store=is_store, )