GPU Model Runner V2 #25266
Changes from 90 commits
@@ -0,0 +1,48 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch

from vllm.v1.outputs import (AsyncModelRunnerOutput, LogprobsTensors,
                             ModelRunnerOutput, SamplerOutput)


class AsyncOutput(AsyncModelRunnerOutput):

    def __init__(
        self,
        model_runner_output: ModelRunnerOutput,
        sampler_output: SamplerOutput,
        copy_stream: torch.cuda.Stream,
    ):
        self.model_runner_output = model_runner_output
        self.sampler_output = sampler_output
        self.copy_stream = copy_stream
        self.copy_event = torch.cuda.Event()

        default_stream = torch.cuda.current_stream()
        with torch.cuda.stream(self.copy_stream):
            self.copy_stream.wait_stream(default_stream)

            self.sampled_token_ids = sampler_output.sampled_token_ids.to(
                "cpu", non_blocking=True)
            x = sampler_output.logprobs_tensors
            if x is not None:
                self.logprobs_tensors = LogprobsTensors(
                    logprob_token_ids=x.logprob_token_ids.to(
                        "cpu", non_blocking=True),
                    logprobs=x.logprobs.to("cpu", non_blocking=True),
                    selected_token_ranks=x.selected_token_ranks.to(
                        "cpu", non_blocking=True),
                )
            else:
                self.logprobs_tensors = None
            self.copy_event.record()

    def get_output(self) -> ModelRunnerOutput:
        self.copy_event.synchronize()
        self.model_runner_output.sampled_token_ids = (
            self.sampled_token_ids.numpy())
        if self.logprobs_tensors is not None:
            self.model_runner_output.logprobs = (
                self.logprobs_tensors.tolists())
        return self.model_runner_output
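The key mechanism here is overlapping the device-to-host copy of sampled tokens and logprobs with other work, and paying the synchronization cost only when `get_output()` is called. A minimal standalone sketch of that pattern (plain PyTorch, my illustration rather than code from this PR; `async_d2h_copy` is a hypothetical helper name):

```python
import torch

def async_d2h_copy(gpu_tensor: torch.Tensor,
                   copy_stream: torch.cuda.Stream):
    # Capture the stream that produced gpu_tensor *before* switching streams,
    # so the copy stream can wait on it (as AsyncOutput.__init__ does).
    producer_stream = torch.cuda.current_stream()
    copy_event = torch.cuda.Event()
    with torch.cuda.stream(copy_stream):
        copy_stream.wait_stream(producer_stream)
        # Non-blocking D2H copy; it only truly overlaps if the destination is
        # pinned host memory, otherwise it degrades to a synchronous copy.
        cpu_tensor = gpu_tensor.to("cpu", non_blocking=True)
        copy_event.record()
    return cpu_tensor, copy_event

# Usage: kick off the copy, keep doing CPU-side work, and block only when the
# host tensor is actually read (this mirrors AsyncOutput.get_output()).
if torch.cuda.is_available():
    stream = torch.cuda.Stream()
    ids = torch.randint(0, 32000, (8, 1), device="cuda")
    cpu_ids, event = async_d2h_copy(ids, stream)
    event.synchronize()   # cpu_ids is safe to read after this point
    print(cpu_ids.numpy())
```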
@@ -0,0 +1,139 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any

import torch

from vllm.attention.backends.abstract import AttentionBackend, AttentionType
from vllm.attention.layer import Attention
from vllm.config import VllmConfig, get_layers_from_vllm_config
from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                        KVCacheSpec, SlidingWindowSpec)
from vllm.v1.worker.utils import bind_kv_cache


def get_kv_cache_spec(
    vllm_config: VllmConfig,
    kv_cache_dtype: torch.dtype,
) -> dict[str, KVCacheSpec]:
    block_size = vllm_config.cache_config.block_size
    use_mla = vllm_config.model_config.use_mla

    kv_cache_spec: dict[str, KVCacheSpec] = {}
    attn_layers = get_layers_from_vllm_config(vllm_config, Attention)
    for layer_name, attn_module in attn_layers.items():
        assert attn_module.attn_type == AttentionType.DECODER
        if attn_module.sliding_window is not None:
            kv_cache_spec[layer_name] = SlidingWindowSpec(
                block_size=block_size,
                num_kv_heads=attn_module.num_kv_heads,
                head_size=attn_module.head_size,
                dtype=kv_cache_dtype,
                sliding_window=attn_module.sliding_window,
                use_mla=use_mla,
            )
        else:
            kv_cache_spec[layer_name] = FullAttentionSpec(
                block_size=block_size,
                num_kv_heads=attn_module.num_kv_heads,
                head_size=attn_module.head_size,
                dtype=kv_cache_dtype,
                use_mla=use_mla,
            )
    return kv_cache_spec
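For intuition, the result is one spec per attention layer, keyed by layer name. A small illustration (hypothetical layer names, sizes, and window length, using the same constructor arguments as above) of what the returned mapping might look like for a model that mixes sliding-window and full-attention decoder layers:

```python
import torch
from vllm.v1.kv_cache_interface import FullAttentionSpec, SlidingWindowSpec

# Hypothetical example of a get_kv_cache_spec() result; the layer names, head
# counts, and 4096-token window are made up for illustration.
example_spec = {
    "model.layers.0.self_attn.attn": SlidingWindowSpec(
        block_size=16, num_kv_heads=8, head_size=128,
        dtype=torch.bfloat16, sliding_window=4096, use_mla=False),
    "model.layers.1.self_attn.attn": FullAttentionSpec(
        block_size=16, num_kv_heads=8, head_size=128,
        dtype=torch.bfloat16, use_mla=False),
}
```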

def init_attn_backend(
    kv_cache_config: KVCacheConfig,
    vllm_config: VllmConfig,
    device: torch.device,
):
    attn_backends: dict[str, AttentionBackend] = {}
    attn_metadata_builders: list[AttentionMetadataBuilder] = []

    attn_layers = get_layers_from_vllm_config(vllm_config, Attention)
    for kv_cache_group_spec in kv_cache_config.kv_cache_groups:
        layer_names = kv_cache_group_spec.layer_names
        any_layer_name = next(iter(layer_names))
Collaborator
This appears to assume an always-on hybrid KV cache manager. I am 100% supportive of this, but IIRC the reason we still supported disabling the hybrid KV cache manager was that P/D did not support it yet. cc @NickLucche @heheda12345: do you know the state of P/D + hybrid KV cache? Is there any other reason we would want to disable the hybrid KV cache manager?

Collaborator
Still not supported on P/D; @KuntaiDu has a series of PRs to enable the hybrid allocator + KV connectors first.

Collaborator
What about adding an assertion to make sure each KV cache group uses only one attention backend? For example, see the sketch below.
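One possible shape for that check, as a sketch on my part (the reviewer's original snippet is not shown above; this assumes `get_attn_backend()` returns the same backend class object for layers that share a backend):

```python
# Sketch only: verify every layer in the KV cache group resolves to the same
# attention backend as the representative layer before reusing it for all.
attn_backend = attn_layers[any_layer_name].get_attn_backend()
for layer_name in layer_names:
    assert attn_layers[layer_name].get_attn_backend() is attn_backend, (
        f"All layers in a KV cache group must use the same attention backend; "
        f"{layer_name} differs from {any_layer_name}")
```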
        attn_backend = attn_layers[any_layer_name].get_attn_backend()
        for layer_name in layer_names:
            attn_backends[layer_name] = attn_backend

        attn_metadata_builder = attn_backend.get_builder_cls()(
            kv_cache_group_spec.kv_cache_spec,
            layer_names,
            vllm_config,
            device,
        )
        attn_metadata_builders.append(attn_metadata_builder)
    return attn_backends, attn_metadata_builders

def _allocate_kv_cache(
    kv_cache_config: KVCacheConfig,
    device: torch.device,
):
    kv_cache_raw_tensors: dict[str, torch.Tensor] = {}
    for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
        tensor = torch.zeros(kv_cache_tensor.size,
                             dtype=torch.int8,
                             device=device)
        for layer_name in kv_cache_tensor.shared_by:
            kv_cache_raw_tensors[layer_name] = tensor

    layer_names = set()
    for group in kv_cache_config.kv_cache_groups:
        for layer_name in group.layer_names:
            layer_names.add(layer_name)
    assert layer_names == set(kv_cache_raw_tensors.keys()), \
        "Some layers are not correctly initialized"
    return kv_cache_raw_tensors
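The `shared_by` loop is what lets several layers (for example, layers in different KV cache groups under the hybrid allocator) alias one physical buffer. A standalone sketch of that aliasing, with hypothetical sizes and layer names and plain dicts standing in for `KVCacheTensor`:

```python
import torch

# Plain-dict stand-ins for KVCacheTensor(size=..., shared_by=[...]).
kv_cache_tensors = [
    {"size": 2 * 1024 * 1024, "shared_by": ["layers.0.attn", "layers.2.attn"]},
    {"size": 2 * 1024 * 1024, "shared_by": ["layers.1.attn"]},
]

kv_cache_raw_tensors: dict[str, torch.Tensor] = {}
for t in kv_cache_tensors:
    buf = torch.zeros(t["size"], dtype=torch.int8)  # one flat byte buffer
    for layer_name in t["shared_by"]:
        kv_cache_raw_tensors[layer_name] = buf      # same storage, many names

# Layers listed together in shared_by really do alias the same memory.
assert (kv_cache_raw_tensors["layers.0.attn"].data_ptr()
        == kv_cache_raw_tensors["layers.2.attn"].data_ptr())
```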

def _reshape_kv_cache(
    kv_cache_config: KVCacheConfig,
    kv_cache_raw_tensors: dict[str, torch.Tensor],
    attn_backends: dict[str, AttentionBackend],
) -> dict[str, torch.Tensor]:
    kv_caches: dict[str, torch.Tensor] = {}
    for kv_cache_group_spec in kv_cache_config.kv_cache_groups:
        kv_cache_spec = kv_cache_group_spec.kv_cache_spec
        for layer_name in kv_cache_group_spec.layer_names:
            raw_tensor = kv_cache_raw_tensors[layer_name]
            assert raw_tensor.numel() % kv_cache_spec.page_size_bytes == 0
            num_blocks = raw_tensor.numel() // kv_cache_spec.page_size_bytes

            attn_backend = attn_backends[layer_name]
            kv_cache_shape = attn_backend.get_kv_cache_shape(
                num_blocks, kv_cache_spec.block_size,
                kv_cache_spec.num_kv_heads, kv_cache_spec.head_size)

            dtype = kv_cache_spec.dtype
            kv_cache_stride_order = attn_backend.get_kv_cache_stride_order()
            kv_cache_shape = tuple(kv_cache_shape[i]
                                   for i in kv_cache_stride_order)

            inv_order = [
                kv_cache_stride_order.index(i)
                for i in range(len(kv_cache_stride_order))
            ]

            raw_tensor = raw_tensor.view(dtype)
            raw_tensor = raw_tensor.view(kv_cache_shape)
            kv_caches[layer_name] = raw_tensor.permute(*inv_order)
    return kv_caches
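The view/permute pair at the end is the subtle part: the flat buffer is first viewed in the backend's physical layout (the logical shape reordered by `get_kv_cache_stride_order()`), then permuted by the inverse order, so the returned tensor indexes in the logical order while its memory stays in the backend's preferred layout. A standalone numeric sketch (hypothetical shape and stride order, plain PyTorch):

```python
import math
import torch

logical_shape = (4, 16, 2, 8)  # hypothetical (num_blocks, block_size, kv_heads, head_size)
stride_order = (1, 0, 2, 3)    # hypothetical backend-preferred physical ordering

# Same two steps as _reshape_kv_cache: view in physical order, permute back.
physical_shape = tuple(logical_shape[i] for i in stride_order)
inv_order = [stride_order.index(i) for i in range(len(stride_order))]

raw = torch.zeros(math.prod(logical_shape), dtype=torch.float16)
kv_cache = raw.view(physical_shape).permute(*inv_order)

assert tuple(kv_cache.shape) == logical_shape            # callers index logically
assert kv_cache.permute(*stride_order).is_contiguous()   # memory stays in physical order
```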

def init_kv_cache(
    runner_kv_caches: list[torch.Tensor],
    forward_context: dict[str, Any],
    kv_cache_config: KVCacheConfig,
    attn_backends: dict[str, AttentionBackend],
    device: torch.device,
):
    kv_cache_raw_tensors = _allocate_kv_cache(kv_cache_config, device)
    kv_caches = _reshape_kv_cache(kv_cache_config, kv_cache_raw_tensors,
                                  attn_backends)
    bind_kv_cache(kv_caches, forward_context, runner_kv_caches)
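Taken together, a worker would presumably call these helpers in this order: `get_kv_cache_spec` feeds KV cache planning, which produces the `KVCacheConfig` consumed by `init_attn_backend` and `init_kv_cache`. A hedged sketch of that wiring (not the PR's actual worker code; `setup_kv_cache` is a hypothetical wrapper and all arguments are assumed to be supplied by the worker/runner):

```python
def setup_kv_cache(runner_kv_caches, forward_context, kv_cache_config,
                   vllm_config, device):
    # Pick one attention backend + metadata builder per KV cache group, then
    # allocate, reshape, and bind the caches into the forward context.
    attn_backends, attn_metadata_builders = init_attn_backend(
        kv_cache_config, vllm_config, device)
    init_kv_cache(runner_kv_caches, forward_context, kv_cache_config,
                  attn_backends, device)
    return attn_backends, attn_metadata_builders
```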