Merged
173 commits
33a3a26
wip
WoosukKwon Aug 17, 2025
699bd79
Merge branch 'main' into woosuk/input-prep
WoosukKwon Aug 18, 2025
c472982
merge
WoosukKwon Aug 22, 2025
79e5eb3
wip
WoosukKwon Aug 22, 2025
64c8cce
rename
WoosukKwon Aug 22, 2025
48bca9a
merge
WoosukKwon Aug 23, 2025
a1e3745
wip
WoosukKwon Aug 25, 2025
da9cd26
Merge branch 'main' into woosuk/input-prep
WoosukKwon Aug 25, 2025
7b4b72e
fix
WoosukKwon Aug 25, 2025
65f9369
merge
WoosukKwon Aug 25, 2025
b1d5273
fix
WoosukKwon Aug 25, 2025
a851aaa
simplify
WoosukKwon Aug 25, 2025
e570b0a
merge
WoosukKwon Aug 28, 2025
d6d719f
Merge branch 'main' into woosuk/input-prep
WoosukKwon Aug 28, 2025
b21393c
Merge branch 'main' into woosuk/input-prep
WoosukKwon Aug 28, 2025
efba25e
minor
WoosukKwon Aug 28, 2025
e451045
fix
WoosukKwon Aug 28, 2025
19c0dfc
minor
WoosukKwon Aug 28, 2025
4055781
minor
WoosukKwon Aug 28, 2025
9ee9d0e
fix
WoosukKwon Aug 28, 2025
efcb786
merge
WoosukKwon Aug 31, 2025
e696f78
minor
WoosukKwon Aug 31, 2025
c11d1e6
optimize spec
WoosukKwon Aug 31, 2025
22771e5
work
WoosukKwon Sep 1, 2025
ba1a58f
MAX_SPEC_LEN
WoosukKwon Sep 1, 2025
62d23b3
fix
WoosukKwon Sep 1, 2025
af7b6c5
fix
WoosukKwon Sep 1, 2025
01bf16e
fix
WoosukKwon Sep 1, 2025
cc340e2
top_p top_k
WoosukKwon Sep 1, 2025
4c2a337
merge
WoosukKwon Sep 1, 2025
b16e2d9
fix
WoosukKwon Sep 1, 2025
23eae07
merge
WoosukKwon Sep 5, 2025
ead95fe
merge
WoosukKwon Sep 6, 2025
8e6cb9a
minor
WoosukKwon Sep 6, 2025
0c56069
merge
WoosukKwon Sep 6, 2025
6283995
minor
WoosukKwon Sep 7, 2025
286eeb9
merge
WoosukKwon Sep 7, 2025
5f95309
rename
WoosukKwon Sep 7, 2025
787e596
wip
WoosukKwon Sep 8, 2025
7a50a54
Merge branch 'main' into woosuk/input-prep
WoosukKwon Sep 13, 2025
9314a83
Merge branch 'main' into woosuk/input-prep
WoosukKwon Sep 14, 2025
caf963f
fix
WoosukKwon Sep 14, 2025
5c133fc
reorder
WoosukKwon Sep 14, 2025
e47bb99
fix
WoosukKwon Sep 14, 2025
eb3742c
fix
WoosukKwon Sep 14, 2025
633f9f0
Merge branch 'main' into woosuk/input-prep
WoosukKwon Sep 14, 2025
9a6fcca
fix
WoosukKwon Sep 14, 2025
8b3c13c
wip
WoosukKwon Sep 15, 2025
67852c1
minor
WoosukKwon Sep 15, 2025
69b1789
chunked prefilling
WoosukKwon Sep 15, 2025
f1981db
minor
WoosukKwon Sep 15, 2025
e107680
wip
WoosukKwon Sep 15, 2025
9f2becd
merge
WoosukKwon Sep 16, 2025
dfc84b1
wip
WoosukKwon Sep 15, 2025
83d1137
wip
WoosukKwon Sep 16, 2025
c320a33
skip warmup
WoosukKwon Sep 16, 2025
9151026
task
WoosukKwon Sep 16, 2025
c1d83f2
merge
WoosukKwon Sep 18, 2025
9050087
update
WoosukKwon Sep 18, 2025
92f337f
minor
WoosukKwon Sep 18, 2025
cbdb47d
working
WoosukKwon Sep 18, 2025
3f50030
fix
WoosukKwon Sep 18, 2025
a496283
minor
WoosukKwon Sep 18, 2025
bc6463a
hash
WoosukKwon Sep 18, 2025
aabfaa0
fix
WoosukKwon Sep 18, 2025
330058f
fix
WoosukKwon Sep 18, 2025
82e591f
remove
WoosukKwon Sep 18, 2025
8407fa0
fix
WoosukKwon Sep 18, 2025
e171e5b
merge
WoosukKwon Sep 18, 2025
2bb2cb1
revert
WoosukKwon Sep 18, 2025
67d8c0c
fix
WoosukKwon Sep 18, 2025
a98eff0
minor
WoosukKwon Sep 18, 2025
323a05b
update
WoosukKwon Sep 18, 2025
82da219
Implement topk_logprobs
WoosukKwon Sep 18, 2025
efda084
minor
WoosukKwon Sep 18, 2025
86dade7
fix
WoosukKwon Sep 18, 2025
d2be623
fix
WoosukKwon Sep 18, 2025
31619ff
fix
WoosukKwon Sep 18, 2025
b9c7448
logprobs
WoosukKwon Sep 19, 2025
8deedfa
-inf
WoosukKwon Sep 19, 2025
52ca2f5
sample
WoosukKwon Sep 19, 2025
af65838
dummy run
WoosukKwon Sep 19, 2025
8af8798
fix
WoosukKwon Sep 19, 2025
b405d78
DP sampler
WoosukKwon Sep 19, 2025
0d3de9e
fix
WoosukKwon Sep 19, 2025
3367277
Merge branch 'main' into woosuk/model-runner-v2
WoosukKwon Sep 19, 2025
37478c1
async output
WoosukKwon Sep 19, 2025
9c75d89
minor
WoosukKwon Sep 19, 2025
d30c0d5
refactor
WoosukKwon Sep 19, 2025
4be2c66
fix
WoosukKwon Sep 19, 2025
a8e7071
minor
WoosukKwon Sep 19, 2025
c7f3e84
minor
WoosukKwon Sep 19, 2025
396bbe6
Merge branch 'main' into woosuk/model-runner-v2
WoosukKwon Sep 19, 2025
010e39e
minor
WoosukKwon Sep 19, 2025
6f038fc
Merge branch 'main' into woosuk/model-runner-v2
WoosukKwon Sep 19, 2025
a66aa37
minor:
WoosukKwon Sep 19, 2025
98ef239
minor
WoosukKwon Sep 19, 2025
158a468
random uuid
WoosukKwon Sep 20, 2025
913b8e9
Merge branch 'main' into woosuk/model-runner-v2
WoosukKwon Sep 20, 2025
8aee6e9
64-bit for gumbel seed
WoosukKwon Sep 20, 2025
42ffdd9
wip
WoosukKwon Sep 20, 2025
631b5b4
Merge branch 'main' into woosuk/model-runner-v2
WoosukKwon Sep 21, 2025
bc73f67
compute_logits
WoosukKwon Sep 21, 2025
fe5472d
Merge branch 'main' into woosuk/model-runner-v2
WoosukKwon Sep 22, 2025
72f0a71
assert
WoosukKwon Sep 22, 2025
17c2c10
Merge branch 'main' into woosuk/model-runner-v2
WoosukKwon Sep 23, 2025
42f9915
fix
WoosukKwon Sep 23, 2025
704def2
Merge branch 'main' into woosuk/model-runner-v2
WoosukKwon Sep 23, 2025
ad2cf80
Merge branch 'main' into woosuk/model-runner-v2
WoosukKwon Sep 24, 2025
866eef5
minor
WoosukKwon Sep 24, 2025
1107701
Merge branch 'main' into woosuk/model-runner-v2
WoosukKwon Oct 30, 2025
09e4b2f
update
WoosukKwon Oct 30, 2025
5666a25
fix
WoosukKwon Oct 30, 2025
5c8049d
fix
WoosukKwon Oct 30, 2025
1c5c866
uint64
WoosukKwon Oct 30, 2025
8f8aaa8
forward context
WoosukKwon Nov 6, 2025
e40e85b
merge
WoosukKwon Nov 9, 2025
013daed
Add sample_tokens
WoosukKwon Nov 9, 2025
608fec3
fix lora
WoosukKwon Nov 9, 2025
bf3992c
allow torch compile
WoosukKwon Nov 9, 2025
a1249af
minor
WoosukKwon Nov 9, 2025
3ce8a08
Add DP
WoosukKwon Nov 9, 2025
b9ebedb
fix
WoosukKwon Nov 9, 2025
8d82fac
fix
WoosukKwon Nov 9, 2025
af23897
fix
WoosukKwon Nov 9, 2025
83943cd
minor
WoosukKwon Nov 9, 2025
cbd90df
fix
WoosukKwon Nov 9, 2025
5b5fd19
minor
WoosukKwon Nov 9, 2025
484135c
minor
WoosukKwon Nov 9, 2025
8912870
Add structured outputs
WoosukKwon Nov 9, 2025
312affc
fix
WoosukKwon Nov 9, 2025
523f27a
fix
WoosukKwon Nov 9, 2025
de64ce7
async structured outputs
WoosukKwon Nov 9, 2025
8b44f99
flag
WoosukKwon Nov 9, 2025
d8a8279
fix
WoosukKwon Nov 9, 2025
fe97bf9
fix dp
WoosukKwon Nov 9, 2025
8240f3a
minor
WoosukKwon Nov 9, 2025
ebdee19
minor
WoosukKwon Nov 9, 2025
e75ded3
minor
WoosukKwon Nov 9, 2025
493b4d6
minor
WoosukKwon Nov 9, 2025
75ef5f4
fix for DP
WoosukKwon Nov 9, 2025
724593b
fix
WoosukKwon Nov 9, 2025
6dc3d83
minor
WoosukKwon Nov 9, 2025
2b51ecb
skip sync in dummy run
WoosukKwon Nov 9, 2025
ecb2932
minor
WoosukKwon Nov 10, 2025
63e4387
minor
WoosukKwon Nov 10, 2025
dd254ce
code owner
WoosukKwon Nov 10, 2025
f510b9e
code owner
WoosukKwon Nov 10, 2025
a505e71
minor
WoosukKwon Nov 11, 2025
fb0782c
minor
WoosukKwon Nov 11, 2025
645650c
remove filtering for negative token
WoosukKwon Nov 12, 2025
2326a8c
minor on cudagraph utils
WoosukKwon Nov 12, 2025
e284750
merge
WoosukKwon Nov 12, 2025
4085ce8
minor
WoosukKwon Nov 12, 2025
1d8a671
fix
WoosukKwon Nov 12, 2025
31580e9
merge
WoosukKwon Nov 13, 2025
a0c396b
merge
WoosukKwon Nov 15, 2025
6da659f
mypy
WoosukKwon Nov 15, 2025
ff9a1aa
readme
WoosukKwon Nov 15, 2025
197ed08
fix
WoosukKwon Nov 16, 2025
a72b07e
fix cudagraph
WoosukKwon Nov 16, 2025
a9b4fa3
revert
WoosukKwon Nov 16, 2025
3da2e77
Merge branch 'main' into woosuk/model-runner-v2
WoosukKwon Nov 16, 2025
ee2c3b0
minor
WoosukKwon Nov 16, 2025
995f1aa
simplify get_kv_cache_spec
WoosukKwon Nov 16, 2025
ed84190
support mla
WoosukKwon Nov 16, 2025
5ea5e7e
merge
WoosukKwon Nov 17, 2025
784371c
preempt
WoosukKwon Nov 17, 2025
1402b93
Optimize gumbel sampling
WoosukKwon Nov 17, 2025
4ee6bc4
Merge branch 'main' into woosuk/model-runner-v2
WoosukKwon Nov 20, 2025
e9152dd
nick's comment
WoosukKwon Nov 20, 2025
104b2fa
minor
WoosukKwon Nov 20, 2025
327c0e3
num_computed_tokens_cpu
WoosukKwon Nov 20, 2025
3 changes: 3 additions & 0 deletions .github/CODEOWNERS
@@ -35,6 +35,9 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/vllm/v1/kv_cache_interface.py @heheda12345
/vllm/v1/offloading @ApostaC

# Model runner V2
/vllm/v1/worker/gpu @WoosukKwon

# Test ownership
/.buildkite/lm-eval-harness @mgoin @simon-mo
/tests/distributed/test_multi_node_assignment.py @youkaichao
5 changes: 4 additions & 1 deletion examples/offline_inference/basic/basic.py
@@ -16,7 +16,10 @@

def main():
# Create an LLM.
llm = LLM(model="facebook/opt-125m")
llm = LLM(
model="facebook/opt-125m",
compilation_config={"cudagraph_mode": "full_decode_only"},
)
# Generate texts from the prompts.
# The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
5 changes: 5 additions & 0 deletions vllm/envs.py
@@ -225,6 +225,7 @@
VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD: int = 256
VLLM_COMPILE_CACHE_SAVE_FORMAT: Literal["binary", "unpacked"] = "binary"
VLLM_FLAT_LOGPROBS: bool = False
VLLM_USE_V2_MODEL_RUNNER: bool = False


def get_default_cache_root():
@@ -1498,6 +1499,10 @@ def get_vllm_port() -> int | None:
# After enabled, PromptLogprobs and SampleLogprobs would populated as
# FlatLogprobs.
"VLLM_FLAT_LOGPROBS": lambda: bool(int(os.getenv("VLLM_FLAT_LOGPROBS", "0"))),
# Flag to enable v2 model runner.
"VLLM_USE_V2_MODEL_RUNNER": lambda: bool(
int(os.getenv("VLLM_USE_V2_MODEL_RUNNER", "0"))
),
}

# --8<-- [end:env-vars-definition]
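For reference, a minimal sketch of how the new flag could be enabled from user code. This is not part of the diff; only the VLLM_USE_V2_MODEL_RUNNER name comes from this PR, while the model, prompt, and sampling settings are illustrative assumptions.

# Sketch: opting into the experimental V2 model runner via the env var added above.
# Everything except the VLLM_USE_V2_MODEL_RUNNER name is an illustrative assumption.
import os

os.environ["VLLM_USE_V2_MODEL_RUNNER"] = "1"  # default is "0" (V2 runner disabled)

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)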
3 changes: 3 additions & 0 deletions vllm/v1/attention/backends/flashinfer.py
@@ -455,6 +455,9 @@ def _get_workspace_buffer(self):
)
return self._workspace_buffer

def set_workspace_buffer(self, workspace_buffer: torch.Tensor):
self._workspace_buffer = workspace_buffer

def _get_prefill_wrapper(self):
if self._prefill_wrapper is None:
self._prefill_wrapper = BatchPrefillWithPagedKVCacheWrapper(
18 changes: 18 additions & 0 deletions vllm/v1/core/sched/output.py
@@ -36,6 +36,7 @@
class NewRequestData:
req_id: str
prompt_token_ids: list[int] | None
prefill_token_ids: list[int] | None

Check warning (GitHub Actions / bc_lint) on line 39 in vllm/v1/core/sched/output.py: Function NewRequestData: prefill_token_ids was added
mm_features: list[MultiModalFeatureSpec]
sampling_params: SamplingParams | None
pooling_params: PoolingParams | None
@@ -53,6 +54,7 @@
return cls(
req_id=request.request_id,
prompt_token_ids=request.prompt_token_ids,
prefill_token_ids=request._all_token_ids,
mm_features=request.mm_features,
sampling_params=request.sampling_params,
pooling_params=request.pooling_params,
@@ -175,6 +177,7 @@
# This can be used for cascade attention.
num_common_prefix_blocks: list[int]

preempted_req_ids: set[str]

Check warning (GitHub Actions / bc_lint) on line 180 in vllm/v1/core/sched/output.py: Function SchedulerOutput: preempted_req_ids was added
# Request IDs that are finished in between the previous and the current
# steps. This is used to notify the workers about the finished requests
# so that they can free the cached states for those requests.
@@ -193,6 +196,21 @@
# EC Cache Connector metadata
ec_connector_metadata: ECConnectorMetadata | None = None

@classmethod
def make_empty(cls) -> "SchedulerOutput":
return cls(
scheduled_new_reqs=[],
scheduled_cached_reqs=CachedRequestData.make_empty(),
num_scheduled_tokens={},
total_num_scheduled_tokens=0,
scheduled_spec_decode_tokens={},
scheduled_encoder_inputs={},
num_common_prefix_blocks=[],
preempted_req_ids=set(),
finished_req_ids=set(),
free_encoder_mm_hashes=[],
)


@dataclass
class GrammarOutput:
18 changes: 7 additions & 11 deletions vllm/v1/core/sched/scheduler.py
@@ -653,6 +653,9 @@ def schedule(self) -> SchedulerOutput:
)

# Construct the scheduler output.
scheduled_new_reqs = scheduled_new_reqs + scheduled_resumed_reqs
scheduled_resumed_reqs = []

new_reqs_data = [
NewRequestData.from_request(
req, req_to_new_blocks[req.request_id].get_block_ids()
@@ -680,6 +683,7 @@
scheduled_spec_decode_tokens=scheduled_spec_decode_tokens,
scheduled_encoder_inputs=scheduled_encoder_inputs,
num_common_prefix_blocks=num_common_prefix_blocks,
preempted_req_ids={req.request_id for req in preempted_reqs},
# finished_req_ids is an existing state in the scheduler,
# instead of being newly scheduled in this step.
# It contains the request IDs that are finished in between
@@ -754,9 +758,8 @@ def _make_cached_request_data(
all_token_ids: dict[str, list[int]] = {}
num_computed_tokens: list[int] = []
num_output_tokens: list[int] = []
resumed_req_ids = set()
resumed_req_ids = set[str]()

num_running_reqs = len(running_reqs)
for idx, req in enumerate(itertools.chain(running_reqs, resumed_reqs)):
req_id = req.request_id
req_ids.append(req_id)
@@ -773,14 +776,6 @@
req.num_computed_tokens : req.num_computed_tokens + num_tokens
]
new_token_ids.append(token_ids)
scheduled_in_prev_step = req_id in self.prev_step_scheduled_req_ids
if idx >= num_running_reqs:
assert not scheduled_in_prev_step
resumed_req_ids.add(req_id)
if not scheduled_in_prev_step:
all_token_ids[req_id] = req.all_token_ids[
: req.num_computed_tokens + num_tokens
]
new_block_ids.append(
req_to_new_blocks[req_id].get_block_ids(allow_none=True)
)
@@ -997,7 +992,8 @@ def update_from_output(
# to avoid expensive operations inside the loop.
stopped_running_reqs: set[Request] = set()
stopped_preempted_reqs: set[Request] = set()
for req_id, num_tokens_scheduled in num_scheduled_tokens.items():
for req_index, req_id in enumerate(model_runner_output.req_ids):
num_tokens_scheduled = num_scheduled_tokens[req_id]
assert num_tokens_scheduled > 0
if failed_kv_load_req_ids and req_id in failed_kv_load_req_ids:
# Skip requests that were recovered from KV load failure
Empty file.
76 changes: 76 additions & 0 deletions vllm/v1/worker/gpu/async_utils.py
@@ -0,0 +1,76 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from contextlib import contextmanager

import numpy as np
import torch

from vllm.v1.outputs import (
    AsyncModelRunnerOutput,
    ModelRunnerOutput,
    SamplerOutput,
)


class AsyncOutput(AsyncModelRunnerOutput):
    def __init__(
        self,
        model_runner_output: ModelRunnerOutput,
        sampler_output: SamplerOutput,
        num_sampled_tokens: np.ndarray,
        copy_stream: torch.cuda.Stream,
    ):
        self.model_runner_output = model_runner_output
        self.sampler_output = sampler_output
        self.num_sampled_tokens = num_sampled_tokens
        self.copy_stream = copy_stream
        self.copy_event = torch.cuda.Event()

        default_stream = torch.cuda.current_stream()
        with torch.cuda.stream(self.copy_stream):
            self.copy_stream.wait_stream(default_stream)

            # NOTE(woosuk): We should keep the CPU tensors unfreed
            # until the copy completes.
Member
What does this mean?

I think we may need to keep a ref to the GPU tensors to ensure they aren't deallocated/reused before the copy completes.

Collaborator Author
            # NOTE(woosuk): We must ensure that CPU tensors are not freed
            # before the device-to-host copy is fully completed. For instance,
            # operations like
            # self.sampled_token_np = ...to("cpu", non_blocking=True).numpy()
            # are unsafe because the underlying CPU tensor can be prematurely freed and
            # reused by other tensors before the asynchronous copy finishes, potentially
            # causing race conditions. To prevent this, we delay freeing by holding
            # references until the copy event signals completion.
            # Likewise, we also need to keep the reference to the GPU tensors.
            # This is done by keeping the reference to sampler_output and
            # model_runner_output.

Added more detailed comments.

Member
       # operations like
       # self.sampled_token_np = ...to("cpu", non_blocking=True).numpy()
       # are unsafe because the underlying CPU tensor can be prematurely freed

Actually I think this is safe; the numpy array will retain a ref to the storage.

Also, the reason we need to keep a ref to the GPU tensors is that they were allocated in a different stream from the one we are copying them in; otherwise it wouldn't be necessary.

            self.sampled_token_ids = sampler_output.sampled_token_ids.to(
                "cpu", non_blocking=True
            )
            if sampler_output.logprobs_tensors is not None:
                self.logprobs_tensors = (
                    sampler_output.logprobs_tensors.to_cpu_nonblocking()
                )
            else:
                self.logprobs_tensors = None
            self.prompt_logprobs_dict = {}
            if self.model_runner_output.prompt_logprobs_dict:
                for k, v in self.model_runner_output.prompt_logprobs_dict.items():
                    self.prompt_logprobs_dict[k] = v.to_cpu_nonblocking()
            self.copy_event.record(self.copy_stream)

    def get_output(self) -> ModelRunnerOutput:
        self.copy_event.synchronize()

        # NOTE(woosuk): The following code ensures compatibility with OSS vLLM.
        # Going forward, we should keep the data structures as NumPy arrays
        # rather than Python lists.
        sampled_token_ids_np = self.sampled_token_ids.numpy()
        sampled_token_ids = sampled_token_ids_np.tolist()
        for i, tokens in enumerate(sampled_token_ids):
            del tokens[self.num_sampled_tokens[i] :]
        self.model_runner_output.sampled_token_ids = sampled_token_ids

Member
Could actually move the synchronize down to here because the data isn't accessed

Collaborator Author
num_sampled_tokens will be a GPU tensor (instead of np.ndarray) once we integrate spec decoding in a followup PR. So I'd like to keep the logic here.

Ideally, the output should only consist of two arrays: sampled_tokens: [num_reqs, num_spec_steps + 1] and num_sampled: [num_reqs].

        if self.logprobs_tensors is not None:
            self.model_runner_output.logprobs = self.logprobs_tensors.tolists()
        self.model_runner_output.prompt_logprobs_dict = self.prompt_logprobs_dict
        return self.model_runner_output


@contextmanager
def async_barrier(event: torch.cuda.Event | None):
    if event is not None:
        event.synchronize()
    try:
        yield
    finally:
        if event is not None:
            event.record()
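To make the review discussion above concrete, here is a small, self-contained sketch of the same device-to-host copy pattern: issue the copy on a dedicated stream, record an event, keep both the GPU source and the CPU destination referenced, and synchronize on the event before reading the CPU data. This is illustrative only (it assumes a CUDA device and uses made-up tensor shapes), not the PR's implementation.

import torch

assert torch.cuda.is_available()

copy_stream = torch.cuda.Stream()
# Stand-in for sampler output produced on the default stream.
sampled_gpu = torch.randint(0, 32000, (8, 4), device="cuda")

default_stream = torch.cuda.current_stream()
with torch.cuda.stream(copy_stream):
    # Order the copy after any kernels that produced `sampled_gpu`.
    copy_stream.wait_stream(default_stream)
    # For a truly asynchronous D2H copy the destination should be pinned;
    # the reference-holding pattern is the same either way.
    sampled_cpu = sampled_gpu.to("cpu", non_blocking=True)
    copy_event = torch.cuda.Event()
    copy_event.record(copy_stream)

# Keep `sampled_gpu` and `sampled_cpu` referenced until the event completes;
# dropping them earlier could let their memory be reused while the copy is in flight.
copy_event.synchronize()
print(sampled_cpu.tolist())  # safe to read now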