Zhathw
diff --git a/‎.github/CODEOWNERS‎
Lines changed: 3 additions & 0 deletions b/‎.github/CODEOWNERS‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎vllm/envs.py‎
Lines changed: 5 additions & 0 deletions b/‎vllm/envs.py‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎vllm/v1/attention/backends/flashinfer.py‎
Lines changed: 3 additions & 0 deletions b/‎vllm/v1/attention/backends/flashinfer.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎vllm/v1/core/sched/output.py‎
Lines changed: 24 additions & 0 deletions b/‎vllm/v1/core/sched/output.py‎
Lines changed: 24 additions & 0 deletions
diff --git a/‎vllm/v1/core/sched/scheduler.py‎
Lines changed: 22 additions & 6 deletions b/‎vllm/v1/core/sched/scheduler.py‎
Lines changed: 22 additions & 6 deletions
diff --git a/‎vllm/v1/worker/gpu/README.md‎
Lines changed: 4 additions & 0 deletions b/‎vllm/v1/worker/gpu/README.md‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎vllm/v1/worker/gpu/__init__.py‎ b/‎vllm/v1/worker/gpu/__init__.py‎
diff --git a/‎vllm/v1/worker/gpu/async_utils.py‎
Lines changed: 89 additions & 0 deletions b/‎vllm/v1/worker/gpu/async_utils.py‎
Lines changed: 89 additions & 0 deletions
@@ -35,6 +35,9 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /vllm/v1/kv_cache_interface.py @heheda12345
 /vllm/v1/offloading @ApostaC
 
+# Model runner V2
+/vllm/v1/worker/gpu @WoosukKwon
+
 # Test ownership
 /.buildkite/lm-eval-harness @mgoin 
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 
@@ -231,6 +231,7 @@
     VLLM_DISABLE_SHARED_EXPERTS_STREAM: bool = False
     VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD: int = 256
     VLLM_COMPILE_CACHE_SAVE_FORMAT: Literal["binary", "unpacked"] = "binary"
+    VLLM_USE_V2_MODEL_RUNNER: bool = False
 
 
 def get_default_cache_root():
@@ -1522,6 +1523,10 @@ def get_vllm_port() -> int | None:
     "VLLM_COMPILE_CACHE_SAVE_FORMAT": env_with_choices(
         "VLLM_COMPILE_CACHE_SAVE_FORMAT", "binary", ["binary", "unpacked"]
     ),
+    # Flag to enable v2 model runner.
+    "VLLM_USE_V2_MODEL_RUNNER": lambda: bool(
+        int(os.getenv("VLLM_USE_V2_MODEL_RUNNER", "0"))
+    ),
 }
 
 # --8<-- [end:env-vars-definition]
 
@@ -593,6 +593,9 @@ def _get_workspace_buffer(self):
             )
         return self._workspace_buffer
 
+    def set_workspace_buffer(self, workspace_buffer: torch.Tensor):
+        self._workspace_buffer = workspace_buffer
+
     def _get_prefill_wrapper(
         self,
     ) -> BatchPrefillWithPagedKVCacheWrapper | BatchDCPPrefillWrapper:
 
@@ -44,11 +44,15 @@ class NewRequestData:
     lora_request: LoRARequest | None
     prompt_embeds: "torch.Tensor | None" = None
 
+    # Only used for v2 model runner.
+    prefill_token_ids: list[int] | None = None
+
     @classmethod
     def from_request(
         cls,
         request: Request,
         block_ids: tuple[list[int], ...],
+        prefill_token_ids: list[int] | None = None,
     ) -> "NewRequestData":
         return cls(
             req_id=request.request_id,
@@ -60,6 +64,7 @@ def from_request(
             num_computed_tokens=request.num_computed_tokens,
             lora_request=request.lora_request,
             prompt_embeds=request.prompt_embeds,
+            prefill_token_ids=prefill_token_ids,
         )
 
     def __repr__(self) -> str:
@@ -68,6 +73,7 @@ def __repr__(self) -> str:
             f"NewRequestData("
             f"req_id={self.req_id},"
             f"prompt_token_ids={self.prompt_token_ids},"
+            f"prefill_token_ids={self.prefill_token_ids},"
             f"mm_features={self.mm_features},"
             f"sampling_params={self.sampling_params},"
             f"block_ids={self.block_ids},"
@@ -183,6 +189,10 @@ class SchedulerOutput:
     # freed from the encoder cache.
     free_encoder_mm_hashes: list[str]
 
+    # Request IDs that are preempted in this step.
+    # Only used for v2 model runner.
+    preempted_req_ids: set[str] | None = None
+
     # Whether the scheduled requests have all the output tokens they
     # need to perform grammar bitmask computation.
     pending_structured_output_tokens: bool = False
@@ -193,6 +203,20 @@ class SchedulerOutput:
     # EC Cache Connector metadata
     ec_connector_metadata: ECConnectorMetadata | None = None
 
+    @classmethod
+    def make_empty(cls) -> "SchedulerOutput":
+        return cls(
+            scheduled_new_reqs=[],
+            scheduled_cached_reqs=CachedRequestData.make_empty(),
+            num_scheduled_tokens={},
+            total_num_scheduled_tokens=0,
+            scheduled_spec_decode_tokens={},
+            scheduled_encoder_inputs={},
+            num_common_prefix_blocks=[],
+            finished_req_ids=set(),
+            free_encoder_mm_hashes=[],
+        )
+
 
 @dataclass
 class GrammarOutput:
 
@@ -6,6 +6,7 @@
 from collections.abc import Iterable
 from typing import Any
 
+from vllm import envs
 from vllm.config import VllmConfig
 from vllm.distributed.ec_transfer.ec_connector.base import (
     ECConnectorMetadata,
@@ -187,6 +188,7 @@ def __init__(
             pcp_world_size=self.pcp_world_size,
         )
         self.use_pp = self.parallel_config.pipeline_parallel_size > 1
+        self.use_v2_model_runner = envs.VLLM_USE_V2_MODEL_RUNNER
 
     def schedule(self) -> SchedulerOutput:
         # NOTE(woosuk) on the scheduling algorithm:
@@ -658,12 +660,25 @@ def schedule(self) -> SchedulerOutput:
                 )
 
         # Construct the scheduler output.
-        new_reqs_data = [
-            NewRequestData.from_request(
-                req, req_to_new_blocks[req.request_id].get_block_ids()
-            )
-            for req in scheduled_new_reqs
-        ]
+        if self.use_v2_model_runner:
+            scheduled_new_reqs = scheduled_new_reqs + scheduled_resumed_reqs
+            scheduled_resumed_reqs = []
+            new_reqs_data = [
+                NewRequestData.from_request(
+                    req,
+                    req_to_new_blocks[req.request_id].get_block_ids(),
+                    req._all_token_ids,
+                )
+                for req in scheduled_new_reqs
+            ]
+        else:
+            new_reqs_data = [
+                NewRequestData.from_request(
+                    req, req_to_new_blocks[req.request_id].get_block_ids()
+                )
+                for req in scheduled_new_reqs
+            ]
+
         with record_function_or_nullcontext("schedule: make_cached_request_data"):
             cached_reqs_data = self._make_cached_request_data(
                 scheduled_running_reqs,
@@ -685,6 +700,7 @@ def schedule(self) -> SchedulerOutput:
             scheduled_spec_decode_tokens=scheduled_spec_decode_tokens,
             scheduled_encoder_inputs=scheduled_encoder_inputs,
             num_common_prefix_blocks=num_common_prefix_blocks,
+            preempted_req_ids={req.request_id for req in preempted_reqs},
             # finished_req_ids is an existing state in the scheduler,
             # instead of being newly scheduled in this step.
             # It contains the request IDs that are finished in between
 
@@ -0,0 +1,4 @@
+# [Experimental] Model Runner V2
+
+This directory contains the new model runner which is under active development.
+Ping [Woosuk Kwon](https://github.com/WoosukKwon) for any changes.
@@ -0,0 +1,89 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from contextlib import contextmanager
+
+import numpy as np
+import torch
+
+from vllm.v1.outputs import (
+    AsyncModelRunnerOutput,
+    ModelRunnerOutput,
+    SamplerOutput,
+)
+
+
+class AsyncOutput(AsyncModelRunnerOutput):
+    def __init__(
+        self,
+        model_runner_output: ModelRunnerOutput,
+        sampler_output: SamplerOutput,
+        num_sampled_tokens: np.ndarray,
+        copy_stream: torch.cuda.Stream,
+        copy_event: torch.cuda.Event,
+    ):
+        self.model_runner_output = model_runner_output
+        self.sampler_output = sampler_output
+        self.num_sampled_tokens = num_sampled_tokens
+        self.copy_stream = copy_stream
+        self.copy_event = copy_event
+
+        default_stream = torch.cuda.current_stream()
+        with torch.cuda.stream(self.copy_stream):
+            self.copy_stream.wait_stream(default_stream)
+
+            # NOTE(woosuk): We must ensure that CPU tensors are not freed
+            # before the device-to-host copy is fully completed. For instance,
+            # operations like
+            # self.sampled_token_np = ...to("cpu", non_blocking=True).numpy()
+            # are unsafe because the underlying CPU tensor can be prematurely freed and
+            # reused by other tensors before the asynchronous copy finishes, potentially
+            # causing race conditions. To prevent this, we delay freeing by holding
+            # references until the copy event signals completion.
+            # Likewise, we also need to keep the reference to the GPU tensors.
+            # This is done by keeping the reference to sampler_output and
+            # model_runner_output.
+            self.sampled_token_ids = sampler_output.sampled_token_ids.to(
+                "cpu", non_blocking=True
+            )
+            if sampler_output.logprobs_tensors is not None:
+                self.logprobs_tensors = (
+                    sampler_output.logprobs_tensors.to_cpu_nonblocking()
+                )
+            else:
+                self.logprobs_tensors = None
+            self.prompt_logprobs_dict = {}
+            if self.model_runner_output.prompt_logprobs_dict:
+                for k, v in self.model_runner_output.prompt_logprobs_dict.items():
+                    self.prompt_logprobs_dict[k] = v.to_cpu_nonblocking()
+            self.copy_event.record(self.copy_stream)
+
+    def get_output(self) -> ModelRunnerOutput:
+        self.copy_event.synchronize()
+
+        # NOTE(woosuk): The following code is to ensure compatibility with
+        # the existing model runner.
+        # Going forward, we should keep the data structures as NumPy arrays
+        # rather than Python lists.
+        sampled_token_ids_np = self.sampled_token_ids.numpy()
+        num_reqs = sampled_token_ids_np.shape[0]
+        sampled_token_ids: list[np.ndarray] = [
+            sampled_token_ids_np[i, : self.num_sampled_tokens[i]]
+            for i in range(num_reqs)
+        ]
+        self.model_runner_output.sampled_token_ids = sampled_token_ids
+
+        if self.logprobs_tensors is not None:
+            self.model_runner_output.logprobs = self.logprobs_tensors.tolists()
+        self.model_runner_output.prompt_logprobs_dict = self.prompt_logprobs_dict
+        return self.model_runner_output
+
+
+@contextmanager
+def async_barrier(event: torch.cuda.Event | None):
+    if event is not None:
+        event.synchronize()
+    try:
+        yield
+    finally:
+        if event is not None:
+            event.record()
Original file line number	Diff line number	Diff line change
`@@ -593,6 +593,9 @@ def _get_workspace_buffer(self):`
`593`	`593`	`)`
`594`	`594`	`return self._workspace_buffer`
`595`	`595`
	`596`	`+ def set_workspace_buffer(self, workspace_buffer: torch.Tensor):`
	`597`	`+ self._workspace_buffer = workspace_buffer`
	`598`	`+`
`596`	`599`	`def _get_prefill_wrapper(`
`597`	`600`	`self,`
`598`	`601`	`) -> BatchPrefillWithPagedKVCacheWrapper \| BatchDCPPrefillWrapper:`