[V1][CUDA Graph] Fix attention metadata tensor sizes for padded batches #24002
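
In this change, `CudagraphDispatcher` stores its dispatch keys in dictionaries keyed by `(num_tokens, uniform_decode)` and records, per captured graph, the number of requests (`num_reqs`) the graph's attention metadata was sized for. Dispatch then selects a compatible captured graph instead of requiring an exact `BatchDescriptor` match, so padded batches no longer run against undersized attention metadata tensors.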
The diff is against `vllm/v1/cudagraph_dispatcher.py`. The first hunk adds the `ceil` import and a `CUDAGraphKey` type alias:

```diff
@@ -1,9 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from math import ceil
 from typing import Optional

+from typing_extensions import TypeAlias
+
 from vllm.config import CUDAGraphMode, VllmConfig
 from vllm.forward_context import BatchDescriptor
 from vllm.logger import init_logger

 logger = init_logger(__name__)
+CUDAGraphKey: TypeAlias = tuple[int, bool]


 class CudagraphDispatcher:
```
The dispatch-key storage changes from a set of `BatchDescriptor`s to a dict keyed by `(num_tokens, uniform_decode)`:

```diff
@@ -31,9 +38,11 @@ def __init__(self, vllm_config: VllmConfig):
         self.cudagraph_mode = self.compilation_config.cudagraph_mode

         # Dict to store valid cudagraph dispatching keys.
-        self.cudagraph_keys: dict[CUDAGraphMode, set[BatchDescriptor]] = {
-            CUDAGraphMode.PIECEWISE: set(),
-            CUDAGraphMode.FULL: set(),
+        self.cudagraph_keys: dict[
+            CUDAGraphMode, dict[CUDAGraphKey, BatchDescriptor]
+        ] = {
+            CUDAGraphMode.PIECEWISE: {},
+            CUDAGraphMode.FULL: {},
         }

         not_use_piecewise_compilation = (
```
`add_cudagraph_key` now indexes each descriptor under that key:

```diff
@@ -61,7 +70,8 @@ def add_cudagraph_key(
         assert runtime_mode in [CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL], (
             f"Invalid cudagraph runtime mode for keys: {runtime_mode}"
         )
-        self.cudagraph_keys[runtime_mode].add(batch_descriptor)
+        key = (batch_descriptor.num_tokens, batch_descriptor.uniform_decode)
+        self.cudagraph_keys[runtime_mode][key] = batch_descriptor

     def initialize_cudagraph_keys(
         self, cudagraph_mode: CUDAGraphMode, uniform_decode_query_len: int
```
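A minimal standalone sketch of the keyed-lookup idea behind the new `dict[CUDAGraphKey, BatchDescriptor]` storage (illustrative only, not vLLM code; `Descriptor` and `keys` are made-up stand-ins):

```python
from typing import NamedTuple, Optional

# Hypothetical stand-in for vLLM's BatchDescriptor, for illustration only.
class Descriptor(NamedTuple):
    num_tokens: int
    uniform_decode: bool
    num_reqs: Optional[int] = None

# Keys are (num_tokens, uniform_decode); values keep the full descriptor,
# including num_reqs, which a plain set of descriptors could not retrieve
# without an exact match on every field.
keys: dict[tuple[int, bool], Descriptor] = {}

d = Descriptor(num_tokens=512, uniform_decode=False, num_reqs=256)
keys[(d.num_tokens, d.uniform_decode)] = d

# Lookup ignores num_reqs, so a padded batch with a different request count
# still finds the captured entry and can then check compatibility.
print(keys.get((512, False)))
# Descriptor(num_tokens=512, uniform_decode=False, num_reqs=256)
```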
Mixed-mode keys now carry a `num_reqs` bound when captured as FULL graphs:

```diff
@@ -71,11 +81,24 @@ def initialize_cudagraph_keys(
         # Note: we create all valid keys for cudagraph here but do not
         # guarantee all keys would be used. For example, if we allow lazy
         # capturing in future PR, some keys may never be triggered.
-        if cudagraph_mode.mixed_mode() != CUDAGraphMode.NONE:
+        # Add mixed mode keys with proper num_reqs calculation
+        if (mixed_mode := cudagraph_mode.mixed_mode()) in (
+            CUDAGraphMode.PIECEWISE,
+            CUDAGraphMode.FULL,
+        ):
             for bs in self.compilation_config.cudagraph_capture_sizes:
+                num_reqs = (
+                    self.calculate_num_reqs_for_tokens(
+                        bs, uniform_decode_query_len, False
+                    )
+                    if mixed_mode == CUDAGraphMode.FULL
+                    else None
+                )
                 self.add_cudagraph_key(
-                    cudagraph_mode.mixed_mode(),
-                    BatchDescriptor(num_tokens=bs, uniform_decode=False),
+                    mixed_mode,
+                    BatchDescriptor(
+                        num_tokens=bs, uniform_decode=False, num_reqs=num_reqs
+                    ),
                 )

         # if decode cudagraph mode is FULL, and we don't already have mixed
```
Uniform-decode keys get the same treatment, and two helpers are added, `calculate_num_reqs_for_tokens` and `_is_compatible`:

```diff
@@ -94,12 +117,38 @@ def initialize_cudagraph_keys(
                 if x <= max_num_tokens and x >= uniform_decode_query_len
             ]
             for bs in cudagraph_capture_sizes_for_decode:
+                num_reqs = self.calculate_num_reqs_for_tokens(
+                    bs, uniform_decode_query_len, True
+                )
                 self.add_cudagraph_key(
                     CUDAGraphMode.FULL,
-                    BatchDescriptor(num_tokens=bs, uniform_decode=True),
+                    BatchDescriptor(
+                        num_tokens=bs, uniform_decode=True, num_reqs=num_reqs
+                    ),
                 )

         self.keys_initialized = True

+    def calculate_num_reqs_for_tokens(
+        self, num_tokens: int, uniform_decode_query_len: int, uniform_decode: bool
+    ) -> int:
+        max_num_seqs = self.vllm_config.scheduler_config.max_num_seqs
+
+        if uniform_decode:
+            num_reqs = ceil(num_tokens / uniform_decode_query_len)
+            return min(num_reqs, max_num_seqs)
+        else:
+            return min(num_tokens, max_num_seqs)
+
+    def _is_compatible(
+        self, batch_descriptor: BatchDescriptor, candidate: BatchDescriptor
+    ) -> bool:
+        """Check if candidate cudagraph can handle the batch request."""
+        if candidate.num_reqs is None:
+            return True
+        assert batch_descriptor.num_reqs is not None
+        return candidate.num_reqs >= batch_descriptor.num_reqs
+
     def dispatch(
         self, batch_descriptor: BatchDescriptor, use_cascade_attn: bool = False
     ) -> tuple[CUDAGraphMode, Optional[BatchDescriptor]]:
```
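A worked example of the `num_reqs` math, replicated standalone (the concrete numbers are illustrative, not from the PR): for a uniform-decode graph, every request contributes `uniform_decode_query_len` tokens, so the request count is the token count divided by that length, capped by the scheduler's `max_num_seqs`; for mixed batches the worst case is one token per request.

```python
from math import ceil

def calc_num_reqs(num_tokens: int, uniform_decode_query_len: int,
                  uniform_decode: bool, max_num_seqs: int) -> int:
    # Mirrors calculate_num_reqs_for_tokens above, with max_num_seqs
    # passed in explicitly so the sketch is self-contained.
    if uniform_decode:
        return min(ceil(num_tokens / uniform_decode_query_len), max_num_seqs)
    return min(num_tokens, max_num_seqs)

# Uniform decode with a query length of 4 (e.g. spec decode):
# 512 tokens -> ceil(512 / 4) = 128 requests.
assert calc_num_reqs(512, 4, True, max_num_seqs=256) == 128
# Mixed prefill/decode: worst case is one token per request, capped at 256.
assert calc_num_reqs(512, 4, False, max_num_seqs=256) == 256
```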
`dispatch` replaces the exact-match lookups with an ordered candidate search that also checks request-count compatibility (a review thread from LucasWilkinson on this hunk was marked resolved and is now outdated):

```diff
@@ -113,21 +162,19 @@ def dispatch(
         if not self.keys_initialized:
             return CUDAGraphMode.NONE, None

-        non_uniform_key = batch_descriptor.non_uniform
+        num_tokens, uniform_decode = (
+            batch_descriptor.num_tokens,
+            batch_descriptor.uniform_decode,
+        )
+
+        candidates = []
         # if a batch use cascade attention, bypass checking full cudagraphs
         if not use_cascade_attn:
-            # check if key exists for full cudagraph
-            if batch_descriptor in self.cudagraph_keys[CUDAGraphMode.FULL]:
-                return CUDAGraphMode.FULL, batch_descriptor
-
-            # otherwise, check if non-uniform key exists
-            if non_uniform_key in self.cudagraph_keys[CUDAGraphMode.FULL]:
-                return CUDAGraphMode.FULL, non_uniform_key
+            candidates = [(CUDAGraphMode.FULL, (num_tokens, uniform_decode))]
+            if uniform_decode:
+                candidates.append((CUDAGraphMode.FULL, (num_tokens, False)))
+        candidates.append((CUDAGraphMode.PIECEWISE, (num_tokens, False)))

-        # also check if non-uniform key exists for more "general"
-        # piecewise cudagraph
-        if non_uniform_key in self.cudagraph_keys[CUDAGraphMode.PIECEWISE]:
-            return CUDAGraphMode.PIECEWISE, non_uniform_key
+        for mode, key in candidates:
+            candidate = self.cudagraph_keys[mode].get(key)
+            if candidate and self._is_compatible(batch_descriptor, candidate):
+                return mode, candidate

         # finally, just return no cudagraphs
         return CUDAGraphMode.NONE, None
```
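The candidate list encodes a fallback chain. A standalone sketch of the same idea (illustrative, not vLLM code; the tables and numbers are made up): try the exact FULL key, then, for uniform-decode batches, the non-uniform FULL key, then the PIECEWISE key, accepting a candidate only if it was captured for at least as many requests as the batch needs.

```python
# Hypothetical key tables mirroring the dispatcher's dicts; entries map
# (num_tokens, uniform_decode) -> num_reqs the graph was captured with
# (None meaning "no request-count constraint", as for piecewise graphs).
full = {(512, True): 128, (512, False): 256}
piecewise = {(512, False): None}

def dispatch(num_tokens: int, uniform_decode: bool, batch_num_reqs: int):
    candidates = [("FULL", full, (num_tokens, uniform_decode))]
    if uniform_decode:
        candidates.append(("FULL", full, (num_tokens, False)))
    candidates.append(("PIECEWISE", piecewise, (num_tokens, False)))
    for mode, table, key in candidates:
        if key in table:
            cap = table[key]
            if cap is None or cap >= batch_num_reqs:
                return mode, key
    return "NONE", None

print(dispatch(512, True, 128))   # ('FULL', (512, True))
print(dispatch(512, True, 200))   # uniform graph too small -> ('FULL', (512, False))
print(dispatch(512, False, 300))  # FULL capped at 256 -> ('PIECEWISE', (512, False))
```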
**Review comment:** Can we make this a dataclass, with utilities for easier and more understandable handling, as well as more docs?
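One possible reading of that suggestion, sketched under assumptions (the class name, field set, and helper names below are hypothetical, not part of the PR):

```python
from dataclasses import dataclass
from typing import Optional

@dataclass(frozen=True)
class CudagraphCandidate:
    """A captured cudagraph's shape: token count, decode uniformity, and the
    request count its attention metadata was sized for (None = no limit)."""
    num_tokens: int
    uniform_decode: bool
    num_reqs: Optional[int] = None

    @property
    def key(self) -> tuple[int, bool]:
        # The dispatch key used for dictionary lookup.
        return (self.num_tokens, self.uniform_decode)

    def can_handle(self, batch_num_reqs: int) -> bool:
        # Same check as _is_compatible, expressed as a method.
        return self.num_reqs is None or self.num_reqs >= batch_num_reqs
```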