Merged

Commits (37 total; the diff below shows changes from 3 commits)
a012257
renaming to piecewise_backend.py
fhl2000 Aug 16, 2025
c37583c
dispatch cascade attention to NONE or PIECEWISE runtime mode;clean up…
fhl2000 Aug 17, 2025
5e998c2
Merge remote-tracking branch 'origin/main' into post_issues_20059
fhl2000 Aug 17, 2025
648fbb3
apply suggestion from bot
fhl2000 Aug 17, 2025
3c88284
Merge branch 'main' into post_issues_20059
fhl2000 Aug 17, 2025
868c85f
fix bug when attn_metadata have no use_cascade
fhl2000 Aug 17, 2025
3bef9e4
simple dispatching test
fhl2000 Aug 17, 2025
05cf012
minor comment tweak
fhl2000 Aug 17, 2025
20d8afb
address comments part1
fhl2000 Aug 22, 2025
9648a84
Merge branch 'main' into post_issues_20059
fhl2000 Aug 23, 2025
da79494
fix comments part2
fhl2000 Aug 23, 2025
01083c3
fix double validation of deprecated flag
fhl2000 Aug 27, 2025
ca3c2c9
Merge branch 'main' into post_issues_20059
fhl2000 Aug 28, 2025
59db6b1
Merge branch 'main' into post_issues_20059
fhl2000 Aug 29, 2025
11421de
Merge branch 'main' into post_issues_20059
fhl2000 Sep 4, 2025
2bf5569
fix pre-commit
fhl2000 Sep 4, 2025
bd1762a
Merge remote-tracking branch 'origin/main' into post_issues_20059
fhl2000 Sep 9, 2025
2a50ecc
Merge branch 'main' into post_issues_20059
fhl2000 Sep 10, 2025
4aef453
Merge branch 'main' into post_issues_20059
fhl2000 Sep 14, 2025
ff3c671
Merge remote-tracking branch 'origin/main' into post_issues_20059
fhl2000 Sep 19, 2025
27eecc2
disable cascade_attn when DBO
fhl2000 Sep 19, 2025
8d3ecc8
Merge branch 'main' into post_issues_20059
fhl2000 Sep 20, 2025
48a8c7f
pre-commit
fhl2000 Sep 20, 2025
f3e08f3
remove piecewise cudagraph wrapper when no needed
fhl2000 Sep 23, 2025
3faff97
simplify set_splitting_ops_for_v1
fhl2000 Sep 23, 2025
f09e47f
resolve merged from main
fhl2000 Sep 23, 2025
b8894f2
Merge branch 'main' into post_issues_20059
fhl2000 Sep 24, 2025
92cbd4f
pre-commit
fhl2000 Sep 24, 2025
df90576
modify comments for full_cuda_graph
fhl2000 Sep 24, 2025
6176761
Merge branch 'main' into post_issues_20059
ProExpertProg Sep 24, 2025
4679802
address comment;add test for splitting_ops
fhl2000 Sep 24, 2025
5475e9e
fix profile_run log
fhl2000 Sep 25, 2025
413079b
temporary disable cascade attention
fhl2000 Sep 25, 2025
254bfd3
Merge branch 'main' into post_issues_20059
fhl2000 Sep 25, 2025
f584663
Merge branch 'main' into post_issues_20059
fhl2000 Sep 25, 2025
d8a1ad7
recover
fhl2000 Sep 25, 2025
891723a
Merge remote-tracking branch 'origin/main' into post_issues_20059
fhl2000 Sep 26, 2025
2 changes: 1 addition & 1 deletion vllm/compilation/backends.py
@@ -338,7 +338,7 @@ def call_module(self, target: torch.fx.node.Target,
runtime_shape=None)
# Lazy import here to avoid circular import
from .cuda_graph import CUDAGraphOptions
from .cuda_piecewise_backend import PiecewiseBackend
from .piecewise_backend import PiecewiseBackend

piecewise_backend = PiecewiseBackend(
submod, self.vllm_config, index,
18 changes: 13 additions & 5 deletions vllm/config/__init__.py
@@ -3626,13 +3626,21 @@ def __post_init__(self):

# final check of cudagraph mode after platform-specific update
if envs.VLLM_USE_V1 and current_platform.is_cuda_alike():
if self.compilation_config.cudagraph_mode == CUDAGraphMode.FULL \
if self.compilation_config.cudagraph_mode.has_full_cudagraphs()\
and self.model_config is not None and \
not self.model_config.disable_cascade_attn:
logger.info("CUDAGraphMode.FULL is not supported with "
"cascade attention currently. Disabling cascade"
"attention.")
self.model_config.disable_cascade_attn = True
warn_msg = ("Cascade attention is not supported with full "
"cudagraphs currently. ")
if self.compilation_config.cudagraph_mode.\
has_piecewise_cudagraphs():
logger.warning_once(
warn_msg + "It will dispatched to "
"piecewise cudagraphs if a batch runs into cascade "
"attentions")
else:
logger.warning_once(
warn_msg + "It will fallback to eager execution if a "
"batch runs into cascade attentions")

if self.compilation_config.cudagraph_mode\
.requires_piecewise_compilation():
33 changes: 29 additions & 4 deletions vllm/config/compilation.py
@@ -62,9 +62,17 @@ def max_cudagraph_mode(self) -> 'CUDAGraphMode':
def has_full_cudagraphs(self) -> bool:
return self.max_cudagraph_mode() == CUDAGraphMode.FULL

def has_piecewise_cudagraphs(self) -> bool:
return self.requires_piecewise_compilation()
Comment on lines +64 to +65
Collaborator:
These two seem semantically different

Contributor Author:
Yeah, but they are equivalent in actuality since we don't allow piecewise mode with empty splitting ops (translated to FULL in this case). So, having piecewise_cudagraph means requiring piecewise compilation, and requiring piecewise compilation implies having piecewise_cudagraph.
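
To make this thread easier to follow, here is a minimal, self-contained sketch showing which modes the shared predicate ends up true for, given the invariant described above (PIECEWISE with empty splitting_ops is promoted to FULL before use). The member values and helper names are assumptions for illustration, not copied from vLLM:

```python
# Simplified stand-in for CUDAGraphMode; values and helpers are assumptions.
import enum


class CUDAGraphModeSketch(enum.Enum):
    NONE = 0
    PIECEWISE = 1
    FULL = 2
    FULL_DECODE_ONLY = (2, 0)    # full graphs for decode, nothing for mixed batches
    FULL_AND_PIECEWISE = (2, 1)  # full graphs for decode, piecewise for mixed batches

    def separate_routine(self) -> bool:
        return isinstance(self.value, tuple)

    def mixed_mode(self) -> "CUDAGraphModeSketch":
        return (CUDAGraphModeSketch(self.value[1])
                if self.separate_routine() else self)

    def requires_piecewise_compilation(self) -> bool:
        # True whenever the mode keeps a PIECEWISE component. Because
        # PIECEWISE + empty splitting_ops is rewritten to FULL before use,
        # this is also exactly "has piecewise cudagraphs".
        return (self == CUDAGraphModeSketch.PIECEWISE
                or self.mixed_mode() == CUDAGraphModeSketch.PIECEWISE)

    def has_piecewise_cudagraphs(self) -> bool:
        # The diff above simply aliases this to requires_piecewise_compilation().
        return self.requires_piecewise_compilation()


for mode in CUDAGraphModeSketch:
    print(mode.name, mode.has_piecewise_cudagraphs())
# NONE False, PIECEWISE True, FULL False,
# FULL_DECODE_ONLY False, FULL_AND_PIECEWISE True
```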


def separate_routine(self) -> bool:
return isinstance(self.value, tuple)

def vaild_runtime_modes(self) -> bool:
return self in [
CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL
]


@config
@dataclass
@@ -544,20 +552,37 @@ def set_splitting_ops_for_v1(self):
# full cudagraph outside the fx graph. This reduces some cpu
# overhead when the runtime batch_size is not cudagraph captured.
# see https://github.com/vllm-project/vllm/pull/20059 for details.
self.splitting_ops = self._attention_ops
if self.pass_config.enable_attn_fusion:
self.splitting_ops = []
if self.cudagraph_mode.has_piecewise_cudagraphs():
logger.warning_once(
"When enable_attn_fusion, splitting_ops will be set "
"to empty list, and cudagraph_mode containing "
"PIECEWISE will be treated as FULL cudagraph_mode. "
"Please ensure you are using attention backends that "
"support cudagraph or set cudagraph_mode to NONE "
Collaborator:
This is confusing, we should clarify that we disable piecewise not that piecewise is handled as full. Also I think we should do full_and_piecewise->full and piecewise->none, not piecewise->full.

Contributor Author:
We have two places in this function doing the piecewise->full mapping. The first case is for attn_ops fusion (splitting_ops=[]), so it must be FULL mode there. The second is when users explicitly set splitting_ops=[]. I agree that for this case it is more reasonable to do full_and_piecewise->full and piecewise->none

Collaborator:
Yeah for the attn fusion case let's just explicitly update cg mode there.

"explicitly if encountering any problems.")
self.cudagraph_mode = CUDAGraphMode.FULL
else:
self.splitting_ops = self._attention_ops
elif len(self.splitting_ops) == 0:
logger.warning_once("Using piecewise compilation with empty "
"splitting_ops.")
if self.cudagraph_mode == CUDAGraphMode.PIECEWISE:
if self.cudagraph_mode.has_piecewise_cudagraphs():
logger.warning_once(
"When compilation level is piecewise with empty "
"splitting_ops, PIECEWISE cudagraph_mode will be "
"treated as FULL cudagraph_mode. Please ensure you are "
"splitting_ops, cudagraph_mode containing PIECEWISE will "
"be treated as FULL cudagraph_mode. Please ensure you are "
"using attention backends that support cudagraph or set "
"cudagraph_mode to NONE explicitly if encountering "
"any problems.")
self.cudagraph_mode = CUDAGraphMode.FULL
self.splitting_ops = []
else: # len(self.splitting_ops) > 0:
assert not self.pass_config.enable_attn_fusion or \
not self.splitting_ops_contain_attention(), (
"attention ops should not be in splitting_ops "
"when enable_attn_fusion is True")
Collaborator:
This logic has become insane. We should think about how to simplify it but first, we should:

  1. Add a test that checks all supported scenarios (as well as log messages and errors)
  2. Extract this into a function

Contributor Author:
Have simplified a bit.
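
Since the resolution logic is the sticking point in this thread, the hypothetical helper below sketches the behavior the reviewers converge on here: set FULL explicitly for the attn-fusion case, and downgrade FULL_AND_PIECEWISE to FULL / PIECEWISE to NONE when the user forces empty splitting_ops. The class, method, and string modes are made up for illustration; this is not the merged code.

```python
# Hypothetical resolution helper; string modes and names are illustrative only.
from dataclasses import dataclass
from typing import Optional


@dataclass
class CompilationCfgSketch:
    cudagraph_mode: str = "FULL_AND_PIECEWISE"
    splitting_ops: Optional[list] = None
    enable_attn_fusion: bool = False

    # Placeholder list of attention splitting ops.
    ATTENTION_OPS = ("vllm.unified_attention", "vllm.unified_attention_with_output")

    def resolve_splitting_ops(self) -> None:
        if self.splitting_ops is None:
            if self.enable_attn_fusion:
                # Attention is fused away, so there is nothing to split on:
                # disable piecewise explicitly and run FULL cudagraphs.
                self.splitting_ops = []
                self.cudagraph_mode = "FULL"
            else:
                self.splitting_ops = list(self.ATTENTION_OPS)
        elif not self.splitting_ops:
            # User explicitly asked for no splitting: drop the piecewise half
            # rather than silently treating PIECEWISE as FULL.
            if self.cudagraph_mode == "FULL_AND_PIECEWISE":
                self.cudagraph_mode = "FULL"
            elif self.cudagraph_mode == "PIECEWISE":
                self.cudagraph_mode = "NONE"


cfg = CompilationCfgSketch(cudagraph_mode="PIECEWISE", splitting_ops=[])
cfg.resolve_splitting_ops()
print(cfg.cudagraph_mode)  # NONE
```

Note that the merged diff above still maps the empty-splitting-ops cases to FULL; the sketch only illustrates the alternative discussed in this thread.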


def splitting_ops_contain_attention(self) -> bool:
return self.splitting_ops is not None and all(
3 changes: 1 addition & 2 deletions vllm/forward_context.py
@@ -179,8 +179,7 @@ class ForwardContext:
batch_descriptor: Optional[BatchDescriptor] = None

def __post_init__(self):
assert self.cudagraph_runtime_mode in [
CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL], \
assert self.cudagraph_runtime_mode.vaild_runtime_modes(), \
f"Invalid cudagraph runtime mode: {self.cudagraph_runtime_mode}"


44 changes: 23 additions & 21 deletions vllm/v1/cudagraph_dispatcher.py
@@ -11,7 +11,8 @@

class CudagraphDispatcher:
"""
Runtime cudagraph dispatcher to dispach keys for multiple set of cudagraphs.
Runtime cudagraph dispatcher to dispatch keys for multiple sets of
cudagraphs.

The dispatcher stores two sets of dispatch keys, one for PIECEWISE and one
for FULL cudagraph runtime mode. The keys are initialized depending on
@@ -21,10 +22,10 @@ class CudagraphDispatcher:

At runtime, the dispatch method generates the runtime cudagraph mode (FULL,
PIECEWISE, or NONE for no cudagraph) and the valid key (batch descriptor)
based on the input key. After dispatching (commuicate via forward context),
the cudagraph wrappers will trust the dispatch key to do either capturing
or replaying (if mode matched), or pass through to the underlying runnable
without cudagraph (if mode no match or mode is NONE).
based on the input key. After dispatching (communicated via forward
context), the cudagraph wrappers will trust the dispatch key to either
capture or replay (if the mode matches), or pass through to the underlying
runnable without cudagraph (if the mode does not match or mode is NONE).
"""

def __init__(self, vllm_config: VllmConfig):
@@ -52,19 +53,15 @@ def __init__(self, vllm_config: VllmConfig):
def add_cudagraph_key(self, runtime_mode: CUDAGraphMode,
batch_descriptor: BatchDescriptor):
assert runtime_mode in [CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL], \
f"Invalid cudagraph runtime mode: {runtime_mode}"
f"Invalid cudagraph runtime mode for keys: {runtime_mode}"
self.cudagraph_keys[runtime_mode].add(batch_descriptor)

def initialize_cudagraph_keys(self, cudagraph_mode: CUDAGraphMode,
uniform_decode_query_len: int):
# This should be called only after attention backend is initialized.

# Note: we create all valid keys possible for cudagraph but do not
# guarantee all keys would be used. For example, we create keys for
# piecewise cudagraphs when it is piecewise compilation, which is always
# valid, but for attention backend support unified routine, we may not
# trigger capturing/replaying the piecewise cudagraphs depending on
# CompilationConfig.cudagraph_mode. In addition, if we allow lazy
# Note: we create all valid keys for cudagraph here but do not
# guarantee all keys would be used. For example, if we allow lazy
# capturing in future PR, some keys may never be triggered.
if cudagraph_mode.mixed_mode() != CUDAGraphMode.NONE:
for bs in self.compilation_config.cudagraph_capture_sizes:
@@ -89,10 +86,13 @@ def initialize_cudagraph_keys(self, cudagraph_mode: CUDAGraphMode,
self.keys_initialized = True

def dispatch(
self, batch_descriptor: BatchDescriptor
self,
batch_descriptor: BatchDescriptor,
use_cascade_attn: bool = False
) -> tuple[CUDAGraphMode, Optional[BatchDescriptor]]:
"""
Given a batch descriptor, dispatch to a cudagraph mode.
Given conditions(e.g.,batch descriptor and if using cascade attention),
dispatch to a cudagraph runtime mode and the valid batch descriptor.
A new batch descriptor is returned as we might dispatch a uniform batch
to a graph that supports a more general batch (uniform to non-uniform).
"""
@@ -102,14 +102,16 @@ def dispatch(
"initialized. No cudagraph will be used.")
return CUDAGraphMode.NONE, None

# check if key exists for full cudagraph
if batch_descriptor in self.cudagraph_keys[CUDAGraphMode.FULL]:
return CUDAGraphMode.FULL, batch_descriptor

# otherwise, check if non-uniform key exists
non_uniform_key = batch_descriptor.non_uniform
if non_uniform_key in self.cudagraph_keys[CUDAGraphMode.FULL]:
return CUDAGraphMode.FULL, non_uniform_key
# if a batch use cascade attention, bypass checking full cudagraphs
if not use_cascade_attn:
# check if key exists for full cudagraph
if batch_descriptor in self.cudagraph_keys[CUDAGraphMode.FULL]:
return CUDAGraphMode.FULL, batch_descriptor

# otherwise, check if non-uniform key exists
if non_uniform_key in self.cudagraph_keys[CUDAGraphMode.FULL]:
return CUDAGraphMode.FULL, non_uniform_key

# also check if non-uniform key exists for more "general"
# piecewise cudagraph
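For readers skimming the diff, here is a hedged sketch of the dispatch priority the hunk above implements once cascade attention is taken into account. The helper and key types are simplified stand-ins, not the CudagraphDispatcher API:

```python
# Simplified stand-in for CudagraphDispatcher.dispatch(); the real code keys on
# BatchDescriptor objects and returns (CUDAGraphMode, Optional[BatchDescriptor]).
from typing import Optional

NONE, PIECEWISE, FULL = "NONE", "PIECEWISE", "FULL"


def dispatch_sketch(
    keys: dict,                      # mode -> set of captured (num_tokens, uniform) keys
    descriptor: tuple,               # e.g. (8, True) for a uniform-decode batch of 8 tokens
    use_cascade_attn: bool = False,
) -> tuple[str, Optional[tuple]]:
    num_tokens, _ = descriptor
    non_uniform = (num_tokens, False)
    # Cascade attention is not supported inside full cudagraphs, so skip them.
    if not use_cascade_attn:
        if descriptor in keys[FULL]:
            return FULL, descriptor
        if non_uniform in keys[FULL]:
            return FULL, non_uniform
    # Otherwise try the more general piecewise graph for this token count.
    if non_uniform in keys[PIECEWISE]:
        return PIECEWISE, non_uniform
    # Nothing captured for this shape (or keys not initialized): run eagerly.
    return NONE, None


keys = {FULL: {(8, True)}, PIECEWISE: {(8, False)}}
print(dispatch_sketch(keys, (8, True)))                         # ('FULL', (8, True))
print(dispatch_sketch(keys, (8, True), use_cascade_attn=True))  # ('PIECEWISE', (8, False))
```

This mirrors the warning added in vllm/config/__init__.py: with a piecewise-capable mode, cascade-attention batches fall through to piecewise cudagraphs; otherwise they fall back to eager execution.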
33 changes: 22 additions & 11 deletions vllm/v1/worker/gpu_model_runner.py
@@ -689,11 +689,13 @@ def _prepare_inputs(
self,
scheduler_output: "SchedulerOutput",
) -> tuple[dict[str, Any], torch.Tensor, Optional[SpecDecodeMetadata],
np.ndarray, Optional[CommonAttentionMetadata], int]:
np.ndarray, Optional[CommonAttentionMetadata], int, bool]:
"""
:return: tuple[
attn_metadata: layer-to-attention_metadata mapping,
logits_indices, spec_decode_metadata
logits_indices, spec_decode_metadata,
num_scheduled_tokens, spec_decode_common_attn_metadata,
max_num_scheduled_tokens, use_cascade_attn
]
"""
total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
@@ -840,6 +842,7 @@ def _prepare_inputs(
)

attn_metadata: dict[str, Any] = {}
use_cascade_attn = False

# Prepare encoder attention metadata separately
# (encoder layers are not in KV cache groups)
@@ -908,6 +911,8 @@ def _prepare_inputs(
common_attn_metadata=common_attn_metadata,
))

use_cascade_attn |= attn_metadata_i.use_cascade

fast_prefill_metadata = attn_metadata_i
if (self.cache_config.kv_sharing_fast_prefill
and self.kv_sharing_fast_prefill_eligible_layers):
@@ -938,7 +943,7 @@ def _prepare_inputs(

return (attn_metadata, logits_indices, spec_decode_metadata,
num_scheduled_tokens, spec_decode_common_attn_metadata,
max_num_scheduled_tokens)
max_num_scheduled_tokens, use_cascade_attn)

def _compute_cascade_attn_prefix_len(
self,
@@ -1517,7 +1522,8 @@ def execute_model(
# Prepare the decoder inputs.
(attn_metadata, logits_indices, spec_decode_metadata,
num_scheduled_tokens_np, spec_decode_common_attn_metadata,
max_query_len) = (self._prepare_inputs(scheduler_output))
max_query_len,
use_cascade_attn) = (self._prepare_inputs(scheduler_output))

num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
@@ -1593,7 +1599,8 @@ def execute_model(
batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens,
uniform_decode=uniform_decode)
cudagraph_runtime_mode, batch_descriptor = \
self.cudagraph_dispatcher.dispatch(batch_descriptor)
self.cudagraph_dispatcher.dispatch(batch_descriptor,
use_cascade_attn)

# Run the model.
# Use persistent buffers for CUDA graphs.
@@ -2253,9 +2260,7 @@ def _dummy_run(
skip_eplb: If True, skip EPLB state update.
is_profile: If True, this is a profile run.
"""
assert cudagraph_runtime_mode in {
CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL
}
assert cudagraph_runtime_mode.vaild_runtime_modes()

# Padding for DP
num_pad, num_tokens_across_dp = self.get_dp_padding(num_tokens)
@@ -2709,9 +2714,9 @@ def freeze_gc():
def _capture_cudagraphs(self, compilation_cases: list[int],
cudagraph_runtime_mode: CUDAGraphMode,
uniform_decode: bool):
assert cudagraph_runtime_mode != CUDAGraphMode.NONE and \
cudagraph_runtime_mode in [CUDAGraphMode.FULL,
CUDAGraphMode.PIECEWISE]
assert cudagraph_runtime_mode in [CUDAGraphMode.FULL,
CUDAGraphMode.PIECEWISE],\
f"Invalid cudagraph runtime mode: {cudagraph_runtime_mode}"

# Only rank 0 should print progress bar during capture
if is_global_first_rank():
@@ -2853,6 +2858,12 @@ def create_attn_groups(
self.is_encoder_only_model = True

def initialize_cudagraph_capture(self) -> None:
"""
Resolve the cudagraph_mode when there are multiple
attention backends with conflicting CUDA graph support.
Initialize the cudagraph_dispatcher based on the resolved
cudagraph_mode.
"""
min_cg_support = AttentionCGSupport.ALWAYS
min_cg_builder_name = None
