vllm-project · LucasWilkinson · Aug 15, 2025 · Jun 25, 2025 · Jun 25, 2025 · Jun 25, 2025
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -308,7 +308,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # Keep building Marlin for 9.0 as there are some group sizes and shapes that
   # are not supported by Machete yet.
   # 9.0 for latest bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;8.9;9.0+PTX" "${CUDA_ARCHS}")
   if (MARLIN_ARCHS)
 
     #
@@ -684,7 +684,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
   list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
   # 9.0 for latest bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;8.9;9.0+PTX" "${CUDA_ARCHS}")
   if (MARLIN_MOE_ARCHS)
 
     #

@@ -563,10 +563,6 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
 
         self._called = True
 
-        if not self.compilation_config.use_cudagraph or \
-            not self.compilation_config.cudagraph_copy_inputs:
-            return self.split_gm
-
         # if we need to copy input buffers for cudagraph
         from torch._guards import detect_fake_mode
         fake_mode = detect_fake_mode()
@@ -585,6 +581,18 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
                 any(is_symbolic(d) for d in x.size())
         ]
 
+        if self.compilation_config.full_cuda_graph:
+            assert self.compilation_config.use_cudagraph, \
+                "full_cuda_graph mode requires use_cudagraph to be True"
+            fullgraph_wrapper = resolve_obj_by_qualname(
+                current_platform.get_fullgraph_wrapper_cls())
+            self.split_gm = fullgraph_wrapper(self.split_gm, self.vllm_config,
+                                              self.graph_pool, self.sym_tensor_indices)
+
+        if not self.compilation_config.use_cudagraph or \
+            not self.compilation_config.cudagraph_copy_inputs:
+            return self.split_gm
+
         # compiler managed cudagraph input buffers
         # we assume the first run with symbolic shapes
         # has the maximum size among all the tensors

@@ -70,3 +70,46 @@ def __call__(self, *args) -> Any:
             or a replayed static graph.
         """
         raise NotImplementedError
+
+
+class AbstractFullgraphWrapper(Protocol):
+    """
+    FullgraphWrapper interface that allows platforms to wrap the piecewise graph
+    so that it can be viewed or captured as a full graph.
+    """
+
+    def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig,
+                 graph_pool: Any, sym_shape_indices: list[int], **kwargs):
+        """
+        Initializes the FullgraphWrapper class with compilation and
+        execution-related configurations.
+
+        Args:
+            graph (fx.GraphModule): The graph represented in fx.
+            vllm_config (VllmConfig): Global configuration for vLLM.
+            graph_pool (Any):
+                Graph memory pool handle, e.g.,
+                    `torch.cuda.graph_pool_handle()`.
+            sym_shape_indices (list[int]):
+                Indices of the input arguments that carry a symbolic shape.
+
+        Keyword Args:
+            kwargs: Additional keyword arguments reserved for future
+                extensions or custom platforms.
+
+        """
+        raise NotImplementedError
+
+    def __call__(self, *args) -> Any:
+        """
+        Executes the wrapped graph for the given input args.
+
+        Args:
+            *args: Variable length input arguments to be passed into the
+                graph. The argument at position `sym_shape_indices[0]` is
+                expected to carry the symbolic shape.
+
+        Returns:
+            Any: Output of the executed wrapped graph.
+        """
+        raise NotImplementedError
@@ -37,6 +37,8 @@
     # during capture, and check if they are the same during replay
     input_addresses: Optional[list[int]] = None
 
+    usage_type: Optional[str] = None
+
 
 class CUDAPiecewiseBackend:
 
@@ -96,6 +98,7 @@
                 runtime_shape=shape,
                 need_to_compile=shape in self.compile_sizes,
                 use_cudagraph=shape in self.cudagraph_capture_sizes,
+                usage_type="piecewise(general)",  # for logging only
             )
 
     def check_for_ending_compilation(self):
@@ -139,27 +142,32 @@
                 self.check_for_ending_compilation()
 
         # Skip CUDA graphs if this entry doesn't use them OR
-        # if we're supposed to skip them globally
-        skip_cuda_graphs = get_forward_context().skip_cuda_graphs
-        if not entry.use_cudagraph or skip_cuda_graphs:
+        # if we're supposed to treat the piecewise graphs as a whole,
+        # which implies forward_context.skip_attention_cuda_graphs is False.
+        # In the latter case, we rely on a wrapper class to capture
+        # the full cudagraph outside the fx graph.
+        skip_attention_cuda_graphs = get_forward_context().skip_attention_cuda_graphs
+        if not entry.use_cudagraph or not skip_attention_cuda_graphs:
             return entry.runnable(*args)
 
         if entry.cudagraph is None:
             if entry.num_finished_warmup < self.compilation_config.cudagraph_num_of_warmups:  # noqa
                 entry.num_finished_warmup += 1
                 if self.is_first_graph:
                     logger.debug(
-                        "Warming up %s/%s for shape %s",
+                        "Warming up %s/%s of %s usage for shape %s",
                         entry.num_finished_warmup,
                         self.compilation_config.cudagraph_num_of_warmups,
+                        entry.usage_type,
                         runtime_shape)
                 return entry.runnable(*args)
 
             if self.is_first_graph:
                 # Since we capture cudagraph for many different shapes and
                 # capturing is fast, we don't need to log it for every shape.
                 # We only log it in the debug mode.
-                logger.debug("Capturing a cudagraph for shape %s",
+                logger.debug("Capturing a cudagraph of %s usage for shape %s",
+                             entry.usage_type,
                              runtime_shape)
 
             input_addresses = [
@@ -216,3 +224,137 @@
 
         entry.cudagraph.replay()
         return entry.output
+
+
+class FullCudagraphWrapper:
+    """Capture and replay the wrapped graph as a whole, one CUDA graph per
+    capture size (plus a separate set for pure-decode batches when
+    `separate_attention_routine` is enabled).
+    """
+
+    def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig,
+                 graph_pool: Any, sym_shape_indices: list[int]):
+        self.graph = graph
+        self.vllm_config = vllm_config
+        self.compilation_config = vllm_config.compilation_config
+        self.graph_pool = graph_pool
+        self.sym_shape_indices = sym_shape_indices
+
+        self.separate_attention_routine = (
+            self.compilation_config.separate_attention_routine)
+
+        self.is_debugging_mode = envs.VLLM_LOGGING_LEVEL == "DEBUG"
+
+        self.first_run_finished = False
+
+        self.cudagraph_capture_sizes: set[int] = set(
+            self.compilation_config.cudagraph_capture_sizes
+        ) if self.compilation_config.use_cudagraph else set()
+
+        self.concrete_size_entries: dict[int, ConcreteSizeEntry] = {}
+        self.concrete_size_entries_decode: dict[int, ConcreteSizeEntry] = {}
+
+        for shape in self.cudagraph_capture_sizes:
+            self.concrete_size_entries[shape] = ConcreteSizeEntry(
+                runtime_shape=shape,
+                need_to_compile=False,
+                use_cudagraph=True,
+                usage_type="general",
+            )
+            if self.separate_attention_routine:
+                self.concrete_size_entries_decode[shape] = ConcreteSizeEntry(
+                    runtime_shape=shape,
+                    need_to_compile=False,
+                    use_cudagraph=True,
+                    usage_type="decode",
+                )
+
+    def __call__(self, *args) -> Any:
+        """Run `graph` on `args`, eagerly or through a captured CUDA graph.
+
+        The runtime shape is read from the first dimension of the argument at
+        `args[sym_shape_indices[0]]`. On capture and replay the returned
+        value holds weak references to the captured outputs.
+        """
+        if not self.first_run_finished:
+            self.first_run_finished = True
+            return self.graph(*args)
+        runtime_shape = args[self.sym_shape_indices[0]].shape[0]
+        forward_context = get_forward_context()
+
+        if forward_context.skip_attention_cuda_graphs:
+            # turn back to piecewise cudagraphs backend, which is responsible
+            # for capturing and running the piecewise cudagraphs.
+            return self.graph(*args)
+
+        # if not skip, the fx graph and its sub-graphs are only supposed to
+        # eagerly run the compiled graphs, which should be cudagraph capturable
+        # as a whole.
+
+        concrete_size_entries = self.concrete_size_entries  # general usage
+        if self.separate_attention_routine and forward_context.is_pure_decoding:
+            concrete_size_entries = self.concrete_size_entries_decode
+
+        if runtime_shape not in concrete_size_entries:
+            # we don't need to do anything for this shape.
+            return self.graph(*args)
+
+        entry = concrete_size_entries[runtime_shape]
+
+        if entry.runnable is None:
+            entry.runnable = self.graph
+
+        if not entry.use_cudagraph:
+            return entry.runnable(*args)
+
+        if entry.cudagraph is None:
+            if entry.num_finished_warmup < self.compilation_config.cudagraph_num_of_warmups:  # noqa
+                entry.num_finished_warmup += 1
+                logger.debug(
+                    "Warming up %s/%s of %s usage for shape %s",
+                    entry.num_finished_warmup,
+                    self.compilation_config.cudagraph_num_of_warmups,
+                    entry.usage_type,
+                    runtime_shape)
+                return entry.runnable(*args)
+
+            # Since we capture cudagraph for many different shapes and
+            # capturing is fast, we only log it in the debug mode.
+            logger.debug("Capturing a cudagraph of %s usage for shape %s",
+                         entry.usage_type, runtime_shape)
+
+            entry.input_addresses = [
+                x.data_ptr() for x in args if isinstance(x, torch.Tensor)
+            ]
+            cudagraph = torch.cuda.CUDAGraph()
+
+            with torch.cuda.graph(cudagraph, pool=self.graph_pool):
+                # `output` is managed by pytorch's cudagraph pool
+                output = entry.runnable(*args)
+                # by converting it to weak ref,
+                # the original `output` will immediately be released
+                # to save memory.
+                output = weak_ref_tensors(output)
+
+            # always keep only a weak ref of the output, to save memory
+            entry.output = weak_ref_tensors(output)
+            entry.cudagraph = cudagraph
+
+            compilation_counter.num_cudagraph_captured += 1
+
+            # note: `output` is already a weak ref at this point, since it
+            # was converted inside the capture block above
+            return output
+
+        if self.is_debugging_mode:
+            # check if the input addresses are the same
+            new_input_addresses = [
+                x.data_ptr() for x in args if isinstance(x, torch.Tensor)
+            ]
+            assert new_input_addresses == entry.input_addresses, (
+                "Input addresses for cudagraphs are different during replay."
+                f" Expected {entry.input_addresses}, got {new_input_addresses}"
+            )
+
+        entry.cudagraph.replay()
+        return entry.output
diff --git a/vllm/config.py b/vllm/config.py
@@ -3974,13 +3974,21 @@
     splitting certain operations such as attention into subgraphs. Thus this
     flag cannot be used together with splitting_ops. This may provide
     performance benefits for smaller models."""
+    separate_attention_routine: bool = False
+    """
+    Enable a distinct attention calls routine under an attention backend for full
+    cuda graph capturing. This is because some attention backends like FlashMLA,
+    FlashInfer, FA2, etc. implement different branches for mix prefill-decode and
+    pure decode cases. This flag enables us to potentially capture the cudagraph
+    separately for each branch.
+    """
 
     pass_config: PassConfig = field(default_factory=PassConfig)
     """Custom inductor passes, see PassConfig for more details"""

    max_capture_size: int = field(default=None, init=False)  # type: ignore
    """not configurable, computed after init"""
    local_cache_dir: str = field(default=None, init=False)  # type: ignore
    """local cache dir for each rank"""
    bs_to_padded_graph_size: list[int] = field(
        default=None,  # type: ignore
@@ -4172,13 +4180,16 @@
 
     def set_splitting_ops_for_v1(self):
         # NOTE: this function needs to be called
-        if self.splitting_ops and self.full_cuda_graph:
-            raise ValueError("full_cuda_graph cannot be used together with "
-                             "splitting_ops, as Full CUDA graph will override "
-                             f"the splitting_ops: {self.splitting_ops}")
-
+        # NOTE: When full_cuda_graph is True, instead of setting an empty
+        # list and capturing the full cudagraph inside the flattened fx graph,
+        # we keep the piecewise fx graph structure but capture the full
+        # cudagraph outside the fx graph. This reduces some cpu overhead when
+        # the runtime batch_size is not cudagraph captured. This is only
+        # supported for separate_attention_routine.
+        if self.separate_attention_routine:
+            assert self.full_cuda_graph, "separate_attention_routine requires full_cuda_graph to be True"
         if not self.splitting_ops:
-            self.splitting_ops = [] if self.full_cuda_graph else [
+            self.splitting_ops = [
                 "vllm.unified_attention",
                 "vllm.unified_attention_with_output",
             ]
@@ -4186,7 +4197,7 @@

 @config
 @dataclass(config=ConfigDict(arbitrary_types_allowed=True))
 class VllmConfig:
    """Dataclass which contains all vllm-related configuration. This
    simplifies passing around the distinct configurations in the codebase.
    """

diff --git a/vllm/forward_context.py b/vllm/forward_context.py
@@ -94,7 +94,11 @@ class ForwardContext:
     virtual_engine: int  # set dynamically for each forward pass
     # set dynamically for each forward pass
     dp_metadata: Optional[DPMetadata] = None
-    skip_cuda_graphs: bool = False
+    # Chooses between piecewise cudagraphs that skip the attention part
+    # (True, the default) and a full cudagraph that includes attention
+    # (False).
+    skip_attention_cuda_graphs: bool = True
+    is_pure_decoding: bool = False
 
 
 _forward_context: Optional[ForwardContext] = None
@@ -115,7 +119,8 @@ def set_forward_context(
     virtual_engine: int = 0,
     num_tokens: Optional[int] = None,
     num_tokens_across_dp: Optional[torch.Tensor] = None,
-    skip_cuda_graphs: bool = False,
+    skip_attention_cuda_graphs: bool = True,
+    is_pure_decoding: bool = False,
 ):
     """A context manager that stores the current forward context,
     can be attention metadata, etc.
@@ -140,7 +145,8 @@ def set_forward_context(
         virtual_engine=virtual_engine,
         attn_metadata=attn_metadata,
         dp_metadata=dp_metadata,
-        skip_cuda_graphs=skip_cuda_graphs,
+        skip_attention_cuda_graphs=skip_attention_cuda_graphs,
+        is_pure_decoding=is_pure_decoding,
     )
 
     try:

diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
@@ -370,6 +370,10 @@ def use_custom_allreduce(cls) -> bool:
     @classmethod
     def get_piecewise_backend_cls(cls) -> str:
         return "vllm.compilation.cuda_piecewise_backend.CUDAPiecewiseBackend"  # noqa
+
+    @classmethod
+    def get_fullgraph_wrapper_cls(cls) -> str:
+        return "vllm.compilation.cuda_piecewise_backend.FullCudagraphWrapper"  # noqa
 
     @classmethod
     def stateless_init_device_torch_dist_pg(

diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
@@ -531,6 +531,14 @@ def get_piecewise_backend_cls(cls) -> str:
         Get piecewise backend class for piecewise graph.
         """
         return "vllm.compilation.base_piecewise_backend.AbstractPiecewiseBackend"  # noqa
+
+    @classmethod
+    def get_fullgraph_wrapper_cls(cls) -> str:
+        """
+        Get the qualified name of the full-graph wrapper class.
+        """
+        return "vllm.compilation.base_piecewise_backend.AbstractFullgraphWrapper"  # noqa
+
 
     @classmethod
     def stateless_init_device_torch_dist_pg(