
Commit 07a3814

[main] Support MTP shape with ACLgraph
Signed-off-by: lilinsiman <[email protected]>
1 parent 755b635 commit 07a3814

File tree

vllm_ascend/utils.py
vllm_ascend/worker/model_runner_v1.py

2 files changed: +62, -12


vllm_ascend/utils.py

Lines changed: 62 additions & 2 deletions
@@ -452,6 +452,55 @@ def update_default_aclgraph_sizes(vllm_config: VllmConfig) -> None:
     update_cudagraph_capture_sizes(vllm_config,
                                    new_cudagraph_capture_sizes)
 
+    # Modify the default capture_sizes for the num_speculative_tokens >= 1 scenario.
+    # When MTP is combined with Full Graph, the FIA operator must pad its input to
+    # match its actual_seq_lengths parameter. The padding expands each request to the
+    # maximum request count under MTP, so the input shape must be a multiple of the
+    # MTP layer count (k+1). Assuming k=2, capture_sizes = [3, 6, 9, 12, 15, 18, ...].
+    # Consequently, the default captured graph shapes of Full Graph must be modified
+    # to accommodate this requirement of the FIA operator.
+    # TODO: The initialization of capture shapes for the MTP-adapted FIA full graph
+    # belongs in the vLLM community, so this section will be removed once it has
+    # been migrated there.
+    from vllm.config.compilation import CUDAGraphMode
+    aclgraph_mode = vllm_config.compilation_config.cudagraph_mode
+    if vllm_config.speculative_config is not None and \
+            aclgraph_mode == CUDAGraphMode.FULL_DECODE_ONLY:
+        num_speculative_tokens = vllm_config.speculative_config.num_speculative_tokens
+        max_num_seqs = vllm_config.scheduler_config.max_num_seqs
+        target_sizes = (num_speculative_tokens + 1) * max_num_seqs
+        original_sizes, vllm_config.compilation_config.cudagraph_capture_sizes = \
+            vllm_config.compilation_config.cudagraph_capture_sizes, None
+        assert len(original_sizes) > 0
+        assert max_num_seqs > 0
+        assert num_speculative_tokens > 0
+        if num_speculative_tokens > 1:
+            if original_sizes[0] < (num_speculative_tokens + 1) * max_num_seqs:
+                new_original_sizes = sorted(
+                    set(
+                        list(range(1, min(10, max_num_seqs + 1), 2)) +
+                        list(range(8, max_num_seqs + 1, 4))))
+                enlarged_sizes = [(num_speculative_tokens + 1) * sizes
+                                  for sizes in new_original_sizes]
+                if enlarged_sizes[-1] < target_sizes:
+                    enlarged_sizes.append(target_sizes)
+                update_cudagraph_capture_sizes(vllm_config, enlarged_sizes)
+                logger.info(
+                    "Adjusted ACL full graphs: %s → %s for speculative decoding",
+                    original_sizes, enlarged_sizes)
+            else:
+                vllm_config.compilation_config.cudagraph_capture_sizes = original_sizes
+        if num_speculative_tokens == 1:
+            padding_sizes = original_sizes.copy()
+            if padding_sizes[-1] < target_sizes:
+                padding_sizes.append(target_sizes)
+                update_cudagraph_capture_sizes(vllm_config, padding_sizes)
+                logger.info(
+                    "Adjusted ACL full graphs: %s → %s for speculative decoding",
+                    original_sizes, padding_sizes)
+            else:
+                vllm_config.compilation_config.cudagraph_capture_sizes = original_sizes
 
 def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
     """Update ACL graph capture sizes based on hardware limitations"""
@@ -571,13 +620,24 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
         max_num_seqs = vllm_config.scheduler_config.max_num_seqs
         original_sizes, compilation_config.cudagraph_capture_sizes = \
             compilation_config.cudagraph_capture_sizes, None
+        new_original_sizes = sorted(
+            set(
+                list(range(1, min(10, max_num_seqs + 1), 2)) +
+                list(range(8, max_num_seqs + 1, 4))))
+        step = (len(new_original_sizes) - 1) / (max_num_batch_sizes - 1)
+        indices = [round(i * step) for i in range(max_num_batch_sizes)]
+        indices[0], indices[-1] = 0, len(new_original_sizes) - 1
+        new_sampled_sizes = [new_original_sizes[i] for i in indices]
+        target_sizes = (num_speculative_tokens + 1) * max_num_seqs
         assert len(original_sizes) > 0
         if original_sizes[0] < (num_speculative_tokens + 1) * max_num_seqs:
             enlarged_sizes = [(num_speculative_tokens + 1) * size
-                              for size in original_sizes]
+                              for size in new_sampled_sizes]
+            if enlarged_sizes[-1] < target_sizes:
+                enlarged_sizes[-1] = target_sizes
             update_cudagraph_capture_sizes(vllm_config, enlarged_sizes)
             logger.info(
-                "Adjusted ACL graphs: %s → %s for speculative decoding",
+                "Adjusted PieceWise ACL graphs: %s → %s for speculative decoding",
                 original_sizes, enlarged_sizes)
         else:
             compilation_config.cudagraph_capture_sizes = original_sizes
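
The piecewise path above thins the candidate list down to max_num_batch_sizes entries by sampling indices at an even stride before scaling them. A short sketch of just that sampling step (sample_evenly is a hypothetical name; the commit computes it inline):

# Hypothetical helper mirroring the even-stride index sampling above.
def sample_evenly(candidates: list[int], max_num_batch_sizes: int) -> list[int]:
    # Guard added for the sketch; the commit assumes the list is long enough.
    if len(candidates) <= max_num_batch_sizes:
        return candidates
    # Fractional stride across the candidate indices.
    step = (len(candidates) - 1) / (max_num_batch_sizes - 1)
    indices = [round(i * step) for i in range(max_num_batch_sizes)]
    # Pin the endpoints so the smallest and largest candidates survive rounding.
    indices[0], indices[-1] = 0, len(candidates) - 1
    return [candidates[i] for i in indices]

candidates = [1, 3, 5, 7, 8, 9, 12, 16, 20, 24, 28, 32]
print(sample_evenly(candidates, 5))  # [1, 7, 12, 20, 32]

Each sampled size is then multiplied by (num_speculative_tokens + 1), and the last entry is raised to target_sizes when it falls short, so the largest piecewise graph still covers a full MTP batch.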

vllm_ascend/worker/model_runner_v1.py

Lines changed: 0 additions & 10 deletions
@@ -4010,16 +4010,6 @@ def _capture_model(self):
             and x >= self.uniform_decode_query_len
         ]
         compilation_cases_decode = sorted(decode_cudagraph_batch_sizes)
-        # TODO: refactor this when vLLM supports mtp>1
-        if not all(x % self.uniform_decode_query_len == 0
-                   for x in decode_cudagraph_batch_sizes):
-            raise ValueError(
-                "In the MTP fullgraph scenario, each graph size must be an integer multiple of "
-                f"(num_speculative_tokens + 1): {self.uniform_decode_query_len}. "
-                f"Please modify the cudagraph_capture_sizes variable to be integer multiple of {self.uniform_decode_query_len}, "
-                f"while ensuring the maximum cudagraph_capture_sizes does not exceed max_num_seqs * (num_speculative_tokens + 1): {max_num_tokens}. "
-                "For example, with MTP=2 and max_num_seqs=16, we recommend setting cudagraph_capture_sizes to [48]."
-            )
         self._capture_aclgraphs(
             compilation_cases=compilation_cases_decode,
             aclgraph_runtime_mode=CUDAGraphMode.FULL,
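
The deleted ValueError appears to be redundant now that vllm_ascend/utils.py generates full-graph capture sizes as multiples of (num_speculative_tokens + 1) by construction. A quick check of that invariant, reusing the k=2, max_num_seqs=16 sizes from the earlier sketch (all names here are illustrative):

# The invariant the removed check used to enforce at capture time.
num_speculative_tokens = 2                      # MTP depth k
uniform_decode_query_len = num_speculative_tokens + 1
max_num_seqs = 16
capture_sizes = [3, 9, 15, 21, 24, 27, 36, 48]  # from the earlier sketch

# Every decode graph size is a multiple of (k + 1), and the largest does not
# exceed max_num_seqs * (k + 1), which is what the removed error message asked for.
assert all(s % uniform_decode_query_len == 0 for s in capture_sizes)
assert capture_sizes[-1] <= max_num_seqs * uniform_decode_query_len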
