[Ascend]: Fixed the issue where OOT Platform vllm-ascend could not enable SP in Eager mode (vllm-project#28935)

leo-pony · Bofeng BF1 Xue · commit 08f0a99e6367 · 2025-12-05T11:16:24.000+08:00
Signed-off-by: leo-pony <nengjunma@outlook.com>
Signed-off-by: Bofeng BF1 Xue <xuebf1@Lenovo.com>
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
@@ -855,6 +855,13 @@ def post_init_cudagraph_sizes(self) -> None:
         self.compute_bs_to_padded_graph_size()
 
     def set_splitting_ops_for_v1(self):
+        # For compatibility with OOT hardware plugin platforms (for example
+        # vllm-ascend), which currently only support sequence parallelism in eager mode.
+        if self.mode != CompilationMode.VLLM_COMPILE:
+            if self.splitting_ops is None:
+                self.splitting_ops = []
+            return
+
         # NOTE: this function needs to be called only when mode is
         # CompilationMode.VLLM_COMPILE
         assert self.mode == CompilationMode.VLLM_COMPILE, (
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
@@ -797,15 +797,21 @@ def has_blocked_weights():
         ), "MTP with cp_kv_cache_interleave_size > 1 is not supported now."
 
         # Do this after all the updates to compilation_config.mode
-        if self.compilation_config.mode == CompilationMode.VLLM_COMPILE:
-            self.compilation_config.set_splitting_ops_for_v1()
+        self.compilation_config.set_splitting_ops_for_v1()
 
         if self.compilation_config.pass_config.enable_sequence_parallelism:
             # With pipeline parallelism or dynamo partitioning,
             # native rms norm tracing errors due to incorrect residual shape.
             # Use custom rms norm to unblock. In the future,
             # the pass will operate on higher-level IR to avoid the issue.
             # TODO: https://github.com/vllm-project/vllm/issues/27894
+            if self.compilation_config.mode != CompilationMode.VLLM_COMPILE:
+                logger.warning(
+                    "Sequence parallelism is enabled, but running in wrong "
+                    "vllm compile mode: %s.",
+                    self.compilation_config.mode,
+                )
+
             is_fullgraph = (
                 self.compilation_config.use_inductor_graph_partition
                 or len(self.compilation_config.splitting_ops) == 0