vllm-project · ProExpertProg · Dec 1, 2025 · Nov 27, 2025
@@ -855,6 +855,13 @@ def post_init_cudagraph_sizes(self) -> None:
         self.compute_bs_to_padded_graph_size()
 
     def set_splitting_ops_for_v1(self):
+        # To compatible with OOT hardware plugin platform (for example vllm-ascend)
+        # which currently only supports sequence parallelism in eager mode.
+        if self.mode != CompilationMode.VLLM_COMPILE:
+            if self.splitting_ops is None:
+                self.splitting_ops = []
+            return
+
         # NOTE: this function needs to be called only when mode is
         # CompilationMode.VLLM_COMPILE
         assert self.mode == CompilationMode.VLLM_COMPILE, (

@@ -835,15 +835,21 @@ def has_blocked_weights():
         ), "MTP with cp_kv_cache_interleave_size > 1 is not supported now."
 
         # Do this after all the updates to compilation_config.mode
-        if self.compilation_config.mode == CompilationMode.VLLM_COMPILE:
-            self.compilation_config.set_splitting_ops_for_v1()
+        self.compilation_config.set_splitting_ops_for_v1()
 
         if self.compilation_config.pass_config.enable_sequence_parallelism:
             # With pipeline parallelism or dynamo partitioning,
             # native rms norm tracing errors due to incorrect residual shape.
             # Use custom rms norm to unblock. In the future,
             # the pass will operate on higher-level IR to avoid the issue.
             # TODO: https://github.com/vllm-project/vllm/issues/27894
+            if self.compilation_config.mode != CompilationMode.VLLM_COMPILE:
+                logger.warning(
+                    "Sequence parallelism is enabled, but running in wrong "
+                    "vllm compile mode: %s.",
+                    self.compilation_config.mode,
+                )
+
             is_fullgraph = (
                 self.compilation_config.use_inductor_graph_partition
                 or len(self.compilation_config.splitting_ops) == 0