diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index da2c100dae3d..0f876c38169a 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -855,6 +855,13 @@ def post_init_cudagraph_sizes(self) -> None:
         self.compute_bs_to_padded_graph_size()

     def set_splitting_ops_for_v1(self):
+        # To be compatible with OOT hardware plugin platforms (e.g. vllm-ascend),
+        # which currently only support sequence parallelism in eager mode.
+        if self.mode != CompilationMode.VLLM_COMPILE:
+            if self.splitting_ops is None:
+                self.splitting_ops = []
+            return
+
         # NOTE: this function needs to be called only when mode is
         # CompilationMode.VLLM_COMPILE
         assert self.mode == CompilationMode.VLLM_COMPILE, (
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index c576275e80fe..fbb9dbdd139f 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -835,8 +835,7 @@ def has_blocked_weights():
             ), "MTP with cp_kv_cache_interleave_size > 1 is not supported now."

         # Do this after all the updates to compilation_config.mode
-        if self.compilation_config.mode == CompilationMode.VLLM_COMPILE:
-            self.compilation_config.set_splitting_ops_for_v1()
+        self.compilation_config.set_splitting_ops_for_v1()

         if self.compilation_config.pass_config.enable_sequence_parallelism:
             # With pipeline parallelism or dynamo partitioning,
@@ -844,6 +843,13 @@ def has_blocked_weights():
             # Use custom rms norm to unblock. In the future,
             # the pass will operate on higher-level IR to avoid the issue.
             # TODO: https://github.com/vllm-project/vllm/issues/27894
+            if self.compilation_config.mode != CompilationMode.VLLM_COMPILE:
+                logger.warning(
+                    "Sequence parallelism is enabled, but running in wrong "
+                    "vllm compile mode: %s.",
+                    self.compilation_config.mode,
+                )
+
             is_fullgraph = (
                 self.compilation_config.use_inductor_graph_partition
                 or len(self.compilation_config.splitting_ops) == 0