Closed
Changes from all commits
40 commits
cf27c04
fix break by vllm commit: Support LoRA with speculative decoding #21068
leo-pony Nov 12, 2025
a86a6c1
[Hybrid] Pass kernel block size to builders #27753
leo-pony Nov 12, 2025
f1f20cc
fix the main-to-main break by:[Bug] Fix env string 0 same to True #28159
leo-pony Nov 12, 2025
2d83516
[Core] Async scheduling + structured outputs compatibility#26866
leo-pony Nov 12, 2025
ffc519f
fix structure output break bduring adapt to llm: Async scheduling + s…
leo-pony Nov 13, 2025
03cb9fb
fix structured outputs compatibility
22dimensions Nov 13, 2025
e1bbbd8
fix mtp breaks in modelrunner and format fix
leo-pony Nov 13, 2025
cb507ae
fix mypy issues
leo-pony Nov 13, 2025
3e1bbe8
model runner execute model support v0.11.0 branch
leo-pony Nov 13, 2025
399e165
fix format issue
leo-pony Nov 13, 2025
2dd522e
update to releases/v0.11.1
22dimensions Nov 13, 2025
5a50e2a
fix break by vllm:[BugFix][VL] Fix FA selection on Qwen2.5-VL #27790
leo-pony Nov 13, 2025
7f1cd59
fix scheduler
22dimensions Nov 14, 2025
8399298
skip ut, nightly, v0.11.0
leo-pony Nov 14, 2025
0c76f8e
Skip 1th e2e full
leo-pony Nov 14, 2025
f1f4161
skip has tested cases
leo-pony Nov 14, 2025
5bad0f3
Fix vllm break:Support LoRA with speculative decoding:#21068
leo-pony Nov 14, 2025
0764b27
remove skip of nightly a2
leo-pony Nov 14, 2025
7502030
fix the deepseek mtp break
leo-pony Nov 15, 2025
eb2927f
Add comments for deepseek torchair mtp break, vllm PR:27922
leo-pony Nov 16, 2025
47cc8de
Add comments fRestore full test cases
leo-pony Nov 16, 2025
11f6d4e
Enable torchair test in single card full test
leo-pony Nov 17, 2025
4197a11
just to trigger ci test
leo-pony Nov 18, 2025
97df642
fix vllm break by: Enable sequence parallelism matching w/o custom op…
leo-pony Nov 18, 2025
58303c8
vllm break of PR:https://github.com/vllm-project/vllm/pull/24794
leo-pony Nov 19, 2025
dabd793
adapt qwen3 next
22dimensions Nov 14, 2025
ccb837b
fix the undefine splitting_ops in test_sp_for_qwen3_moe
leo-pony Nov 19, 2025
9a95847
make light test take effect
leo-pony Nov 19, 2025
f9f9bf2
fix vllm break: Fix backend selection for encoder-only models (#28534)
leo-pony Nov 19, 2025
40905c8
fix vllm break:Refactor CUDA attention backend selection logic#24794
leo-pony Nov 19, 2025
0a81671
fix break of vllm:Rename clashing method names for vLLM model protoco…
leo-pony Nov 19, 2025
940fb90
fix lint error
leo-pony Nov 19, 2025
3cc8278
fix vllm break: Avoid bytecode hook and simplify TorchCompileWrapperW…
leo-pony Nov 20, 2025
9111fc7
skip A3-nightly test
leo-pony Nov 20, 2025
3a07d19
replace VisionPatchEmbed for better performance
shen-shanshan Nov 14, 2025
c987667
fix
shen-shanshan Nov 14, 2025
6f5cfe1
fix vllm break: [Model] Pass mm_features directly into get_mrope_inpu…
leo-pony Nov 20, 2025
c03dfef
fix vllm break:[Misc] Make SchedulerConfig.max_model_len init-only #2…
leo-pony Nov 20, 2025
d977d4f
fix vllm break:[V1] Support MP Executor for multi node distributed in…
leo-pony Nov 21, 2025
fcfb3a0
update version
wangxiyuan Nov 24, 2025
2 changes: 1 addition & 1 deletion .github/workflows/format_pr_body.yaml
@@ -36,7 +36,7 @@ jobs:

- name: Get vLLM version
run: |
-          VLLM_COMMIT=2918c1b49c88c29783c86f78d2c4221cb9622379
+          VLLM_COMMIT=v0.11.2
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV

- name: Checkout repository
6 changes: 3 additions & 3 deletions .github/workflows/vllm_ascend_test.yaml
@@ -42,7 +42,7 @@ jobs:
lint:
uses: ./.github/workflows/pre-commit.yml
with:
-      vllm: 2918c1b49c88c29783c86f78d2c4221cb9622379
+      vllm: v0.11.2
changes:
runs-on: ubuntu-latest
outputs:
@@ -83,7 +83,7 @@ jobs:
VLLM_USE_MODELSCOPE: True
strategy:
matrix:
-        vllm_version: [2918c1b49c88c29783c86f78d2c4221cb9622379, v0.11.0]
+        vllm_version: [v0.11.2, v0.11.0]
steps:
- name: Install packages
run: |
@@ -138,7 +138,7 @@ jobs:
name: e2e-light
strategy:
matrix:
-        vllm_version: [2918c1b49c88c29783c86f78d2c4221cb9622379, v0.11.0]
+        vllm_version: [v0.11.2, v0.11.0]
# Note (yikun): If CI resource are limited we can split job into two chain jobs
needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request.
2 changes: 1 addition & 1 deletion .github/workflows/vllm_ascend_test_full.yaml
@@ -69,7 +69,7 @@ jobs:
name: e2e-full
strategy:
matrix:
-        vllm_version: [2918c1b49c88c29783c86f78d2c4221cb9622379, v0.11.0]
+        vllm_version: [v0.11.2, v0.11.0]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
uses: ./.github/workflows/_e2e_test.yaml
2 changes: 1 addition & 1 deletion docs/source/community/versioning_policy.md
@@ -43,7 +43,7 @@ The table below is the release compatibility matrix for vLLM Ascend release.
For main branch of vLLM Ascend, we usually make it compatible with the latest vLLM release and a newer commit hash of vLLM. Please note that this table is usually updated. Please check it regularly.
| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|-------------|--------------|------------------|-------------|--------------------|
-| main | v0.11.0/2918c1b49c88c29783c86f78d2c4221cb9622379 | >= 3.10, < 3.12 | 8.3.RC1 | 2.7.1 / 2.7.1 |
+| main | v0.11.0 or v0.11.2 | >= 3.10, < 3.12 | 8.3.RC1 | 2.7.1 / 2.7.1 |

## Release cadence

29 changes: 18 additions & 11 deletions tests/e2e/multicard/test_offline_inference_distributed.py
@@ -21,6 +21,7 @@
Run `pytest tests/test_offline_inference.py`.
"""
import os
+from typing import List
from unittest.mock import patch

import pytest
@@ -175,17 +176,23 @@ def test_sp_for_qwen3_moe() -> None:
top_k=50,
top_p=0.9)

-    with VllmRunner(snapshot_download("Qwen/Qwen3-30B-A3B"),
-                    dtype="auto",
-                    tensor_parallel_size=2,
-                    distributed_executor_backend="mp",
-                    compilation_config={
-                        "pass_config": {
-                            "enable_sequence_parallelism": True
-                        }
-                    },
-                    enable_expert_parallel=True,
-                    enforce_eager=True) as vllm_model:
+    splitting_ops: List[str] = []
+    with VllmRunner(
+            snapshot_download("Qwen/Qwen3-30B-A3B"),
+            dtype="auto",
+            tensor_parallel_size=4,
+            distributed_executor_backend="mp",
+            compilation_config={
+                "pass_config": {
+                    "enable_sequence_parallelism": True,
+                },
+                # FIXME: before checking whether splitting_ops is empty, first
+                # verify that it is not None (issue introduced by
+                # https://github.com/vllm-project/vllm/pull/27126; since fixed upstream).
+                "splitting_ops": splitting_ops
+            },
+            enable_expert_parallel=True,
+            enforce_eager=True) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)


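The FIXME in the hunk above records an ordering rule: the emptiness check on splitting_ops must be preceded by a None check, since "unset" and "explicitly empty" mean different things. A minimal standalone sketch of that ordering; the helper name and the op string are hypothetical and not code from this PR or from vLLM:

from typing import List, Optional


def requested_empty_splitting_ops(splitting_ops: Optional[List[str]]) -> bool:
    """Return True only when an explicitly empty list was configured."""
    if splitting_ops is None:
        # Unset is not the same as "explicitly empty"; handle it first.
        return False
    return len(splitting_ops) == 0


assert requested_empty_splitting_ops([]) is True
assert requested_empty_splitting_ops(None) is False
assert requested_empty_splitting_ops(["some.attention.op"]) is False  # placeholder op name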
10 changes: 4 additions & 6 deletions tests/e2e/singlecard/test_ascend_scheduler.py
@@ -18,9 +18,8 @@ def test_concurrent_partial_prefill(enforce_eager):
},
},
max_num_seqs=3,
-max_num_batched_tokens=2048,
+max_num_batched_tokens=8192,
enforce_eager=enforce_eager,
-max_model_len=2048,
gpu_memory_utilization=0.7) as vllm_model:
outputs = vllm_model.model.generate(["Hello my name is Robert and I"] *
3)
@@ -38,9 +37,8 @@ def test_prefix_cache_stats_is_recorded(enforce_eager):
},
},
max_num_seqs=3,
-max_num_batched_tokens=2048,
+max_num_batched_tokens=8192,
enforce_eager=enforce_eager,
-max_model_len=2048,
gpu_memory_utilization=0.7) as vllm_model:
# 17 tokens will make sure first 16 tokens are cached in a block
input_tokens = {"prompt_token_ids": [101] * 129}
@@ -51,7 +49,7 @@ def test_prefix_cache_stats_is_recorded(enforce_eager):

@pytest.mark.parametrize("max_tokens",
[4]) # cannot align results when max_tokens > 4
@pytest.mark.parametrize("chunked_prefill_token_size", [16])
@pytest.mark.parametrize("chunked_prefill_token_size", [2048])
def test_chunked_prefill_with_ascend_scheduler(
max_tokens: int, chunked_prefill_token_size: int) -> None:
example_prompts = [
@@ -93,7 +91,7 @@ def test_chunked_prefill_with_ascend_scheduler(

@pytest.mark.parametrize("max_tokens",
[4]) # cannot align results when max_tokens > 4
@pytest.mark.parametrize("chunked_prefill_token_size", [16])
@pytest.mark.parametrize("chunked_prefill_token_size", [2048])
def test_chunked_prefill_with_scheduler_dynamic_batch(
max_tokens: int, chunked_prefill_token_size: int) -> None:
example_prompts = [
32 changes: 8 additions & 24 deletions tests/ut/core/test_scheduler.py
@@ -376,9 +376,7 @@ def test_stop_via_update_from_output(self):
},
num_common_prefix_blocks=0,
finished_req_ids=set(),
-free_encoder_mm_hashes=[],
-structured_output_request_ids={},
-grammar_bitmask=None)
+free_encoder_mm_hashes=[])
model_output = ModelRunnerOutput(
req_ids=[req.request_id for req in requests],
req_id_to_index={
@@ -429,9 +427,7 @@ def test_stop_via_update_from_output(self):
},
num_common_prefix_blocks=0,
finished_req_ids=set(),
-free_encoder_mm_hashes=[],
-structured_output_request_ids={},
-grammar_bitmask=None)
+free_encoder_mm_hashes=[])
model_output = ModelRunnerOutput(
req_ids=[req.request_id for req in requests],
req_id_to_index={
@@ -481,9 +477,7 @@ def test_stop_via_update_from_output(self):
},
num_common_prefix_blocks=0,
finished_req_ids=set(),
-free_encoder_mm_hashes=[],
-structured_output_request_ids={},
-grammar_bitmask=None)
+free_encoder_mm_hashes=[])
model_output = ModelRunnerOutput(
req_ids=[req.request_id for req in requests],
req_id_to_index={
@@ -526,9 +520,7 @@ def test_stop_via_update_from_output(self):
},
num_common_prefix_blocks=0,
finished_req_ids=set(),
-free_encoder_mm_hashes=[],
-structured_output_request_ids={},
-grammar_bitmask=None)
+free_encoder_mm_hashes=[])
model_output = ModelRunnerOutput(
req_ids=[requests[0].request_id],
req_id_to_index={requests[0].request_id: 0},
@@ -1069,9 +1061,7 @@ def test_stop_via_update_from_output(self):
},
num_common_prefix_blocks=0,
finished_req_ids=set(),
-free_encoder_mm_hashes=[],
-structured_output_request_ids={},
-grammar_bitmask=None)
+free_encoder_mm_hashes=[])
model_output = ModelRunnerOutput(
req_ids=[req.request_id for req in requests],
req_id_to_index={
@@ -1122,9 +1112,7 @@ def test_stop_via_update_from_output(self):
},
num_common_prefix_blocks=0,
finished_req_ids=set(),
-free_encoder_mm_hashes=[],
-structured_output_request_ids={},
-grammar_bitmask=None)
+free_encoder_mm_hashes=[])
model_output = ModelRunnerOutput(
req_ids=[req.request_id for req in requests],
req_id_to_index={
@@ -1174,9 +1162,7 @@ def test_stop_via_update_from_output(self):
},
num_common_prefix_blocks=0,
finished_req_ids=set(),
-free_encoder_mm_hashes=[],
-structured_output_request_ids={},
-grammar_bitmask=None)
+free_encoder_mm_hashes=[])
model_output = ModelRunnerOutput(
req_ids=[req.request_id for req in requests],
req_id_to_index={
@@ -1219,9 +1205,7 @@ def test_stop_via_update_from_output(self):
},
num_common_prefix_blocks=0,
finished_req_ids=set(),
-free_encoder_mm_hashes=[],
-structured_output_request_ids={},
-grammar_bitmask=None)
+free_encoder_mm_hashes=[])
model_output = ModelRunnerOutput(
req_ids=[requests[0].request_id],
req_id_to_index={requests[0].request_id: 0},
6 changes: 5 additions & 1 deletion vllm_ascend/attention/attention_v1.py
@@ -63,13 +63,17 @@

# isort: on

+from vllm.attention.backends.registry import (AttentionBackendEnum,
+                                               register_backend)


+@register_backend(AttentionBackendEnum.CUSTOM, "ASCEND")
class AscendAttentionBackend(AttentionBackend):
accept_output_buffer: bool = True

@staticmethod
def get_name() -> str:
return "ASCEND"
return "CUSTOM"

@staticmethod
def get_impl_cls() -> Type["AscendAttentionBackendImpl"]:
55 changes: 36 additions & 19 deletions vllm_ascend/core/scheduler.py
@@ -483,25 +483,42 @@ def skip_cur_request():
num_scheduled_tokens, scheduled_spec_decode_tokens,
req_to_new_blocks)
scheduled_cached_reqs = cached_reqs_data

-        scheduler_output = SchedulerOutput(
-            scheduled_new_reqs=new_reqs_data,
-            scheduled_cached_reqs=scheduled_cached_reqs,
-            num_scheduled_tokens=num_scheduled_tokens,
-            total_num_scheduled_tokens=total_num_scheduled_tokens,
-            scheduled_spec_decode_tokens=scheduled_spec_decode_tokens,
-            scheduled_encoder_inputs=scheduled_encoder_inputs,
-            num_common_prefix_blocks=num_common_prefix_blocks,
-            # finished_req_ids is an existing state in the scheduler,
-            # instead of being newly scheduled in this step.
-            # It contains the request IDs that are finished in between
-            # the previous and the current steps.
-            finished_req_ids=self.finished_req_ids,  # type: ignore
-            free_encoder_mm_hashes=self.encoder_cache_manager.
-            get_freed_mm_hashes(),
-            structured_output_request_ids={},
-            grammar_bitmask=None,
-        )
+        if vllm_version_is("0.11.0"):
+            scheduler_output = SchedulerOutput(
+                scheduled_new_reqs=new_reqs_data,
+                scheduled_cached_reqs=scheduled_cached_reqs,
+                num_scheduled_tokens=num_scheduled_tokens,
+                total_num_scheduled_tokens=total_num_scheduled_tokens,
+                scheduled_spec_decode_tokens=scheduled_spec_decode_tokens,
+                scheduled_encoder_inputs=scheduled_encoder_inputs,
+                num_common_prefix_blocks=num_common_prefix_blocks,
+                # finished_req_ids is an existing state in the scheduler,
+                # instead of being newly scheduled in this step.
+                # It contains the request IDs that are finished in between
+                # the previous and the current steps.
+                finished_req_ids=self.finished_req_ids,  # type: ignore
+                free_encoder_mm_hashes=self.encoder_cache_manager.
+                get_freed_mm_hashes(),
+                structured_output_request_ids={},
+                grammar_bitmask=None,
+            )
+        else:
+            scheduler_output = SchedulerOutput(
+                scheduled_new_reqs=new_reqs_data,
+                scheduled_cached_reqs=scheduled_cached_reqs,
+                num_scheduled_tokens=num_scheduled_tokens,
+                total_num_scheduled_tokens=total_num_scheduled_tokens,
+                scheduled_spec_decode_tokens=scheduled_spec_decode_tokens,
+                scheduled_encoder_inputs=scheduled_encoder_inputs,
+                num_common_prefix_blocks=num_common_prefix_blocks,
+                # finished_req_ids is an existing state in the scheduler,
+                # instead of being newly scheduled in this step.
+                # It contains the request IDs that are finished in between
+                # the previous and the current steps.
+                finished_req_ids=self.finished_req_ids,  # type: ignore
+                free_encoder_mm_hashes=self.encoder_cache_manager.
+                get_freed_mm_hashes(),
+            )
Reviewer comment on lines +486 to +521 (severity: high):

The introduction of the if vllm_version_is("0.11.0"): block has led to significant code duplication for instantiating SchedulerOutput. The two branches are nearly identical, differing only by two arguments (structured_output_request_ids and grammar_bitmask). This duplication makes the code harder to maintain and increases the risk of introducing bugs if changes are not applied to both branches.

        scheduler_output_kwargs = {
            "scheduled_new_reqs": new_reqs_data,
            "scheduled_cached_reqs": scheduled_cached_reqs,
            "num_scheduled_tokens": num_scheduled_tokens,
            "total_num_scheduled_tokens": total_num_scheduled_tokens,
            "scheduled_spec_decode_tokens": scheduled_spec_decode_tokens,
            "scheduled_encoder_inputs": scheduled_encoder_inputs,
            "num_common_prefix_blocks": num_common_prefix_blocks,
            # finished_req_ids is an existing state in the scheduler,
            # instead of being newly scheduled in this step.
            # It contains the request IDs that are finished in between
            # the previous and the current steps.
            "finished_req_ids": self.finished_req_ids,  # type: ignore
            "free_encoder_mm_hashes": self.encoder_cache_manager.
            get_freed_mm_hashes(),
        }
        if vllm_version_is("0.11.0"):
            scheduler_output_kwargs["structured_output_request_ids"] = {}
            scheduler_output_kwargs["grammar_bitmask"] = None

        scheduler_output = SchedulerOutput(**scheduler_output_kwargs)


# NOTE(Kuntai): this function is designed for multiple purposes:
# 1. Plan the KV cache store
64 changes: 41 additions & 23 deletions vllm_ascend/core/scheduler_dynamic_batch.py
@@ -561,29 +561,47 @@ def schedule(self) -> SchedulerOutput:
scheduled_spec_decode_tokens,
req_to_new_blocks,
)
-        scheduled_requests = (scheduled_new_reqs + scheduled_running_reqs +
-                              scheduled_resumed_reqs)
-        structured_output_request_ids, grammar_bitmask = (
-            self.get_grammar_bitmask(scheduled_requests,
-                                     scheduled_spec_decode_tokens))
-        scheduler_output = SchedulerOutput(
-            scheduled_new_reqs=new_reqs_data,
-            scheduled_cached_reqs=cached_reqs_data,
-            num_scheduled_tokens=num_scheduled_tokens,
-            total_num_scheduled_tokens=total_num_scheduled_tokens,
-            scheduled_spec_decode_tokens=scheduled_spec_decode_tokens,
-            scheduled_encoder_inputs=scheduled_encoder_inputs,
-            num_common_prefix_blocks=num_common_prefix_blocks,
-            # finished_req_ids is an existing state in the scheduler,
-            # instead of being newly scheduled in this step.
-            # It contains the request IDs that are finished in between
-            # the previous and the current steps.
-            finished_req_ids=self.finished_req_ids,
-            free_encoder_mm_hashes=self.encoder_cache_manager.
-            get_freed_mm_hashes(),
-            structured_output_request_ids=structured_output_request_ids,
-            grammar_bitmask=grammar_bitmask,
-        )
+        if vllm_version_is("0.11.0"):
+            scheduled_requests = (scheduled_new_reqs + scheduled_running_reqs +
+                                  scheduled_resumed_reqs)
+            structured_output_request_ids, grammar_bitmask = (
+                self.get_grammar_bitmask(scheduled_requests,
+                                         scheduled_spec_decode_tokens))
+            scheduler_output = SchedulerOutput(
+                scheduled_new_reqs=new_reqs_data,
+                scheduled_cached_reqs=cached_reqs_data,
+                num_scheduled_tokens=num_scheduled_tokens,
+                total_num_scheduled_tokens=total_num_scheduled_tokens,
+                scheduled_spec_decode_tokens=scheduled_spec_decode_tokens,
+                scheduled_encoder_inputs=scheduled_encoder_inputs,
+                num_common_prefix_blocks=num_common_prefix_blocks,
+                # finished_req_ids is an existing state in the scheduler,
+                # instead of being newly scheduled in this step.
+                # It contains the request IDs that are finished in between
+                # the previous and the current steps.
+                finished_req_ids=self.finished_req_ids,
+                free_encoder_mm_hashes=self.encoder_cache_manager.
+                get_freed_mm_hashes(),
+                structured_output_request_ids=structured_output_request_ids,
+                grammar_bitmask=grammar_bitmask,
+            )
+        else:
+            scheduler_output = SchedulerOutput(
+                scheduled_new_reqs=new_reqs_data,
+                scheduled_cached_reqs=cached_reqs_data,
+                num_scheduled_tokens=num_scheduled_tokens,
+                total_num_scheduled_tokens=total_num_scheduled_tokens,
+                scheduled_spec_decode_tokens=scheduled_spec_decode_tokens,
+                scheduled_encoder_inputs=scheduled_encoder_inputs,
+                num_common_prefix_blocks=num_common_prefix_blocks,
+                # finished_req_ids is an existing state in the scheduler,
+                # instead of being newly scheduled in this step.
+                # It contains the request IDs that are finished in between
+                # the previous and the current steps.
+                finished_req_ids=self.finished_req_ids,
+                free_encoder_mm_hashes=self.encoder_cache_manager.
+                get_freed_mm_hashes(),
+            )
Reviewer comment on lines +564 to +604 (severity: high):

Similar to another file in this PR, this change introduces significant code duplication for instantiating SchedulerOutput based on a version check. This makes the code harder to read and maintain. A bug fix in one branch might be missed in the other.

        scheduler_output_kwargs = {
            "scheduled_new_reqs": new_reqs_data,
            "scheduled_cached_reqs": cached_reqs_data,
            "num_scheduled_tokens": num_scheduled_tokens,
            "total_num_scheduled_tokens": total_num_scheduled_tokens,
            "scheduled_spec_decode_tokens": scheduled_spec_decode_tokens,
            "scheduled_encoder_inputs": scheduled_encoder_inputs,
            "num_common_prefix_blocks": num_common_prefix_blocks,
            # finished_req_ids is an existing state in the scheduler,
            # instead of being newly scheduled in this step.
            # It contains the request IDs that are finished in between
            # the previous and the current steps.
            "finished_req_ids": self.finished_req_ids,
            "free_encoder_mm_hashes": self.encoder_cache_manager.
            get_freed_mm_hashes(),
        }
        if vllm_version_is("0.11.0"):
            scheduled_requests = (scheduled_new_reqs + scheduled_running_reqs +
                                  scheduled_resumed_reqs)
            structured_output_request_ids, grammar_bitmask = (
                self.get_grammar_bitmask(scheduled_requests,
                                         scheduled_spec_decode_tokens))
            scheduler_output_kwargs["structured_output_request_ids"] = structured_output_request_ids
            scheduler_output_kwargs["grammar_bitmask"] = grammar_bitmask

        scheduler_output = SchedulerOutput(**scheduler_output_kwargs)
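Both reviewer suggestions gate the same two keyword arguments, so the version check could live in one shared helper used by scheduler.py and scheduler_dynamic_batch.py. A minimal sketch, not part of this PR, assuming vllm_version_is is importable from vllm_ascend.utils (the import path is an assumption) and that the extra arguments are only accepted on vLLM v0.11.0:

# Hypothetical shared helper: build the version-dependent SchedulerOutput
# kwargs once so both schedulers construct the output identically.
from typing import Any, Dict, Optional

from vllm_ascend.utils import vllm_version_is  # assumed import path


def structured_output_kwargs(
        structured_output_request_ids: Optional[Dict[str, Any]] = None,
        grammar_bitmask: Any = None) -> Dict[str, Any]:
    """Extra kwargs that SchedulerOutput accepts only on vLLM v0.11.0."""
    if vllm_version_is("0.11.0"):
        return {
            "structured_output_request_ids": structured_output_request_ids or {},
            "grammar_bitmask": grammar_bitmask,
        }
    return {}


# Usage sketch in either scheduler:
#   scheduler_output = SchedulerOutput(
#       scheduled_new_reqs=new_reqs_data,
#       ...,
#       **structured_output_kwargs())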


# NOTE(Kuntai): this function is designed for multiple purposes:
# 1. Plan the KV cache store