From 8823330a6920b0a272ad109494eca89070637f2b Mon Sep 17 00:00:00 2001 From: MengqingCao Date: Thu, 21 Aug 2025 01:49:55 +0000 Subject: [PATCH 1/8] [CI] fix ci Signed-off-by: MengqingCao --- .github/workflows/accuracy_test.yaml | 8 ++++---- .github/workflows/format_pr_body.yaml | 2 +- .github/workflows/image_310p_openeuler.yml | 2 +- .github/workflows/image_310p_ubuntu.yml | 2 +- .github/workflows/image_a3_openeuler.yml | 2 +- .github/workflows/image_a3_ubuntu.yml | 2 +- .github/workflows/image_openeuler.yml | 2 +- .github/workflows/image_ubuntu.yml | 2 +- .github/workflows/nightly_benchmarks.yaml | 4 ++-- .github/workflows/pre-commit.yml | 4 ++-- .github/workflows/vllm_ascend_doctest.yaml | 2 +- .github/workflows/vllm_ascend_test.yaml | 4 ++-- .github/workflows/vllm_ascend_test_310p.yaml | 4 ++-- .github/workflows/vllm_ascend_test_pd.yaml | 4 ++-- .../test_offline_inference_distributed.py | 20 ------------------- 15 files changed, 22 insertions(+), 42 deletions(-) diff --git a/.github/workflows/accuracy_test.yaml b/.github/workflows/accuracy_test.yaml index 1f0350dc41b..044c5dcfd00 100644 --- a/.github/workflows/accuracy_test.yaml +++ b/.github/workflows/accuracy_test.yaml @@ -88,7 +88,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v4 - name: Set model name as output id: set_output @@ -109,7 +109,7 @@ jobs: apt-get -y install gcc g++ cmake libnuma-dev - name: Checkout vllm-project/vllm repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: repository: vllm-project/vllm ref: v0.10.0 @@ -138,7 +138,7 @@ jobs: echo "GHA_VLLM_ASCEND_VERSION=$RESOLVED_VERSION" >> $GITHUB_ENV - name: Checkout vllm-project/vllm-ascend repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: repository: vllm-project/vllm-ascend path: ./vllm-ascend @@ -236,7 +236,7 @@ jobs: UPSTREAM_REPO: vllm-project/vllm-ascend steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: repository: vllm-ascend-ci/vllm-ascend token: ${{ secrets.PAT_TOKEN }} diff --git a/.github/workflows/format_pr_body.yaml b/.github/workflows/format_pr_body.yaml index e50395cf72b..dedf7a4847c 100644 --- a/.github/workflows/format_pr_body.yaml +++ b/.github/workflows/format_pr_body.yaml @@ -34,7 +34,7 @@ jobs: steps: - name: Checkout vllm-project/vllm repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: repository: vllm-project/vllm path: ./vllm-empty diff --git a/.github/workflows/image_310p_openeuler.yml b/.github/workflows/image_310p_openeuler.yml index 9339c9e4b24..e6062a81233 100644 --- a/.github/workflows/image_310p_openeuler.yml +++ b/.github/workflows/image_310p_openeuler.yml @@ -53,7 +53,7 @@ jobs: 'ubuntu-24.04-arm' }} steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v4 - name: Print run: | diff --git a/.github/workflows/image_310p_ubuntu.yml b/.github/workflows/image_310p_ubuntu.yml index 86ca73f1a5a..0e9444fa50a 100644 --- a/.github/workflows/image_310p_ubuntu.yml +++ b/.github/workflows/image_310p_ubuntu.yml @@ -49,7 +49,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v4 - name: Print run: | diff --git a/.github/workflows/image_a3_openeuler.yml b/.github/workflows/image_a3_openeuler.yml index 3eda8dd7ecc..a10ad1c89f9 100644 --- a/.github/workflows/image_a3_openeuler.yml +++ b/.github/workflows/image_a3_openeuler.yml @@ -53,7 +53,7 @@ jobs: 'ubuntu-24.04-arm' }} steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v4 - name: Print run: | 
diff --git a/.github/workflows/image_a3_ubuntu.yml b/.github/workflows/image_a3_ubuntu.yml index 7a6506cb391..61160150ac6 100644 --- a/.github/workflows/image_a3_ubuntu.yml +++ b/.github/workflows/image_a3_ubuntu.yml @@ -49,7 +49,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v4 - name: Print run: | diff --git a/.github/workflows/image_openeuler.yml b/.github/workflows/image_openeuler.yml index 22ea1dc4a04..d4e69a55a4d 100644 --- a/.github/workflows/image_openeuler.yml +++ b/.github/workflows/image_openeuler.yml @@ -52,7 +52,7 @@ jobs: 'ubuntu-24.04-arm' }} steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v4 - name: Print run: | diff --git a/.github/workflows/image_ubuntu.yml b/.github/workflows/image_ubuntu.yml index b70875066dd..1c2ddcdffb4 100644 --- a/.github/workflows/image_ubuntu.yml +++ b/.github/workflows/image_ubuntu.yml @@ -49,7 +49,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v4 - name: Print run: | diff --git a/.github/workflows/nightly_benchmarks.yaml b/.github/workflows/nightly_benchmarks.yaml index 64dadf2c7fe..8a434813776 100644 --- a/.github/workflows/nightly_benchmarks.yaml +++ b/.github/workflows/nightly_benchmarks.yaml @@ -97,12 +97,12 @@ jobs: git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/ - name: Checkout vllm-project/vllm-ascend repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Checkout vllm-project/vllm repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: repository: vllm-project/vllm path: ./vllm-empty diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 1564bd7ab5b..e41dd6e634e 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -11,14 +11,14 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout vllm-project/vllm-ascend repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: "3.11" - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" - run: echo "::add-matcher::.github/workflows/matchers/mypy.json" - name: Checkout vllm-project/vllm repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: repository: vllm-project/vllm path: ./vllm-empty diff --git a/.github/workflows/vllm_ascend_doctest.yaml b/.github/workflows/vllm_ascend_doctest.yaml index ffb552fc786..1b4faeacba8 100644 --- a/.github/workflows/vllm_ascend_doctest.yaml +++ b/.github/workflows/vllm_ascend_doctest.yaml @@ -66,7 +66,7 @@ jobs: git --no-pager log -1 || true - name: Checkout vllm-project/vllm-ascend repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 - name: Run vllm-ascend/tests/e2e/run_doctests.sh run: | diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index 7dbcb1863d4..c3159aa511f 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -89,7 +89,7 @@ jobs: apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev curl gnupg2 - name: Checkout vllm-project/vllm repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: repository: vllm-project/vllm ref: ${{ matrix.vllm_version }} @@ -102,7 +102,7 @@ jobs: python3 -m pip uninstall -y triton - name: Checkout vllm-project/vllm-ascend repo - uses: 
actions/checkout@v5 + uses: actions/checkout@v4 - name: Install vllm-project/vllm-ascend run: | diff --git a/.github/workflows/vllm_ascend_test_310p.yaml b/.github/workflows/vllm_ascend_test_310p.yaml index 52d6ddeffd3..a3d3cae94d2 100644 --- a/.github/workflows/vllm_ascend_test_310p.yaml +++ b/.github/workflows/vllm_ascend_test_310p.yaml @@ -77,7 +77,7 @@ jobs: apt install git -y - name: Checkout vllm-project/vllm-ascend repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 - name: Install system dependencies run: | @@ -85,7 +85,7 @@ jobs: apt-get -y install git vim wget net-tools gcc g++ cmake libnuma-dev curl gnupg2 - name: Checkout vllm-project/vllm repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: repository: vllm-project/vllm ref: ${{ matrix.vllm_version }} diff --git a/.github/workflows/vllm_ascend_test_pd.yaml b/.github/workflows/vllm_ascend_test_pd.yaml index 2f21365829a..a86ba60a65f 100644 --- a/.github/workflows/vllm_ascend_test_pd.yaml +++ b/.github/workflows/vllm_ascend_test_pd.yaml @@ -80,7 +80,7 @@ jobs: git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/ - name: Checkout vllm-project/vllm-ascend repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 - name: Install system dependencies run: | @@ -88,7 +88,7 @@ jobs: apt-get -y install gcc g++ cmake libnuma-dev - name: Checkout vllm-project/vllm repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: repository: vllm-project/vllm ref: ${{ matrix.vllm_verison }} diff --git a/tests/e2e/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py index e869c2d5992..f7354abe916 100644 --- a/tests/e2e/multicard/test_offline_inference_distributed.py +++ b/tests/e2e/multicard/test_offline_inference_distributed.py @@ -78,26 +78,6 @@ def test_models_distributed_DeepSeek_multistream_moe(): vllm_model.generate_greedy(example_prompts, max_tokens) -@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DBO": "1"}) -def test_models_distributed_DeepSeek_dbo(): - example_prompts = ["The president of the United States is"] * 41 - dtype = "half" - sampling_params = SamplingParams(max_tokens=100, temperature=0.0) - with VllmRunner( - "deepseek-ai/DeepSeek-V2-Lite", - dtype=dtype, - tensor_parallel_size=2, - distributed_executor_backend="mp", - ) as vllm_model: - model_arch = 'DeepseekV2ForCausalLM' - registed_models = ModelRegistry.models - assert registed_models[ - model_arch].module_name == "vllm_ascend.models.deepseek_dbo" - assert registed_models[ - model_arch].class_name == "CustomDeepseekDBOForCausalLM" - vllm_model.generate(example_prompts, sampling_params) - - @pytest.mark.skip( reason= "deepseek dbo dose not consider the support on half precision float, will enable this ut after we actually support it" From b643a2891f26679ebede21018a1b56faf46c11c9 Mon Sep 17 00:00:00 2001 From: MengqingCao Date: Thu, 21 Aug 2025 03:39:42 +0000 Subject: [PATCH 2/8] some fixes - fix kvcache block changes - maintain v0.10.1.1 Signed-off-by: MengqingCao --- .github/workflows/vllm_ascend_test.yaml | 6 +-- .github/workflows/vllm_ascend_test_310p.yaml | 2 +- Dockerfile | 2 +- Dockerfile.310p | 2 +- Dockerfile.310p.openEuler | 2 +- Dockerfile.a3 | 2 +- Dockerfile.a3.openEuler | 2 +- Dockerfile.openEuler | 2 +- vllm_ascend/models/qwen3_moe.py | 3 +- vllm_ascend/worker/model_runner_v1.py | 57 ++++++++++++++------ 10 files changed, 52 insertions(+), 28 deletions(-) diff --git 
a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index c3159aa511f..6e3aff0d154 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -81,7 +81,7 @@ jobs: VLLM_USE_MODELSCOPE: True strategy: matrix: - vllm_version: [main] + vllm_version: [v0.10.1.1, main] steps: - name: Install packages run: | @@ -137,7 +137,7 @@ jobs: max-parallel: 2 matrix: os: [linux-aarch64-a2-1] - vllm_version: [main] + vllm_version: [v0.10.1.1, main] name: singlecard e2e test runs-on: ${{ matrix.os }} container: @@ -219,7 +219,7 @@ jobs: max-parallel: 2 matrix: os: [linux-aarch64-a2-2] - vllm_version: [main] + vllm_version: [v0.10.1.1, main] name: multicard e2e test runs-on: ${{ matrix.os }} container: diff --git a/.github/workflows/vllm_ascend_test_310p.yaml b/.github/workflows/vllm_ascend_test_310p.yaml index a3d3cae94d2..9d4a9709dda 100644 --- a/.github/workflows/vllm_ascend_test_310p.yaml +++ b/.github/workflows/vllm_ascend_test_310p.yaml @@ -53,7 +53,7 @@ jobs: max-parallel: 2 matrix: os: [linux-aarch64-310p-1, linux-aarch64-310p-4] - vllm_version: [main] + vllm_version: [v0.10.1.1, main] name: 310p e2e test runs-on: ${{ matrix.os }} container: diff --git a/Dockerfile b/Dockerfile index a12df1e0b73..29d6445ec04 100644 --- a/Dockerfile +++ b/Dockerfile @@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.10.0 +ARG VLLM_TAG=v0.10.1.1 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.310p b/Dockerfile.310p index 299624c5417..4eb3c63a81f 100644 --- a/Dockerfile.310p +++ b/Dockerfile.310p @@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.10.0 +ARG VLLM_TAG=v0.10.1.1 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.310p.openEuler b/Dockerfile.310p.openEuler index ff7ec05a430..0e76ba37faf 100644 --- a/Dockerfile.310p.openEuler +++ b/Dockerfile.310p.openEuler @@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.10.0 +ARG VLLM_TAG=v0.10.1.1 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. diff --git a/Dockerfile.a3 b/Dockerfile.a3 index da1efcc41b2..8bdfb0e2d9f 100644 --- a/Dockerfile.a3 +++ b/Dockerfile.a3 @@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.10.0 +ARG VLLM_TAG=v0.10.1.1 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. 
But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.a3.openEuler b/Dockerfile.a3.openEuler index b03851ca652..afaf11dabe0 100644 --- a/Dockerfile.a3.openEuler +++ b/Dockerfile.a3.openEuler @@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.10.0 +ARG VLLM_TAG=v0.10.1.1 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler index 1146d0a00ab..b744b3325fd 100644 --- a/Dockerfile.openEuler +++ b/Dockerfile.openEuler @@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.10.0 +ARG VLLM_TAG=v0.10.1.1 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. diff --git a/vllm_ascend/models/qwen3_moe.py b/vllm_ascend/models/qwen3_moe.py index 29ab6755250..4ee41eba17b 100644 --- a/vllm_ascend/models/qwen3_moe.py +++ b/vllm_ascend/models/qwen3_moe.py @@ -253,7 +253,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): quant_config = vllm_config.quant_config parallel_config = vllm_config.parallel_config - self.num_redundant_experts = parallel_config.num_redundant_experts + eplb_config = parallel_config.eplb_config + self.num_redundant_experts = eplb_config.num_redundant_experts self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size self.config = config diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index b55cc1395ae..39b29caeff2 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -64,8 +64,8 @@ from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheSpec) -from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, DraftTokenIds, - LogprobsTensors, ModelRunnerOutput) +from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors, + ModelRunnerOutput) from vllm.v1.pool.metadata import PoolingMetadata from vllm.v1.sample.logits_processor import build_logitsprocs from vllm.v1.sample.metadata import SamplingMetadata @@ -94,11 +94,17 @@ from vllm_ascend.torchair.torchair_attention import AscendTorchairMetadata from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ, ProfileExecuteDuration, is_310p, - maybe_converting_weight_acl_format) + maybe_converting_weight_acl_format, + vllm_version_is) from vllm_ascend.worker.eagle_proposer_v1 import EagleProposer from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch +if not vllm_version_is("0.10.1.1"): + from vllm.v1.outputs import DraftTokenIds +else: + DraftTokenIds = None + if TYPE_CHECKING: import xgrammar as xgr # type: ignore[import-untyped] from vllm.v1.core.sched.output import SchedulerOutput @@ -513,11 +519,13 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Update the block IDs. 
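Note on the change below: on current vLLM main the scheduler output may report new_block_ids as None for a running request that was scheduled without any additional KV-cache blocks, so the runner now only extends the per-group block lists when new IDs are actually present (and still requires them when resuming from preemption). A minimal standalone sketch of that control flow, using illustrative names rather than the real vllm-ascend structures:

from typing import Optional


def merge_block_ids(block_table: list[list[int]],
                    new_block_ids: Optional[list[list[int]]],
                    resumed_from_preemption: bool) -> list[list[int]]:
    # Mirrors the guarded update in _update_states (sketch; assumes one inner
    # list of block IDs per KV-cache group).
    if not resumed_from_preemption:
        if new_block_ids is not None:
            # Append the newly allocated blocks to each group's existing list.
            for block_ids, new_ids in zip(block_table, new_block_ids):
                block_ids.extend(new_ids)
        # new_block_ids may legitimately be None here: nothing to append.
        return block_table
    # Resumed after preemption: the scheduler must resend the full block list.
    assert new_block_ids is not None
    return new_block_ids


if __name__ == "__main__":
    table = [[0, 1], [7]]
    print(merge_block_ids(table, [[2], [8]], resumed_from_preemption=False))  # [[0, 1, 2], [7, 8]]
    print(merge_block_ids(table, None, resumed_from_preemption=False))        # unchanged
    print(merge_block_ids(table, [[9], [10]], resumed_from_preemption=True))  # replaced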
if not resumed_from_preemption: - # Append the new blocks to the existing block IDs. - for block_ids, new_ids in zip(req_state.block_ids, - new_block_ids): - block_ids.extend(new_ids) + if new_block_ids is not None: + # Append the new blocks to the existing block IDs. + for block_ids, new_ids in zip(req_state.block_ids, + new_block_ids): + block_ids.extend(new_ids) else: + assert new_block_ids is not None # The request is resumed from preemption. # Replace the existing block IDs with the new ones. req_state.block_ids = new_block_ids @@ -533,7 +541,9 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Update the persistent batch. self.input_batch.num_computed_tokens_cpu[req_index] = ( num_computed_tokens) - self.input_batch.block_table.append_row(new_block_ids, req_index) + if new_block_ids is not None: + self.input_batch.block_table.append_row( + new_block_ids, req_index) # For the last rank, we don't need to update the token_ids_cpu # because the sampled tokens are already cached. @@ -1528,6 +1538,7 @@ def _pool( req_ids=self.input_batch.req_ids, req_id_to_index=self.input_batch.req_id_to_index, sampled_token_ids=[], + spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, pooler_output=pooler_output, @@ -1754,15 +1765,27 @@ def execute_model( extra_args = ({"kv_connector_output": kv_connector_output}) - model_runner_output = ModelRunnerOutput( - req_ids=self.input_batch.req_ids, - req_id_to_index=self.input_batch.req_id_to_index, - sampled_token_ids=valid_sampled_token_ids, - logprobs=logprobs_lists, - prompt_logprobs_dict=prompt_logprobs_dict, - pooler_output=[], - **extra_args, - ) + if vllm_version_is("0.10.1.1"): + model_runner_output = ModelRunnerOutput( + req_ids=self.input_batch.req_ids, + req_id_to_index=self.input_batch.req_id_to_index, + sampled_token_ids=valid_sampled_token_ids, + logprobs=logprobs_lists, + spec_token_ids=self._draft_token_ids, + prompt_logprobs_dict=prompt_logprobs_dict, + pooler_output=[], + **extra_args, + ) + else: + model_runner_output = ModelRunnerOutput( + req_ids=self.input_batch.req_ids, + req_id_to_index=self.input_batch.req_id_to_index, + sampled_token_ids=valid_sampled_token_ids, + logprobs=logprobs_lists, + prompt_logprobs_dict=prompt_logprobs_dict, + pooler_output=[], + **extra_args, + ) durations = ProfileExecuteDuration().pop_captured_sync() if durations: From 4988427ee7f5d890ca96d79631c9d208ddaba40d Mon Sep 17 00:00:00 2001 From: MengqingCao Date: Thu, 21 Aug 2025 05:19:22 +0000 Subject: [PATCH 3/8] fix ut Signed-off-by: MengqingCao --- tests/ut/core/test_scheduler.py | 294 ++++++++++++++++++-------- vllm_ascend/models/qwen3_moe.py | 8 +- vllm_ascend/worker/model_runner_v1.py | 33 ++- 3 files changed, 230 insertions(+), 105 deletions(-) diff --git a/tests/ut/core/test_scheduler.py b/tests/ut/core/test_scheduler.py index 78b0c65f0cd..6680a258c19 100644 --- a/tests/ut/core/test_scheduler.py +++ b/tests/ut/core/test_scheduler.py @@ -13,7 +13,7 @@ from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec) -from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput +from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus from vllm.v1.structured_output import StructuredOutputManager @@ -21,6 +21,11 @@ from vllm_ascend.core.scheduler import AscendScheduler from vllm_ascend.utils import vllm_version_is +if not vllm_version_is("0.10.1.1"): + from vllm.v1.outputs import DraftTokenIds +else: + 
DraftTokenIds = None + EOS_TOKEN_ID = 50256 MODEL = "Qwen3-0.6B" ENABLE_PREFIX_CACHING = None @@ -66,16 +71,33 @@ def create_requests( def make_output(scheduler): - return ModelRunnerOutput( - req_ids=[req.request_id for req in scheduler.running], - req_id_to_index={ - req.request_id: i - for i, req in enumerate(scheduler.running) - }, - sampled_token_ids=[[1000]] * len(scheduler.running), - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[]) + req_ids = [req.request_id for req in scheduler.running] + req_id_to_index = { + req.request_id: i + for i, req in enumerate(scheduler.running) + } + sampled_token_ids = [[1000]] * len(scheduler.running) + logprobs = None + if vllm_version_is("0.10.1.1"): + modelrunner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_id_to_index, + sampled_token_ids=sampled_token_ids, + spec_token_ids=None, + logprobs=logprobs, + prompt_logprobs_dict={}, + pooler_output=[], + ) + else: + modelrunner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_id_to_index, + sampled_token_ids=sampled_token_ids, + logprobs=logprobs, + prompt_logprobs_dict={}, + pooler_output=[], + ) + return modelrunner_output class TestAscendScheduler(TestBase): @@ -271,8 +293,7 @@ def test_stop_via_update_from_output(self): req.num_computed_tokens = req.num_tokens scheduler.requests[req.request_id] = req scheduler.running.append(req) - if not vllm_version_is("0.9.2"): - req.status = RequestStatus.RUNNING + req.status = RequestStatus.RUNNING scheduler_output = SchedulerOutput(scheduled_new_reqs=[], scheduled_cached_reqs=[], @@ -291,18 +312,33 @@ def test_stop_via_update_from_output(self): free_encoder_input_ids=[], structured_output_request_ids={}, grammar_bitmask=None) - - model_output = ModelRunnerOutput( - req_ids=[req.request_id for req in requests], - req_id_to_index={ - req.request_id: i - for i, req in enumerate(requests) - }, - sampled_token_ids=[[EOS_TOKEN_ID], [10, 11] - ], # First request hits EOS, second continues - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[]) + if vllm_version_is("0.10.1.1"): + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(requests) + }, + sampled_token_ids=[[EOS_TOKEN_ID], [ + 10, 11 + ]], # First request hits EOS, second continues + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + else: + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(requests) + }, + sampled_token_ids=[[EOS_TOKEN_ID], [ + 10, 11 + ]], # First request hits EOS, second continues + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) scheduler.update_from_output(scheduler_output, model_output) @@ -325,8 +361,7 @@ def test_stop_via_update_from_output(self): req.num_computed_tokens = req.num_tokens scheduler.requests[req.request_id] = req scheduler.running.append(req) - if not vllm_version_is("0.9.2"): - req.status = RequestStatus.RUNNING + req.status = RequestStatus.RUNNING scheduler_output = SchedulerOutput(scheduled_new_reqs=[], scheduled_cached_reqs=[], @@ -346,18 +381,31 @@ def test_stop_via_update_from_output(self): free_encoder_input_ids=[], structured_output_request_ids={}, grammar_bitmask=None) - - model_output = ModelRunnerOutput( - req_ids=[req.request_id for req in requests], - req_id_to_index={ - req.request_id: i - for i, req in enumerate(requests) - }, - 
sampled_token_ids=[[10, 42, 12], - [13, 14]], # First request hits stop token - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[]) + if vllm_version_is("0.10.1.1"): + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(requests) + }, + sampled_token_ids=[[10, 42, 12], + [13, 14]], # First request hits stop token + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + else: + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(requests) + }, + sampled_token_ids=[[10, 42, 12], + [13, 14]], # First request hits stop token + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) scheduler.update_from_output(scheduler_output, model_output) @@ -379,8 +427,7 @@ def test_stop_via_update_from_output(self): req.num_computed_tokens = req.num_tokens scheduler.requests[req.request_id] = req scheduler.running.append(req) - if not vllm_version_is("0.9.2"): - req.status = RequestStatus.RUNNING + req.status = RequestStatus.RUNNING scheduler_output = SchedulerOutput(scheduled_new_reqs=[], scheduled_cached_reqs=[], @@ -401,18 +448,31 @@ def test_stop_via_update_from_output(self): structured_output_request_ids={}, grammar_bitmask=None) - model_output = ModelRunnerOutput( - req_ids=[req.request_id for req in requests], - req_id_to_index={ - req.request_id: i - for i, req in enumerate(requests) - }, - sampled_token_ids=[[10, 11, 12], - [13]], # First request exceeds max_tokens - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[]) - + if vllm_version_is("0.10.1.1"): + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(requests) + }, + sampled_token_ids=[[10, 11, 12], + [13]], # First request exceeds max_tokens + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + else: + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(requests) + }, + sampled_token_ids=[[10, 11, 12], + [13]], # First request exceeds max_tokens + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) scheduler.update_from_output(scheduler_output, model_output) # Verify first request stopped due to length @@ -448,13 +508,24 @@ def test_stop_via_update_from_output(self): structured_output_request_ids={}, grammar_bitmask=None) - model_output = ModelRunnerOutput( - req_ids=[requests[0].request_id], - req_id_to_index={requests[0].request_id: 0}, - sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]], - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[]) + if vllm_version_is("0.10.1.1"): + model_output = ModelRunnerOutput( + req_ids=[requests[0].request_id], + req_id_to_index={requests[0].request_id: 0}, + sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + + else: + model_output = ModelRunnerOutput( + req_ids=[requests[0].request_id], + req_id_to_index={requests[0].request_id: 0}, + sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) scheduler.update_from_output(scheduler_output, model_output) @@ -505,13 +576,23 @@ def test_schedule_concurrent_batches(self): 512) # Model output of the first request. 
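The same version split recurs for every ModelRunnerOutput these tests construct: v0.10.1.1 still accepts a spec_token_ids argument, while current main dropped it and reports draft tokens through the separate DraftTokenIds message. One way the duplication could be collapsed is a small kwargs helper along these lines (a sketch only; the version stub stands in for vllm_ascend.utils.vllm_version_is and assumes an exact match against the installed vLLM release):

import importlib.metadata


def vllm_version_is(version: str) -> bool:
    # Stand-in for vllm_ascend.utils.vllm_version_is (assumption: exact-match
    # comparison against the installed vLLM version string).
    try:
        return importlib.metadata.version("vllm") == version
    except importlib.metadata.PackageNotFoundError:
        return False


def runner_output_kwargs(req_ids, req_id_to_index, sampled_token_ids,
                         logprobs=None, prompt_logprobs_dict=None,
                         pooler_output=None, spec_token_ids=None, **extra):
    # Build the keyword arguments shared by both vLLM versions, then add the
    # legacy-only field when running against v0.10.1.1.
    kwargs = dict(req_ids=req_ids,
                  req_id_to_index=req_id_to_index,
                  sampled_token_ids=sampled_token_ids,
                  logprobs=logprobs,
                  prompt_logprobs_dict=prompt_logprobs_dict or {},
                  pooler_output=pooler_output or [],
                  **extra)
    if vllm_version_is("0.10.1.1"):
        kwargs["spec_token_ids"] = spec_token_ids
    return kwargs

Call sites would then reduce to ModelRunnerOutput(**runner_output_kwargs(...)), keeping the spec-token handling in one place; tests that pass draft tokens on the old version would supply spec_token_ids explicitly.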
- model_runner_output = ModelRunnerOutput( - req_ids=[requests[0].request_id], - req_id_to_index={requests[0].request_id: 0}, - sampled_token_ids=[[0]], - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[]) + if vllm_version_is("0.10.1.1"): + model_runner_output = ModelRunnerOutput( + req_ids=[requests[0].request_id], + req_id_to_index={requests[0].request_id: 0}, + sampled_token_ids=[[0]], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + else: + model_runner_output = ModelRunnerOutput( + req_ids=[requests[0].request_id], + req_id_to_index={requests[0].request_id: 0}, + sampled_token_ids=[[0]], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) scheduler.update_from_output(scheduler_output0, model_runner_output) @@ -521,13 +602,23 @@ def test_schedule_concurrent_batches(self): # request is still running. scheduler.schedule() # Model output of the second request. - model_runner_output = ModelRunnerOutput( - req_ids=[requests[1].request_id], - req_id_to_index={requests[1].request_id: 0}, - sampled_token_ids=[[0]], - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[]) + if vllm_version_is("0.10.1.1"): + model_runner_output = ModelRunnerOutput( + req_ids=[requests[1].request_id], + req_id_to_index={requests[1].request_id: 0}, + sampled_token_ids=[[0]], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + else: + model_runner_output = ModelRunnerOutput( + req_ids=[requests[1].request_id], + req_id_to_index={requests[1].request_id: 0}, + sampled_token_ids=[[0]], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) scheduler.update_from_output(scheduler_output1, model_runner_output) @@ -579,19 +670,29 @@ def test_schedule_spec_decoding_stats(self): req_id = requests[i].request_id self.assertEqual(output.num_scheduled_tokens[req_id], 1) self.assertNotIn(req_id, output.scheduled_spec_decode_tokens) - - model_runner_output = ModelRunnerOutput( - req_ids=req_ids, - req_id_to_index=req_to_index, - sampled_token_ids=[[0] for _ in range(len(requests))], - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[]) - draft_token_ids = DraftTokenIds(req_ids, spec_tokens) + if vllm_version_is("0.10.1.1"): + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_to_index, + sampled_token_ids=[[0] for _ in range(len(requests))], + logprobs=None, + prompt_logprobs_dict={}, + spec_token_ids=spec_tokens, + pooler_output=[]) + else: + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_to_index, + sampled_token_ids=[[0] for _ in range(len(requests))], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + draft_token_ids = DraftTokenIds(req_ids, spec_tokens) engine_core_outputs = scheduler.update_from_output( output, model_runner_output) - scheduler.update_draft_token_ids(draft_token_ids) + if not vllm_version_is("0.10.1.1"): + scheduler.update_draft_token_ids(draft_token_ids) for i in range(len(requests)): running_req = scheduler.running[i] @@ -627,14 +728,23 @@ def test_schedule_spec_decoding_stats(self): else: self.assertNotIn(req_id, output.scheduled_spec_decode_tokens) - - model_runner_output = ModelRunnerOutput( - req_ids=req_ids, - req_id_to_index=req_to_index, - sampled_token_ids=output_tokens, - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[]) + if vllm_version_is("0.10.1.1"): + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_to_index, + sampled_token_ids=output_tokens, 
+ spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + else: + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_to_index, + sampled_token_ids=output_tokens, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) engine_core_outputs = scheduler.update_from_output( output, model_runner_output) diff --git a/vllm_ascend/models/qwen3_moe.py b/vllm_ascend/models/qwen3_moe.py index 4ee41eba17b..0df83772b8a 100644 --- a/vllm_ascend/models/qwen3_moe.py +++ b/vllm_ascend/models/qwen3_moe.py @@ -50,6 +50,7 @@ from vllm_ascend.ops.fused_moe import AscendFusedMoE from vllm_ascend.ops.sequence_parallel import (MetadataForPadding, init_metadata_for_sp) +from vllm_ascend.utils import vllm_version_is class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock): @@ -253,8 +254,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): quant_config = vllm_config.quant_config parallel_config = vllm_config.parallel_config - eplb_config = parallel_config.eplb_config - self.num_redundant_experts = eplb_config.num_redundant_experts + if vllm_version_is("0.10.1.1"): + self.num_redundant_experts = parallel_config.num_redundant_experts + else: + eplb_config = parallel_config.eplb_config + self.num_redundant_experts = eplb_config.num_redundant_experts self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size self.config = config diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 39b29caeff2..963b54c7a9a 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1533,17 +1533,28 @@ def _pool( else: pooler_output.append(None) extra_args = ({"kv_connector_output": kv_connector_output}) - - return ModelRunnerOutput( - req_ids=self.input_batch.req_ids, - req_id_to_index=self.input_batch.req_id_to_index, - sampled_token_ids=[], - spec_token_ids=None, - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=pooler_output, - **extra_args, - ) + if vllm_version_is("0.10.1.1"): + modelrunner_output = ModelRunnerOutput( + req_ids=self.input_batch.req_ids, + req_id_to_index=self.input_batch.req_id_to_index, + sampled_token_ids=[], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=pooler_output, + **extra_args, + ) + else: + modelrunner_output = ModelRunnerOutput( + req_ids=self.input_batch.req_ids, + req_id_to_index=self.input_batch.req_id_to_index, + sampled_token_ids=[], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=pooler_output, + **extra_args, + ) + return modelrunner_output @torch.inference_mode() def execute_model( From a4c0367f5ec57ccf1519dd1274083bf2330ce86d Mon Sep 17 00:00:00 2001 From: MengqingCao Date: Thu, 21 Aug 2025 07:22:33 +0000 Subject: [PATCH 4/8] fix ascendscheduler and sampler Signed-off-by: MengqingCao --- vllm_ascend/core/scheduler.py | 57 ++++++++++++++++++++++++++--------- vllm_ascend/sample/sampler.py | 21 +++++++++++-- 2 files changed, 61 insertions(+), 17 deletions(-) diff --git a/vllm_ascend/core/scheduler.py b/vllm_ascend/core/scheduler.py index dfdc9aa863c..627d5ea8991 100644 --- a/vllm_ascend/core/scheduler.py +++ b/vllm_ascend/core/scheduler.py @@ -31,6 +31,13 @@ from vllm.v1.request import Request, RequestStatus from vllm.v1.structured_output import StructuredOutputManager +from vllm_ascend.utils import vllm_version_is + +if vllm_version_is("0.10.1.1"): + from vllm.v1.core.kv_cache_manager import KVCacheBlocks +else: + KVCacheBlocks = None + class 
AscendScheduler(Scheduler): """This Scheduler extends vllm's original v1 scheduler @@ -59,7 +66,10 @@ def schedule(self) -> SchedulerOutput: scheduled_running_reqs: list[Request] = [] preempted_reqs: list[Request] = [] - req_to_new_block_ids: dict[str, list[int]] = {} + if vllm_version_is("0.10.1.1"): + req_to_new_block_ids: dict[str, list[int]] = {} + else: + req_to_new_blocks: dict[str, KVCacheBlocks] = {} num_scheduled_tokens: dict[str, int] = {} token_budget = self.max_num_scheduled_tokens # Spec decode-related. @@ -217,8 +227,11 @@ def skip_cur_request(): if self.lora_config and request.lora_request: scheduled_loras.add(request.lora_request.lora_int_id) - req_to_new_block_ids[request.request_id] = ( - self.kv_cache_manager.get_block_ids(request.request_id)) + if vllm_version_is("0.10.1.1"): + req_to_new_block_ids[request.request_id] = ( + self.kv_cache_manager.get_block_ids(request.request_id)) + else: + req_to_new_blocks[request.request_id] = new_blocks # Update request info. num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens @@ -307,8 +320,11 @@ def skip_cur_request(): # Schedule the request. scheduled_running_reqs.append(request) self.scheduled_req_ids.add(request.request_id) - req_to_new_block_ids[request.request_id] = ( - new_blocks.get_block_ids()) + if vllm_version_is("0.10.1.1"): + req_to_new_block_ids[request.request_id] = ( + new_blocks.get_block_ids()) + else: + req_to_new_blocks[request.request_id] = new_blocks num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens req_index += 1 @@ -346,16 +362,27 @@ def skip_cur_request(): any_request, len(self.running))) # Construct the scheduler output. - new_reqs_data = [ - NewRequestData.from_request(req, - req_to_new_block_ids[req.request_id]) - for req in scheduled_new_reqs - ] - - cached_reqs_data = self._make_cached_request_data( - scheduled_running_reqs, scheduled_resumed_reqs, - num_scheduled_tokens, scheduled_spec_decode_tokens, - req_to_new_block_ids) + if vllm_version_is("0.10.1.1"): + new_reqs_data = [ + NewRequestData.from_request( + req, req_to_new_block_ids[req.request_id]) + for req in scheduled_new_reqs + ] + cached_reqs_data = self._make_cached_request_data( + scheduled_running_reqs, scheduled_resumed_reqs, + num_scheduled_tokens, scheduled_spec_decode_tokens, + req_to_new_block_ids) + else: + new_reqs_data = [ + NewRequestData.from_request( + req, req_to_new_blocks[req.request_id].get_block_ids()) + for req in scheduled_new_reqs + ] + + cached_reqs_data = self._make_cached_request_data( + scheduled_running_reqs, scheduled_resumed_reqs, + num_scheduled_tokens, scheduled_spec_decode_tokens, + req_to_new_blocks) scheduled_cached_reqs = cached_reqs_data scheduler_output = SchedulerOutput( diff --git a/vllm_ascend/sample/sampler.py b/vllm_ascend/sample/sampler.py index c082f988adf..d0e015480f0 100644 --- a/vllm_ascend/sample/sampler.py +++ b/vllm_ascend/sample/sampler.py @@ -3,7 +3,12 @@ from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler, random_sample from vllm.v1.sample.sampler import Sampler -from vllm_ascend.utils import is_310p +from vllm_ascend.utils import is_310p, vllm_version_is + +if not vllm_version_is("0.10.1.1"): + from vllm.config import LogprobsMode +else: + LogprobsMode = None class AscendSampler(Sampler): @@ -60,6 +65,18 @@ def _apply_top_k_top_p( def forward_native(self, logits, generators, k, p): """Override pytorch native implementation to torch_npu""" + logits = self.apply_top_k_top_p(logits, k, p) + logits_to_return 
= None + if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS: + logits_to_return = logits + elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS: + logits_to_return = logits.log_softmax(dim=-1, dtype=torch.float32) + logits = self._apply_top_k_top_p(logits, k, p) probs = logits.softmax(dim=-1, dtype=torch.float32) - return random_sample(probs, generators) + output = None + if vllm_version_is("0.10.1.1"): + output = random_sample(probs, generators) + else: + output = (random_sample(probs, generators), logits_to_return) + return output From ede6c81ee4311971b87b1db1c16e3c2f723818f3 Mon Sep 17 00:00:00 2001 From: MengqingCao Date: Thu, 21 Aug 2025 07:33:56 +0000 Subject: [PATCH 5/8] fix AscendTopKTopPSampler Signed-off-by: MengqingCao --- vllm_ascend/sample/sampler.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/vllm_ascend/sample/sampler.py b/vllm_ascend/sample/sampler.py index d0e015480f0..086cae0a832 100644 --- a/vllm_ascend/sample/sampler.py +++ b/vllm_ascend/sample/sampler.py @@ -65,14 +65,16 @@ def _apply_top_k_top_p( def forward_native(self, logits, generators, k, p): """Override pytorch native implementation to torch_npu""" - logits = self.apply_top_k_top_p(logits, k, p) - logits_to_return = None - if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS: - logits_to_return = logits - elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS: - logits_to_return = logits.log_softmax(dim=-1, dtype=torch.float32) - logits = self._apply_top_k_top_p(logits, k, p) + if not vllm_version_is("0.10.1.1"): + + logits_to_return = None + if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS: + logits_to_return = logits + elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS: + logits_to_return = logits.log_softmax(dim=-1, + dtype=torch.float32) + probs = logits.softmax(dim=-1, dtype=torch.float32) output = None if vllm_version_is("0.10.1.1"): From ed14490cf4c489c20c6b75b4c9fd8f3894e592ec Mon Sep 17 00:00:00 2001 From: MengqingCao Date: Thu, 21 Aug 2025 07:43:24 +0000 Subject: [PATCH 6/8] fix modeloutput Signed-off-by: MengqingCao --- tests/ut/kv_connector/utils.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/tests/ut/kv_connector/utils.py b/tests/ut/kv_connector/utils.py index 9dc6dfc8309..c2e0a1f955e 100644 --- a/tests/ut/kv_connector/utils.py +++ b/tests/ut/kv_connector/utils.py @@ -200,12 +200,26 @@ def create_model_runner_output( kv_connector_output = KVConnectorOutput(finished_sending=finished_sending, finished_recving=finished_recving) extra_args = {"kv_connector_output": kv_connector_output} - return ModelRunnerOutput( - req_ids=req_ids, - req_id_to_index=req_id_to_index, - sampled_token_ids=sampled_token_ids, - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[], - **extra_args, - ) + if vllm_version_is("0.10.1.1"): + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_id_to_index, + sampled_token_ids=sampled_token_ids, + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[], + **extra_args, + ) + else: + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_id_to_index, + sampled_token_ids=sampled_token_ids, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[], + **extra_args, + ) + + return model_runner_output From 95b640e2617905fa96339c061a65ee40a842190d Mon Sep 17 00:00:00 2001 From: MengqingCao Date: Thu, 21 Aug 2025 08:51:30 +0000 Subject: [PATCH 7/8] fix ascendsampler 
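On current vLLM main, Sampler expects logprobs_mode to be the LogprobsMode enum from vllm.config, while v0.10.1.1 still passes the plain string "raw_logprobs", so hard-coding the string breaks the newer signature. The diff below selects a matching default once at import time; its gist (restated as a sketch, with vllm_version_is taken from vllm_ascend.utils as in the earlier sampler patch):

from vllm_ascend.utils import vllm_version_is

if not vllm_version_is("0.10.1.1"):
    from vllm.config import LogprobsMode
    DEFAULT_LOGPROBS_MODE = LogprobsMode.RAW_LOGPROBS   # enum default on newer vLLM
else:
    LogprobsMode = None                                 # enum not used on v0.10.1.1
    DEFAULT_LOGPROBS_MODE = "raw_logprobs"              # plain-string default on v0.10.1.1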
Signed-off-by: MengqingCao --- vllm_ascend/sample/sampler.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/sample/sampler.py b/vllm_ascend/sample/sampler.py index 086cae0a832..d3f1ae9ceae 100644 --- a/vllm_ascend/sample/sampler.py +++ b/vllm_ascend/sample/sampler.py @@ -7,13 +7,15 @@ if not vllm_version_is("0.10.1.1"): from vllm.config import LogprobsMode + DEFAULT_LOGPROBS_MODE = LogprobsMode.RAW_LOGPROBS else: LogprobsMode = None + DEFAULT_LOGPROBS_MODE = "raw_logprobs" class AscendSampler(Sampler): - def __init__(self, logprobs_mode="raw_logprobs"): + def __init__(self, logprobs_mode=DEFAULT_LOGPROBS_MODE): # TODO: support logprobs_mode in vllm-ascend super().__init__(logprobs_mode=logprobs_mode) self.topk_topp_sampler = AscendTopKTopPSampler() From 069d9aefa96e813043a27c26164f6a7e841892e2 Mon Sep 17 00:00:00 2001 From: MengqingCao Date: Thu, 21 Aug 2025 11:00:06 +0000 Subject: [PATCH 8/8] remove dbo Signed-off-by: MengqingCao --- .github/workflows/vllm_ascend_test.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index 6e3aff0d154..78cfefae3fa 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -278,7 +278,6 @@ jobs: # To avoid oom, we need to run the test in a single process. pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ - pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_dbo pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeekV3_dbo pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_alltoallv pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC
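Taken together, patches 4, 5 and 7 leave AscendTopKTopPSampler.forward_native with two contracts: on v0.10.1.1 it returns only the sampled token IDs, while on current main it returns a (sampled, logits_to_return) pair whose second element depends on logprobs_mode. A condensed, standalone sketch of that behaviour (the enum and random_sample below are simplified stand-ins for vllm.config.LogprobsMode and vLLM's sampling helper, and the top-k/top-p filtering step is omitted):

import enum

import torch


class LogprobsMode(enum.Enum):
    # Simplified stand-in for vllm.config.LogprobsMode on newer vLLM.
    RAW_LOGPROBS = "raw_logprobs"
    PROCESSED_LOGITS = "processed_logits"
    PROCESSED_LOGPROBS = "processed_logprobs"


def random_sample(probs: torch.Tensor) -> torch.Tensor:
    # Simplified stand-in for vLLM's random_sample (ignores per-request generators).
    return torch.multinomial(probs, num_samples=1).squeeze(-1)


def forward_native_sketch(logits: torch.Tensor, logprobs_mode, on_new_vllm: bool):
    # Mirrors the version-gated return shape; top-k/top-p filtering of `logits`
    # would happen before this point in the real sampler.
    logits_to_return = None
    if on_new_vllm:
        if logprobs_mode == LogprobsMode.PROCESSED_LOGITS:
            logits_to_return = logits
        elif logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS:
            logits_to_return = logits.log_softmax(dim=-1, dtype=torch.float32)

    probs = logits.softmax(dim=-1, dtype=torch.float32)
    sampled = random_sample(probs)
    # v0.10.1.1 callers expect bare samples; newer vLLM expects a pair.
    return (sampled, logits_to_return) if on_new_vllm else sampled


if __name__ == "__main__":
    x = torch.randn(2, 8)
    print(forward_native_sketch(x, "raw_logprobs", on_new_vllm=False))
    print(forward_native_sketch(x, LogprobsMode.RAW_LOGPROBS, on_new_vllm=True))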