From 8823330a6920b0a272ad109494eca89070637f2b Mon Sep 17 00:00:00 2001 From: MengqingCao Date: Thu, 21 Aug 2025 01:49:55 +0000 Subject: [PATCH 1/8] [CI] fix ci Signed-off-by: MengqingCao --- .github/workflows/accuracy_test.yaml | 8 ++++---- .github/workflows/format_pr_body.yaml | 2 +- .github/workflows/image_310p_openeuler.yml | 2 +- .github/workflows/image_310p_ubuntu.yml | 2 +- .github/workflows/image_a3_openeuler.yml | 2 +- .github/workflows/image_a3_ubuntu.yml | 2 +- .github/workflows/image_openeuler.yml | 2 +- .github/workflows/image_ubuntu.yml | 2 +- .github/workflows/nightly_benchmarks.yaml | 4 ++-- .github/workflows/pre-commit.yml | 4 ++-- .github/workflows/vllm_ascend_doctest.yaml | 2 +- .github/workflows/vllm_ascend_test.yaml | 4 ++-- .github/workflows/vllm_ascend_test_310p.yaml | 4 ++-- .github/workflows/vllm_ascend_test_pd.yaml | 4 ++-- .../test_offline_inference_distributed.py | 20 ------------------- 15 files changed, 22 insertions(+), 42 deletions(-) diff --git a/.github/workflows/accuracy_test.yaml b/.github/workflows/accuracy_test.yaml index 1f0350dc41b..044c5dcfd00 100644 --- a/.github/workflows/accuracy_test.yaml +++ b/.github/workflows/accuracy_test.yaml @@ -88,7 +88,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v4 - name: Set model name as output id: set_output @@ -109,7 +109,7 @@ jobs: apt-get -y install gcc g++ cmake libnuma-dev - name: Checkout vllm-project/vllm repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: repository: vllm-project/vllm ref: v0.10.0 @@ -138,7 +138,7 @@ jobs: echo "GHA_VLLM_ASCEND_VERSION=$RESOLVED_VERSION" >> $GITHUB_ENV - name: Checkout vllm-project/vllm-ascend repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: repository: vllm-project/vllm-ascend path: ./vllm-ascend @@ -236,7 +236,7 @@ jobs: UPSTREAM_REPO: vllm-project/vllm-ascend steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: repository: vllm-ascend-ci/vllm-ascend token: ${{ secrets.PAT_TOKEN }} diff --git a/.github/workflows/format_pr_body.yaml b/.github/workflows/format_pr_body.yaml index e50395cf72b..dedf7a4847c 100644 --- a/.github/workflows/format_pr_body.yaml +++ b/.github/workflows/format_pr_body.yaml @@ -34,7 +34,7 @@ jobs: steps: - name: Checkout vllm-project/vllm repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: repository: vllm-project/vllm path: ./vllm-empty diff --git a/.github/workflows/image_310p_openeuler.yml b/.github/workflows/image_310p_openeuler.yml index 9339c9e4b24..e6062a81233 100644 --- a/.github/workflows/image_310p_openeuler.yml +++ b/.github/workflows/image_310p_openeuler.yml @@ -53,7 +53,7 @@ jobs: 'ubuntu-24.04-arm' }} steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v4 - name: Print run: | diff --git a/.github/workflows/image_310p_ubuntu.yml b/.github/workflows/image_310p_ubuntu.yml index 86ca73f1a5a..0e9444fa50a 100644 --- a/.github/workflows/image_310p_ubuntu.yml +++ b/.github/workflows/image_310p_ubuntu.yml @@ -49,7 +49,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v4 - name: Print run: | diff --git a/.github/workflows/image_a3_openeuler.yml b/.github/workflows/image_a3_openeuler.yml index 3eda8dd7ecc..a10ad1c89f9 100644 --- a/.github/workflows/image_a3_openeuler.yml +++ b/.github/workflows/image_a3_openeuler.yml @@ -53,7 +53,7 @@ jobs: 'ubuntu-24.04-arm' }} steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v4 - name: Print run: | 
diff --git a/.github/workflows/image_a3_ubuntu.yml b/.github/workflows/image_a3_ubuntu.yml index 7a6506cb391..61160150ac6 100644 --- a/.github/workflows/image_a3_ubuntu.yml +++ b/.github/workflows/image_a3_ubuntu.yml @@ -49,7 +49,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v4 - name: Print run: | diff --git a/.github/workflows/image_openeuler.yml b/.github/workflows/image_openeuler.yml index 22ea1dc4a04..d4e69a55a4d 100644 --- a/.github/workflows/image_openeuler.yml +++ b/.github/workflows/image_openeuler.yml @@ -52,7 +52,7 @@ jobs: 'ubuntu-24.04-arm' }} steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v4 - name: Print run: | diff --git a/.github/workflows/image_ubuntu.yml b/.github/workflows/image_ubuntu.yml index b70875066dd..1c2ddcdffb4 100644 --- a/.github/workflows/image_ubuntu.yml +++ b/.github/workflows/image_ubuntu.yml @@ -49,7 +49,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v4 - name: Print run: | diff --git a/.github/workflows/nightly_benchmarks.yaml b/.github/workflows/nightly_benchmarks.yaml index 64dadf2c7fe..8a434813776 100644 --- a/.github/workflows/nightly_benchmarks.yaml +++ b/.github/workflows/nightly_benchmarks.yaml @@ -97,12 +97,12 @@ jobs: git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/ - name: Checkout vllm-project/vllm-ascend repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Checkout vllm-project/vllm repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: repository: vllm-project/vllm path: ./vllm-empty diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 1564bd7ab5b..e41dd6e634e 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -11,14 +11,14 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout vllm-project/vllm-ascend repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: "3.11" - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" - run: echo "::add-matcher::.github/workflows/matchers/mypy.json" - name: Checkout vllm-project/vllm repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: repository: vllm-project/vllm path: ./vllm-empty diff --git a/.github/workflows/vllm_ascend_doctest.yaml b/.github/workflows/vllm_ascend_doctest.yaml index ffb552fc786..1b4faeacba8 100644 --- a/.github/workflows/vllm_ascend_doctest.yaml +++ b/.github/workflows/vllm_ascend_doctest.yaml @@ -66,7 +66,7 @@ jobs: git --no-pager log -1 || true - name: Checkout vllm-project/vllm-ascend repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 - name: Run vllm-ascend/tests/e2e/run_doctests.sh run: | diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index 7dbcb1863d4..c3159aa511f 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -89,7 +89,7 @@ jobs: apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev curl gnupg2 - name: Checkout vllm-project/vllm repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: repository: vllm-project/vllm ref: ${{ matrix.vllm_version }} @@ -102,7 +102,7 @@ jobs: python3 -m pip uninstall -y triton - name: Checkout vllm-project/vllm-ascend repo - uses: 
actions/checkout@v5 + uses: actions/checkout@v4 - name: Install vllm-project/vllm-ascend run: | diff --git a/.github/workflows/vllm_ascend_test_310p.yaml b/.github/workflows/vllm_ascend_test_310p.yaml index 52d6ddeffd3..a3d3cae94d2 100644 --- a/.github/workflows/vllm_ascend_test_310p.yaml +++ b/.github/workflows/vllm_ascend_test_310p.yaml @@ -77,7 +77,7 @@ jobs: apt install git -y - name: Checkout vllm-project/vllm-ascend repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 - name: Install system dependencies run: | @@ -85,7 +85,7 @@ jobs: apt-get -y install git vim wget net-tools gcc g++ cmake libnuma-dev curl gnupg2 - name: Checkout vllm-project/vllm repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: repository: vllm-project/vllm ref: ${{ matrix.vllm_version }} diff --git a/.github/workflows/vllm_ascend_test_pd.yaml b/.github/workflows/vllm_ascend_test_pd.yaml index 2f21365829a..a86ba60a65f 100644 --- a/.github/workflows/vllm_ascend_test_pd.yaml +++ b/.github/workflows/vllm_ascend_test_pd.yaml @@ -80,7 +80,7 @@ jobs: git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/ - name: Checkout vllm-project/vllm-ascend repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 - name: Install system dependencies run: | @@ -88,7 +88,7 @@ jobs: apt-get -y install gcc g++ cmake libnuma-dev - name: Checkout vllm-project/vllm repo - uses: actions/checkout@v5 + uses: actions/checkout@v4 with: repository: vllm-project/vllm ref: ${{ matrix.vllm_verison }} diff --git a/tests/e2e/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py index e869c2d5992..f7354abe916 100644 --- a/tests/e2e/multicard/test_offline_inference_distributed.py +++ b/tests/e2e/multicard/test_offline_inference_distributed.py @@ -78,26 +78,6 @@ def test_models_distributed_DeepSeek_multistream_moe(): vllm_model.generate_greedy(example_prompts, max_tokens) -@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DBO": "1"}) -def test_models_distributed_DeepSeek_dbo(): - example_prompts = ["The president of the United States is"] * 41 - dtype = "half" - sampling_params = SamplingParams(max_tokens=100, temperature=0.0) - with VllmRunner( - "deepseek-ai/DeepSeek-V2-Lite", - dtype=dtype, - tensor_parallel_size=2, - distributed_executor_backend="mp", - ) as vllm_model: - model_arch = 'DeepseekV2ForCausalLM' - registed_models = ModelRegistry.models - assert registed_models[ - model_arch].module_name == "vllm_ascend.models.deepseek_dbo" - assert registed_models[ - model_arch].class_name == "CustomDeepseekDBOForCausalLM" - vllm_model.generate(example_prompts, sampling_params) - - @pytest.mark.skip( reason= "deepseek dbo dose not consider the support on half precision float, will enable this ut after we actually support it" From b643a2891f26679ebede21018a1b56faf46c11c9 Mon Sep 17 00:00:00 2001 From: MengqingCao Date: Thu, 21 Aug 2025 03:39:42 +0000 Subject: [PATCH 2/8] some fixes - fix kvcache block changes - maintain v0.10.1.1 Signed-off-by: MengqingCao --- .github/workflows/vllm_ascend_test.yaml | 6 +-- .github/workflows/vllm_ascend_test_310p.yaml | 2 +- Dockerfile | 2 +- Dockerfile.310p | 2 +- Dockerfile.310p.openEuler | 2 +- Dockerfile.a3 | 2 +- Dockerfile.a3.openEuler | 2 +- Dockerfile.openEuler | 2 +- vllm_ascend/models/qwen3_moe.py | 3 +- vllm_ascend/worker/model_runner_v1.py | 57 ++++++++++++++------ 10 files changed, 52 insertions(+), 28 deletions(-) diff --git 
a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index c3159aa511f..6e3aff0d154 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -81,7 +81,7 @@ jobs: VLLM_USE_MODELSCOPE: True strategy: matrix: - vllm_version: [main] + vllm_version: [v0.10.1.1, main] steps: - name: Install packages run: | @@ -137,7 +137,7 @@ jobs: max-parallel: 2 matrix: os: [linux-aarch64-a2-1] - vllm_version: [main] + vllm_version: [v0.10.1.1, main] name: singlecard e2e test runs-on: ${{ matrix.os }} container: @@ -219,7 +219,7 @@ jobs: max-parallel: 2 matrix: os: [linux-aarch64-a2-2] - vllm_version: [main] + vllm_version: [v0.10.1.1, main] name: multicard e2e test runs-on: ${{ matrix.os }} container: diff --git a/.github/workflows/vllm_ascend_test_310p.yaml b/.github/workflows/vllm_ascend_test_310p.yaml index a3d3cae94d2..9d4a9709dda 100644 --- a/.github/workflows/vllm_ascend_test_310p.yaml +++ b/.github/workflows/vllm_ascend_test_310p.yaml @@ -53,7 +53,7 @@ jobs: max-parallel: 2 matrix: os: [linux-aarch64-310p-1, linux-aarch64-310p-4] - vllm_version: [main] + vllm_version: [v0.10.1.1, main] name: 310p e2e test runs-on: ${{ matrix.os }} container: diff --git a/Dockerfile b/Dockerfile index a12df1e0b73..29d6445ec04 100644 --- a/Dockerfile +++ b/Dockerfile @@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.10.0 +ARG VLLM_TAG=v0.10.1.1 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.310p b/Dockerfile.310p index 299624c5417..4eb3c63a81f 100644 --- a/Dockerfile.310p +++ b/Dockerfile.310p @@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.10.0 +ARG VLLM_TAG=v0.10.1.1 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.310p.openEuler b/Dockerfile.310p.openEuler index ff7ec05a430..0e76ba37faf 100644 --- a/Dockerfile.310p.openEuler +++ b/Dockerfile.310p.openEuler @@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.10.0 +ARG VLLM_TAG=v0.10.1.1 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. diff --git a/Dockerfile.a3 b/Dockerfile.a3 index da1efcc41b2..8bdfb0e2d9f 100644 --- a/Dockerfile.a3 +++ b/Dockerfile.a3 @@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.10.0 +ARG VLLM_TAG=v0.10.1.1 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. 
But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.a3.openEuler b/Dockerfile.a3.openEuler index b03851ca652..afaf11dabe0 100644 --- a/Dockerfile.a3.openEuler +++ b/Dockerfile.a3.openEuler @@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.10.0 +ARG VLLM_TAG=v0.10.1.1 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler index 1146d0a00ab..b744b3325fd 100644 --- a/Dockerfile.openEuler +++ b/Dockerfile.openEuler @@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.10.0 +ARG VLLM_TAG=v0.10.1.1 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. diff --git a/vllm_ascend/models/qwen3_moe.py b/vllm_ascend/models/qwen3_moe.py index 29ab6755250..4ee41eba17b 100644 --- a/vllm_ascend/models/qwen3_moe.py +++ b/vllm_ascend/models/qwen3_moe.py @@ -253,7 +253,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): quant_config = vllm_config.quant_config parallel_config = vllm_config.parallel_config - self.num_redundant_experts = parallel_config.num_redundant_experts + eplb_config = parallel_config.eplb_config + self.num_redundant_experts = eplb_config.num_redundant_experts self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size self.config = config diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index b55cc1395ae..39b29caeff2 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -64,8 +64,8 @@ from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheSpec) -from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, DraftTokenIds, - LogprobsTensors, ModelRunnerOutput) +from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors, + ModelRunnerOutput) from vllm.v1.pool.metadata import PoolingMetadata from vllm.v1.sample.logits_processor import build_logitsprocs from vllm.v1.sample.metadata import SamplingMetadata @@ -94,11 +94,17 @@ from vllm_ascend.torchair.torchair_attention import AscendTorchairMetadata from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ, ProfileExecuteDuration, is_310p, - maybe_converting_weight_acl_format) + maybe_converting_weight_acl_format, + vllm_version_is) from vllm_ascend.worker.eagle_proposer_v1 import EagleProposer from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch +if not vllm_version_is("0.10.1.1"): + from vllm.v1.outputs import DraftTokenIds +else: + DraftTokenIds = None + if TYPE_CHECKING: import xgrammar as xgr # type: ignore[import-untyped] from vllm.v1.core.sched.output import SchedulerOutput @@ -513,11 +519,13 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Update the block IDs. 
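Note on the change below: on current vLLM main the scheduler output may report new_block_ids as None for a running request that was scheduled without any additional KV-cache blocks, so the runner now only extends the per-group block lists when new IDs are actually present (and still requires them when resuming from preemption). A minimal standalone sketch of that control flow, using illustrative names rather than the real vllm-ascend structures:

from typing import Optional


def merge_block_ids(block_table: list[list[int]],
                    new_block_ids: Optional[list[list[int]]],
                    resumed_from_preemption: bool) -> list[list[int]]:
    # Mirrors the guarded update in _update_states (sketch; assumes one inner
    # list of block IDs per KV-cache group).
    if not resumed_from_preemption:
        if new_block_ids is not None:
            # Append the newly allocated blocks to each group's existing list.
            for block_ids, new_ids in zip(block_table, new_block_ids):
                block_ids.extend(new_ids)
        # new_block_ids may legitimately be None here: nothing to append.
        return block_table
    # Resumed after preemption: the scheduler must resend the full block list.
    assert new_block_ids is not None
    return new_block_ids


if __name__ == "__main__":
    table = [[0, 1], [7]]
    print(merge_block_ids(table, [[2], [8]], resumed_from_preemption=False))  # [[0, 1, 2], [7, 8]]
    print(merge_block_ids(table, None, resumed_from_preemption=False))        # unchanged
    print(merge_block_ids(table, [[9], [10]], resumed_from_preemption=True))  # replaced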
if not resumed_from_preemption: - # Append the new blocks to the existing block IDs. - for block_ids, new_ids in zip(req_state.block_ids, - new_block_ids): - block_ids.extend(new_ids) + if new_block_ids is not None: + # Append the new blocks to the existing block IDs. + for block_ids, new_ids in zip(req_state.block_ids, + new_block_ids): + block_ids.extend(new_ids) else: + assert new_block_ids is not None # The request is resumed from preemption. # Replace the existing block IDs with the new ones. req_state.block_ids = new_block_ids @@ -533,7 +541,9 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Update the persistent batch. self.input_batch.num_computed_tokens_cpu[req_index] = ( num_computed_tokens) - self.input_batch.block_table.append_row(new_block_ids, req_index) + if new_block_ids is not None: + self.input_batch.block_table.append_row( + new_block_ids, req_index) # For the last rank, we don't need to update the token_ids_cpu # because the sampled tokens are already cached. @@ -1528,6 +1538,7 @@ def _pool( req_ids=self.input_batch.req_ids, req_id_to_index=self.input_batch.req_id_to_index, sampled_token_ids=[], + spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, pooler_output=pooler_output, @@ -1754,15 +1765,27 @@ def execute_model( extra_args = ({"kv_connector_output": kv_connector_output}) - model_runner_output = ModelRunnerOutput( - req_ids=self.input_batch.req_ids, - req_id_to_index=self.input_batch.req_id_to_index, - sampled_token_ids=valid_sampled_token_ids, - logprobs=logprobs_lists, - prompt_logprobs_dict=prompt_logprobs_dict, - pooler_output=[], - **extra_args, - ) + if vllm_version_is("0.10.1.1"): + model_runner_output = ModelRunnerOutput( + req_ids=self.input_batch.req_ids, + req_id_to_index=self.input_batch.req_id_to_index, + sampled_token_ids=valid_sampled_token_ids, + logprobs=logprobs_lists, + spec_token_ids=self._draft_token_ids, + prompt_logprobs_dict=prompt_logprobs_dict, + pooler_output=[], + **extra_args, + ) + else: + model_runner_output = ModelRunnerOutput( + req_ids=self.input_batch.req_ids, + req_id_to_index=self.input_batch.req_id_to_index, + sampled_token_ids=valid_sampled_token_ids, + logprobs=logprobs_lists, + prompt_logprobs_dict=prompt_logprobs_dict, + pooler_output=[], + **extra_args, + ) durations = ProfileExecuteDuration().pop_captured_sync() if durations: From 4988427ee7f5d890ca96d79631c9d208ddaba40d Mon Sep 17 00:00:00 2001 From: MengqingCao Date: Thu, 21 Aug 2025 05:19:22 +0000 Subject: [PATCH 3/8] fix ut Signed-off-by: MengqingCao --- tests/ut/core/test_scheduler.py | 294 ++++++++++++++++++-------- vllm_ascend/models/qwen3_moe.py | 8 +- vllm_ascend/worker/model_runner_v1.py | 33 ++- 3 files changed, 230 insertions(+), 105 deletions(-) diff --git a/tests/ut/core/test_scheduler.py b/tests/ut/core/test_scheduler.py index 78b0c65f0cd..6680a258c19 100644 --- a/tests/ut/core/test_scheduler.py +++ b/tests/ut/core/test_scheduler.py @@ -13,7 +13,7 @@ from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec) -from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput +from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus from vllm.v1.structured_output import StructuredOutputManager @@ -21,6 +21,11 @@ from vllm_ascend.core.scheduler import AscendScheduler from vllm_ascend.utils import vllm_version_is +if not vllm_version_is("0.10.1.1"): + from vllm.v1.outputs import DraftTokenIds +else: + 
DraftTokenIds = None + EOS_TOKEN_ID = 50256 MODEL = "Qwen3-0.6B" ENABLE_PREFIX_CACHING = None @@ -66,16 +71,33 @@ def create_requests( def make_output(scheduler): - return ModelRunnerOutput( - req_ids=[req.request_id for req in scheduler.running], - req_id_to_index={ - req.request_id: i - for i, req in enumerate(scheduler.running) - }, - sampled_token_ids=[[1000]] * len(scheduler.running), - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[]) + req_ids = [req.request_id for req in scheduler.running] + req_id_to_index = { + req.request_id: i + for i, req in enumerate(scheduler.running) + } + sampled_token_ids = [[1000]] * len(scheduler.running) + logprobs = None + if vllm_version_is("0.10.1.1"): + modelrunner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_id_to_index, + sampled_token_ids=sampled_token_ids, + spec_token_ids=None, + logprobs=logprobs, + prompt_logprobs_dict={}, + pooler_output=[], + ) + else: + modelrunner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_id_to_index, + sampled_token_ids=sampled_token_ids, + logprobs=logprobs, + prompt_logprobs_dict={}, + pooler_output=[], + ) + return modelrunner_output class TestAscendScheduler(TestBase): @@ -271,8 +293,7 @@ def test_stop_via_update_from_output(self): req.num_computed_tokens = req.num_tokens scheduler.requests[req.request_id] = req scheduler.running.append(req) - if not vllm_version_is("0.9.2"): - req.status = RequestStatus.RUNNING + req.status = RequestStatus.RUNNING scheduler_output = SchedulerOutput(scheduled_new_reqs=[], scheduled_cached_reqs=[], @@ -291,18 +312,33 @@ def test_stop_via_update_from_output(self): free_encoder_input_ids=[], structured_output_request_ids={}, grammar_bitmask=None) - - model_output = ModelRunnerOutput( - req_ids=[req.request_id for req in requests], - req_id_to_index={ - req.request_id: i - for i, req in enumerate(requests) - }, - sampled_token_ids=[[EOS_TOKEN_ID], [10, 11] - ], # First request hits EOS, second continues - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[]) + if vllm_version_is("0.10.1.1"): + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(requests) + }, + sampled_token_ids=[[EOS_TOKEN_ID], [ + 10, 11 + ]], # First request hits EOS, second continues + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + else: + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(requests) + }, + sampled_token_ids=[[EOS_TOKEN_ID], [ + 10, 11 + ]], # First request hits EOS, second continues + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) scheduler.update_from_output(scheduler_output, model_output) @@ -325,8 +361,7 @@ def test_stop_via_update_from_output(self): req.num_computed_tokens = req.num_tokens scheduler.requests[req.request_id] = req scheduler.running.append(req) - if not vllm_version_is("0.9.2"): - req.status = RequestStatus.RUNNING + req.status = RequestStatus.RUNNING scheduler_output = SchedulerOutput(scheduled_new_reqs=[], scheduled_cached_reqs=[], @@ -346,18 +381,31 @@ def test_stop_via_update_from_output(self): free_encoder_input_ids=[], structured_output_request_ids={}, grammar_bitmask=None) - - model_output = ModelRunnerOutput( - req_ids=[req.request_id for req in requests], - req_id_to_index={ - req.request_id: i - for i, req in enumerate(requests) - }, - 
sampled_token_ids=[[10, 42, 12], - [13, 14]], # First request hits stop token - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[]) + if vllm_version_is("0.10.1.1"): + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(requests) + }, + sampled_token_ids=[[10, 42, 12], + [13, 14]], # First request hits stop token + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + else: + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(requests) + }, + sampled_token_ids=[[10, 42, 12], + [13, 14]], # First request hits stop token + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) scheduler.update_from_output(scheduler_output, model_output) @@ -379,8 +427,7 @@ def test_stop_via_update_from_output(self): req.num_computed_tokens = req.num_tokens scheduler.requests[req.request_id] = req scheduler.running.append(req) - if not vllm_version_is("0.9.2"): - req.status = RequestStatus.RUNNING + req.status = RequestStatus.RUNNING scheduler_output = SchedulerOutput(scheduled_new_reqs=[], scheduled_cached_reqs=[], @@ -401,18 +448,31 @@ def test_stop_via_update_from_output(self): structured_output_request_ids={}, grammar_bitmask=None) - model_output = ModelRunnerOutput( - req_ids=[req.request_id for req in requests], - req_id_to_index={ - req.request_id: i - for i, req in enumerate(requests) - }, - sampled_token_ids=[[10, 11, 12], - [13]], # First request exceeds max_tokens - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[]) - + if vllm_version_is("0.10.1.1"): + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(requests) + }, + sampled_token_ids=[[10, 11, 12], + [13]], # First request exceeds max_tokens + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + else: + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(requests) + }, + sampled_token_ids=[[10, 11, 12], + [13]], # First request exceeds max_tokens + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) scheduler.update_from_output(scheduler_output, model_output) # Verify first request stopped due to length @@ -448,13 +508,24 @@ def test_stop_via_update_from_output(self): structured_output_request_ids={}, grammar_bitmask=None) - model_output = ModelRunnerOutput( - req_ids=[requests[0].request_id], - req_id_to_index={requests[0].request_id: 0}, - sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]], - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[]) + if vllm_version_is("0.10.1.1"): + model_output = ModelRunnerOutput( + req_ids=[requests[0].request_id], + req_id_to_index={requests[0].request_id: 0}, + sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + + else: + model_output = ModelRunnerOutput( + req_ids=[requests[0].request_id], + req_id_to_index={requests[0].request_id: 0}, + sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) scheduler.update_from_output(scheduler_output, model_output) @@ -505,13 +576,23 @@ def test_schedule_concurrent_batches(self): 512) # Model output of the first request. 
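The same version split recurs for every ModelRunnerOutput these tests construct: v0.10.1.1 still accepts a spec_token_ids argument, while current main dropped it and reports draft tokens through the separate DraftTokenIds message. One way the duplication could be collapsed is a small kwargs helper along these lines (a sketch only; the version stub stands in for vllm_ascend.utils.vllm_version_is and assumes an exact match against the installed vLLM release):

import importlib.metadata


def vllm_version_is(version: str) -> bool:
    # Stand-in for vllm_ascend.utils.vllm_version_is (assumption: exact-match
    # comparison against the installed vLLM version string).
    try:
        return importlib.metadata.version("vllm") == version
    except importlib.metadata.PackageNotFoundError:
        return False


def runner_output_kwargs(req_ids, req_id_to_index, sampled_token_ids,
                         logprobs=None, prompt_logprobs_dict=None,
                         pooler_output=None, spec_token_ids=None, **extra):
    # Build the keyword arguments shared by both vLLM versions, then add the
    # legacy-only field when running against v0.10.1.1.
    kwargs = dict(req_ids=req_ids,
                  req_id_to_index=req_id_to_index,
                  sampled_token_ids=sampled_token_ids,
                  logprobs=logprobs,
                  prompt_logprobs_dict=prompt_logprobs_dict or {},
                  pooler_output=pooler_output or [],
                  **extra)
    if vllm_version_is("0.10.1.1"):
        kwargs["spec_token_ids"] = spec_token_ids
    return kwargs

Call sites would then reduce to ModelRunnerOutput(**runner_output_kwargs(...)), keeping the spec-token handling in one place; tests that pass draft tokens on the old version would supply spec_token_ids explicitly.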
- model_runner_output = ModelRunnerOutput( - req_ids=[requests[0].request_id], - req_id_to_index={requests[0].request_id: 0}, - sampled_token_ids=[[0]], - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[]) + if vllm_version_is("0.10.1.1"): + model_runner_output = ModelRunnerOutput( + req_ids=[requests[0].request_id], + req_id_to_index={requests[0].request_id: 0}, + sampled_token_ids=[[0]], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + else: + model_runner_output = ModelRunnerOutput( + req_ids=[requests[0].request_id], + req_id_to_index={requests[0].request_id: 0}, + sampled_token_ids=[[0]], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) scheduler.update_from_output(scheduler_output0, model_runner_output) @@ -521,13 +602,23 @@ def test_schedule_concurrent_batches(self): # request is still running. scheduler.schedule() # Model output of the second request. - model_runner_output = ModelRunnerOutput( - req_ids=[requests[1].request_id], - req_id_to_index={requests[1].request_id: 0}, - sampled_token_ids=[[0]], - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[]) + if vllm_version_is("0.10.1.1"): + model_runner_output = ModelRunnerOutput( + req_ids=[requests[1].request_id], + req_id_to_index={requests[1].request_id: 0}, + sampled_token_ids=[[0]], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + else: + model_runner_output = ModelRunnerOutput( + req_ids=[requests[1].request_id], + req_id_to_index={requests[1].request_id: 0}, + sampled_token_ids=[[0]], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) scheduler.update_from_output(scheduler_output1, model_runner_output) @@ -579,19 +670,29 @@ def test_schedule_spec_decoding_stats(self): req_id = requests[i].request_id self.assertEqual(output.num_scheduled_tokens[req_id], 1) self.assertNotIn(req_id, output.scheduled_spec_decode_tokens) - - model_runner_output = ModelRunnerOutput( - req_ids=req_ids, - req_id_to_index=req_to_index, - sampled_token_ids=[[0] for _ in range(len(requests))], - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[]) - draft_token_ids = DraftTokenIds(req_ids, spec_tokens) + if vllm_version_is("0.10.1.1"): + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_to_index, + sampled_token_ids=[[0] for _ in range(len(requests))], + logprobs=None, + prompt_logprobs_dict={}, + spec_token_ids=spec_tokens, + pooler_output=[]) + else: + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_to_index, + sampled_token_ids=[[0] for _ in range(len(requests))], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + draft_token_ids = DraftTokenIds(req_ids, spec_tokens) engine_core_outputs = scheduler.update_from_output( output, model_runner_output) - scheduler.update_draft_token_ids(draft_token_ids) + if not vllm_version_is("0.10.1.1"): + scheduler.update_draft_token_ids(draft_token_ids) for i in range(len(requests)): running_req = scheduler.running[i] @@ -627,14 +728,23 @@ def test_schedule_spec_decoding_stats(self): else: self.assertNotIn(req_id, output.scheduled_spec_decode_tokens) - - model_runner_output = ModelRunnerOutput( - req_ids=req_ids, - req_id_to_index=req_to_index, - sampled_token_ids=output_tokens, - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[]) + if vllm_version_is("0.10.1.1"): + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_to_index, + sampled_token_ids=output_tokens, 
+ spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + else: + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_to_index, + sampled_token_ids=output_tokens, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) engine_core_outputs = scheduler.update_from_output( output, model_runner_output) diff --git a/vllm_ascend/models/qwen3_moe.py b/vllm_ascend/models/qwen3_moe.py index 4ee41eba17b..0df83772b8a 100644 --- a/vllm_ascend/models/qwen3_moe.py +++ b/vllm_ascend/models/qwen3_moe.py @@ -50,6 +50,7 @@ from vllm_ascend.ops.fused_moe import AscendFusedMoE from vllm_ascend.ops.sequence_parallel import (MetadataForPadding, init_metadata_for_sp) +from vllm_ascend.utils import vllm_version_is class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock): @@ -253,8 +254,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): quant_config = vllm_config.quant_config parallel_config = vllm_config.parallel_config - eplb_config = parallel_config.eplb_config - self.num_redundant_experts = eplb_config.num_redundant_experts + if vllm_version_is("0.10.1.1"): + self.num_redundant_experts = parallel_config.num_redundant_experts + else: + eplb_config = parallel_config.eplb_config + self.num_redundant_experts = eplb_config.num_redundant_experts self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size self.config = config diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 39b29caeff2..963b54c7a9a 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1533,17 +1533,28 @@ def _pool( else: pooler_output.append(None) extra_args = ({"kv_connector_output": kv_connector_output}) - - return ModelRunnerOutput( - req_ids=self.input_batch.req_ids, - req_id_to_index=self.input_batch.req_id_to_index, - sampled_token_ids=[], - spec_token_ids=None, - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=pooler_output, - **extra_args, - ) + if vllm_version_is("0.10.1.1"): + modelrunner_output = ModelRunnerOutput( + req_ids=self.input_batch.req_ids, + req_id_to_index=self.input_batch.req_id_to_index, + sampled_token_ids=[], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=pooler_output, + **extra_args, + ) + else: + modelrunner_output = ModelRunnerOutput( + req_ids=self.input_batch.req_ids, + req_id_to_index=self.input_batch.req_id_to_index, + sampled_token_ids=[], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=pooler_output, + **extra_args, + ) + return modelrunner_output @torch.inference_mode() def execute_model( From a4c0367f5ec57ccf1519dd1274083bf2330ce86d Mon Sep 17 00:00:00 2001 From: MengqingCao Date: Thu, 21 Aug 2025 07:22:33 +0000 Subject: [PATCH 4/8] fix ascendscheduler and sampler Signed-off-by: MengqingCao --- vllm_ascend/core/scheduler.py | 57 ++++++++++++++++++++++++++--------- vllm_ascend/sample/sampler.py | 21 +++++++++++-- 2 files changed, 61 insertions(+), 17 deletions(-) diff --git a/vllm_ascend/core/scheduler.py b/vllm_ascend/core/scheduler.py index dfdc9aa863c..627d5ea8991 100644 --- a/vllm_ascend/core/scheduler.py +++ b/vllm_ascend/core/scheduler.py @@ -31,6 +31,13 @@ from vllm.v1.request import Request, RequestStatus from vllm.v1.structured_output import StructuredOutputManager +from vllm_ascend.utils import vllm_version_is + +if vllm_version_is("0.10.1.1"): + from vllm.v1.core.kv_cache_manager import KVCacheBlocks +else: + KVCacheBlocks = None + class 
AscendScheduler(Scheduler): """This Scheduler extends vllm's original v1 scheduler @@ -59,7 +66,10 @@ def schedule(self) -> SchedulerOutput: scheduled_running_reqs: list[Request] = [] preempted_reqs: list[Request] = [] - req_to_new_block_ids: dict[str, list[int]] = {} + if vllm_version_is("0.10.1.1"): + req_to_new_block_ids: dict[str, list[int]] = {} + else: + req_to_new_blocks: dict[str, KVCacheBlocks] = {} num_scheduled_tokens: dict[str, int] = {} token_budget = self.max_num_scheduled_tokens # Spec decode-related. @@ -217,8 +227,11 @@ def skip_cur_request(): if self.lora_config and request.lora_request: scheduled_loras.add(request.lora_request.lora_int_id) - req_to_new_block_ids[request.request_id] = ( - self.kv_cache_manager.get_block_ids(request.request_id)) + if vllm_version_is("0.10.1.1"): + req_to_new_block_ids[request.request_id] = ( + self.kv_cache_manager.get_block_ids(request.request_id)) + else: + req_to_new_blocks[request.request_id] = new_blocks # Update request info. num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens @@ -307,8 +320,11 @@ def skip_cur_request(): # Schedule the request. scheduled_running_reqs.append(request) self.scheduled_req_ids.add(request.request_id) - req_to_new_block_ids[request.request_id] = ( - new_blocks.get_block_ids()) + if vllm_version_is("0.10.1.1"): + req_to_new_block_ids[request.request_id] = ( + new_blocks.get_block_ids()) + else: + req_to_new_blocks[request.request_id] = new_blocks num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens req_index += 1 @@ -346,16 +362,27 @@ def skip_cur_request(): any_request, len(self.running))) # Construct the scheduler output. - new_reqs_data = [ - NewRequestData.from_request(req, - req_to_new_block_ids[req.request_id]) - for req in scheduled_new_reqs - ] - - cached_reqs_data = self._make_cached_request_data( - scheduled_running_reqs, scheduled_resumed_reqs, - num_scheduled_tokens, scheduled_spec_decode_tokens, - req_to_new_block_ids) + if vllm_version_is("0.10.1.1"): + new_reqs_data = [ + NewRequestData.from_request( + req, req_to_new_block_ids[req.request_id]) + for req in scheduled_new_reqs + ] + cached_reqs_data = self._make_cached_request_data( + scheduled_running_reqs, scheduled_resumed_reqs, + num_scheduled_tokens, scheduled_spec_decode_tokens, + req_to_new_block_ids) + else: + new_reqs_data = [ + NewRequestData.from_request( + req, req_to_new_blocks[req.request_id].get_block_ids()) + for req in scheduled_new_reqs + ] + + cached_reqs_data = self._make_cached_request_data( + scheduled_running_reqs, scheduled_resumed_reqs, + num_scheduled_tokens, scheduled_spec_decode_tokens, + req_to_new_blocks) scheduled_cached_reqs = cached_reqs_data scheduler_output = SchedulerOutput( diff --git a/vllm_ascend/sample/sampler.py b/vllm_ascend/sample/sampler.py index c082f988adf..d0e015480f0 100644 --- a/vllm_ascend/sample/sampler.py +++ b/vllm_ascend/sample/sampler.py @@ -3,7 +3,12 @@ from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler, random_sample from vllm.v1.sample.sampler import Sampler -from vllm_ascend.utils import is_310p +from vllm_ascend.utils import is_310p, vllm_version_is + +if not vllm_version_is("0.10.1.1"): + from vllm.config import LogprobsMode +else: + LogprobsMode = None class AscendSampler(Sampler): @@ -60,6 +65,18 @@ def _apply_top_k_top_p( def forward_native(self, logits, generators, k, p): """Override pytorch native implementation to torch_npu""" + logits = self.apply_top_k_top_p(logits, k, p) + logits_to_return 
= None + if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS: + logits_to_return = logits + elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS: + logits_to_return = logits.log_softmax(dim=-1, dtype=torch.float32) + logits = self._apply_top_k_top_p(logits, k, p) probs = logits.softmax(dim=-1, dtype=torch.float32) - return random_sample(probs, generators) + output = None + if vllm_version_is("0.10.1.1"): + output = random_sample(probs, generators) + else: + output = (random_sample(probs, generators), logits_to_return) + return output From ede6c81ee4311971b87b1db1c16e3c2f723818f3 Mon Sep 17 00:00:00 2001 From: MengqingCao Date: Thu, 21 Aug 2025 07:33:56 +0000 Subject: [PATCH 5/8] fix AscendTopKTopPSampler Signed-off-by: MengqingCao --- vllm_ascend/sample/sampler.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/vllm_ascend/sample/sampler.py b/vllm_ascend/sample/sampler.py index d0e015480f0..086cae0a832 100644 --- a/vllm_ascend/sample/sampler.py +++ b/vllm_ascend/sample/sampler.py @@ -65,14 +65,16 @@ def _apply_top_k_top_p( def forward_native(self, logits, generators, k, p): """Override pytorch native implementation to torch_npu""" - logits = self.apply_top_k_top_p(logits, k, p) - logits_to_return = None - if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS: - logits_to_return = logits - elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS: - logits_to_return = logits.log_softmax(dim=-1, dtype=torch.float32) - logits = self._apply_top_k_top_p(logits, k, p) + if not vllm_version_is("0.10.1.1"): + + logits_to_return = None + if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS: + logits_to_return = logits + elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS: + logits_to_return = logits.log_softmax(dim=-1, + dtype=torch.float32) + probs = logits.softmax(dim=-1, dtype=torch.float32) output = None if vllm_version_is("0.10.1.1"): From ed14490cf4c489c20c6b75b4c9fd8f3894e592ec Mon Sep 17 00:00:00 2001 From: MengqingCao Date: Thu, 21 Aug 2025 07:43:24 +0000 Subject: [PATCH 6/8] fix modeloutput Signed-off-by: MengqingCao --- tests/ut/kv_connector/utils.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/tests/ut/kv_connector/utils.py b/tests/ut/kv_connector/utils.py index 9dc6dfc8309..c2e0a1f955e 100644 --- a/tests/ut/kv_connector/utils.py +++ b/tests/ut/kv_connector/utils.py @@ -200,12 +200,26 @@ def create_model_runner_output( kv_connector_output = KVConnectorOutput(finished_sending=finished_sending, finished_recving=finished_recving) extra_args = {"kv_connector_output": kv_connector_output} - return ModelRunnerOutput( - req_ids=req_ids, - req_id_to_index=req_id_to_index, - sampled_token_ids=sampled_token_ids, - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[], - **extra_args, - ) + if vllm_version_is("0.10.1.1"): + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_id_to_index, + sampled_token_ids=sampled_token_ids, + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[], + **extra_args, + ) + else: + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_id_to_index, + sampled_token_ids=sampled_token_ids, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[], + **extra_args, + ) + + return model_runner_output From 95b640e2617905fa96339c061a65ee40a842190d Mon Sep 17 00:00:00 2001 From: MengqingCao Date: Thu, 21 Aug 2025 08:51:30 +0000 Subject: [PATCH 7/8] fix ascendsampler 
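On current vLLM main, Sampler expects logprobs_mode to be the LogprobsMode enum from vllm.config, while v0.10.1.1 still passes the plain string "raw_logprobs", so hard-coding the string breaks the newer signature. The diff below selects a matching default once at import time; its gist (restated as a sketch, with vllm_version_is taken from vllm_ascend.utils as in the earlier sampler patch):

from vllm_ascend.utils import vllm_version_is

if not vllm_version_is("0.10.1.1"):
    from vllm.config import LogprobsMode
    DEFAULT_LOGPROBS_MODE = LogprobsMode.RAW_LOGPROBS   # enum default on newer vLLM
else:
    LogprobsMode = None                                 # enum not used on v0.10.1.1
    DEFAULT_LOGPROBS_MODE = "raw_logprobs"              # plain-string default on v0.10.1.1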
Signed-off-by: MengqingCao --- vllm_ascend/sample/sampler.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/sample/sampler.py b/vllm_ascend/sample/sampler.py index 086cae0a832..d3f1ae9ceae 100644 --- a/vllm_ascend/sample/sampler.py +++ b/vllm_ascend/sample/sampler.py @@ -7,13 +7,15 @@ if not vllm_version_is("0.10.1.1"): from vllm.config import LogprobsMode + DEFAULT_LOGPROBS_MODE = LogprobsMode.RAW_LOGPROBS else: LogprobsMode = None + DEFAULT_LOGPROBS_MODE = "raw_logprobs" class AscendSampler(Sampler): - def __init__(self, logprobs_mode="raw_logprobs"): + def __init__(self, logprobs_mode=DEFAULT_LOGPROBS_MODE): # TODO: support logprobs_mode in vllm-ascend super().__init__(logprobs_mode=logprobs_mode) self.topk_topp_sampler = AscendTopKTopPSampler() From 069d9aefa96e813043a27c26164f6a7e841892e2 Mon Sep 17 00:00:00 2001 From: MengqingCao Date: Thu, 21 Aug 2025 11:00:06 +0000 Subject: [PATCH 8/8] remove dbo Signed-off-by: MengqingCao --- .github/workflows/vllm_ascend_test.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index 6e3aff0d154..78cfefae3fa 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -278,7 +278,6 @@ jobs: # To avoid oom, we need to run the test in a single process. pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ - pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_dbo pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeekV3_dbo pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_alltoallv pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC
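Taken together, patches 4, 5 and 7 leave AscendTopKTopPSampler.forward_native with two contracts: on v0.10.1.1 it returns only the sampled token IDs, while on current main it returns a (sampled, logits_to_return) pair whose second element depends on logprobs_mode. A condensed, standalone sketch of that behaviour (the enum and random_sample below are simplified stand-ins for vllm.config.LogprobsMode and vLLM's sampling helper, and the top-k/top-p filtering step is omitted):

import enum

import torch


class LogprobsMode(enum.Enum):
    # Simplified stand-in for vllm.config.LogprobsMode on newer vLLM.
    RAW_LOGPROBS = "raw_logprobs"
    PROCESSED_LOGITS = "processed_logits"
    PROCESSED_LOGPROBS = "processed_logprobs"


def random_sample(probs: torch.Tensor) -> torch.Tensor:
    # Simplified stand-in for vLLM's random_sample (ignores per-request generators).
    return torch.multinomial(probs, num_samples=1).squeeze(-1)


def forward_native_sketch(logits: torch.Tensor, logprobs_mode, on_new_vllm: bool):
    # Mirrors the version-gated return shape; top-k/top-p filtering of `logits`
    # would happen before this point in the real sampler.
    logits_to_return = None
    if on_new_vllm:
        if logprobs_mode == LogprobsMode.PROCESSED_LOGITS:
            logits_to_return = logits
        elif logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS:
            logits_to_return = logits.log_softmax(dim=-1, dtype=torch.float32)

    probs = logits.softmax(dim=-1, dtype=torch.float32)
    sampled = random_sample(probs)
    # v0.10.1.1 callers expect bare samples; newer vLLM expects a pair.
    return (sampled, logits_to_return) if on_new_vllm else sampled


if __name__ == "__main__":
    x = torch.randn(2, 8)
    print(forward_native_sketch(x, "raw_logprobs", on_new_vllm=False))
    print(forward_native_sketch(x, LogprobsMode.RAW_LOGPROBS, on_new_vllm=True))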