
Commit 3eb5b37

fix
Signed-off-by: wangli <[email protected]>
1 parent 12568b1 commit 3eb5b37

9 files changed: +24, -58 lines

.github/workflows/vllm_ascend_test_pr_full.yaml

Lines changed: 1 addition & 1 deletion
@@ -69,7 +69,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [888152bf87d62c9f5929d06f386068990b618db7, v0.11.2]
+        vllm_version: [888152bf87d62c9f5929d06f386068990b618db7]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
     uses: ./.github/workflows/_e2e_test.yaml

.github/workflows/vllm_ascend_test_pr_light.yaml

Lines changed: 1 addition & 1 deletion
@@ -142,7 +142,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [888152bf87d62c9f5929d06f386068990b618db7, v0.11.2]
+        vllm_version: [888152bf87d62c9f5929d06f386068990b618db7]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.

vllm_ascend/spec_decode/eagle_proposer.py

Lines changed: 5 additions & 4 deletions
@@ -136,7 +136,8 @@ def dummy_run(self,
         )
 
     def generate_token_ids(self,
-                           valid_sampled_token_ids: list[np.ndarray],
+                           valid_sampled_token_ids: torch.Tensor
+                           | list[list[int]],
                            sampling_metadata: SamplingMetadata = None,
                            scheduler_output: SchedulerOutput = None,
                            spec_decode_metadata: SpecDecodeMetadata = None,
@@ -149,7 +150,7 @@ def generate_token_ids(self,
         attn_metadata = self._get_eagle_atten_dict(scheduler_output)
         next_token_ids: list[int] = []
         for i, token_ids in enumerate(valid_sampled_token_ids):
-            if token_ids.shape[0] > 0:
+            if token_ids:
                 # Common case.
                 next_token_id = token_ids[-1]
             else:
@@ -161,7 +162,7 @@
                     scheduler_output.num_scheduled_tokens[req_id])
 
                 next_token_id = req_state.get_token_id(seq_len)
-            next_token_ids.append(next_token_id.item())
+            next_token_ids.append(next_token_id)
         next_token_ids = torch.tensor(next_token_ids,
                                       dtype=torch.int32,
                                       device=self.device)
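
The two hunks above carry the core of the type change. When valid_sampled_token_ids held NumPy rows, emptiness had to be tested through .shape[0] (bool() on a multi-element array raises), and the collected scalars needed .item() to become plain Python ints; with list[int] rows, ordinary truthiness works and the elements are already ints. A minimal sketch of the difference, using illustrative values only:

# Minimal sketch (illustrative values, not the repo's code) of why the checks
# change when a per-request row goes from np.ndarray to list[int].
import numpy as np

ndarray_row, list_row = np.array([7, 11, 13]), [7, 11, 13]

assert ndarray_row.shape[0] > 0              # old-style emptiness check
try:
    bool(ndarray_row)                        # ambiguous for multi-element arrays
except ValueError:
    pass
assert isinstance(ndarray_row[-1], np.integer)   # needed .item() to become int

assert list_row                              # plain truthiness works for lists
assert isinstance(list_row[-1], int)         # already a Python int, no .item()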
@@ -181,7 +182,7 @@
         else:
             num_draft_tokens = spec_decode_metadata.num_draft_tokens
             num_rejected_tokens = [
-                n + 1 - valid_sampled_token_ids[i].shape[0] if n > 0 else 0
+                n + 1 - len(valid_sampled_token_ids[i]) if n > 0 else 0
                 for i, n in enumerate(num_draft_tokens)
             ]
             num_rejected_tokens = torch.tensor(
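
In the hunk above (and its counterpart in vllm_ascend/torchair/torchair_mtp_proposer.py below), the rejected-token count follows from the acceptance bound of speculative decoding: a request that proposed n draft tokens can keep at most n + 1 tokens (the accepted drafts plus one bonus token), so the shortfall against that bound is the number of rejected drafts. A small worked example with made-up values:

# Worked example (made-up values) of the num_rejected_tokens comprehension.
num_draft_tokens = [3, 2, 0]          # drafts proposed per request
valid_sampled_token_ids = [
    [101, 102],                       # kept 1 draft + 1 bonus token
    [201, 202, 203],                  # kept both drafts + bonus token
    [301],                            # no drafts were scheduled
]
num_rejected_tokens = [
    n + 1 - len(valid_sampled_token_ids[i]) if n > 0 else 0
    for i, n in enumerate(num_draft_tokens)
]
assert num_rejected_tokens == [2, 0, 0]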

vllm_ascend/spec_decode/interface.py

Lines changed: 1 addition & 2 deletions
@@ -1,7 +1,6 @@
 import enum
 from typing import Optional
 
-import numpy as np
 import torch
 from vllm.config import CUDAGraphMode, VllmConfig
 from vllm.v1.core.sched.output import SchedulerOutput
@@ -41,7 +40,7 @@ def dummy_run(self,
         raise NotImplementedError
 
     def generate_token_ids(self,
-                           valid_sampled_token_ids: list[np.ndarray],
+                           valid_sampled_token_ids: list[list[int]],
                            sampling_metadata: SamplingMetadata = None,
                            scheduler_output: SchedulerOutput = None,
                            spec_decode_metadata: SpecDecodeMetadata = None,

vllm_ascend/spec_decode/mtp_proposer.py

Lines changed: 5 additions & 7 deletions
@@ -302,8 +302,7 @@ def dummy_run(self,
                 break
 
     def generate_token_ids(self,
-                           sampled_token_ids: Union[torch.Tensor,
-                                                    list[np.ndarray]],
+                           sampled_token_ids: torch.Tensor | list[list[int]],
                            sampling_metadata: SamplingMetadata = None,
                            scheduler_output: SchedulerOutput = None,
                            spec_decode_metadata: SpecDecodeMetadata = None,
@@ -380,7 +379,6 @@ def generate_token_ids(self,
             common_attn_metadata.query_start_loc = \
                 query_start_loc_pcp_full[:num_reqs + 1]
         if self.speculative_config.disable_padded_drafter_batch:
-            assert isinstance(sampled_token_ids, list)
             # NOTE: Currently, MTP-fullgraph is incompatibility with pcp
             token_indices_to_sample = None
             common_attn_metadata, token_indices =\
@@ -439,7 +437,7 @@ def _get_attn_metadata(self, attn_metadata):
     def _prepare_inputs(
         self,
         common_attn_metadata: CommonAttentionMetadata,
-        sampled_token_ids: list[np.ndarray],
+        sampled_token_ids: list[list[int]],
         num_draft_tokens: list[int],
     ) -> tuple[CommonAttentionMetadata, torch.Tensor]:
         """
@@ -897,7 +895,7 @@ def _prepare_input_kernel(self, out_ptr: torch.Tensor,
 
     def prepare_next_token_ids_cpu(
         self,
-        sampled_token_ids: list[np.ndarray],
+        sampled_token_ids: list[list[int]],
         requests: dict[str, CachedRequestState],
         gpu_input_batch: InputBatch,
         num_scheduled_tokens: dict[str, int],
@@ -912,7 +910,7 @@ def prepare_next_token_ids_cpu(
         req_ids = gpu_input_batch.req_ids
         next_token_ids: list[int] = []
         for i, token_ids in enumerate(sampled_token_ids):
-            if token_ids.shape[0] > 0:
+            if token_ids:
                 # Common case.
                 next_token_id = token_ids[-1]
             else:
@@ -923,7 +921,7 @@
                 seq_len = req_state.num_computed_tokens + num_scheduled_tokens[
                     req_id]
                 next_token_id = req_state.get_token_id(seq_len)
-            next_token_ids.append(next_token_id.item())
+            next_token_ids.append(next_token_id)
         next_token_ids = torch.tensor(next_token_ids,
                                       dtype=torch.int32,
                                       device=self.input_ids.device)

vllm_ascend/spec_decode/ngram_proposer.py

Lines changed: 2 additions & 3 deletions
@@ -1,4 +1,3 @@
-import numpy as np
 import torch
 from vllm.config import CUDAGraphMode
 from vllm.v1.spec_decode.ngram_proposer import \
@@ -31,7 +30,7 @@ def dummy_run(self,
         pass
 
     def generate_token_ids(self,
-                           valid_sampled_token_ids: list[np.ndarray],
+                           valid_sampled_token_ids,
                            sampling_metadata=None,
                            scheduler_output=None,
                            spec_decode_metadata=None,
@@ -42,7 +41,7 @@ def generate_token_ids(self,
                            aux_hidden_states=None) -> list[list[int]]:
         valid_ngram_requests = []
         for i, sampled_ids in enumerate(valid_sampled_token_ids):
-            num_sampled_ids = sampled_ids.shape[0]
+            num_sampled_ids = len(sampled_ids)
             if not num_sampled_ids:
                 continue
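
The annotation is dropped here rather than pinned to list[list[int]], presumably so the ngram proposer stays agnostic to the row type; in any case len() works for plain lists and tensor rows alike, whereas .shape only exists on array-like rows. A minimal sketch with illustrative values:

# Minimal sketch (illustrative values): len() handles every row type this
# method may receive, including empty rows.
import torch

rows = [
    [5, 6, 7],              # list[int] row
    torch.tensor([8, 9]),   # 1-D tensor row
    [],                     # empty row: no tokens sampled for this request
]
assert [len(r) for r in rows] == [3, 2, 0]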

vllm_ascend/torchair/models/qwen2.py

Lines changed: 3 additions & 17 deletions
@@ -77,8 +77,6 @@ def __init__(
         num_heads: int,
         num_kv_heads: int,
         rope_parameters: Optional[dict[str, Any]] = None,
-        rope_theta: float = 10000,
-        rope_scaling: tuple | None = None,
         max_position: int = 4096 * 32,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
@@ -96,10 +94,7 @@
             prefix=prefix,
             attn_type=attn_type,
             dual_chunk_attention_config=dual_chunk_attention_config,
-            # Pass both rope_parameters and rope_theta/rope_scaling for compatibility
-            **(dict(
-                rope_parameters=rope_parameters) if vllm_version_is("0.11.2")
-               else dict(rope_theta=rope_theta, rope_scaling=rope_scaling)))
+            rope_parameters=rope_parameters)
 
         ascend_config = get_ascend_config()
         self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
@@ -154,14 +149,7 @@
         super().__init__()
         self.hidden_size = config.hidden_size
 
-        # NOTE: remove this once we drop vllm v0.11.2
-        rope_theta = getattr(config, "rope_theta", 1000000)
-        rope_scaling = getattr(config, "rope_scaling", None)
-        rope_parameters = None
-        if not vllm_version_is("0.11.2"):
-            # Requires transformers > 4.32.0
-            set_default_rope_theta(config, default_theta=1000000)
-            rope_parameters = config.rope_parameters
+        set_default_rope_theta(config, default_theta=1000000)
 
         dual_chunk_attention_config = getattr(config,
                                               "dual_chunk_attention_config",
@@ -181,9 +169,7 @@
             num_heads=config.num_attention_heads,
             max_position=config.max_position_embeddings,
             num_kv_heads=config.num_key_value_heads,
-            rope_parameters=rope_parameters,
-            rope_theta=rope_theta,
-            rope_scaling=rope_scaling,
+            rope_parameters=config.rope_parameters,
             cache_config=cache_config,
             quant_config=quant_config,
             prefix=f"{prefix}.self_attn",
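
The qwen2 hunks above drop the separate rope_theta / rope_scaling plumbing and standardize on a single rope_parameters dict carried by the config, with set_default_rope_theta(config, default_theta=1000000) presumably supplying a default theta when the checkpoint omits one. The helper's internals are not part of this diff, so the sketch below only illustrates the assumed config-side pattern with a hypothetical stand-in:

# Hypothetical stand-in (not vllm-ascend's set_default_rope_theta) sketching
# the assumed config-side pattern: one rope_parameters dict, with a default
# theta filled in only when the checkpoint does not provide one.
from types import SimpleNamespace

def fill_default_rope_theta(config, default_theta: float) -> None:
    params = getattr(config, "rope_parameters", None) or {}
    params.setdefault("rope_theta", default_theta)  # keep an existing value
    config.rope_parameters = params

config = SimpleNamespace(rope_parameters={"rope_type": "default"})
fill_default_rope_theta(config, default_theta=1000000)
assert config.rope_parameters["rope_theta"] == 1000000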

vllm_ascend/torchair/torchair_mtp_proposer.py

Lines changed: 3 additions & 3 deletions
@@ -1,6 +1,5 @@
 import types
 
-import numpy as np
 import torch
 import torch.nn as nn
 import torchair
@@ -147,7 +146,8 @@ def dummy_run(self,
                 break
 
     def generate_token_ids(self,
-                           valid_sampled_token_ids: list[np.ndarray],
+                           valid_sampled_token_ids: torch.Tensor
+                           | list[list[int]],
                            sampling_metadata: SamplingMetadata = None,
                            scheduler_output: SchedulerOutput = None,
                            spec_decode_metadata: SpecDecodeMetadata = None,
@@ -187,7 +187,7 @@ def generate_token_ids(self,
         # TODO(woosuk): Refactor this.
         num_draft_tokens = spec_decode_metadata.num_draft_tokens
         num_rejected_tokens = [
-            n + 1 - valid_sampled_token_ids[i].shape[0] if n > 0 else 0
+            n + 1 - len(valid_sampled_token_ids[i]) if n > 0 else 0
             for i, n in enumerate(num_draft_tokens)
         ]
         num_rejected_tokens = torch.tensor(

vllm_ascend/worker/model_runner_v1.py

Lines changed: 3 additions & 20 deletions
@@ -243,11 +243,9 @@ def get_output(self) -> ModelRunnerOutput:
         # Release the device tensor once the copy has completed
         del self._sampled_token_ids
 
-        valid_sampled_token_ids: list[np.ndarray] = [
-            row for row in self._sampled_token_ids_cpu.numpy()
-        ]
+        valid_sampled_token_ids = self._sampled_token_ids_cpu.tolist()
         for i in self._invalid_req_indices:
-            valid_sampled_token_ids[i] = np.array([])
+            valid_sampled_token_ids[i].clear()
 
         output = self._model_runner_output
         output.sampled_token_ids = valid_sampled_token_ids
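
Tensor.tolist() converts the 2-D CPU tensor of sampled token ids into a nested Python list in one call, and list.clear() then empties the rows for invalid requests in place, so downstream consumers receive list[list[int]] without a NumPy round trip. A minimal sketch with made-up values:

# Minimal sketch (made-up values) of the conversion performed in get_output():
# a 2-D CPU tensor becomes list[list[int]] and invalid rows are emptied in place.
import torch

sampled_token_ids_cpu = torch.tensor([[11, 12], [21, 22], [31, 32]])
invalid_req_indices = [1]

valid_sampled_token_ids = sampled_token_ids_cpu.tolist()
for i in invalid_req_indices:
    valid_sampled_token_ids[i].clear()

assert valid_sampled_token_ids == [[11, 12], [], [31, 32]]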
@@ -2132,7 +2130,7 @@ def apply_grammar_bitmask(
 
     def propose_draft_token_ids(
         self,
-        valid_sampled_token_ids: Union[torch.Tensor, list[np.ndarray]],
+        valid_sampled_token_ids: torch.Tensor | list[list[int]],
        sampling_metadata: SamplingMetadata,
        scheduler_output: "SchedulerOutput",
        spec_decode_metadata: SpecDecodeMetadata,
@@ -4465,18 +4463,3 @@ def _generate_pcp_mtp_input(
             self.input_ids_pcp_full_cpu[:total_num_scheduled_tokens_pcp_full],
             non_blocking=True,
         )
-
-    def _to_list(self, sampled_token_ids: torch.Tensor) -> list[np.ndarray]:
-        # This is a short term mitigation for issue mentioned in
-        # https://github.com/vllm-project/vllm/issues/22754.
-        # `tolist` would trigger a cuda wise stream sync, which
-        # would block other copy ops from other cuda streams.
-        # A cuda event sync would avoid such a situation. Since
-        # this is in the critical path of every single model
-        # forward loop, this has caused perf issue for a disagg
-        # setup.
-        pinned = self.sampled_token_ids_pinned_cpu[:sampled_token_ids.shape[0]]
-        pinned.copy_(sampled_token_ids, non_blocking=True)
-        self.transfer_event.record()
-        self.transfer_event.synchronize()
-        return [row for row in pinned.numpy()]
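
The deleted _to_list helper avoided the stream-wide synchronization that tolist() triggers on a device tensor by copying into pinned CPU memory and waiting on a dedicated transfer event; once get_output() converts an already-copied CPU tensor with .tolist() (first hunk of this file), the helper is presumably left without callers and is removed. For context, a self-contained sketch of that event-synchronized copy pattern (illustrative names, assumes a CUDA-capable build; the Ascend code would use the NPU equivalents):

# Sketch of the pinned-copy pattern the removed helper implemented: an async
# device-to-host copy into a preallocated pinned buffer, synchronized with a
# per-copy event instead of a full stream sync.
import torch

def copy_rows_to_host(sampled_token_ids: torch.Tensor,
                      pinned_buf: torch.Tensor) -> list[list[int]]:
    # pinned_buf is assumed to be preallocated with pin_memory=True.
    pinned = pinned_buf[:sampled_token_ids.shape[0]]
    pinned.copy_(sampled_token_ids, non_blocking=True)  # async D2H copy
    event = torch.cuda.Event()
    event.record()       # mark the point after the copy on the current stream
    event.synchronize()  # wait for this copy only, not the whole stream
    return pinned.tolist()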
