
Commit 3eb5b37

fix
Signed-off-by: wangli <[email protected]>
1 parent 12568b1 commit 3eb5b37

9 files changed: +24, -58 lines

.github/workflows/vllm_ascend_test_pr_full.yaml

Lines changed: 1 addition & 1 deletion
@@ -69,7 +69,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [888152bf87d62c9f5929d06f386068990b618db7, v0.11.2]
+        vllm_version: [888152bf87d62c9f5929d06f386068990b618db7]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
     uses: ./.github/workflows/_e2e_test.yaml

.github/workflows/vllm_ascend_test_pr_light.yaml

Lines changed: 1 addition & 1 deletion
@@ -142,7 +142,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [888152bf87d62c9f5929d06f386068990b618db7, v0.11.2]
+        vllm_version: [888152bf87d62c9f5929d06f386068990b618db7]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.

vllm_ascend/spec_decode/eagle_proposer.py

Lines changed: 5 additions & 4 deletions
@@ -136,7 +136,8 @@ def dummy_run(self,
         )
 
     def generate_token_ids(self,
-                           valid_sampled_token_ids: list[np.ndarray],
+                           valid_sampled_token_ids: torch.Tensor
+                           | list[list[int]],
                            sampling_metadata: SamplingMetadata = None,
                            scheduler_output: SchedulerOutput = None,
                            spec_decode_metadata: SpecDecodeMetadata = None,
@@ -149,7 +150,7 @@ def generate_token_ids(self,
         attn_metadata = self._get_eagle_atten_dict(scheduler_output)
         next_token_ids: list[int] = []
         for i, token_ids in enumerate(valid_sampled_token_ids):
-            if token_ids.shape[0] > 0:
+            if token_ids:
                 # Common case.
                 next_token_id = token_ids[-1]
             else:
@@ -161,7 +162,7 @@
                     scheduler_output.num_scheduled_tokens[req_id])
 
                 next_token_id = req_state.get_token_id(seq_len)
-            next_token_ids.append(next_token_id.item())
+            next_token_ids.append(next_token_id)
         next_token_ids = torch.tensor(next_token_ids,
                                       dtype=torch.int32,
                                       device=self.device)
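
The two hunks above carry the core of the type change. When valid_sampled_token_ids held NumPy rows, emptiness had to be tested through .shape[0] (bool() on a multi-element array raises), and the collected scalars needed .item() to become plain Python ints; with list[int] rows, ordinary truthiness works and the elements are already ints. A minimal sketch of the difference, using illustrative values only:

# Minimal sketch (illustrative values, not the repo's code) of why the checks
# change when a per-request row goes from np.ndarray to list[int].
import numpy as np

ndarray_row, list_row = np.array([7, 11, 13]), [7, 11, 13]

assert ndarray_row.shape[0] > 0              # old-style emptiness check
try:
    bool(ndarray_row)                        # ambiguous for multi-element arrays
except ValueError:
    pass
assert isinstance(ndarray_row[-1], np.integer)   # needed .item() to become int

assert list_row                              # plain truthiness works for lists
assert isinstance(list_row[-1], int)         # already a Python int, no .item()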
@@ -181,7 +182,7 @@
         else:
             num_draft_tokens = spec_decode_metadata.num_draft_tokens
             num_rejected_tokens = [
-                n + 1 - valid_sampled_token_ids[i].shape[0] if n > 0 else 0
+                n + 1 - len(valid_sampled_token_ids[i]) if n > 0 else 0
                 for i, n in enumerate(num_draft_tokens)
             ]
             num_rejected_tokens = torch.tensor(
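
In the hunk above (and its counterpart in vllm_ascend/torchair/torchair_mtp_proposer.py below), the rejected-token count follows from the acceptance bound of speculative decoding: a request that proposed n draft tokens can keep at most n + 1 tokens (the accepted drafts plus one bonus token), so the shortfall against that bound is the number of rejected drafts. A small worked example with made-up values:

# Worked example (made-up values) of the num_rejected_tokens comprehension.
num_draft_tokens = [3, 2, 0]          # drafts proposed per request
valid_sampled_token_ids = [
    [101, 102],                       # kept 1 draft + 1 bonus token
    [201, 202, 203],                  # kept both drafts + bonus token
    [301],                            # no drafts were scheduled
]
num_rejected_tokens = [
    n + 1 - len(valid_sampled_token_ids[i]) if n > 0 else 0
    for i, n in enumerate(num_draft_tokens)
]
assert num_rejected_tokens == [2, 0, 0]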

vllm_ascend/spec_decode/interface.py

Lines changed: 1 addition & 2 deletions
@@ -1,7 +1,6 @@
 import enum
 from typing import Optional
 
-import numpy as np
 import torch
 from vllm.config import CUDAGraphMode, VllmConfig
 from vllm.v1.core.sched.output import SchedulerOutput
@@ -41,7 +40,7 @@ def dummy_run(self,
         raise NotImplementedError
 
     def generate_token_ids(self,
-                           valid_sampled_token_ids: list[np.ndarray],
+                           valid_sampled_token_ids: list[list[int]],
                            sampling_metadata: SamplingMetadata = None,
                            scheduler_output: SchedulerOutput = None,
                            spec_decode_metadata: SpecDecodeMetadata = None,

vllm_ascend/spec_decode/mtp_proposer.py

Lines changed: 5 additions & 7 deletions
@@ -302,8 +302,7 @@ def dummy_run(self,
                 break
 
     def generate_token_ids(self,
-                           sampled_token_ids: Union[torch.Tensor,
-                                                    list[np.ndarray]],
+                           sampled_token_ids: torch.Tensor | list[list[int]],
                            sampling_metadata: SamplingMetadata = None,
                            scheduler_output: SchedulerOutput = None,
                            spec_decode_metadata: SpecDecodeMetadata = None,
@@ -380,7 +379,6 @@ def generate_token_ids(self,
             common_attn_metadata.query_start_loc = \
                 query_start_loc_pcp_full[:num_reqs + 1]
         if self.speculative_config.disable_padded_drafter_batch:
-            assert isinstance(sampled_token_ids, list)
             # NOTE: Currently, MTP-fullgraph is incompatibility with pcp
             token_indices_to_sample = None
             common_attn_metadata, token_indices =\
@@ -439,7 +437,7 @@ def _get_attn_metadata(self, attn_metadata):
     def _prepare_inputs(
         self,
         common_attn_metadata: CommonAttentionMetadata,
-        sampled_token_ids: list[np.ndarray],
+        sampled_token_ids: list[list[int]],
         num_draft_tokens: list[int],
     ) -> tuple[CommonAttentionMetadata, torch.Tensor]:
         """
@@ -897,7 +895,7 @@ def _prepare_input_kernel(self, out_ptr: torch.Tensor,
 
     def prepare_next_token_ids_cpu(
         self,
-        sampled_token_ids: list[np.ndarray],
+        sampled_token_ids: list[list[int]],
         requests: dict[str, CachedRequestState],
         gpu_input_batch: InputBatch,
         num_scheduled_tokens: dict[str, int],
@@ -912,7 +910,7 @@ def prepare_next_token_ids_cpu(
         req_ids = gpu_input_batch.req_ids
         next_token_ids: list[int] = []
         for i, token_ids in enumerate(sampled_token_ids):
-            if token_ids.shape[0] > 0:
+            if token_ids:
                 # Common case.
                 next_token_id = token_ids[-1]
             else:
@@ -923,7 +921,7 @@
                 seq_len = req_state.num_computed_tokens + num_scheduled_tokens[
                     req_id]
                 next_token_id = req_state.get_token_id(seq_len)
-            next_token_ids.append(next_token_id.item())
+            next_token_ids.append(next_token_id)
         next_token_ids = torch.tensor(next_token_ids,
                                       dtype=torch.int32,
                                       device=self.input_ids.device)

vllm_ascend/spec_decode/ngram_proposer.py

Lines changed: 2 additions & 3 deletions
@@ -1,4 +1,3 @@
-import numpy as np
 import torch
 from vllm.config import CUDAGraphMode
 from vllm.v1.spec_decode.ngram_proposer import \
@@ -31,7 +30,7 @@ def dummy_run(self,
         pass
 
     def generate_token_ids(self,
-                           valid_sampled_token_ids: list[np.ndarray],
+                           valid_sampled_token_ids,
                            sampling_metadata=None,
                            scheduler_output=None,
                            spec_decode_metadata=None,
@@ -42,7 +41,7 @@ def generate_token_ids(self,
                            aux_hidden_states=None) -> list[list[int]]:
         valid_ngram_requests = []
         for i, sampled_ids in enumerate(valid_sampled_token_ids):
-            num_sampled_ids = sampled_ids.shape[0]
+            num_sampled_ids = len(sampled_ids)
             if not num_sampled_ids:
                 continue
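
The annotation is dropped here rather than pinned to list[list[int]], presumably so the ngram proposer stays agnostic to the row type; in any case len() works for plain lists and tensor rows alike, whereas .shape only exists on array-like rows. A minimal sketch with illustrative values:

# Minimal sketch (illustrative values): len() handles every row type this
# method may receive, including empty rows.
import torch

rows = [
    [5, 6, 7],              # list[int] row
    torch.tensor([8, 9]),   # 1-D tensor row
    [],                     # empty row: no tokens sampled for this request
]
assert [len(r) for r in rows] == [3, 2, 0]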

vllm_ascend/torchair/models/qwen2.py

Lines changed: 3 additions & 17 deletions
@@ -77,8 +77,6 @@ def __init__(
         num_heads: int,
         num_kv_heads: int,
         rope_parameters: Optional[dict[str, Any]] = None,
-        rope_theta: float = 10000,
-        rope_scaling: tuple | None = None,
         max_position: int = 4096 * 32,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
@@ -96,10 +94,7 @@
             prefix=prefix,
             attn_type=attn_type,
             dual_chunk_attention_config=dual_chunk_attention_config,
-            # Pass both rope_parameters and rope_theta/rope_scaling for compatibility
-            **(dict(
-                rope_parameters=rope_parameters) if vllm_version_is("0.11.2")
-               else dict(rope_theta=rope_theta, rope_scaling=rope_scaling)))
+            rope_parameters=rope_parameters)
 
         ascend_config = get_ascend_config()
         self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
@@ -154,14 +149,7 @@
         super().__init__()
         self.hidden_size = config.hidden_size
 
-        # NOTE: remove this once we drop vllm v0.11.2
-        rope_theta = getattr(config, "rope_theta", 1000000)
-        rope_scaling = getattr(config, "rope_scaling", None)
-        rope_parameters = None
-        if not vllm_version_is("0.11.2"):
-            # Requires transformers > 4.32.0
-            set_default_rope_theta(config, default_theta=1000000)
-            rope_parameters = config.rope_parameters
+        set_default_rope_theta(config, default_theta=1000000)
 
         dual_chunk_attention_config = getattr(config,
                                               "dual_chunk_attention_config",
@@ -181,9 +169,7 @@
             num_heads=config.num_attention_heads,
             max_position=config.max_position_embeddings,
             num_kv_heads=config.num_key_value_heads,
-            rope_parameters=rope_parameters,
-            rope_theta=rope_theta,
-            rope_scaling=rope_scaling,
+            rope_parameters=config.rope_parameters,
             cache_config=cache_config,
             quant_config=quant_config,
             prefix=f"{prefix}.self_attn",
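
The qwen2 hunks above drop the separate rope_theta / rope_scaling plumbing and standardize on a single rope_parameters dict carried by the config, with set_default_rope_theta(config, default_theta=1000000) presumably supplying a default theta when the checkpoint omits one. The helper's internals are not part of this diff, so the sketch below only illustrates the assumed config-side pattern with a hypothetical stand-in:

# Hypothetical stand-in (not vllm-ascend's set_default_rope_theta) sketching
# the assumed config-side pattern: one rope_parameters dict, with a default
# theta filled in only when the checkpoint does not provide one.
from types import SimpleNamespace

def fill_default_rope_theta(config, default_theta: float) -> None:
    params = getattr(config, "rope_parameters", None) or {}
    params.setdefault("rope_theta", default_theta)  # keep an existing value
    config.rope_parameters = params

config = SimpleNamespace(rope_parameters={"rope_type": "default"})
fill_default_rope_theta(config, default_theta=1000000)
assert config.rope_parameters["rope_theta"] == 1000000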

vllm_ascend/torchair/torchair_mtp_proposer.py

Lines changed: 3 additions & 3 deletions
@@ -1,6 +1,5 @@
 import types
 
-import numpy as np
 import torch
 import torch.nn as nn
 import torchair
@@ -147,7 +146,8 @@ def dummy_run(self,
                 break
 
     def generate_token_ids(self,
-                           valid_sampled_token_ids: list[np.ndarray],
+                           valid_sampled_token_ids: torch.Tensor
+                           | list[list[int]],
                            sampling_metadata: SamplingMetadata = None,
                            scheduler_output: SchedulerOutput = None,
                            spec_decode_metadata: SpecDecodeMetadata = None,
@@ -187,7 +187,7 @@ def generate_token_ids(self,
         # TODO(woosuk): Refactor this.
         num_draft_tokens = spec_decode_metadata.num_draft_tokens
         num_rejected_tokens = [
-            n + 1 - valid_sampled_token_ids[i].shape[0] if n > 0 else 0
+            n + 1 - len(valid_sampled_token_ids[i]) if n > 0 else 0
             for i, n in enumerate(num_draft_tokens)
         ]
         num_rejected_tokens = torch.tensor(

vllm_ascend/worker/model_runner_v1.py

Lines changed: 3 additions & 20 deletions
@@ -243,11 +243,9 @@ def get_output(self) -> ModelRunnerOutput:
         # Release the device tensor once the copy has completed
         del self._sampled_token_ids
 
-        valid_sampled_token_ids: list[np.ndarray] = [
-            row for row in self._sampled_token_ids_cpu.numpy()
-        ]
+        valid_sampled_token_ids = self._sampled_token_ids_cpu.tolist()
         for i in self._invalid_req_indices:
-            valid_sampled_token_ids[i] = np.array([])
+            valid_sampled_token_ids[i].clear()
 
         output = self._model_runner_output
         output.sampled_token_ids = valid_sampled_token_ids
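
Tensor.tolist() converts the 2-D CPU tensor of sampled token ids into a nested Python list in one call, and list.clear() then empties the rows for invalid requests in place, so downstream consumers receive list[list[int]] without a NumPy round trip. A minimal sketch with made-up values:

# Minimal sketch (made-up values) of the conversion performed in get_output():
# a 2-D CPU tensor becomes list[list[int]] and invalid rows are emptied in place.
import torch

sampled_token_ids_cpu = torch.tensor([[11, 12], [21, 22], [31, 32]])
invalid_req_indices = [1]

valid_sampled_token_ids = sampled_token_ids_cpu.tolist()
for i in invalid_req_indices:
    valid_sampled_token_ids[i].clear()

assert valid_sampled_token_ids == [[11, 12], [], [31, 32]]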
@@ -2132,7 +2130,7 @@ def apply_grammar_bitmask(
 
     def propose_draft_token_ids(
         self,
-        valid_sampled_token_ids: Union[torch.Tensor, list[np.ndarray]],
+        valid_sampled_token_ids: torch.Tensor | list[list[int]],
        sampling_metadata: SamplingMetadata,
        scheduler_output: "SchedulerOutput",
        spec_decode_metadata: SpecDecodeMetadata,
@@ -4465,18 +4463,3 @@ def _generate_pcp_mtp_input(
             self.input_ids_pcp_full_cpu[:total_num_scheduled_tokens_pcp_full],
             non_blocking=True,
         )
-
-    def _to_list(self, sampled_token_ids: torch.Tensor) -> list[np.ndarray]:
-        # This is a short term mitigation for issue mentioned in
-        # https://github.com/vllm-project/vllm/issues/22754.
-        # `tolist` would trigger a cuda wise stream sync, which
-        # would block other copy ops from other cuda streams.
-        # A cuda event sync would avoid such a situation. Since
-        # this is in the critical path of every single model
-        # forward loop, this has caused perf issue for a disagg
-        # setup.
-        pinned = self.sampled_token_ids_pinned_cpu[:sampled_token_ids.shape[0]]
-        pinned.copy_(sampled_token_ids, non_blocking=True)
-        self.transfer_event.record()
-        self.transfer_event.synchronize()
-        return [row for row in pinned.numpy()]
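
The deleted _to_list helper avoided the stream-wide synchronization that tolist() triggers on a device tensor by copying into pinned CPU memory and waiting on a dedicated transfer event; once get_output() converts an already-copied CPU tensor with .tolist() (first hunk of this file), the helper is presumably left without callers and is removed. For context, a self-contained sketch of that event-synchronized copy pattern (illustrative names, assumes a CUDA-capable build; the Ascend code would use the NPU equivalents):

# Sketch of the pinned-copy pattern the removed helper implemented: an async
# device-to-host copy into a preallocated pinned buffer, synchronized with a
# per-copy event instead of a full stream sync.
import torch

def copy_rows_to_host(sampled_token_ids: torch.Tensor,
                      pinned_buf: torch.Tensor) -> list[list[int]]:
    # pinned_buf is assumed to be preallocated with pin_memory=True.
    pinned = pinned_buf[:sampled_token_ids.shape[0]]
    pinned.copy_(sampled_token_ids, non_blocking=True)  # async D2H copy
    event = torch.cuda.Event()
    event.record()       # mark the point after the copy on the current stream
    event.synchronize()  # wait for this copy only, not the whole stream
    return pinned.tolist()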
