From d4c8b879cafbee32869e8c4b99439243f48dde19 Mon Sep 17 00:00:00 2001 From: shen-shanshan <467638484@qq.com> Date: Fri, 21 Nov 2025 10:08:24 +0000 Subject: [PATCH 01/16] patch AscendQwen2_5_VisionAttention Signed-off-by: shen-shanshan <467638484@qq.com> --- vllm_ascend/models/qwen2_5_vl.py | 234 ++----------------- vllm_ascend/patch/worker/__init__.py | 7 + vllm_ascend/patch/worker/patch_qwen2_5_vl.py | 118 ++++++++++ 3 files changed, 138 insertions(+), 221 deletions(-) create mode 100644 vllm_ascend/patch/worker/patch_qwen2_5_vl.py diff --git a/vllm_ascend/models/qwen2_5_vl.py b/vllm_ascend/models/qwen2_5_vl.py index 0ff31712668..cb12b7e4bc4 100644 --- a/vllm_ascend/models/qwen2_5_vl.py +++ b/vllm_ascend/models/qwen2_5_vl.py @@ -18,12 +18,11 @@ # limitations under the License. from functools import partial -from typing import Callable, Iterable, Optional, Set, Tuple, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch import torch.nn as nn import torch.nn.functional as F -import torch_npu from einops import rearrange from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import ( Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig) @@ -35,7 +34,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.qwen2_5_vl import ( - Qwen2_5_VisionAttention, Qwen2_5_VisionBlock, Qwen2_5_VisionPatchEmbed, + Qwen2_5_VisionBlock, Qwen2_5_VisionPatchEmbed, Qwen2_5_VisionRotaryEmbedding, Qwen2_5_VisionTransformer, Qwen2_5_VLDummyInputsBuilder, Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLMultiModalProcessor, Qwen2_5_VLProcessingInfo) @@ -49,119 +48,19 @@ MAX_PAD_SIZE = 128 # max_size to pad weight -class AscendQwen2_5_VisionAttention(Qwen2_5_VisionAttention): - - def __init__( - self, - embed_dim: int, - num_heads: int, - projection_size: int, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: - super().__init__( - embed_dim, - num_heads, - projection_size, - quant_config, - prefix, - ) - self.embed_dim = embed_dim - self.hidden_size_per_attention_head = dist_utils.divide( - projection_size, num_heads) - self.origin_hidden_size_per_attention_head = self.hidden_size_per_attention_head - if self.hidden_size_per_attention_head > MIN_PAD_SIZE and self.hidden_size_per_attention_head < MAX_PAD_SIZE: - self.hidden_size_per_attention_head = MAX_PAD_SIZE - - def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: - # [s, b, 3 * head * head_dim] - seq_len, bs, _ = qkv.shape - - # [s, b, 3 * head * head_dim] -> 3 * [s, b, head * head_dim] - q, k, v = qkv.chunk(3, dim=2) - - # 3 * [s, b, head * head_dim] -> 3 * [s, b, head, head_dim] - new_shape = (seq_len, bs, self.num_attention_heads_per_partition, - self.hidden_size_per_attention_head) - q, k, v = (x.view(*new_shape) for x in (q, k, v)) - return q, k, v +class AscendQwen2_5_VisionBlock(Qwen2_5_VisionBlock): def forward( self, x: torch.Tensor, cu_seqlens: torch.Tensor, - cos: torch.Tensor, - sin: torch.Tensor, + rotary_pos_emb_cos: torch.Tensor, + rotary_pos_emb_sin: torch.Tensor, ) -> torch.Tensor: - # [s, b, c] --> [s, b, head * 3 * head_dim] - x, _ = self.qkv(x) - - # [s, b, 3 * head * head_dim] -> 3 * [s, b, head, head_dim] - q, k, v = self.split_qkv(x) - batch_size = q.shape[1] - - q, k, v = (rearrange(x, "s b ... 
-> b s ...").contiguous() - for x in (q, k, v)) - q = torch_npu.npu_rotary_mul(q, cos, sin) - k = torch_npu.npu_rotary_mul(k, cos, sin) - - q, k, v = [ - rearrange(x, "b s h d -> (b s) h d").contiguous() - for x in (q, k, v) - ] - - context_layer = torch.empty_like(q) - - # operator requires pta version >= 2.5.1 - torch_npu._npu_flash_attention_unpad( - query=q, - key=k, - value=v, - seq_len=cu_seqlens, - scale_value=self.origin_hidden_size_per_attention_head**-0.5, - num_heads=self.num_attention_heads_per_partition, - num_kv_heads=self.num_attention_heads_per_partition, - out=context_layer) - - context_layer = rearrange(context_layer, - "(b s) h d -> s b (h d)", - b=batch_size).contiguous() - - output, _ = self.proj(context_layer) - return output - - -class AscendQwen2_5_VisionBlock(Qwen2_5_VisionBlock): - - def __init__( - self, - dim: int, - num_heads: int, - mlp_hidden_dim: int, - act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu, - norm_layer: Optional[Callable[[int], nn.Module]] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: - super().__init__(dim=dim, - num_heads=num_heads, - mlp_hidden_dim=mlp_hidden_dim, - act_fn=act_fn, - norm_layer=norm_layer, - quant_config=quant_config, - prefix=prefix) - - self.attn = AscendQwen2_5_VisionAttention(embed_dim=dim, - num_heads=num_heads, - projection_size=dim, - quant_config=quant_config, - prefix=f"{prefix}.attn") - - def forward(self, x: torch.Tensor, cu_seqlens: torch.Tensor, - cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor: - x = x + self.attn( - self.norm1(x), cu_seqlens=cu_seqlens, cos=cos, sin=sin) - + x = x + self.attn(self.norm1(x), + cu_seqlens=cu_seqlens, + rotary_pos_emb_cos=rotary_pos_emb_cos, + rotary_pos_emb_sin=rotary_pos_emb_sin) x = x + self.mlp(self.norm2(x)) return x @@ -249,98 +148,6 @@ def cal_cos_sin(self, rotary_pos_emb): self.hidden_size_per_attention_head) return cos_new, sin_new - def pad_qkv_bias(self, bias): - first_half = bias.reshape( - -1, 3, self.origin_hidden_size_per_attention_head - )[:, :, :self.half_origin_hidden_size_per_attention_head] - second_half = bias.reshape( - -1, 3, self.origin_hidden_size_per_attention_head - )[:, :, self.half_origin_hidden_size_per_attention_head:] - first_half_padded = torch.nn.functional.pad( - first_half, (0, self.half_pad_hidden_size_per_attention_head)) - second_half_padded = torch.nn.functional.pad( - second_half, (0, self.half_pad_hidden_size_per_attention_head)) - bias_padded = torch.cat([first_half_padded, second_half_padded], dim=2) - bias_final = bias_padded.reshape(-1) - return bias_final - - def pad_qkv_weight(self, data): - qkv_weight_first_half = data.reshape( - -1, 3, self.origin_hidden_size_per_attention_head, self.hidden_size - )[:, :, :self.half_origin_hidden_size_per_attention_head, :] - qkv_weight_second_half = data.reshape( - -1, 3, self.origin_hidden_size_per_attention_head, self.hidden_size - )[:, :, self.half_origin_hidden_size_per_attention_head:, :] - - qkv_weight_first_half_padded = torch.nn.functional.pad( - qkv_weight_first_half, - (0, 0, 0, self.half_pad_hidden_size_per_attention_head)) - qkv_weight_second_half_padded = torch.nn.functional.pad( - qkv_weight_second_half, - (0, 0, 0, self.half_pad_hidden_size_per_attention_head)) - qkv_weight_padded = torch.cat( - [qkv_weight_first_half_padded, qkv_weight_second_half_padded], - dim=2) - qkv_weight_final = qkv_weight_padded.reshape(-1, self.hidden_size) - - if is_enable_nz(): - qkv_weight_final_copy = torch.empty_like(qkv_weight_final).copy_( 
- qkv_weight_final) - qkv_weight_final_copy = torch_npu.npu_format_cast( - qkv_weight_final_copy, ACL_FORMAT_FRACTAL_ND) - return qkv_weight_final_copy - - return qkv_weight_final - - def pad_proj_weight(self, data): - out_weight = torch.nn.functional.pad( - data.reshape(self.hidden_size, -1, - self.half_origin_hidden_size_per_attention_head), - (0, self.half_pad_hidden_size_per_attention_head, 0, 0)).reshape( - self.hidden_size, -1) - - if is_enable_nz(): - out_weight_copy = torch.empty_like(out_weight).copy_(out_weight) - out_weight_copy = torch_npu.npu_format_cast( - out_weight_copy, ACL_FORMAT_FRACTAL_ND) - return out_weight_copy - - return out_weight - - def pad_qkv_weight_scale_offset(self, data): - reshaped_data = data.reshape( - -1, 3, self.origin_hidden_size_per_attention_head, 1) - data1 = reshaped_data[:, :, :self. - half_origin_hidden_size_per_attention_head, :] - data2 = reshaped_data[:, :, self. - half_origin_hidden_size_per_attention_head:, :] - data1_paded = torch.nn.functional.pad( - data1, (0, 0, 0, self.half_pad_hidden_size_per_attention_head, 0, - 0, 0, 0)) - data2_paded = torch.nn.functional.pad( - data2, (0, 0, 0, self.half_pad_hidden_size_per_attention_head, 0, - 0, 0, 0)) - res = torch.cat([data1_paded, data2_paded], dim=2) - res = res.reshape(-1, 1) - return res - - def pad_qkv_deq_scale_quant_bias(self, data): - reshaped_data = data.reshape( - -1, 3, self.origin_hidden_size_per_attention_head) - data1 = reshaped_data[:, :, :self. - half_origin_hidden_size_per_attention_head] - data2 = reshaped_data[:, :, - self.half_origin_hidden_size_per_attention_head:] - - data1_paded = torch.nn.functional.pad( - data1, (0, self.half_pad_hidden_size_per_attention_head)) - data2_paded = torch.nn.functional.pad( - data2, (0, self.half_pad_hidden_size_per_attention_head)) - - res = torch.cat([data1_paded, data2_paded], dim=2) - res = res.reshape(-1) - return res - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: stacked_params_mapping: list[tuple[str, str, Union[str, int]]] = [ @@ -368,24 +175,6 @@ def load_weights(self, weights: Iterable[Tuple[str, weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) - if ("attn.proj.weight_scale" in name or - "attn.proj.weight_offset" in name) and self.enable_pad: - continue - elif ("attn.proj.deq_scale" in name - or "attn.proj.quant_bias" in name) and self.enable_pad: - continue - elif ("attn.qkv.weight_scale" in name - or "attn.qkv.weight_offset" in name) and self.enable_pad: - param.data = self.pad_qkv_weight_scale_offset(param.data) - elif ("attn.qkv.deq_scale" in name - or "attn.qkv.quant_bias" in name) and self.enable_pad: - param.data = self.pad_qkv_deq_scale_quant_bias(param.data) - elif ("attn.proj.weight" in name) and self.enable_pad: - param.data = self.pad_proj_weight(param.data) - elif ("attn.qkv.weight" in name) and self.enable_pad: - param.data = self.pad_qkv_weight(param.data) - elif ("attn.qkv.bias" in name) and self.enable_pad: - param.data = self.pad_qkv_bias(param.data) loaded_params.add(name) return loaded_params @@ -492,7 +281,10 @@ def forward( cu_seqlens_now = cu_seqlens else: cu_seqlens_now = cu_window_seqlens - x = blk(x, cu_seqlens=cu_seqlens_now, cos=cos, sin=sin) + x = blk(x, + cu_seqlens=cu_seqlens_now, + rotary_pos_emb_cos=cos, + rotary_pos_emb_sin=sin) # adapter x = self.merger(x) diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py index a361789f3dd..d9bf4730e51 100644 --- 
a/vllm_ascend/patch/worker/__init__.py +++ b/vllm_ascend/patch/worker/__init__.py @@ -27,3 +27,10 @@ import vllm_ascend.patch.worker.patch_weight_loader # noqa import vllm_ascend.patch.worker.patch_multimodal_merge # noqa import vllm_ascend.patch.worker.patch_minicpm # noqa +import vllm_ascend.patch.worker.patch_qwen2_5_vl # noqa + +from vllm_ascend.utils import vllm_version_is + +if vllm_version_is("0.11.0"): + import vllm_ascend.patch.worker.patch_deepseek_mtp # noqa + import vllm_ascend.patch.worker.patch_deepseek_v3_2 # noqa diff --git a/vllm_ascend/patch/worker/patch_qwen2_5_vl.py b/vllm_ascend/patch/worker/patch_qwen2_5_vl.py new file mode 100644 index 00000000000..d9a5a9e878c --- /dev/null +++ b/vllm_ascend/patch/worker/patch_qwen2_5_vl.py @@ -0,0 +1,118 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch_npu +from einops import rearrange +from vllm.model_executor.models.qwen2_5_vl import Qwen2_5_VisionAttention + +MIN_PAD_SIZE = 64 # min_size to pad weight +MAX_PAD_SIZE = 128 # max_size to pad weight + + +class AscendQwen2_5_VisionAttention(nn.Module): + + def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: + # [s, b, 3 * head * head_dim] + seq_len, bs, _ = qkv.shape + + # [s, b, 3 * head * head_dim] -> 3 * [s, b, head * head_dim] + q, k, v = qkv.chunk(3, dim=2) + + # 3 * [s, b, head * head_dim] -> 3 * [s, b, head, head_dim] + new_shape = ( + seq_len, + bs, + self.num_attention_heads_per_partition, + self.origin_hidden_size_per_attention_head, + ) + q, k, v = (x.view(*new_shape) for x in (q, k, v)) + return q, k, v + + def forward( + self, + x: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb_cos: torch.Tensor, + rotary_pos_emb_sin: torch.Tensor, + ) -> torch.Tensor: + self.enable_pad = False + self.origin_hidden_size_per_attention_head = self.hidden_size_per_attention_head + self.half_origin_hidden_size_per_attention_head = self.hidden_size_per_attention_head // 2 + if self.hidden_size_per_attention_head > MIN_PAD_SIZE \ + and self.hidden_size_per_attention_head < MAX_PAD_SIZE: + self.enable_pad = True + self.half_pad_hidden_size_per_attention_head = ( + MAX_PAD_SIZE - self.hidden_size_per_attention_head) // 2 + self.hidden_size_per_attention_head = MAX_PAD_SIZE + + # [s, b, c] --> [s, b, head * 3 * head_dim] + x, _ = self.qkv(x) + + # [s, b, 3 * head * head_dim] -> 3 * [s, b, head, head_dim] + q, k, v = self.split_qkv(x) + batch_size = q.shape[1] + + q, k, v = (rearrange(x, "s b ... 
-> b s ...").contiguous() + for x in (q, k, v)) + + cos = rotary_pos_emb_cos + sin = rotary_pos_emb_sin + if self.enable_pad: + origin_shape = q.shape[-1] + pad_len = MAX_PAD_SIZE - origin_shape + # [b, s, head, head_dim] + q = F.pad(q, (0, pad_len), mode="constant", value=0) + k = F.pad(k, (0, pad_len), mode="constant", value=0) + v = F.pad(v, (0, pad_len), mode="constant", value=0) + + q = torch_npu.npu_rotary_mul(q, cos, sin) + k = torch_npu.npu_rotary_mul(k, cos, sin) + + q, k, v = [ + rearrange(x, "b s h d -> (b s) h d").contiguous() + for x in (q, k, v) + ] + + context_layer = torch.empty_like(q) + + # operator requires pta version >= 2.5.1 + torch_npu._npu_flash_attention_unpad( + query=q, + key=k, + value=v, + seq_len=cu_seqlens, + scale_value=self.origin_hidden_size_per_attention_head**-0.5, + num_heads=self.num_attention_heads_per_partition, + num_kv_heads=self.num_attention_heads_per_partition, + out=context_layer) + + if self.enable_pad: + context_layer = context_layer[..., :origin_shape] + + context_layer = rearrange(context_layer, + "(b s) h d -> s b (h d)", + b=batch_size).contiguous() + + output, _ = self.proj(context_layer) + return output + + +Qwen2_5_VisionAttention.forward = AscendQwen2_5_VisionAttention.forward +Qwen2_5_VisionAttention.split_qkv = AscendQwen2_5_VisionAttention.split_qkv From 29fae873d1f13ae2c5310ea6f3412d51820cb4e0 Mon Sep 17 00:00:00 2001 From: shen-shanshan <467638484@qq.com> Date: Mon, 24 Nov 2025 08:21:36 +0000 Subject: [PATCH 02/16] remove more modeling files Signed-off-by: shen-shanshan <467638484@qq.com> --- vllm_ascend/models/qwen2_5_vl.py | 273 +------------------ vllm_ascend/patch/worker/patch_qwen2_5_vl.py | 30 +- 2 files changed, 34 insertions(+), 269 deletions(-) diff --git a/vllm_ascend/models/qwen2_5_vl.py b/vllm_ascend/models/qwen2_5_vl.py index cb12b7e4bc4..781626ac3d9 100644 --- a/vllm_ascend/models/qwen2_5_vl.py +++ b/vllm_ascend/models/qwen2_5_vl.py @@ -17,280 +17,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from functools import partial -from typing import Iterable, Optional, Set, Tuple, Union - import torch -import torch.nn as nn -import torch.nn.functional as F -from einops import rearrange -from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import ( - Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig) +from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import \ + Qwen2_5_VLConfig from vllm.config import VllmConfig -from vllm.distributed import parallel_state -from vllm.distributed import utils as dist_utils -from vllm.model_executor.layers.activation import get_act_and_mul_fn -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.qwen2_5_vl import ( - Qwen2_5_VisionBlock, Qwen2_5_VisionPatchEmbed, - Qwen2_5_VisionRotaryEmbedding, Qwen2_5_VisionTransformer, - Qwen2_5_VLDummyInputsBuilder, Qwen2_5_VLForConditionalGeneration, - Qwen2_5_VLMultiModalProcessor, Qwen2_5_VLProcessingInfo) + Qwen2_5_VisionTransformer, Qwen2_5_VLDummyInputsBuilder, + Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLMultiModalProcessor, + Qwen2_5_VLProcessingInfo) from vllm.model_executor.models.utils import maybe_prefix from vllm.multimodal import MULTIMODAL_REGISTRY from vllm_ascend.ascend_forward_context import set_ascend_forward_context -from vllm_ascend.utils import ACL_FORMAT_FRACTAL_ND, is_enable_nz - -MIN_PAD_SIZE = 64 # min_size to pad weight -MAX_PAD_SIZE = 128 # max_size to pad weight - - -class AscendQwen2_5_VisionBlock(Qwen2_5_VisionBlock): - - def forward( - self, - x: torch.Tensor, - cu_seqlens: torch.Tensor, - rotary_pos_emb_cos: torch.Tensor, - rotary_pos_emb_sin: torch.Tensor, - ) -> torch.Tensor: - x = x + self.attn(self.norm1(x), - cu_seqlens=cu_seqlens, - rotary_pos_emb_cos=rotary_pos_emb_cos, - rotary_pos_emb_sin=rotary_pos_emb_sin) - x = x + self.mlp(self.norm2(x)) - return x - - -class AscendQwen2_5_VisionRotaryEmbedding(Qwen2_5_VisionRotaryEmbedding): - - def __init__(self, dim: int, theta: float = 10000.0) -> None: - super().__init__(dim, theta) - inv_freq = 1.0 / (theta - **(torch.arange(0, dim, 2, dtype=torch.float) / dim)) - self.inv_freq = inv_freq - - -class AscendQwen2_5_VisionTransformer(Qwen2_5_VisionTransformer): - - def __init__( - self, - vision_config: Qwen2_5_VLVisionConfig, - norm_eps: float = 1e-6, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - interleaved=False, - ) -> None: - super().__init__(vision_config, norm_eps, quant_config, prefix) - norm_layer = partial(RMSNorm, eps=norm_eps) - self.interleaved = interleaved - self.enable_pad = False - head_dim = self.hidden_size // self.num_heads - self.rotary_pos_emb = AscendQwen2_5_VisionRotaryEmbedding(head_dim // - 2) - self.patch_embed = Qwen2_5_VisionPatchEmbed( - patch_size=vision_config.patch_size, - temporal_patch_size=vision_config.temporal_patch_size, - in_channels=vision_config.in_channels, - hidden_size=self.hidden_size, - ) - - act_fn = get_act_and_mul_fn(vision_config.hidden_act) - self.blocks = nn.ModuleList([ - AscendQwen2_5_VisionBlock( - dim=self.hidden_size, - num_heads=self.num_heads, - mlp_hidden_dim=vision_config.intermediate_size, - act_fn=act_fn, - norm_layer=norm_layer, - quant_config=quant_config, - prefix=f"{prefix}.blocks.{layer_idx}") - for layer_idx in range(vision_config.depth) - ]) - self.tp_size = parallel_state.get_tensor_model_parallel_world_size() - self.tp_rank = 
parallel_state.get_tensor_model_parallel_rank() - self.hidden_size_per_attention_head = dist_utils.divide( - self.hidden_size, self.num_heads) - - if self.hidden_size_per_attention_head > MIN_PAD_SIZE and self.hidden_size_per_attention_head < MAX_PAD_SIZE: - self.enable_pad = True - self.origin_hidden_size_per_attention_head = self.hidden_size_per_attention_head - self.half_origin_hidden_size_per_attention_head = self.hidden_size_per_attention_head // 2 - self.half_pad_hidden_size_per_attention_head = ( - MAX_PAD_SIZE - self.hidden_size_per_attention_head) // 2 - self.hidden_size_per_attention_head = MAX_PAD_SIZE - - def cal_cos_sin(self, rotary_pos_emb): - cos = rotary_pos_emb.cos() # [seqlen, rotary_dim / 2] - sin = rotary_pos_emb.sin() - if self.enable_pad: - cos = torch.nn.functional.pad( - cos, (0, self.half_pad_hidden_size_per_attention_head)) - sin = torch.nn.functional.pad( - sin, (0, self.half_pad_hidden_size_per_attention_head)) - - if not self.interleaved: - cos_new = torch.cat((cos, cos), dim=-1) - sin_new = torch.cat((sin, sin), dim=-1) - else: - cos_new = rearrange(torch.stack((cos, cos), dim=-1), - "... d two -> ...(d two)", - two=2) - sin_new = rearrange(torch.stack((sin, sin), dim=-1), - "... d two -> ...(d two)", - two=2) - cos_new = cos_new.reshape(1, -1, 1, - self.hidden_size_per_attention_head) - sin_new = sin_new.reshape(1, -1, 1, - self.hidden_size_per_attention_head) - return cos_new, sin_new - - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: - stacked_params_mapping: list[tuple[str, str, Union[str, int]]] = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("mlp.gate_up_proj.", "mlp.gate_proj.", 0), - ("mlp.gate_up_proj.", "mlp.up_proj.", 1), - ] - params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() - for name, loaded_weight in weights: - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - - def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: - pos_ids = [] - for t, h, w in grid_thw: - hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) - wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) - hpos_ids = hpos_ids.reshape( - h // self.spatial_merge_size, - self.spatial_merge_size, - w // self.spatial_merge_size, - self.spatial_merge_size, - ).permute(0, 2, 1, 3).flatten() - wpos_ids = wpos_ids.reshape( - h // self.spatial_merge_size, - self.spatial_merge_size, - w // self.spatial_merge_size, - self.spatial_merge_size, - ).permute(0, 2, 1, 3).flatten() - pos_ids.append( - torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) - pos_ids = torch.cat(pos_ids, dim=0) - max_grid_size = grid_thw[:, 1:].max() - rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) - rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) - return rotary_pos_emb - - def get_window_index(self, grid_thw): - window_index: list = [] - cu_window_seqlens: list = [0] - window_index_id = 0 - vit_merger_window_size = (self.window_size // - self.spatial_merge_size // self.patch_size) - - for 
grid_t, grid_h, grid_w in grid_thw: - llm_grid_h = grid_h // self.spatial_merge_size - llm_grid_w = grid_w // self.spatial_merge_size - index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape( - grid_t, llm_grid_h, llm_grid_w) - pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size - pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size - num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size - num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size - index_padded = F.pad(index, (0, pad_w, 0, pad_h), 'constant', -100) - index_padded = index_padded.reshape(grid_t, num_windows_h, - vit_merger_window_size, - num_windows_w, - vit_merger_window_size) - index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape( - grid_t, num_windows_h * num_windows_w, vit_merger_window_size, - vit_merger_window_size) - seqlens = (index_padded != -100).sum([2, 3]).reshape(-1) - index_padded = index_padded.reshape(-1) - index_new = index_padded[index_padded != -100] - window_index.append(index_new + window_index_id) - cu_seqlens_tmp = seqlens.cumsum( - 0) * self.spatial_merge_unit + cu_window_seqlens[-1] - cu_window_seqlens.extend(cu_seqlens_tmp.tolist()) - window_index_id += (grid_t * llm_grid_h * llm_grid_w).item() - window_index = torch.cat(window_index, dim=0) - return window_index, cu_window_seqlens - - def forward( - self, - x: torch.Tensor, - grid_thw: torch.Tensor, - ) -> torch.Tensor: - # compute cu_seqlens - cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], - grid_thw[:, - 0]).cpu().to(torch.int32) - - # patchify - x = self.patch_embed(x) - - # compute position embedding - rotary_pos_emb = self.rot_pos_emb(grid_thw) - - # windows attention - window_index, cu_window_seqlens = self.get_window_index(grid_thw) - cu_window_seqlens = torch.tensor( - cu_window_seqlens, - device=x.device, - dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32) - cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) - cu_window_seqlens = torch.diff(cu_window_seqlens).cpu().to(torch.int32) - seq_len, _ = x.size() - x = x.reshape(seq_len // self.spatial_merge_unit, - self.spatial_merge_unit, -1) - x = x[window_index, :, :] - x = x.reshape(seq_len, -1) - rotary_pos_emb = rotary_pos_emb.reshape( - seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1) - rotary_pos_emb = rotary_pos_emb[window_index, :, :] - rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1) - - cos, sin = self.cal_cos_sin(rotary_pos_emb) - - # transformers - x = x.unsqueeze(1) - for layer_num, blk in enumerate(self.blocks): - if layer_num in self.fullatt_block_indexes: - cu_seqlens_now = cu_seqlens - else: - cu_seqlens_now = cu_window_seqlens - x = blk(x, - cu_seqlens=cu_seqlens_now, - rotary_pos_emb_cos=cos, - rotary_pos_emb_sin=sin) - # adapter - x = self.merger(x) - reverse_indices = torch.argsort(window_index) - x = x[reverse_indices, :] - return x @MULTIMODAL_REGISTRY.register_processor( @@ -304,7 +43,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__(vllm_config=vllm_config, prefix=prefix) config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - self.visual = AscendQwen2_5_VisionTransformer( + self.visual = Qwen2_5_VisionTransformer( vision_config=config.vision_config, norm_eps=getattr(config, "rms_norm_eps", 1e-6), quant_config=quant_config, diff --git a/vllm_ascend/patch/worker/patch_qwen2_5_vl.py b/vllm_ascend/patch/worker/patch_qwen2_5_vl.py index d9a5a9e878c..1f6478d8aee 
100644 --- a/vllm_ascend/patch/worker/patch_qwen2_5_vl.py +++ b/vllm_ascend/patch/worker/patch_qwen2_5_vl.py @@ -51,10 +51,13 @@ def forward( cu_seqlens: torch.Tensor, rotary_pos_emb_cos: torch.Tensor, rotary_pos_emb_sin: torch.Tensor, + max_seqlen: torch.Tensor, + seqlens: torch.Tensor, ) -> torch.Tensor: self.enable_pad = False self.origin_hidden_size_per_attention_head = self.hidden_size_per_attention_head self.half_origin_hidden_size_per_attention_head = self.hidden_size_per_attention_head // 2 + if self.hidden_size_per_attention_head > MIN_PAD_SIZE \ and self.hidden_size_per_attention_head < MAX_PAD_SIZE: self.enable_pad = True @@ -74,13 +77,32 @@ def forward( cos = rotary_pos_emb_cos sin = rotary_pos_emb_sin + if self.enable_pad: origin_shape = q.shape[-1] pad_len = MAX_PAD_SIZE - origin_shape - # [b, s, head, head_dim] + # q/k/v: [b, s, head, head_dim] -> [b, s, head, MAX_PAD_SIZE] q = F.pad(q, (0, pad_len), mode="constant", value=0) k = F.pad(k, (0, pad_len), mode="constant", value=0) v = F.pad(v, (0, pad_len), mode="constant", value=0) + # cos/sin: [seqlen, rotary_dim / 2] -> [b, s, head, MAX_PAD_SIZE / 2] + cos = torch.nn.functional.pad( + cos, (0, self.half_pad_hidden_size_per_attention_head)) + sin = torch.nn.functional.pad( + sin, (0, self.half_pad_hidden_size_per_attention_head)) + + cos = rearrange( + torch.stack((cos, cos), dim=-1), + "... d two -> ...(d two)", + two=2, + ) + sin = rearrange( + torch.stack((sin, sin), dim=-1), + "... d two -> ...(d two)", + two=2, + ) + cos = cos.reshape(1, -1, 1, self.hidden_size_per_attention_head) + sin = sin.reshape(1, -1, 1, self.hidden_size_per_attention_head) q = torch_npu.npu_rotary_mul(q, cos, sin) k = torch_npu.npu_rotary_mul(k, cos, sin) @@ -90,6 +112,9 @@ def forward( for x in (q, k, v) ] + # Convert cumulative tensor to intervals and move it to cpu. 
+ cu_seqlens = torch.diff(cu_seqlens).to("cpu") + context_layer = torch.empty_like(q) # operator requires pta version >= 2.5.1 @@ -101,7 +126,8 @@ def forward( scale_value=self.origin_hidden_size_per_attention_head**-0.5, num_heads=self.num_attention_heads_per_partition, num_kv_heads=self.num_attention_heads_per_partition, - out=context_layer) + out=context_layer, + ) if self.enable_pad: context_layer = context_layer[..., :origin_shape] From a4b291aa90042139d22c1208a85e0effe2f0b143 Mon Sep 17 00:00:00 2001 From: shen-shanshan <467638484@qq.com> Date: Mon, 24 Nov 2025 08:40:36 +0000 Subject: [PATCH 03/16] remove qwen2.5-vl ut Signed-off-by: shen-shanshan <467638484@qq.com> --- tests/ut/models/test_qwen2_5_vl.py | 457 +---------------------------- 1 file changed, 2 insertions(+), 455 deletions(-) diff --git a/tests/ut/models/test_qwen2_5_vl.py b/tests/ut/models/test_qwen2_5_vl.py index 7111aaed6c8..efbb9c57555 100644 --- a/tests/ut/models/test_qwen2_5_vl.py +++ b/tests/ut/models/test_qwen2_5_vl.py @@ -1,461 +1,8 @@ -import pytest -import torch -import torch.nn.functional as F from pytest_mock import MockerFixture from tests.ut.base import PytestBase -from vllm_ascend.models.qwen2_5_vl import ( - AscendQwen2_5_VisionAttention, AscendQwen2_5_VisionBlock, - AscendQwen2_5_VisionPatchEmbed, AscendQwen2_5_VisionRotaryEmbedding, - AscendQwen2_5_VisionTransformer, AscendQwen2_5_VLForConditionalGeneration) - - -class TestAscendQwen2_5_VisionAttention(PytestBase): - - def init_attention( - self, - mocker, - embed_dim=1000, - num_heads=10, - projection_size=100, - quant_config=None, - prefix="", - ): - mocker_attn = mocker.patch( - "vllm_ascend.models.qwen2_5_vl.Qwen2_5_VisionAttention.__init__") - - attention = AscendQwen2_5_VisionAttention( - embed_dim=embed_dim, - num_heads=num_heads, - projection_size=projection_size, - quant_config=quant_config, - prefix=prefix, - ) - args, kwargs = mocker_attn.call_args - assert args == (embed_dim, num_heads, projection_size, None, "") - assert not kwargs - attention.num_attention_heads_per_partition = num_heads - return attention - - def test_attn_init_should_normal(self, mocker: MockerFixture): - embed_dim = 1000 - num_heads = 10 - projection_size = 100 - quant_config = None - prefix = "" - vit = self.init_attention( - embed_dim=embed_dim, - num_heads=num_heads, - projection_size=projection_size, - quant_config=quant_config, - prefix=prefix, - mocker=mocker, - ) - assert vit.embed_dim == 1000 - assert vit.hidden_size_per_attention_head == 10 - - def test_attn_init_should_raise_error(self, mocker: MockerFixture): - embed_dim = 1000 - num_heads = 7 - projection_size = 100 - quant_config = None - prefix = "" - with pytest.raises(AssertionError): - # projection_size should divided by num heads - self.init_attention( - mocker=mocker, - embed_dim=embed_dim, - num_heads=num_heads, - projection_size=projection_size, - quant_config=quant_config, - prefix=prefix, - ) - - def test_split_qkv(self, mocker: MockerFixture): - attention = self.init_attention(mocker=mocker) - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - q, k, v = attention.split_qkv(torch.rand((100, 10, 300))) - assert q.shape == (100, 10, 10, 10) - assert k.shape == (100, 10, 10, 10) - assert v.shape == (100, 10, 10, 10) - - def test_attn_forward(self, mocker: MockerFixture): - attention = self.init_attention(mocker=mocker) - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - 
mocker.patch("torch.nn.Module.__delattr__") - x = torch.rand((100, 3, 10 * 3 * 128)) # s,b, head*3*head_dim - cu_seqlens = torch.tensor([10, 50, 100]) - cos = torch.rand((1, 100, 1, 128)) - sin = torch.rand((1, 100, 1, 128)) - - qkv = lambda x: (x, 0) # noqa - split_qkv = lambda x: [ #noqa - torch.rand((100, 3, 10, 128)) for i in range(3) - ] # noqa - npu_rotary_mul = lambda q, cos, sin: q # noqa - _npu_flash_attention_unpad = lambda **kwargs: kwargs["out"] # noqa - proj = lambda x: (x, 0) # noqa - - mocker_qkv = mocker.patch.object(attention, "qkv", side_effect=qkv) - mocker_split_qkv = mocker.patch.object( - attention, - "split_qkv", - side_effect=split_qkv, - ) - mocker_npu_rotary_mul = mocker.patch("torch_npu.npu_rotary_mul", - side_effect=npu_rotary_mul) - mocker_npu_flash_attention_unpad = mocker.patch( - "torch_npu._npu_flash_attention_unpad", - side_effect=_npu_flash_attention_unpad, - ) - mocker_proj = mocker.patch.object(attention, "proj", side_effect=proj) - attention.__dict__["qkv"] = mocker_qkv - attention.__dict__["split_qkv"] = mocker_split_qkv - attention.__dict__["npu_rotary_mul"] = mocker_npu_rotary_mul - attention.__dict__["_npu_flash_attention_unpad"] = ( - mocker_npu_flash_attention_unpad) - attention.__dict__["proj"] = mocker_proj - - output = attention.forward( - x=x, - cu_seqlens=cu_seqlens, - cos=cos, - sin=sin, - ) - qkv_args, qkv_kwargs = mocker_qkv.call_args - assert qkv_args == (x, ) - assert not qkv_kwargs - - split_qkv_args, split_qkv_kwargs = mocker_split_qkv.call_args - assert split_qkv_args == (x, ) - assert not split_qkv_kwargs - - npu_rotary_mul_args, npu_rotary_mul_kwargs = mocker_npu_rotary_mul.call_args - assert npu_rotary_mul_args[1:] == (cos, sin) - assert npu_rotary_mul_args[0].shape == torch.Size([3, 100, 10, 128]) - assert not npu_rotary_mul_kwargs - - assert output.shape == torch.Size([100, 3, 1280]) - - -class TestAscendQwen2_5_VisionBlock(PytestBase): - - def init_vision_block( - self, - mocker, - dim=100, - num_heads=10, - mlp_hidden_dim=100, - ): - mocker_vit = mocker.patch( - "vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VisionBlock.__init__", - return_value=None, - ) - - mocker_attn = mocker.patch( - "vllm_ascend.models.qwen2_5_vl.AscendQwen2_5_VisionAttention.__init__", - return_value=None, - ) - - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - vision_block = AscendQwen2_5_VisionBlock( - dim=dim, - num_heads=num_heads, - mlp_hidden_dim=mlp_hidden_dim, - ) - args, kwargs = mocker_vit.call_args - assert args == (dim, num_heads, mlp_hidden_dim, F.silu, None, None, "") - assert not kwargs - - args1, kwargs1 = mocker_attn.call_args - assert not args1 - assert kwargs1 == { - "embed_dim": dim, - "num_heads": num_heads, - "projection_size": dim, - "quant_config": None, - "prefix": ".attn", - } - return vision_block - - def test_init_vision_block_should_normal( - self, - mocker: MockerFixture, - ): - vision_block = self.init_vision_block(mocker) - assert isinstance(vision_block, AscendQwen2_5_VisionBlock) - - def test_vision_block_forward(self, mocker: MockerFixture): - x = torch.randint(1, 100, (100, 3, 1280)) # s,b,d - cu_seqlens = torch.tensor([10, 50, 100]) - cos = torch.rand((1, 100, 1, 128)) - sin = torch.rand((1, 100, 1, 128)) - vision_block = self.init_vision_block(mocker) - mocker_attn = mocker.patch.object(vision_block, "attn", return_value=x) - mocker_mlp = mocker.patch.object(vision_block, "mlp", return_value=x) - vision_block.__dict__["attn"] = 
mocker_attn - vision_block.__dict__["mlp"] = mocker_mlp - - output = vision_block.forward(x.clone(), cu_seqlens, cos, sin) - - _, attn_kwargs = mocker_attn.call_args - assert attn_kwargs == { - "cu_seqlens": cu_seqlens, - "cos": cos, - "sin": sin, - } - - assert torch.all(x * 3 == output) - - -class TestAscendQwen2_5_VisionPatchEmbed(PytestBase): - - def test_forward(self): - patch_embed = AscendQwen2_5_VisionPatchEmbed() - - ret = patch_embed(torch.rand((120, 1176))) - assert ret.shape == (120, 1152) - - -class TestAscendQwen2_5_VisionRotaryEmbedding(PytestBase): - - def init_rotary_embedding( - self, - mocker, - dim=128, - ): - mocker_ebed = mocker.patch( - "vllm_ascend.models.qwen2_5_vl.Qwen2_5_VisionRotaryEmbedding.__init__", - return_value=None, - ) - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - rotary_embedding = AscendQwen2_5_VisionRotaryEmbedding(dim=dim, ) - args, kwargs = mocker_ebed.call_args - assert args == (dim, 10000.0) - assert not kwargs - return rotary_embedding - - def test_init_rotary_embedding_should_normal(self, mocker: MockerFixture): - rotary_embedding = self.init_rotary_embedding(mocker) - assert isinstance(rotary_embedding, - AscendQwen2_5_VisionRotaryEmbedding) - - -class TestAscendQwen2_5_VisionTransformer(PytestBase): - - input_data = torch.tensor([[0.1, 0.2], [0.3, 0.4]]) - - def init_vision_transformer( - self, - mocker, - ): - norm_eps = 1e-6 - vision_config = mocker.MagicMock() - vision_config.patch_size = 16 - vision_config.temporal_patch_size = 2 - vision_config.in_channels = 3 - vision_config.hidden_act = "gelu" - vision_config.depth = 0 - vision_config.num_heads = 10 - vision_config.hidden_size = 300 - - mocker.patch( - "vllm_ascend.models.qwen2_5_vl.parallel_state.get_tensor_model_parallel_rank", - return_value=0, - ) - mocker.patch("vllm.distributed.utils.divide", return_value=100) - mocker.patch( - "vllm.model_executor.layers.linear.get_tensor_model_parallel_world_size", - return_value=2, - ) - mocker.patch( - "vllm.model_executor.layers.linear.divide", - return_value=2, - ) - mocker.patch( - "vllm.model_executor.layers.linear.get_tensor_model_parallel_rank", - return_value=0) - mocker.patch( - "vllm_ascend.models.qwen2_5_vl.parallel_state.get_tensor_model_parallel_world_size", - return_value=2, - ) - mocker.patch( - "vllm_ascend.ops.linear.divide", - return_value=2, - ) - - mock_group = mocker.MagicMock() - mock_group.rank_in_group = 0 - mock_group.world_size = 2 - mocker.patch( - "vllm_ascend.ops.linear_op.get_tp_group", - return_value=mock_group, - ) - mocker.patch( - "vllm.distributed.parallel_state.get_tp_group", - return_value=mock_group, - ) - - vision_transformer = AscendQwen2_5_VisionTransformer( - vision_config, - norm_eps, - ) - - assert not vision_transformer.interleaved - return vision_transformer - - def test_init_vision_transformer(self, mocker: MockerFixture): - vision_transformer = self.init_vision_transformer(mocker) - assert isinstance(vision_transformer, AscendQwen2_5_VisionTransformer) - - @pytest.mark.parametrize( - "interleaved, expected", - [ - ( - False, - torch.tensor([ - input_data[0, 0].cos(), - input_data[0, 1].cos(), - input_data[0, 0].cos(), - input_data[0, 1].cos(), - input_data[1, 0].cos(), - input_data[1, 1].cos(), - input_data[1, 0].cos(), - input_data[1, 1].cos(), - ]), - ), - ( - True, - torch.tensor([ - input_data[0, 0].cos(), - input_data[0, 0].cos(), - input_data[0, 1].cos(), - input_data[0, 1].cos(), - input_data[1, 
0].cos(), - input_data[1, 0].cos(), - input_data[1, 1].cos(), - input_data[1, 1].cos(), - ]), - ), - ], - ) - def test_cal_cos_sin(self, interleaved, expected, mocker: MockerFixture): - vision_transformer = self.init_vision_transformer(mocker) - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - vision_transformer.__dict__["interleaved"] = interleaved - vision_transformer.__dict__["hidden_size_per_attention_head"] = 2 - vision_transformer.hidden_size_per_attention_head = 4 - cos_new, _ = vision_transformer.cal_cos_sin(self.input_data) - assert cos_new.shape == (1, 32, 1, 2) - - def test_pad_qkv_bias(self, mocker: MockerFixture): - attention = self.init_vision_transformer(mocker) - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - res = attention.pad_qkv_bias(torch.rand((300))) - assert res.shape[0] == 384 - - def test_pad_qkv_weight(self, mocker: MockerFixture): - attention = self.init_vision_transformer(mocker) - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - mocker.patch( - "torch_npu.npu_format_cast", - return_value=torch.rand((384, 300)), - ) - res = attention.pad_qkv_weight(torch.rand((300, 300))) - assert res.shape == (384, 300) - - def test_pad_proj_weight(self, mocker: MockerFixture): - attention = self.init_vision_transformer(mocker) - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - mocker.patch( - "torch_npu.npu_format_cast", - return_value=torch.rand((300, 384)), - ) - res = attention.pad_proj_weight(torch.rand((300, 300))) - assert res.shape == (300, 384) - - def test_pad_qkv_weight_scale_offset(self, mocker: MockerFixture): - attention = self.init_vision_transformer(mocker) - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - res = attention.pad_qkv_weight_scale_offset(torch.rand((300, 1))) - assert res.shape == (384, 1) - - def test_pad_qkv_deq_scale_quant_bias(self, mocker: MockerFixture): - attention = self.init_vision_transformer(mocker) - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - res = attention.pad_qkv_deq_scale_quant_bias(torch.rand((300))) - assert res.shape[0] == 384 - - def test_forward(self, mocker: MockerFixture): - vision_transformer = self.init_vision_transformer(mocker) - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - x = torch.randn(1, 3, 224, 224) - grid_thw = torch.tensor([[1, 4, 4]]) - mocker_patch_embed = mocker.patch.object( - vision_transformer, - "patch_embed", - side_effect=lambda _: torch.randn(16, 512), # noqa - ) - mocker_rot_pos_emb = mocker.patch.object( - vision_transformer, - "rot_pos_emb", - side_effect=lambda _: torch.randn(16, 64), # noqa - ) - mocker_get_window_index = mocker.patch.object( - vision_transformer, - "get_window_index", - side_effect=lambda _: (torch.arange(8), [4, 8, 12, 16]), # noqa - ) - mocker_cal_cos_sin = mocker.patch.object( - vision_transformer, - "cal_cos_sin", - side_effect=lambda _: - (torch.randn(16, 32), torch.randn(16, 32)), # noqa - ) - mocker_merger = mocker.patch.object( - vision_transformer, - 
"merger", - side_effect=lambda _: torch.randn(16, 256), # noqa - ) - vision_transformer.__dict__["vision_blocks"] = [ - lambda *args, **kwargs: torch.randn(16, 1, 512) # noqa - ] - vision_transformer.__dict__["patch_embed"] = mocker_patch_embed - vision_transformer.__dict__["rot_pos_emb"] = mocker_rot_pos_emb - vision_transformer.__dict__[ - "get_window_index"] = mocker_get_window_index - vision_transformer.__dict__["cal_cos_sin"] = mocker_cal_cos_sin - vision_transformer.__dict__["merger"] = mocker_merger - vision_transformer.__dict__["fullatt_block_indexes"] = [0, 2] - vision_transformer.__dict__["spatial_merge_unit"] = 2 - ret = vision_transformer.forward(x, grid_thw) - assert ret.shape == (8, 256) - mocker_patch_embed.assert_called_with(x) - mocker_rot_pos_emb.assert_called_with(grid_thw) - mocker_get_window_index.assert_called_with(grid_thw) - mocker_cal_cos_sin.assert_called_once() - mocker_merger.assert_called_once() +from vllm_ascend.models.qwen2_5_vl import \ + AscendQwen2_5_VLForConditionalGeneration class TestAscendQwen2_5_VLForConditionalGeneration(PytestBase): From c198058c164d9e46134797310e03dbc0794a53c4 Mon Sep 17 00:00:00 2001 From: shen-shanshan <467638484@qq.com> Date: Mon, 24 Nov 2025 12:29:32 +0000 Subject: [PATCH 04/16] add padding manager Signed-off-by: shen-shanshan <467638484@qq.com> --- vllm_ascend/patch/worker/patch_qwen2_5_vl.py | 173 ++++++++++--------- 1 file changed, 88 insertions(+), 85 deletions(-) diff --git a/vllm_ascend/patch/worker/patch_qwen2_5_vl.py b/vllm_ascend/patch/worker/patch_qwen2_5_vl.py index 1f6478d8aee..d2115285beb 100644 --- a/vllm_ascend/patch/worker/patch_qwen2_5_vl.py +++ b/vllm_ascend/patch/worker/patch_qwen2_5_vl.py @@ -15,6 +15,8 @@ # limitations under the License. # +from contextlib import contextmanager + import torch import torch.nn as nn import torch.nn.functional as F @@ -26,24 +28,55 @@ MAX_PAD_SIZE = 128 # max_size to pad weight -class AscendQwen2_5_VisionAttention(nn.Module): - - def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: - # [s, b, 3 * head * head_dim] - seq_len, bs, _ = qkv.shape +@contextmanager +def _padding_manager( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + origin_shape: int, + hidden_size_per_attention_head: int, +): + enable_pad = (hidden_size_per_attention_head > MIN_PAD_SIZE + and hidden_size_per_attention_head < MAX_PAD_SIZE) + + if enable_pad: + half_pad_hidden_size_per_attention_head = ( + MAX_PAD_SIZE - hidden_size_per_attention_head) // 2 + hidden_size_per_attention_head = MAX_PAD_SIZE + + pad_len = MAX_PAD_SIZE - origin_shape + # q/k/v: [b, s, head, head_dim] -> [b, s, head, MAX_PAD_SIZE] + q = F.pad(q, (0, pad_len), mode="constant", value=0) + k = F.pad(k, (0, pad_len), mode="constant", value=0) + v = F.pad(v, (0, pad_len), mode="constant", value=0) + # cos/sin: [seqlen, rotary_dim / 2] -> [b, s, head, MAX_PAD_SIZE / 2] + cos = torch.nn.functional.pad( + cos, (0, half_pad_hidden_size_per_attention_head)) + sin = torch.nn.functional.pad( + sin, (0, half_pad_hidden_size_per_attention_head)) + + cos = rearrange( + torch.stack((cos, cos), dim=-1), + "... d two -> ...(d two)", + two=2, + ) + sin = rearrange( + torch.stack((sin, sin), dim=-1), + "... 
d two -> ...(d two)", + two=2, + ) + cos = cos.reshape(1, -1, 1, hidden_size_per_attention_head) + sin = sin.reshape(1, -1, 1, hidden_size_per_attention_head) + + try: + yield (q, k, v, cos, sin, enable_pad) + finally: + pass - # [s, b, 3 * head * head_dim] -> 3 * [s, b, head * head_dim] - q, k, v = qkv.chunk(3, dim=2) - # 3 * [s, b, head * head_dim] -> 3 * [s, b, head, head_dim] - new_shape = ( - seq_len, - bs, - self.num_attention_heads_per_partition, - self.origin_hidden_size_per_attention_head, - ) - q, k, v = (x.view(*new_shape) for x in (q, k, v)) - return q, k, v +class AscendQwen2_5_VisionAttention(nn.Module): def forward( self, @@ -54,91 +87,61 @@ def forward( max_seqlen: torch.Tensor, seqlens: torch.Tensor, ) -> torch.Tensor: - self.enable_pad = False - self.origin_hidden_size_per_attention_head = self.hidden_size_per_attention_head - self.half_origin_hidden_size_per_attention_head = self.hidden_size_per_attention_head // 2 - - if self.hidden_size_per_attention_head > MIN_PAD_SIZE \ - and self.hidden_size_per_attention_head < MAX_PAD_SIZE: - self.enable_pad = True - self.half_pad_hidden_size_per_attention_head = ( - MAX_PAD_SIZE - self.hidden_size_per_attention_head) // 2 - self.hidden_size_per_attention_head = MAX_PAD_SIZE - # [s, b, c] --> [s, b, head * 3 * head_dim] x, _ = self.qkv(x) # [s, b, 3 * head * head_dim] -> 3 * [s, b, head, head_dim] q, k, v = self.split_qkv(x) batch_size = q.shape[1] + origin_shape = q.shape[-1] q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v)) - cos = rotary_pos_emb_cos - sin = rotary_pos_emb_sin - - if self.enable_pad: - origin_shape = q.shape[-1] - pad_len = MAX_PAD_SIZE - origin_shape - # q/k/v: [b, s, head, head_dim] -> [b, s, head, MAX_PAD_SIZE] - q = F.pad(q, (0, pad_len), mode="constant", value=0) - k = F.pad(k, (0, pad_len), mode="constant", value=0) - v = F.pad(v, (0, pad_len), mode="constant", value=0) - # cos/sin: [seqlen, rotary_dim / 2] -> [b, s, head, MAX_PAD_SIZE / 2] - cos = torch.nn.functional.pad( - cos, (0, self.half_pad_hidden_size_per_attention_head)) - sin = torch.nn.functional.pad( - sin, (0, self.half_pad_hidden_size_per_attention_head)) - - cos = rearrange( - torch.stack((cos, cos), dim=-1), - "... d two -> ...(d two)", - two=2, - ) - sin = rearrange( - torch.stack((sin, sin), dim=-1), - "... d two -> ...(d two)", - two=2, - ) - cos = cos.reshape(1, -1, 1, self.hidden_size_per_attention_head) - sin = sin.reshape(1, -1, 1, self.hidden_size_per_attention_head) - - q = torch_npu.npu_rotary_mul(q, cos, sin) - k = torch_npu.npu_rotary_mul(k, cos, sin) - - q, k, v = [ - rearrange(x, "b s h d -> (b s) h d").contiguous() - for x in (q, k, v) - ] - # Convert cumulative tensor to intervals and move it to cpu. cu_seqlens = torch.diff(cu_seqlens).to("cpu") - context_layer = torch.empty_like(q) - - # operator requires pta version >= 2.5.1 - torch_npu._npu_flash_attention_unpad( - query=q, - key=k, - value=v, - seq_len=cu_seqlens, - scale_value=self.origin_hidden_size_per_attention_head**-0.5, - num_heads=self.num_attention_heads_per_partition, - num_kv_heads=self.num_attention_heads_per_partition, - out=context_layer, - ) - - if self.enable_pad: - context_layer = context_layer[..., :origin_shape] - - context_layer = rearrange(context_layer, - "(b s) h d -> s b (h d)", - b=batch_size).contiguous() + with _padding_manager( + q=q, + k=k, + v=v, + cos=rotary_pos_emb_cos, + sin=rotary_pos_emb_sin, + origin_shape=origin_shape, + hidden_size_per_attention_head=self. 
+ hidden_size_per_attention_head, + ) as (q, k, v, cos, sin, enable_pad): + q = torch_npu.npu_rotary_mul(q, cos, sin) + k = torch_npu.npu_rotary_mul(k, cos, sin) + + q, k, v = [ + rearrange(x, "b s h d -> (b s) h d").contiguous() + for x in (q, k, v) + ] + + context_layer = torch.empty_like(q) + + # operator requires pta version >= 2.5.1 + torch_npu._npu_flash_attention_unpad( + query=q, + key=k, + value=v, + seq_len=cu_seqlens, + scale_value=self.hidden_size_per_attention_head**-0.5, + num_heads=self.num_attention_heads_per_partition, + num_kv_heads=self.num_attention_heads_per_partition, + out=context_layer, + ) + + if enable_pad: + context_layer = context_layer[..., :origin_shape] + + context_layer = rearrange(context_layer, + "(b s) h d -> s b (h d)", + b=batch_size).contiguous() output, _ = self.proj(context_layer) return output Qwen2_5_VisionAttention.forward = AscendQwen2_5_VisionAttention.forward -Qwen2_5_VisionAttention.split_qkv = AscendQwen2_5_VisionAttention.split_qkv From a674f808421d611f265954dca30c91644b06c3d2 Mon Sep 17 00:00:00 2001 From: shen-shanshan <467638484@qq.com> Date: Mon, 24 Nov 2025 12:42:37 +0000 Subject: [PATCH 05/16] sync main Signed-off-by: shen-shanshan <467638484@qq.com> --- vllm_ascend/models/qwen2_5_vl.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm_ascend/models/qwen2_5_vl.py b/vllm_ascend/models/qwen2_5_vl.py index 781626ac3d9..1bf7518b2c8 100644 --- a/vllm_ascend/models/qwen2_5_vl.py +++ b/vllm_ascend/models/qwen2_5_vl.py @@ -31,7 +31,6 @@ from vllm_ascend.ascend_forward_context import set_ascend_forward_context - @MULTIMODAL_REGISTRY.register_processor( Qwen2_5_VLMultiModalProcessor, info=Qwen2_5_VLProcessingInfo, From f391ac13674911ab7f49e8de93f1c751b11608b0 Mon Sep 17 00:00:00 2001 From: shen-shanshan <467638484@qq.com> Date: Tue, 25 Nov 2025 07:50:58 +0000 Subject: [PATCH 06/16] remove qwen2.5-vl without padding Signed-off-by: shen-shanshan <467638484@qq.com> --- tests/ut/models/test_qwen2_5_vl.py | 35 -- .../models/test_qwen2_5_vl_without_padding.py | 422 ------------------ vllm_ascend/models/__init__.py | 13 - vllm_ascend/models/qwen2_5_vl.py | 86 ---- .../models/qwen2_5_vl_without_padding.py | 371 +-------------- vllm_ascend/patch/worker/patch_qwen2_5_vl.py | 50 ++- 6 files changed, 57 insertions(+), 920 deletions(-) delete mode 100644 tests/ut/models/test_qwen2_5_vl.py delete mode 100644 tests/ut/models/test_qwen2_5_vl_without_padding.py delete mode 100644 vllm_ascend/models/qwen2_5_vl.py diff --git a/tests/ut/models/test_qwen2_5_vl.py b/tests/ut/models/test_qwen2_5_vl.py deleted file mode 100644 index efbb9c57555..00000000000 --- a/tests/ut/models/test_qwen2_5_vl.py +++ /dev/null @@ -1,35 +0,0 @@ -from pytest_mock import MockerFixture - -from tests.ut.base import PytestBase -from vllm_ascend.models.qwen2_5_vl import \ - AscendQwen2_5_VLForConditionalGeneration - - -class TestAscendQwen2_5_VLForConditionalGeneration(PytestBase): - - def test_init_vl_for_conditional_generation(self, mocker: MockerFixture): - vllm_config = mocker.MagicMock() - vllm_config.vision_config = "vision_config" - vllm_config.rms_norm_eps = 1e-5 - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - mocker_vl = mocker.patch( - "vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VLForConditionalGeneration.__init__", - return_value=None, - ) - mocker_vit = mocker.patch( - "vllm_ascend.models.qwen2_5_vl.AscendQwen2_5_VisionTransformer.__init__", - return_value=None, - ) - - 
vl_for_conditional_generation = AscendQwen2_5_VLForConditionalGeneration( - vllm_config=vllm_config) - args, kwargs = mocker_vl.call_args - assert not args - assert kwargs == {"vllm_config": vllm_config, "prefix": ""} - mocker_vit.assert_called_once() - assert isinstance( - vl_for_conditional_generation, - AscendQwen2_5_VLForConditionalGeneration, - ) diff --git a/tests/ut/models/test_qwen2_5_vl_without_padding.py b/tests/ut/models/test_qwen2_5_vl_without_padding.py deleted file mode 100644 index 00caf810e61..00000000000 --- a/tests/ut/models/test_qwen2_5_vl_without_padding.py +++ /dev/null @@ -1,422 +0,0 @@ -import pytest -import torch -import torch.nn.functional as F -from pytest_mock import MockerFixture -from vllm.model_executor.models.qwen2_5_vl import \ - Qwen2_5_VLForConditionalGeneration - -from tests.ut.base import PytestBase -from vllm_ascend.models.qwen2_5_vl_without_padding import ( - AscendQwen2_5_VisionAttention_Without_Padding, - AscendQwen2_5_VisionBlock_Without_Padding, - AscendQwen2_5_VisionPatchEmbed_Without_Padding, - AscendQwen2_5_VisionTransformer_Without_Padding, - AscendQwen2_5_VLForConditionalGeneration_Without_Padding) - - -class TestAscendQwen2_5_VisionAttention_Without_Padding(PytestBase): - - def init_attention( - self, - mocker, - embed_dim=1000, - num_heads=10, - projection_size=100, - quant_config=None, - prefix="", - ): - mocker_attn = mocker.patch( - "vllm_ascend.models.qwen2_5_vl_without_padding.Qwen2_5_VisionAttention.__init__" - ) - - attention = AscendQwen2_5_VisionAttention_Without_Padding( - embed_dim=embed_dim, - num_heads=num_heads, - projection_size=projection_size, - quant_config=quant_config, - prefix=prefix, - ) - args, kwargs = mocker_attn.call_args - assert args == (embed_dim, num_heads, projection_size, None, "") - assert not kwargs - attention.num_attention_heads_per_partition = num_heads - return attention - - def test_vit_init_should_normal(self, mocker: MockerFixture): - embed_dim = 1000 - num_heads = 10 - projection_size = 100 - quant_config = None - prefix = "" - vit = self.init_attention( - embed_dim=embed_dim, - num_heads=num_heads, - projection_size=projection_size, - quant_config=quant_config, - prefix=prefix, - mocker=mocker, - ) - assert vit.embed_dim == 1000 - assert vit.hidden_size_per_attention_head == 10 - - def test_vit_init_should_raise_error(self, mocker: MockerFixture): - embed_dim = 1000 - num_heads = 7 - projection_size = 100 - quant_config = None - prefix = "" - with pytest.raises(AssertionError): - # projection_size should divided by num heads - self.init_attention( - mocker=mocker, - embed_dim=embed_dim, - num_heads=num_heads, - projection_size=projection_size, - quant_config=quant_config, - prefix=prefix, - ) - - def test_vit_forward(self, mocker: MockerFixture): - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - attention = self.init_attention(mocker=mocker) - x = torch.rand((100, 3, 10 * 3 * 128)) # s,b, head*3*head_dim - cu_seqlens = torch.tensor([10, 50, 100]) - cos = torch.rand((1, 100, 1, 128)) - sin = torch.rand((1, 100, 1, 128)) - - qkv = lambda x: (x, 0) # noqa - split_qkv = lambda x: [ #noqa - torch.rand((100, 3, 10, 128)) for i in range(3) - ] # noqa - npu_rotary_mul = lambda q, cos, sin: q # noqa - _npu_flash_attention_unpad = lambda **kwargs: kwargs["out"] # noqa - proj = lambda x: (x, 0) # noqa - - mocker_qkv = mocker.patch.object(attention, "qkv", side_effect=qkv) - mocker_split_qkv = mocker.patch.object( - 
attention, - "split_qkv", - side_effect=split_qkv, - ) - mocker_npu_rotary_mul = mocker.patch("torch_npu.npu_rotary_mul", - side_effect=npu_rotary_mul) - mocker_npu_flash_attention_unpad = mocker.patch( - "torch_npu._npu_flash_attention_unpad", - side_effect=_npu_flash_attention_unpad, - ) - mocker_proj = mocker.patch.object(attention, "proj", side_effect=proj) - attention.__dict__["qkv"] = mocker_qkv - attention.__dict__["split_qkv"] = mocker_split_qkv - attention.__dict__["npu_rotary_mul"] = mocker_npu_rotary_mul - attention.__dict__["_npu_flash_attention_unpad"] = ( - mocker_npu_flash_attention_unpad) - attention.__dict__["proj"] = mocker_proj - - output = attention.forward( - x=x, - cu_seqlens=cu_seqlens, - cos=cos, - sin=sin, - ) - qkv_args, qkv_kwargs = mocker_qkv.call_args - assert qkv_args == (x, ) - assert not qkv_kwargs - - split_qkv_args, split_qkv_kwargs = mocker_split_qkv.call_args - assert split_qkv_args == (x, ) - assert not split_qkv_kwargs - - npu_rotary_mul_args, npu_rotary_mul_kwargs = mocker_npu_rotary_mul.call_args - assert npu_rotary_mul_args[1:] == (cos, sin) - assert npu_rotary_mul_args[0].shape == torch.Size([3, 100, 10, 128]) - assert not npu_rotary_mul_kwargs - - assert output.shape == torch.Size([100, 3, 1280]) - - -class TestAscendQwen2_5_VisionBlock_Without_Padding(PytestBase): - - def init_vision_block( - self, - mocker, - dim=100, - num_heads=10, - mlp_hidden_dim=100, - ): - mocker_vit = mocker.patch( - "vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VisionBlock.__init__", - return_value=None, - ) - - mocker_attn = mocker.patch( - "vllm_ascend.models.qwen2_5_vl_without_padding.AscendQwen2_5_VisionAttention_Without_Padding.__init__", - return_value=None, - ) - - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - vision_block = AscendQwen2_5_VisionBlock_Without_Padding( - dim=dim, - num_heads=num_heads, - mlp_hidden_dim=mlp_hidden_dim, - ) - args, kwargs = mocker_vit.call_args - assert args == (dim, num_heads, mlp_hidden_dim, F.silu, None, None, "") - assert not kwargs - - args1, kwargs1 = mocker_attn.call_args - assert not args1 - assert kwargs1 == { - "embed_dim": dim, - "num_heads": num_heads, - "projection_size": dim, - "quant_config": None, - "prefix": ".attn", - } - return vision_block - - def test_init_vision_block_should_normal( - self, - mocker: MockerFixture, - ): - vision_block = self.init_vision_block(mocker) - assert isinstance(vision_block, - AscendQwen2_5_VisionBlock_Without_Padding) - - def test_vision_block_forward(self, mocker: MockerFixture): - x = torch.randint(1, 100, (100, 3, 1280)) # s,b,d - cu_seqlens = torch.tensor([10, 50, 100]) - cos = torch.rand((1, 100, 1, 128)) - sin = torch.rand((1, 100, 1, 128)) - vision_block = self.init_vision_block(mocker) - mocker_attn = mocker.patch.object(vision_block, "attn", return_value=x) - mocker_mlp = mocker.patch.object(vision_block, "mlp", return_value=x) - vision_block.__dict__["attn"] = mocker_attn - vision_block.__dict__["mlp"] = mocker_mlp - - output = vision_block.forward(x.clone(), cu_seqlens, cos, sin) - - _, attn_kwargs = mocker_attn.call_args - assert attn_kwargs == { - "cu_seqlens": cu_seqlens, - "cos": cos, - "sin": sin, - } - - assert torch.all(x * 3 == output) - - -class TestAscendQwen2_5_VisionPatchEmbed_Without_Padding(PytestBase): - - def test_forward(self): - patch_embed = AscendQwen2_5_VisionPatchEmbed_Without_Padding() - - ret = patch_embed(torch.rand((120, 1176))) - assert ret.shape == (120, 
1152) - - -class TestAscendQwen2_5_VisionTransformer_Without_Padding(PytestBase): - - input_data = torch.tensor([[0.1, 0.2], [0.3, 0.4]]) - - def init_vision_transformer( - self, - mocker, - ): - norm_eps = 1e-6 - vision_config = mocker.MagicMock() - vision_config.patch_size = 16 - vision_config.temporal_patch_size = 2 - vision_config.in_channels = 3 - vision_config.hidden_act = "gelu" - vision_config.depth = 0 - vision_config.hidden_size = 1280 - vision_config.num_heads = 16 - - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - mocker_vit = mocker.patch( - "vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VisionTransformer.__init__", - return_value=None, - ) - mocker_vision_rotary_embedding = mocker.patch( - "vllm_ascend.models.qwen2_5_vl.AscendQwen2_5_VisionRotaryEmbedding.__init__", - return_value=None, - ) - mocker.patch( - "vllm_ascend.models.qwen2_5_vl_without_padding.AscendQwen2_5_VisionBlock_Without_Padding.__init__", - return_value=None, - ) - mocker.patch( - "vllm_ascend.models.qwen2_5_vl_without_padding.AscendQwen2_5_VisionPatchEmbed_Without_Padding.__init__", - return_value=None, - ) - mocker.patch( - "vllm_ascend.models.qwen2_5_vl_without_padding.parallel_state.get_tensor_model_parallel_world_size", - return_value=1, - ) - mocker.patch( - "vllm_ascend.models.qwen2_5_vl_without_padding.parallel_state.get_tensor_model_parallel_rank", - return_value=0, - ) - mocker.patch("vllm.distributed.utils.divide", return_value=100) - - vision_transformer = AscendQwen2_5_VisionTransformer_Without_Padding( - vision_config, - norm_eps, - ) - args, kwargs = mocker_vit.call_args - assert args == (vision_config, norm_eps, None, "") - assert not kwargs - mocker_vision_rotary_embedding.assert_called_once() - return vision_transformer - - def test_init_vision_transformer(self, mocker: MockerFixture): - vision_transformer = self.init_vision_transformer(mocker) - assert isinstance(vision_transformer, - AscendQwen2_5_VisionTransformer_Without_Padding) - - @pytest.mark.parametrize( - "interleaved, expected", - [ - ( - False, - torch.tensor([ - input_data[0, 0].cos(), - input_data[0, 1].cos(), - input_data[0, 0].cos(), - input_data[0, 1].cos(), - input_data[1, 0].cos(), - input_data[1, 1].cos(), - input_data[1, 0].cos(), - input_data[1, 1].cos(), - ]), - ), - ( - True, - torch.tensor([ - input_data[0, 0].cos(), - input_data[0, 0].cos(), - input_data[0, 1].cos(), - input_data[0, 1].cos(), - input_data[1, 0].cos(), - input_data[1, 0].cos(), - input_data[1, 1].cos(), - input_data[1, 1].cos(), - ]), - ), - ], - ) - def test_cal_cos_sin(self, interleaved, expected, mocker: MockerFixture): - vision_transformer = self.init_vision_transformer(mocker) - vision_transformer.__dict__["interleaved"] = interleaved - vision_transformer.__dict__["hidden_size_per_attention_head"] = 2 - vision_transformer.hidden_size_per_attention_head = 4 - cos_new, _ = vision_transformer.cal_cos_sin(self.input_data) - assert cos_new.shape == (1, 4, 1, 2) - assert torch.allclose(cos_new.view(-1), expected) - - def test_forward(self, mocker: MockerFixture): - vision_transformer = self.init_vision_transformer(mocker) - x = torch.randn(1, 3, 224, 224) - grid_thw = torch.tensor([[1, 4, 4]]) - mocker_patch_embed = mocker.patch.object( - vision_transformer, - "patch_embed", - side_effect=lambda _: torch.randn(16, 512), # noqa - ) - mocker_rot_pos_emb = mocker.patch.object( - vision_transformer, - "rot_pos_emb", - side_effect=lambda _: torch.randn(16, 64), # 
noqa - ) - mocker_get_window_index = mocker.patch.object( - vision_transformer, - "get_window_index", - side_effect=lambda _: (torch.arange(8), [4, 8, 12, 16]), # noqa - ) - mocker_cal_cos_sin = mocker.patch.object( - vision_transformer, - "cal_cos_sin", - side_effect=lambda _: - (torch.randn(16, 32), torch.randn(16, 32)), # noqa - ) - mocker_merger = mocker.patch.object( - vision_transformer, - "merger", - side_effect=lambda _: torch.randn(16, 256), # noqa - ) - vision_transformer.__dict__["vision_blocks"] = [ - lambda *args, **kwargs: torch.randn(16, 1, 512) # noqa - ] - vision_transformer.__dict__["patch_embed"] = mocker_patch_embed - vision_transformer.__dict__["rot_pos_emb"] = mocker_rot_pos_emb - vision_transformer.__dict__[ - "get_window_index"] = mocker_get_window_index - vision_transformer.__dict__["cal_cos_sin"] = mocker_cal_cos_sin - vision_transformer.__dict__["merger"] = mocker_merger - vision_transformer.__dict__["fullatt_block_indexes"] = [0, 2] - vision_transformer.__dict__["spatial_merge_unit"] = 2 - ret = vision_transformer.forward(x, grid_thw) - assert ret.shape == (8, 256) - mocker_patch_embed.assert_called_with(x) - mocker_rot_pos_emb.assert_called_with(grid_thw) - mocker_get_window_index.assert_called_with(grid_thw) - mocker_cal_cos_sin.assert_called_once() - mocker_merger.assert_called_once() - - -class TestAscendQwen2_5_VLForConditionalGeneration_Without_Padding(PytestBase): - - def test_init_vl_for_conditional_generation(self, mocker: MockerFixture): - vllm_config = mocker.MagicMock() - vllm_config.vision_config = "vision_config" - vllm_config.rms_norm_eps = 1e-5 - mocker.patch("torch.nn.Module.__setattr__") - mocker.patch("torch.nn.Module.__getattr__") - mocker.patch("torch.nn.Module.__delattr__") - mocker_vl = mocker.patch( - "vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VLForConditionalGeneration.__init__", - return_value=None, - ) - mocker_vit = mocker.patch( - "vllm_ascend.models.qwen2_5_vl_without_padding.AscendQwen2_5_VisionTransformer_Without_Padding.__init__", - return_value=None, - ) - - vl_for_conditional_generation = AscendQwen2_5_VLForConditionalGeneration_Without_Padding( - vllm_config=vllm_config) - args, kwargs = mocker_vl.call_args - assert not args - assert kwargs == {"vllm_config": vllm_config, "prefix": ""} - mocker_vit.assert_called_once() - assert isinstance( - vl_for_conditional_generation, - AscendQwen2_5_VLForConditionalGeneration_Without_Padding, - ) - - def test_overridden_methods(self): - self.assert_method_overridden( - AscendQwen2_5_VLForConditionalGeneration_Without_Padding, - Qwen2_5_VLForConditionalGeneration, - "_process_image_input", - ) - - self.assert_method_overridden( - AscendQwen2_5_VLForConditionalGeneration_Without_Padding, - Qwen2_5_VLForConditionalGeneration, - "_process_video_input", - ) - - @staticmethod - def assert_method_overridden(subclass, parent, method_name: str): - """assert subclass override parent method""" - parent_func = parent.__dict__.get(method_name) - child_func = subclass.__dict__.get(method_name) - - assert child_func is not None, f"{subclass.__name__} should defined {method_name}" - assert child_func is not parent_func, f"{method_name} should override in {subclass.__name__}" diff --git a/vllm_ascend/models/__init__.py b/vllm_ascend/models/__init__.py index 956df2eb315..8926aa985b9 100644 --- a/vllm_ascend/models/__init__.py +++ b/vllm_ascend/models/__init__.py @@ -1,7 +1,5 @@ from vllm import ModelRegistry -import vllm_ascend.envs as envs_ascend - def register_model(): ModelRegistry.register_model( 
@@ -18,17 +16,6 @@ def register_model(): "vllm_ascend.models.qwen2_5_vl_without_padding:AscendQwen3VLForConditionalGeneration" ) - if envs_ascend.USE_OPTIMIZED_MODEL: - ModelRegistry.register_model( - "Qwen2_5_VLForConditionalGeneration", - "vllm_ascend.models.qwen2_5_vl:AscendQwen2_5_VLForConditionalGeneration" - ) - else: - ModelRegistry.register_model( - "Qwen2_5_VLForConditionalGeneration", - "vllm_ascend.models.qwen2_5_vl_without_padding:AscendQwen2_5_VLForConditionalGeneration_Without_Padding" - ) - # There is no PanguProMoEForCausalLM in vLLM, so we should register it before vLLM config initialization # to make sure the model can be loaded correctly. This register step can be removed once vLLM support PanguProMoEForCausalLM. ModelRegistry.register_model( diff --git a/vllm_ascend/models/qwen2_5_vl.py b/vllm_ascend/models/qwen2_5_vl.py deleted file mode 100644 index 1bf7518b2c8..00000000000 --- a/vllm_ascend/models/qwen2_5_vl.py +++ /dev/null @@ -1,86 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Adapted from vllm/model_executor/models/qwen2_5_vl.py -# Copyright 2023 The vLLM team. -# -# This file is a part of the vllm-ascend project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import \ - Qwen2_5_VLConfig -from vllm.config import VllmConfig -from vllm.model_executor.models.qwen2_5_vl import ( - Qwen2_5_VisionTransformer, Qwen2_5_VLDummyInputsBuilder, - Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLMultiModalProcessor, - Qwen2_5_VLProcessingInfo) -from vllm.model_executor.models.utils import maybe_prefix -from vllm.multimodal import MULTIMODAL_REGISTRY - -from vllm_ascend.ascend_forward_context import set_ascend_forward_context - - -@MULTIMODAL_REGISTRY.register_processor( - Qwen2_5_VLMultiModalProcessor, - info=Qwen2_5_VLProcessingInfo, - dummy_inputs=Qwen2_5_VLDummyInputsBuilder) -class AscendQwen2_5_VLForConditionalGeneration( - Qwen2_5_VLForConditionalGeneration): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__(vllm_config=vllm_config, prefix=prefix) - config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - self.visual = Qwen2_5_VisionTransformer( - vision_config=config.vision_config, - norm_eps=getattr(config, "rms_norm_eps", 1e-6), - quant_config=quant_config, - prefix=maybe_prefix(prefix, "visual"), - ) - - def _process_image_input(self, image_input) -> tuple[torch.Tensor, ...]: - - grid_thw = image_input["image_grid_thw"] - assert grid_thw.ndim == 2 - - if image_input["type"] == "image_embeds": - image_embeds = image_input["image_embeds"].type(self.visual.dtype) - else: - pixel_values = image_input["pixel_values"].type(self.visual.dtype) - with set_ascend_forward_context(None, self.vllm_config): - image_embeds = self.visual(pixel_values, grid_thw=grid_thw) - - # Split concatenated embeddings for each image item. 
- merge_size = self.visual.spatial_merge_size - sizes = grid_thw.prod(-1) // merge_size // merge_size - return image_embeds.split(sizes.tolist()) - - def _process_video_input(self, video_input) -> tuple[torch.Tensor, ...]: - - grid_thw = video_input["video_grid_thw"] - assert grid_thw.ndim == 2 - - if video_input["type"] == "video_embeds": - video_embeds = video_input["video_embeds"].type(self.visual.dtype) - else: - pixel_values_videos = video_input["pixel_values_videos"].type( - self.visual.dtype) - with set_ascend_forward_context(None, self.vllm_config): - video_embeds = self.visual(pixel_values_videos, - grid_thw=grid_thw) - - # Split concatenated embeddings for each video item. - merge_size = self.visual.spatial_merge_size - sizes = grid_thw.prod(-1) // merge_size // merge_size - return video_embeds.split(sizes.tolist()) diff --git a/vllm_ascend/models/qwen2_5_vl_without_padding.py b/vllm_ascend/models/qwen2_5_vl_without_padding.py index d51a5aca9a6..c79e71e7197 100644 --- a/vllm_ascend/models/qwen2_5_vl_without_padding.py +++ b/vllm_ascend/models/qwen2_5_vl_without_padding.py @@ -22,10 +22,6 @@ import torch import torch.nn as nn import torch.nn.functional as F -import torch_npu -from einops import rearrange -from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import ( - Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig) try: from transformers.models.qwen3_vl.configuration_qwen3_vl import \ @@ -35,17 +31,10 @@ except ImportError: pass from vllm.config import VllmConfig -from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils -from vllm.model_executor.layers.activation import (_ACTIVATION_REGISTRY, - get_act_and_mul_fn) -from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.models.qwen2_5_vl import ( - Qwen2_5_VisionAttention, Qwen2_5_VisionBlock, Qwen2_5_VisionPatchEmbed, - Qwen2_5_VisionTransformer, Qwen2_5_VLDummyInputsBuilder, - Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLMultiModalProcessor, - Qwen2_5_VLProcessingInfo) +from vllm.model_executor.models.qwen2_5_vl import Qwen2_5_VisionAttention try: from vllm.model_executor.models.qwen3_vl import ( @@ -67,288 +56,6 @@ from vllm.model_executor.models.utils import WeightsMapper, maybe_prefix from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm_ascend.models.qwen2_5_vl import AscendQwen2_5_VisionRotaryEmbedding - - -class AscendQwen2_5_VisionAttention_Without_Padding(Qwen2_5_VisionAttention): - - def __init__( - self, - embed_dim: int, - num_heads: int, - projection_size: int, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: - super().__init__( - embed_dim, - num_heads, - projection_size, - quant_config, - prefix, - ) - self.embed_dim = embed_dim - self.hidden_size_per_attention_head = dist_utils.divide( - projection_size, num_heads) - - def forward( - self, - x: torch.Tensor, - cu_seqlens: torch.Tensor, - cos: torch.Tensor, - sin: torch.Tensor, - ) -> torch.Tensor: - # [s, b, c] --> [s, b, head * 3 * head_dim] - x, _ = self.qkv(x) - - # [s, b, 3 * head * head_dim] -> 3 * [s, b, head, head_dim] - q, k, v = self.split_qkv(x) - batch_size = q.shape[1] - - q, k, v = (rearrange(x, "s b ... 
-> b s ...").contiguous() - for x in (q, k, v)) - q = torch_npu.npu_rotary_mul(q, cos, sin) - k = torch_npu.npu_rotary_mul(k, cos, sin) - - q, k, v = [ - rearrange(x, "b s h d -> (b s) h d").contiguous() - for x in (q, k, v) - ] - - context_layer = torch.empty_like(q) - - # operator requires pta version >= 2.5.1.dev20250226 - torch_npu._npu_flash_attention_unpad( - query=q, - key=k, - value=v, - seq_len=cu_seqlens, - scale_value=self.hidden_size_per_attention_head**-0.5, - num_heads=self.num_attention_heads_per_partition, - num_kv_heads=self.num_attention_heads_per_partition, - out=context_layer) - - context_layer = rearrange(context_layer, - "(b s) h d -> s b (h d)", - b=batch_size).contiguous() - - output, _ = self.proj(context_layer) - return output - - -class AscendQwen2_5_VisionBlock_Without_Padding(Qwen2_5_VisionBlock): - - def __init__(self, - dim: int, - num_heads: int, - mlp_hidden_dim: int, - act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu, - norm_layer: Optional[Callable[[int], nn.Module]] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "") -> None: - super().__init__(dim=dim, - num_heads=num_heads, - mlp_hidden_dim=mlp_hidden_dim, - act_fn=act_fn, - norm_layer=norm_layer, - quant_config=quant_config, - prefix=prefix) - self.attn = AscendQwen2_5_VisionAttention_Without_Padding( - embed_dim=dim, - num_heads=num_heads, - projection_size=dim, - quant_config=quant_config, - prefix=f"{prefix}.attn") - - def forward(self, x: torch.Tensor, cu_seqlens: torch.Tensor, - cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor: - x = x + self.attn( - self.norm1(x), cu_seqlens=cu_seqlens, cos=cos, sin=sin) - - x = x + self.mlp(self.norm2(x)) - return x - - -class AscendQwen2_5_VisionPatchEmbed_Without_Padding(Qwen2_5_VisionPatchEmbed): - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x = x.matmul( - self.proj.weight.data.view(self.hidden_size, -1).transpose(0, 1)) - return x - - -class AscendQwen2_5_VisionTransformer_Without_Padding(Qwen2_5_VisionTransformer - ): - - def __init__( - self, - vision_config: Qwen2_5_VLVisionConfig, - norm_eps: float = 1e-6, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - interleaved=False, - ) -> None: - super().__init__(vision_config, norm_eps, quant_config, prefix) - norm_layer = partial(RMSNorm, eps=norm_eps) - self.interleaved = interleaved - head_dim = self.hidden_size // self.num_heads - self.rotary_pos_emb = AscendQwen2_5_VisionRotaryEmbedding(head_dim // - 2) - self.patch_embed = AscendQwen2_5_VisionPatchEmbed_Without_Padding( - patch_size=vision_config.patch_size, - temporal_patch_size=vision_config.temporal_patch_size, - in_channels=vision_config.in_channels, - hidden_size=self.hidden_size, - ) - - act_fn = get_act_and_mul_fn(vision_config.hidden_act) - self.blocks = nn.ModuleList([ - AscendQwen2_5_VisionBlock_Without_Padding( - dim=self.hidden_size, - num_heads=self.num_heads, - mlp_hidden_dim=vision_config.intermediate_size, - act_fn=act_fn, - norm_layer=norm_layer, - quant_config=quant_config, - prefix=f"{prefix}.blocks.{layer_idx}") - for layer_idx in range(vision_config.depth) - ]) - self.tp_size = parallel_state.get_tensor_model_parallel_world_size() - self.tp_rank = parallel_state.get_tensor_model_parallel_rank() - self.hidden_size_per_attention_head = dist_utils.divide( - self.hidden_size, self.num_heads) - - def cal_cos_sin(self, rotary_pos_emb): - cos = rotary_pos_emb.cos() # [seqlen, rotary_dim / 2] - sin = rotary_pos_emb.sin() - - if not self.interleaved: - cos_new = 
torch.cat((cos, cos), dim=-1) - sin_new = torch.cat((sin, sin), dim=-1) - else: - cos_new = rearrange(torch.stack((cos, cos), dim=-1), - "... d two -> ...(d two)", - two=2) - sin_new = rearrange(torch.stack((sin, sin), dim=-1), - "... d two -> ...(d two)", - two=2) - cos_new = cos_new.reshape(1, -1, 1, - self.hidden_size_per_attention_head) - sin_new = sin_new.reshape(1, -1, 1, - self.hidden_size_per_attention_head) - return cos_new, sin_new - - def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: - pos_ids = [] - for t, h, w in grid_thw: - hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) - wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) - hpos_ids = hpos_ids.reshape( - h // self.spatial_merge_size, - self.spatial_merge_size, - w // self.spatial_merge_size, - self.spatial_merge_size, - ).permute(0, 2, 1, 3).flatten() - wpos_ids = wpos_ids.reshape( - h // self.spatial_merge_size, - self.spatial_merge_size, - w // self.spatial_merge_size, - self.spatial_merge_size, - ).permute(0, 2, 1, 3).flatten() - pos_ids.append( - torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) - pos_ids = torch.cat(pos_ids, dim=0) - max_grid_size = grid_thw[:, 1:].max() - rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) - rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) - return rotary_pos_emb - - def get_window_index(self, grid_thw): - window_index: list = [] - cu_window_seqlens: list = [0] - window_index_id = 0 - vit_merger_window_size = (self.window_size // - self.spatial_merge_size // self.patch_size) - - for grid_t, grid_h, grid_w in grid_thw: - llm_grid_h = grid_h // self.spatial_merge_size - llm_grid_w = grid_w // self.spatial_merge_size - index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape( - grid_t, llm_grid_h, llm_grid_w) - pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size - pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size - num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size - num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size - index_padded = F.pad(index, (0, pad_w, 0, pad_h), 'constant', -100) - index_padded = index_padded.reshape(grid_t, num_windows_h, - vit_merger_window_size, - num_windows_w, - vit_merger_window_size) - index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape( - grid_t, num_windows_h * num_windows_w, vit_merger_window_size, - vit_merger_window_size) - seqlens = (index_padded != -100).sum([2, 3]).reshape(-1) - index_padded = index_padded.reshape(-1) - index_new = index_padded[index_padded != -100] - window_index.append(index_new + window_index_id) - cu_seqlens_tmp = seqlens.cumsum( - 0) * self.spatial_merge_unit + cu_window_seqlens[-1] - cu_window_seqlens.extend(cu_seqlens_tmp.tolist()) - window_index_id += (grid_t * llm_grid_h * llm_grid_w).item() - window_index = torch.cat(window_index, dim=0) - return window_index, cu_window_seqlens - - def forward( - self, - x: torch.Tensor, - grid_thw: torch.Tensor, - ) -> torch.Tensor: - # compute cu_seqlens - cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], - grid_thw[:, - 0]).cpu().to(torch.int32) - - # patchify - x = self.patch_embed(x) - - # compute position embedding - rotary_pos_emb = self.rot_pos_emb(grid_thw) - - # windows attention - window_index, cu_window_seqlens = self.get_window_index(grid_thw) - cu_window_seqlens = torch.tensor( - cu_window_seqlens, - device=x.device, - dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32) - cu_window_seqlens = 
torch.unique_consecutive(cu_window_seqlens) - cu_window_seqlens = torch.diff(cu_window_seqlens).cpu().to(torch.int32) - seq_len, _ = x.size() - x = x.reshape(seq_len // self.spatial_merge_unit, - self.spatial_merge_unit, -1) - x = x[window_index, :, :] - x = x.reshape(seq_len, -1) - rotary_pos_emb = rotary_pos_emb.reshape( - seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1) - rotary_pos_emb = rotary_pos_emb[window_index, :, :] - rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1) - - cos, sin = self.cal_cos_sin(rotary_pos_emb) - - # transformers - x = x.unsqueeze(1) - for layer_num, blk in enumerate(self.blocks): - if layer_num in self.fullatt_block_indexes: - cu_seqlens_now = cu_seqlens - else: - cu_seqlens_now = cu_window_seqlens - x = blk(x, cu_seqlens=cu_seqlens_now, cos=cos, sin=sin) - - # adapter - x = self.merger(x) - reverse_indices = torch.argsort(window_index) - x = x[reverse_indices, :] - return x - class AscendQwen3_VisionPatchEmbed(Qwen3_VisionPatchEmbed): @@ -372,21 +79,13 @@ def __init__( prefix: str = "", use_data_parallel: bool = False, ) -> None: - super().__init__(dim=dim, - num_heads=num_heads, - mlp_hidden_dim=mlp_hidden_dim, - act_fn=act_fn, - norm_layer=norm_layer, - quant_config=quant_config, - prefix=prefix, - use_data_parallel=use_data_parallel) - - self.attn = AscendQwen2_5_VisionAttention_Without_Padding( - embed_dim=dim, - num_heads=num_heads, - projection_size=dim, - quant_config=quant_config, - prefix=f"{prefix}.attn") + super().__init__(dim, num_heads, mlp_hidden_dim, act_fn, norm_layer, + quant_config, prefix, use_data_parallel) + self.attn = Qwen2_5_VisionAttention(embed_dim=dim, + num_heads=num_heads, + projection_size=dim, + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward(self, x: torch.Tensor, cu_seqlens: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor: @@ -484,58 +183,6 @@ def forward( return hidden_states -@MULTIMODAL_REGISTRY.register_processor( - Qwen2_5_VLMultiModalProcessor, - info=Qwen2_5_VLProcessingInfo, - dummy_inputs=Qwen2_5_VLDummyInputsBuilder) -class AscendQwen2_5_VLForConditionalGeneration_Without_Padding( - Qwen2_5_VLForConditionalGeneration): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__(vllm_config=vllm_config, prefix=prefix) - config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - self.visual = AscendQwen2_5_VisionTransformer_Without_Padding( - vision_config=config.vision_config, - norm_eps=getattr(config, "rms_norm_eps", 1e-6), - quant_config=quant_config, - prefix=maybe_prefix(prefix, "visual"), - ) - - def _process_image_input(self, image_input) -> tuple[torch.Tensor, ...]: - - grid_thw = image_input["image_grid_thw"] - assert grid_thw.ndim == 2 - - if image_input["type"] == "image_embeds": - image_embeds = image_input["image_embeds"].type(self.visual.dtype) - else: - pixel_values = image_input["pixel_values"].type(self.visual.dtype) - image_embeds = self.visual(pixel_values, grid_thw=grid_thw) - - # Split concatenated embeddings for each image item. 
- merge_size = self.visual.spatial_merge_size - sizes = grid_thw.prod(-1) // merge_size // merge_size - return image_embeds.split(sizes.tolist()) - - def _process_video_input(self, video_input) -> tuple[torch.Tensor, ...]: - - grid_thw = video_input["video_grid_thw"] - assert grid_thw.ndim == 2 - - if video_input["type"] == "video_embeds": - video_embeds = video_input["video_embeds"].type(self.visual.dtype) - else: - pixel_values_videos = video_input["pixel_values_videos"].type( - self.visual.dtype) - video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw) - - # Split concatenated embeddings for each video item. - merge_size = self.visual.spatial_merge_size - sizes = grid_thw.prod(-1) // merge_size // merge_size - return video_embeds.split(sizes.tolist()) - - @MULTIMODAL_REGISTRY.register_processor(Qwen3VLMultiModalProcessor, info=Qwen3VLProcessingInfo, dummy_inputs=Qwen3VLDummyInputsBuilder) diff --git a/vllm_ascend/patch/worker/patch_qwen2_5_vl.py b/vllm_ascend/patch/worker/patch_qwen2_5_vl.py index d2115285beb..c438445576b 100644 --- a/vllm_ascend/patch/worker/patch_qwen2_5_vl.py +++ b/vllm_ascend/patch/worker/patch_qwen2_5_vl.py @@ -22,12 +22,55 @@ import torch.nn.functional as F import torch_npu from einops import rearrange -from vllm.model_executor.models.qwen2_5_vl import Qwen2_5_VisionAttention +from vllm.model_executor.models.qwen2_5_vl import ( + Qwen2_5_VisionAttention, Qwen2_5_VLForConditionalGeneration) + +import vllm_ascend.envs as envs_ascend +from vllm_ascend.ascend_forward_context import set_ascend_forward_context MIN_PAD_SIZE = 64 # min_size to pad weight MAX_PAD_SIZE = 128 # max_size to pad weight +class AscendQwen2_5_VLForConditionalGeneration(nn.Module): + + def _process_image_input(self, image_input) -> tuple[torch.Tensor, ...]: + + grid_thw = image_input["image_grid_thw"] + assert grid_thw.ndim == 2 + + if image_input["type"] == "image_embeds": + image_embeds = image_input["image_embeds"].type(self.visual.dtype) + else: + pixel_values = image_input["pixel_values"].type(self.visual.dtype) + with set_ascend_forward_context(None, self.vllm_config): + image_embeds = self.visual(pixel_values, grid_thw=grid_thw) + + # Split concatenated embeddings for each image item. + merge_size = self.visual.spatial_merge_size + sizes = grid_thw.prod(-1) // merge_size // merge_size + return image_embeds.split(sizes.tolist()) + + def _process_video_input(self, video_input) -> tuple[torch.Tensor, ...]: + + grid_thw = video_input["video_grid_thw"] + assert grid_thw.ndim == 2 + + if video_input["type"] == "video_embeds": + video_embeds = video_input["video_embeds"].type(self.visual.dtype) + else: + pixel_values_videos = video_input["pixel_values_videos"].type( + self.visual.dtype) + with set_ascend_forward_context(None, self.vllm_config): + video_embeds = self.visual(pixel_values_videos, + grid_thw=grid_thw) + + # Split concatenated embeddings for each video item. 
+ merge_size = self.visual.spatial_merge_size + sizes = grid_thw.prod(-1) // merge_size // merge_size + return video_embeds.split(sizes.tolist()) + + @contextmanager def _padding_manager( q: torch.Tensor, @@ -38,7 +81,8 @@ def _padding_manager( origin_shape: int, hidden_size_per_attention_head: int, ): - enable_pad = (hidden_size_per_attention_head > MIN_PAD_SIZE + enable_pad = (envs_ascend.USE_OPTIMIZED_MODEL + and hidden_size_per_attention_head > MIN_PAD_SIZE and hidden_size_per_attention_head < MAX_PAD_SIZE) if enable_pad: @@ -145,3 +189,5 @@ def forward( Qwen2_5_VisionAttention.forward = AscendQwen2_5_VisionAttention.forward +Qwen2_5_VLForConditionalGeneration._process_image_input = AscendQwen2_5_VLForConditionalGeneration._process_image_input +Qwen2_5_VLForConditionalGeneration._process_video_input = AscendQwen2_5_VLForConditionalGeneration._process_video_input From 73ab6c4bc49c41530490196ff5bd32a8f3b1a7cd Mon Sep 17 00:00:00 2001 From: shen-shanshan <467638484@qq.com> Date: Tue, 25 Nov 2025 07:54:43 +0000 Subject: [PATCH 07/16] update Signed-off-by: shen-shanshan <467638484@qq.com> --- vllm_ascend/patch/worker/__init__.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py index d9bf4730e51..0fef597aa44 100644 --- a/vllm_ascend/patch/worker/__init__.py +++ b/vllm_ascend/patch/worker/__init__.py @@ -28,9 +28,3 @@ import vllm_ascend.patch.worker.patch_multimodal_merge # noqa import vllm_ascend.patch.worker.patch_minicpm # noqa import vllm_ascend.patch.worker.patch_qwen2_5_vl # noqa - -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.11.0"): - import vllm_ascend.patch.worker.patch_deepseek_mtp # noqa - import vllm_ascend.patch.worker.patch_deepseek_v3_2 # noqa From 26b12d643ffbd3424faa6492a9f536b250d1b9da Mon Sep 17 00:00:00 2001 From: shen-shanshan <467638484@qq.com> Date: Tue, 25 Nov 2025 09:13:24 +0000 Subject: [PATCH 08/16] add get_forward_context_manager interface Signed-off-by: shen-shanshan <467638484@qq.com> --- vllm_ascend/patch/worker/patch_qwen2_5_vl.py | 45 +------------------- vllm_ascend/platform.py | 6 +++ 2 files changed, 7 insertions(+), 44 deletions(-) diff --git a/vllm_ascend/patch/worker/patch_qwen2_5_vl.py b/vllm_ascend/patch/worker/patch_qwen2_5_vl.py index c438445576b..3bcde0ff244 100644 --- a/vllm_ascend/patch/worker/patch_qwen2_5_vl.py +++ b/vllm_ascend/patch/worker/patch_qwen2_5_vl.py @@ -22,55 +22,14 @@ import torch.nn.functional as F import torch_npu from einops import rearrange -from vllm.model_executor.models.qwen2_5_vl import ( - Qwen2_5_VisionAttention, Qwen2_5_VLForConditionalGeneration) +from vllm.model_executor.models.qwen2_5_vl import Qwen2_5_VisionAttention import vllm_ascend.envs as envs_ascend -from vllm_ascend.ascend_forward_context import set_ascend_forward_context MIN_PAD_SIZE = 64 # min_size to pad weight MAX_PAD_SIZE = 128 # max_size to pad weight -class AscendQwen2_5_VLForConditionalGeneration(nn.Module): - - def _process_image_input(self, image_input) -> tuple[torch.Tensor, ...]: - - grid_thw = image_input["image_grid_thw"] - assert grid_thw.ndim == 2 - - if image_input["type"] == "image_embeds": - image_embeds = image_input["image_embeds"].type(self.visual.dtype) - else: - pixel_values = image_input["pixel_values"].type(self.visual.dtype) - with set_ascend_forward_context(None, self.vllm_config): - image_embeds = self.visual(pixel_values, grid_thw=grid_thw) - - # Split concatenated embeddings for each image item. 
- merge_size = self.visual.spatial_merge_size - sizes = grid_thw.prod(-1) // merge_size // merge_size - return image_embeds.split(sizes.tolist()) - - def _process_video_input(self, video_input) -> tuple[torch.Tensor, ...]: - - grid_thw = video_input["video_grid_thw"] - assert grid_thw.ndim == 2 - - if video_input["type"] == "video_embeds": - video_embeds = video_input["video_embeds"].type(self.visual.dtype) - else: - pixel_values_videos = video_input["pixel_values_videos"].type( - self.visual.dtype) - with set_ascend_forward_context(None, self.vllm_config): - video_embeds = self.visual(pixel_values_videos, - grid_thw=grid_thw) - - # Split concatenated embeddings for each video item. - merge_size = self.visual.spatial_merge_size - sizes = grid_thw.prod(-1) // merge_size // merge_size - return video_embeds.split(sizes.tolist()) - - @contextmanager def _padding_manager( q: torch.Tensor, @@ -189,5 +148,3 @@ def forward( Qwen2_5_VisionAttention.forward = AscendQwen2_5_VisionAttention.forward -Qwen2_5_VLForConditionalGeneration._process_image_input = AscendQwen2_5_VLForConditionalGeneration._process_image_input -Qwen2_5_VLForConditionalGeneration._process_video_input = AscendQwen2_5_VLForConditionalGeneration._process_video_input diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 0797da3270e..3320aab51ec 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -413,3 +413,9 @@ def support_hybrid_kv_cache(cls) -> bool: @classmethod def support_static_graph_mode(cls) -> bool: return True + + @classmethod + def get_forward_context_manager(cls): + from vllm_ascend.ascend_forward_context import \ + set_ascend_forward_context + return set_ascend_forward_context From 0310c88b184d16f23cc3841bb0f73fc937e7bb55 Mon Sep 17 00:00:00 2001 From: shen-shanshan <467638484@qq.com> Date: Wed, 26 Nov 2025 07:48:55 +0000 Subject: [PATCH 09/16] modify pad logic Signed-off-by: shen-shanshan <467638484@qq.com> --- vllm_ascend/patch/worker/patch_qwen2_5_vl.py | 229 +++++++++++-------- 1 file changed, 132 insertions(+), 97 deletions(-) diff --git a/vllm_ascend/patch/worker/patch_qwen2_5_vl.py b/vllm_ascend/patch/worker/patch_qwen2_5_vl.py index 3bcde0ff244..47427aa6129 100644 --- a/vllm_ascend/patch/worker/patch_qwen2_5_vl.py +++ b/vllm_ascend/patch/worker/patch_qwen2_5_vl.py @@ -15,70 +15,24 @@ # limitations under the License. 
# -from contextlib import contextmanager - +import einops import torch import torch.nn as nn import torch.nn.functional as F import torch_npu from einops import rearrange -from vllm.model_executor.models.qwen2_5_vl import Qwen2_5_VisionAttention +from vllm.model_executor.models.qwen2_5_vl import ( + Qwen2_5_VisionAttention, Qwen2_5_VLForConditionalGeneration, + Qwen2_5_VLImageInputs, Qwen2_5_VLVideoInputs) +from vllm.model_executor.models.vision import run_dp_sharded_mrope_vision_model import vllm_ascend.envs as envs_ascend +from vllm_ascend.ascend_forward_context import set_ascend_forward_context MIN_PAD_SIZE = 64 # min_size to pad weight MAX_PAD_SIZE = 128 # max_size to pad weight -@contextmanager -def _padding_manager( - q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - cos: torch.Tensor, - sin: torch.Tensor, - origin_shape: int, - hidden_size_per_attention_head: int, -): - enable_pad = (envs_ascend.USE_OPTIMIZED_MODEL - and hidden_size_per_attention_head > MIN_PAD_SIZE - and hidden_size_per_attention_head < MAX_PAD_SIZE) - - if enable_pad: - half_pad_hidden_size_per_attention_head = ( - MAX_PAD_SIZE - hidden_size_per_attention_head) // 2 - hidden_size_per_attention_head = MAX_PAD_SIZE - - pad_len = MAX_PAD_SIZE - origin_shape - # q/k/v: [b, s, head, head_dim] -> [b, s, head, MAX_PAD_SIZE] - q = F.pad(q, (0, pad_len), mode="constant", value=0) - k = F.pad(k, (0, pad_len), mode="constant", value=0) - v = F.pad(v, (0, pad_len), mode="constant", value=0) - # cos/sin: [seqlen, rotary_dim / 2] -> [b, s, head, MAX_PAD_SIZE / 2] - cos = torch.nn.functional.pad( - cos, (0, half_pad_hidden_size_per_attention_head)) - sin = torch.nn.functional.pad( - sin, (0, half_pad_hidden_size_per_attention_head)) - - cos = rearrange( - torch.stack((cos, cos), dim=-1), - "... d two -> ...(d two)", - two=2, - ) - sin = rearrange( - torch.stack((sin, sin), dim=-1), - "... d two -> ...(d two)", - two=2, - ) - cos = cos.reshape(1, -1, 1, hidden_size_per_attention_head) - sin = sin.reshape(1, -1, 1, hidden_size_per_attention_head) - - try: - yield (q, k, v, cos, sin, enable_pad) - finally: - pass - - class AscendQwen2_5_VisionAttention(nn.Module): def forward( @@ -92,59 +46,140 @@ def forward( ) -> torch.Tensor: # [s, b, c] --> [s, b, head * 3 * head_dim] x, _ = self.qkv(x) - - # [s, b, 3 * head * head_dim] -> 3 * [s, b, head, head_dim] - q, k, v = self.split_qkv(x) - batch_size = q.shape[1] + seq_len, batch_size, _ = x.shape + + # Split q k v. + qkv = einops.rearrange( + x, + "s b (three head head_dim) -> b s three head head_dim", + three=3, + head=self.num_attention_heads_per_partition, + ) + q, k, v = qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2] origin_shape = q.shape[-1] - q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() - for x in (q, k, v)) - # Convert cumulative tensor to intervals and move it to cpu. cu_seqlens = torch.diff(cu_seqlens).to("cpu") - with _padding_manager( - q=q, - k=k, - v=v, - cos=rotary_pos_emb_cos, - sin=rotary_pos_emb_sin, - origin_shape=origin_shape, - hidden_size_per_attention_head=self. 
- hidden_size_per_attention_head, - ) as (q, k, v, cos, sin, enable_pad): - q = torch_npu.npu_rotary_mul(q, cos, sin) - k = torch_npu.npu_rotary_mul(k, cos, sin) - - q, k, v = [ - rearrange(x, "b s h d -> (b s) h d").contiguous() - for x in (q, k, v) - ] - - context_layer = torch.empty_like(q) - - # operator requires pta version >= 2.5.1 - torch_npu._npu_flash_attention_unpad( - query=q, - key=k, - value=v, - seq_len=cu_seqlens, - scale_value=self.hidden_size_per_attention_head**-0.5, - num_heads=self.num_attention_heads_per_partition, - num_kv_heads=self.num_attention_heads_per_partition, - out=context_layer, - ) - - if enable_pad: - context_layer = context_layer[..., :origin_shape] - - context_layer = rearrange(context_layer, - "(b s) h d -> s b (h d)", - b=batch_size).contiguous() + cos = rotary_pos_emb_cos + sin = rotary_pos_emb_sin + cos = rearrange( + torch.stack((cos, cos), dim=-1), + "... d two -> ...(d two)", + two=2, + ) + sin = rearrange( + torch.stack((sin, sin), dim=-1), + "... d two -> ...(d two)", + two=2, + ) + cos = cos.reshape(1, -1, 1, self.hidden_size_per_attention_head) + sin = sin.reshape(1, -1, 1, self.hidden_size_per_attention_head) + q = torch_npu.npu_rotary_mul(q, cos, sin) + k = torch_npu.npu_rotary_mul(k, cos, sin) + + q, k, v = [ + rearrange(x, "b s h d -> (b s) h d").contiguous() + for x in (q, k, v) + ] + + enable_pad = (envs_ascend.USE_OPTIMIZED_MODEL + and self.hidden_size_per_attention_head > MIN_PAD_SIZE + and self.hidden_size_per_attention_head < MAX_PAD_SIZE) + + if enable_pad: + pad_len = MAX_PAD_SIZE - origin_shape + # q/k/v: [b * s, head, head_dim] -> [b * s, head, MAX_PAD_SIZE] + q = F.pad(q, (0, pad_len), mode="constant", value=0) + k = F.pad(k, (0, pad_len), mode="constant", value=0) + v = F.pad(v, (0, pad_len), mode="constant", value=0) + + context_layer = torch.empty_like(q) + + # operator requires pta version >= 2.5.1 + torch_npu._npu_flash_attention_unpad( + query=q, + key=k, + value=v, + seq_len=cu_seqlens, + scale_value=self.hidden_size_per_attention_head**-0.5, + num_heads=self.num_attention_heads_per_partition, + num_kv_heads=self.num_attention_heads_per_partition, + out=context_layer, + ) + + if enable_pad: + context_layer = context_layer[..., :origin_shape] + + context_layer = rearrange(context_layer, + "(b s) h d -> s b (h d)", + b=batch_size).contiguous() output, _ = self.proj(context_layer) return output +class AscendQwen2_5_VLForConditionalGeneration(nn.Module): + + def _process_image_input( + self, + image_input: Qwen2_5_VLImageInputs) -> tuple[torch.Tensor, ...]: + grid_thw = image_input["image_grid_thw"] + assert grid_thw.ndim == 2 + grid_thw_list = grid_thw.tolist() + + if image_input["type"] == "image_embeds": + image_embeds = image_input["image_embeds"].type(self.visual.dtype) + else: + pixel_values = image_input["pixel_values"] + with set_ascend_forward_context(None, self.vllm_config): + if self.use_data_parallel: + return run_dp_sharded_mrope_vision_model( + self.visual, + pixel_values, + grid_thw_list, + rope_type="rope_3d") + else: + image_embeds = self.visual(pixel_values, + grid_thw=grid_thw_list) + + # Split concatenated embeddings for each image item. 
+ merge_size = self.visual.spatial_merge_size + sizes = (grid_thw.prod(-1) // merge_size // merge_size).tolist() + return image_embeds.split(sizes) + + def _process_video_input( + self, + video_input: Qwen2_5_VLVideoInputs) -> tuple[torch.Tensor, ...]: + grid_thw = video_input["video_grid_thw"] + assert grid_thw.ndim == 2 + grid_thw_list = grid_thw.tolist() + + if video_input["type"] == "video_embeds": + video_embeds = video_input["video_embeds"].type(self.visual.dtype) + else: + pixel_values_videos = video_input["pixel_values_videos"] + with set_ascend_forward_context(None, self.vllm_config): + if self.use_data_parallel: + return run_dp_sharded_mrope_vision_model( + self.visual, + pixel_values_videos, + grid_thw_list, + rope_type="rope_3d", + ) + else: + video_embeds = self.visual(pixel_values_videos, + grid_thw=grid_thw_list) + + # Split concatenated embeddings for each video item. + merge_size = self.visual.spatial_merge_size + sizes = (grid_thw.prod(-1) // merge_size // merge_size).tolist() + return video_embeds.split(sizes) + + +# NOTE: This can be removed after MMEncoderAttention has been extract as a CustomOp in vllm. Qwen2_5_VisionAttention.forward = AscendQwen2_5_VisionAttention.forward + +# NOTE: This can be removed after https://github.com/vllm-project/vllm/pull/29388 is merged. +Qwen2_5_VLForConditionalGeneration._process_image_input = AscendQwen2_5_VLForConditionalGeneration._process_image_input +Qwen2_5_VLForConditionalGeneration._process_video_input = AscendQwen2_5_VLForConditionalGeneration._process_video_input From a70e857d810bb045f061b2481ac7671009d465ec Mon Sep 17 00:00:00 2001 From: shen-shanshan <467638484@qq.com> Date: Thu, 27 Nov 2025 07:39:29 +0000 Subject: [PATCH 10/16] add patch for rope Signed-off-by: shen-shanshan <467638484@qq.com> --- vllm_ascend/models/__init__.py | 6 +- ...n2_5_vl_without_padding.py => qwen3_vl.py} | 0 vllm_ascend/patch/worker/__init__.py | 1 + vllm_ascend/patch/worker/patch_qwen2_5_vl.py | 334 +++++++++++++++++- vllm_ascend/patch/worker/patch_rope.py | 33 ++ 5 files changed, 358 insertions(+), 16 deletions(-) rename vllm_ascend/models/{qwen2_5_vl_without_padding.py => qwen3_vl.py} (100%) create mode 100644 vllm_ascend/patch/worker/patch_rope.py diff --git a/vllm_ascend/models/__init__.py b/vllm_ascend/models/__init__.py index 8926aa985b9..31eae8d7cbe 100644 --- a/vllm_ascend/models/__init__.py +++ b/vllm_ascend/models/__init__.py @@ -8,13 +8,11 @@ def register_model(): ModelRegistry.register_model( "Qwen3VLMoeForConditionalGeneration", - "vllm_ascend.models.qwen2_5_vl_without_padding:AscendQwen3VLMoeForConditionalGeneration" - ) + "vllm_ascend.models.qwen3_vl:AscendQwen3VLMoeForConditionalGeneration") ModelRegistry.register_model( "Qwen3VLForConditionalGeneration", - "vllm_ascend.models.qwen2_5_vl_without_padding:AscendQwen3VLForConditionalGeneration" - ) + "vllm_ascend.models.qwen3_vl:AscendQwen3VLForConditionalGeneration") # There is no PanguProMoEForCausalLM in vLLM, so we should register it before vLLM config initialization # to make sure the model can be loaded correctly. This register step can be removed once vLLM support PanguProMoEForCausalLM. 
diff --git a/vllm_ascend/models/qwen2_5_vl_without_padding.py b/vllm_ascend/models/qwen3_vl.py similarity index 100% rename from vllm_ascend/models/qwen2_5_vl_without_padding.py rename to vllm_ascend/models/qwen3_vl.py diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py index 0fef597aa44..faa57b6140f 100644 --- a/vllm_ascend/patch/worker/__init__.py +++ b/vllm_ascend/patch/worker/__init__.py @@ -28,3 +28,4 @@ import vllm_ascend.patch.worker.patch_multimodal_merge # noqa import vllm_ascend.patch.worker.patch_minicpm # noqa import vllm_ascend.patch.worker.patch_qwen2_5_vl # noqa +import vllm_ascend.patch.worker.patch_rope # noqa diff --git a/vllm_ascend/patch/worker/patch_qwen2_5_vl.py b/vllm_ascend/patch/worker/patch_qwen2_5_vl.py index 47427aa6129..98d038dc31f 100644 --- a/vllm_ascend/patch/worker/patch_qwen2_5_vl.py +++ b/vllm_ascend/patch/worker/patch_qwen2_5_vl.py @@ -15,16 +15,29 @@ # limitations under the License. # +from functools import lru_cache, partial + import einops import torch import torch.nn as nn import torch.nn.functional as F import torch_npu -from einops import rearrange +from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import \ + Qwen2_5_VLVisionConfig +from vllm.attention.backends.registry import AttentionBackendEnum +from vllm.attention.layer import maybe_get_vit_flash_attn_backend +from vllm.model_executor.layers.activation import get_act_and_mul_fn +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.models.qwen2_5_vl import ( - Qwen2_5_VisionAttention, Qwen2_5_VLForConditionalGeneration, - Qwen2_5_VLImageInputs, Qwen2_5_VLVideoInputs) -from vllm.model_executor.models.vision import run_dp_sharded_mrope_vision_model + Qwen2_5_VisionAttention, Qwen2_5_VisionBlock, Qwen2_5_VisionPatchEmbed, + Qwen2_5_VisionPatchMerger, Qwen2_5_VisionTransformer, + Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLImageInputs, + Qwen2_5_VLVideoInputs) +from vllm.model_executor.models.utils import cast_overflow_tensors +from vllm.model_executor.models.vision import ( + get_vit_attn_backend, run_dp_sharded_mrope_vision_model) import vllm_ascend.envs as envs_ascend from vllm_ascend.ascend_forward_context import set_ascend_forward_context @@ -63,12 +76,12 @@ def forward( cos = rotary_pos_emb_cos sin = rotary_pos_emb_sin - cos = rearrange( + cos = einops.rearrange( torch.stack((cos, cos), dim=-1), "... d two -> ...(d two)", two=2, ) - sin = rearrange( + sin = einops.rearrange( torch.stack((sin, sin), dim=-1), "... 
d two -> ...(d two)", two=2, @@ -79,7 +92,7 @@ def forward( k = torch_npu.npu_rotary_mul(k, cos, sin) q, k, v = [ - rearrange(x, "b s h d -> (b s) h d").contiguous() + einops.rearrange(x, "b s h d -> (b s) h d").contiguous() for x in (q, k, v) ] @@ -111,14 +124,304 @@ def forward( if enable_pad: context_layer = context_layer[..., :origin_shape] - context_layer = rearrange(context_layer, - "(b s) h d -> s b (h d)", - b=batch_size).contiguous() + context_layer = einops.rearrange(context_layer, + "(b s) h d -> s b (h d)", + b=batch_size).contiguous() output, _ = self.proj(context_layer) return output +class AscendQwen2_5_VisionBlock(nn.Module): + + def forward( + self, + x: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb_cos: torch.Tensor, + rotary_pos_emb_sin: torch.Tensor, + max_seqlen: torch.Tensor, # Only used for Flash Attention + seqlens: torch.Tensor, # Only used for xFormers + ) -> torch.Tensor: + x_attn = self.attn( + self.norm1(x), + cu_seqlens=cu_seqlens, + rotary_pos_emb_cos=rotary_pos_emb_cos, + rotary_pos_emb_sin=rotary_pos_emb_sin, + max_seqlen=max_seqlen, + seqlens=seqlens, + ) + x_fused_norm, residual = self.norm2(x, residual=x_attn) + x = residual + self.mlp(x_fused_norm) + return x + + +class AscendQwen2_5_VisionTransformer(nn.Module): + + def __init__( + self, + vision_config: Qwen2_5_VLVisionConfig, + norm_eps: float = 1e-6, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + use_data_parallel: bool = False, + attn_backend_override: AttentionBackendEnum | None = None, + ) -> None: + nn.Module.__init__(self) + + patch_size = vision_config.patch_size + temporal_patch_size = vision_config.temporal_patch_size + in_channels = vision_config.in_channels + depth = vision_config.depth + self.hidden_size = vision_config.hidden_size + self.num_heads = vision_config.num_heads + self.use_data_parallel = use_data_parallel + self.out_hidden_size = vision_config.out_hidden_size + + # args for get_window_index_thw + self.window_size = vision_config.window_size + self.patch_size = vision_config.patch_size + self.spatial_merge_size = vision_config.spatial_merge_size + self.fullatt_block_indexes = vision_config.fullatt_block_indexes + self.spatial_merge_unit = self.spatial_merge_size**2 + # TODO[@lucaskabela]: Investigate fixing this usage + # see https://github.com/vllm-project/vllm/issues/27044 + # DO NOT MOVE THIS IMPORT + from vllm.compilation.backends import set_model_tag + + with set_model_tag("Qwen2_5_VisionPatchEmbed"): + self.patch_embed = Qwen2_5_VisionPatchEmbed( + patch_size=patch_size, + temporal_patch_size=temporal_patch_size, + in_channels=in_channels, + hidden_size=self.hidden_size, + ) + + norm_layer = partial(RMSNorm, eps=norm_eps) + head_dim = self.hidden_size // self.num_heads + self.rotary_pos_emb = get_rope( + head_size=head_dim, + rotary_dim=head_dim // 2, + max_position=8192, + base=10000.0, + is_neox_style=True, + ) + + use_upstream_fa = False + self.attn_backend = get_vit_attn_backend( + head_size=head_dim, + dtype=torch.get_default_dtype(), + attn_backend_override=attn_backend_override, + ) + + self.attn_backend, self.flash_attn_varlen_func = ( + maybe_get_vit_flash_attn_backend( + self.attn_backend, + use_upstream_fa, + attn_backend_override=attn_backend_override, + )) + + with set_model_tag("Qwen2_5_VisionBlock"): + self.blocks = nn.ModuleList([ + Qwen2_5_VisionBlock( + dim=self.hidden_size, + num_heads=self.num_heads, + mlp_hidden_dim=vision_config.intermediate_size, + 
act_fn=get_act_and_mul_fn(vision_config.hidden_act), + norm_layer=norm_layer, + quant_config=quant_config, + prefix=f"{prefix}.blocks.{layer_idx}", + use_data_parallel=use_data_parallel, + attn_backend=self.attn_backend, + use_upstream_fa=use_upstream_fa, + attn_backend_override=attn_backend_override, + ) for layer_idx in range(depth) + ]) + + with set_model_tag("Qwen2_5_VisionPatchMerger"): + self.merger = Qwen2_5_VisionPatchMerger( + d_model=vision_config.out_hidden_size, + context_dim=self.hidden_size, + norm_layer=norm_layer, + spatial_merge_size=self.spatial_merge_size, + quant_config=quant_config, + prefix=f"{prefix}.merger", + use_data_parallel=use_data_parallel, + ) + + def rotary_pos_emb_thw(self, t, h, w): + hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) + wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) + hpos_ids = (hpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ).permute(0, 2, 1, 3).flatten()) + wpos_ids = (wpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ).permute(0, 2, 1, 3).flatten()) + pos_ids = torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1) + max_size = max(h, w) + + # Use pre-computed cos_sin_cache from RotaryEmbedding + cos, sin = self.rotary_pos_emb.get_cos_sin(max_size) + + cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2) + cos_w = cos[pos_ids[:, 1]] + sin_h = sin[pos_ids[:, 0]] + sin_w = sin[pos_ids[:, 1]] + + cos_combined = torch.cat([cos_h, cos_w], dim=-1) + sin_combined = torch.cat([sin_h, sin_w], dim=-1) + + cos_combined = cos_combined.reshape( + cos_combined.shape[0] // self.spatial_merge_unit, + self.spatial_merge_unit, + -1, + ) + sin_combined = sin_combined.reshape( + sin_combined.shape[0] // self.spatial_merge_unit, + self.spatial_merge_unit, + -1, + ) + + return cos_combined, sin_combined + + @lru_cache(maxsize=1024) # noqa: B019 + def get_rope_by_thw(self, t, h, w): + window_index_thw, cu_seqlens_window_thw = self.get_window_index_thw( + t, h, w) + cos_thw, sin_thw = self.rotary_pos_emb_thw(t, h, w) + + cos_thw = cos_thw[window_index_thw, :, :] + cos_thw = cos_thw.flatten(start_dim=0, end_dim=1) + sin_thw = sin_thw[window_index_thw, :, :] + sin_thw = sin_thw.flatten(start_dim=0, end_dim=1) + + cu_seqlens_thw = torch.repeat_interleave( + torch.tensor([h * w], dtype=torch.int32), t) + return ( + cos_thw, + sin_thw, + window_index_thw, + cu_seqlens_window_thw, + cu_seqlens_thw, + ) + + def forward( + self, + x: torch.Tensor, + grid_thw: list[list[int]], + ) -> torch.Tensor: + # patchify + seq_len, _ = x.size() + rotary_pos_emb_cos = [] + rotary_pos_emb_sin = [] + window_index: list = [] + cu_window_seqlens: list = [torch.tensor([0], dtype=torch.int32)] + cu_seqlens: list = [] + + hidden_states = x.to(device=self.device, dtype=self.dtype) + hidden_states = self.patch_embed(hidden_states) + + window_index_id = 0 + cu_window_seqlens_last = 0 + for t, h, w in grid_thw: + t, h, w = int(t), int(h), int(w) + llm_h = h // self.spatial_merge_size + llm_w = w // self.spatial_merge_size + + ( + cos_thw, + sin_thw, + window_index_thw, + cu_seqlens_window_thw, + cu_seqlens_thw, + ) = self.get_rope_by_thw(t, h, w) + + window_index.append(window_index_thw + window_index_id) + window_index_id += t * llm_h * llm_w + + cu_seqlens_window_thw = cu_seqlens_window_thw + cu_window_seqlens_last + cu_window_seqlens_last = cu_seqlens_window_thw[-1] + 
cu_window_seqlens.append(cu_seqlens_window_thw) + + rotary_pos_emb_cos.append(cos_thw) + rotary_pos_emb_sin.append(sin_thw) + + cu_seqlens.append(cu_seqlens_thw) + + rotary_pos_emb_cos = torch.cat(rotary_pos_emb_cos) + rotary_pos_emb_sin = torch.cat(rotary_pos_emb_sin) + window_index = torch.cat(window_index) + # compute reverse indices + reverse_indices = self.invert_permutation(window_index) + cu_window_seqlens = torch.cat(cu_window_seqlens) + cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) + cu_seqlens = torch.cat(cu_seqlens) + cu_seqlens = torch.cumsum(cu_seqlens, dim=0, dtype=torch.int32) + cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0) + + # transformers + # pre-compute seqlens for window/full attn to reduce cuMemcpy operations + max_seqlen_full, seqlens_full = self.compute_attn_mask_seqlen( + cu_seqlens) + max_seqlen_window, seqlens_window = self.compute_attn_mask_seqlen( + cu_window_seqlens) + + cu_seqlens = cu_seqlens.to(device=self.device, non_blocking=True) + cu_window_seqlens = cu_window_seqlens.to(device=self.device, + non_blocking=True) + rotary_pos_emb_cos = rotary_pos_emb_cos.to(device=self.device, + non_blocking=True) + rotary_pos_emb_sin = rotary_pos_emb_sin.to(device=self.device, + non_blocking=True) + window_index = window_index.to(device=hidden_states.device, + non_blocking=True) + reverse_indices = reverse_indices.to(device=hidden_states.device, + non_blocking=True) + + hidden_states = hidden_states.reshape( + seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1) + hidden_states = hidden_states[window_index, :, :] + hidden_states = hidden_states.reshape(seq_len, -1) + + hidden_states = hidden_states.unsqueeze(1) + + for layer_num, blk in enumerate(self.blocks): + if layer_num in self.fullatt_block_indexes: + cu_seqlens_now = cu_seqlens + max_seqlen_now = max_seqlen_full + seqlens_now = seqlens_full + else: + cu_seqlens_now = cu_window_seqlens + max_seqlen_now = max_seqlen_window + seqlens_now = seqlens_window + + hidden_states = blk( + hidden_states, + cu_seqlens=cu_seqlens_now, + rotary_pos_emb_cos=rotary_pos_emb_cos, + rotary_pos_emb_sin=rotary_pos_emb_sin, + max_seqlen=max_seqlen_now, + seqlens=seqlens_now, + ) + + # For Qwen2.5-VL-3B, float16 will overflow at last block + # for long visual tokens sequences. + if hidden_states.dtype == torch.float16: + hidden_states = cast_overflow_tensors(hidden_states) + + # adapter + hidden_states = self.merger(hidden_states) + hidden_states = hidden_states[reverse_indices, :] + return hidden_states + + class AscendQwen2_5_VLForConditionalGeneration(nn.Module): def _process_image_input( @@ -177,9 +480,16 @@ def _process_video_input( return video_embeds.split(sizes) -# NOTE: This can be removed after MMEncoderAttention has been extract as a CustomOp in vllm. +# NOTE: This will be removed after MMEncoderAttention has been extract as a CustomOp in vllm. Qwen2_5_VisionAttention.forward = AscendQwen2_5_VisionAttention.forward -# NOTE: This can be removed after https://github.com/vllm-project/vllm/pull/29388 is merged. +# NOTE: These will be removed after https://github.com/vllm-project/vllm/pull/29388 is merged. Qwen2_5_VLForConditionalGeneration._process_image_input = AscendQwen2_5_VLForConditionalGeneration._process_image_input Qwen2_5_VLForConditionalGeneration._process_video_input = AscendQwen2_5_VLForConditionalGeneration._process_video_input + +# NOTE: These will be removed after vllm-ascend is aligned with vllm latest main. 
+Qwen2_5_VisionBlock.forward = AscendQwen2_5_VisionBlock.forward +Qwen2_5_VisionTransformer.__init__ = AscendQwen2_5_VisionTransformer.__init__ +Qwen2_5_VisionTransformer.rotary_pos_emb_thw = AscendQwen2_5_VisionTransformer.rotary_pos_emb_thw +Qwen2_5_VisionTransformer.get_rope_by_thw = AscendQwen2_5_VisionTransformer.get_rope_by_thw +Qwen2_5_VisionTransformer.forward = AscendQwen2_5_VisionTransformer.forward diff --git a/vllm_ascend/patch/worker/patch_rope.py b/vllm_ascend/patch/worker/patch_rope.py new file mode 100644 index 00000000000..cb40af86728 --- /dev/null +++ b/vllm_ascend/patch/worker/patch_rope.py @@ -0,0 +1,33 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import torch +import torch.nn as nn +from vllm.model_executor.layers.rotary_embedding.base import \ + RotaryEmbeddingBase + + +class AscendRotaryEmbeddingBase(nn.Module): + + def get_cos_sin(self, seqlen: int) -> tuple[torch.Tensor, torch.Tensor]: + cos_sin = self.cos_sin_cache[:seqlen] + cos, sin = cos_sin.chunk(2, dim=-1) + return cos, sin + + +# NOTE: These will be removed after vllm-ascend is aligned with vllm latest main. +RotaryEmbeddingBase.get_cos_sin = AscendRotaryEmbeddingBase.get_cos_sin From f0af5cc649b22a5f030b496d40de5af41d9f7d6d Mon Sep 17 00:00:00 2001 From: shen-shanshan <467638484@qq.com> Date: Thu, 27 Nov 2025 07:54:21 +0000 Subject: [PATCH 11/16] fix lint Signed-off-by: shen-shanshan <467638484@qq.com> --- vllm_ascend/patch/worker/patch_qwen2_5_vl.py | 24 ++++++++++++-------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/vllm_ascend/patch/worker/patch_qwen2_5_vl.py b/vllm_ascend/patch/worker/patch_qwen2_5_vl.py index 98d038dc31f..8fb79b53cf8 100644 --- a/vllm_ascend/patch/worker/patch_qwen2_5_vl.py +++ b/vllm_ascend/patch/worker/patch_qwen2_5_vl.py @@ -373,15 +373,21 @@ def forward( max_seqlen_window, seqlens_window = self.compute_attn_mask_seqlen( cu_window_seqlens) - cu_seqlens = cu_seqlens.to(device=self.device, non_blocking=True) - cu_window_seqlens = cu_window_seqlens.to(device=self.device, - non_blocking=True) - rotary_pos_emb_cos = rotary_pos_emb_cos.to(device=self.device, - non_blocking=True) - rotary_pos_emb_sin = rotary_pos_emb_sin.to(device=self.device, - non_blocking=True) - window_index = window_index.to(device=hidden_states.device, - non_blocking=True) + cu_seqlens = cu_seqlens.to( + device=self.device, + non_blocking=True) # type: ignore[attr-defined] + cu_window_seqlens = cu_window_seqlens.to( + device=self.device, + non_blocking=True) # type: ignore[attr-defined] + rotary_pos_emb_cos = rotary_pos_emb_cos.to( + device=self.device, + non_blocking=True) # type: ignore[attr-defined] + rotary_pos_emb_sin = rotary_pos_emb_sin.to( + device=self.device, + non_blocking=True) # type: ignore[attr-defined] + window_index = window_index.to( + device=hidden_states.device, + non_blocking=True) # type: ignore[attr-defined] reverse_indices = 
                                              non_blocking=True)

From f3a6ad84d9437f54e576e52e944326d457f6391d Mon Sep 17 00:00:00 2001
From: shen-shanshan <467638484@qq.com>
Date: Thu, 27 Nov 2025 08:03:19 +0000
Subject: [PATCH 12/16] fix lint

Signed-off-by: shen-shanshan <467638484@qq.com>
---
 vllm_ascend/patch/worker/patch_qwen2_5_vl.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/vllm_ascend/patch/worker/patch_qwen2_5_vl.py b/vllm_ascend/patch/worker/patch_qwen2_5_vl.py
index 8fb79b53cf8..40bff0bc18e 100644
--- a/vllm_ascend/patch/worker/patch_qwen2_5_vl.py
+++ b/vllm_ascend/patch/worker/patch_qwen2_5_vl.py
@@ -375,19 +375,19 @@ def forward(
         cu_seqlens = cu_seqlens.to(
             device=self.device,
-            non_blocking=True)  # type: ignore[attr-defined]
+            non_blocking=True)  # type: ignore[attr-defined]  # noqa: F821
         cu_window_seqlens = cu_window_seqlens.to(
             device=self.device,
-            non_blocking=True)  # type: ignore[attr-defined]
+            non_blocking=True)  # type: ignore[attr-defined]  # noqa: F821
         rotary_pos_emb_cos = rotary_pos_emb_cos.to(
             device=self.device,
-            non_blocking=True)  # type: ignore[attr-defined]
+            non_blocking=True)  # type: ignore[attr-defined]  # noqa: F821
         rotary_pos_emb_sin = rotary_pos_emb_sin.to(
             device=self.device,
-            non_blocking=True)  # type: ignore[attr-defined]
+            non_blocking=True)  # type: ignore[attr-defined]  # noqa: F821
         window_index = window_index.to(
             device=hidden_states.device,
-            non_blocking=True)  # type: ignore[attr-defined]
+            non_blocking=True)  # type: ignore[attr-defined]  # noqa: F821
         reverse_indices = reverse_indices.to(device=hidden_states.device,
                                              non_blocking=True)

From e02e99c5f0b02a322913053671023ffbf584ba9b Mon Sep 17 00:00:00 2001
From: shen-shanshan <467638484@qq.com>
Date: Thu, 27 Nov 2025 08:16:06 +0000
Subject: [PATCH 13/16] fix lint

Signed-off-by: shen-shanshan <467638484@qq.com>
---
 vllm_ascend/patch/worker/patch_qwen2_5_vl.py | 25 ++++++++------------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/vllm_ascend/patch/worker/patch_qwen2_5_vl.py b/vllm_ascend/patch/worker/patch_qwen2_5_vl.py
index 40bff0bc18e..67b7d1f59bd 100644
--- a/vllm_ascend/patch/worker/patch_qwen2_5_vl.py
+++ b/vllm_ascend/patch/worker/patch_qwen2_5_vl.py
@@ -373,21 +373,16 @@ def forward(
         max_seqlen_window, seqlens_window = self.compute_attn_mask_seqlen(
             cu_window_seqlens)
 
-        cu_seqlens = cu_seqlens.to(
-            device=self.device,
-            non_blocking=True)  # type: ignore[attr-defined]  # noqa: F821
-        cu_window_seqlens = cu_window_seqlens.to(
-            device=self.device,
-            non_blocking=True)  # type: ignore[attr-defined]  # noqa: F821
-        rotary_pos_emb_cos = rotary_pos_emb_cos.to(
-            device=self.device,
-            non_blocking=True)  # type: ignore[attr-defined]  # noqa: F821
-        rotary_pos_emb_sin = rotary_pos_emb_sin.to(
-            device=self.device,
-            non_blocking=True)  # type: ignore[attr-defined]  # noqa: F821
-        window_index = window_index.to(
-            device=hidden_states.device,
-            non_blocking=True)  # type: ignore[attr-defined]  # noqa: F821
+        cu_seqlens: torch.Tensor = cu_seqlens.to(device=self.device,
+                                                 non_blocking=True)
+        cu_window_seqlens: torch.Tensor = cu_window_seqlens.to(
+            device=self.device, non_blocking=True)
+        rotary_pos_emb_cos: torch.Tensor = rotary_pos_emb_cos.to(
+            device=self.device, non_blocking=True)
+        rotary_pos_emb_sin: torch.Tensor = rotary_pos_emb_sin.to(
+            device=self.device, non_blocking=True)
+        window_index: torch.Tensor = window_index.to(
+            device=hidden_states.device, non_blocking=True)
         reverse_indices = reverse_indices.to(device=hidden_states.device,
                                              non_blocking=True)

From 523ab296a234d1799124e04b7c266812acbe3573 Mon Sep 17 00:00:00 2001
From: shen-shanshan <467638484@qq.com>
Date: Thu, 27 Nov 2025 08:31:14 +0000
Subject: [PATCH 14/16] fix lint

Signed-off-by: shen-shanshan <467638484@qq.com>
---
 vllm_ascend/patch/worker/patch_qwen2_5_vl.py | 29 ++++++++++----------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/vllm_ascend/patch/worker/patch_qwen2_5_vl.py b/vllm_ascend/patch/worker/patch_qwen2_5_vl.py
index 67b7d1f59bd..35db1ea3214 100644
--- a/vllm_ascend/patch/worker/patch_qwen2_5_vl.py
+++ b/vllm_ascend/patch/worker/patch_qwen2_5_vl.py
@@ -319,11 +319,13 @@ def forward(
     ) -> torch.Tensor:
         # patchify
         seq_len, _ = x.size()
-        rotary_pos_emb_cos = []
-        rotary_pos_emb_sin = []
-        window_index: list = []
-        cu_window_seqlens: list = [torch.tensor([0], dtype=torch.int32)]
-        cu_seqlens: list = []
+        rotary_pos_emb_cos: list[torch.Tensor] = []
+        rotary_pos_emb_sin: list[torch.Tensor] = []
+        window_index: list[torch.Tensor] = []
+        cu_window_seqlens: list[torch.Tensor] = [
+            torch.tensor([0], dtype=torch.int32)
+        ]
+        cu_seqlens: list[torch.Tensor] = []
 
         hidden_states = x.to(device=self.device, dtype=self.dtype)
         hidden_states = self.patch_embed(hidden_states)
@@ -373,16 +375,15 @@ def forward(
         max_seqlen_window, seqlens_window = self.compute_attn_mask_seqlen(
             cu_window_seqlens)
 
-        cu_seqlens: torch.Tensor = cu_seqlens.to(device=self.device,
+        cu_seqlens = cu_seqlens.to(device=self.device, non_blocking=True)
+        cu_window_seqlens = cu_window_seqlens.to(device=self.device,
                                                  non_blocking=True)
-        cu_window_seqlens: torch.Tensor = cu_window_seqlens.to(
-            device=self.device, non_blocking=True)
-        rotary_pos_emb_cos: torch.Tensor = rotary_pos_emb_cos.to(
-            device=self.device, non_blocking=True)
-        rotary_pos_emb_sin: torch.Tensor = rotary_pos_emb_sin.to(
-            device=self.device, non_blocking=True)
-        window_index: torch.Tensor = window_index.to(
-            device=hidden_states.device, non_blocking=True)
+        rotary_pos_emb_cos = rotary_pos_emb_cos.to(device=self.device,
+                                                   non_blocking=True)
+        rotary_pos_emb_sin = rotary_pos_emb_sin.to(device=self.device,
+                                                   non_blocking=True)
+        window_index = window_index.to(device=hidden_states.device,
+                                       non_blocking=True)
         reverse_indices = reverse_indices.to(device=hidden_states.device,
                                              non_blocking=True)

From f27ac29f184ac10bc7e953e78f234ffa1406c2da Mon Sep 17 00:00:00 2001
From: shen-shanshan <467638484@qq.com>
Date: Thu, 27 Nov 2025 08:47:26 +0000
Subject: [PATCH 15/16] fix lint

Signed-off-by: shen-shanshan <467638484@qq.com>
---
 vllm_ascend/patch/worker/patch_qwen2_5_vl.py | 36 +++++++++++---------
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/vllm_ascend/patch/worker/patch_qwen2_5_vl.py b/vllm_ascend/patch/worker/patch_qwen2_5_vl.py
index 35db1ea3214..27f08751bff 100644
--- a/vllm_ascend/patch/worker/patch_qwen2_5_vl.py
+++ b/vllm_ascend/patch/worker/patch_qwen2_5_vl.py
@@ -319,13 +319,11 @@ def forward(
     ) -> torch.Tensor:
         # patchify
         seq_len, _ = x.size()
-        rotary_pos_emb_cos: list[torch.Tensor] = []
-        rotary_pos_emb_sin: list[torch.Tensor] = []
-        window_index: list[torch.Tensor] = []
-        cu_window_seqlens: list[torch.Tensor] = [
-            torch.tensor([0], dtype=torch.int32)
-        ]
-        cu_seqlens: list[torch.Tensor] = []
+        rotary_pos_emb_cos: list = []
+        rotary_pos_emb_sin: list = []
+        window_index: list = []
+        cu_window_seqlens: list = [torch.tensor([0], dtype=torch.int32)]
+        cu_seqlens: list = []
 
         hidden_states = x.to(device=self.device, dtype=self.dtype)
         hidden_states = self.patch_embed(hidden_states)
@@ -375,15 +373,21 @@ def forward(
         max_seqlen_window, seqlens_window = self.compute_attn_mask_seqlen(
             cu_window_seqlens)
 
-        cu_seqlens = cu_seqlens.to(device=self.device, non_blocking=True)
-        cu_window_seqlens = cu_window_seqlens.to(device=self.device,
-                                                 non_blocking=True)
-        rotary_pos_emb_cos = rotary_pos_emb_cos.to(device=self.device,
-                                                   non_blocking=True)
-        rotary_pos_emb_sin = rotary_pos_emb_sin.to(device=self.device,
-                                                   non_blocking=True)
-        window_index = window_index.to(device=hidden_states.device,
-                                       non_blocking=True)
+        cu_seqlens = cu_seqlens.to(  # type: ignore[attr-defined]
+            device=self.device,
+            non_blocking=True)
+        cu_window_seqlens = cu_window_seqlens.to(  # type: ignore[attr-defined]
+            device=self.device,
+            non_blocking=True)
+        rotary_pos_emb_cos = rotary_pos_emb_cos.to(  # type: ignore[attr-defined]
+            device=self.device,
+            non_blocking=True)
+        rotary_pos_emb_sin = rotary_pos_emb_sin.to(  # type: ignore[attr-defined]
+            device=self.device,
+            non_blocking=True)
+        window_index = window_index.to(  # type: ignore[attr-defined]
+            device=hidden_states.device,
+            non_blocking=True)
         reverse_indices = reverse_indices.to(device=hidden_states.device,
                                              non_blocking=True)

From 9c62d278aced4cd32a9f305457822a7b59f5c83b Mon Sep 17 00:00:00 2001
From: shen-shanshan <467638484@qq.com>
Date: Fri, 28 Nov 2025 06:14:49 +0000
Subject: [PATCH 16/16] fix platform

Signed-off-by: shen-shanshan <467638484@qq.com>
---
 vllm_ascend/platform.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
index 3320aab51ec..0797da3270e 100644
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -413,9 +413,3 @@ def support_hybrid_kv_cache(cls) -> bool:
     @classmethod
     def support_static_graph_mode(cls) -> bool:
         return True
-
-    @classmethod
-    def get_forward_context_manager(cls):
-        from vllm_ascend.ascend_forward_context import \
-            set_ascend_forward_context
-        return set_ascend_forward_context