22 | 22 | import torch.nn.functional as F |
23 | 23 | import torch_npu |
24 | 24 | from einops import rearrange |
25 | | -from vllm.model_executor.models.qwen2_5_vl import ( |
26 | | - Qwen2_5_VisionAttention, Qwen2_5_VLForConditionalGeneration) |
| 25 | +from vllm.model_executor.models.qwen2_5_vl import Qwen2_5_VisionAttention |
27 | 26 | |
28 | 27 | import vllm_ascend.envs as envs_ascend |
29 | | -from vllm_ascend.ascend_forward_context import set_ascend_forward_context |
30 | 28 | |
31 | 29 | MIN_PAD_SIZE = 64 # min_size to pad weight |
32 | 30 | MAX_PAD_SIZE = 128 # max_size to pad weight |
33 | 31 | |
34 | 32 | |
35 | | -class AscendQwen2_5_VLForConditionalGeneration(nn.Module): |
36 | | - |
37 | | - def _process_image_input(self, image_input) -> tuple[torch.Tensor, ...]: |
38 | | - |
39 | | - grid_thw = image_input["image_grid_thw"] |
40 | | - assert grid_thw.ndim == 2 |
41 | | - |
42 | | - if image_input["type"] == "image_embeds": |
43 | | - image_embeds = image_input["image_embeds"].type(self.visual.dtype) |
44 | | - else: |
45 | | - pixel_values = image_input["pixel_values"].type(self.visual.dtype) |
46 | | - with set_ascend_forward_context(None, self.vllm_config): |
47 | | - image_embeds = self.visual(pixel_values, grid_thw=grid_thw) |
48 | | - |
49 | | - # Split concatenated embeddings for each image item. |
50 | | - merge_size = self.visual.spatial_merge_size |
51 | | - sizes = grid_thw.prod(-1) // merge_size // merge_size |
52 | | - return image_embeds.split(sizes.tolist()) |
53 | | - |
54 | | - def _process_video_input(self, video_input) -> tuple[torch.Tensor, ...]: |
55 | | - |
56 | | - grid_thw = video_input["video_grid_thw"] |
57 | | - assert grid_thw.ndim == 2 |
58 | | - |
59 | | - if video_input["type"] == "video_embeds": |
60 | | - video_embeds = video_input["video_embeds"].type(self.visual.dtype) |
61 | | - else: |
62 | | - pixel_values_videos = video_input["pixel_values_videos"].type( |
63 | | - self.visual.dtype) |
64 | | - with set_ascend_forward_context(None, self.vllm_config): |
65 | | - video_embeds = self.visual(pixel_values_videos, |
66 | | - grid_thw=grid_thw) |
67 | | - |
68 | | - # Split concatenated embeddings for each video item. |
69 | | - merge_size = self.visual.spatial_merge_size |
70 | | - sizes = grid_thw.prod(-1) // merge_size // merge_size |
71 | | - return video_embeds.split(sizes.tolist()) |
72 | | - |
73 | | - |
74 | 33 | @contextmanager |
75 | 34 | def _padding_manager( |
76 | 35 | q: torch.Tensor, |
@@ -189,5 +148,3 @@ def forward( |
189 | 148 | |
190 | 149 | |
191 | 150 | Qwen2_5_VisionAttention.forward = AscendQwen2_5_VisionAttention.forward |
192 | | -Qwen2_5_VLForConditionalGeneration._process_image_input = AscendQwen2_5_VLForConditionalGeneration._process_image_input |
193 | | -Qwen2_5_VLForConditionalGeneration._process_video_input = AscendQwen2_5_VLForConditionalGeneration._process_video_input |
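The removed `_process_image_input` / `_process_video_input` overrides wrap the visual encoder call in `set_ascend_forward_context` and then split the concatenated embeddings back into one tensor per image or video. A minimal sketch of that split-size computation follows; the example shapes and the merge size are illustrative assumptions, not values taken from this diff.

```python
# Sketch of the per-item split-size computation used by the removed overrides.
import torch


def split_sizes(grid_thw: torch.Tensor, spatial_merge_size: int) -> list[int]:
    # grid_thw has shape (num_items, 3): (t, h, w) patch counts per item.
    # Each item contributes t * h * w patches, and the vision tower merges
    # spatial_merge_size x spatial_merge_size patches into one output embedding.
    assert grid_thw.ndim == 2
    sizes = grid_thw.prod(-1) // spatial_merge_size // spatial_merge_size
    return sizes.tolist()


# Example: two images with 1 temporal patch each and 16x16 / 32x32 spatial
# patches, using an assumed spatial_merge_size of 2.
grid_thw = torch.tensor([[1, 16, 16], [1, 32, 32]])
print(split_sizes(grid_thw, spatial_merge_size=2))  # [64, 256]
# image_embeds.split([64, 256]) then yields one embedding tensor per image.
```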