2727"""Inference-only Qwen2.5-VL model compatible with HuggingFace weights."""
2828from collections .abc import Iterable , Mapping
2929from functools import lru_cache , partial
30- from typing import Callable , Literal , Optional , TypedDict , Union
30+ from typing import Annotated , Callable , Literal , Optional , Union
3131
3232import torch
3333import torch .nn as nn
@@ -64,6 +64,7 @@
 from vllm.platforms import _Backend
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import uses_mrope
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
                          SupportsMultiModal, SupportsPP, SupportsQuant)
@@ -80,85 +81,126 @@
 # === Vision Inputs === #
 
 
-class Qwen2_5_VLImagePixelInputs(TypedDict):
-    type: Literal["pixel_values"]
-    pixel_values: torch.Tensor
-    """Shape:
-    `(num_patches, num_channels * patch_size * patch_size)`
+class Qwen2_5_VLImagePixelInputs(TensorSchema):
     """
-
-    image_grid_thw: torch.Tensor
-    """Shape: `(num_images, 3)`
-    This should be in `(grid_t, grid_h, grid_w)` format.
+    Dimensions:
+        - np: Number of patches
+        - ni: Number of images
+        - cps: Number of channels * patch_size * patch_size
+
+    Historical context:
+        - pixel_values shape: (num_patches, num_channels * patch_size *
+          patch_size)
+        - image_grid_thw shape: (num_images, 3) in (grid_t, grid_h, grid_w)
+          format
     """
+    type: Literal["pixel_values"]
+
+    pixel_values: Annotated[
+        torch.Tensor,
+        TensorShape("np", "cps"),
+    ]
 
+    image_grid_thw: Annotated[
+        torch.Tensor,
+        TensorShape("ni", 3),
+    ]
 
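The TensorSchema/TensorShape pattern replaces free-form shape docstrings with machine-checkable annotations. The sketch below is a simplified, hypothetical re-implementation of the idea (NOT vLLM's actual TensorSchema): symbolic dimension names bind to a concrete size on first use and must agree everywhere they reappear, while integer dimensions are fixed.

# Hypothetical sketch of the symbolic-shape idea; not vLLM's TensorSchema.
from typing import Annotated, get_args, get_origin, get_type_hints

import torch


class Shape:  # stand-in for TensorShape
    def __init__(self, *dims):
        self.dims = dims  # str = symbolic dim, int = fixed dim


def check_shapes(obj) -> None:
    bound = {}  # symbolic dim name -> concrete size seen so far
    for name, hint in get_type_hints(type(obj), include_extras=True).items():
        if get_origin(hint) is not Annotated:
            continue  # e.g. the Literal "type" discriminator
        _, *meta = get_args(hint)
        shape = next(m for m in meta if isinstance(m, Shape))
        t = getattr(obj, name, None)
        if t is None:
            continue  # Optional fields may be absent
        assert t.ndim == len(shape.dims), f"{name}: wrong rank"
        for dim, size in zip(shape.dims, t.shape):
            if isinstance(dim, int):
                assert size == dim, f"{name}: expected fixed dim {dim}"
            else:
                assert bound.setdefault(dim, size) == size, (
                    f"{name}: symbolic dim {dim!r} mismatch")


class Demo:
    grid: Annotated[torch.Tensor, Shape("ni", 3)]


d = Demo()
d.grid = torch.zeros(4, 3)  # "ni" binds to 4; fixed dim 3 matches
check_shapes(d)

The real schema classes in this diff follow the same shape: a `type` discriminator plus `Annotated[..., TensorShape(...)]` fields.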
-class Qwen2_5_VLImageEmbeddingInputs(TypedDict):
-    type: Literal["image_embeds"]
-    image_embeds: torch.Tensor
-    """Supported types:
-    - list[`torch.Tensor`]: A list of tensors holding all images' features.
-      Each tensor holds an image's features.
-    - `torch.Tensor`: A tensor holding all images' features
-      (concatenation of all images' feature tensors).
-
-    Tensor shape: `(num_image_features, hidden_size)`
-    - `num_image_features` varies based on
-      the number and resolution of the images.
-    - `hidden_size` must match the hidden size of language model backbone.
-    """
 
-    image_grid_thw: torch.Tensor
-    """Shape: `(num_images, 3)`
-    This should be in `(grid_t, grid_h, grid_w)` format.
+class Qwen2_5_VLImageEmbeddingInputs(TensorSchema):
     """
+    Dimensions:
+        - nf: Number of image features
+        - hs: Hidden size
+        - ni: Number of images
+
+    Historical context:
+        - image_embeds shape: (num_image_features, hidden_size)
+        - num_image_features varies based on the number and resolution of the
+          images.
+        - hidden_size must match the hidden size of language model backbone.
+        - image_grid_thw shape: (num_images, 3) in (grid_t, grid_h, grid_w)
+          format
+    """
+    type: Literal["image_embeds"]
+
+    image_embeds: Annotated[
+        torch.Tensor,
+        TensorShape("nf", "hs"),
+    ]
+
+    image_grid_thw: Annotated[
+        torch.Tensor,
+        TensorShape("ni", 3),
+    ]
 
 
 Qwen2_5_VLImageInputs = Union[Qwen2_5_VLImagePixelInputs,
                               Qwen2_5_VLImageEmbeddingInputs]
 
 
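Downstream code can branch on the `type` literal, which acts as the tagged-union discriminator for `Qwen2_5_VLImageInputs`. A hypothetical dispatch sketch, assuming the schema objects keep the dict-style access that the replaced TypedDicts supported:

def describe_image_input(image_input: Qwen2_5_VLImageInputs) -> str:
    # "type" narrows the union; each branch knows which fields exist.
    if image_input["type"] == "pixel_values":
        return f"raw patches: {tuple(image_input['pixel_values'].shape)}"
    return f"precomputed embeds: {tuple(image_input['image_embeds'].shape)}"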
-class Qwen2_5_VLVideoPixelInputs(TypedDict):
-    type: Literal["pixel_values_videos"]
-    pixel_values_videos: torch.Tensor
-    """Shape:
-    `(num_patches,
-    num_channels * temporal_patch_size * patch_size * patch_size)`
+class Qwen2_5_VLVideoPixelInputs(TensorSchema):
+    """
+    Dimensions:
+        - np: Number of patches
+        - nv: Number of videos
+        - ctps: Number of channels * temporal_patch_size * patch_size *
+          patch_size
+
+    Historical context:
+        - pixel_values_videos shape: (num_patches, num_channels *
+          temporal_patch_size * patch_size * patch_size)
+        - video_grid_thw shape: (num_videos, 3) in (grid_t, grid_h, grid_w)
+          format
+        - second_per_grid_ts: The video time interval (in seconds) for each
+          grid along the temporal dimension in the 3D position IDs. Returned
+          when `videos` is not `None`.
     """
+    type: Literal["pixel_values_videos"]
 
-    video_grid_thw: torch.Tensor
-    """Shape: `(num_videos, 3)`
+    pixel_values_videos: Annotated[
+        torch.Tensor,
+        TensorShape("np", "ctps"),
+    ]
 
-    This should be in `(grid_t, grid_h, grid_w)` format.
-    """
+    video_grid_thw: Annotated[
+        torch.Tensor,
+        TensorShape("nv", 3),
+    ]
 
-    second_per_grid_ts: torch.Tensor
-    """
-    The video time interval (in seconds) for each grid along the temporal
-    dimension in the 3D position IDs. Returned when `videos` is not `None`.
-    """
+    second_per_grid_ts: Annotated[
+        Optional[torch.Tensor],
+        TensorShape("nv"),
+    ]
 
 
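Construction mirrors the keyword style used in `_parse_and_validate_video_input` further down in this diff. A hypothetical example with made-up sizes (ctps = 3 * 2 * 14 * 14 = 1176; one video whose 2x4x4 grid yields 32 patches), assuming the Optional `second_per_grid_ts` may simply be passed as `None`:

video_input = Qwen2_5_VLVideoPixelInputs(
    type="pixel_values_videos",
    pixel_values_videos=torch.randn(32, 1176),  # (np, ctps)
    video_grid_thw=torch.tensor([[2, 4, 4]]),   # (nv, 3) as (t, h, w)
    second_per_grid_ts=None,                    # optional timing info
)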
-class Qwen2_5_VLVideoEmbeddingInputs(TypedDict):
-    type: Literal["video_embeds"]
-    video_embeds: torch.Tensor
-    """Supported types:
-    - list[`torch.Tensor`]: A list of tensors holding all videos' features.
-      Each tensor holds an video's features.
-    - `torch.Tensor`: A tensor holding all videos' features
-      (concatenation of all videos' feature tensors).
-
-    Tensor shape: `(num_image_features, hidden_size)`
-    - `num_image_features` varies based on
-      the number and resolution of the videos.
-    - `hidden_size` must match the hidden size of language model backbone.
+class Qwen2_5_VLVideoEmbeddingInputs(TensorSchema):
     """
-
-    video_grid_thw: torch.Tensor
-    """Shape: `(num_videos, 3)`
-    This should be in `(grid_t, grid_h, grid_w)` format.
+    Dimensions:
+        - nf: Number of video features
+        - hs: Hidden size
+        - nv: Number of videos
+
+    Historical context:
+        - video_embeds shape: (num_video_features, hidden_size)
+        - num_video_features varies based on the number and resolution of the
+          videos.
+        - hidden_size must match the hidden size of language model backbone.
+        - video_grid_thw shape: (num_videos, 3) in (grid_t, grid_h, grid_w)
+          format
     """
+    type: Literal["video_embeds"]
+
+    video_embeds: Annotated[
+        torch.Tensor,
+        TensorShape("nf", "hs"),
+    ]
+
+    video_grid_thw: Annotated[
+        torch.Tensor,
+        TensorShape("nv", 3),
+    ]
 
 
 Qwen2_5_VLVideoInputs = Union[Qwen2_5_VLVideoPixelInputs,
                               Qwen2_5_VLVideoEmbeddingInputs]
@@ -936,10 +978,6 @@ def _parse_and_validate_image_input(
             image_grid_thw = self._validate_and_reshape_mm_tensor(
                 image_grid_thw, "image grid_thw")
 
-            if not isinstance(pixel_values, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image pixel values. "
-                                 f"Got type: {type(pixel_values)}")
-
             return Qwen2_5_VLImagePixelInputs(type="pixel_values",
                                               pixel_values=pixel_values,
                                               image_grid_thw=image_grid_thw)
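The manual isinstance checks are dropped because type and shape validation now happens when the schema object is constructed. A hypothetical failure case (the exact exception raised is an implementation detail of TensorSchema; the concrete sizes are made up, with cps = 3 * 14 * 14 = 588):

# image_grid_thw has 4 columns, violating TensorShape("ni", 3); this is
# expected to fail at construction rather than slipping through.
bad = Qwen2_5_VLImagePixelInputs(
    type="pixel_values",
    pixel_values=torch.randn(16, 588),            # (np, cps)
    image_grid_thw=torch.tensor([[1, 4, 4, 9]]),  # (ni, 4): wrong last dim
)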
@@ -950,9 +988,6 @@ def _parse_and_validate_image_input(
             image_grid_thw = self._validate_and_reshape_mm_tensor(
                 image_grid_thw, "image grid_thw")
 
-            if not isinstance(image_embeds, torch.Tensor):
-                raise ValueError("Incorrect type of image embeddings. "
-                                 f"Got type: {type(image_embeds)}")
             return Qwen2_5_VLImageEmbeddingInputs(
                 type="image_embeds",
                 image_embeds=image_embeds,
@@ -973,7 +1008,8 @@ def _parse_and_validate_video_input(
                 pixel_values_videos, "video pixel values")
             video_grid_thw = self._validate_and_reshape_mm_tensor(
                 video_grid_thw, "video grid_thw")
-
+            if second_per_grid_ts is not None and second_per_grid_ts.ndim == 2:
+                second_per_grid_ts = second_per_grid_ts.squeeze(-1)
             return Qwen2_5_VLVideoPixelInputs(
                 type="pixel_values_videos",
                 pixel_values_videos=pixel_values_videos,
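The new branch normalizes second_per_grid_ts: batched preprocessing can hand it over as (num_videos, 1), while the TensorShape("nv") annotation expects a flat vector. A minimal illustration with plain torch:

ts = torch.tensor([[0.5], [1.0]])  # (2, 1) from batched preprocessing
if ts.ndim == 2:
    ts = ts.squeeze(-1)            # tensor([0.5000, 1.0000]), shape (2,)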
@@ -987,9 +1023,6 @@ def _parse_and_validate_video_input(
             video_grid_thw = self._validate_and_reshape_mm_tensor(
                 video_grid_thw, "video grid_thw")
 
-            if not isinstance(video_embeds, torch.Tensor):
-                raise ValueError("Incorrect type of video embeddings. "
-                                 f"Got type: {type(video_embeds)}")
             return Qwen2_5_VLVideoEmbeddingInputs(
                 type="video_embeds",
                 video_embeds=video_embeds,