3434from vllm .multimodal .inputs import (MultiModalDataDict , MultiModalFieldConfig ,
3535 MultiModalInputsV2 , MultiModalKwargs ,
3636 NestedTensors , PlaceholderRange )
37- from vllm .multimodal .parse import ImageProcessorItems
37+ from vllm .multimodal .parse import ImageProcessorItems , MultiModalDataParser
3838from vllm .multimodal .processing import (BaseMultiModalProcessor ,
3939 MultiModalDataItems , ProcessorInputs ,
4040 PromptReplacement )
5454
5555class FuyuImagePatchInputs (TypedDict ):
5656 type : Literal ["image_patches" ]
57- data : torch .Tensor
57+ flat_data : torch .Tensor
5858 """
5959 Shape:
6060 `(batch_size * num_patches, patch_size_x * patch_size_y * num_channels)`
@@ -63,7 +63,7 @@ class FuyuImagePatchInputs(TypedDict):
6363 patches_per_image : List [int ]
6464 """
6565 List of number of total patches for each image in the batch.
66- This is used to restore the first two dimensions of `data `.
66+ This is used to restore the first two dimensions of `flat_data`.
6767 """
6868
6969
@@ -102,6 +102,9 @@ def get_max_fuyu_image_tokens(ctx: InputContext):
102102
103103class FuyuMultiModalProcessor (BaseMultiModalProcessor ):
104104
def _get_data_parser(self) -> MultiModalDataParser:
    """Build the multimodal data parser used by this processor.

    The parser is constructed with ``max_mm_counts={"image": 1}``.
    """
    # NOTE(review): presumably this caps inputs at one image per prompt;
    # confirm against MultiModalDataParser's handling of max_mm_counts.
    return MultiModalDataParser(max_mm_counts={"image": 1})
107+
def _get_hf_processor(self) -> FuyuProcessor:
    """Return the ``FuyuProcessor`` obtained from the input context."""
    return self.ctx.get_hf_processor(FuyuProcessor)
107110
@@ -304,7 +307,7 @@ def _parse_and_validate_image_input(
304307
305308 return FuyuImagePatchInputs (
306309 type = "image_patches" ,
307- data = self ._validate_pixel_values (
310+ flat_data = self ._validate_pixel_values (
308311 flatten_bn (image_patches_flat , concat = True )),
309312 patches_per_image = [x .size (0 ) for x in image_patches_flat ],
310313 )
@@ -313,12 +316,13 @@ def _parse_and_validate_image_input(
313316
def _process_image_input(
        self, image_input: FuyuImagePatchInputs) -> NestedTensors:
    """Embed the flattened image patches and regroup them per image.

    Args:
        image_input: Patch inputs; the ``flat_data`` and
            ``patches_per_image`` entries are read.

    Returns:
        The embeddings split along dim 0 into consecutive chunks whose
        sizes are given by ``patches_per_image``.
    """
    image_patches_flat = image_input["flat_data"]
    patches_per_image = image_input["patches_per_image"]

    assert self.vision_embed_tokens is not None
    # The embedding layer returns a pair; only its first element is used.
    vision_embeddings_flat, _ = self.vision_embed_tokens(
        image_patches_flat)
    return vision_embeddings_flat.split(patches_per_image, dim=0)
322326
323327 def get_multimodal_embeddings (self , ** kwargs ) -> Optional [NestedTensors ]:
324328 image_input = self ._parse_and_validate_image_input (** kwargs )
0 commit comments