
Commit 91455c1

Fix processor kwargs qwen2 vl (#36890)

* Fix qwen2_vl and qwen2_5_vl processors custom images kwargs
* change version warning

1 parent 48385aa, commit 91455c1

File tree: 9 files changed, +66 -11 lines

  src/transformers/models/auto/image_processing_auto.py
  src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
  src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py
  src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
  src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py
  src/transformers/models/qwen2_vl/processing_qwen2_vl.py
  src/transformers/processing_utils.py
  tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py
  tests/models/qwen2_vl/test_processor_qwen2_vl.py

src/transformers/models/auto/image_processing_auto.py (2 additions, 2 deletions)

@@ -493,15 +493,15 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
                 image_processor_auto_map = config.auto_map["AutoImageProcessor"]
 
         image_processor_class = None
-        # TODO: @yoni, change logic in v4.50 (when use_fast set to True by default)
+        # TODO: @yoni, change logic in v4.52 (when use_fast set to True by default)
         if image_processor_type is not None:
             # if use_fast is not set and the processor was saved with a fast processor, we use it, otherwise we use the slow processor.
             if use_fast is None:
                 use_fast = image_processor_type.endswith("Fast")
                 if not use_fast:
                     logger.warning_once(
                         "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. "
-                        "`use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. "
+                        "`use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. "
                         "This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`."
                     )
             # Update class name to reflect the use_fast option. If class is not found, we fall back to the slow version.
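The moved cutoff only matters when `use_fast` is left unset. A minimal sketch of pinning the behavior explicitly so the warning never fires (the Qwen/Qwen2-VL-2B-Instruct checkpoint is an assumption for illustration; any checkpoint saved with a slow image processor behaves the same way):

from transformers import AutoImageProcessor

# Explicitly request the fast image processor today, which per this commit
# becomes the default in v4.52.
fast = AutoImageProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", use_fast=True)

# Or keep the slow processor and avoid the warning by being explicit.
slow = AutoImageProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", use_fast=False)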

src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py (6 additions, 1 deletion)

@@ -41,7 +41,7 @@
     VisionRotaryEmbedding,
     VisionSdpaAttention,
 )
-from transformers.models.qwen2_vl.processing_qwen2_vl import Qwen2VLProcessor
+from transformers.models.qwen2_vl.processing_qwen2_vl import Qwen2VLImagesKwargs, Qwen2VLProcessor
 
 from ...activations import ACT2FN
 from ...configuration_utils import PretrainedConfig
@@ -816,7 +816,12 @@ class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False):
     fps: Union[List[float], float]
 
 
+class Qwen2_5_VLImagesKwargs(Qwen2VLImagesKwargs):
+    pass
+
+
 class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False):
+    images_kwargs: Qwen2_5_VLImagesKwargs
     videos_kwargs: Qwen2_5_VLVideosProcessorKwargs
     _defaults = {
         "text_kwargs": {

src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py (11 additions, 2 deletions)

@@ -23,19 +23,28 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import List, Union
+from typing import List, Optional, Union
 
 from ...feature_extraction_utils import BatchFeature
 from ...image_utils import ImageInput, VideoInput
-from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs
+from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
 
 
 class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False):
     fps: Union[List[float], float]
 
 
+class Qwen2_5_VLImagesKwargs(ImagesKwargs):
+    min_pixels: Optional[int]
+    max_pixels: Optional[int]
+    patch_size: Optional[int]
+    temporal_patch_size: Optional[int]
+    merge_size: Optional[int]
+
+
 class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False):
+    images_kwargs: Qwen2_5_VLImagesKwargs
     videos_kwargs: Qwen2_5_VLVideosProcessorKwargs
     _defaults = {
         "text_kwargs": {

src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py (1 addition, 1 deletion)

@@ -384,7 +384,7 @@ def preprocess(
                 raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
             min_pixels = size["shortest_edge"]
         else:
-            size = self.size
+            size = {**self.size}
         # backward compatibility: override size with min_pixels and max_pixels if they are provided
         if min_pixels is not None:
             size["shortest_edge"] = min_pixels

src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py (1 addition, 1 deletion)

@@ -339,7 +339,7 @@ def preprocess(
                 raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
             min_pixels = size["shortest_edge"]
         else:
-            size = self.size
+            size = {**self.size}
         # backward compatibility: override size with min_pixels and max_pixels if they are provided
         if min_pixels is not None:
             size["shortest_edge"] = min_pixels

src/transformers/models/qwen2_vl/processing_qwen2_vl.py (11 additions, 2 deletions)

@@ -21,19 +21,28 @@
 Processor class for Qwen2-VL.
 """
 
-from typing import List, Union
+from typing import List, Optional, Union
 
 from ...feature_extraction_utils import BatchFeature
 from ...image_utils import ImageInput, VideoInput
-from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
 from ...utils import logging
 
 
 logger = logging.get_logger(__name__)
 
 
+class Qwen2VLImagesKwargs(ImagesKwargs):
+    min_pixels: Optional[int]
+    max_pixels: Optional[int]
+    patch_size: Optional[int]
+    temporal_patch_size: Optional[int]
+    merge_size: Optional[int]
+
+
 class Qwen2VLProcessorKwargs(ProcessingKwargs, total=False):
+    images_kwargs: Qwen2VLImagesKwargs
     _defaults = {
         "text_kwargs": {
             "padding": False,

src/transformers/processing_utils.py (2 additions, 2 deletions)

@@ -1111,12 +1111,12 @@ def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
             if isinstance(class_name, tuple):
                 classes = tuple(cls.get_possibly_dynamic_module(n) if n is not None else None for n in class_name)
                 if attribute_name == "image_processor":
-                    # TODO: @yoni, change logic in v4.50 (when use_fast set to True by default)
+                    # TODO: @yoni, change logic in v4.52 (when use_fast set to True by default)
                     use_fast = kwargs.get("use_fast", None)
                     if use_fast is None:
                         logger.warning_once(
                             "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. "
-                            "`use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. "
+                            "`use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. "
                             "This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`."
                         )
                 else:

tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py (16 additions, 0 deletions)

@@ -310,3 +310,19 @@ def test_chat_template_video(self):
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
         self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 71280)
+
+    def test_kwargs_overrides_custom_image_processor_kwargs(self):
+        processor_components = self.prepare_components()
+        processor_components["image_processor"] = self.get_component("image_processor")
+        processor_components["tokenizer"] = self.get_component("tokenizer")
+        processor_kwargs = self.prepare_processor_dict()
+
+        processor = self.processor_class(**processor_components, **processor_kwargs, use_fast=True)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = self.prepare_text_inputs()
+        image_input = self.prepare_image_inputs()
+        inputs = processor(text=input_str, images=image_input, max_pixels=56 * 56 * 4, return_tensors="pt")
+        self.assertEqual(inputs[self.images_input_name].shape[0], 612)
+        inputs = processor(text=input_str, images=image_input, return_tensors="pt")
+        self.assertEqual(inputs[self.images_input_name].shape[0], 800)

tests/models/qwen2_vl/test_processor_qwen2_vl.py (16 additions, 0 deletions)

@@ -307,3 +307,19 @@ def test_chat_template_video(self):
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
         self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 71280)
+
+    def test_kwargs_overrides_custom_image_processor_kwargs(self):
+        processor_components = self.prepare_components()
+        processor_components["image_processor"] = self.get_component("image_processor")
+        processor_components["tokenizer"] = self.get_component("tokenizer")
+        processor_kwargs = self.prepare_processor_dict()
+
+        processor = self.processor_class(**processor_components, **processor_kwargs, use_fast=True)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = self.prepare_text_inputs()
+        image_input = self.prepare_image_inputs()
+        inputs = processor(text=input_str, images=image_input, return_tensors="pt")
+        self.assertEqual(inputs[self.images_input_name].shape[0], 800)
+        inputs = processor(text=input_str, images=image_input, max_pixels=56 * 56 * 4, return_tensors="pt")
+        self.assertEqual(inputs[self.images_input_name].shape[0], 612)
