Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/transformers/models/glm4v/image_processing_glm4v.py
Original file line number Diff line number Diff line change
Expand Up @@ -452,7 +452,7 @@ def get_number_of_image_patches(self, height: int, width: int, images_kwargs=Non
"""
patch_size = images_kwargs.get("patch_size", self.patch_size)
merge_size = images_kwargs.get("merge_size", self.merge_size)
size = images_kwargs.get("size", self.size)
size = images_kwargs.get("size", {"shortest_edge": 112 * 112, "longest_edge": 28 * 28 * 15000})
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should we align self.size instead?

Copy link
Member Author

@zucchini-nlp zucchini-nlp Sep 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the defaults are already using these values. It is the model checkpoint that has another set of values saved, and for some reason we never use it. Didn't want to break BC but using config values is actually correct

Copy link
Contributor

@qubvel qubvel Sep 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, can we update the config values instead? Or would a Hub PR not be merged?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can try; let me ask one of the authors who is on Slack.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I asked the authors, and the config values are the correct ones. I will update the processor in a follow-up PR to use the config defaults and mark it as slightly breaking.


factor = patch_size * merge_size
resized_height, resized_width = smart_resize(
Expand Down
4 changes: 4 additions & 0 deletions src/transformers/video_processing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,10 +305,14 @@ def _decode_and_sample_videos(
# Only sample frames if an array video is passed, otherwise first decode -> then sample
if is_valid_video(videos[0]) and do_sample_frames:
sampled_videos = []
sampled_metadata = []
for video, metadata in zip(videos, video_metadata):
indices = sample_indices_fn(metadata=metadata)
metadata.frames_indices = indices
sampled_videos.append(video[indices])
sampled_metadata.append(metadata)
videos = sampled_videos
video_metadata = sampled_metadata
elif not is_valid_video(videos[0]):
if isinstance(videos[0], list):
# Videos sometimes are passed as a list of image URLs, especially through templates
Expand Down
16 changes: 11 additions & 5 deletions src/transformers/video_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@

import os
import warnings
from collections.abc import Iterable
from collections.abc import Iterable, Mapping
from contextlib import redirect_stdout
from dataclasses import dataclass
from dataclasses import dataclass, fields
from io import BytesIO
from typing import Callable, NewType, Optional, Union
from urllib.parse import urlparse
Expand Down Expand Up @@ -78,7 +78,7 @@


@dataclass
class VideoMetadata:
class VideoMetadata(Mapping):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the motivation for making it iterable?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so that we can use VideoMetadata in same way as dict. We allow users to pass both formats currently

total_num_frames: int
fps: float = None
width: int = None
Expand All @@ -87,6 +87,12 @@ class VideoMetadata:
video_backend: str = None
frames_indices: list[int] = None

def __iter__(self):
    """Iterate over the metadata field names, mirroring dict iteration.

    This lets ``dict(metadata)`` and ``**metadata`` work, which is what the
    ``Mapping`` base class expects from ``__iter__``.
    """
    for field in fields(self):
        yield field.name

def __len__(self):
    """Return the number of metadata fields (the ``len()`` of the dict form)."""
    declared_fields = fields(self)
    return len(declared_fields)

def __getitem__(self, item):
    """Dict-style read access: ``metadata["fps"]`` is ``metadata.fps``.

    NOTE(review): a missing key raises ``AttributeError`` (not ``KeyError``),
    since lookup is delegated to attribute access.
    """
    value = getattr(self, item)
    return value

Expand All @@ -96,8 +102,8 @@ def __setitem__(self, key, value):
@property
def timestamps(self) -> float:
"Timestamps of the sampled frames in seconds."
if self.fps is None:
raise ValueError("Cannot infer video `timestamps` when `fps` is None.")
if self.fps is None or self.frames_indices is None:
raise ValueError("Cannot infer video `timestamps` when `fps` or `frames_indices` is None.")
return [frame_idx / self.fps for frame_idx in self.frames_indices]

def update(self, dictionary):
Expand Down
7 changes: 7 additions & 0 deletions tests/test_video_processing_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,13 @@ def test_call_sample_frames(self):
self.assertEqual(encoded_videos.shape[1], 6)
self.assertEqual(encoded_videos_batched.shape[1], 6)

# The same as above but uses a `VideoMetadata` object in the input
metadata = [[VideoMetadata(duration=2.0, total_num_frames=8, fps=4)]]
batched_metadata = metadata * len(video_inputs)
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", fps=3, video_metadata=metadata)[
self.input_name
]

# We should raise error when asked to sample more frames than there are in input video
with self.assertRaises(ValueError):
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=10)[self.input_name]
Expand Down