diff --git a/src/transformers/debug_utils.py b/src/transformers/debug_utils.py
index 920b1cf44daf..2489c9368c16 100644
--- a/src/transformers/debug_utils.py
+++ b/src/transformers/debug_utils.py
@@ -153,7 +153,7 @@ def __init__(self, model, max_frames_to_save=21, trace_batch_nums=[], abort_afte
         self.batch_number = 0
         self.total_calls = 0
         self.detected_overflow = False
-        self.prefix = "                 "
+        self.prefix = " " * 17
 
         self.analyse_model()
 
diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py
index 52b798c09f84..ed4f2e6bec26 100644
--- a/src/transformers/image_processing_utils.py
+++ b/src/transformers/image_processing_utils.py
@@ -304,6 +304,10 @@ def get_patch_output_size(image, target_resolution, input_data_format):
     original_height, original_width = get_image_size(image, channel_dim=input_data_format)
     target_height, target_width = target_resolution
 
+    if original_width == 0:
+        raise ValueError("original_width can not be 0")
+    if original_height == 0:
+        raise ValueError("original_height can not be 0")
     scale_w = target_width / original_width
     scale_h = target_height / original_height
 
diff --git a/src/transformers/masking_utils.py b/src/transformers/masking_utils.py
index 931c58870d62..f780df7e9403 100644
--- a/src/transformers/masking_utils.py
+++ b/src/transformers/masking_utils.py
@@ -201,18 +201,19 @@ def prepare_padding_mask(
     From the 2D attention mask, prepare the correct padding mask to use by potentially padding it, and slicing
     according to the `kv_offset` if `_slice` is `True`.
     """
+    if attention_mask is None:
+        return None
     local_padding_mask = attention_mask
-    if attention_mask is not None:
-        # Pad it if necessary
-        if (padding_length := kv_length + kv_offset - attention_mask.shape[-1]) > 0:
-            local_padding_mask = torch.nn.functional.pad(attention_mask, (0, padding_length))
-        # For flex, we should not slice them, only use an offset
-        if _slice:
-            # Equivalent to: `local_padding_mask = attention_mask[:, kv_offset : kv_offset + kv_length]`,
-            # but without data-dependent slicing (i.e. torch.compile friendly)
-            mask_indices = torch.arange(kv_length, device=local_padding_mask.device)
-            mask_indices += kv_offset
-            local_padding_mask = local_padding_mask[:, mask_indices]
+    # Pad it if necessary
+    if (padding_length := kv_length + kv_offset - attention_mask.shape[-1]) > 0:
+        local_padding_mask = torch.nn.functional.pad(attention_mask, (0, padding_length))
+    # For flex, we should not slice them, only use an offset
+    if _slice:
+        # Equivalent to: `local_padding_mask = attention_mask[:, kv_offset : kv_offset + kv_length]`,
+        # but without data-dependent slicing (i.e. torch.compile friendly)
+        mask_indices = torch.arange(kv_length, device=local_padding_mask.device)
+        mask_indices += kv_offset
+        local_padding_mask = local_padding_mask[:, mask_indices]
     return local_padding_mask
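
Illustrative only, not part of the patch: a minimal standalone sketch of the patched `prepare_padding_mask` control flow from `masking_utils.py`, showing the new `None` early return plus the pad-then-gather path. The type hints, the `_slice=True` default, and the demo values are assumptions made for this sketch.

from typing import Optional

import torch


def prepare_padding_mask(
    attention_mask: Optional[torch.Tensor], kv_length: int, kv_offset: int, _slice: bool = True
) -> Optional[torch.Tensor]:
    # Same control flow as the "+" lines of the masking_utils.py hunk above.
    if attention_mask is None:
        return None
    local_padding_mask = attention_mask
    # Pad on the right if the mask is shorter than kv_offset + kv_length
    if (padding_length := kv_length + kv_offset - attention_mask.shape[-1]) > 0:
        local_padding_mask = torch.nn.functional.pad(attention_mask, (0, padding_length))
    if _slice:
        # Gather a kv_length-wide window starting at kv_offset with an index tensor
        # instead of data-dependent slicing (torch.compile friendly)
        mask_indices = torch.arange(kv_length, device=local_padding_mask.device)
        mask_indices += kv_offset
        local_padding_mask = local_padding_mask[:, mask_indices]
    return local_padding_mask


if __name__ == "__main__":
    # New behaviour: a missing mask short-circuits to None before any tensor work.
    assert prepare_padding_mask(None, kv_length=6, kv_offset=2) is None

    # A 1x4 padding mask is padded to length kv_offset + kv_length = 8, then the
    # 6-wide window starting at offset 2 is selected.
    mask = torch.tensor([[1, 1, 1, 0]])
    print(prepare_padding_mask(mask, kv_length=6, kv_offset=2))  # tensor([[1, 0, 0, 0, 0, 0]])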