2 changes: 1 addition & 1 deletion src/transformers/debug_utils.py
@@ -153,7 +153,7 @@ def __init__(self, model, max_frames_to_save=21, trace_batch_nums=[], abort_afte
         self.batch_number = 0
         self.total_calls = 0
         self.detected_overflow = False
-        self.prefix = "                 "
+        self.prefix = " " * 17
 
         self.analyse_model()
 
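The only change in this file swaps a hard-coded run of spaces for the equivalent expression `" " * 17`. Assuming the original literal contained exactly 17 spaces, this is a pure readability refactor, as the quick standalone check below illustrates (not part of the PR):

# Standalone check: " " * 17 builds the same 17-character padding string
# that a literal of 17 spaces would (assumption: the old literal had 17 spaces).
old_prefix = "                 "  # 17-space literal
new_prefix = " " * 17
assert old_prefix == new_prefix and len(new_prefix) == 17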
4 changes: 4 additions & 0 deletions src/transformers/image_processing_utils.py
@@ -304,6 +304,10 @@ def get_patch_output_size(image, target_resolution, input_data_format):
     original_height, original_width = get_image_size(image, channel_dim=input_data_format)
     target_height, target_width = target_resolution
 
+    if original_width == 0:
+        raise ValueError("original_width can not be 0")
+    if original_height == 0:
+        raise ValueError("original_height can not be 0")
     scale_w = target_width / original_width
     scale_h = target_height / original_height
 
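For context, here is a minimal standalone sketch of the guarded scale computation, not the library function itself; the aspect-ratio-preserving continuation after the two scale factors is an assumption for illustration only:

import math

def patch_output_size_sketch(original_size, target_resolution):
    # Mirrors the guarded part of get_patch_output_size: a zero-sized input
    # now fails with an explicit ValueError instead of a ZeroDivisionError.
    original_height, original_width = original_size
    target_height, target_width = target_resolution

    if original_width == 0:
        raise ValueError("original_width can not be 0")
    if original_height == 0:
        raise ValueError("original_height can not be 0")

    scale_w = target_width / original_width
    scale_h = target_height / original_height

    # Assumed continuation: fit inside the target while preserving aspect ratio.
    if scale_w < scale_h:
        return min(math.ceil(original_height * scale_w), target_height), target_width
    return target_height, min(math.ceil(original_width * scale_h), target_width)

print(patch_output_size_sketch((336, 672), (336, 336)))   # (168, 336)
# patch_output_size_sketch((0, 672), (336, 336))          # -> ValueError, not ZeroDivisionError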
23 changes: 12 additions & 11 deletions src/transformers/masking_utils.py
@@ -201,18 +201,19 @@ def prepare_padding_mask(
     From the 2D attention mask, prepare the correct padding mask to use by potentially padding it, and slicing
     according to the `kv_offset` if `_slice` is `True`.
     """
+    if attention_mask is None:
+        return None
     local_padding_mask = attention_mask
-    if attention_mask is not None:
-        # Pad it if necessary
-        if (padding_length := kv_length + kv_offset - attention_mask.shape[-1]) > 0:
-            local_padding_mask = torch.nn.functional.pad(attention_mask, (0, padding_length))
-        # For flex, we should not slice them, only use an offset
-        if _slice:
-            # Equivalent to: `local_padding_mask = attention_mask[:, kv_offset : kv_offset + kv_length]`,
-            # but without data-dependent slicing (i.e. torch.compile friendly)
-            mask_indices = torch.arange(kv_length, device=local_padding_mask.device)
-            mask_indices += kv_offset
-            local_padding_mask = local_padding_mask[:, mask_indices]
+    # Pad it if necessary
+    if (padding_length := kv_length + kv_offset - attention_mask.shape[-1]) > 0:
+        local_padding_mask = torch.nn.functional.pad(attention_mask, (0, padding_length))
+    # For flex, we should not slice them, only use an offset
+    if _slice:
+        # Equivalent to: `local_padding_mask = attention_mask[:, kv_offset : kv_offset + kv_length]`,
+        # but without data-dependent slicing (i.e. torch.compile friendly)
+        mask_indices = torch.arange(kv_length, device=local_padding_mask.device)
+        mask_indices += kv_offset
+        local_padding_mask = local_padding_mask[:, mask_indices]
     return local_padding_mask


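The rewrite replaces the `if attention_mask is not None:` wrapper with an early `return None`, de-indenting the body without changing behaviour. The index-based gather it keeps is equivalent to the plain slice named in the comment, but avoids data-dependent slicing so it stays torch.compile friendly. A small self-contained check with assumed shapes (not part of the PR):

import torch

# Assumed shapes for illustration: batch of 2, padding mask over 10 kv positions.
attention_mask = torch.ones(2, 10, dtype=torch.bool)
kv_offset, kv_length = 3, 4

# Compile-friendly gather used by prepare_padding_mask ...
mask_indices = torch.arange(kv_length, device=attention_mask.device) + kv_offset
gathered = attention_mask[:, mask_indices]

# ... matches the data-dependent slice it stands in for.
assert torch.equal(gathered, attention_mask[:, kv_offset : kv_offset + kv_length])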