Commit acf5d39

cyyever authored and vijayabhaskar-ev committed
Fix typos in src and tests (huggingface#40845)
Signed-off-by: Yuanyuan Chen <[email protected]>
1 parent b37bf94 commit acf5d39

File tree

44 files changed: +116, -98 lines changed


src/transformers/generation/continuous_batching/cache.py

Lines changed: 2 additions & 2 deletions
@@ -79,7 +79,7 @@ class PagedAttentionCache:
     layer group, and the shape of the cache tensor is `[num_blocks * block_size, num_heads, head_size]`.

     Grouping layers into groups is useful because when we allocate one block to a group N, the block allocated is the
-    same for all layers in group N, equivalently it is allocated accross all cache tensors. This allows us to
+    same for all layers in group N, equivalently it is allocated across all cache tensors. This allows us to
     efficiently allocate and free blocks, and to efficiently read and write key and value states.

     For instance, imagine we have 8 blocks of cache and a model with two layer groups: a full-attention group with 3
@@ -349,7 +349,7 @@ class PagedAttentionMemoryHandler:
     The memory footprint consists of three main components:
     - Cache memory: the space needed to store the cache tensors:
         2 * layer_group_size * [num_pages, page_size] * cache_dtype
-    - Activation memory: the space temporarly taken by the largest activation during the model forward pass:
+    - Activation memory: the space temporarily taken by the largest activation during the model forward pass:
         peak_activation_per_token * max_tokens_per_batch * activation_dtype_size
     - Static tensors: the space taken by the input/output buffers and metadata tensors for batch processing, sum of:
         - inputs_ids + outputs_ids + position_ids + logits_indices: 4 * max_tokens_per_batch * int32_size
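
Aside: the cache component of the docstring corrected above can be sanity-checked with simple arithmetic. The sketch below is illustrative only; every size in it is a made-up assumption, not a value taken from the library.

# Rough cache-memory estimate for a paged KV cache; every number here is a
# hypothetical placeholder, not a value read from transformers.
num_pages = 1024        # blocks available in the cache
page_size = 256         # tokens stored per block
num_heads = 8           # key/value heads per layer
head_size = 128         # dimension of each head
layer_group_size = 24   # layers whose blocks are allocated together
dtype_size = 2          # bytes per element, e.g. float16

# The factor 2 accounts for keys and values, matching the docstring's formula.
cache_bytes = 2 * layer_group_size * num_pages * page_size * num_heads * head_size * dtype_size
print(f"~{cache_bytes / 1024**3:.1f} GiB")  # ~24.0 GiB with these placeholder sizes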

src/transformers/models/big_bird/modeling_big_bird.py

Lines changed: 4 additions & 4 deletions
@@ -1108,14 +1108,14 @@ def _get_single_block_row_attention(
         if block_id == to_end_block_id - 2:
             illegal_blocks.append(1)

-        selected_random_blokcs = []
+        selected_random_blocks = []

         for i in range(to_end_block_id - to_start_block_id):
             if perm_block[i] not in illegal_blocks:
-                selected_random_blokcs.append(perm_block[i])
-                if len(selected_random_blokcs) == num_rand_blocks:
+                selected_random_blocks.append(perm_block[i])
+                if len(selected_random_blocks) == num_rand_blocks:
                     break
-        return np.array(selected_random_blokcs, dtype=np.int32)
+        return np.array(selected_random_blocks, dtype=np.int32)


 # Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->BigBird
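
For context, the loop being renamed here walks a random permutation of block indices and keeps the first `num_rand_blocks` entries that are not in `illegal_blocks`. A standalone toy run of the same logic, with inputs invented purely for illustration, looks like this:

import numpy as np

# Toy inputs, not taken from the model: a permutation of 8 block indices and
# the blocks that must never be selected as random attention targets.
perm_block = np.random.permutation(8)
illegal_blocks = {0, 1, 7}
num_rand_blocks = 3

selected_random_blocks = []
for block in perm_block:
    if block not in illegal_blocks:
        selected_random_blocks.append(block)
    if len(selected_random_blocks) == num_rand_blocks:
        break

print(np.array(selected_random_blocks, dtype=np.int32))  # e.g. [5 2 4]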

src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py

Lines changed: 4 additions & 4 deletions
@@ -1086,14 +1086,14 @@ def _get_single_block_row_attention(
         if block_id == to_end_block_id - 2:
             illegal_blocks.append(1)

-        selected_random_blokcs = []
+        selected_random_blocks = []

         for i in range(to_end_block_id - to_start_block_id):
             if perm_block[i] not in illegal_blocks:
-                selected_random_blokcs.append(perm_block[i])
-                if len(selected_random_blokcs) == num_rand_blocks:
+                selected_random_blocks.append(perm_block[i])
+                if len(selected_random_blocks) == num_rand_blocks:
                     break
-        return np.array(selected_random_blokcs, dtype=np.int32)
+        return np.array(selected_random_blocks, dtype=np.int32)


 class BigBirdPegasusEncoderAttention(nn.Module):

src/transformers/models/cpmant/modeling_cpmant.py

Lines changed: 6 additions & 6 deletions
@@ -351,7 +351,7 @@ def forward(
         output_hidden_states: Optional[bool] = None,
         past_key_values: Optional[Cache] = None,
         use_cache: Optional[bool] = None,
-        cache_postion: Optional[torch.Tensor] = None,
+        cache_position: Optional[torch.Tensor] = None,
     ):
         """
         Args:
@@ -492,16 +492,16 @@ def _position_bucket(self, relative_position, num_buckets=32, max_distance=128):
         relative_position = torch.abs(relative_position)
         max_exact = num_buckets // 2
         is_small = relative_position < max_exact
-        relative_postion_if_large = max_exact + (
+        relative_position_if_large = max_exact + (
             torch.log(relative_position.float() / max_exact)
             / math.log(max_distance / max_exact)
             * (num_buckets - max_exact)
         ).to(torch.int32)
-        relative_postion_if_large = torch.min(
-            relative_postion_if_large,
-            torch.full_like(relative_postion_if_large, num_buckets - 1),
+        relative_position_if_large = torch.min(
+            relative_position_if_large,
+            torch.full_like(relative_position_if_large, num_buckets - 1),
         )
-        relative_buckets += torch.where(is_small, relative_position.to(torch.int32), relative_postion_if_large)
+        relative_buckets += torch.where(is_small, relative_position.to(torch.int32), relative_position_if_large)
         return relative_buckets

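For reference, the renamed `relative_position_if_large` implements T5-style log-spaced bucketing: distances below `max_exact` keep their exact bucket, larger ones are spread logarithmically over the remaining buckets. A minimal standalone check of just that formula (it ignores the sign/direction handling the model does earlier, and assumes the same defaults of 32 buckets and a max distance of 128):

import math
import torch

def log_bucket(relative_position, num_buckets=32, max_distance=128):
    # Mirrors the corrected lines: small distances keep their exact bucket,
    # large ones are mapped logarithmically into the remaining buckets.
    relative_position = torch.abs(relative_position)
    max_exact = num_buckets // 2
    is_small = relative_position < max_exact
    relative_position_if_large = max_exact + (
        torch.log(relative_position.float() / max_exact)
        / math.log(max_distance / max_exact)
        * (num_buckets - max_exact)
    ).to(torch.int32)
    relative_position_if_large = torch.min(
        relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
    )
    return torch.where(is_small, relative_position.to(torch.int32), relative_position_if_large)

print(log_bucket(torch.tensor([1, 15, 16, 64, 500])))  # tensor([ 1, 15, 16, 26, 31], dtype=torch.int32)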

src/transformers/models/gemma3/convert_gemma3_weights.py

Lines changed: 3 additions & 3 deletions
@@ -439,9 +439,9 @@ def convert_transformer_weights(
         decoder_block_start = path.find(_TRANSFORMER_DECODER_BLOCK)
         decoder_block_offset = decoder_block_start + _TRANSFORMER_DECODER_BLOCK_LEN
         decoder_block_path = path[decoder_block_offset:]
-        next_path_seperator_idx = decoder_block_path.find("/")
-        layer_idx = decoder_block_path[:next_path_seperator_idx]
-        decoder_block_path = decoder_block_path[next_path_seperator_idx:]
+        next_path_separator_idx = decoder_block_path.find("/")
+        layer_idx = decoder_block_path[:next_path_separator_idx]
+        decoder_block_path = decoder_block_path[next_path_separator_idx:]

         base_path = f"language_model.model.layers.{layer_idx}"

src/transformers/models/git/modeling_git.py

Lines changed: 2 additions & 2 deletions
@@ -950,7 +950,7 @@ def __init__(self, config):
         self.visual_projection = GitProjection(config)

         if config.num_image_with_embedding is not None:
-            self.img_temperal_embedding = nn.ParameterList(
+            self.img_temporal_embedding = nn.ParameterList(
                 nn.Parameter(torch.zeros(1, 1, config.vision_config.hidden_size))
                 for _ in range(config.num_image_with_embedding)
             )
@@ -1115,7 +1115,7 @@ def forward(
                 visual_features_frame = self.image_encoder(
                     pixel_values[:, frame_idx, :, :], interpolate_pos_encoding=interpolate_pos_encoding
                 ).last_hidden_state
-                visual_features_frame += self.img_temporal_embedding[frame_idx]
+                visual_features_frame += self.img_temporal_embedding[frame_idx]
                 visual_features.append(visual_features_frame)

             # finally, concatenate all features along sequence dimension
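
The renamed `img_temporal_embedding` is simply one learnable bias per video frame, added to that frame's visual features. Stripped of the surrounding model, the pattern looks roughly like the sketch below; the hidden size, frame count, and tensor shapes are invented for the example.

import torch
from torch import nn

num_frames, hidden_size = 6, 768  # illustrative values only

# One learnable (1, 1, hidden_size) parameter per frame, as in the fixed attribute.
img_temporal_embedding = nn.ParameterList(
    nn.Parameter(torch.zeros(1, 1, hidden_size)) for _ in range(num_frames)
)

frame_features = torch.randn(2, 197, hidden_size)  # (batch, patches, hidden) for one frame
frame_idx = 3
frame_features = frame_features + img_temporal_embedding[frame_idx]  # broadcast add
print(frame_features.shape)  # torch.Size([2, 197, 768])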

src/transformers/models/groupvit/modeling_groupvit.py

Lines changed: 2 additions & 2 deletions
@@ -74,7 +74,7 @@ def gumbel_softmax(logits: torch.Tensor, tau: float = 1, hard: bool = False, dim
         y_hard = torch.zeros_like(logits, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0)
         ret = y_hard - y_soft.detach() + y_soft
     else:
-        # Reparametrization trick.
+        # Reparameterization trick.
         ret = y_soft
     return ret

@@ -662,7 +662,7 @@ def forward(
         attn_weights = nn.functional.softmax(attn_weights, dim=-1)

         if output_attentions:
-            # this operation is a bit akward, but it's required to
+            # this operation is a bit awkward, but it's required to
             # make sure that attn_weights keeps its gradient.
             # In order to do so, attn_weights have to reshaped
             # twice and have to be reused in the following
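
The comment fixed in the first hunk refers to the straight-through behaviour of hard Gumbel-Softmax: the forward pass uses the one-hot `y_hard`, while gradients flow through the soft, reparameterized `y_soft`. PyTorch exposes the same behaviour in `torch.nn.functional.gumbel_softmax`, so a quick illustration (toy shapes and an arbitrary loss, not code from the model) is:

import torch
import torch.nn.functional as F

logits = torch.randn(2, 4, requires_grad=True)  # arbitrary toy shape

# hard=True returns one-hot samples in the forward pass but keeps the
# soft (reparameterized) probabilities in the backward pass.
y = F.gumbel_softmax(logits, tau=1.0, hard=True, dim=-1)
print(y)  # one-hot rows

loss = (y * torch.arange(4.0)).sum()  # arbitrary loss so the gradients are informative
loss.backward()
print(logits.grad)  # non-zero thanks to the straight-through estimator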

src/transformers/models/imagegpt/image_processing_imagegpt.py

Lines changed: 1 addition & 1 deletion
@@ -242,7 +242,7 @@ def preprocess(
             raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, or torch.Tensor")

         # Here, normalize() is using a constant factor to divide pixel values.
-        # hence, the method does not need iamge_mean and image_std.
+        # hence, the method does not need image_mean and image_std.
         validate_preprocess_arguments(
             do_resize=do_resize,
             size=size,

src/transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@
 # Similar to transformers.models.pix2struct.image_processing_pix2struct.torch_extract_patches but dealing with a batch of images directly.
 def torch_extract_patches(image_tensor, patch_height, patch_width):
     """
-    Utiliy function to extract patches from a given tensor representing a batch of images. Returns a tensor of shape
+    Utility function to extract patches from a given tensor representing a batch of images. Returns a tensor of shape
     (batch_size, `rows`, `columns`, `num_channels` x `patch_height` x `patch_width`).

     Args:
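
For orientation, this kind of patch extraction can be reproduced with `torch.Tensor.unfold`. The sketch below is a standalone approximation, not the library's implementation; it assumes image dimensions divisible by the patch size and produces the layout the docstring describes.

import torch

def extract_patches(images: torch.Tensor, patch_height: int, patch_width: int) -> torch.Tensor:
    # images: (batch, channels, height, width); dimensions assumed divisible by the patch size.
    batch, channels, height, width = images.shape
    patches = images.unfold(2, patch_height, patch_height).unfold(3, patch_width, patch_width)
    # patches: (batch, channels, rows, columns, patch_height, patch_width)
    patches = patches.permute(0, 2, 3, 1, 4, 5).contiguous()
    return patches.reshape(batch, height // patch_height, width // patch_width, -1)

out = extract_patches(torch.randn(2, 3, 64, 48), 16, 16)
print(out.shape)  # torch.Size([2, 4, 3, 768])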

src/transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py

Lines changed: 1 addition & 1 deletion
@@ -203,7 +203,7 @@ def __call__(
         if padding:
             padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask")

-        # now let's padd left and right
+        # now let's pad left and right
         pad_left = int(self.audio_silence_prefix_seconds * self.sampling_rate)
         pad_right = int((self.audio_delay_seconds + 1.0) * self.sampling_rate)
         padded_inputs["input_values"] = np.pad(
