4 changes: 2 additions & 2 deletions src/transformers/generation/continuous_batching/cache.py
@@ -79,7 +79,7 @@ class PagedAttentionCache:
layer group, and the shape of the cache tensor is `[num_blocks * block_size, num_heads, head_size]`.

Grouping layers into groups is useful because when we allocate one block to a group N, the block allocated is the
same for all layers in group N, equivalently it is allocated accross all cache tensors. This allows us to
same for all layers in group N, equivalently it is allocated across all cache tensors. This allows us to
efficiently allocate and free blocks, and to efficiently read and write key and value states.

For instance, imagine we have 8 blocks of cache and a model with two layer groups: a full-attention group with 3
@@ -349,7 +349,7 @@ class PagedAttentionMemoryHandler:
The memory footprint consists of three main components:
- Cache memory: the space needed to store the cache tensors:
2 * layer_group_size * [num_pages, page_size] * cache_dtype
- Activation memory: the space temporarly taken by the largest activation during the model forward pass:
- Activation memory: the space temporarily taken by the largest activation during the model forward pass:
peak_activation_per_token * max_tokens_per_batch * activation_dtype_size
- Static tensors: the space taken by the input/output buffers and metadata tensors for batch processing, sum of:
- inputs_ids + outputs_ids + position_ids + logits_indices: 4 * max_tokens_per_batch * int32_size
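For reference, the three components listed in the docstring above can be turned into a quick back-of-the-envelope estimate. The sketch below is illustrative only: the names and values are assumed placeholders, not the handler's actual accounting.

def estimate_memory_bytes(
    num_pages=1024,
    page_size=32 * 8 * 128,          # assumed block_size * num_heads * head_size
    layer_group_size=16,             # assumed number of layers sharing the cache tensors
    cache_dtype_size=2,              # bytes per element for fp16/bf16
    peak_activation_per_token=4096,  # assumed peak activation elements per token
    max_tokens_per_batch=256,
    activation_dtype_size=2,
    int32_size=4,
):
    # Cache memory: 2 (keys and values) * layer_group_size * num_pages * page_size * dtype size
    cache = 2 * layer_group_size * num_pages * page_size * cache_dtype_size
    # Activation memory: peak activation per token * max tokens per batch * dtype size
    activations = peak_activation_per_token * max_tokens_per_batch * activation_dtype_size
    # Static tensors: input_ids + output_ids + position_ids + logits_indices, all int32
    static = 4 * max_tokens_per_batch * int32_size
    return cache + activations + static

print(f"{estimate_memory_bytes() / 2**30:.2f} GiB")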
8 changes: 4 additions & 4 deletions src/transformers/models/big_bird/modeling_big_bird.py
@@ -1108,14 +1108,14 @@ def _get_single_block_row_attention(
if block_id == to_end_block_id - 2:
illegal_blocks.append(1)

selected_random_blokcs = []
selected_random_blocks = []

for i in range(to_end_block_id - to_start_block_id):
if perm_block[i] not in illegal_blocks:
selected_random_blokcs.append(perm_block[i])
if len(selected_random_blokcs) == num_rand_blocks:
selected_random_blocks.append(perm_block[i])
if len(selected_random_blocks) == num_rand_blocks:
break
return np.array(selected_random_blokcs, dtype=np.int32)
return np.array(selected_random_blocks, dtype=np.int32)


# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->BigBird
@@ -1086,14 +1086,14 @@ def _get_single_block_row_attention(
if block_id == to_end_block_id - 2:
illegal_blocks.append(1)

selected_random_blokcs = []
selected_random_blocks = []

for i in range(to_end_block_id - to_start_block_id):
if perm_block[i] not in illegal_blocks:
selected_random_blokcs.append(perm_block[i])
if len(selected_random_blokcs) == num_rand_blocks:
selected_random_blocks.append(perm_block[i])
if len(selected_random_blocks) == num_rand_blocks:
break
return np.array(selected_random_blokcs, dtype=np.int32)
return np.array(selected_random_blocks, dtype=np.int32)


class BigBirdPegasusEncoderAttention(nn.Module):
12 changes: 6 additions & 6 deletions src/transformers/models/cpmant/modeling_cpmant.py
@@ -351,7 +351,7 @@ def forward(
output_hidden_states: Optional[bool] = None,
past_key_values: Optional[Cache] = None,
use_cache: Optional[bool] = None,
cache_postion: Optional[torch.Tensor] = None,
cache_position: Optional[torch.Tensor] = None,
):
"""
Args:
@@ -492,16 +492,16 @@ def _position_bucket(self, relative_position, num_buckets=32, max_distance=128):
relative_position = torch.abs(relative_position)
max_exact = num_buckets // 2
is_small = relative_position < max_exact
relative_postion_if_large = max_exact + (
relative_position_if_large = max_exact + (
torch.log(relative_position.float() / max_exact)
/ math.log(max_distance / max_exact)
* (num_buckets - max_exact)
).to(torch.int32)
relative_postion_if_large = torch.min(
relative_postion_if_large,
torch.full_like(relative_postion_if_large, num_buckets - 1),
relative_position_if_large = torch.min(
relative_position_if_large,
torch.full_like(relative_position_if_large, num_buckets - 1),
)
relative_buckets += torch.where(is_small, relative_position.to(torch.int32), relative_postion_if_large)
relative_buckets += torch.where(is_small, relative_position.to(torch.int32), relative_position_if_large)
return relative_buckets


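The bucketing logic touched in the hunk above is easier to follow in scalar form. Below is a standalone sketch of the log-spaced branch, assumed to mirror the tensor code for a single non-negative relative position (the bidirectional offset handling is left out):

import math

def position_bucket(relative_position, num_buckets=32, max_distance=128):
    relative_position = abs(relative_position)
    max_exact = num_buckets // 2
    if relative_position < max_exact:
        # Small distances each get their own bucket.
        return relative_position
    # Larger distances share logarithmically wider buckets, capped at the last bucket.
    bucket = max_exact + int(
        math.log(relative_position / max_exact)
        / math.log(max_distance / max_exact)
        * (num_buckets - max_exact)
    )
    return min(bucket, num_buckets - 1)

# position_bucket(8) -> 8, position_bucket(100) -> 30, position_bucket(10_000) -> 31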
6 changes: 3 additions & 3 deletions src/transformers/models/gemma3/convert_gemma3_weights.py
@@ -439,9 +439,9 @@ def convert_transformer_weights(
decoder_block_start = path.find(_TRANSFORMER_DECODER_BLOCK)
decoder_block_offset = decoder_block_start + _TRANSFORMER_DECODER_BLOCK_LEN
decoder_block_path = path[decoder_block_offset:]
next_path_seperator_idx = decoder_block_path.find("/")
layer_idx = decoder_block_path[:next_path_seperator_idx]
decoder_block_path = decoder_block_path[next_path_seperator_idx:]
next_path_separator_idx = decoder_block_path.find("/")
layer_idx = decoder_block_path[:next_path_separator_idx]
decoder_block_path = decoder_block_path[next_path_separator_idx:]

base_path = f"language_model.model.layers.{layer_idx}"

4 changes: 2 additions & 2 deletions src/transformers/models/git/modeling_git.py
@@ -950,7 +950,7 @@ def __init__(self, config):
self.visual_projection = GitProjection(config)

if config.num_image_with_embedding is not None:
self.img_temperal_embedding = nn.ParameterList(
self.img_temporal_embedding = nn.ParameterList(
nn.Parameter(torch.zeros(1, 1, config.vision_config.hidden_size))
for _ in range(config.num_image_with_embedding)
)
@@ -1115,7 +1115,7 @@ def forward(
visual_features_frame = self.image_encoder(
pixel_values[:, frame_idx, :, :], interpolate_pos_encoding=interpolate_pos_encoding
).last_hidden_state
visual_features_frame += self.img_temperal_embedding[frame_idx]
visual_features_frame += self.img_temporal_embedding[frame_idx]
visual_features.append(visual_features_frame)

# finally, concatenate all features along sequence dimension
4 changes: 2 additions & 2 deletions src/transformers/models/groupvit/modeling_groupvit.py
@@ -74,7 +74,7 @@ def gumbel_softmax(logits: torch.Tensor, tau: float = 1, hard: bool = False, dim
y_hard = torch.zeros_like(logits, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0)
ret = y_hard - y_soft.detach() + y_soft
else:
# Reparametrization trick.
# Reparameterization trick.
ret = y_soft
return ret
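A quick way to see the straight-through behaviour documented in the comment above is with PyTorch's built-in gumbel_softmax, which uses the same trick; this is only an illustration of the gradient path, not the module's own implementation:

import torch
import torch.nn.functional as F

logits = torch.randn(2, 5, requires_grad=True)
y_hard = F.gumbel_softmax(logits, tau=1.0, hard=True)  # forward value is one-hot
loss = (y_hard * torch.arange(5.0)).sum()              # any loss that depends on the picked slot
loss.backward()

print(y_hard)       # one-hot rows
print(logits.grad)  # non-zero: the gradient flows through the soft sample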

@@ -662,7 +662,7 @@ def forward(
attn_weights = nn.functional.softmax(attn_weights, dim=-1)

if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# twice and have to be reused in the following
@@ -242,7 +242,7 @@ def preprocess(
raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, or torch.Tensor")

# Here, normalize() is using a constant factor to divide pixel values.
# hence, the method does not need iamge_mean and image_std.
# hence, the method does not need image_mean and image_std.
validate_preprocess_arguments(
do_resize=do_resize,
size=size,
@@ -34,7 +34,7 @@
# Similar to transformers.models.pix2struct.image_processing_pix2struct.torch_extract_patches but dealing with a batch of images directly.
def torch_extract_patches(image_tensor, patch_height, patch_width):
"""
Utiliy function to extract patches from a given tensor representing a batch of images. Returns a tensor of shape
Utility function to extract patches from a given tensor representing a batch of images. Returns a tensor of shape
(batch_size, `rows`, `columns`, `num_channels` x `patch_height` x `patch_width`).

Args:
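To make the shape contract in the docstring above concrete, a rough equivalent can be written with tensor unfolding. This is a sketch assuming the input is (batch_size, num_channels, height, width) and the patch sizes divide the image evenly; the real helper may differ in details:

import torch

def extract_patches_sketch(images: torch.Tensor, patch_height: int, patch_width: int) -> torch.Tensor:
    batch_size, num_channels, height, width = images.shape
    # Slide non-overlapping windows over the height and width dimensions.
    patches = images.unfold(2, patch_height, patch_height).unfold(3, patch_width, patch_width)
    # (batch, channels, rows, columns, patch_height, patch_width) -> flatten each patch.
    patches = patches.permute(0, 2, 3, 1, 4, 5).contiguous()
    rows, columns = height // patch_height, width // patch_width
    return patches.reshape(batch_size, rows, columns, num_channels * patch_height * patch_width)

# extract_patches_sketch(torch.randn(2, 3, 224, 224), 16, 16).shape -> (2, 14, 14, 768)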
@@ -203,7 +203,7 @@ def __call__(
if padding:
padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask")

# now let's padd left and right
# now let's pad left and right
pad_left = int(self.audio_silence_prefix_seconds * self.sampling_rate)
pad_right = int((self.audio_delay_seconds + 1.0) * self.sampling_rate)
padded_inputs["input_values"] = np.pad(
@@ -1078,7 +1078,7 @@ def __init__(self, config):
self.codec_model = AutoModel.from_config(config.codec_config)

# we are in an edge case where for the codec_model self.can_generate is False, setting self.codec_model.generation_config to None
# yet the codec_model needs a generation config to initalize it's cache for streaming inference
# yet the codec_model needs a generation config to initialize it's cache for streaming inference
# we therefore initialize a generation config for the codec model
self.codec_model.generation_config = GenerationConfig.from_model_config(config.codec_config)

@@ -182,7 +182,7 @@ def __call__(
if padding:
padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask")

# now let's padd left and right
# now let's pad left and right
pad_left = int(self.audio_silence_prefix_seconds * self.sampling_rate)
pad_right = int((self.audio_delay_seconds + 1.0) * self.sampling_rate)
padded_inputs["input_values"] = np.pad(
@@ -258,7 +258,7 @@ def __init__(self, config):
self.codec_model = AutoModel.from_config(config.codec_config)

# we are in an edge case where for the codec_model self.can_generate is False, setting self.codec_model.generation_config to None
# yet the codec_model needs a generation config to initalize it's cache for streaming inference
# yet the codec_model needs a generation config to initialize it's cache for streaming inference
# we therefore initialize a generation config for the codec model
self.codec_model.generation_config = GenerationConfig.from_model_config(config.codec_config)

4 changes: 2 additions & 2 deletions src/transformers/models/oneformer/modeling_oneformer.py
@@ -2882,7 +2882,7 @@ def forward(
Task inputs. Task inputs can be obtained using [`AutoImageProcessor`]. See [`OneFormerProcessor.__call__`]
for details.
text_inputs (`list[torch.Tensor]`, *optional*):
Tensor fof shape `(num_queries, sequence_length)` to be fed to a model
Tensor of shape `(num_queries, sequence_length)` to be fed to a model

Example:

@@ -3068,7 +3068,7 @@ def forward(
Task inputs. Task inputs can be obtained using [`AutoImageProcessor`]. See [`OneFormerProcessor.__call__`]
for details.
text_inputs (`list[torch.Tensor]`, *optional*):
Tensor fof shape `(num_queries, sequence_length)` to be fed to a model
Tensor of shape `(num_queries, sequence_length)` to be fed to a model
mask_labels (`list[torch.Tensor]`, *optional*):
List of mask labels of shape `(num_labels, height, width)` to be fed to a model
class_labels (`list[torch.LongTensor]`, *optional*):
@@ -190,7 +190,7 @@ def _fit_image_to_canvas(self, img_width: int, img_height: int, tile_size: int):
target_width=n_w * tile_size,
target_height=n_h * tile_size,
)
# Llama3V dynamic tiling. Priortize biggest canvas.
# Llama3V dynamic tiling. Prioritize biggest canvas.
if (scale < 1.0 and (image_width_height[0] >= optimal_image_width_height[0])) or (
scale >= 1.0 and (image_width_height[1] >= optimal_image_width_height[1])
):
@@ -137,7 +137,7 @@ class Phi4MultimodalAudioConfig(PretrainedConfig):
The dropout ratio.
ext_pw_out_channel (`int`, *optional*, defaults to 1024):
Number of out channels in the point-wise conv modules.
depthwise_seperable_out_channel (`int`, *optional*, defaults to 1024):
depthwise_separable_out_channel (`int`, *optional*, defaults to 1024):
Number of out channels in the depth-wise separable conv modules.
depthwise_multiplier (`int`, *optional*, defaults to 1):
Input size multiplier for the depth-wise separable conv modules.
@@ -190,7 +190,7 @@ def __init__(
left_chunk: int = 18,
dropout_rate: float = 0.0,
ext_pw_out_channel: int = 1024,
depthwise_seperable_out_channel: int = 1024,
depthwise_separable_out_channel: int = 1024,
depthwise_multiplier: int = 1,
kernel_size: int = 3,
conv_activation: str = "swish",
@@ -217,7 +217,7 @@ def __init__(
self.num_blocks = num_blocks
self.dropout_rate = dropout_rate
self.ext_pw_out_channel = ext_pw_out_channel
self.depthwise_seperable_out_channel = depthwise_seperable_out_channel
self.depthwise_separable_out_channel = depthwise_separable_out_channel
self.depthwise_multiplier = depthwise_multiplier
self.kernel_size = kernel_size
self.conv_activation = conv_activation
@@ -746,7 +746,7 @@ def forward(
return attn_output


class Phi4MultimodalAudioDepthWiseSeperableConv1d(nn.Module):
class Phi4MultimodalAudioDepthWiseSeparableConv1d(nn.Module):
def __init__(self, config: Phi4MultimodalAudioConfig, padding: int = 0):
super().__init__()
self.dw_conv = nn.Conv1d(
@@ -758,7 +758,7 @@ def __init__(self, config: Phi4MultimodalAudioConfig, padding: int = 0):
groups=config.hidden_size,
)
self.pw_conv = nn.Conv1d(
config.hidden_size * config.depthwise_multiplier, config.depthwise_seperable_out_channel, 1, 1, 0
config.hidden_size * config.depthwise_multiplier, config.depthwise_separable_out_channel, 1, 1, 0
)

def forward(self, hidden_states):
@@ -794,7 +794,7 @@ def __init__(self, config: Phi4MultimodalAudioConfig):

self.layer_norm = nn.LayerNorm(config.hidden_size)
self.glu = Phi4MultimodalAudioGluPointWiseConv(config)
self.dw_sep_conv_1d = Phi4MultimodalAudioDepthWiseSeperableConv1d(config, padding=config.kernel_size - 1)
self.dw_sep_conv_1d = Phi4MultimodalAudioDepthWiseSeparableConv1d(config, padding=config.kernel_size - 1)
self.act = ACT2FN[config.conv_activation]
self.ext_pw_conv_1d = nn.Conv1d(config.hidden_size, config.ext_pw_out_channel, kernel_size=1, stride=1)
self.dropout = nn.Dropout(config.dropout_rate)
@@ -174,7 +174,7 @@ class Phi4MultimodalAudioConfig(PretrainedConfig):
The dropout ratio.
ext_pw_out_channel (`int`, *optional*, defaults to 1024):
Number of out channels in the point-wise conv modules.
depthwise_seperable_out_channel (`int`, *optional*, defaults to 1024):
depthwise_separable_out_channel (`int`, *optional*, defaults to 1024):
Number of out channels in the depth-wise separable conv modules.
depthwise_multiplier (`int`, *optional*, defaults to 1):
Input size multiplier for the depth-wise separable conv modules.
@@ -227,7 +227,7 @@ def __init__(
left_chunk: int = 18,
dropout_rate: float = 0.0,
ext_pw_out_channel: int = 1024,
depthwise_seperable_out_channel: int = 1024,
depthwise_separable_out_channel: int = 1024,
depthwise_multiplier: int = 1,
kernel_size: int = 3,
conv_activation: str = "swish",
@@ -254,7 +254,7 @@ def __init__(
self.num_blocks = num_blocks
self.dropout_rate = dropout_rate
self.ext_pw_out_channel = ext_pw_out_channel
self.depthwise_seperable_out_channel = depthwise_seperable_out_channel
self.depthwise_separable_out_channel = depthwise_separable_out_channel
self.depthwise_multiplier = depthwise_multiplier
self.kernel_size = kernel_size
self.conv_activation = conv_activation
@@ -930,7 +930,7 @@ def forward(
return attn_output


class Phi4MultimodalAudioDepthWiseSeperableConv1d(nn.Module):
class Phi4MultimodalAudioDepthWiseSeparableConv1d(nn.Module):
def __init__(self, config: Phi4MultimodalAudioConfig, padding: int = 0):
super().__init__()
self.dw_conv = nn.Conv1d(
@@ -942,7 +942,7 @@ def __init__(self, config: Phi4MultimodalAudioConfig, padding: int = 0):
groups=config.hidden_size,
)
self.pw_conv = nn.Conv1d(
config.hidden_size * config.depthwise_multiplier, config.depthwise_seperable_out_channel, 1, 1, 0
config.hidden_size * config.depthwise_multiplier, config.depthwise_separable_out_channel, 1, 1, 0
)

def forward(self, hidden_states):
@@ -978,7 +978,7 @@ def __init__(self, config: Phi4MultimodalAudioConfig):

self.layer_norm = nn.LayerNorm(config.hidden_size)
self.glu = Phi4MultimodalAudioGluPointWiseConv(config)
self.dw_sep_conv_1d = Phi4MultimodalAudioDepthWiseSeperableConv1d(config, padding=config.kernel_size - 1)
self.dw_sep_conv_1d = Phi4MultimodalAudioDepthWiseSeparableConv1d(config, padding=config.kernel_size - 1)
self.act = ACT2FN[config.conv_activation]
self.ext_pw_conv_1d = nn.Conv1d(config.hidden_size, config.ext_pw_out_channel, kernel_size=1, stride=1)
self.dropout = nn.Dropout(config.dropout_rate)
4 changes: 2 additions & 2 deletions src/transformers/models/rwkv/convert_rwkv_checkpoint_to_hf.py
@@ -36,7 +36,7 @@
"14B": 40,
}

HIDEN_SIZE_MAPPING = {
HIDDEN_SIZE_MAPPING = {
"169M": 768,
"430M": 1024,
"1B5": 2048,
@@ -106,7 +106,7 @@ def convert_rmkv_checkpoint_to_hf_format(
config = RwkvConfig(
vocab_size=vocab_size,
num_hidden_layers=NUM_HIDDEN_LAYERS_MAPPING[size],
hidden_size=HIDEN_SIZE_MAPPING[size],
hidden_size=HIDDEN_SIZE_MAPPING[size],
)
config.save_pretrained(output_dir)

6 changes: 3 additions & 3 deletions src/transformers/models/seamless_m4t/modeling_seamless_m4t.py
@@ -2187,7 +2187,7 @@ def __init__(self, config):
kernel_size=kernel_size,
padding=(kernel_size - 1) // 2,
)
self.activation_fuction = nn.ReLU()
self.activation_function = nn.ReLU()
self.ln1 = nn.LayerNorm(embed_dim)
self.dropout_module = nn.Dropout(p=var_pred_dropout)
self.conv2 = nn.Conv1d(
@@ -2202,10 +2202,10 @@ def forward(self, hidden_states: Tensor) -> Tensor:
def forward(self, hidden_states: Tensor) -> Tensor:
# Input: B x T x C; Output: B x T
hidden_states = self.conv1(hidden_states.transpose(1, 2))
hidden_states = self.activation_fuction(hidden_states).transpose(1, 2)
hidden_states = self.activation_function(hidden_states).transpose(1, 2)
hidden_states = self.dropout_module(self.ln1(hidden_states))
hidden_states = self.conv2(hidden_states.transpose(1, 2))
hidden_states = self.activation_fuction(hidden_states).transpose(1, 2)
hidden_states = self.activation_function(hidden_states).transpose(1, 2)
hidden_states = self.dropout_module(self.ln2(hidden_states))
return self.proj(hidden_states).squeeze(dim=2)
