4 changes: 2 additions & 2 deletions src/transformers/generation/continuous_batching/cache.py
@@ -79,7 +79,7 @@ class PagedAttentionCache:
layer group, and the shape of the cache tensor is `[num_blocks * block_size, num_heads, head_size]`.

Grouping layers into groups is useful because when we allocate one block to a group N, the block allocated is the
same for all layers in group N, equivalently it is allocated accross all cache tensors. This allows us to
same for all layers in group N, equivalently it is allocated across all cache tensors. This allows us to
efficiently allocate and free blocks, and to efficiently read and write key and value states.

For instance, imagine we have 8 blocks of cache and a model with two layer groups: a full-attention group with 3
@@ -349,7 +349,7 @@ class PagedAttentionMemoryHandler:
The memory footprint consists of three main components:
- Cache memory: the space needed to store the cache tensors:
2 * layer_group_size * [num_pages, page_size] * cache_dtype
- Activation memory: the space temporarly taken by the largest activation during the model forward pass:
- Activation memory: the space temporarily taken by the largest activation during the model forward pass:
peak_activation_per_token * max_tokens_per_batch * activation_dtype_size
- Static tensors: the space taken by the input/output buffers and metadata tensors for batch processing, sum of:
- inputs_ids + outputs_ids + position_ids + logits_indices: 4 * max_tokens_per_batch * int32_size
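For reference, the three components listed in the docstring above can be turned into a quick back-of-the-envelope estimate. The sketch below is illustrative only: the names and values are assumed placeholders, not the handler's actual accounting.

def estimate_memory_bytes(
    num_pages=1024,
    page_size=32 * 8 * 128,          # assumed block_size * num_heads * head_size
    layer_group_size=16,             # assumed number of layers sharing the cache tensors
    cache_dtype_size=2,              # bytes per element for fp16/bf16
    peak_activation_per_token=4096,  # assumed peak activation elements per token
    max_tokens_per_batch=256,
    activation_dtype_size=2,
    int32_size=4,
):
    # Cache memory: 2 (keys and values) * layer_group_size * num_pages * page_size * dtype size
    cache = 2 * layer_group_size * num_pages * page_size * cache_dtype_size
    # Activation memory: peak activation per token * max tokens per batch * dtype size
    activations = peak_activation_per_token * max_tokens_per_batch * activation_dtype_size
    # Static tensors: input_ids + output_ids + position_ids + logits_indices, all int32
    static = 4 * max_tokens_per_batch * int32_size
    return cache + activations + static

print(f"{estimate_memory_bytes() / 2**30:.2f} GiB")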
8 changes: 4 additions & 4 deletions src/transformers/models/big_bird/modeling_big_bird.py
@@ -1108,14 +1108,14 @@ def _get_single_block_row_attention(
if block_id == to_end_block_id - 2:
illegal_blocks.append(1)

selected_random_blokcs = []
selected_random_blocks = []

for i in range(to_end_block_id - to_start_block_id):
if perm_block[i] not in illegal_blocks:
selected_random_blokcs.append(perm_block[i])
if len(selected_random_blokcs) == num_rand_blocks:
selected_random_blocks.append(perm_block[i])
if len(selected_random_blocks) == num_rand_blocks:
break
return np.array(selected_random_blokcs, dtype=np.int32)
return np.array(selected_random_blocks, dtype=np.int32)


# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->BigBird
@@ -1086,14 +1086,14 @@ def _get_single_block_row_attention(
if block_id == to_end_block_id - 2:
illegal_blocks.append(1)

selected_random_blokcs = []
selected_random_blocks = []

for i in range(to_end_block_id - to_start_block_id):
if perm_block[i] not in illegal_blocks:
selected_random_blokcs.append(perm_block[i])
if len(selected_random_blokcs) == num_rand_blocks:
selected_random_blocks.append(perm_block[i])
if len(selected_random_blocks) == num_rand_blocks:
break
return np.array(selected_random_blokcs, dtype=np.int32)
return np.array(selected_random_blocks, dtype=np.int32)


class BigBirdPegasusEncoderAttention(nn.Module):
12 changes: 6 additions & 6 deletions src/transformers/models/cpmant/modeling_cpmant.py
@@ -351,7 +351,7 @@ def forward(
output_hidden_states: Optional[bool] = None,
past_key_values: Optional[Cache] = None,
use_cache: Optional[bool] = None,
cache_postion: Optional[torch.Tensor] = None,
cache_position: Optional[torch.Tensor] = None,
):
"""
Args:
@@ -492,16 +492,16 @@ def _position_bucket(self, relative_position, num_buckets=32, max_distance=128):
relative_position = torch.abs(relative_position)
max_exact = num_buckets // 2
is_small = relative_position < max_exact
relative_postion_if_large = max_exact + (
relative_position_if_large = max_exact + (
torch.log(relative_position.float() / max_exact)
/ math.log(max_distance / max_exact)
* (num_buckets - max_exact)
).to(torch.int32)
relative_postion_if_large = torch.min(
relative_postion_if_large,
torch.full_like(relative_postion_if_large, num_buckets - 1),
relative_position_if_large = torch.min(
relative_position_if_large,
torch.full_like(relative_position_if_large, num_buckets - 1),
)
relative_buckets += torch.where(is_small, relative_position.to(torch.int32), relative_postion_if_large)
relative_buckets += torch.where(is_small, relative_position.to(torch.int32), relative_position_if_large)
return relative_buckets


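The bucketing logic touched in the hunk above is easier to follow in scalar form. Below is a standalone sketch of the log-spaced branch, assumed to mirror the tensor code for a single non-negative relative position (the bidirectional offset handling is left out):

import math

def position_bucket(relative_position, num_buckets=32, max_distance=128):
    relative_position = abs(relative_position)
    max_exact = num_buckets // 2
    if relative_position < max_exact:
        # Small distances each get their own bucket.
        return relative_position
    # Larger distances share logarithmically wider buckets, capped at the last bucket.
    bucket = max_exact + int(
        math.log(relative_position / max_exact)
        / math.log(max_distance / max_exact)
        * (num_buckets - max_exact)
    )
    return min(bucket, num_buckets - 1)

# position_bucket(8) -> 8, position_bucket(100) -> 30, position_bucket(10_000) -> 31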
6 changes: 3 additions & 3 deletions src/transformers/models/gemma3/convert_gemma3_weights.py
@@ -439,9 +439,9 @@ def convert_transformer_weights(
decoder_block_start = path.find(_TRANSFORMER_DECODER_BLOCK)
decoder_block_offset = decoder_block_start + _TRANSFORMER_DECODER_BLOCK_LEN
decoder_block_path = path[decoder_block_offset:]
next_path_seperator_idx = decoder_block_path.find("/")
layer_idx = decoder_block_path[:next_path_seperator_idx]
decoder_block_path = decoder_block_path[next_path_seperator_idx:]
next_path_separator_idx = decoder_block_path.find("/")
layer_idx = decoder_block_path[:next_path_separator_idx]
decoder_block_path = decoder_block_path[next_path_separator_idx:]

base_path = f"language_model.model.layers.{layer_idx}"

4 changes: 2 additions & 2 deletions src/transformers/models/git/modeling_git.py
@@ -950,7 +950,7 @@ def __init__(self, config):
self.visual_projection = GitProjection(config)

if config.num_image_with_embedding is not None:
self.img_temperal_embedding = nn.ParameterList(
self.img_temporal_embedding = nn.ParameterList(
nn.Parameter(torch.zeros(1, 1, config.vision_config.hidden_size))
for _ in range(config.num_image_with_embedding)
)
@@ -1115,7 +1115,7 @@ def forward(
visual_features_frame = self.image_encoder(
pixel_values[:, frame_idx, :, :], interpolate_pos_encoding=interpolate_pos_encoding
).last_hidden_state
visual_features_frame += self.img_temperal_embedding[frame_idx]
visual_features_frame += self.img_temporal_embedding[frame_idx]
visual_features.append(visual_features_frame)

# finally, concatenate all features along sequence dimension
4 changes: 2 additions & 2 deletions src/transformers/models/groupvit/modeling_groupvit.py
@@ -74,7 +74,7 @@ def gumbel_softmax(logits: torch.Tensor, tau: float = 1, hard: bool = False, dim
y_hard = torch.zeros_like(logits, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0)
ret = y_hard - y_soft.detach() + y_soft
else:
# Reparametrization trick.
# Reparameterization trick.
ret = y_soft
return ret
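A quick way to see the straight-through behaviour documented in the comment above is with PyTorch's built-in gumbel_softmax, which uses the same trick; this is only an illustration of the gradient path, not the module's own implementation:

import torch
import torch.nn.functional as F

logits = torch.randn(2, 5, requires_grad=True)
y_hard = F.gumbel_softmax(logits, tau=1.0, hard=True)  # forward value is one-hot
loss = (y_hard * torch.arange(5.0)).sum()              # any loss that depends on the picked slot
loss.backward()

print(y_hard)       # one-hot rows
print(logits.grad)  # non-zero: the gradient flows through the soft sample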

@@ -662,7 +662,7 @@ def forward(
attn_weights = nn.functional.softmax(attn_weights, dim=-1)

if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# twice and have to be reused in the following
@@ -242,7 +242,7 @@ def preprocess(
raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, or torch.Tensor")

# Here, normalize() is using a constant factor to divide pixel values.
# hence, the method does not need iamge_mean and image_std.
# hence, the method does not need image_mean and image_std.
validate_preprocess_arguments(
do_resize=do_resize,
size=size,
@@ -34,7 +34,7 @@
# Similar to transformers.models.pix2struct.image_processing_pix2struct.torch_extract_patches but dealing with a batch of images directly.
def torch_extract_patches(image_tensor, patch_height, patch_width):
"""
Utiliy function to extract patches from a given tensor representing a batch of images. Returns a tensor of shape
Utility function to extract patches from a given tensor representing a batch of images. Returns a tensor of shape
(batch_size, `rows`, `columns`, `num_channels` x `patch_height` x `patch_width`).

Args:
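To make the shape contract in the docstring above concrete, a rough equivalent can be written with tensor unfolding. This is a sketch assuming the input is (batch_size, num_channels, height, width) and the patch sizes divide the image evenly; the real helper may differ in details:

import torch

def extract_patches_sketch(images: torch.Tensor, patch_height: int, patch_width: int) -> torch.Tensor:
    batch_size, num_channels, height, width = images.shape
    # Slide non-overlapping windows over the height and width dimensions.
    patches = images.unfold(2, patch_height, patch_height).unfold(3, patch_width, patch_width)
    # (batch, channels, rows, columns, patch_height, patch_width) -> flatten each patch.
    patches = patches.permute(0, 2, 3, 1, 4, 5).contiguous()
    rows, columns = height // patch_height, width // patch_width
    return patches.reshape(batch_size, rows, columns, num_channels * patch_height * patch_width)

# extract_patches_sketch(torch.randn(2, 3, 224, 224), 16, 16).shape -> (2, 14, 14, 768)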
@@ -203,7 +203,7 @@ def __call__(
if padding:
padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask")

# now let's padd left and right
# now let's pad left and right
pad_left = int(self.audio_silence_prefix_seconds * self.sampling_rate)
pad_right = int((self.audio_delay_seconds + 1.0) * self.sampling_rate)
padded_inputs["input_values"] = np.pad(
@@ -1078,7 +1078,7 @@ def __init__(self, config):
self.codec_model = AutoModel.from_config(config.codec_config)

# we are in an edge case where for the codec_model self.can_generate is False, setting self.codec_model.generation_config to None
# yet the codec_model needs a generation config to initalize it's cache for streaming inference
# yet the codec_model needs a generation config to initialize it's cache for streaming inference
# we therefore initialize a generation config for the codec model
self.codec_model.generation_config = GenerationConfig.from_model_config(config.codec_config)

@@ -182,7 +182,7 @@ def __call__(
if padding:
padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask")

# now let's padd left and right
# now let's pad left and right
pad_left = int(self.audio_silence_prefix_seconds * self.sampling_rate)
pad_right = int((self.audio_delay_seconds + 1.0) * self.sampling_rate)
padded_inputs["input_values"] = np.pad(
@@ -258,7 +258,7 @@ def __init__(self, config):
self.codec_model = AutoModel.from_config(config.codec_config)

# we are in an edge case where for the codec_model self.can_generate is False, setting self.codec_model.generation_config to None
# yet the codec_model needs a generation config to initalize it's cache for streaming inference
# yet the codec_model needs a generation config to initialize it's cache for streaming inference
# we therefore initialize a generation config for the codec model
self.codec_model.generation_config = GenerationConfig.from_model_config(config.codec_config)

4 changes: 2 additions & 2 deletions src/transformers/models/oneformer/modeling_oneformer.py
@@ -2882,7 +2882,7 @@ def forward(
Task inputs. Task inputs can be obtained using [`AutoImageProcessor`]. See [`OneFormerProcessor.__call__`]
for details.
text_inputs (`list[torch.Tensor]`, *optional*):
Tensor fof shape `(num_queries, sequence_length)` to be fed to a model
Tensor of shape `(num_queries, sequence_length)` to be fed to a model

Example:

@@ -3068,7 +3068,7 @@ def forward(
Task inputs. Task inputs can be obtained using [`AutoImageProcessor`]. See [`OneFormerProcessor.__call__`]
for details.
text_inputs (`list[torch.Tensor]`, *optional*):
Tensor fof shape `(num_queries, sequence_length)` to be fed to a model
Tensor of shape `(num_queries, sequence_length)` to be fed to a model
mask_labels (`list[torch.Tensor]`, *optional*):
List of mask labels of shape `(num_labels, height, width)` to be fed to a model
class_labels (`list[torch.LongTensor]`, *optional*):
@@ -190,7 +190,7 @@ def _fit_image_to_canvas(self, img_width: int, img_height: int, tile_size: int):
target_width=n_w * tile_size,
target_height=n_h * tile_size,
)
# Llama3V dynamic tiling. Priortize biggest canvas.
# Llama3V dynamic tiling. Prioritize biggest canvas.
if (scale < 1.0 and (image_width_height[0] >= optimal_image_width_height[0])) or (
scale >= 1.0 and (image_width_height[1] >= optimal_image_width_height[1])
):
@@ -137,7 +137,7 @@ class Phi4MultimodalAudioConfig(PretrainedConfig):
The dropout ratio.
ext_pw_out_channel (`int`, *optional*, defaults to 1024):
Number of out channels in the point-wise conv modules.
depthwise_seperable_out_channel (`int`, *optional*, defaults to 1024):
depthwise_separable_out_channel (`int`, *optional*, defaults to 1024):
Number of out channels in the depth-wise separable conv modules.
depthwise_multiplier (`int`, *optional*, defaults to 1):
Input size multiplier for the depth-wise separable conv modules.
@@ -190,7 +190,7 @@ def __init__(
left_chunk: int = 18,
dropout_rate: float = 0.0,
ext_pw_out_channel: int = 1024,
depthwise_seperable_out_channel: int = 1024,
depthwise_separable_out_channel: int = 1024,
depthwise_multiplier: int = 1,
kernel_size: int = 3,
conv_activation: str = "swish",
@@ -217,7 +217,7 @@ def __init__(
self.num_blocks = num_blocks
self.dropout_rate = dropout_rate
self.ext_pw_out_channel = ext_pw_out_channel
self.depthwise_seperable_out_channel = depthwise_seperable_out_channel
self.depthwise_separable_out_channel = depthwise_separable_out_channel
self.depthwise_multiplier = depthwise_multiplier
self.kernel_size = kernel_size
self.conv_activation = conv_activation
@@ -746,7 +746,7 @@ def forward(
return attn_output


class Phi4MultimodalAudioDepthWiseSeperableConv1d(nn.Module):
class Phi4MultimodalAudioDepthWiseSeparableConv1d(nn.Module):
def __init__(self, config: Phi4MultimodalAudioConfig, padding: int = 0):
super().__init__()
self.dw_conv = nn.Conv1d(
@@ -758,7 +758,7 @@ def __init__(self, config: Phi4MultimodalAudioConfig, padding: int = 0):
groups=config.hidden_size,
)
self.pw_conv = nn.Conv1d(
config.hidden_size * config.depthwise_multiplier, config.depthwise_seperable_out_channel, 1, 1, 0
config.hidden_size * config.depthwise_multiplier, config.depthwise_separable_out_channel, 1, 1, 0
)

def forward(self, hidden_states):
@@ -794,7 +794,7 @@ def __init__(self, config: Phi4MultimodalAudioConfig):

self.layer_norm = nn.LayerNorm(config.hidden_size)
self.glu = Phi4MultimodalAudioGluPointWiseConv(config)
self.dw_sep_conv_1d = Phi4MultimodalAudioDepthWiseSeperableConv1d(config, padding=config.kernel_size - 1)
self.dw_sep_conv_1d = Phi4MultimodalAudioDepthWiseSeparableConv1d(config, padding=config.kernel_size - 1)
self.act = ACT2FN[config.conv_activation]
self.ext_pw_conv_1d = nn.Conv1d(config.hidden_size, config.ext_pw_out_channel, kernel_size=1, stride=1)
self.dropout = nn.Dropout(config.dropout_rate)
@@ -174,7 +174,7 @@ class Phi4MultimodalAudioConfig(PretrainedConfig):
The dropout ratio.
ext_pw_out_channel (`int`, *optional*, defaults to 1024):
Number of out channels in the point-wise conv modules.
depthwise_seperable_out_channel (`int`, *optional*, defaults to 1024):
depthwise_separable_out_channel (`int`, *optional*, defaults to 1024):
Number of out channels in the depth-wise separable conv modules.
depthwise_multiplier (`int`, *optional*, defaults to 1):
Input size multiplier for the depth-wise separable conv modules.
@@ -227,7 +227,7 @@ def __init__(
left_chunk: int = 18,
dropout_rate: float = 0.0,
ext_pw_out_channel: int = 1024,
depthwise_seperable_out_channel: int = 1024,
depthwise_separable_out_channel: int = 1024,
depthwise_multiplier: int = 1,
kernel_size: int = 3,
conv_activation: str = "swish",
@@ -254,7 +254,7 @@ def __init__(
self.num_blocks = num_blocks
self.dropout_rate = dropout_rate
self.ext_pw_out_channel = ext_pw_out_channel
self.depthwise_seperable_out_channel = depthwise_seperable_out_channel
self.depthwise_separable_out_channel = depthwise_separable_out_channel
self.depthwise_multiplier = depthwise_multiplier
self.kernel_size = kernel_size
self.conv_activation = conv_activation
@@ -930,7 +930,7 @@ def forward(
return attn_output


class Phi4MultimodalAudioDepthWiseSeperableConv1d(nn.Module):
class Phi4MultimodalAudioDepthWiseSeparableConv1d(nn.Module):
def __init__(self, config: Phi4MultimodalAudioConfig, padding: int = 0):
super().__init__()
self.dw_conv = nn.Conv1d(
@@ -942,7 +942,7 @@ def __init__(self, config: Phi4MultimodalAudioConfig, padding: int = 0):
groups=config.hidden_size,
)
self.pw_conv = nn.Conv1d(
config.hidden_size * config.depthwise_multiplier, config.depthwise_seperable_out_channel, 1, 1, 0
config.hidden_size * config.depthwise_multiplier, config.depthwise_separable_out_channel, 1, 1, 0
)

def forward(self, hidden_states):
@@ -978,7 +978,7 @@ def __init__(self, config: Phi4MultimodalAudioConfig):

self.layer_norm = nn.LayerNorm(config.hidden_size)
self.glu = Phi4MultimodalAudioGluPointWiseConv(config)
self.dw_sep_conv_1d = Phi4MultimodalAudioDepthWiseSeperableConv1d(config, padding=config.kernel_size - 1)
self.dw_sep_conv_1d = Phi4MultimodalAudioDepthWiseSeparableConv1d(config, padding=config.kernel_size - 1)
self.act = ACT2FN[config.conv_activation]
self.ext_pw_conv_1d = nn.Conv1d(config.hidden_size, config.ext_pw_out_channel, kernel_size=1, stride=1)
self.dropout = nn.Dropout(config.dropout_rate)
4 changes: 2 additions & 2 deletions src/transformers/models/rwkv/convert_rwkv_checkpoint_to_hf.py
@@ -36,7 +36,7 @@
"14B": 40,
}

HIDEN_SIZE_MAPPING = {
HIDDEN_SIZE_MAPPING = {
"169M": 768,
"430M": 1024,
"1B5": 2048,
@@ -106,7 +106,7 @@ def convert_rmkv_checkpoint_to_hf_format(
config = RwkvConfig(
vocab_size=vocab_size,
num_hidden_layers=NUM_HIDDEN_LAYERS_MAPPING[size],
hidden_size=HIDEN_SIZE_MAPPING[size],
hidden_size=HIDDEN_SIZE_MAPPING[size],
)
config.save_pretrained(output_dir)

6 changes: 3 additions & 3 deletions src/transformers/models/seamless_m4t/modeling_seamless_m4t.py
@@ -2187,7 +2187,7 @@ def __init__(self, config):
kernel_size=kernel_size,
padding=(kernel_size - 1) // 2,
)
self.activation_fuction = nn.ReLU()
self.activation_function = nn.ReLU()
self.ln1 = nn.LayerNorm(embed_dim)
self.dropout_module = nn.Dropout(p=var_pred_dropout)
self.conv2 = nn.Conv1d(
@@ -2202,10 +2202,10 @@ def forward(self, hidden_states: Tensor) -> Tensor:
def forward(self, hidden_states: Tensor) -> Tensor:
# Input: B x T x C; Output: B x T
hidden_states = self.conv1(hidden_states.transpose(1, 2))
hidden_states = self.activation_fuction(hidden_states).transpose(1, 2)
hidden_states = self.activation_function(hidden_states).transpose(1, 2)
hidden_states = self.dropout_module(self.ln1(hidden_states))
hidden_states = self.conv2(hidden_states.transpose(1, 2))
hidden_states = self.activation_fuction(hidden_states).transpose(1, 2)
hidden_states = self.activation_function(hidden_states).transpose(1, 2)
hidden_states = self.dropout_module(self.ln2(hidden_states))
return self.proj(hidden_states).squeeze(dim=2)
