@@ -54,9 +54,6 @@ class GemmaConfig(PretrainedConfig):
             The attention head dimension.
         hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
             The legacy activation function. It is overwritten by the `hidden_activation`.
-        hidden_activation (`str` or `function`, *optional*):
-            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
-            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
         max_position_embeddings (`int`, *optional*, defaults to 8192):
             The maximum sequence length that this model might ever be used with.
         initializer_range (`float`, *optional*, defaults to 0.02):
@@ -117,7 +114,6 @@ def __init__(
         num_key_value_heads=16,
         head_dim=256,
         hidden_act="gelu_pytorch_tanh",
-        hidden_activation=None,
         max_position_embeddings=8192,
         initializer_range=0.02,
         rms_norm_eps=1e-6,
@@ -140,7 +136,6 @@ def __init__(
         self.head_dim = head_dim
         self.num_key_value_heads = num_key_value_heads
         self.hidden_act = hidden_act
-        self.hidden_activation = hidden_activation
         self.initializer_range = initializer_range
         self.rms_norm_eps = rms_norm_eps
         self.use_cache = use_cache
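
For context, here is a minimal sketch (not part of the diff) of constructing the config after this change, assuming the standard `transformers` `GemmaConfig` constructor. With `hidden_activation` removed, the activation function is selected through `hidden_act` alone.

```python
# Minimal sketch, assuming the standard transformers GemmaConfig API with this change applied.
# hidden_activation is no longer a constructor argument; the activation function is
# chosen via hidden_act (defaulting to "gelu_pytorch_tanh").
from transformers import GemmaConfig

config = GemmaConfig(
    num_key_value_heads=16,
    head_dim=256,
    hidden_act="gelu_pytorch_tanh",
    max_position_embeddings=8192,
    initializer_range=0.02,
    rms_norm_eps=1e-6,
)
print(config.hidden_act)                      # "gelu_pytorch_tanh"
print(hasattr(config, "hidden_activation"))   # expected to be False once this change is applied
```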