@@ -54,9 +54,6 @@ class GemmaConfig(PretrainedConfig):
             The attention head dimension.
         hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
             The legacy activation function. It is overwritten by the `hidden_activation`.
-        hidden_activation (`str` or `function`, *optional*):
-            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
-            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
         max_position_embeddings (`int`, *optional*, defaults to 8192):
             The maximum sequence length that this model might ever be used with.
         initializer_range (`float`, *optional*, defaults to 0.02):
@@ -117,7 +114,6 @@ def __init__(
         num_key_value_heads=16,
         head_dim=256,
         hidden_act="gelu_pytorch_tanh",
-        hidden_activation=None,
         max_position_embeddings=8192,
         initializer_range=0.02,
         rms_norm_eps=1e-6,
@@ -140,7 +136,6 @@ def __init__(
         self.head_dim = head_dim
         self.num_key_value_heads = num_key_value_heads
         self.hidden_act = hidden_act
-        self.hidden_activation = hidden_activation
         self.initializer_range = initializer_range
         self.rms_norm_eps = rms_norm_eps
         self.use_cache = use_cache
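
For context, here is a minimal sketch (not part of the diff) of constructing the config after this change, assuming the standard `transformers` `GemmaConfig` constructor. With `hidden_activation` removed, the activation function is selected through `hidden_act` alone.

```python
# Minimal sketch, assuming the standard transformers GemmaConfig API with this change applied.
# hidden_activation is no longer a constructor argument; the activation function is
# chosen via hidden_act (defaulting to "gelu_pytorch_tanh").
from transformers import GemmaConfig

config = GemmaConfig(
    num_key_value_heads=16,
    head_dim=256,
    hidden_act="gelu_pytorch_tanh",
    max_position_embeddings=8192,
    initializer_range=0.02,
    rms_norm_eps=1e-6,
)
print(config.hidden_act)                      # "gelu_pytorch_tanh"
print(hasattr(config, "hidden_activation"))   # expected to be False once this change is applied
```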