diff --git a/src/transformers/activations.py b/src/transformers/activations.py
index e1f238ab4fb0..12f8408d1146 100644
--- a/src/transformers/activations.py
+++ b/src/transformers/activations.py
@@ -2,6 +2,7 @@
 
 import torch
 import torch.nn.functional as F
+from packaging import version
 
 from .utils import logging
 
@@ -9,29 +10,25 @@
 logger = logging.get_logger(__name__)
 
 
-def swish(x):
-    return x * torch.sigmoid(x)
-
-
 def _gelu_python(x):
     """
-    Original Implementation of the gelu activation function in Google Bert repo when initially created. For
-    information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 +
+    Original Implementation of the GELU activation function in Google BERT repo when initially created. For
+    information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
     torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in
-    torch.nn.functional Also see https://arxiv.org/abs/1606.08415
+    torch.nn.functional Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
     """
     return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
 
 
 def gelu_new(x):
     """
-    Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT). Also see
-    https://arxiv.org/abs/1606.08415
+    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
+    the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
     """
     return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
 
 
-if torch.__version__ < "1.4.0":
+if version.parse(torch.__version__) < version.parse("1.4"):
     gelu = _gelu_python
 else:
     gelu = F.gelu
@@ -41,6 +38,23 @@ def gelu_fast(x):
     return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x)))
 
 
+def _silu_python(x):
+    """
+    See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear
+    Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function
+    Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated
+    Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with
+    later.
+    """
+    return x * torch.sigmoid(x)
+
+
+if version.parse(torch.__version__) < version.parse("1.7"):
+    silu = _silu_python
+else:
+    silu = F.silu
+
+
 def mish(x):
     return x * torch.tanh(torch.nn.functional.softplus(x))
 
@@ -51,7 +65,8 @@ def linear_act(x):
 
 ACT2FN = {
     "relu": F.relu,
-    "swish": swish,
+    "silu": silu,
+    "swish": silu,
     "gelu": gelu,
     "tanh": torch.tanh,
     "gelu_new": gelu_new,
diff --git a/src/transformers/activations_tf.py b/src/transformers/activations_tf.py
index c6e71b9d4d82..1e330f4ccb5e 100644
--- a/src/transformers/activations_tf.py
+++ b/src/transformers/activations_tf.py
@@ -52,6 +52,7 @@ def gelu_fast(x):
     "gelu": tf.keras.layers.Activation(gelu),
     "relu": tf.keras.activations.relu,
     "swish": tf.keras.activations.swish,
+    "silu": tf.keras.activations.swish,
     "gelu_new": tf.keras.layers.Activation(gelu_new),
     "mish": tf.keras.layers.Activation(mish),
     "tanh": tf.keras.activations.tanh,
diff --git a/src/transformers/configuration_albert.py b/src/transformers/configuration_albert.py
index 958876558b84..78bde7157039 100644
--- a/src/transformers/configuration_albert.py
+++ b/src/transformers/configuration_albert.py
@@ -61,7 +61,7 @@ class AlbertConfig(PretrainedConfig):
             The number of inner repetition of attention and ffn.
         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu_new"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0):
diff --git a/src/transformers/configuration_bart.py b/src/transformers/configuration_bart.py
index a5f79f33d151..1bc06624a07f 100644
--- a/src/transformers/configuration_bart.py
+++ b/src/transformers/configuration_bart.py
@@ -59,7 +59,7 @@ class BartConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
         activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
diff --git a/src/transformers/configuration_bert.py b/src/transformers/configuration_bert.py
index 8c9ec766d1e4..5cb86168d04d 100644
--- a/src/transformers/configuration_bert.py
+++ b/src/transformers/configuration_bert.py
@@ -74,7 +74,7 @@ class BertConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
diff --git a/src/transformers/configuration_bert_generation.py b/src/transformers/configuration_bert_generation.py
index 0342d4909cfa..3b9dc4873f40 100644
--- a/src/transformers/configuration_bert_generation.py
+++ b/src/transformers/configuration_bert_generation.py
@@ -40,7 +40,7 @@ class BertGenerationConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often called feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
diff --git a/src/transformers/configuration_blenderbot.py b/src/transformers/configuration_blenderbot.py
index ef9b97db5a28..449089a862d2 100644
--- a/src/transformers/configuration_blenderbot.py
+++ b/src/transformers/configuration_blenderbot.py
@@ -56,7 +56,7 @@ class BlenderbotConfig(BartConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
         activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
diff --git a/src/transformers/configuration_deberta.py b/src/transformers/configuration_deberta.py
index e305784e84ed..ffc236df41fd 100644
--- a/src/transformers/configuration_deberta.py
+++ b/src/transformers/configuration_deberta.py
@@ -52,7 +52,7 @@ class DebertaConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"`, :obj:`"gelu"`, :obj:`"tanh"`, :obj:`"gelu_fast"`,
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"`, :obj:`"gelu"`, :obj:`"tanh"`, :obj:`"gelu_fast"`,
             :obj:`"mish"`, :obj:`"linear"`, :obj:`"sigmoid"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
diff --git a/src/transformers/configuration_distilbert.py b/src/transformers/configuration_distilbert.py
index 42a6eae22e84..fce9563af215 100644
--- a/src/transformers/configuration_distilbert.py
+++ b/src/transformers/configuration_distilbert.py
@@ -66,7 +66,7 @@ class DistilBertConfig(PretrainedConfig):
             The dropout ratio for the attention probabilities.
         activation (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         initializer_range (:obj:`float`, `optional`, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         qa_dropout (:obj:`float`, `optional`, defaults to 0.1):
diff --git a/src/transformers/configuration_dpr.py b/src/transformers/configuration_dpr.py
index b079e8a7d6e3..506a2c4f5b87 100644
--- a/src/transformers/configuration_dpr.py
+++ b/src/transformers/configuration_dpr.py
@@ -55,7 +55,7 @@ class DPRConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
diff --git a/src/transformers/configuration_electra.py b/src/transformers/configuration_electra.py
index 91253f0aefce..066c1d501e3b 100644
--- a/src/transformers/configuration_electra.py
+++ b/src/transformers/configuration_electra.py
@@ -60,7 +60,7 @@ class ElectraConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
diff --git a/src/transformers/configuration_fsmt.py b/src/transformers/configuration_fsmt.py
index 4008d93fb1fa..16a68b514d53 100644
--- a/src/transformers/configuration_fsmt.py
+++ b/src/transformers/configuration_fsmt.py
@@ -71,7 +71,7 @@ class FSMTConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
         activation_function (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"relu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
diff --git a/src/transformers/configuration_funnel.py b/src/transformers/configuration_funnel.py
index c1b6a284afd8..7883aec9234f 100644
--- a/src/transformers/configuration_funnel.py
+++ b/src/transformers/configuration_funnel.py
@@ -66,7 +66,7 @@ class FunnelConfig(PretrainedConfig):
             Inner dimension in the feed-forward blocks.
         hidden_act (:obj:`str` or :obj:`callable`, `optional`, defaults to :obj:`"gelu_new"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
diff --git a/src/transformers/configuration_gpt2.py b/src/transformers/configuration_gpt2.py
index af8fc331a6a5..7a054e4bbe2f 100644
--- a/src/transformers/configuration_gpt2.py
+++ b/src/transformers/configuration_gpt2.py
@@ -60,7 +60,7 @@ class GPT2Config(PretrainedConfig):
         n_inner (:obj:`int`, `optional`, defaults to None):
             Dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd
         activation_function (:obj:`str`, `optional`, defaults to :obj:`"gelu"`):
-            Activation function, to be selected in the list :obj:`["relu", "swish", "gelu", "tanh", "gelu_new"]`.
+            Activation function, to be selected in the list :obj:`["relu", "silu", "gelu", "tanh", "gelu_new"]`.
         resid_pdrop (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         embd_pdrop (:obj:`int`, `optional`, defaults to 0.1):
diff --git a/src/transformers/configuration_layoutlm.py b/src/transformers/configuration_layoutlm.py
index 75e5fe717c09..f16e17e4ea25 100644
--- a/src/transformers/configuration_layoutlm.py
+++ b/src/transformers/configuration_layoutlm.py
@@ -52,7 +52,7 @@ class LayoutLMConfig(BertConfig):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
diff --git a/src/transformers/configuration_lxmert.py b/src/transformers/configuration_lxmert.py
index 0c06d14ebd64..e18d4ed03147 100644
--- a/src/transformers/configuration_lxmert.py
+++ b/src/transformers/configuration_lxmert.py
@@ -55,7 +55,7 @@ class LxmertConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
diff --git a/src/transformers/configuration_marian.py b/src/transformers/configuration_marian.py
index 8e4e257ce98e..042062a3146d 100644
--- a/src/transformers/configuration_marian.py
+++ b/src/transformers/configuration_marian.py
@@ -50,7 +50,7 @@ class MarianConfig(BartConfig):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder.
         activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
diff --git a/src/transformers/configuration_mbart.py b/src/transformers/configuration_mbart.py
index 8406236889fd..b03b6f977755 100644
--- a/src/transformers/configuration_mbart.py
+++ b/src/transformers/configuration_mbart.py
@@ -55,7 +55,7 @@ class MBartConfig(BartConfig):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder.
         activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
diff --git a/src/transformers/configuration_mobilebert.py b/src/transformers/configuration_mobilebert.py
index a67ff5a79d29..37493d1fb69b 100644
--- a/src/transformers/configuration_mobilebert.py
+++ b/src/transformers/configuration_mobilebert.py
@@ -48,7 +48,7 @@ class MobileBertConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"relu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.0):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
diff --git a/src/transformers/configuration_openai.py b/src/transformers/configuration_openai.py
index fb8f68411a49..2301c36922d7 100644
--- a/src/transformers/configuration_openai.py
+++ b/src/transformers/configuration_openai.py
@@ -54,7 +54,7 @@ class OpenAIGPTConfig(PretrainedConfig):
             Number of attention heads for each attention layer in the Transformer encoder.
         afn (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         resid_pdrop (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         embd_pdrop (:obj:`int`, `optional`, defaults to 0.1):
diff --git a/src/transformers/configuration_pegasus.py b/src/transformers/configuration_pegasus.py
index 8bda4dc114d7..ed56f0b22c26 100644
--- a/src/transformers/configuration_pegasus.py
+++ b/src/transformers/configuration_pegasus.py
@@ -94,7 +94,7 @@ class PegasusConfig(BartConfig):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder.
         activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
diff --git a/src/transformers/configuration_prophetnet.py b/src/transformers/configuration_prophetnet.py
index 3dc2b011ea10..451c93954c29 100644
--- a/src/transformers/configuration_prophetnet.py
+++ b/src/transformers/configuration_prophetnet.py
@@ -39,7 +39,7 @@ class ProphetNetConfig(PretrainedConfig):
             The dropout ratio for activations inside the fully connected layer.
         activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         vocab_size (:obj:`int`, `optional`, defaults to 30522):
             Vocabulary size of the ProphetNET model. Defines the number of different tokens that can be represented by
             the :obj:`inputs_ids` passed when calling :class:`~transformers.ProphetNetModel`.
diff --git a/src/transformers/configuration_reformer.py b/src/transformers/configuration_reformer.py
index 0ef4b598b71c..55367d118880 100755
--- a/src/transformers/configuration_reformer.py
+++ b/src/transformers/configuration_reformer.py
@@ -80,7 +80,7 @@ class ReformerConfig(PretrainedConfig):
             :obj:`None` to ensure fully random rotations in local sensitive hashing scheme.
         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"relu"`):
             The non-linear activation function (function or string) in the feed forward layer in the residual attention
-            block. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            block. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.05):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         hidden_size (:obj:`int`, `optional`, defaults to 256):
diff --git a/src/transformers/configuration_retribert.py b/src/transformers/configuration_retribert.py
index 36e04faa7137..a68801cbcc48 100644
--- a/src/transformers/configuration_retribert.py
+++ b/src/transformers/configuration_retribert.py
@@ -49,7 +49,7 @@ class RetriBertConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
diff --git a/src/transformers/configuration_squeezebert.py b/src/transformers/configuration_squeezebert.py
index 666c79ab2ff3..80f456cb5b2d 100644
--- a/src/transformers/configuration_squeezebert.py
+++ b/src/transformers/configuration_squeezebert.py
@@ -50,7 +50,7 @@ class SqueezeBertConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
diff --git a/src/transformers/configuration_xlnet.py b/src/transformers/configuration_xlnet.py
index 365162de7103..05eda6010cc3 100644
--- a/src/transformers/configuration_xlnet.py
+++ b/src/transformers/configuration_xlnet.py
@@ -54,7 +54,7 @@ class XLNetConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
         ff_activation (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the If string, :obj:`"gelu"`, :obj:`"relu"`,
-            :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         untie_r (:obj:`bool`, `optional`, defaults to :obj:`True`):
             Whether or not to untie relative position biases
         attn_type (:obj:`str`, `optional`, defaults to :obj:`"bi"`):
diff --git a/src/transformers/modeling_flax_utils.py b/src/transformers/modeling_flax_utils.py
index 32cb9f450ec5..6c9eb14ce40e 100644
--- a/src/transformers/modeling_flax_utils.py
+++ b/src/transformers/modeling_flax_utils.py
@@ -53,6 +53,7 @@ def gelu(x):
 ACT2FN = {
     "gelu": nn.gelu,
     "relu": nn.relu,
+    "silu": nn.swish,
     "swish": nn.swish,
     "gelu_new": gelu,
 }
diff --git a/src/transformers/modeling_openai.py b/src/transformers/modeling_openai.py
index b69ef557b9e5..5bfb2b682ad2 100644
--- a/src/transformers/modeling_openai.py
+++ b/src/transformers/modeling_openai.py
@@ -27,7 +27,7 @@
 import torch.nn as nn
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from .activations import gelu_new, swish
+from .activations import gelu_new, silu
 from .configuration_openai import OpenAIGPTConfig
 from .file_utils import (
     ModelOutput,
@@ -139,7 +139,7 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
     return model
 
 
-ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu_new}
+ACT_FNS = {"relu": nn.ReLU, "silu": silu, "gelu": gelu_new, "swish": silu}
 
 
 class Attention(nn.Module):
diff --git a/tests/test_activations.py b/tests/test_activations.py
index a5a9a2347755..cc92ea3cda89 100644
--- a/tests/test_activations.py
+++ b/tests/test_activations.py
@@ -20,6 +20,7 @@ def test_gelu_versions(self):
 
     def test_get_activation(self):
         get_activation("swish")
+        get_activation("silu")
         get_activation("relu")
         get_activation("tanh")
         get_activation("gelu_new")
diff --git a/tests/test_activations_tf.py b/tests/test_activations_tf.py
index bdaecff40794..406105c09b05 100644
--- a/tests/test_activations_tf.py
+++ b/tests/test_activations_tf.py
@@ -12,6 +12,7 @@ class TestTFActivations(unittest.TestCase):
 
     def test_get_activation(self):
         get_tf_activation("swish")
+        get_tf_activation("silu")
         get_tf_activation("gelu")
         get_tf_activation("relu")
         get_tf_activation("tanh")
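
The snippet below is not part of the patch; it is a minimal usage sketch of the behaviour the diff introduces, assuming a transformers build that contains it (with torch and packaging installed). get_activation is the lookup helper exercised in tests/test_activations.py above; after this change the new "silu" key and the legacy "swish" alias resolve to the same callable, which is the pure-Python _silu_python on torch < 1.7 and torch.nn.functional.silu otherwise.

    import torch

    from transformers.activations import get_activation

    silu = get_activation("silu")    # new canonical name
    swish = get_activation("swish")  # legacy alias kept for backward compatibility

    x = torch.tensor([-1.0, 0.0, 1.0])
    assert torch.equal(silu(x), swish(x))                 # both names map to the same function
    assert torch.allclose(silu(x), x * torch.sigmoid(x))  # SiLU(x) = x * sigmoid(x)

The TF mapping mirrors this: get_tf_activation("silu") returns tf.keras.activations.swish, so configs that still say "swish" and configs updated to "silu" load identically.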