37 changes: 26 additions & 11 deletions src/transformers/activations.py
@@ -2,36 +2,33 @@

 import torch
 import torch.nn.functional as F
+from packaging import version

 from .utils import logging


 logger = logging.get_logger(__name__)


-def swish(x):
-    return x * torch.sigmoid(x)
-
-
 def _gelu_python(x):
     """
-    Original Implementation of the gelu activation function in Google Bert repo when initially created. For
-    information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 +
+    Original Implementation of the GELU activation function in Google BERT repo when initially created. For
+    information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
     torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in
-    torch.nn.functional Also see https://arxiv.org/abs/1606.08415
+    torch.nn.functional Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
     """
     return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


 def gelu_new(x):
     """
-    Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT). Also see
-    https://arxiv.org/abs/1606.08415
+    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
+    the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
     """
     return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))


-if torch.__version__ < "1.4.0":
+if version.parse(torch.__version__) < version.parse("1.4"):
     gelu = _gelu_python
 else:
     gelu = F.gelu
@@ -41,6 +38,23 @@ def gelu_fast(x):
     return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x)))


+def _silu_python(x):
+    """
+    See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear
+    Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function
+    Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated
+    Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with
+    later.
+    """
+    return x * torch.sigmoid(x)
+
+
+if version.parse(torch.__version__) < version.parse("1.7"):
+    silu = _silu_python
+else:
+    silu = F.silu
+
+
 def mish(x):
     return x * torch.tanh(torch.nn.functional.softplus(x))

@@ -51,7 +65,8 @@ def linear_act(x):

 ACT2FN = {
     "relu": F.relu,
-    "swish": swish,
+    "silu": silu,
+    "swish": silu,
     "gelu": gelu,
     "tanh": torch.tanh,
     "gelu_new": gelu_new,
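For reference, a minimal usage sketch (not part of this diff, assuming the merged activations.py above): both the new "silu" key and the legacy "swish" key in ACT2FN resolve to the same callable, which is torch.nn.functional.silu on PyTorch 1.7+ and the pure-Python x * sigmoid(x) fallback otherwise.

# Illustrative sketch only, based on the merged file above.
import torch
from transformers.activations import ACT2FN

silu = ACT2FN["silu"]
assert ACT2FN["swish"] is silu  # the legacy key is an alias, not a separate function

x = torch.randn(4)
out = silu(x)
# Matches the pure-Python definition regardless of which backend was picked.
assert torch.allclose(out, x * torch.sigmoid(x))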
1 change: 1 addition & 0 deletions src/transformers/activations_tf.py
@@ -52,6 +52,7 @@ def gelu_fast(x):
     "gelu": tf.keras.layers.Activation(gelu),
     "relu": tf.keras.activations.relu,
     "swish": tf.keras.activations.swish,
+    "silu": tf.keras.activations.swish,
     "gelu_new": tf.keras.layers.Activation(gelu_new),
     "mish": tf.keras.layers.Activation(mish),
     "tanh": tf.keras.activations.tanh,
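On the TensorFlow side there is no separate SiLU implementation here; the new "silu" entry simply reuses Keras' built-in swish. A small sketch, assuming TensorFlow 2.2+ where tf.keras.activations.swish is available:

# Illustrative sketch only: the "silu" and "swish" keys point at the same Keras activation.
import tensorflow as tf

x = tf.constant([-2.0, 0.0, 2.0])
out = tf.keras.activations.swish(x)   # what both keys map to
expected = x * tf.sigmoid(x)          # SiLU definition
print(out.numpy(), expected.numpy())  # identical values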
2 changes: 1 addition & 1 deletion src/transformers/configuration_albert.py
@@ -61,7 +61,7 @@ class AlbertConfig(PretrainedConfig):
             The number of inner repetition of attention and ffn.
         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu_new"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0):
2 changes: 1 addition & 1 deletion src/transformers/configuration_bart.py
@@ -59,7 +59,7 @@ class BartConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
         activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
2 changes: 1 addition & 1 deletion src/transformers/configuration_bert.py
@@ -74,7 +74,7 @@ class BertConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
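To show how these docstring updates surface to users, a hedged sketch of selecting the activation by string through a config (the tiny dimensions are arbitrary, chosen only to keep the example small); the string is looked up in ACT2FN when the model is built:

# Illustrative sketch only: "silu" can now be passed wherever "swish" was accepted.
from transformers import BertConfig, BertModel

config = BertConfig(
    hidden_size=64,
    num_hidden_layers=2,
    num_attention_heads=4,
    intermediate_size=128,
    hidden_act="silu",  # previously spelled "swish"; both strings still work
)
model = BertModel(config)
print(model.config.hidden_act)  # -> "silu"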
2 changes: 1 addition & 1 deletion src/transformers/configuration_bert_generation.py
@@ -40,7 +40,7 @@ class BertGenerationConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often called feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
2 changes: 1 addition & 1 deletion src/transformers/configuration_blenderbot.py
@@ -56,7 +56,7 @@ class BlenderbotConfig(BartConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
         activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
2 changes: 1 addition & 1 deletion src/transformers/configuration_deberta.py
@@ -52,7 +52,7 @@ class DebertaConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"`, :obj:`"gelu"`, :obj:`"tanh"`, :obj:`"gelu_fast"`,
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"`, :obj:`"gelu"`, :obj:`"tanh"`, :obj:`"gelu_fast"`,
             :obj:`"mish"`, :obj:`"linear"`, :obj:`"sigmoid"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
2 changes: 1 addition & 1 deletion src/transformers/configuration_distilbert.py
@@ -66,7 +66,7 @@ class DistilBertConfig(PretrainedConfig):
             The dropout ratio for the attention probabilities.
         activation (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         initializer_range (:obj:`float`, `optional`, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         qa_dropout (:obj:`float`, `optional`, defaults to 0.1):
2 changes: 1 addition & 1 deletion src/transformers/configuration_dpr.py
@@ -55,7 +55,7 @@ class DPRConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
2 changes: 1 addition & 1 deletion src/transformers/configuration_electra.py
@@ -60,7 +60,7 @@ class ElectraConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
2 changes: 1 addition & 1 deletion src/transformers/configuration_fsmt.py
@@ -71,7 +71,7 @@ class FSMTConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
         activation_function (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"relu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
2 changes: 1 addition & 1 deletion src/transformers/configuration_funnel.py
@@ -66,7 +66,7 @@ class FunnelConfig(PretrainedConfig):
             Inner dimension in the feed-forward blocks.
         hidden_act (:obj:`str` or :obj:`callable`, `optional`, defaults to :obj:`"gelu_new"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
2 changes: 1 addition & 1 deletion src/transformers/configuration_gpt2.py
@@ -60,7 +60,7 @@ class GPT2Config(PretrainedConfig):
         n_inner (:obj:`int`, `optional`, defaults to None):
             Dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd
         activation_function (:obj:`str`, `optional`, defaults to :obj:`"gelu"`):
-            Activation function, to be selected in the list :obj:`["relu", "swish", "gelu", "tanh", "gelu_new"]`.
+            Activation function, to be selected in the list :obj:`["relu", "silu", "gelu", "tanh", "gelu_new"]`.
         resid_pdrop (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         embd_pdrop (:obj:`int`, `optional`, defaults to 0.1):
2 changes: 1 addition & 1 deletion src/transformers/configuration_layoutlm.py
@@ -52,7 +52,7 @@ class LayoutLMConfig(BertConfig):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
2 changes: 1 addition & 1 deletion src/transformers/configuration_lxmert.py
@@ -55,7 +55,7 @@ class LxmertConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
2 changes: 1 addition & 1 deletion src/transformers/configuration_marian.py
@@ -50,7 +50,7 @@ class MarianConfig(BartConfig):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder.
         activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
2 changes: 1 addition & 1 deletion src/transformers/configuration_mbart.py
@@ -55,7 +55,7 @@ class MBartConfig(BartConfig):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder.
         activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
2 changes: 1 addition & 1 deletion src/transformers/configuration_mobilebert.py
@@ -48,7 +48,7 @@ class MobileBertConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"relu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.0):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
2 changes: 1 addition & 1 deletion src/transformers/configuration_openai.py
@@ -54,7 +54,7 @@ class OpenAIGPTConfig(PretrainedConfig):
             Number of attention heads for each attention layer in the Transformer encoder.
         afn (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         resid_pdrop (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         embd_pdrop (:obj:`int`, `optional`, defaults to 0.1):
2 changes: 1 addition & 1 deletion src/transformers/configuration_pegasus.py
@@ -94,7 +94,7 @@ class PegasusConfig(BartConfig):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder.
         activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
2 changes: 1 addition & 1 deletion src/transformers/configuration_prophetnet.py
@@ -39,7 +39,7 @@ class ProphetNetConfig(PretrainedConfig):
             The dropout ratio for activations inside the fully connected layer.
         activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         vocab_size (:obj:`int`, `optional`, defaults to 30522):
             Vocabulary size of the ProphetNET model. Defines the number of different tokens that can be represented by
             the :obj:`inputs_ids` passed when calling :class:`~transformers.ProphetNetModel`.
2 changes: 1 addition & 1 deletion src/transformers/configuration_reformer.py
@@ -80,7 +80,7 @@ class ReformerConfig(PretrainedConfig):
             :obj:`None` to ensure fully random rotations in local sensitive hashing scheme.
         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"relu"`):
             The non-linear activation function (function or string) in the feed forward layer in the residual attention
-            block. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            block. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.05):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         hidden_size (:obj:`int`, `optional`, defaults to 256):