@@ -56,7 +56,7 @@ def img_conv_group(input,
                    conv_act=None,
                    param_attr=None,
                    conv_with_batchnorm=False,
-                   conv_batchnorm_drop_rate=None,
+                   conv_batchnorm_drop_rate=0.0,
                    pool_stride=1,
                    pool_type=None,
                    use_cudnn=True):
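For reference, a minimal usage sketch of the changed default, assuming the fluid API of this era (the input tensor and filter settings below are illustrative, not taken from the patch):

import paddle.fluid as fluid

# Hypothetical 32x32 RGB input. With conv_with_batchnorm=True, the new
# conv_batchnorm_drop_rate default of 0.0 means "no dropout after batch
# norm" unless a rate is passed explicitly, giving the argument a numeric
# default instead of None.
img = fluid.layers.data(name='img', shape=[3, 32, 32], dtype='float32')
out = fluid.nets.img_conv_group(
    input=img,
    conv_num_filter=[64, 64],
    pool_size=2,
    conv_act='relu',
    conv_with_batchnorm=True)  # conv_batchnorm_drop_rate left at its 0.0 default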
@@ -127,21 +127,21 @@ def sequence_conv_pool(input,
 
 def glu(input, dim=-1):
     """
-    The gated linear unit composed by split, sigmoid activation and elementwise
-    multiplication. Specifically, Split the input into two equal sized parts
-    :math:`a` and :math:`b` along the given dimension and then compute as
+    The gated linear unit, composed of split, sigmoid activation and elementwise
+    multiplication. Specifically, split the input into two equal-sized parts
+    :math:`a` and :math:`b` along the given dimension and then compute as
     follows:
 
     .. math::
 
         {GLU}(a, b) = a \otimes \sigma(b)
 
-    Refer to `Language Modeling with Gated Convolutional Networks
+    Refer to `Language Modeling with Gated Convolutional Networks
     <https://arxiv.org/pdf/1612.08083.pdf>`_.
-
+
     Args:
         input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (int): The dimension along which to split. If :math:`dim < 0`, the
+        dim (int): The dimension along which to split. If :math:`dim < 0`, the
             dimension to split along is :math:`rank(input) + dim`.
 
     Returns:
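To make the formula concrete, a reference sketch of the GLU computation in NumPy (illustrative only; glu_reference is a hypothetical helper, not the fluid implementation):

import numpy as np

def glu_reference(x, dim=-1):
    # Split the input into two equal-sized halves a and b along `dim`,
    # then gate: a * sigmoid(b).
    a, b = np.split(x, 2, axis=dim)
    return a * (1.0 / (1.0 + np.exp(-b)))

x = np.random.rand(4, 6).astype('float32')
out = glu_reference(x)  # the split dimension is halved: shape (4, 3)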
@@ -164,24 +164,24 @@ def dot_product_attention(querys, keys, values):
164164 """
165165 The dot-product attention.
166166
167- Attention mechanism can be seen as mapping a query and a set of key-value
168- pairs to an output. The output is computed as a weighted sum of the values,
169- where the weight assigned to each value is computed by a compatibility
167+ Attention mechanism can be seen as mapping a query and a set of key-value
168+ pairs to an output. The output is computed as a weighted sum of the values,
169+ where the weight assigned to each value is computed by a compatibility
170170 function (dot-product here) of the query with the corresponding key.
171-
172- The dot-product attention can be implemented through (batch) matrix
171+
172+ The dot-product attention can be implemented through (batch) matrix
173173 multipication as follows:
174174
175175 .. math::
176176
177177 Attention(Q, K, V)= softmax(QK^\mathrm{T})V
178178
179- Refer to `Attention Is All You Need
179+ Refer to `Attention Is All You Need
180180 <https://arxiv.org/pdf/1706.03762.pdf>`_.
181181
182- Note that batch data containing sequences with different lengths is not
182+ Note that batch data containing sequences with different lengths is not
183183 supported by this because of the (batch) matrix multipication.
184-
184+
185185 Args:
186186 query (Variable): The input variable which is a Tensor or LoDTensor.
187187 key (Variable): The input variable which is a Tensor or LoDTensor.
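Likewise, a NumPy sketch of the batched matrix-multiplication form of the attention formula above (illustrative only; the helper names and shapes are assumptions, and the fluid function operates on Variables rather than arrays):

import numpy as np

def softmax(x, axis=-1):
    # Numerically stable softmax over the given axis.
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def dot_product_attention_reference(q, k, v):
    # q: [batch, len_q, d], k: [batch, len_k, d], v: [batch, len_k, d_v].
    # Every sequence in the batch must share the same length, which is why
    # variable-length batch data is unsupported.
    scores = q @ k.transpose(0, 2, 1)  # [batch, len_q, len_k]
    return softmax(scores) @ v         # weighted sum of the values

q = np.random.rand(2, 5, 8).astype('float32')
k = np.random.rand(2, 7, 8).astype('float32')
v = np.random.rand(2, 7, 8).astype('float32')
ctx = dot_product_attention_reference(q, k, v)  # shape [2, 5, 8]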