@@ -76,8 +76,18 @@ def fused_feedforward(x,
         ln1_epsilon (float, optional): Small float of first layer_norm added to denominator to avoid dividing by zero. Default is 1e-5.
         ln2_epsilon (float, optional): Small float of second layer_norm added to denominator to avoid dividing by zero. Default is 1e-5.
         pre_layer_norm (bool, optional): add layer_norm in the pre-processing stage or post-processing state.
-        training (bool): A flag indicating whether it is in train phrase or not. Default True.
-        mode(str): ['upscale_in_train'(default) | 'downscale_in_infer'].
+        training (bool, optional): A flag indicating whether it is in the train phase or not. Default True.
+        mode (str, optional): ['upscale_in_train'(default) | 'downscale_in_infer']
+
+            1. upscale_in_train(default), upscale the output at training time
+
+               - train: out = input * mask / ( 1.0 - p )
+               - inference: out = input
+
+            2. downscale_in_infer, downscale the output at inference
+
+               - train: out = input * mask
+               - inference: out = input * (1.0 - p)
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.

     Returns:
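The two dropout modes added to the docstring above can be sketched in a few lines of NumPy. This is a minimal illustration of the scaling formulas, not the Paddle implementation; `p` and `mask` stand for the drop probability and the sampled keep mask:

```python
import numpy as np

rng = np.random.default_rng(0)
p = 0.5                                        # drop probability
x = rng.standard_normal(8)
mask = (rng.random(8) >= p).astype(x.dtype)    # 1.0 = keep, 0.0 = drop

# upscale_in_train: scale up during training so inference is a no-op
train_up = x * mask / (1.0 - p)
infer_up = x

# downscale_in_infer: plain masking during training, scale down at inference
train_down = x * mask
infer_down = x * (1.0 - p)
```

Both modes keep the expected value of the output equal to the input; they only differ in whether the `(1.0 - p)` correction is paid at training or at inference time.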
@@ -245,7 +255,10 @@ def fused_multi_head_attention(x,
         out = out * v
         out = transpose(out, perm=[0, 2, 1, 3])
         out = out_linear(out)
-        out = layer_norm(x + dropout(linear_bias + out))
+        if pre_layer_norm:
+            out = x + dropout(linear_bias + out)
+        else:
+            out = layer_norm(x + dropout(linear_bias + out))

     Parameters:
         x (Tensor): The input tensor of fused_multi_head_attention. The shape is
@@ -279,7 +292,7 @@ def fused_multi_head_attention(x,
         ln_epsilon (float, optional): Small float value added to denominator of layer_norm
             to avoid dividing by zero. Default is 1e-5.
         training (bool, optional): A flag indicating whether it is in train phrase or not. Default True.
-        mode(str, optional): ['upscale_in_train'(default) | 'downscale_in_infer']
+        mode (str, optional): ['upscale_in_train'(default) | 'downscale_in_infer']

             1. upscale_in_train(default), upscale the output at training time

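The corrected pseudocode above makes the final residual step branch on `pre_layer_norm`: the post-norm path applies `layer_norm` after the residual add, while the pre-norm path skips it (the norm already ran in pre-processing). A small NumPy sketch of just that final step, with a simplified `layer_norm` (no scale/shift) and dropout omitted (`p = 0`) so the example stays deterministic:

```python
import numpy as np

def layer_norm(t, eps=1e-5):
    # simplified layer_norm over the last axis, without learned scale/shift
    mu = t.mean(-1, keepdims=True)
    var = t.var(-1, keepdims=True)
    return (t - mu) / np.sqrt(var + eps)

def residual_out(x, out, linear_bias, pre_layer_norm):
    # dropout(...) with p = 0 is the identity, so it is elided here
    h = x + (linear_bias + out)
    return h if pre_layer_norm else layer_norm(h)
```

With `pre_layer_norm=True` the result is the raw residual sum; with `pre_layer_norm=False` each row is additionally normalized to zero mean.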