PaddlePaddle
diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias.h b/paddle/fluid/operators/fused/fused_dropout_act_bias.h
Lines changed: 38 additions & 77 deletions
diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu b/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu
Lines changed: 34 additions & 43 deletions
@@ -13,43 +13,27 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#ifndef _USE_MATH_DEFINES
+#define _USE_MATH_DEFINES
+#endif
 
 #include "paddle/fluid/operators/fused/fused_dropout_common.h"
-#include "paddle/fluid/operators/layer_norm_kernel.cu.h"
 #include "paddle/fluid/operators/math/functors.h"
 
 namespace paddle {
 namespace operators {
 
-typedef platform::float16 fp16;
-
-/**
- *@brief the relu functor
- */
-template <typename T>
-struct ReluFunctor {
-  __host__ __device__ T operator()(const T *args) const {
-    math::ReluFunctor<T> relu;
-    return relu(args[0]);
-  }
-};
-
-template <typename T>
-struct ReluGradFunctor {
-  __host__ __device__ __forceinline__ T operator()(const T *args) const {
-    math::ReluGradFunctor<T> relu_grad;
-    return args[0] * relu_grad.UseOut(args[1]);
-  }
-};
-
 /**
  *@brief the gelu functor
  *
  * Computes gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2))).
  * The input is cast to U = LayerNormParamType<T>, the arithmetic is done in
  * U, and the result is cast back to T.
  * NOTE(review): LayerNormParamType is declared outside this file; presumably
  * it maps float16 to float so the math runs in higher precision -- confirm.
  *
  * This change replaces the pointer-taking operator()(const T *args), which
  * forwarded to math::GeluFunctor<T>, with a by-value operator()(const T x).
  */
 template <typename T>
 struct GeluFunctor {
-  __host__ __device__ T operator()(const T *args) const {
-    math::GeluFunctor<T> gelu;
-    return gelu(args[0]);
+  inline __host__ __device__ T operator()(const T x) const {
+    using U = LayerNormParamType<T>;
+    const U casted_x = static_cast<U>(x);
+    const U temp = erf(casted_x * static_cast<U>(M_SQRT1_2));
+    const U out = (casted_x * static_cast<U>(0.5) * (static_cast<U>(1) + temp));
+    return static_cast<T>(out);
   }
 };
 
@@ -58,11 +42,17 @@ struct GeluFunctor {
  */
 template <typename T>
 struct GeluGradFunctor {
   // UseOut(x) returns d gelu(x) / dx:
   //   0.5 * (1 + erf(x / sqrt(2))) + x * exp(-x * x / 2) / sqrt(2 * pi)
   // where 0.5 * M_2_SQRTPI * M_SQRT1_2 == 1 / sqrt(2 * pi).
   // The math is done in U = LayerNormParamType<T> and cast back to T.
   // Despite the name, the callers in this change pass the activation input
   // (src, or src + bias) and multiply the result by the upstream gradient
   // themselves; the replaced operator()(const T *args) did that multiply
   // internally (args[0] = grad, args[1] = x).
-  __host__ __device__ T operator()(const T *args) const {
-    const T grad = args[0];
-    const T x = args[1];
-    math::GeluGradFunctor<T> gelu_grad;
-    return grad * gelu_grad.UseOut(x);
+  inline __host__ __device__ T UseOut(const T x) const {
+    using U = LayerNormParamType<T>;
+    auto casted_x = static_cast<U>(x);
+
+    auto first =
+        static_cast<U>(0.5) *
+        (static_cast<U>(1) + erf(casted_x * static_cast<U>(M_SQRT1_2)));
+
+    auto second = static_cast<U>(0.5 * M_2_SQRTPI * M_SQRT1_2) * casted_x *
+                  exp(-static_cast<U>(0.5) * casted_x * casted_x);
+    return static_cast<T>((first + second));
   }
 };
 
@@ -72,13 +62,12 @@ struct GeluGradFunctor {
  * the bias shape is (1, cols)
  */
 template <typename T, typename MaskType, int VecSize, typename Functor>
-__global__ void FusedDropoutActBias(Functor act, const uint64_t seed,
-                                    const uint64_t rows, const uint64_t cols,
-                                    const int increment,
-                                    const float dropout_prob,
-                                    const bool is_upscale_in_train,
-                                    const bool is_test, const T *src,
-                                    const T *bias, T *dst, MaskType *mask) {
+__global__ void FusedDropoutActBias(
+    Functor act, const uint64_t seed, const uint64_t rows, const uint64_t cols,
+    const int increment, const float dropout_prob,
+    const bool is_upscale_in_train, const bool is_test,
+    const T *__restrict__ src, const T *__restrict__ bias, T *dst,
+    MaskType *mask) {
   int col_id = blockDim.x * blockIdx.x + threadIdx.x;
   int row_id = blockIdx.y;
   int idx = row_id * cols + col_id;
@@ -102,9 +91,8 @@ __global__ void FusedDropoutActBias(Functor act, const uint64_t seed,
   using MaskLoadT = platform::AlignedVector<MaskType, VecSize>;
   using MaskStoreT = platform::AlignedVector<MaskType, VecSize>;
 
-  const int tmp_cols = cols / VecSize * VecSize;
   for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) {
-    for (int i = col_id * VecSize; i < tmp_cols;
+    for (int i = col_id * VecSize; i < cols;
          i += blockDim.x * gridDim.x * VecSize) {
       LoadT src_vec;
       LoadT bias_vec;
@@ -139,11 +127,14 @@ __global__ void FusedDropoutActBias(Functor act, const uint64_t seed,
 #pragma unroll
       for (int ii = 0; ii < VecSize; ii++) {
         const T tmp = src_vec[ii] + bias_vec[ii];
-        dest_vec[ii] = act(&tmp) * static_cast<T>(mask_vec[ii]) * factor;
+        const T act_out = act(tmp);
+        dest_vec[ii] = act_out * static_cast<T>(mask_vec[ii]) * factor;
       }
       // store result to global
       platform::Store<T, VecSize>(dest_vec, &dst[r * cols + i]);
-      platform::Store<MaskType, VecSize>(mask_vec, &mask[r * cols + i]);
+      if (!is_test) {
+        platform::Store<MaskType, VecSize>(mask_vec, &mask[r * cols + i]);
+      }
     }
   }
 }
@@ -161,10 +152,8 @@ void LaunchDropoutActBias(Functor act_functor, const uint64_t seed,
                           const platform::CUDADeviceContext &ctx) {
   // dropout_prob == 1.0f
   if (std::abs(dropout_prob - 1.0f) < 1e-5) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaMemsetAsync(dst, 0, rows * cols * sizeof(T), ctx.stream()));
-    PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync(
-        mask_data, 0, rows * cols * sizeof(MaskType), ctx.stream()));
+    SetZero<T>(ctx, dst, rows * cols);
+    SetZero<MaskType>(ctx, mask_data, rows * cols);
     return;
   }
 
@@ -211,7 +200,7 @@ __global__ void FusedDropoutActGrad(Functor act_grad, const T *dout,
       T args[2];
       args[0] = dout_vec[ii] * static_cast<T>(mask_vec[ii]) * factor;
       args[1] = src_vec[ii];
-      dx_vec[ii] = act_grad(args);
+      dx_vec[ii] = args[0] * act_grad.UseOut(args[1]);
     }
     platform::Store<T, VecSize>(dx_vec, &dx[i]);
   }
@@ -221,7 +210,7 @@ __global__ void FusedDropoutActGrad(Functor act_grad, const T *dout,
  * blocks(128 * 8)
  * 1. calculate the dx and reduce total rows to 128 rows
  * 2. save 128*8 temporary sum in 8*128 shared memory
- * 3. reduce the sum of 128 rows data by 8*VecSize warps
+ * 3. reduce the sum of 128 cols data by 8*VecSize warps
  */
 template <typename T, typename MaskType, int BlockSizeX, int BlockSizeY,
           int VecSize, typename Functor>
@@ -257,43 +246,15 @@ __global__ void FusedDropoutActBiasGrad(Functor act_grad, const T *dout,
         T args[2];
         args[0] = dout_vec[i] * static_cast<T>(mask_vec[i]) * factor;
         args[1] = src_vec[i] + bias_vec[i];
-        val = act_grad(args);
+        val = args[0] * act_grad.UseOut(args[1]);
         dx_vec[i] = val;
         tmp_sum[i] += val;
       }
       platform::Store<T, VecSize>(dx_vec, &dx[index]);
     }
   }
 
-  __shared__ T cache[BlockSizeX * VecSize][BlockSizeY];
-  for (int i = 0; i < VecSize; i++) {
-    cache[threadIdx.x * VecSize + i][threadIdx.y] = tmp_sum[i];
-  }
-  __syncthreads();
-
-  // reduce sum
-  T sum = static_cast<T>(0);
-  int tid = threadIdx.y * blockDim.x + threadIdx.x;
-  int x = tid >> 5;  // warp id
-  int y = tid & 31;  // thread id on warp 0~31
-
-  // need BlockSizeX * VecSize warps
-  if (x < BlockSizeX * VecSize) {
-// reduce 128 to 32
-#pragma unroll
-    for (int i = 0; i < (BlockSizeY >> 5); i++) {
-      sum += cache[x][y + i * 32];
-    }
-  }
-
-  // reduce 32 to 1
-  sum = WarpReduceSum<T>(sum);
-
-  // save sum to dbias
-  int bias_id = blockIdx.x * blockDim.x * VecSize + x;
-  if (y == 0 && x < VecSize * BlockSizeX && bias_id < cols) {
-    dbias[bias_id] = sum;
-  }
+  CalculateDBias<T, VecSize, BlockSizeX, BlockSizeY>(tmp_sum, dbias, cols);
 }
 
 /**
 
@@ -20,11 +20,12 @@ limitations under the License. */
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/operators/fused/fused_dropout_act_bias.h"
 #include "paddle/fluid/operators/fused/fused_dropout_test.h"
+#include "paddle/fluid/operators/math/functors.h"
 
 namespace framework = paddle::framework;
 namespace platform = paddle::platform;
 namespace details = paddle::operators::details;
-namespace operators = paddle::operators;
+namespace math = paddle::operators::math;
 
 /**
  * @brief the unittest of fused_dropout_act_bias
@@ -111,16 +112,12 @@ struct TestFusedDropoutActBias {
     }
 
     {
-      out.Resize({rows, cols});
-      out.mutable_data<T>(place);
-      mask.Resize({rows, cols});
-      mask.mutable_data<uint8_t>(place);
-      dsrc.Resize({rows, cols});
-      dsrc.mutable_data<T>(place);
+      out.mutable_data<T>({rows, cols}, place);
+      mask.mutable_data<uint8_t>({rows, cols}, place);
+      dsrc.mutable_data<T>({rows, cols}, place);
 
       if (has_bias) {
-        dbias.Resize({cols});
-        dbias.mutable_data<T>(place);
+        dbias.mutable_data<T>({cols}, place);
       }
     }
   }
@@ -133,7 +130,7 @@ struct TestFusedDropoutActBias {
       for (int i = 0; i < rows; i++) {
         for (int j = 0; j < cols; j++) {
           const T tmp = src_vec[i * cols + j] + bias_vec[j];
-          out1[i * cols + j] = act(&tmp);
+          out1[i * cols + j] = act(tmp);
         }
       }
       // call dropout
@@ -143,7 +140,7 @@ struct TestFusedDropoutActBias {
       for (int i = 0; i < rows; i++) {
         for (int j = 0; j < cols; j++) {
           const T tmp = src_vec[i * cols + j];
-          out1[i * cols + j] = act(&tmp);
+          out1[i * cols + j] = act(tmp);
         }
       }
 
@@ -164,22 +161,22 @@ struct TestFusedDropoutActBias {
     GradFunctor act_grad;
     for (int i = 0; i < rows; i++) {
       for (int j = 0; j < cols; j++) {
+        T args[2];
+        args[0] = _out[i * cols + j];
         if (has_bias) {
-          T args[2];
-          args[0] = _out[i * cols + j];
           args[1] = src_vec[i * cols + j] + bias_vec[j];
-          T val = act_grad(args);
-          correct_dbias[j] += val;
-          correct_dsrc[i * cols + j] = val;
         } else {
-          T args[2];
-          args[0] = _out[i * cols + j];
           args[1] = src_vec[i * cols + j];
-          T val = act_grad(args);
-          correct_dsrc[i * cols + j] = val;
         }
+        T val = args[0] * act_grad.UseOut(args[1]);
+        correct_dsrc[i * cols + j] = val;
       }
     }
+
+    if (has_bias) {
+      // reduce_sum: keep the same calculation order as the GPU
+      ReduceSum<T>(correct_dsrc, &correct_dbias, rows, cols);
+    }
   }
 
   void FusedForward() {
@@ -273,47 +270,41 @@ static void BaseTest(const bool is_fp16 = false) {
   const int rows = 16;
   std::vector<int> cols_list = {16, 17};
   bool has_bias[2] = {true, false};
-  T default_diff = !is_fp16 ? static_cast<T>(1e-3) : default_diff =
-                                                         static_cast<T>(1e-2);
+  T default_diff = !is_fp16 ? static_cast<T>(1e-5) : static_cast<T>(1e-1);
   for (auto cols : {16, 17}) {
     for (auto has_bias : {true, false}) {
       TestFusedDropoutActBias<T, Functor, GradFunctor> test(rows, cols);
       test.has_bias = has_bias;
       test.Run();
       test.CheckOut(default_diff);
-      if (!is_fp16) {
-        test.CheckGrad(default_diff);
-      }
+      test.CheckGrad(default_diff);
     }
   }
 }
 
 // Runs BaseTest in float for the relu functor pair (math::ReluFunctor /
 // math::ReluGradFunctor) and the gelu functor pair
 // (paddle::operators::GeluFunctor / GeluGradFunctor).
 // NOTE(review): "Dorpout" in the test name looks like a typo for "Dropout";
 // renaming it would change the gtest filter name, so it is left as is.
 TEST(FusedDropout, GPUFusedDorpoutActBias) {
-  BaseTest<float, paddle::operators::ReluFunctor<float>,
-           paddle::operators::ReluGradFunctor<float>>();
-  BaseTest<float, operators::GeluFunctor<float>,
-           operators::GeluGradFunctor<float>>();
+  BaseTest<float, math::ReluFunctor<float>, math::ReluGradFunctor<float>>();
+  BaseTest<float, paddle::operators::GeluFunctor<float>,
+           paddle::operators::GeluGradFunctor<float>>();
 }
 // Runs BaseTest in double for the relu functor pair (math::ReluFunctor /
 // math::ReluGradFunctor) and the gelu functor pair
 // (paddle::operators::GeluFunctor / GeluGradFunctor).
 TEST(FusedDropout, GPUFusedDropoutActBiasDouble) {
-  BaseTest<double, operators::ReluFunctor<double>,
-           operators::ReluGradFunctor<double>>();
-  BaseTest<double, operators::GeluFunctor<double>,
-           operators::GeluGradFunctor<double>>();
+  BaseTest<double, math::ReluFunctor<double>, math::ReluGradFunctor<double>>();
+  BaseTest<double, paddle::operators::GeluFunctor<double>,
+           paddle::operators::GeluGradFunctor<double>>();
 }
 
 // test fp16 (relu functors only). For inference, check_grad is not required
 // (ref: test_dropout_op.py); BaseTest nevertheless calls CheckGrad
 // unconditionally after this change, so the gradient is also checked here,
 // using the looser fp16 tolerance of 1e-1 selected by is_fp16 == true.
 TEST(FusedDropout, GPUFusedDropoutActBiasFp16) {
   using fp16 = platform::float16;
-  BaseTest<fp16, operators::ReluFunctor<fp16>,
-           operators::ReluGradFunctor<fp16>>(true);
+  BaseTest<fp16, math::ReluFunctor<fp16>, math::ReluGradFunctor<fp16>>(true);
 }
 
 TEST(FusedDropout, GPUFusedDropoutActBiasIsUpscaleInTrain) {
   const int rows = 16;
   const int cols = 16;
   for (auto is_upscale_in_train : {true, false}) {
-    TestFusedDropoutActBias<float, operators::ReluFunctor<float>,
-                            operators::ReluGradFunctor<float>>
+    TestFusedDropoutActBias<float, math::ReluFunctor<float>,
+                            math::ReluGradFunctor<float>>
         test(rows, cols, 0, 1.0, is_upscale_in_train, false);
     test.Run();
     test.CheckOut(static_cast<float>(1e-5));
@@ -324,8 +315,8 @@ TEST(FusedDropout, GPUFusedDropoutActBiasIsUpscaleInTrain) {
 TEST(FusedDropout, GPUFusedDropoutActBiasIsTest) {
   const int rows = 16;
   const int cols = 16;
-  TestFusedDropoutActBias<float, operators::ReluFunctor<float>,
-                          operators::ReluGradFunctor<float>>
+  TestFusedDropoutActBias<float, math::ReluFunctor<float>,
+                          math::ReluGradFunctor<float>>
       test(rows, cols, 0, 0.35, true, true);
   test.Run();
   test.CheckOut(static_cast<float>(1e-5));
@@ -335,8 +326,8 @@ TEST(FusedDropout, GPUFusedDropoutActBiasIsTest) {
 TEST(FusedDropout, GPUFusedDropoutActBiasSeed) {
   const int rows = 16;
   const int cols = 16;
-  TestFusedDropoutActBias<float, operators::ReluFunctor<float>,
-                          operators::ReluGradFunctor<float>>
+  TestFusedDropoutActBias<float, math::ReluFunctor<float>,
+                          math::ReluGradFunctor<float>>
       test(rows, cols, 125, 0.0, false, false);
   test.Run();
   test.CheckOut(static_cast<float>(1e-5));
@@ -346,8 +337,8 @@ TEST(FusedDropout, GPUFusedDropoutActBiasSeed) {
 TEST(FusedDropout, GPUFusedDropoutActBiasLargeShape) {
   const int rows = 256;
   const int cols = 4096;
-  TestFusedDropoutActBias<float, operators::ReluFunctor<float>,
-                          operators::ReluGradFunctor<float>>
+  TestFusedDropoutActBias<float, math::ReluFunctor<float>,
+                          math::ReluGradFunctor<float>>
       test(rows, cols);
   test.Run();
   test.CheckOut(static_cast<float>(1e-5));