
Commit 63f3e18

FFN

1 parent bdb5f85   commit 63f3e18

File tree

14 files changed
+1667 −81 lines changed

cmake/operators.cmake

Lines changed: 1 addition & 1 deletion

@@ -214,7 +214,7 @@ function(op_library TARGET)
       "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
       "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
       "skip_layernorm_op" "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op" "fusion_lstm_op"
-      "fused_bn_add_activation_op")
+      "fused_bn_add_activation_op" "fused_ffn_op")
     if ("${TARGET}" STREQUAL "${manual_pybind_op}")
       set(pybind_flag 1)
     endif()

paddle/fluid/operators/fused/CMakeLists.txt

Lines changed: 5 additions & 1 deletion

@@ -16,7 +16,8 @@ register_operators(EXCLUDES
         fusion_gru_op
         fusion_lstm_op
         fused_bn_add_activation_op
-        fused_transformer_op)
+        fused_transformer_op
+        fused_ffn_op)

 # fusion_gru_op does not have CUDA kernel
 op_library(fusion_gru_op)
@@ -77,5 +78,8 @@ if (WITH_GPU OR WITH_ROCM)
     nv_test(test_fused_residual_dropout_bias SRCS fused_residual_dropout_bias_test.cu DEPS tensor op_registry dropout_op layer_norm_op device_context generator memory)
     nv_test(test_fused_dropout_act_bias SRCS fused_dropout_act_bias_test.cu DEPS tensor op_registry dropout_op layer_norm_op device_context generator memory)
     nv_test(test_fused_layernorm_residual_dropout_bias SRCS fused_layernorm_residual_dropout_bias_test.cu DEPS tensor op_registry dropout_op layer_norm_op device_context generator memory)
+
+    op_library(fused_ffn_op)
+    file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_ffn);\n")
   endif()
 endif()

paddle/fluid/operators/fused/fused_dropout_act_bias.h

Lines changed: 52 additions & 7 deletions

@@ -23,6 +23,49 @@ namespace operators {

 typedef platform::float16 fp16;

+/**
+ *@brief the relu functor
+ */
+template <typename T>
+struct ReluFunctor {
+  __host__ __device__ T operator()(const T *args) const {
+    math::ReluFunctor<T> relu;
+    return relu(args[0]);
+  }
+};
+
+template <typename T>
+struct ReluGradFunctor {
+  __host__ __device__ __forceinline__ T operator()(const T *args) const {
+    math::ReluGradFunctor<T> relu_grad;
+    return args[0] * relu_grad.UseOut(args[1]);
+  }
+};
+
+/**
+ *@brief the gelu functor
+ */
+template <typename T>
+struct GeluFunctor {
+  __host__ __device__ T operator()(const T *args) const {
+    math::GeluFunctor<T> gelu;
+    return gelu(args[0]);
+  }
+};
+
+/**
+ *@brief the gelu grad functor
+ */
+template <typename T>
+struct GeluGradFunctor {
+  __host__ __device__ T operator()(const T *args) const {
+    const T grad = args[0];
+    const T x = args[1];
+    math::GeluGradFunctor<T> gelu_grad;
+    return grad * gelu_grad.UseOut(x);
+  }
+};
+
 /**
  * @brief dst = dropout(activation(src + bias));
  * the src, mask and dst shape is (rows, cols)
@@ -96,7 +139,7 @@ __global__ void FusedDropoutActBias(Functor act, const uint64_t seed,
 #pragma unroll
     for (int ii = 0; ii < VecSize; ii++) {
       const T tmp = src_vec[ii] + bias_vec[ii];
-      dest_vec[ii] = act(tmp) * static_cast<T>(mask_vec[ii]) * factor;
+      dest_vec[ii] = act(&tmp) * static_cast<T>(mask_vec[ii]) * factor;
     }
     // store result to global
     platform::Store<T, VecSize>(dest_vec, &dst[r * cols + i]);
@@ -165,9 +208,10 @@ __global__ void FusedDropoutActGrad(Functor act_grad, const T *dout,
     StoreT dx_vec;
 #pragma unroll
     for (int ii = 0; ii < VecSize; ii++) {
-      T x = dout_vec[ii] * static_cast<T>(mask_vec[ii]) * factor;
-      T out = src_vec[ii];
-      dx_vec[ii] = act_grad.UseXAndOut(x, out);
+      T args[2];
+      args[0] = dout_vec[ii] * static_cast<T>(mask_vec[ii]) * factor;
+      args[1] = src_vec[ii];
+      dx_vec[ii] = act_grad(args);
     }
     platform::Store<T, VecSize>(dx_vec, &dx[i]);
   }
@@ -210,9 +254,10 @@ __global__ void FusedDropoutActBiasGrad(Functor act_grad, const T *dout,
 #pragma unroll
     for (int i = 0; i < VecSize; i++) {
       T val;
-      T x = dout_vec[i] * static_cast<T>(mask_vec[i]) * factor;
-      T out = src_vec[i] + bias_vec[i];
-      val = act_grad.UseXAndOut(x, out);
+      T args[2];
+      args[0] = dout_vec[i] * static_cast<T>(mask_vec[i]) * factor;
+      args[1] = src_vec[i] + bias_vec[i];
+      val = act_grad(args);
       dx_vec[i] = val;
       tmp_sum[i] += val;
     }
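
The functor changes above unify the activation and its gradient behind a single operator()(const T *args) interface, so the fused dropout kernels call act(&tmp) in the forward pass and act_grad(args) with a two-element array in the backward pass, regardless of whether the activation needs one or two inputs. A minimal host-side sketch of that calling convention, using hypothetical stand-in functors (Relu/ReluGrad below are illustrations, not the math:: functors the commit wraps):

#include <algorithm>
#include <cstdio>

// Hypothetical stand-ins for the math::ReluFunctor / math::ReluGradFunctor
// wrappers added by this commit; they only illustrate the pointer-based
// operator()(const T *args) interface the fused kernels now rely on.
template <typename T>
struct Relu {
  // args[0] holds the pre-activation value (src + bias).
  T operator()(const T *args) const {
    return std::max(args[0], static_cast<T>(0));
  }
};

template <typename T>
struct ReluGrad {
  // args[0] is the upstream gradient (already scaled by dropout mask/factor),
  // args[1] is the activation input; d(relu)/dx is 1 for x > 0, else 0.
  T operator()(const T *args) const {
    return args[1] > static_cast<T>(0) ? args[0] : static_cast<T>(0);
  }
};

int main() {
  Relu<float> act;
  ReluGrad<float> act_grad;

  // Forward: the kernel passes the address of a single temporary, act(&tmp).
  const float tmp = -0.5f + 1.0f;  // src + bias
  const float out = act(&tmp);

  // Backward: the kernel fills a two-element array and calls act_grad(args).
  float args[2];
  args[0] = 2.0f * 1.0f * (1.0f / 0.9f);  // dout * mask * factor
  args[1] = tmp;                          // src + bias
  const float dx = act_grad(args);

  std::printf("out=%f dx=%f\n", out, dx);
  return 0;
}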

paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu

Lines changed: 42 additions & 34 deletions

@@ -24,7 +24,7 @@ limitations under the License. */
 namespace framework = paddle::framework;
 namespace platform = paddle::platform;
 namespace details = paddle::operators::details;
-namespace math = paddle::operators::math;
+namespace operators = paddle::operators;

 /**
  * @brief the unittest of fused_dropout_act_bias
@@ -133,7 +133,7 @@ struct TestFusedDropoutActBias {
     for (int i = 0; i < rows; i++) {
       for (int j = 0; j < cols; j++) {
         const T tmp = src_vec[i * cols + j] + bias_vec[j];
-        out1[i * cols + j] = act(tmp);
+        out1[i * cols + j] = act(&tmp);
       }
     }
     // call dropout
@@ -143,7 +143,7 @@ struct TestFusedDropoutActBias {
     for (int i = 0; i < rows; i++) {
       for (int j = 0; j < cols; j++) {
         const T tmp = src_vec[i * cols + j];
-        out1[i * cols + j] = act(tmp);
+        out1[i * cols + j] = act(&tmp);
       }
     }

@@ -165,14 +165,17 @@ struct TestFusedDropoutActBias {
     for (int i = 0; i < rows; i++) {
       for (int j = 0; j < cols; j++) {
         if (has_bias) {
-          T x = _out[i * cols + j];
-          T out = src_vec[i * cols + j] + bias_vec[j];
-          T val = act_grad.UseXAndOut(x, out);
+          T args[2];
+          args[0] = _out[i * cols + j];
+          args[1] = src_vec[i * cols + j] + bias_vec[j];
+          T val = act_grad(args);
           correct_dbias[j] += val;
           correct_dsrc[i * cols + j] = val;
         } else {
-          T val =
-              act_grad.UseXAndOut(_out[i * cols + j], src_vec[i * cols + j]);
+          T args[2];
+          args[0] = _out[i * cols + j];
+          args[1] = src_vec[i * cols + j];
+          T val = act_grad(args);
           correct_dsrc[i * cols + j] = val;
         }
       }
@@ -264,84 +267,89 @@ struct TestFusedDropoutActBias {
   }
 };

-template <typename Functor>
-static void BaseTest() {}
 // test the shape , bias, activation
 template <typename T, typename Functor, typename GradFunctor>
 static void BaseTest(const bool is_fp16 = false) {
   const int rows = 16;
   std::vector<int> cols_list = {16, 17};
   bool has_bias[2] = {true, false};
-  T default_diff = !is_fp16 ? static_cast<T>(1e-5) : default_diff =
+  T default_diff = !is_fp16 ? static_cast<T>(1e-3) : default_diff =
                                   static_cast<T>(1e-2);
   for (auto cols : {16, 17}) {
     for (auto has_bias : {true, false}) {
       TestFusedDropoutActBias<T, Functor, GradFunctor> test(rows, cols);
       test.has_bias = has_bias;
       test.Run();
       test.CheckOut(default_diff);
-      test.CheckGrad(default_diff);
+      if (!is_fp16) {
+        test.CheckGrad(default_diff);
+      }
     }
   }
 }

 TEST(FusedDropout, GPUFusedDorpoutActBias) {
-  BaseTest<float, math::ReluFunctor<float>, math::ReluGradFunctor<float>>();
-  BaseTest<float, math::GeluFunctor<float>, math::GeluGradFunctor<float>>();
+  BaseTest<float, paddle::operators::ReluFunctor<float>,
+           paddle::operators::ReluGradFunctor<float>>();
+  BaseTest<float, operators::GeluFunctor<float>,
+           operators::GeluGradFunctor<float>>();
 }
-TEST(FusedDropout, GPUFusedRedisualDorpoutBiasDouble) {
-  BaseTest<double, math::ReluFunctor<double>, math::ReluGradFunctor<double>>();
-  BaseTest<double, math::GeluFunctor<double>, math::GeluGradFunctor<double>>();
+TEST(FusedDropout, GPUFusedDropoutActBiasDouble) {
+  BaseTest<double, operators::ReluFunctor<double>,
+           operators::ReluGradFunctor<double>>();
+  BaseTest<double, operators::GeluFunctor<double>,
+           operators::GeluGradFunctor<double>>();
 }

 // test fp16, For inference, check_grad is not required. ref: test_dropout_op.py
-TEST(FusedDropout, GPUFusedRedisualDorpoutBiasFp16) {
+TEST(FusedDropout, GPUFusedDropoutActBiasFp16) {
   using fp16 = platform::float16;
-  BaseTest<fp16, math::ReluFunctor<fp16>, math::ReluGradFunctor<fp16>>(true);
+  BaseTest<fp16, operators::ReluFunctor<fp16>,
+           operators::ReluGradFunctor<fp16>>(true);
 }

 TEST(FusedDropout, GPUFusedDropoutActBiasIsUpscaleInTrain) {
   const int rows = 16;
   const int cols = 16;
   for (auto is_upscale_in_train : {true, false}) {
-    TestFusedDropoutActBias<float, math::ReluFunctor<float>,
-                            math::ReluGradFunctor<float>>
+    TestFusedDropoutActBias<float, operators::ReluFunctor<float>,
+                            operators::ReluGradFunctor<float>>
         test(rows, cols, 0, 1.0, is_upscale_in_train, false);
     test.Run();
     test.CheckOut(static_cast<float>(1e-5));
-    test.CheckGrad(static_cast<float>(1e-5));
+    test.CheckGrad(static_cast<float>(1e-3));
   }
 }

-TEST(FusedDropout, GPUFusedRedisualDorpoutBiasIsTest) {
+TEST(FusedDropout, GPUFusedDropoutActBiasIsTest) {
   const int rows = 16;
   const int cols = 16;
-  TestFusedDropoutActBias<float, math::ReluFunctor<float>,
-                          math::ReluGradFunctor<float>>
+  TestFusedDropoutActBias<float, operators::ReluFunctor<float>,
+                          operators::ReluGradFunctor<float>>
       test(rows, cols, 0, 0.35, true, true);
   test.Run();
   test.CheckOut(static_cast<float>(1e-5));
-  test.CheckGrad(static_cast<float>(1e-5));
+  test.CheckGrad(static_cast<float>(1e-3));
 }

-TEST(FusedDropout, GPUFusedRedisualDorpoutBiasSeed) {
+TEST(FusedDropout, GPUFusedDropoutActBiasSeed) {
   const int rows = 16;
   const int cols = 16;
-  TestFusedDropoutActBias<float, math::ReluFunctor<float>,
-                          math::ReluGradFunctor<float>>
+  TestFusedDropoutActBias<float, operators::ReluFunctor<float>,
+                          operators::ReluGradFunctor<float>>
      test(rows, cols, 125, 0.0, false, false);
  test.Run();
  test.CheckOut(static_cast<float>(1e-5));
-  test.CheckGrad(static_cast<float>(1e-5));
+  test.CheckGrad(static_cast<float>(1e-3));
 }

-TEST(FusedDropout, GPUFusedRedisualDorpoutBiasLargeShape) {
+TEST(FusedDropout, GPUFusedDropoutActBiasLargeShape) {
   const int rows = 256;
   const int cols = 4096;
-  TestFusedDropoutActBias<float, math::ReluFunctor<float>,
-                          math::ReluGradFunctor<float>>
+  TestFusedDropoutActBias<float, operators::ReluFunctor<float>,
+                          operators::ReluGradFunctor<float>>
      test(rows, cols);
  test.Run();
  test.CheckOut(static_cast<float>(1e-5));
-  test.CheckGrad(static_cast<float>(1e-5));
+  test.CheckGrad(static_cast<float>(1e-3));
 }
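
The unit test's CPU baseline follows the op's definition, dst = dropout(activation(src + bias)), computed elementwise with the bias broadcast along the columns. A simplified standalone sketch of that baseline, assuming ReLU and an externally supplied keep mask (reference_forward and its parameters are illustrative names, not the test's actual helpers):

#include <cstddef>
#include <cstdint>
#include <vector>

// Simplified stand-in for the test's CPU reference of
// dst = dropout(relu(src + bias)); names and signature are illustrative.
std::vector<float> reference_forward(const std::vector<float> &src,
                                     const std::vector<float> &bias,
                                     const std::vector<std::uint8_t> &mask,
                                     std::size_t rows, std::size_t cols,
                                     float dropout_prob,
                                     bool is_upscale_in_train) {
  // With upscale_in_train, kept elements are rescaled by 1/(1-p) so the
  // training-time expectation matches inference; otherwise the keep mask
  // is applied as-is.
  const float factor =
      is_upscale_in_train ? 1.0f / (1.0f - dropout_prob) : 1.0f;
  std::vector<float> dst(rows * cols);
  for (std::size_t i = 0; i < rows; ++i) {
    for (std::size_t j = 0; j < cols; ++j) {
      const std::size_t idx = i * cols + j;
      const float pre = src[idx] + bias[j];       // bias broadcast per column
      const float act = pre > 0.0f ? pre : 0.0f;  // relu
      dst[idx] = act * static_cast<float>(mask[idx]) * factor;
    }
  }
  return dst;
}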
