From 823b0e9553b96748c1196ef5f8c0e987d9e97f11 Mon Sep 17 00:00:00 2001
From: ZzSean <18818272991@163.com>
Date: Mon, 19 Apr 2021 07:22:32 +0000
Subject: [PATCH 01/11] rebase

---
 paddle/fluid/operators/activation_op.cu | 762 ++++++++++++++----------
 1 file changed, 462 insertions(+), 300 deletions(-)
diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index 781a97c1ffcc17..eede79ec924858 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
@@ -10,337 +10,435 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/activation_op.h"
+#include "paddle/fluid/operators/amp/fp16_type_traits.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
 #include "paddle/fluid/operators/math/math_cuda_utils.h"
 #include "paddle/fluid/platform/cuda_device_function.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace operators {
 
-using Tensor = framework::Tensor;
-using float16 = paddle::platform::float16;
+template <typename T>
+struct BaseCudaActiveFunctor {
+  using ELEMENT_TYPE = T;
+  using AttrPair = std::vector<std::pair<const char*, float*>>;
+  AttrPair GetAttrs() { return AttrPair(); }
+};
 
+// For forward functors, args[0] is the input x.
+// For backward functors, args[0] is dout and args[1] is x (kDepX) or out
+// (kDepOut), as selected by FwdDeps(); with kNoDeps only args[0] is passed.
+/********************Relu Begin********************/
 template <typename T>
-struct CudaVecType {
-  using type = T;
-  static constexpr int vecsize = 1;
+struct CudaReluFunctor : public BaseCudaActiveFunctor<T> {
+  T zero = static_cast<T>(0.0f);
+
+  __device__ __forceinline__ T operator()(const T* args) const {
+    return args[0] > zero ? args[0] : zero;
+  }
 };
 
-template <>
-struct CudaVecType<platform::float16> {
-  using type = __half2;
-  static constexpr int vecsize = 2;
+template <typename T>
+struct CudaReluGradFunctor : public BaseCudaActiveFunctor<T> {
+  T zero = static_cast<T>(0.0f);
+
+  __device__ __forceinline__ T operator()(const T* args) const {
+    return args[1] > zero ? args[0] : zero;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };
+/********************Relu End********************/
+
+/********************LeakyRelu Begin********************/
+template <typename T>
+struct CudaLeakyReluFunctor : public BaseCudaActiveFunctor<T> {
+  T zero = static_cast<T>(0.0f);
+  float alpha;
+
+  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};
+  }
 
-template <>
-struct CudaVecType<float> {
-  using type = float4;
-  static constexpr int vecsize = 4;
+  __device__ __forceinline__ T operator()(const T* args) const {
+    return args[0] > zero ? args[0] : static_cast<T>(alpha) * args[0];
+  }
 };
 
 template <typename T>
-class BaseGPUFunctor {
- public:
-  using ELEMENT_TYPE = T;
+struct CudaLeakyReluGradFunctor : public BaseCudaActiveFunctor<T> {
+  T zero = static_cast<T>(0.0f);
+  float alpha;
 
-  using AttrPair = std::vector<std::pair<const char*, float*>>;
+  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};
+  }
 
-  AttrPair GetAttrs() { return AttrPair(); }
+  __device__ __forceinline__ T operator()(const T* args) const {
+    return args[1] > zero ? args[0] : static_cast<T>(alpha) * args[0];
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
+/********************LeakyRelu End********************/
 
-/* ========================================================================== */
+/********************Sigmoid Begin********************/
+template <typename T>
+struct CudaSigmoidFunctor : public BaseCudaActiveFunctor<T> {
+  // CT means Compute Type
+  using CT = typename details::MPTypeTrait<T>::Type;
+  CT one = static_cast<CT>(1.0f);
+
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT x = static_cast<CT>(args[0]);
+    return T(one / (one + exp(-x)));
+  }
+};
 
-/* ===========================    relu forward   ============================ */
 template <typename T>
-class ReluGPUFunctor : public BaseGPUFunctor<T> {
- private:
-  T zero_;
+struct CudaSigmoidGradFunctor : public BaseCudaActiveFunctor<T> {
+  T one = static_cast<T>(1.0f);
 
- public:
-  ReluGPUFunctor() { zero_ = static_cast<T>(0.0f); }
-
-  // for relu forward when T is double
-  __device__ __forceinline__ typename CudaVecType<T>::type Compute(
-      const typename CudaVecType<T>::type in) {
-    // relu forward : out = max(x, 0)
-    return in > zero_ ? in : zero_;
-  }
-
-  // when num % vecsize != 0 this func will be used
-  __device__ __forceinline__ T ComputeRemainder(const T in) {
-    // relu forward : out = max(x, 0)
-    return in > zero_ ? in : zero_;
-  }
-};
-
-template <>
-__device__ __forceinline__ CudaVecType<float>::type
-ReluGPUFunctor<float>::Compute(const CudaVecType<float>::type in) {
-  // relu forward : out = max(in, 0)
-  return make_float4((in.x > zero_) * (in.x), (in.y > zero_) * (in.y),
-                     (in.z > zero_) * (in.z), (in.w > zero_) * (in.w));
-}
-
-template <>
-__device__ __forceinline__ CudaVecType<float16>::type
-ReluGPUFunctor<float16>::Compute(const CudaVecType<float16>::type in) {
-// relu forward : out = max(in, 0)
-#ifdef __HIPCC__ || CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
-  const half2 kzero = __float2half2_rn(0.0f);
-  return __hmul2(__hgt2(in, kzero), in);
-#else
-  const float2 xx = __half22float2(in);
-  return __floats2half2_rn((xx.x > 0.0f) * static_cast<float>(xx.x),
-                           (xx.y > 0.0f) * static_cast<float>(xx.y));
-#endif
-}
-/* ========================================================================== */
+  __device__ __forceinline__ T operator()(const T* args) const {
+    return args[0] * args[1] * (one - args[1]);
+  }
 
-/* ===========================    relu backward   ============================
- */
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+};
+/********************Sigmoid End********************/
 
+/********************LogSigmoid Begin********************/
 template <typename T>
-class ReluGradGPUFunctor : public BaseGPUFunctor<T> {
- private:
-  T zero_;
+struct CudaLogSigmoidFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  CT zero = static_cast<CT>(0.0f);
+
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT x = static_cast<CT>(args[0]);
+    CT temp = x > zero ? zero : -x;
+    return T(-temp - log(exp(-temp) + exp(-x - temp)));
+  }
+};
 
- public:
-  ReluGradGPUFunctor() { zero_ = static_cast<T>(0.0f); }
+template <typename T>
+struct CudaLogSigmoidGradFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  CT zero = static_cast<CT>(0.0f);
+
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT dout = static_cast<CT>(args[0]);
+    CT x = static_cast<CT>(args[1]);
+    CT temp = x > zero ? zero : -x;
+    return T(dout * (exp(-x - temp) / (exp(-temp) + exp(-x - temp))));
+  }
 
-  // for relu backward when T is double
-  __device__ __forceinline__ typename CudaVecType<T>::type Compute(
-      const typename CudaVecType<T>::type out,
-      const typename CudaVecType<T>::type dout) {
-    return out > zero_ ? dout : zero_;
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+/********************LogSigmoid End********************/
+
+/********************Atan Begin********************/
+template <typename T>
+struct CudaAtanFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT x = static_cast<CT>(args[0]);
+    return T(atan(x));
   }
+};
 
-  // when num % vecsize != 0 this func will be used
-  __device__ __forceinline__ T ComputeRemainder(const T out, const T dout) {
-    // relu backward : dx = out > 0 ? dout : 0
-    return out > zero_ ? dout : zero_;
+template <typename T>
+struct CudaAtanGradFunctor : public BaseCudaActiveFunctor<T> {
+  T one = static_cast<T>(1.0f);
+  __device__ __forceinline__ T operator()(const T* args) const {
+    return args[0] * one / (one + args[1] * args[1]);
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
+/********************Atan End********************/
 
-template <>
-__device__ __forceinline__ CudaVecType<float>::type
-ReluGradGPUFunctor<float>::Compute(const CudaVecType<float>::type out,
-                                   const CudaVecType<float>::type dout) {
-  // relu backward : dx = out > 0 ? dout : 0;
-  return make_float4((out.x > zero_) * (dout.x), (out.y > zero_) * (dout.y),
-                     (out.z > zero_) * (dout.z), (out.w > zero_) * (dout.w));
-}
-
-template <>
-__device__ __forceinline__ CudaVecType<float16>::type
-ReluGradGPUFunctor<float16>::Compute(const CudaVecType<float16>::type out,
-                                     const CudaVecType<float16>::type dout) {
-// relu backward : dx = out > 0 ? dout : 0;
-#ifdef __HIPCC__ || CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
-  const half2 kzero = __float2half2_rn(0.0f);
-  return __hmul2(__hgt2(out, kzero), dout);
-#else
-  const float2 xx = __half22float2(out);
-  const float2 yy = __half22float2(dout);
-  return __floats2half2_rn((xx.x > 0.0f) * static_cast<float>(yy.x),
-                           (xx.y > 0.0f) * static_cast<float>(yy.y));
-#endif
-}
-
-/* ========================================================================== */
-/* ========================    leaky relu forward    ========================
- */
+/********************SoftShrink Begin********************/
 template <typename T>
-class LeakyReluGPUFunctor : public BaseGPUFunctor<T> {
- private:
-  T zero_;
-  float alpha_;
+struct CudaSoftShrinkFunctor : public BaseCudaActiveFunctor<T> {
+  float lambda;  // threshold; set from the "lambda" op attribute
 
- public:
-  LeakyReluGPUFunctor() { zero_ = static_cast<T>(0.0f); }
+  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
+    return {{"lambda", &lambda}};
+  }
+
+  __device__ __forceinline__ T operator()(const T* args) const {
+    T lambdaT = static_cast<T>(lambda);
+    T temp1 = static_cast<T>(args[0] > lambdaT);  // 1 when x > lambda, else 0
+    T temp2 = static_cast<T>(args[0] < -lambdaT);  // 1 when x < -lambda
+    return temp1 * (args[0] - lambdaT) + temp2 * (args[0] + lambdaT);
+  }
+};
+
+template <typename T>
+struct CudaSoftShrinkGradFunctor : public BaseCudaActiveFunctor<T> {
+  float lambda;
 
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha_}};
-  }
-  // leakyrelu forward : out = x > 0 ? x : x * alpha
-  __device__ __forceinline__ typename CudaVecType<T>::type Compute(
-      const typename CudaVecType<T>::type in) {
-    return in > zero_ ? in : static_cast<T>(alpha_) * in;
-  }
-
-  __device__ __forceinline__ T ComputeRemainder(const T in) {
-    // leakyrelu forward : out = x > 0 ? x : x * alpha
-    return in > zero_ ? in : static_cast<T>(alpha_) * in;
-  }
-};
-
-template <>
-__device__ __forceinline__ CudaVecType<float>::type
-LeakyReluGPUFunctor<float>::Compute(const CudaVecType<float>::type in) {
-  // leakyrelu forward : out = x > 0 ? x : x * alpha
-  return make_float4((in.x > zero_) ? (in.x) : (in.x) * alpha_,
-                     (in.y > zero_) ? (in.y) : (in.y) * alpha_,
-                     (in.z > zero_) ? (in.z) : (in.z) * alpha_,
-                     (in.w > zero_) ? (in.w) : (in.w) * alpha_);
-}
-
-template <>
-__device__ __forceinline__ CudaVecType<float16>::type
-LeakyReluGPUFunctor<float16>::Compute(const CudaVecType<float16>::type in) {
-  // leakyrelu forward : out = x > 0 ? x : x * alpha
-  const float2 xx = __half22float2(in);
-  return __floats2half2_rn((xx.x > 0.0f) ? xx.x : xx.x * alpha_,
-                           (xx.y > 0.0f) ? xx.y : xx.y * alpha_);
-}
-/* ========================================================================== */
+    return {{"lambda", &lambda}};
+  }
 
-/* ===========================  leaky relu backward   =======================
- */
+  __device__ __forceinline__ T operator()(const T* args) const {
+    T lambdaT = static_cast<T>(lambda);
+    T temp1 = static_cast<T>(args[1] > lambdaT);
+    T temp2 = static_cast<T>(args[1] < -lambdaT);
+    return args[0] * static_cast<T>(temp1 + temp2);
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+/********************SoftShrink End********************/
+
+/********************Ceil Begin********************/
 template <typename T>
-class LeakyReluGradGPUFunctor : public BaseGPUFunctor<T> {
- private:
-  T zero_;
-  float alpha_;
+struct CudaCeilFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT x = static_cast<CT>(args[0]);
+    return T(ceil(x));
+  }
+};
+/********************Ceil End********************/
 
- public:
-  LeakyReluGradGPUFunctor() { zero_ = static_cast<T>(0.0f); }
+/********************Floor Begin********************/
+template <typename T>
+struct CudaFloorFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT x = static_cast<CT>(args[0]);
+    return T(floor(x));
+  }
+};
+/********************Floor End********************/
 
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha_}};
+/********************Round Begin********************/
+template <typename T>
+struct CudaRoundFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT x = static_cast<CT>(args[0]);
+    return T(round(x));
   }
+};
+/********************Round End********************/
 
-  // for leaky relu backward when T is double
-  __device__ __forceinline__ typename CudaVecType<T>::type Compute(
-      const typename CudaVecType<T>::type in,
-      const typename CudaVecType<T>::type dout) {
-    // leakyrelu backward : dx = x > 0 ? dout : alpha * dout
-    return in > zero_ ? dout : static_cast<T>(alpha_) * dout;
+/********************Zero Begin********************/
+template <typename T>
+struct CudaZeroGradFunctor : public BaseCudaActiveFunctor<T> {
+  __device__ __forceinline__ T operator()(const T* args) const {
+    return static_cast<T>(0.0f);
   }
 
-  // when num % vecsize != 0 this func will be used
-  __device__ __forceinline__ T ComputeRemainder(const T in, const T dout) {
-    // leakyrelu backward : dx = x > 0 ? dout : alpha * dout
-    return in > zero_ ? dout : static_cast<T>(alpha_) * dout;
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; }
+};
+/********************Zero End********************/
+
+/********************Cos Begin********************/
+template <typename T>
+struct CudaCosFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT x = static_cast<CT>(args[0]);
+    return T(cos(x));
+  }
+};
+
+template <typename T>
+struct CudaCosGradFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT dout = static_cast<CT>(args[0]);
+    CT x = static_cast<CT>(args[1]);
+    return T(-dout * sin(x));
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
+/********************Cos End********************/
+
+/********************Sin Begin********************/
+template <typename T>
+struct CudaSinFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT x = static_cast<CT>(args[0]);
+    return T(sin(x));
+  }
+};
 
-template <>
-__device__ __forceinline__ CudaVecType<float>::type
-LeakyReluGradGPUFunctor<float>::Compute(const CudaVecType<float>::type in,
-                                        const CudaVecType<float>::type dout) {
-  // leakyrelu backward : dx = x > 0 ? dout : alpha * dout
-  return make_float4((in.x > zero_) ? (dout.x) : alpha_ * (dout.x),
-                     (in.y > zero_) ? (dout.y) : alpha_ * (dout.y),
-                     (in.z > zero_) ? (dout.z) : alpha_ * (dout.z),
-                     (in.w > zero_) ? (dout.w) : alpha_ * (dout.w));
-}
-
-template <>
-__device__ __forceinline__ CudaVecType<float16>::type LeakyReluGradGPUFunctor<
-    float16>::Compute(const CudaVecType<float16>::type in,
-                      const CudaVecType<float16>::type dout) {
-  // leakyrelu backward : dx = x > 0 ? dout : alpha * dout
-  const float2 xx = __half22float2(in);
-  const float2 yy = __half22float2(dout);
-  return __floats2half2_rn((xx.x > 0.0f) ? yy.x : alpha_ * yy.x,
-                           (xx.y > 0.0f) ? yy.y : alpha_ * yy.y);
-}
+template <typename T>
+struct CudaSinGradFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT dout = static_cast<CT>(args[0]);
+    CT x = static_cast<CT>(args[1]);
+    return T(dout * cos(x));
+  }
 
-/* ========================================================================== */
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+/********************Sin End********************/
+
+/********************Tan Begin********************/
+template <typename T>
+struct CudaTanFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT x = static_cast<CT>(args[0]);
+    return T(tan(x));
+  }
+};
+
+template <typename T>
+struct CudaTanGradFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT dout = static_cast<CT>(args[0]);
+    CT x = static_cast<CT>(args[1]);
+    return T(dout / (cos(x) * cos(x)));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+/********************Tan End********************/
+
+/********************Asin Begin********************/
+template <typename T>
+struct CudaAsinFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT x = static_cast<CT>(args[0]);
+    return T(asin(x));
+  }
+};
+
+template <typename T>
+struct CudaAsinGradFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  CT one = static_cast<CT>(1.0f);
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT dout = static_cast<CT>(args[0]);
+    CT x = static_cast<CT>(args[1]);
+    return T(dout * one / sqrt(one - x * x));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+/********************Asin End********************/
+
+/********************Acos Begin********************/
+template <typename T>
+struct CudaAcosFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT x = static_cast<CT>(args[0]);
+    return T(acos(x));
+  }
+};
+
+template <typename T>
+struct CudaAcosGradFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  CT one = static_cast<CT>(1.0f);
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT dout = static_cast<CT>(args[0]);
+    CT x = static_cast<CT>(args[1]);
+    return T(-dout * one / sqrt(one - x * x));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+/********************Acos End********************/
+
+/********************Cosh Begin********************/
+template <typename T>
+struct CudaCoshFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT x = static_cast<CT>(args[0]);
+    return T(cosh(x));
+  }
+};
+
+template <typename T>
+struct CudaCoshGradFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT dout = static_cast<CT>(args[0]);
+    CT x = static_cast<CT>(args[1]);
+    return T(dout * sinh(x));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+/********************Cosh End********************/
+
+/********************Sinh Begin********************/
+template <typename T>
+struct CudaSinhFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT x = static_cast<CT>(args[0]);
+    return T(sinh(x));
+  }
+};
+
+template <typename T>
+struct CudaSinhGradFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT dout = static_cast<CT>(args[0]);
+    CT x = static_cast<CT>(args[1]);
+    return T(dout * cosh(x));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+/********************Sinh End********************/
+
+/********************Reciprocal Begin********************/
+template <typename T>
+struct CudaReciprocalFunctor : public BaseCudaActiveFunctor<T> {
+  T one = static_cast<T>(1.0f);
+  __device__ __forceinline__ T operator()(const T* args) const {
+    return one / args[0];
+  }
+};
 
-template <typename T, typename Functor>
-__global__ void ActivationGradKernelVec(const T* forward_data, const T* dout,
-                                        T* dx, int num, Functor functor) {
-  using VecType = typename CudaVecType<T>::type;
-  constexpr int vecsize = CudaVecType<T>::vecsize;
-  int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  int stride = blockDim.x * gridDim.x;
-  int loop = num / vecsize;
-  int tail = num % vecsize;
-  const VecType* in_forward = reinterpret_cast<const VecType*>(forward_data);
-  const VecType* in_dout = reinterpret_cast<const VecType*>(dout);
-  VecType* out = reinterpret_cast<VecType*>(dx);
-  VecType forward_vec, dout_vec;
-  T in_data, dout_data;
-  for (int i = idx; i < loop; i += stride) {
-#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350
-    forward_vec = __ldg(in_forward + i);
-    dout_vec = __ldg(in_dout + i);
-#else
-    forward_vec = in_forward[i];
-    dout_vec = in_dout[i];
-#endif
-    out[i] = functor.Compute(forward_vec, dout_vec);
-  }
-
-  while (idx == loop && tail) {
-    in_data = forward_data[num - tail];
-    dout_data = dout[num - tail];
-    dx[num - tail] = functor.ComputeRemainder(in_data, dout_data);
-    --tail;
-  }
-}
-
-template <typename T, typename Functor>
-__global__ void ActivationkernelVec(const T* src, T* dst, int num,
-                                    Functor functor) {
-  constexpr int vecsize = CudaVecType<T>::vecsize;
-  using VecType = typename CudaVecType<T>::type;
-  int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  int stride = blockDim.x * gridDim.x;
-  int loop = num / vecsize;
-  int tail = num % vecsize;
-  const VecType* in = reinterpret_cast<const VecType*>(src);
-  VecType* out = reinterpret_cast<VecType*>(dst);
-  VecType x_vec;
-  for (int i = idx; i < loop; i += stride) {
-#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350
-    x_vec = __ldg(in + i);
-#else
-    x_vec = in[i];
-#endif
-    out[i] = functor.Compute(x_vec);
-  }
-
-  while (idx == loop && tail) {
-    dst[num - tail] = functor.ComputeRemainder(src[num - tail]);
-    --tail;
-  }
-}
+template <typename T>
+struct CudaReciprocalGradFunctor : public BaseCudaActiveFunctor<T> {
+  __device__ __forceinline__ T operator()(const T* args) const {
+    return -args[0] * args[1] * args[1];
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+};
+/********************Reciprocal End********************/
 
 template <typename DeviceContext, typename Functor>
 class ActivationGPUKernel
     : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
  public:
   using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& context) const override {
-    const framework::Tensor* in_x = nullptr;
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const framework::Tensor* x = nullptr;
     framework::Tensor* out = nullptr;
-    ExtractActivationTensor(context, &in_x, &out);
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-
-    int num = in_x->numel();
-    const T* input_data = in_x->data<T>();
-    T* output_data = out->mutable_data<T>(dev_ctx.GetPlace(),
-                                          static_cast<size_t>(num * sizeof(T)));
-
-    int block = 512;
-#ifdef __HIPCC__
-    block = 256;
-#endif
-    Functor functor;
+    ExtractActivationTensor(ctx, &x, &out);
+    out->mutable_data<T>(ctx.GetPlace());
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    std::vector<const framework::Tensor*> ins = {x};
+    std::vector<framework::Tensor*> outs = {out};
+    auto functor = Functor();
     auto attrs = functor.GetAttrs();
     for (auto& attr : attrs) {
-      *attr.second = context.Attr<float>(attr.first);
+      *attr.second = ctx.Attr<float>(attr.first);
     }
-    constexpr int vecsize = CudaVecType<T>::vecsize;
-    int grid = max((num / vecsize + block - 1) / block, 1);
-    auto stream = context.cuda_device_context().stream();
-    ActivationkernelVec<T, Functor><<<grid, block, 0, stream>>>(
-        input_data, output_data, num, functor);
+    LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T>(dev_ctx, ins, &outs,
+                                                            functor);
   }
 };
 
@@ -349,43 +447,38 @@ class ActivationGradGPUKernel
     : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
  public:
   using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext& ctx) const override {
     const framework::Tensor *x, *out, *d_out;
     framework::Tensor* d_x = nullptr;
     x = out = d_out = nullptr;
-    ExtractActivationGradTensor<Functor::FwdDeps()>(context, &x, &out, &d_out,
+    ExtractActivationGradTensor<Functor::FwdDeps()>(ctx, &x, &out, &d_out,
                                                     &d_x);
-    int numel = d_out->numel();
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    auto* dx_data = d_x->mutable_data<T>(
-        dev_ctx.GetPlace(), static_cast<size_t>(numel * sizeof(T)));
-    auto* dout_data = d_out->data<T>();
+    d_x->mutable_data<T>(ctx.GetPlace());
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    auto functor = Functor();
+    auto attrs = functor.GetAttrs();
+    for (auto& attr : attrs) {
+      *attr.second = ctx.Attr<float>(attr.first);
+    }
+
+    std::vector<const framework::Tensor*> ins = {d_out};
+    std::vector<framework::Tensor*> outs = {d_x};
 
-    auto* forward_data = dout_data;
     if (static_cast<int>(Functor::FwdDeps()) == static_cast<int>(kDepOut)) {
       // Only need forward output Out
-      forward_data = out->data<T>();
+      ins.push_back(out);
+      LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T>(dev_ctx, ins,
+                                                               &outs, functor);
     } else if (static_cast<int>(Functor::FwdDeps()) ==
                static_cast<int>(kDepX)) {
       // Only need forward input X
-      forward_data = x->data<T>();
-    }
-
-    int block = 512;
-#ifdef __HIPCC__
-    block = 256;
-#endif
-
-    Functor functor;
-    auto attrs = functor.GetAttrs();
-    for (auto& attr : attrs) {
-      *attr.second = context.Attr<float>(attr.first);
+      ins.push_back(x);
+      LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T>(dev_ctx, ins,
+                                                               &outs, functor);
+    } else {
+      LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T>(dev_ctx, ins,
+                                                              &outs, functor);
     }
-    constexpr int vecsize = CudaVecType<T>::vecsize;
-    int grid = max((numel / vecsize + block - 1) / block, 1);
-    auto stream = context.cuda_device_context().stream();
-    ActivationGradKernelVec<T, Functor><<<grid, block, 0, stream>>>(
-        forward_data, dout_data, dx_data, numel, functor);
   }
 };
 
@@ -410,7 +503,6 @@ namespace plat = paddle::platform;
                                 ops::grad_functor<double>>,                 \
       ops::ActivationGradKernel<plat::CUDADeviceContext,                    \
                                 ops::grad_functor<plat::float16>>);
-FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CUDA_KERNEL);
 
 #define REGISTER_ACTIVATION_GPU_KERNEL(act_type, op_name, functor,             \
                                        grad_functor)                           \
@@ -430,8 +522,8 @@ FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CUDA_KERNEL);
                                    ops::grad_functor<plat::float16>>);
 
 /* ======================== leaky relu register  ============================ */
-REGISTER_ACTIVATION_GPU_KERNEL(leaky_relu, LeakyRelu, LeakyReluGPUFunctor,
-                               LeakyReluGradGPUFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor,
+                               CudaLeakyReluGradFunctor);
 
 REGISTER_OP_CUDA_KERNEL(
     leaky_relu_grad_grad,
@@ -456,7 +548,8 @@ REGISTER_OP_CUDA_KERNEL(
 /* ========================================================================== */
 
 /* ===========================    relu register  ============================ */
-REGISTER_ACTIVATION_GPU_KERNEL(relu, Relu, ReluGPUFunctor, ReluGradGPUFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(relu, Relu, CudaReluFunctor,
+                               CudaReluGradFunctor);
 
 REGISTER_OP_CUDA_KERNEL(
     relu_grad_grad,
@@ -594,3 +687,72 @@ REGISTER_OP_CUDA_KERNEL(
     ops::LogDoubleGradKernel<plat::CUDADeviceContext,
                              ops::LogGradGradFunctor<plat::float16>>);
 /* ========================================================================== */
+REGISTER_ACTIVATION_GPU_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor,
+                               CudaSigmoidGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor,
+                               CudaLogSigmoidGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(atan, Atan, CudaAtanFunctor,
+                               CudaAtanGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(softshrink, SoftShrink, CudaSoftShrinkFunctor,
+                               CudaSoftShrinkGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(ceil, Ceil, CudaCeilFunctor,
+                               CudaZeroGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(floor, Floor, CudaFloorFunctor,
+                               CudaZeroGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(cos, Cos, CudaCosFunctor, CudaCosGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(tan, Tan, CudaTanFunctor, CudaTanGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(acos, Acos, CudaAcosFunctor,
+                               CudaAcosGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(sin, Sin, CudaSinFunctor, CudaSinGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(asin, Asin, CudaAsinFunctor,
+                               CudaAsinGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(sinh, Sinh, CudaSinhFunctor,
+                               CudaSinhGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(cosh, Cosh, CudaCoshFunctor,
+                               CudaCoshGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(round, Round, CudaRoundFunctor,
+                               CudaZeroGradFunctor);
+// REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, SigmoidFunctor,
+// SigmoidGradFunctor);
+// REGISTER_ACTIVATION_CUDA_KERNEL(logsigmoid, LogSigmoid, LogSigmoidFunctor,
+// LogSigmoidGradFunctor);
+// REGISTER_ACTIVATION_CUDA_KERNEL(atan, Atan, AtanFunctor, AtanGradFunctor);
+// REGISTER_ACTIVATION_CUDA_KERNEL(softshrink, SoftShrink, SoftShrinkFunctor,
+// SoftShrinkGradFunctor);
+// REGISTER_ACTIVATION_CUDA_KERNEL(ceil, Ceil, CeilFunctor, ZeroGradFunctor);
+//  REGISTER_ACTIVATION_CUDA_KERNEL(floor, Floor, FloorFunctor,
+//  ZeroGradFunctor);
+// REGISTER_ACTIVATION_CUDA_KERNEL(cos, Cos, CosFunctor, CosGradFunctor);
+// REGISTER_ACTIVATION_CUDA_KERNEL(tan, Tan, TanFunctor, TanGradFunctor);
+// REGISTER_ACTIVATION_CUDA_KERNEL(acos, Acos, AcosFunctor, AcosGradFunctor);
+// REGISTER_ACTIVATION_CUDA_KERNEL(sin, Sin, SinFunctor, SinGradFunctor);
+// REGISTER_ACTIVATION_CUDA_KERNEL(asin, Asin, AsinFunctor, AsinGradFunctor);
+// REGISTER_ACTIVATION_CUDA_KERNEL(sinh, Sinh, SinhFunctor, SinhGradFunctor);
+// REGISTER_ACTIVATION_CUDA_KERNEL(cosh, Cosh, CoshFunctor, CoshGradFunctor);
+// REGISTER_ACTIVATION_CUDA_KERNEL(round, Round, RoundFunctor, ZeroGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(reciprocal, Reciprocal, ReciprocalFunctor,
+                                ReciprocalGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(log1p, Log1p, Log1pFunctor, Log1pGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(log2, Log2, Log2Functor, Log2GradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(log10, Log10, Log10Functor, Log10GradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(brelu, BRelu, BReluFunctor, BReluGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(soft_relu, SoftRelu, SoftReluFunctor,
+                                SoftReluGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(stanh, STanh, STanhFunctor, STanhGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(softplus, Softplus, SoftplusFunctor,
+                                SoftplusGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(softsign, Softsign, SoftsignFunctor,
+                                SoftsignGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(relu6, Relu6, Relu6Functor, Relu6GradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(tanh_shrink, TanhShrink, TanhShrinkFunctor,
+                                TanhShrinkGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(hard_shrink, HardShrink, HardShrinkFunctor,
+                                HardShrinkGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(hard_sigmoid, HardSigmoid, HardSigmoidFunctor,
+                                HardSigmoidGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(swish, Swish, SwishFunctor, SwishGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(thresholded_relu, ThresholdedRelu,
+                                ThresholdedReluFunctor,
+                                ThresholdedReluGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(hard_swish, HardSwish, HardSwishFunctor,
+                                HardSwishGradFunctor);

From ec22be6e74c659e4ec7cf857813343534a90cd66 Mon Sep 17 00:00:00 2001
From: ZzSean <18818272991@163.com>
Date: Thu, 22 Apr 2021 13:15:42 +0000
Subject: [PATCH 02/11] add 12 ops

---
 paddle/fluid/operators/activation_op.cu | 444 ++++++++++++++++++++++--
 1 file changed, 415 insertions(+), 29 deletions(-)

diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index eede79ec924858..e62aebce7cbf6c 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -150,7 +150,7 @@ template <typename T>
 struct CudaAtanGradFunctor : public BaseCudaActiveFunctor<T> {
   T one = static_cast<T>(1.0f);
   __device__ __forceinline__ T operator()(const T* args) const {
-    return args[0] * one / (one + args[1] * args[1]);
+    return args[0] / (one + args[1] * args[1]);
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
@@ -167,10 +167,11 @@ struct CudaSoftShrinkFunctor : public BaseCudaActiveFunctor<T> {
   }
 
   __device__ __forceinline__ T operator()(const T* args) const {
-    T lambdaT = static_cast<T>(lambda);
-    T temp1 = static_cast<T>(args[0] > lambdaT);
-    T temp2 = static_cast<T>(args[0] < -lambdaT);
-    return temp1 * (args[0] - lambdaT) + temp2 * (args[0] + lambdaT);
+    T x = args[0];
+    T l = static_cast<T>(lambda);
+    T temp1 = static_cast<T>(x > l);
+    T temp2 = static_cast<T>(x < -l);
+    return temp1 * (x - l) + temp2 * (x + l);
   }
 };
 
@@ -183,10 +184,11 @@ struct CudaSoftShrinkGradFunctor : public BaseCudaActiveFunctor<T> {
   }
 
   __device__ __forceinline__ T operator()(const T* args) const {
-    T lambdaT = static_cast<T>(lambda);
-    T temp1 = static_cast<T>(args[1] > lambdaT);
-    T temp2 = static_cast<T>(args[1] < -lambdaT);
-    return args[0] * static_cast<T>(temp1 + temp2);
+    T x = args[1];
+    T l = static_cast<T>(lambda);
+    T temp1 = static_cast<T>(x > l);
+    T temp2 = static_cast<T>(x < -l);
+    return args[0] * (temp1 + temp2);
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
@@ -323,7 +325,7 @@ struct CudaAsinGradFunctor : public BaseCudaActiveFunctor<T> {
   __device__ __forceinline__ T operator()(const T* args) const {
     CT dout = static_cast<CT>(args[0]);
     CT x = static_cast<CT>(args[1]);
-    return T(dout * one / sqrt(one - x * x));
+    return T(dout / sqrt(one - x * x));
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
@@ -347,7 +349,7 @@ struct CudaAcosGradFunctor : public BaseCudaActiveFunctor<T> {
   __device__ __forceinline__ T operator()(const T* args) const {
     CT dout = static_cast<CT>(args[0]);
     CT x = static_cast<CT>(args[1]);
-    return T(-dout * one / sqrt(one - x * x));
+    return T(-dout / sqrt(one - x * x));
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
@@ -419,6 +421,361 @@ struct CudaReciprocalGradFunctor : public BaseCudaActiveFunctor<T> {
 };
 /********************Reciprocal End********************/
 
+/********************Log1p Begin********************/
+template <typename T>
+struct CudaLog1pFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  CT one = static_cast<CT>(1.0f);
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT x = static_cast<CT>(args[0]);
+    return T(log(one + x));
+  }
+};
+
+template <typename T>
+struct CudaLog1pGradFunctor : public BaseCudaActiveFunctor<T> {
+  T one = static_cast<T>(1.0f);
+  __device__ __forceinline__ T operator()(const T* args) const {
+    return args[0] / (one + args[1]);
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+/********************Log1p End********************/
+
+/********************Log2 Begin********************/
+template <typename T>
+struct CudaLog2Functor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT x = static_cast<CT>(args[0]);
+    return T(log2(x));
+  }
+};
+
+template <typename T>
+struct CudaLog2GradFunctor : public BaseCudaActiveFunctor<T> {
+  T log_two = static_cast<T>(log(2));
+  __device__ __forceinline__ T operator()(const T* args) const {
+    return args[0] / (args[1] * log_two);
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+/********************Log2 End********************/
+
+/********************Log10 Begin********************/
+template <typename T>
+struct CudaLog10Functor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT x = static_cast<CT>(args[0]);
+    return T(log10(x));
+  }
+};
+
+template <typename T>
+struct CudaLog10GradFunctor : public BaseCudaActiveFunctor<T> {
+  T log_ten = static_cast<T>(log(10));
+  __device__ __forceinline__ T operator()(const T* args) const {
+    return args[0] / (args[1] * log_ten);
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+/********************Log10 End********************/
+
+/********************BRelu Begin********************/
+template <typename T>
+struct CudaBReluFunctor : public BaseCudaActiveFunctor<T> {
+  float t_min;
+  float t_max;
+
+  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
+    return {{"t_min", &t_min}, {"t_max", &t_max}};
+  }
+
+  __device__ __forceinline__ T operator()(const T* args) const {
+    T x = args[0];
+    T t_min_cast = static_cast<T>(t_min);
+    T t_max_cast = static_cast<T>(t_max);
+    return (x > t_min_cast && x < t_max_cast)
+               ? x
+               : (x <= t_min_cast ? t_min_cast : t_max_cast);
+  }
+};
+
+template <typename T>
+struct CudaBReluGradFunctor : public BaseCudaActiveFunctor<T> {
+  T zero = static_cast<T>(0.0f);
+  float t_min;
+  float t_max;
+
+  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
+    return {{"t_min", &t_min}, {"t_max", &t_max}};
+  }
+
+  __device__ __forceinline__ T operator()(const T* args) const {
+    T dout = args[0];
+    T x = args[1];
+    T t_min_cast = static_cast<T>(t_min);
+    T t_max_cast = static_cast<T>(t_max);
+    return (x <= t_min_cast || x >= t_max_cast) ? zero : dout;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+/********************BRelu End********************/
+
+/********************SoftRelu Begin********************/
+template <typename T>
+struct CudaSoftReluFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  CT one = static_cast<CT>(1.0f);
+  float threshold;
+
+  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT x = static_cast<CT>(args[0]);
+    CT t = static_cast<CT>(threshold);
+    CT temp = (x > -t && x < t) ? x : (x <= -t ? -t : t);
+    return T(log(one + exp(temp)));
+  }
+};
+
+template <typename T>
+struct CudaSoftReluGradFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  CT one = static_cast<CT>(1.0f);
+  float threshold;
+
+  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT dout = static_cast<CT>(args[0]);
+    CT out = static_cast<CT>(args[1]);
+    CT t = static_cast<CT>(threshold);
+    return (out <= -t || out >= t) ? static_cast<T>(0.0f)
+                                   : T(dout * (one - exp(-out)));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+};
+/********************SoftRelu End********************/
+
+/********************STanh Begin********************/
+template <typename T>
+struct CudaSTanhFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  float scale_a;
+  float scale_b;
+
+  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
+    return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
+  }
+
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT x = static_cast<CT>(args[0]);
+    CT a = static_cast<CT>(scale_a);
+    CT b = static_cast<CT>(scale_b);
+    return T(b * tanh(a * x));
+  }
+};
+
+template <typename T>
+struct CudaSTanhGradFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  CT one = static_cast<CT>(1.0f);
+  float scale_a;
+  float scale_b;
+
+  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
+    return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
+  }
+
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT dout = static_cast<CT>(args[0]);
+    CT x = static_cast<CT>(args[1]);
+    CT a = static_cast<CT>(scale_a);
+    CT b = static_cast<CT>(scale_b);
+    CT temp = tanh(a * x) * tanh(a * x);
+    return T(dout * a * b * (one - temp));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+/********************STanh End********************/
+
+/********************Softplus Begin********************/
+template <typename T>
+struct CudaSoftplusFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  CT one = static_cast<CT>(1.0f);
+  float beta;
+  float threshold;
+
+  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
+    return {{"beta", &beta}, {"threshold", &threshold}};
+  }
+
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT x = static_cast<CT>(args[0]);
+    CT b = static_cast<CT>(beta);
+    CT t = static_cast<CT>(threshold);
+    CT x_beta = x * beta;
+    return T(x_beta > t ? x : log(one + exp(x_beta)) / b);
+  }
+};
+
+template <typename T>
+struct CudaSoftplusGradFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  CT one = static_cast<CT>(1.0f);
+  float beta;
+  float threshold;
+
+  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
+    return {{"beta", &beta}, {"threshold", &threshold}};
+  }
+
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT dout = static_cast<CT>(args[0]);
+    CT x = static_cast<CT>(args[1]);
+    CT b = static_cast<CT>(beta);
+    CT t = static_cast<CT>(threshold);
+    CT x_beta = x * beta;
+    return x_beta > t ? args[0] : T(dout / (one + exp(-x_beta)));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+/********************Softplus End********************/
+
+/********************Softsign Begin********************/
+template <typename T>
+struct CudaSoftsignFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  CT one = static_cast<CT>(1.0f);
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT x = static_cast<CT>(args[0]);
+    return T(x / (one + abs(x)));
+  }
+};
+
+template <typename T>
+struct CudaSoftsignGradFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  CT one = static_cast<CT>(1.0f);
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT dout = static_cast<CT>(args[0]);
+    CT x = static_cast<CT>(args[1]);
+    return T(dout / ((one + abs(x)) * (one + abs(x))));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+/********************Softsign End********************/
+
+/********************Relu6 Begin********************/
+template <typename T>
+struct CudaRelu6Functor : public BaseCudaActiveFunctor<T> {
+  T zero = static_cast<T>(0.0f);
+  float threshold;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+
+  __device__ __forceinline__ T operator()(const T* args) const {
+    T t = static_cast<T>(threshold);
+    return args[0] <= zero ? zero : (args[0] < t ? args[0] : t);
+  }
+};
+
+template <typename T>
+struct CudaRelu6GradFunctor : public BaseCudaActiveFunctor<T> {
+  T zero = static_cast<T>(0.0f);
+  float threshold;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+
+  __device__ __forceinline__ T operator()(const T* args) const {
+    T t = static_cast<T>(threshold);
+    return (args[1] > zero && args[1] < t) ? args[0] : zero;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+};
+/********************Relu6 End********************/
+
+/********************TanhShrink Begin********************/
+template <typename T>
+struct CudaTanhShrinkFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT x = static_cast<CT>(args[0]);
+    return T(x - tanh(x));
+  }
+};
+
+template <typename T>
+struct CudaTanhShrinkGradFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT dout = static_cast<CT>(args[0]);
+    CT x = static_cast<CT>(args[1]);
+    return T(dout * tanh(x) * tanh(x));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+/********************TanhShrink End********************/
+
+/********************HardShrink Begin********************/
+template <typename T>
+struct CudaHardShrinkFunctor : public BaseCudaActiveFunctor<T> {
+  float threshold;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+
+  __device__ __forceinline__ T operator()(const T* args) const {
+    T x = args[0];
+    T t = static_cast<T>(threshold);
+    T temp1 = static_cast<T>(x > t);
+    T temp2 = static_cast<T>(x < -t);
+    return x * (temp1 + temp2);
+  }
+};
+
+template <typename T>
+struct CudaHardShrinkGradFunctor : public BaseCudaActiveFunctor<T> {
+  float threshold;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+
+  __device__ __forceinline__ T operator()(const T* args) const {
+    T x = args[1];
+    T t = static_cast<T>(threshold);
+    T temp1 = static_cast<T>(x > t);
+    T temp2 = static_cast<T>(x < -t);
+    return args[0] * (temp1 + temp2);
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+/********************HardShrink End********************/
+
 template <typename DeviceContext, typename Functor>
 class ActivationGPUKernel
     : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
@@ -712,6 +1069,30 @@ REGISTER_ACTIVATION_GPU_KERNEL(cosh, Cosh, CudaCoshFunctor,
                                CudaCoshGradFunctor);
 REGISTER_ACTIVATION_GPU_KERNEL(round, Round, CudaRoundFunctor,
                                CudaZeroGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(reciprocal, Reciprocal, CudaReciprocalFunctor,
+                               CudaReciprocalGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(log1p, Log1p, CudaLog1pFunctor,
+                               CudaLog1pGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(log2, Log2, CudaLog2Functor,
+                               CudaLog2GradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(log10, Log10, CudaLog10Functor,
+                               CudaLog10GradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(brelu, BRelu, CudaBReluFunctor,
+                               CudaBReluGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(soft_relu, SoftRelu, CudaSoftReluFunctor,
+                               CudaSoftReluGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(stanh, STanh, CudaSTanhFunctor,
+                               CudaSTanhGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(softplus, Softplus, CudaSoftplusFunctor,
+                               CudaSoftplusGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(softsign, Softsign, CudaSoftsignFunctor,
+                               CudaSoftsignGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(relu6, Relu6, CudaRelu6Functor,
+                               CudaRelu6GradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(tanh_shrink, TanhShrink, CudaTanhShrinkFunctor,
+                               CudaTanhShrinkGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(hard_shrink, HardShrink, CudaHardShrinkFunctor,
+                               CudaHardShrinkGradFunctor);
 // REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, SigmoidFunctor,
 // SigmoidGradFunctor);
 // REGISTER_ACTIVATION_CUDA_KERNEL(logsigmoid, LogSigmoid, LogSigmoidFunctor,
@@ -730,24 +1111,29 @@ REGISTER_ACTIVATION_GPU_KERNEL(round, Round, CudaRoundFunctor,
 // REGISTER_ACTIVATION_CUDA_KERNEL(sinh, Sinh, SinhFunctor, SinhGradFunctor);
 // REGISTER_ACTIVATION_CUDA_KERNEL(cosh, Cosh, CoshFunctor, CoshGradFunctor);
 // REGISTER_ACTIVATION_CUDA_KERNEL(round, Round, RoundFunctor, ZeroGradFunctor);
-REGISTER_ACTIVATION_CUDA_KERNEL(reciprocal, Reciprocal, ReciprocalFunctor,
-                                ReciprocalGradFunctor);
-REGISTER_ACTIVATION_CUDA_KERNEL(log1p, Log1p, Log1pFunctor, Log1pGradFunctor);
-REGISTER_ACTIVATION_CUDA_KERNEL(log2, Log2, Log2Functor, Log2GradFunctor);
-REGISTER_ACTIVATION_CUDA_KERNEL(log10, Log10, Log10Functor, Log10GradFunctor);
-REGISTER_ACTIVATION_CUDA_KERNEL(brelu, BRelu, BReluFunctor, BReluGradFunctor);
-REGISTER_ACTIVATION_CUDA_KERNEL(soft_relu, SoftRelu, SoftReluFunctor,
-                                SoftReluGradFunctor);
-REGISTER_ACTIVATION_CUDA_KERNEL(stanh, STanh, STanhFunctor, STanhGradFunctor);
-REGISTER_ACTIVATION_CUDA_KERNEL(softplus, Softplus, SoftplusFunctor,
-                                SoftplusGradFunctor);
-REGISTER_ACTIVATION_CUDA_KERNEL(softsign, Softsign, SoftsignFunctor,
-                                SoftsignGradFunctor);
-REGISTER_ACTIVATION_CUDA_KERNEL(relu6, Relu6, Relu6Functor, Relu6GradFunctor);
-REGISTER_ACTIVATION_CUDA_KERNEL(tanh_shrink, TanhShrink, TanhShrinkFunctor,
-                                TanhShrinkGradFunctor);
-REGISTER_ACTIVATION_CUDA_KERNEL(hard_shrink, HardShrink, HardShrinkFunctor,
-                                HardShrinkGradFunctor);
+// REGISTER_ACTIVATION_CUDA_KERNEL(reciprocal, Reciprocal, ReciprocalFunctor,
+//                                ReciprocalGradFunctor);
+// REGISTER_ACTIVATION_CUDA_KERNEL(log1p, Log1p, Log1pFunctor,
+// Log1pGradFunctor);
+// REGISTER_ACTIVATION_CUDA_KERNEL(log2, Log2, Log2Functor, Log2GradFunctor);
+// REGISTER_ACTIVATION_CUDA_KERNEL(log10, Log10, Log10Functor,
+// Log10GradFunctor);
+// REGISTER_ACTIVATION_CUDA_KERNEL(brelu, BRelu, BReluFunctor,
+// BReluGradFunctor);
+// REGISTER_ACTIVATION_CUDA_KERNEL(soft_relu, SoftRelu, SoftReluFunctor,
+//                                SoftReluGradFunctor);
+// REGISTER_ACTIVATION_CUDA_KERNEL(stanh, STanh, STanhFunctor,
+// STanhGradFunctor);
+// REGISTER_ACTIVATION_CUDA_KERNEL(softplus, Softplus, SoftplusFunctor,
+//                                SoftplusGradFunctor);
+// REGISTER_ACTIVATION_CUDA_KERNEL(softsign, Softsign, SoftsignFunctor,
+//                                SoftsignGradFunctor);
+// REGISTER_ACTIVATION_CUDA_KERNEL(relu6, Relu6, Relu6Functor,
+// Relu6GradFunctor);
+// REGISTER_ACTIVATION_CUDA_KERNEL(tanh_shrink, TanhShrink, TanhShrinkFunctor,
+//                                TanhShrinkGradFunctor);
+// REGISTER_ACTIVATION_CUDA_KERNEL(hard_shrink, HardShrink, HardShrinkFunctor,
+//                                HardShrinkGradFunctor);
 REGISTER_ACTIVATION_CUDA_KERNEL(hard_sigmoid, HardSigmoid, HardSigmoidFunctor,
                                 HardSigmoidGradFunctor);
 REGISTER_ACTIVATION_CUDA_KERNEL(swish, Swish, SwishFunctor, SwishGradFunctor);

From 59b16b9dcc107548e439834f1d90dcff0561309e Mon Sep 17 00:00:00 2001
From: ZzSean <18818272991@163.com>
Date: Fri, 23 Apr 2021 08:07:06 +0000
Subject: [PATCH 03/11] add all activation ops

---
 paddle/fluid/operators/activation_op.cu | 622 +++++++++++++++++-------
 1 file changed, 451 insertions(+), 171 deletions(-)

diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index e62aebce7cbf6c..84454f574c9c30 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -402,6 +402,29 @@ struct CudaSinhGradFunctor : public BaseCudaActiveFunctor<T> {
 };
 /********************Sinh End********************/
 
+/********************Tanh Begin********************/
+template <typename T>
+struct CudaTanhFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT x = static_cast<CT>(args[0]);
+    return T(tanh(x));
+  }
+};
+
+template <typename T>
+struct CudaTanhGradFunctor : public BaseCudaActiveFunctor<T> {
+  T one = static_cast<T>(1.0f);
+  __device__ __forceinline__ T operator()(const T* args) const {
+    T dout = static_cast<T>(args[0]);
+    T out = static_cast<T>(args[1]);
+    return dout * (one - out * out);
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+};
+/********************Tanh End********************/
+
 /********************Reciprocal Begin********************/
 template <typename T>
 struct CudaReciprocalFunctor : public BaseCudaActiveFunctor<T> {
@@ -421,6 +444,26 @@ struct CudaReciprocalGradFunctor : public BaseCudaActiveFunctor<T> {
 };
 /********************Reciprocal End********************/
 
+/********************Exp Begin********************/
+template <typename T>
+struct CudaExpFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT x = static_cast<CT>(args[0]);
+    return T(exp(x));
+  }
+};
+
+template <typename T>
+struct CudaExpGradFunctor : public BaseCudaActiveFunctor<T> {
+  __device__ __forceinline__ T operator()(const T* args) const {
+    return args[0] * args[1];
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+};
+/********************Exp End********************/
+
 /********************Log1p Begin********************/
 template <typename T>
 struct CudaLog1pFunctor : public BaseCudaActiveFunctor<T> {
@@ -443,6 +486,26 @@ struct CudaLog1pGradFunctor : public BaseCudaActiveFunctor<T> {
 };
 /********************Log1p End********************/
 
+/********************Log Begin********************/
+template <typename T>
+struct CudaLogFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT x = static_cast<CT>(args[0]);
+    return T(log(x));
+  }
+};
+
+template <typename T>
+struct CudaLogGradFunctor : public BaseCudaActiveFunctor<T> {
+  __device__ __forceinline__ T operator()(const T* args) const {
+    return args[0] / args[1];
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+/********************Log End********************/
+
 /********************Log2 Begin********************/
 template <typename T>
 struct CudaLog2Functor : public BaseCudaActiveFunctor<T> {
@@ -776,8 +839,268 @@ struct CudaHardShrinkGradFunctor : public BaseCudaActiveFunctor<T> {
 };
 /********************HardShrink End********************/
 
+/********************HardSigmoid Begin********************/
+template <typename T>  // Forward hard-sigmoid: x * slope + offset, limited to the range [0, 1].
+struct CudaHardSigmoidFunctor : public BaseCudaActiveFunctor<T> {
+  T zero = static_cast<T>(0.0f);  // lower bound of the result
+  T one = static_cast<T>(1.0f);   // upper bound of the result
+  float slope;   // exposed by address under "slope" in GetAttrs()
+  float offset;  // exposed by address under "offset" in GetAttrs()
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {  // NOTE(review): most sibling functors spell this BaseCudaActiveFunctor<T>::AttrPair — confirm the two aliases agree
+    return {{"slope", &slope}, {"offset", &offset}};  // attribute name paired with the address of the member storing it
+  }
+
+  __device__ __forceinline__ T operator()(const T* args) const {
+    T temp = args[0] * static_cast<T>(slope) + static_cast<T>(offset);  // affine map of the input, computed in T
+    return (temp > zero && temp < one) ? temp : (temp <= zero ? zero : one);  // temp if strictly inside (0, 1); 0 if temp <= 0; otherwise 1
+  }
+};
+
+template <typename T>  // Backward hard-sigmoid: dout * slope where out is strictly inside (0, 1), else 0.
+struct CudaHardSigmoidGradFunctor : public BaseCudaActiveFunctor<T> {
+  T zero = static_cast<T>(0.0f);  // lower edge of the non-saturated range; also the saturated gradient
+  T one = static_cast<T>(1.0f);   // upper edge of the non-saturated range
+  float slope;   // exposed by address under "slope" in GetAttrs()
+  float offset;  // exposed via GetAttrs() but not read by operator()
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {  // NOTE(review): most sibling functors spell this BaseCudaActiveFunctor<T>::AttrPair — confirm the two aliases agree
+    return {{"slope", &slope}, {"offset", &offset}};  // attribute name paired with the address of the member storing it
+  }
+
+  __device__ __forceinline__ T operator()(const T* args) const {
+    T out = args[1];  // forward output (FwdDeps is kDepOut)
+    return (out > zero && out < one) ? args[0] * static_cast<T>(slope) : zero;  // args[0] = dout; zero gradient once the output sits at either bound
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }  // backward reads the forward output as args[1]
+};
+/********************HardSigmoid End********************/
+
+/********************Swish Begin********************/
+template <typename T>  // Forward swish: x / (1 + exp(-beta * x)).
+struct CudaSwishFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;  // compute type; presumably wider than T for float16 — confirm in fp16_type_traits.h
+  CT one = static_cast<CT>(1.0f);  // constant 1 in the compute type
+  float beta;  // exposed by address under "beta" in GetAttrs()
+
+  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
+    return {{"beta", &beta}};  // attribute name paired with the address of the member storing it
+  }
+
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT x = static_cast<CT>(args[0]);  // args[0] is the forward input x
+    CT b = static_cast<CT>(beta);  // beta promoted to the compute type
+    return T(x / (one + exp(-b * x)));  // whole expression is evaluated in CT, then converted back to T
+  }
+};
+
+template <typename T>  // Backward swish: dx = dout * (b * out + temp2), with out recomputed from x.
+struct CudaSwishGradFunctor : public BaseCudaActiveFunctor<T> {  // NOTE(review): temp2 below is written with b * x; the [PATCH 04/11] hunk later in this series changes it to b * out
+  using CT = typename details::MPTypeTrait<T>::Type;  // compute type; presumably wider than T for float16 — confirm in fp16_type_traits.h
+  CT one = static_cast<CT>(1.0f);  // constant 1 in the compute type
+  float beta;  // exposed by address under "beta" in GetAttrs()
+
+  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
+    return {{"beta", &beta}};  // attribute name paired with the address of the member storing it
+  }
+
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT dout = static_cast<CT>(args[0]);  // args[0] is the upstream gradient dout
+    CT x = static_cast<CT>(args[1]);  // args[1] is the forward input x (FwdDeps is kDepX); temp1 below is 1 / (1 + exp(-b * x)) and out is x * temp1
+    CT b = static_cast<CT>(beta);
+    CT temp1 = one / (one + exp(-b * x));
+    CT out = x * temp1;
+    CT temp2 = temp1 * (one - b * x);
+    return T(dout * (b * out + temp2));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }  // backward reads the forward input as args[1]
+};
+/********************Swish End********************/
+
+/********************ThresholdedRelu Begin********************/
+template <typename T>  // Forward thresholded relu: x where x > threshold, else 0.
+struct CudaThresholdedReluFunctor : public BaseCudaActiveFunctor<T> {
+  T zero = static_cast<T>(0.0f);  // value returned when the input does not exceed the threshold
+  float threshold;  // exposed by address under "threshold" in GetAttrs()
+
+  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};  // attribute name paired with the address of the member storing it
+  }
+
+  __device__ __forceinline__ T operator()(const T* args) const {
+    return args[0] > static_cast<T>(threshold) ? args[0] : zero;  // strict comparison, done in T; args[0] is the forward input x
+  }
+};
+
+template <typename T>  // Backward thresholded relu: dout where x > threshold, else 0.
+struct CudaThresholdedReluGradFunctor : public BaseCudaActiveFunctor<T> {
+  T zero = static_cast<T>(0.0f);  // gradient returned when x does not exceed the threshold
+  float threshold;  // exposed by address under "threshold" in GetAttrs()
+
+  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};  // attribute name paired with the address of the member storing it
+  }
+
+  __device__ __forceinline__ T operator()(const T* args) const {
+    return args[1] > static_cast<T>(threshold) ? args[0] : zero;  // args[0] = dout, args[1] = forward input x; same strict comparison as the forward functor
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }  // backward reads the forward input as args[1]
+};
+/********************ThresholdedRelu End********************/
+
+/********************HardSwish Begin********************/
+template <typename T>  // Forward hard-swish: x * limit(x + offset, 0, threshold) / scale.
+struct CudaHardSwishFunctor : public BaseCudaActiveFunctor<T> {
+  T zero = static_cast<T>(0.0f);  // lower limit applied to x + offset
+  float threshold;  // exposed by address under "threshold" in GetAttrs()
+  float scale;      // exposed by address under "scale" in GetAttrs()
+  float offset;     // exposed by address under "offset" in GetAttrs()
+
+  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}, {"scale", &scale}, {"offset", &offset}};  // attribute name paired with the address of the member storing it
+  }
+
+  __device__ __forceinline__ T operator()(const T* args) const {
+    T x = args[0];  // forward input
+    T t = static_cast<T>(threshold);  // upper limit, converted to T
+    T temp1 = x + static_cast<T>(offset);  // shifted input
+    T temp2 = (temp1 > zero && temp1 < t) ? temp1 : (temp1 <= zero ? zero : t);  // temp1 if strictly between 0 and t; 0 if temp1 <= 0; otherwise t
+    return temp2 * x / static_cast<T>(scale);  // multiply first, then divide; all arithmetic is in T
+  }
+};
+
+template <typename T>  // Backward hard-swish, built from two 0/1 indicators on x + offset.
+struct CudaHardSwishGradFunctor : public BaseCudaActiveFunctor<T> {
+  T zero = static_cast<T>(0.0f);  // constant 0 in T
+  T one = static_cast<T>(1.0f);   // constant 1 in T
+  T two = static_cast<T>(2.0f);   // constant 2 in T
+  float threshold;  // exposed by address under "threshold" in GetAttrs()
+  float scale;      // exposed by address under "scale" in GetAttrs()
+  float offset;     // exposed by address under "offset" in GetAttrs()
+
+  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}, {"scale", &scale}, {"offset", &offset}};  // attribute name paired with the address of the member storing it
+  }
+
+  __device__ __forceinline__ T operator()(const T* args) const {
+    T x = args[1];  // forward input (FwdDeps is kDepX)
+    T o = static_cast<T>(offset);
+    T s = static_cast<T>(scale);
+    T temp1 = static_cast<T>(x + o > zero);  // 1 when x + offset > 0, else 0
+    T temp2 = static_cast<T>(x + o < static_cast<T>(threshold));  // 1 when x + offset < threshold, else 0
+    return args[0] * (temp1 * temp2 * (two * x + o) / s + one - temp2);  // args[0] = dout; factor is (2x + o) / s when both indicators are 1, and 1 when temp2 is 0
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }  // backward reads the forward input as args[1]
+};
+/********************HardSwish End********************/
+
+/********************ELU Begin********************/
+template <typename T>  // Forward ELU: x when x >= 0, otherwise alpha * (exp(x) - 1).
+struct CudaELUFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;  // compute type; presumably wider than T for float16 — confirm in fp16_type_traits.h
+  CT zero = static_cast<CT>(0.0f);  // branch point, in the compute type
+  CT one = static_cast<CT>(1.0f);   // constant 1 in the compute type
+  float alpha;  // exposed by address under "alpha" in GetAttrs()
+
+  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};  // attribute name paired with the address of the member storing it
+  }
+
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT x = static_cast<CT>(args[0]);  // args[0] is the forward input x
+    return x >= zero ? args[0] : T(static_cast<CT>(alpha) * (exp(x) - one));  // non-negative branch returns the original T value untouched; the other branch is computed in CT
+  }
+};
+
+template <typename T>  // Backward ELU: dout when x >= 0, otherwise dout * alpha * exp(x).
+struct CudaELUGradFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;  // compute type; presumably wider than T for float16 — confirm in fp16_type_traits.h
+  CT zero = static_cast<CT>(0.0f);  // branch point, in the compute type
+  CT one = static_cast<CT>(1.0f);   // declared but not referenced by operator()
+  float alpha;  // exposed by address under "alpha" in GetAttrs()
+
+  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};  // attribute name paired with the address of the member storing it
+  }
+
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT dout = static_cast<CT>(args[0]);  // args[0] is the upstream gradient dout
+    CT x = static_cast<CT>(args[1]);  // args[1] is the forward input x (FwdDeps is kDepX)
+    return x >= zero ? args[0] : T(dout * static_cast<CT>(alpha) * exp(x));  // x == 0 takes the pass-through branch; NOTE(review): confirm this boundary matches the CPU functor
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }  // backward reads the forward input as args[1]
+};
+/********************ELU End********************/
+
+/********************Square Begin********************/
+template <typename T>  // Forward square: x * x.
+struct CudaSquareFunctor : public BaseCudaActiveFunctor<T> {
+  __device__ __forceinline__ T operator()(const T* args) const {
+    return args[0] * args[0];  // args[0] is the forward input x; product is formed directly in T
+  }
+};
+
+template <typename T>  // Backward square: dx = dout * 2 * x.
+struct CudaSquareGradFunctor : public BaseCudaActiveFunctor<T> {
+  T two = static_cast<T>(2.0f);  // constant 2 in T
+  __device__ __forceinline__ T operator()(const T* args) const {
+    return args[0] * two * args[1];  // args[0] = dout, args[1] = forward input x
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }  // backward reads the forward input as args[1]
+};
+/********************Square End********************/
+
+/********************Sqrt Begin********************/
+template <typename T>  // Forward sqrt: maps args[0] to sqrt(args[0]).
+struct CudaSqrtFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;  // compute type; presumably wider than T for float16 — confirm in fp16_type_traits.h
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT x = static_cast<CT>(args[0]);  // args[0] is the forward input x
+    return T(sqrt(x));  // sqrt is evaluated on the CT value, then converted back to T
+  }
+};
+
+template <typename T>  // Backward sqrt: dx = 0.5 * dout / out.
+struct CudaSqrtGradFunctor : public BaseCudaActiveFunctor<T> {
+  T one_half = static_cast<T>(0.5f);  // constant 0.5 in T
+  __device__ __forceinline__ T operator()(const T* args) const {
+    return one_half * args[0] / args[1];  // args[0] = dout, args[1] = forward output; evaluated left to right in T
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }  // backward reads the forward output as args[1]
+};
+/********************Sqrt End********************/
+
+/********************Rsqrt Begin********************/
+template <typename T>  // Forward rsqrt: maps args[0] to rsqrt(args[0]).
+struct CudaRsqrtFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;  // compute type; presumably wider than T for float16 — confirm in fp16_type_traits.h
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT x = static_cast<CT>(args[0]);  // args[0] is the forward input x
+    return T(rsqrt(x));  // rsqrt is evaluated on the CT value, then converted back to T
+  }
+};
+
+template <typename T>  // Backward rsqrt: dx = -0.5 * dout * out^3.
+struct CudaRsqrtGradFunctor : public BaseCudaActiveFunctor<T> {
+  T minus_one_half = static_cast<T>(-0.5f);  // constant -0.5 in T
+  __device__ __forceinline__ T operator()(const T* args) const {
+    T out = args[1];  // forward output (FwdDeps is kDepOut)
+    return minus_one_half * args[0] * out * out * out;  // args[0] = dout; the cube is formed by repeated multiplication in T
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }  // backward reads the forward output as args[1]
+};
+/********************Rsqrt End********************/
+
 template <typename DeviceContext, typename Functor>
-class ActivationGPUKernel
+class ActivationCudaKernel
     : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
  public:
   using T = typename Functor::ELEMENT_TYPE;
@@ -800,7 +1123,7 @@ class ActivationGPUKernel
 };
 
 template <typename DeviceContext, typename Functor>
-class ActivationGradGPUKernel
+class ActivationGradCudaKernel
     : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
  public:
   using T = typename Functor::ELEMENT_TYPE;
@@ -845,42 +1168,27 @@ class ActivationGradGPUKernel
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
-#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, op_name, functor,         \
-                                        grad_functor)                       \
-  REGISTER_OP_CUDA_KERNEL(                                                  \
-      act_type,                                                             \
-      ops::ActivationKernel<plat::CUDADeviceContext, ops::functor<float>>,  \
-      ops::ActivationKernel<plat::CUDADeviceContext, ops::functor<double>>, \
-      ops::ActivationKernel<plat::CUDADeviceContext,                        \
-                            ops::functor<plat::float16>>);                  \
-  REGISTER_OP_CUDA_KERNEL(                                                  \
-      act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext,   \
-                                                 ops::grad_functor<float>>, \
-      ops::ActivationGradKernel<plat::CUDADeviceContext,                    \
-                                ops::grad_functor<double>>,                 \
-      ops::ActivationGradKernel<plat::CUDADeviceContext,                    \
-                                ops::grad_functor<plat::float16>>);
-
-#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, op_name, functor,             \
-                                       grad_functor)                           \
+#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, op_name, functor,            \
+                                        grad_functor)                          \
   REGISTER_OP_CUDA_KERNEL(                                                     \
-      act_type, ops::ActivationGPUKernel<paddle::platform::CUDADeviceContext,  \
-                                         ops::functor<float>>,                 \
-      ops::ActivationGPUKernel<paddle::platform::CUDADeviceContext,            \
-                               ops::functor<double>>,                          \
-      ops::ActivationGPUKernel<plat::CUDADeviceContext,                        \
-                               ops::functor<plat::float16>>);                  \
+      act_type, ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, \
+                                          ops::functor<float>>,                \
+      ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,           \
+                                ops::functor<double>>,                         \
+      ops::ActivationCudaKernel<plat::CUDADeviceContext,                       \
+                                ops::functor<plat::float16>>);                 \
   REGISTER_OP_CUDA_KERNEL(                                                     \
-      act_type##_grad, ops::ActivationGradGPUKernel<plat::CUDADeviceContext,   \
-                                                    ops::grad_functor<float>>, \
-      ops::ActivationGradGPUKernel<plat::CUDADeviceContext,                    \
-                                   ops::grad_functor<double>>,                 \
-      ops::ActivationGradGPUKernel<plat::CUDADeviceContext,                    \
-                                   ops::grad_functor<plat::float16>>);
+      act_type##_grad,                                                         \
+      ops::ActivationGradCudaKernel<plat::CUDADeviceContext,                   \
+                                    ops::grad_functor<float>>,                 \
+      ops::ActivationGradCudaKernel<plat::CUDADeviceContext,                   \
+                                    ops::grad_functor<double>>,                \
+      ops::ActivationGradCudaKernel<plat::CUDADeviceContext,                   \
+                                    ops::grad_functor<plat::float16>>);
 
 /* ======================== leaky relu register  ============================ */
-REGISTER_ACTIVATION_GPU_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor,
-                               CudaLeakyReluGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor,
+                                CudaLeakyReluGradFunctor);
 
 REGISTER_OP_CUDA_KERNEL(
     leaky_relu_grad_grad,
@@ -893,7 +1201,7 @@ REGISTER_OP_CUDA_KERNEL(
 /* ========================================================================== */
 
 /* ======================== elu register  ============================ */
-REGISTER_ACTIVATION_CUDA_KERNEL(elu, ELU, ELUFunctor, ELUGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(elu, ELU, CudaELUFunctor, CudaELUGradFunctor);
 
 REGISTER_OP_CUDA_KERNEL(
     elu_grad_grad, ops::ELUDoubleGradKernel<plat::CUDADeviceContext,
@@ -905,8 +1213,8 @@ REGISTER_OP_CUDA_KERNEL(
 /* ========================================================================== */
 
 /* ===========================    relu register  ============================ */
-REGISTER_ACTIVATION_GPU_KERNEL(relu, Relu, CudaReluFunctor,
-                               CudaReluGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(relu, Relu, CudaReluFunctor,
+                                CudaReluGradFunctor);
 
 REGISTER_OP_CUDA_KERNEL(
     relu_grad_grad,
@@ -919,7 +1227,8 @@ REGISTER_OP_CUDA_KERNEL(
 /* ========================================================================== */
 
 /* ===========================    tanh register  ============================ */
-REGISTER_ACTIVATION_CUDA_KERNEL(tanh, Tanh, TanhFunctor, TanhGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(tanh, Tanh, CudaTanhFunctor,
+                                CudaTanhGradFunctor);
 
 REGISTER_OP_CUDA_KERNEL(
     tanh_grad_grad,
@@ -932,7 +1241,8 @@ REGISTER_OP_CUDA_KERNEL(
 /* ========================================================================== */
 
 /* ===========================   sqrt register  ============================= */
-REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, SqrtFunctor, SqrtGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, CudaSqrtFunctor,
+                                CudaSqrtGradFunctor);
 
 REGISTER_OP_CUDA_KERNEL(
     sqrt_grad_grad,
@@ -946,7 +1256,8 @@ REGISTER_OP_CUDA_KERNEL(
 
 /* ===========================   rsqrt register  =============================
  */
-REGISTER_ACTIVATION_CUDA_KERNEL(rsqrt, Rsqrt, RsqrtFunctor, RsqrtGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(rsqrt, Rsqrt, CudaRsqrtFunctor,
+                                CudaRsqrtGradFunctor);
 
 REGISTER_OP_CUDA_KERNEL(
     rsqrt_grad_grad,
@@ -960,24 +1271,28 @@ REGISTER_OP_CUDA_KERNEL(
 
 /* ===========================  square register  ============================ */
 REGISTER_OP_CUDA_KERNEL(
-    square,
-    ops::ActivationKernel<plat::CUDADeviceContext, ops::SquareFunctor<float>>,
-    ops::ActivationKernel<plat::CUDADeviceContext, ops::SquareFunctor<double>>,
-    ops::ActivationKernel<plat::CUDADeviceContext, ops::SquareFunctor<int>>,
-    ops::ActivationKernel<plat::CUDADeviceContext, ops::SquareFunctor<int64_t>>,
-    ops::ActivationKernel<plat::CUDADeviceContext,
-                          ops::SquareFunctor<plat::float16>>);
+    square, ops::ActivationCudaKernel<plat::CUDADeviceContext,
+                                      ops::CudaSquareFunctor<float>>,
+    ops::ActivationCudaKernel<plat::CUDADeviceContext,
+                              ops::CudaSquareFunctor<double>>,
+    ops::ActivationCudaKernel<plat::CUDADeviceContext,
+                              ops::CudaSquareFunctor<int>>,
+    ops::ActivationCudaKernel<plat::CUDADeviceContext,
+                              ops::CudaSquareFunctor<int64_t>>,
+    ops::ActivationCudaKernel<plat::CUDADeviceContext,
+                              ops::CudaSquareFunctor<plat::float16>>);
 REGISTER_OP_CUDA_KERNEL(
-    square_grad, ops::ActivationGradKernel<plat::CUDADeviceContext,
-                                           ops::SquareGradFunctor<float>>,
-    ops::ActivationGradKernel<plat::CUDADeviceContext,
-                              ops::SquareGradFunctor<double>>,
-    ops::ActivationGradKernel<plat::CUDADeviceContext,
-                              ops::SquareGradFunctor<int>>,
-    ops::ActivationGradKernel<plat::CUDADeviceContext,
-                              ops::SquareGradFunctor<int64_t>>,
-    ops::ActivationGradKernel<plat::CUDADeviceContext,
-                              ops::SquareGradFunctor<plat::float16>>);
+    square_grad,
+    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
+                                  ops::CudaSquareGradFunctor<float>>,
+    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
+                                  ops::CudaSquareGradFunctor<double>>,
+    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
+                                  ops::CudaSquareGradFunctor<int>>,
+    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
+                                  ops::CudaSquareGradFunctor<int64_t>>,
+    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
+                                  ops::CudaSquareGradFunctor<plat::float16>>);
 
 REGISTER_OP_CUDA_KERNEL(
     square_grad_grad,
@@ -1014,27 +1329,31 @@ REGISTER_OP_CUDA_KERNEL(
 /* ==========================   exp register  ============================ */
 
 REGISTER_OP_CUDA_KERNEL(
-    exp, ops::ActivationKernel<plat::CUDADeviceContext, ops::ExpFunctor<float>>,
-    ops::ActivationKernel<plat::CUDADeviceContext, ops::ExpFunctor<double>>,
-    ops::ActivationKernel<plat::CUDADeviceContext, ops::ExpFunctor<int>>,
-    ops::ActivationKernel<plat::CUDADeviceContext, ops::ExpFunctor<int64_t>>,
-    ops::ActivationKernel<plat::CUDADeviceContext,
-                          ops::ExpFunctor<plat::float16>>);
+    exp, ops::ActivationCudaKernel<plat::CUDADeviceContext,
+                                   ops::CudaExpFunctor<float>>,
+    ops::ActivationCudaKernel<plat::CUDADeviceContext,
+                              ops::CudaExpFunctor<double>>,
+    ops::ActivationCudaKernel<plat::CUDADeviceContext,
+                              ops::CudaExpFunctor<int>>,
+    ops::ActivationCudaKernel<plat::CUDADeviceContext,
+                              ops::CudaExpFunctor<int64_t>>,
+    ops::ActivationCudaKernel<plat::CUDADeviceContext,
+                              ops::CudaExpFunctor<plat::float16>>);
 REGISTER_OP_CUDA_KERNEL(
-    exp_grad, ops::ActivationGradKernel<plat::CUDADeviceContext,
-                                        ops::ExpGradFunctor<float>>,
-    ops::ActivationGradKernel<plat::CUDADeviceContext,
-                              ops::ExpGradFunctor<double>>,
-    ops::ActivationGradKernel<plat::CUDADeviceContext,
-                              ops::ExpGradFunctor<int>>,
-    ops::ActivationGradKernel<plat::CUDADeviceContext,
-                              ops::ExpGradFunctor<int64_t>>,
-    ops::ActivationGradKernel<plat::CUDADeviceContext,
-                              ops::ExpGradFunctor<plat::float16>>);
+    exp_grad, ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
+                                            ops::CudaExpGradFunctor<float>>,
+    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
+                                  ops::CudaExpGradFunctor<double>>,
+    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
+                                  ops::CudaExpGradFunctor<int>>,
+    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
+                                  ops::CudaExpGradFunctor<int64_t>>,
+    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
+                                  ops::CudaExpGradFunctor<plat::float16>>);
 /* ========================================================================== */
 
 /* ==========================  Log register ==================================*/
-REGISTER_ACTIVATION_CUDA_KERNEL(log, Log, LogFunctor, LogGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(log, Log, CudaLogFunctor, CudaLogGradFunctor);
 
 REGISTER_OP_CUDA_KERNEL(
     log_grad_grad, ops::LogDoubleGradKernel<plat::CUDADeviceContext,
@@ -1044,101 +1363,62 @@ REGISTER_OP_CUDA_KERNEL(
     ops::LogDoubleGradKernel<plat::CUDADeviceContext,
                              ops::LogGradGradFunctor<plat::float16>>);
 /* ========================================================================== */
-REGISTER_ACTIVATION_GPU_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor,
-                               CudaSigmoidGradFunctor);
-REGISTER_ACTIVATION_GPU_KERNEL(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor,
-                               CudaLogSigmoidGradFunctor);
-REGISTER_ACTIVATION_GPU_KERNEL(atan, Atan, CudaAtanFunctor,
-                               CudaAtanGradFunctor);
-REGISTER_ACTIVATION_GPU_KERNEL(softshrink, SoftShrink, CudaSoftShrinkFunctor,
-                               CudaSoftShrinkGradFunctor);
-REGISTER_ACTIVATION_GPU_KERNEL(ceil, Ceil, CudaCeilFunctor,
-                               CudaZeroGradFunctor);
-REGISTER_ACTIVATION_GPU_KERNEL(floor, Floor, CudaFloorFunctor,
-                               CudaZeroGradFunctor);
-REGISTER_ACTIVATION_GPU_KERNEL(cos, Cos, CudaCosFunctor, CudaCosGradFunctor);
-REGISTER_ACTIVATION_GPU_KERNEL(tan, Tan, CudaTanFunctor, CudaTanGradFunctor);
-REGISTER_ACTIVATION_GPU_KERNEL(acos, Acos, CudaAcosFunctor,
-                               CudaAcosGradFunctor);
-REGISTER_ACTIVATION_GPU_KERNEL(sin, Sin, CudaSinFunctor, CudaSinGradFunctor);
-REGISTER_ACTIVATION_GPU_KERNEL(asin, Asin, CudaAsinFunctor,
-                               CudaAsinGradFunctor);
-REGISTER_ACTIVATION_GPU_KERNEL(sinh, Sinh, CudaSinhFunctor,
-                               CudaSinhGradFunctor);
-REGISTER_ACTIVATION_GPU_KERNEL(cosh, Cosh, CudaCoshFunctor,
-                               CudaCoshGradFunctor);
-REGISTER_ACTIVATION_GPU_KERNEL(round, Round, CudaRoundFunctor,
-                               CudaZeroGradFunctor);
-REGISTER_ACTIVATION_GPU_KERNEL(reciprocal, Reciprocal, CudaReciprocalFunctor,
-                               CudaReciprocalGradFunctor);
-REGISTER_ACTIVATION_GPU_KERNEL(log1p, Log1p, CudaLog1pFunctor,
-                               CudaLog1pGradFunctor);
-REGISTER_ACTIVATION_GPU_KERNEL(log2, Log2, CudaLog2Functor,
-                               CudaLog2GradFunctor);
-REGISTER_ACTIVATION_GPU_KERNEL(log10, Log10, CudaLog10Functor,
-                               CudaLog10GradFunctor);
-REGISTER_ACTIVATION_GPU_KERNEL(brelu, BRelu, CudaBReluFunctor,
-                               CudaBReluGradFunctor);
-REGISTER_ACTIVATION_GPU_KERNEL(soft_relu, SoftRelu, CudaSoftReluFunctor,
-                               CudaSoftReluGradFunctor);
-REGISTER_ACTIVATION_GPU_KERNEL(stanh, STanh, CudaSTanhFunctor,
-                               CudaSTanhGradFunctor);
-REGISTER_ACTIVATION_GPU_KERNEL(softplus, Softplus, CudaSoftplusFunctor,
-                               CudaSoftplusGradFunctor);
-REGISTER_ACTIVATION_GPU_KERNEL(softsign, Softsign, CudaSoftsignFunctor,
-                               CudaSoftsignGradFunctor);
-REGISTER_ACTIVATION_GPU_KERNEL(relu6, Relu6, CudaRelu6Functor,
-                               CudaRelu6GradFunctor);
-REGISTER_ACTIVATION_GPU_KERNEL(tanh_shrink, TanhShrink, CudaTanhShrinkFunctor,
-                               CudaTanhShrinkGradFunctor);
-REGISTER_ACTIVATION_GPU_KERNEL(hard_shrink, HardShrink, CudaHardShrinkFunctor,
-                               CudaHardShrinkGradFunctor);
-// REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, SigmoidFunctor,
-// SigmoidGradFunctor);
-// REGISTER_ACTIVATION_CUDA_KERNEL(logsigmoid, LogSigmoid, LogSigmoidFunctor,
-// LogSigmoidGradFunctor);
-// REGISTER_ACTIVATION_CUDA_KERNEL(atan, Atan, AtanFunctor, AtanGradFunctor);
-// REGISTER_ACTIVATION_CUDA_KERNEL(softshrink, SoftShrink, SoftShrinkFunctor,
-// SoftShrinkGradFunctor);
-// REGISTER_ACTIVATION_CUDA_KERNEL(ceil, Ceil, CeilFunctor, ZeroGradFunctor);
-//  REGISTER_ACTIVATION_CUDA_KERNEL(floor, Floor, FloorFunctor,
-//  ZeroGradFunctor);
-// REGISTER_ACTIVATION_CUDA_KERNEL(cos, Cos, CosFunctor, CosGradFunctor);
-// REGISTER_ACTIVATION_CUDA_KERNEL(tan, Tan, TanFunctor, TanGradFunctor);
-// REGISTER_ACTIVATION_CUDA_KERNEL(acos, Acos, AcosFunctor, AcosGradFunctor);
-// REGISTER_ACTIVATION_CUDA_KERNEL(sin, Sin, SinFunctor, SinGradFunctor);
-// REGISTER_ACTIVATION_CUDA_KERNEL(asin, Asin, AsinFunctor, AsinGradFunctor);
-// REGISTER_ACTIVATION_CUDA_KERNEL(sinh, Sinh, SinhFunctor, SinhGradFunctor);
-// REGISTER_ACTIVATION_CUDA_KERNEL(cosh, Cosh, CoshFunctor, CoshGradFunctor);
-// REGISTER_ACTIVATION_CUDA_KERNEL(round, Round, RoundFunctor, ZeroGradFunctor);
-// REGISTER_ACTIVATION_CUDA_KERNEL(reciprocal, Reciprocal, ReciprocalFunctor,
-//                                ReciprocalGradFunctor);
-// REGISTER_ACTIVATION_CUDA_KERNEL(log1p, Log1p, Log1pFunctor,
-// Log1pGradFunctor);
-// REGISTER_ACTIVATION_CUDA_KERNEL(log2, Log2, Log2Functor, Log2GradFunctor);
-// REGISTER_ACTIVATION_CUDA_KERNEL(log10, Log10, Log10Functor,
-// Log10GradFunctor);
-// REGISTER_ACTIVATION_CUDA_KERNEL(brelu, BRelu, BReluFunctor,
-// BReluGradFunctor);
-// REGISTER_ACTIVATION_CUDA_KERNEL(soft_relu, SoftRelu, SoftReluFunctor,
-//                                SoftReluGradFunctor);
-// REGISTER_ACTIVATION_CUDA_KERNEL(stanh, STanh, STanhFunctor,
-// STanhGradFunctor);
-// REGISTER_ACTIVATION_CUDA_KERNEL(softplus, Softplus, SoftplusFunctor,
-//                                SoftplusGradFunctor);
-// REGISTER_ACTIVATION_CUDA_KERNEL(softsign, Softsign, SoftsignFunctor,
-//                                SoftsignGradFunctor);
-// REGISTER_ACTIVATION_CUDA_KERNEL(relu6, Relu6, Relu6Functor,
-// Relu6GradFunctor);
-// REGISTER_ACTIVATION_CUDA_KERNEL(tanh_shrink, TanhShrink, TanhShrinkFunctor,
-//                                TanhShrinkGradFunctor);
-// REGISTER_ACTIVATION_CUDA_KERNEL(hard_shrink, HardShrink, HardShrinkFunctor,
-//                                HardShrinkGradFunctor);
-REGISTER_ACTIVATION_CUDA_KERNEL(hard_sigmoid, HardSigmoid, HardSigmoidFunctor,
-                                HardSigmoidGradFunctor);
-REGISTER_ACTIVATION_CUDA_KERNEL(swish, Swish, SwishFunctor, SwishGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor,
+                                CudaSigmoidGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor,
+                                CudaLogSigmoidGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(atan, Atan, CudaAtanFunctor,
+                                CudaAtanGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(softshrink, SoftShrink, CudaSoftShrinkFunctor,
+                                CudaSoftShrinkGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(ceil, Ceil, CudaCeilFunctor,
+                                CudaZeroGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(floor, Floor, CudaFloorFunctor,
+                                CudaZeroGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(cos, Cos, CudaCosFunctor, CudaCosGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(tan, Tan, CudaTanFunctor, CudaTanGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(acos, Acos, CudaAcosFunctor,
+                                CudaAcosGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(sin, Sin, CudaSinFunctor, CudaSinGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(asin, Asin, CudaAsinFunctor,
+                                CudaAsinGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(sinh, Sinh, CudaSinhFunctor,
+                                CudaSinhGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(cosh, Cosh, CudaCoshFunctor,
+                                CudaCoshGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(round, Round, CudaRoundFunctor,
+                                CudaZeroGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(reciprocal, Reciprocal, CudaReciprocalFunctor,
+                                CudaReciprocalGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(log1p, Log1p, CudaLog1pFunctor,
+                                CudaLog1pGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(log2, Log2, CudaLog2Functor,
+                                CudaLog2GradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(log10, Log10, CudaLog10Functor,
+                                CudaLog10GradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(brelu, BRelu, CudaBReluFunctor,
+                                CudaBReluGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(soft_relu, SoftRelu, CudaSoftReluFunctor,
+                                CudaSoftReluGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(stanh, STanh, CudaSTanhFunctor,
+                                CudaSTanhGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(softplus, Softplus, CudaSoftplusFunctor,
+                                CudaSoftplusGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(softsign, Softsign, CudaSoftsignFunctor,
+                                CudaSoftsignGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(relu6, Relu6, CudaRelu6Functor,
+                                CudaRelu6GradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(tanh_shrink, TanhShrink, CudaTanhShrinkFunctor,
+                                CudaTanhShrinkGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(hard_shrink, HardShrink, CudaHardShrinkFunctor,
+                                CudaHardShrinkGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(hard_sigmoid, HardSigmoid,
+                                CudaHardSigmoidFunctor,
+                                CudaHardSigmoidGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(swish, Swish, CudaSwishFunctor,
+                                CudaSwishGradFunctor);
 REGISTER_ACTIVATION_CUDA_KERNEL(thresholded_relu, ThresholdedRelu,
-                                ThresholdedReluFunctor,
-                                ThresholdedReluGradFunctor);
-REGISTER_ACTIVATION_CUDA_KERNEL(hard_swish, HardSwish, HardSwishFunctor,
-                                HardSwishGradFunctor);
+                                CudaThresholdedReluFunctor,
+                                CudaThresholdedReluGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(hard_swish, HardSwish, CudaHardSwishFunctor,
+                                CudaHardSwishGradFunctor);

From a51d16f8176565284ba325d6c364db6f386b9822 Mon Sep 17 00:00:00 2001
From: ZzSean <18818272991@163.com>
Date: Fri, 23 Apr 2021 10:18:36 +0000
Subject: [PATCH 04/11] fix

---
 paddle/fluid/operators/activation_op.cu | 16 +++++++---------
 paddle/fluid/operators/activation_op.h  |  4 ++--
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index 84454f574c9c30..a412235aadf64c 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -623,8 +623,8 @@ struct CudaSoftReluGradFunctor : public BaseCudaActiveFunctor<T> {
     CT dout = static_cast<CT>(args[0]);
     CT out = static_cast<CT>(args[1]);
     CT t = static_cast<CT>(threshold);
-    return (out <= -t || out >= t) ? static_cast<T>(0.0f)
-                                   : T(dout * (one - exp(-out)));
+    return (out > -t && out < t) ? T(dout * (one - exp(-out)))
+                                 : static_cast<T>(0.0f);
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
@@ -804,6 +804,7 @@ struct CudaTanhShrinkGradFunctor : public BaseCudaActiveFunctor<T> {
 /********************HardShrink Begin********************/
 template <typename T>
 struct CudaHardShrinkFunctor : public BaseCudaActiveFunctor<T> {
+  T zero = static_cast<T>(0.0f);
   float threshold;
 
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
@@ -813,14 +814,13 @@ struct CudaHardShrinkFunctor : public BaseCudaActiveFunctor<T> {
   __device__ __forceinline__ T operator()(const T* args) const {
     T x = args[0];
     T t = static_cast<T>(threshold);
-    T temp1 = static_cast<T>(x > t);
-    T temp2 = static_cast<T>(x < -t);
-    return x * (temp1 + temp2);
+    return (x > -t && x < t) ? zero : x;
   }
 };
 
 template <typename T>
 struct CudaHardShrinkGradFunctor : public BaseCudaActiveFunctor<T> {
+  T zero = static_cast<T>(0.0f);
   float threshold;
 
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
@@ -830,9 +830,7 @@ struct CudaHardShrinkGradFunctor : public BaseCudaActiveFunctor<T> {
   __device__ __forceinline__ T operator()(const T* args) const {
     T x = args[1];
     T t = static_cast<T>(threshold);
-    T temp1 = static_cast<T>(x > t);
-    T temp2 = static_cast<T>(x < -t);
-    return args[0] * (temp1 + temp2);
+    return (x > -t && x < t) ? zero : args[0];
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
@@ -911,7 +909,7 @@ struct CudaSwishGradFunctor : public BaseCudaActiveFunctor<T> {
     CT b = static_cast<CT>(beta);
     CT temp1 = one / (one + exp(-b * x));
     CT out = x * temp1;
-    CT temp2 = temp1 * (one - b * x);
+    CT temp2 = temp1 * (one - b * out);
     return T(dout * (b * out + temp2));
   }
 
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index 7245dea9cf9499..ccd5bf528ba58c 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -455,7 +455,7 @@ struct HardShrinkFunctor : public BaseActivationFunctor<T> {
   void operator()(Device d, X x, Out out) const {
     auto temp1 = x < static_cast<T>(threshold * -1.f);
     auto temp2 = x > static_cast<T>(threshold);
-    out.device(d) = x * (temp1 + temp2).template cast<T>();
+    out.device(d) = x * (temp1 || temp2).template cast<T>();
   }
 };
 
@@ -472,7 +472,7 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor<T> {
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     auto temp1 = x < static_cast<T>(threshold * -1.f);
     auto temp2 = x > static_cast<T>(threshold);
-    dx.device(d) = dout * (temp1 + temp2).template cast<T>();
+    dx.device(d) = dout * (temp1 || temp2).template cast<T>();
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }

From c17f3aa317aba5aed677301e7aacd0b9bd9be074 Mon Sep 17 00:00:00 2001
From: ZzSean <18818272991@163.com>
Date: Sun, 25 Apr 2021 06:50:37 +0000
Subject: [PATCH 05/11] fix

---
 paddle/fluid/operators/activation_op.cu | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index a412235aadf64c..3aaa14fd3fcbe9 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -1331,10 +1331,8 @@ REGISTER_OP_CUDA_KERNEL(
                                    ops::CudaExpFunctor<float>>,
     ops::ActivationCudaKernel<plat::CUDADeviceContext,
                               ops::CudaExpFunctor<double>>,
-    ops::ActivationCudaKernel<plat::CUDADeviceContext,
-                              ops::CudaExpFunctor<int>>,
-    ops::ActivationCudaKernel<plat::CUDADeviceContext,
-                              ops::CudaExpFunctor<int64_t>>,
+    ops::ActivationKernel<plat::CUDADeviceContext, ops::ExpFunctor<int>>,
+    ops::ActivationKernel<plat::CUDADeviceContext, ops::ExpFunctor<int64_t>>,
     ops::ActivationCudaKernel<plat::CUDADeviceContext,
                               ops::CudaExpFunctor<plat::float16>>);
 REGISTER_OP_CUDA_KERNEL(
@@ -1361,6 +1359,7 @@ REGISTER_OP_CUDA_KERNEL(
     ops::LogDoubleGradKernel<plat::CUDADeviceContext,
                              ops::LogGradGradFunctor<plat::float16>>);
 /* ========================================================================== */
+
 REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor,
                                 CudaSigmoidGradFunctor);
 REGISTER_ACTIVATION_CUDA_KERNEL(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor,

From 88d29139a77fedb606a7cb608a712858069afe79 Mon Sep 17 00:00:00 2001
From: ZzSean <18818272991@163.com>
Date: Sun, 25 Apr 2021 07:31:24 +0000
Subject: [PATCH 06/11] add silu

---
 paddle/fluid/operators/activation_op.cu | 32 +++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index 3aaa14fd3fcbe9..ab0ec790ccc88e 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -107,6 +107,36 @@ struct CudaSigmoidGradFunctor : public BaseCudaActiveFunctor<T> {
 };
 /********************Sigmoid End********************/
 
+/********************Silu Begin********************/
+template <typename T>
+struct CudaSiluFunctor : public BaseCudaActiveFunctor<T> {
+  // silu(x) = x / (1 + e^-x), evaluated in CT (the compute type for T).
+  using CT = typename details::MPTypeTrait<T>::Type;
+  CT one = static_cast<CT>(1.0f);
+  __device__ __forceinline__ T operator()(const T* args) const {
+    const CT input = static_cast<CT>(args[0]);  // forward input x
+    const CT denom = one + exp(-input);
+    return T(input / denom);
+  }
+};
+
+template <typename T>
+struct CudaSiluGradFunctor : public BaseCudaActiveFunctor<T> {
+  using CT = typename details::MPTypeTrait<T>::Type;
+  CT one = static_cast<CT>(1.0f);
+  // dx = dout * s * (1 + x * (1 - s)) with s = 1 / (1 + e^-x).
+  // Using s avoids inf / inf = NaN when e^-x overflows for very negative x.
+  __device__ __forceinline__ T operator()(const T* args) const {
+    CT dout = static_cast<CT>(args[0]);
+    CT x = static_cast<CT>(args[1]);
+    CT sig = one / (one + exp(-x));
+    return T(dout * (sig * (one + x * (one - sig))));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+/********************Silu End********************/
+
 /********************LogSigmoid Begin********************/
 template <typename T>
 struct CudaLogSigmoidFunctor : public BaseCudaActiveFunctor<T> {
@@ -1362,6 +1392,8 @@ REGISTER_OP_CUDA_KERNEL(
 
 REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor,
                                 CudaSigmoidGradFunctor);
+REGISTER_ACTIVATION_CUDA_KERNEL(silu, Silu, CudaSiluFunctor,
+                                CudaSiluGradFunctor);
 REGISTER_ACTIVATION_CUDA_KERNEL(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor,
                                 CudaLogSigmoidGradFunctor);
 REGISTER_ACTIVATION_CUDA_KERNEL(atan, Atan, CudaAtanFunctor,

From 95aad4b857e187059bad526eadf6ed5a8b237c2b Mon Sep 17 00:00:00 2001
From: ZzSean <18818272991@163.com>
Date: Sun, 25 Apr 2021 08:01:29 +0000
Subject: [PATCH 07/11] revert swish and softrelu

---
 paddle/fluid/operators/activation_op.cu | 38 ++++++++++++++++++++++---
 1 file changed, 34 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index ab0ec790ccc88e..4ddb777cdddeb3 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -1390,6 +1390,40 @@ REGISTER_OP_CUDA_KERNEL(
                              ops::LogGradGradFunctor<plat::float16>>);
 /* ========================================================================== */
 
+/* ==========================   softrelu register  ============================
+ */
+REGISTER_OP_CUDA_KERNEL(
+    soft_relu,
+    ops::ActivationKernel<plat::CUDADeviceContext, ops::SoftReluFunctor<float>>,
+    ops::ActivationKernel<plat::CUDADeviceContext,
+                          ops::SoftReluFunctor<double>>,
+    ops::ActivationKernel<plat::CUDADeviceContext,
+                          ops::SoftReluFunctor<plat::float16>>);
+REGISTER_OP_CUDA_KERNEL(
+    soft_relu_grad, ops::ActivationGradKernel<plat::CUDADeviceContext,
+                                              ops::SoftReluGradFunctor<float>>,
+    ops::ActivationGradKernel<plat::CUDADeviceContext,
+                              ops::SoftReluGradFunctor<double>>,
+    ops::ActivationGradKernel<plat::CUDADeviceContext,
+                              ops::SoftReluGradFunctor<plat::float16>>);
+/* ========================================================================== */
+
+/* ==========================   swish register  ============================ */
+REGISTER_OP_CUDA_KERNEL(
+    swish,
+    ops::ActivationKernel<plat::CUDADeviceContext, ops::SwishFunctor<float>>,
+    ops::ActivationKernel<plat::CUDADeviceContext, ops::SwishFunctor<double>>,
+    ops::ActivationKernel<plat::CUDADeviceContext,
+                          ops::SwishFunctor<plat::float16>>);
+REGISTER_OP_CUDA_KERNEL(
+    swish_grad, ops::ActivationGradKernel<plat::CUDADeviceContext,
+                                          ops::SwishGradFunctor<float>>,
+    ops::ActivationGradKernel<plat::CUDADeviceContext,
+                              ops::SwishGradFunctor<double>>,
+    ops::ActivationGradKernel<plat::CUDADeviceContext,
+                              ops::SwishGradFunctor<plat::float16>>);
+/* ========================================================================== */
+
 REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor,
                                 CudaSigmoidGradFunctor);
 REGISTER_ACTIVATION_CUDA_KERNEL(silu, Silu, CudaSiluFunctor,
@@ -1427,8 +1461,6 @@ REGISTER_ACTIVATION_CUDA_KERNEL(log10, Log10, CudaLog10Functor,
                                 CudaLog10GradFunctor);
 REGISTER_ACTIVATION_CUDA_KERNEL(brelu, BRelu, CudaBReluFunctor,
                                 CudaBReluGradFunctor);
-REGISTER_ACTIVATION_CUDA_KERNEL(soft_relu, SoftRelu, CudaSoftReluFunctor,
-                                CudaSoftReluGradFunctor);
 REGISTER_ACTIVATION_CUDA_KERNEL(stanh, STanh, CudaSTanhFunctor,
                                 CudaSTanhGradFunctor);
 REGISTER_ACTIVATION_CUDA_KERNEL(softplus, Softplus, CudaSoftplusFunctor,
@@ -1444,8 +1476,6 @@ REGISTER_ACTIVATION_CUDA_KERNEL(hard_shrink, HardShrink, CudaHardShrinkFunctor,
 REGISTER_ACTIVATION_CUDA_KERNEL(hard_sigmoid, HardSigmoid,
                                 CudaHardSigmoidFunctor,
                                 CudaHardSigmoidGradFunctor);
-REGISTER_ACTIVATION_CUDA_KERNEL(swish, Swish, CudaSwishFunctor,
-                                CudaSwishGradFunctor);
 REGISTER_ACTIVATION_CUDA_KERNEL(thresholded_relu, ThresholdedRelu,
                                 CudaThresholdedReluFunctor,
                                 CudaThresholdedReluGradFunctor);

From 0d09b3efa0171cde81a2b9879c7c82b545ba0df0 Mon Sep 17 00:00:00 2001
From: ZzSean <18818272991@163.com>
Date: Sun, 25 Apr 2021 08:12:42 +0000
Subject: [PATCH 08/11] fix

---
 paddle/fluid/operators/activation_op.cu | 615 ++----------------------
 1 file changed, 41 insertions(+), 574 deletions(-)

diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index 4ddb777cdddeb3..81ddfe71ddd380 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -494,28 +494,6 @@ struct CudaExpGradFunctor : public BaseCudaActiveFunctor<T> {
 };
 /********************Exp End********************/
 
-/********************Log1p Begin********************/
-template <typename T>
-struct CudaLog1pFunctor : public BaseCudaActiveFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
-  CT one = static_cast<CT>(1.0f);
-  __device__ __forceinline__ T operator()(const T* args) const {
-    CT x = static_cast<CT>(args[0]);
-    return T(log(one + x));
-  }
-};
-
-template <typename T>
-struct CudaLog1pGradFunctor : public BaseCudaActiveFunctor<T> {
-  T one = static_cast<T>(1.0f);
-  __device__ __forceinline__ T operator()(const T* args) const {
-    return args[0] / (one + args[1]);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-/********************Log1p End********************/
-
 /********************Log Begin********************/
 template <typename T>
 struct CudaLogFunctor : public BaseCudaActiveFunctor<T> {
@@ -536,496 +514,6 @@ struct CudaLogGradFunctor : public BaseCudaActiveFunctor<T> {
 };
 /********************Log End********************/
 
-/********************Log2 Begin********************/
-template <typename T>
-struct CudaLog2Functor : public BaseCudaActiveFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
-  __device__ __forceinline__ T operator()(const T* args) const {
-    CT x = static_cast<CT>(args[0]);
-    return T(log2(x));
-  }
-};
-
-template <typename T>
-struct CudaLog2GradFunctor : public BaseCudaActiveFunctor<T> {
-  T log_two = static_cast<T>(log(2));
-  __device__ __forceinline__ T operator()(const T* args) const {
-    return args[0] / (args[1] * log_two);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-/********************Log2 End********************/
-
-/********************Log10 Begin********************/
-template <typename T>
-struct CudaLog10Functor : public BaseCudaActiveFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
-  __device__ __forceinline__ T operator()(const T* args) const {
-    CT x = static_cast<CT>(args[0]);
-    return T(log10(x));
-  }
-};
-
-template <typename T>
-struct CudaLog10GradFunctor : public BaseCudaActiveFunctor<T> {
-  T log_ten = static_cast<T>(log(10));
-  __device__ __forceinline__ T operator()(const T* args) const {
-    return args[0] / (args[1] * log_ten);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-/********************Log10 End********************/
-
-/********************BRelu Begin********************/
-template <typename T>
-struct CudaBReluFunctor : public BaseCudaActiveFunctor<T> {
-  float t_min;
-  float t_max;
-
-  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
-    return {{"t_min", &t_min}, {"t_max", &t_max}};
-  }
-
-  __device__ __forceinline__ T operator()(const T* args) const {
-    T x = args[0];
-    T t_min_cast = static_cast<T>(t_min);
-    T t_max_cast = static_cast<T>(t_max);
-    return (x > t_min_cast && x < t_max_cast)
-               ? x
-               : (x <= t_min_cast ? t_min_cast : t_max_cast);
-  }
-};
-
-template <typename T>
-struct CudaBReluGradFunctor : public BaseCudaActiveFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  float t_min;
-  float t_max;
-
-  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
-    return {{"t_min", &t_min}, {"t_max", &t_max}};
-  }
-
-  __device__ __forceinline__ T operator()(const T* args) const {
-    T dout = args[0];
-    T x = args[1];
-    T t_min_cast = static_cast<T>(t_min);
-    T t_max_cast = static_cast<T>(t_max);
-    return (x <= t_min_cast || x >= t_max_cast) ? zero : dout;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-/********************BRelu End********************/
-
-/********************SoftRelu Begin********************/
-template <typename T>
-struct CudaSoftReluFunctor : public BaseCudaActiveFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
-  CT one = static_cast<CT>(1.0f);
-  float threshold;
-
-  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  __device__ __forceinline__ T operator()(const T* args) const {
-    CT x = static_cast<CT>(args[0]);
-    CT t = static_cast<CT>(threshold);
-    CT temp = (x > -t && x < t) ? x : (x <= -t ? -t : t);
-    return T(log(one + exp(temp)));
-  }
-};
-
-template <typename T>
-struct CudaSoftReluGradFunctor : public BaseCudaActiveFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
-  CT one = static_cast<CT>(1.0f);
-  float threshold;
-
-  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  __device__ __forceinline__ T operator()(const T* args) const {
-    CT dout = static_cast<CT>(args[0]);
-    CT out = static_cast<CT>(args[1]);
-    CT t = static_cast<CT>(threshold);
-    return (out > -t && out < t) ? T(dout * (one - exp(-out)))
-                                 : static_cast<T>(0.0f);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
-};
-/********************SoftRelu End********************/
-
-/********************STanh Begin********************/
-template <typename T>
-struct CudaSTanhFunctor : public BaseCudaActiveFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
-  float scale_a;
-  float scale_b;
-
-  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
-    return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
-  }
-
-  __device__ __forceinline__ T operator()(const T* args) const {
-    CT x = static_cast<CT>(args[0]);
-    CT a = static_cast<CT>(scale_a);
-    CT b = static_cast<CT>(scale_b);
-    return T(b * tanh(a * x));
-  }
-};
-
-template <typename T>
-struct CudaSTanhGradFunctor : public BaseCudaActiveFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
-  CT one = static_cast<CT>(1.0f);
-  float scale_a;
-  float scale_b;
-
-  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
-    return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
-  }
-
-  __device__ __forceinline__ T operator()(const T* args) const {
-    CT dout = static_cast<CT>(args[0]);
-    CT x = static_cast<CT>(args[1]);
-    CT a = static_cast<CT>(scale_a);
-    CT b = static_cast<CT>(scale_b);
-    CT temp = tanh(a * x) * tanh(a * x);
-    return T(dout * a * b * (one - temp));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-/********************STanh End********************/
-
-/********************Softplus Begin********************/
-template <typename T>
-struct CudaSoftplusFunctor : public BaseCudaActiveFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
-  CT one = static_cast<CT>(1.0f);
-  float beta;
-  float threshold;
-
-  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
-    return {{"beta", &beta}, {"threshold", &threshold}};
-  }
-
-  __device__ __forceinline__ T operator()(const T* args) const {
-    CT x = static_cast<CT>(args[0]);
-    CT b = static_cast<CT>(beta);
-    CT t = static_cast<CT>(threshold);
-    CT x_beta = x * beta;
-    return T(x_beta > t ? x : log(one + exp(x_beta)) / b);
-  }
-};
-
-template <typename T>
-struct CudaSoftplusGradFunctor : public BaseCudaActiveFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
-  CT one = static_cast<CT>(1.0f);
-  float beta;
-  float threshold;
-
-  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
-    return {{"beta", &beta}, {"threshold", &threshold}};
-  }
-
-  __device__ __forceinline__ T operator()(const T* args) const {
-    CT dout = static_cast<CT>(args[0]);
-    CT x = static_cast<CT>(args[1]);
-    CT b = static_cast<CT>(beta);
-    CT t = static_cast<CT>(threshold);
-    CT x_beta = x * beta;
-    return x_beta > t ? args[0] : T(dout / (one + exp(-x_beta)));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-/********************Softplus End********************/
-
-/********************Softsign Begin********************/
-template <typename T>
-struct CudaSoftsignFunctor : public BaseCudaActiveFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
-  CT one = static_cast<CT>(1.0f);
-  __device__ __forceinline__ T operator()(const T* args) const {
-    CT x = static_cast<CT>(args[0]);
-    return T(x / (one + abs(x)));
-  }
-};
-
-template <typename T>
-struct CudaSoftsignGradFunctor : public BaseCudaActiveFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
-  CT one = static_cast<CT>(1.0f);
-  __device__ __forceinline__ T operator()(const T* args) const {
-    CT dout = static_cast<CT>(args[0]);
-    CT x = static_cast<CT>(args[1]);
-    return T(dout / ((one + abs(x)) * (one + abs(x))));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-/********************Softsign End********************/
-
-/********************Relu6 Begin********************/
-template <typename T>
-struct CudaRelu6Functor : public BaseCudaActiveFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  float threshold;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  __device__ __forceinline__ T operator()(const T* args) const {
-    T t = static_cast<T>(threshold);
-    return args[0] <= zero ? zero : (args[0] < t ? args[0] : t);
-  }
-};
-
-template <typename T>
-struct CudaRelu6GradFunctor : public BaseCudaActiveFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  float threshold;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  __device__ __forceinline__ T operator()(const T* args) const {
-    T t = static_cast<T>(threshold);
-    return (args[1] > zero && args[1] < t) ? args[0] : zero;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
-};
-/********************Relu6 End********************/
-
-/********************TanhShrink Begin********************/
-template <typename T>
-struct CudaTanhShrinkFunctor : public BaseCudaActiveFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
-  __device__ __forceinline__ T operator()(const T* args) const {
-    CT x = static_cast<CT>(args[0]);
-    return T(x - tanh(x));
-  }
-};
-
-template <typename T>
-struct CudaTanhShrinkGradFunctor : public BaseCudaActiveFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
-  __device__ __forceinline__ T operator()(const T* args) const {
-    CT dout = static_cast<CT>(args[0]);
-    CT x = static_cast<CT>(args[1]);
-    return T(dout * tanh(x) * tanh(x));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-/********************TanhShrink End********************/
-
-/********************HardShrink Begin********************/
-template <typename T>
-struct CudaHardShrinkFunctor : public BaseCudaActiveFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  float threshold;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  __device__ __forceinline__ T operator()(const T* args) const {
-    T x = args[0];
-    T t = static_cast<T>(threshold);
-    return (x > -t && x < t) ? zero : x;
-  }
-};
-
-template <typename T>
-struct CudaHardShrinkGradFunctor : public BaseCudaActiveFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  float threshold;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  __device__ __forceinline__ T operator()(const T* args) const {
-    T x = args[1];
-    T t = static_cast<T>(threshold);
-    return (x > -t && x < t) ? zero : args[0];
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-/********************HardShrink End********************/
-
-/********************HardSigmoid Begin********************/
-template <typename T>
-struct CudaHardSigmoidFunctor : public BaseCudaActiveFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  T one = static_cast<T>(1.0f);
-  float slope;
-  float offset;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"slope", &slope}, {"offset", &offset}};
-  }
-
-  __device__ __forceinline__ T operator()(const T* args) const {
-    T temp = args[0] * static_cast<T>(slope) + static_cast<T>(offset);
-    return (temp > zero && temp < one) ? temp : (temp <= zero ? zero : one);
-  }
-};
-
-template <typename T>
-struct CudaHardSigmoidGradFunctor : public BaseCudaActiveFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  T one = static_cast<T>(1.0f);
-  float slope;
-  float offset;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"slope", &slope}, {"offset", &offset}};
-  }
-
-  __device__ __forceinline__ T operator()(const T* args) const {
-    T out = args[1];
-    return (out > zero && out < one) ? args[0] * static_cast<T>(slope) : zero;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
-};
-/********************HardSigmoid End********************/
-
-/********************Swish Begin********************/
-template <typename T>
-struct CudaSwishFunctor : public BaseCudaActiveFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
-  CT one = static_cast<CT>(1.0f);
-  float beta;
-
-  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
-    return {{"beta", &beta}};
-  }
-
-  __device__ __forceinline__ T operator()(const T* args) const {
-    CT x = static_cast<CT>(args[0]);
-    CT b = static_cast<CT>(beta);
-    return T(x / (one + exp(-b * x)));
-  }
-};
-
-template <typename T>
-struct CudaSwishGradFunctor : public BaseCudaActiveFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
-  CT one = static_cast<CT>(1.0f);
-  float beta;
-
-  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
-    return {{"beta", &beta}};
-  }
-
-  __device__ __forceinline__ T operator()(const T* args) const {
-    CT dout = static_cast<CT>(args[0]);
-    CT x = static_cast<CT>(args[1]);
-    CT b = static_cast<CT>(beta);
-    CT temp1 = one / (one + exp(-b * x));
-    CT out = x * temp1;
-    CT temp2 = temp1 * (one - b * out);
-    return T(dout * (b * out + temp2));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-/********************Swish End********************/
-
-/********************ThresholdedRelu Begin********************/
-template <typename T>
-struct CudaThresholdedReluFunctor : public BaseCudaActiveFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  float threshold;
-
-  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  __device__ __forceinline__ T operator()(const T* args) const {
-    return args[0] > static_cast<T>(threshold) ? args[0] : zero;
-  }
-};
-
-template <typename T>
-struct CudaThresholdedReluGradFunctor : public BaseCudaActiveFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  float threshold;
-
-  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  __device__ __forceinline__ T operator()(const T* args) const {
-    return args[1] > static_cast<T>(threshold) ? args[0] : zero;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-/********************ThresholdedRelu End********************/
-
-/********************HardSwish Begin********************/
-template <typename T>
-struct CudaHardSwishFunctor : public BaseCudaActiveFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  float threshold;
-  float scale;
-  float offset;
-
-  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}, {"scale", &scale}, {"offset", &offset}};
-  }
-
-  __device__ __forceinline__ T operator()(const T* args) const {
-    T x = args[0];
-    T t = static_cast<T>(threshold);
-    T temp1 = x + static_cast<T>(offset);
-    T temp2 = (temp1 > zero && temp1 < t) ? temp1 : (temp1 <= zero ? zero : t);
-    return temp2 * x / static_cast<T>(scale);
-  }
-};
-
-template <typename T>
-struct CudaHardSwishGradFunctor : public BaseCudaActiveFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  T one = static_cast<T>(1.0f);
-  T two = static_cast<T>(2.0f);
-  float threshold;
-  float scale;
-  float offset;
-
-  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}, {"scale", &scale}, {"offset", &offset}};
-  }
-
-  __device__ __forceinline__ T operator()(const T* args) const {
-    T x = args[1];
-    T o = static_cast<T>(offset);
-    T s = static_cast<T>(scale);
-    T temp1 = static_cast<T>(x + o > zero);
-    T temp2 = static_cast<T>(x + o < static_cast<T>(threshold));
-    return args[0] * (temp1 * temp2 * (two * x + o) / s + one - temp2);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-/********************HardSwish End********************/
-
 /********************ELU Begin********************/
 template <typename T>
 struct CudaELUFunctor : public BaseCudaActiveFunctor<T> {
@@ -1196,6 +684,23 @@ class ActivationGradCudaKernel
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
+#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, op_name, functor,          \
+                                       grad_functor)                        \
+  REGISTER_OP_CUDA_KERNEL(                                                  \
+      act_type, ops::ActivationKernel<paddle::platform::CUDADeviceContext,  \
+                                      ops::functor<float>>,                 \
+      ops::ActivationKernel<paddle::platform::CUDADeviceContext,            \
+                            ops::functor<double>>,                          \
+      ops::ActivationKernel<plat::CUDADeviceContext,                        \
+                            ops::functor<plat::float16>>);                  \
+  REGISTER_OP_CUDA_KERNEL(                                                  \
+      act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext,   \
+                                                 ops::grad_functor<float>>, \
+      ops::ActivationGradKernel<plat::CUDADeviceContext,                    \
+                                ops::grad_functor<double>>,                 \
+      ops::ActivationGradKernel<plat::CUDADeviceContext,                    \
+                                ops::grad_functor<plat::float16>>);
+
 #define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, op_name, functor,            \
                                         grad_functor)                          \
   REGISTER_OP_CUDA_KERNEL(                                                     \
@@ -1390,40 +895,6 @@ REGISTER_OP_CUDA_KERNEL(
                              ops::LogGradGradFunctor<plat::float16>>);
 /* ========================================================================== */
 
-/* ==========================   softrelu register  ============================
- */
-REGISTER_OP_CUDA_KERNEL(
-    soft_relu,
-    ops::ActivationKernel<plat::CUDADeviceContext, ops::SoftReluFunctor<float>>,
-    ops::ActivationKernel<plat::CUDADeviceContext,
-                          ops::SoftReluFunctor<double>>,
-    ops::ActivationKernel<plat::CUDADeviceContext,
-                          ops::SoftReluFunctor<plat::float16>>);
-REGISTER_OP_CUDA_KERNEL(
-    soft_relu_grad, ops::ActivationGradKernel<plat::CUDADeviceContext,
-                                              ops::SoftReluGradFunctor<float>>,
-    ops::ActivationGradKernel<plat::CUDADeviceContext,
-                              ops::SoftReluGradFunctor<double>>,
-    ops::ActivationGradKernel<plat::CUDADeviceContext,
-                              ops::SoftReluGradFunctor<plat::float16>>);
-/* ========================================================================== */
-
-/* ==========================   swish register  ============================ */
-REGISTER_OP_CUDA_KERNEL(
-    swish,
-    ops::ActivationKernel<plat::CUDADeviceContext, ops::SwishFunctor<float>>,
-    ops::ActivationKernel<plat::CUDADeviceContext, ops::SwishFunctor<double>>,
-    ops::ActivationKernel<plat::CUDADeviceContext,
-                          ops::SwishFunctor<plat::float16>>);
-REGISTER_OP_CUDA_KERNEL(
-    swish_grad, ops::ActivationGradKernel<plat::CUDADeviceContext,
-                                          ops::SwishGradFunctor<float>>,
-    ops::ActivationGradKernel<plat::CUDADeviceContext,
-                              ops::SwishGradFunctor<double>>,
-    ops::ActivationGradKernel<plat::CUDADeviceContext,
-                              ops::SwishGradFunctor<plat::float16>>);
-/* ========================================================================== */
-
 REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor,
                                 CudaSigmoidGradFunctor);
 REGISTER_ACTIVATION_CUDA_KERNEL(silu, Silu, CudaSiluFunctor,
@@ -1453,31 +924,27 @@ REGISTER_ACTIVATION_CUDA_KERNEL(round, Round, CudaRoundFunctor,
                                 CudaZeroGradFunctor);
 REGISTER_ACTIVATION_CUDA_KERNEL(reciprocal, Reciprocal, CudaReciprocalFunctor,
                                 CudaReciprocalGradFunctor);
-REGISTER_ACTIVATION_CUDA_KERNEL(log1p, Log1p, CudaLog1pFunctor,
-                                CudaLog1pGradFunctor);
-REGISTER_ACTIVATION_CUDA_KERNEL(log2, Log2, CudaLog2Functor,
-                                CudaLog2GradFunctor);
-REGISTER_ACTIVATION_CUDA_KERNEL(log10, Log10, CudaLog10Functor,
-                                CudaLog10GradFunctor);
-REGISTER_ACTIVATION_CUDA_KERNEL(brelu, BRelu, CudaBReluFunctor,
-                                CudaBReluGradFunctor);
-REGISTER_ACTIVATION_CUDA_KERNEL(stanh, STanh, CudaSTanhFunctor,
-                                CudaSTanhGradFunctor);
-REGISTER_ACTIVATION_CUDA_KERNEL(softplus, Softplus, CudaSoftplusFunctor,
-                                CudaSoftplusGradFunctor);
-REGISTER_ACTIVATION_CUDA_KERNEL(softsign, Softsign, CudaSoftsignFunctor,
-                                CudaSoftsignGradFunctor);
-REGISTER_ACTIVATION_CUDA_KERNEL(relu6, Relu6, CudaRelu6Functor,
-                                CudaRelu6GradFunctor);
-REGISTER_ACTIVATION_CUDA_KERNEL(tanh_shrink, TanhShrink, CudaTanhShrinkFunctor,
-                                CudaTanhShrinkGradFunctor);
-REGISTER_ACTIVATION_CUDA_KERNEL(hard_shrink, HardShrink, CudaHardShrinkFunctor,
-                                CudaHardShrinkGradFunctor);
-REGISTER_ACTIVATION_CUDA_KERNEL(hard_sigmoid, HardSigmoid,
-                                CudaHardSigmoidFunctor,
-                                CudaHardSigmoidGradFunctor);
-REGISTER_ACTIVATION_CUDA_KERNEL(thresholded_relu, ThresholdedRelu,
-                                CudaThresholdedReluFunctor,
-                                CudaThresholdedReluGradFunctor);
-REGISTER_ACTIVATION_CUDA_KERNEL(hard_swish, HardSwish, CudaHardSwishFunctor,
-                                CudaHardSwishGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(log1p, Log1p, Log1pFunctor, Log1pGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(log2, Log2, Log2Functor, Log2GradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(log10, Log10, Log10Functor, Log10GradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(brelu, BRelu, BReluFunctor, BReluGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(soft_relu, SoftRelu, SoftReluFunctor,
+                               SoftReluGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(stanh, STanh, STanhFunctor, STanhGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(softplus, Softplus, SoftplusFunctor,
+                               SoftplusGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(softsign, Softsign, SoftsignFunctor,
+                               SoftsignGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(relu6, Relu6, Relu6Functor, Relu6GradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(tanh_shrink, TanhShrink, TanhShrinkFunctor,
+                               TanhShrinkGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(hard_shrink, HardShrink, HardShrinkFunctor,
+                               HardShrinkGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(hard_sigmoid, HardSigmoid, HardSigmoidFunctor,
+                               HardSigmoidGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(swish, Swish, SwishFunctor, SwishGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(thresholded_relu, ThresholdedRelu,
+                               ThresholdedReluFunctor,
+                               ThresholdedReluGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(hard_swish, HardSwish, HardSwishFunctor,
+                               HardSwishGradFunctor);

From f67e8a4421b02015e8ebc59e3fcadf7f54981c12 Mon Sep 17 00:00:00 2001
From: ZzSean <18818272991@163.com>
Date: Mon, 26 Apr 2021 11:12:11 +0000
Subject: [PATCH 09/11] add notes

---
 paddle/fluid/operators/activation_op.cu | 287 ++++++++++++++----------
 1 file changed, 172 insertions(+), 115 deletions(-)

diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index 81ddfe71ddd380..3db64136425dcf 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
@@ -19,76 +19,73 @@ namespace paddle {
 namespace operators {
 
 template <typename T>
-struct BaseCudaActiveFunctor {
-  using ELEMENT_TYPE = T;
-  using AttrPair = std::vector<std::pair<const char*, float*>>;
-  AttrPair GetAttrs() { return AttrPair(); }
-};
-
-// For forward, args[0] means the input x;
-// For backward, args[0] means the input dout, args[1] means the input x or out,
-// which depends on the FwdDeps;
-/********************Relu Begin********************/
-template <typename T>
-struct CudaReluFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaReluFunctor : public BaseActivationFunctor<T> {
   T zero = static_cast<T>(0.0f);
 
+  // relu(x) = max(x, 0)
+  // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     return args[0] > zero ? args[0] : zero;
   }
 };
 
 template <typename T>
-struct CudaReluGradFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaReluGradFunctor : public BaseActivationFunctor<T> {
   T zero = static_cast<T>(0.0f);
 
+  // dx = dout * (out > 0)
+  // Inputs: args[0], the input dout
+  //         args[1], the input out
   __device__ __forceinline__ T operator()(const T* args) const {
     return args[1] > zero ? args[0] : zero;
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };
-/********************Relu End********************/
 
-/********************LeakyRelu Begin********************/
 template <typename T>
-struct CudaLeakyReluFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaLeakyReluFunctor : public BaseActivationFunctor<T> {
   T zero = static_cast<T>(0.0f);
   float alpha;
 
-  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
     return {{"alpha", &alpha}};
   }
 
+  // leakyrelu(x) = x > 0 ? x : alpha * x
+  // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     return args[0] > zero ? args[0] : static_cast<T>(alpha) * args[0];
   }
 };
 
 template <typename T>
-struct CudaLeakyReluGradFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaLeakyReluGradFunctor : public BaseActivationFunctor<T> {
   T zero = static_cast<T>(0.0f);
   float alpha;
 
-  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
     return {{"alpha", &alpha}};
   }
 
+  // dx = dout * (x > 0 ? 1 : alpha)
+  // Inputs: args[0], the input dout
+  //         args[1], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     return args[1] > zero ? args[0] : static_cast<T>(alpha) * args[0];
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
-/********************LeakyRelu End********************/
 
-/********************Sigmoid Begin********************/
 template <typename T>
-struct CudaSigmoidFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaSigmoidFunctor : public BaseActivationFunctor<T> {
   // CT means Compute Type
   using CT = typename details::MPTypeTrait<T>::Type;
   CT one = static_cast<CT>(1.0f);
 
+  // sigmoid(x) = 1 / (1 + exp(-x))
+  // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     CT x = static_cast<CT>(args[0]);
     return T(one / (one + exp(-x)));
@@ -96,24 +93,27 @@ struct CudaSigmoidFunctor : public BaseCudaActiveFunctor<T> {
 };
 
 template <typename T>
-struct CudaSigmoidGradFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaSigmoidGradFunctor : public BaseActivationFunctor<T> {
   T one = static_cast<T>(1.0f);
 
+  // dx = dout * out * (1 - out)
+  // Inputs: args[0], the input dout
+  //         args[1], the input out
   __device__ __forceinline__ T operator()(const T* args) const {
     return args[0] * args[1] * (one - args[1]);
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };
-/********************Sigmoid End********************/
 
-/********************Silu Begin********************/
 template <typename T>
-struct CudaSiluFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaSiluFunctor : public BaseActivationFunctor<T> {
   // CT means Compute Type
   using CT = typename details::MPTypeTrait<T>::Type;
   CT one = static_cast<CT>(1.0f);
 
+  // silu(x) = x / (1 + exp(-x))
+  // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     CT x = static_cast<CT>(args[0]);
     return T(x / (one + exp(-x)));
@@ -121,10 +121,13 @@ struct CudaSiluFunctor : public BaseCudaActiveFunctor<T> {
 };
 
 template <typename T>
-struct CudaSiluGradFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaSiluGradFunctor : public BaseActivationFunctor<T> {
   using CT = typename details::MPTypeTrait<T>::Type;
   CT one = static_cast<CT>(1.0f);
 
+  // dx = dout * (1 + exp(-x) + x * exp(-x)) / (1 + exp(-x))^2
+  // Inputs: args[0], the input dout
+  //         args[1], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     CT dout = static_cast<CT>(args[0]);
     CT x = static_cast<CT>(args[1]);
@@ -135,14 +138,14 @@ struct CudaSiluGradFunctor : public BaseCudaActiveFunctor<T> {
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
-/********************Silu End********************/
 
-/********************LogSigmoid Begin********************/
 template <typename T>
-struct CudaLogSigmoidFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaLogSigmoidFunctor : public BaseActivationFunctor<T> {
   using CT = typename details::MPTypeTrait<T>::Type;
   CT zero = static_cast<CT>(0.0f);
 
+  // logsigmoid(x) = log(1 / (1 + exp(-x)))
+  // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     CT x = static_cast<CT>(args[0]);
     CT temp = x > zero ? zero : -x;
@@ -151,10 +154,13 @@ struct CudaLogSigmoidFunctor : public BaseCudaActiveFunctor<T> {
 };
 
 template <typename T>
-struct CudaLogSigmoidGradFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor<T> {
   using CT = typename details::MPTypeTrait<T>::Type;
   CT zero = static_cast<CT>(0.0f);
 
+  // dx = dout * exp(-x) / (1 + exp(-x))
+  // Inputs: args[0], the input dout
+  //         args[1], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     CT dout = static_cast<CT>(args[0]);
     CT x = static_cast<CT>(args[1]);
@@ -164,12 +170,12 @@ struct CudaLogSigmoidGradFunctor : public BaseCudaActiveFunctor<T> {
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
-/********************LogSigmoid End********************/
 
-/********************Atan Begin********************/
 template <typename T>
-struct CudaAtanFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaAtanFunctor : public BaseActivationFunctor<T> {
   using CT = typename details::MPTypeTrait<T>::Type;
+  // atan(x) = atan(x)
+  // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     CT x = static_cast<CT>(args[0]);
     return T(atan(x));
@@ -177,25 +183,30 @@ struct CudaAtanFunctor : public BaseCudaActiveFunctor<T> {
 };
 
 template <typename T>
-struct CudaAtanGradFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaAtanGradFunctor : public BaseActivationFunctor<T> {
   T one = static_cast<T>(1.0f);
+  // dx = dout / (1 + x^2)
+  // Inputs: args[0], the input dout
+  //         args[1], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     return args[0] / (one + args[1] * args[1]);
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
-/********************Atan End********************/
 
-/********************SoftShrink Begin********************/
 template <typename T>
-struct CudaSoftShrinkFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaSoftShrinkFunctor : public BaseActivationFunctor<T> {
   float lambda;
 
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
     return {{"lambda", &lambda}};
   }
 
+  // softshrink(x) = x - lambda, if x > lambda;
+  //                 x + lambda, if x < -lambda;
+  //                 0, otherwise.
+  // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     T x = args[0];
     T l = static_cast<T>(lambda);
@@ -206,13 +217,16 @@ struct CudaSoftShrinkFunctor : public BaseCudaActiveFunctor<T> {
 };
 
 template <typename T>
-struct CudaSoftShrinkGradFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor<T> {
   float lambda;
 
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
     return {{"lambda", &lambda}};
   }
 
+  // dx = dout, if x > lambda or x < -lambda else 0
+  // Inputs: args[0], the input dout
+  //         args[1], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     T x = args[1];
     T l = static_cast<T>(lambda);
@@ -223,56 +237,55 @@ struct CudaSoftShrinkGradFunctor : public BaseCudaActiveFunctor<T> {
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
-/********************SoftShrink End********************/
 
-/********************Ceil Begin********************/
 template <typename T>
-struct CudaCeilFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaCeilFunctor : public BaseActivationFunctor<T> {
   using CT = typename details::MPTypeTrait<T>::Type;
+  // ceil(x) = ceil(x)
+  // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     CT x = static_cast<CT>(args[0]);
     return T(ceil(x));
   }
 };
-/********************Ceil End********************/
 
-/********************Floor Begin********************/
 template <typename T>
-struct CudaFloorFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaFloorFunctor : public BaseActivationFunctor<T> {
   using CT = typename details::MPTypeTrait<T>::Type;
+  // floor(x) = floor(x)
+  // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     CT x = static_cast<CT>(args[0]);
     return T(floor(x));
   }
 };
-/********************Floor End********************/
 
-/********************Round Begin********************/
 template <typename T>
-struct CudaRoundFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaRoundFunctor : public BaseActivationFunctor<T> {
   using CT = typename details::MPTypeTrait<T>::Type;
+  // round(x) = round(x)
+  // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     CT x = static_cast<CT>(args[0]);
     return T(round(x));
   }
 };
-/********************Floor End********************/
 
-/********************Zero Begin********************/
+// grad functor for ceil, floor and round
 template <typename T>
-struct CudaZeroGradFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaZeroGradFunctor : public BaseActivationFunctor<T> {
   __device__ __forceinline__ T operator()(const T* args) const {
     return static_cast<T>(0.0f);
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; }
 };
-/********************Zero End********************/
 
-/********************Cos Begin********************/
 template <typename T>
-struct CudaCosFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaCosFunctor : public BaseActivationFunctor<T> {
   using CT = typename details::MPTypeTrait<T>::Type;
+  // cos(x) = cos(x)
+  // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     CT x = static_cast<CT>(args[0]);
     return T(cos(x));
@@ -280,8 +293,11 @@ struct CudaCosFunctor : public BaseCudaActiveFunctor<T> {
 };
 
 template <typename T>
-struct CudaCosGradFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaCosGradFunctor : public BaseActivationFunctor<T> {
   using CT = typename details::MPTypeTrait<T>::Type;
+  // dx = dout * (-sin(x))
+  // Inputs: args[0], the input dout
+  //         args[1], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     CT dout = static_cast<CT>(args[0]);
     CT x = static_cast<CT>(args[1]);
@@ -290,12 +306,12 @@ struct CudaCosGradFunctor : public BaseCudaActiveFunctor<T> {
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
-/********************Cos End********************/
 
-/********************Sin Begin********************/
 template <typename T>
-struct CudaSinFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaSinFunctor : public BaseActivationFunctor<T> {
   using CT = typename details::MPTypeTrait<T>::Type;
+  // sin(x) = sin(x)
+  // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     CT x = static_cast<CT>(args[0]);
     return T(sin(x));
@@ -303,8 +319,11 @@ struct CudaSinFunctor : public BaseCudaActiveFunctor<T> {
 };
 
 template <typename T>
-struct CudaSinGradFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaSinGradFunctor : public BaseActivationFunctor<T> {
   using CT = typename details::MPTypeTrait<T>::Type;
+  // dx = dout * cos(x)
+  // Inputs: args[0], the input dout
+  //         args[1], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     CT dout = static_cast<CT>(args[0]);
     CT x = static_cast<CT>(args[1]);
@@ -313,12 +332,12 @@ struct CudaSinGradFunctor : public BaseCudaActiveFunctor<T> {
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
-/********************Sin End********************/
 
-/********************Tan Begin********************/
 template <typename T>
-struct CudaTanFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaTanFunctor : public BaseActivationFunctor<T> {
   using CT = typename details::MPTypeTrait<T>::Type;
+  // tan(x) = tan(x)
+  // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     CT x = static_cast<CT>(args[0]);
     return T(tan(x));
@@ -326,8 +345,11 @@ struct CudaTanFunctor : public BaseCudaActiveFunctor<T> {
 };
 
 template <typename T>
-struct CudaTanGradFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaTanGradFunctor : public BaseActivationFunctor<T> {
   using CT = typename details::MPTypeTrait<T>::Type;
+  // dx = dout / cos(x)^2
+  // Inputs: args[0], the input dout
+  //         args[1], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     CT dout = static_cast<CT>(args[0]);
     CT x = static_cast<CT>(args[1]);
@@ -336,12 +358,12 @@ struct CudaTanGradFunctor : public BaseCudaActiveFunctor<T> {
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
-/********************Tan End********************/
 
-/********************Asin Begin********************/
 template <typename T>
-struct CudaAsinFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaAsinFunctor : public BaseActivationFunctor<T> {
   using CT = typename details::MPTypeTrait<T>::Type;
+  // asin(x) = asin(x)
+  // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     CT x = static_cast<CT>(args[0]);
     return T(asin(x));
@@ -349,9 +371,12 @@ struct CudaAsinFunctor : public BaseCudaActiveFunctor<T> {
 };
 
 template <typename T>
-struct CudaAsinGradFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaAsinGradFunctor : public BaseActivationFunctor<T> {
   using CT = typename details::MPTypeTrait<T>::Type;
   CT one = static_cast<CT>(1.0f);
+  // dx = dout / sqrt(1 - x^2)
+  // Inputs: args[0], the input dout
+  //         args[1], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     CT dout = static_cast<CT>(args[0]);
     CT x = static_cast<CT>(args[1]);
@@ -360,12 +385,12 @@ struct CudaAsinGradFunctor : public BaseCudaActiveFunctor<T> {
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
-/********************Asin End********************/
 
-/********************Acos Begin********************/
 template <typename T>
-struct CudaAcosFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaAcosFunctor : public BaseActivationFunctor<T> {
   using CT = typename details::MPTypeTrait<T>::Type;
+  // acos(x) = acos(x)
+  // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     CT x = static_cast<CT>(args[0]);
     return T(acos(x));
@@ -373,9 +398,12 @@ struct CudaAcosFunctor : public BaseCudaActiveFunctor<T> {
 };
 
 template <typename T>
-struct CudaAcosGradFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaAcosGradFunctor : public BaseActivationFunctor<T> {
   using CT = typename details::MPTypeTrait<T>::Type;
   CT one = static_cast<CT>(1.0f);
+  // dx = -dout / sqrt(1 - x^2)
+  // Inputs: args[0], the input dout
+  //         args[1], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     CT dout = static_cast<CT>(args[0]);
     CT x = static_cast<CT>(args[1]);
@@ -384,12 +412,12 @@ struct CudaAcosGradFunctor : public BaseCudaActiveFunctor<T> {
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
-/********************Acos End********************/
 
-/********************Cosh Begin********************/
 template <typename T>
-struct CudaCoshFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaCoshFunctor : public BaseActivationFunctor<T> {
   using CT = typename details::MPTypeTrait<T>::Type;
+  // cosh(x) = cosh(x)
+  // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     CT x = static_cast<CT>(args[0]);
     return T(cosh(x));
@@ -397,8 +425,11 @@ struct CudaCoshFunctor : public BaseCudaActiveFunctor<T> {
 };
 
 template <typename T>
-struct CudaCoshGradFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaCoshGradFunctor : public BaseActivationFunctor<T> {
   using CT = typename details::MPTypeTrait<T>::Type;
+  // dx = dout * sinh(x)
+  // Inputs: args[0], the input dout
+  //         args[1], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     CT dout = static_cast<CT>(args[0]);
     CT x = static_cast<CT>(args[1]);
@@ -407,12 +438,12 @@ struct CudaCoshGradFunctor : public BaseCudaActiveFunctor<T> {
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
-/********************Cosh End********************/
 
-/********************Sinh Begin********************/
 template <typename T>
-struct CudaSinhFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaSinhFunctor : public BaseActivationFunctor<T> {
   using CT = typename details::MPTypeTrait<T>::Type;
+  // sinh(x) = sinh(x)
+  // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     CT x = static_cast<CT>(args[0]);
     return T(sinh(x));
@@ -420,8 +451,11 @@ struct CudaSinhFunctor : public BaseCudaActiveFunctor<T> {
 };
 
 template <typename T>
-struct CudaSinhGradFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaSinhGradFunctor : public BaseActivationFunctor<T> {
   using CT = typename details::MPTypeTrait<T>::Type;
+  // dx = dout * cosh(x)
+  // Inputs: args[0], the input dout
+  //         args[1], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     CT dout = static_cast<CT>(args[0]);
     CT x = static_cast<CT>(args[1]);
@@ -430,12 +464,12 @@ struct CudaSinhGradFunctor : public BaseCudaActiveFunctor<T> {
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
-/********************Sinh End********************/
 
-/********************Tanh Begin********************/
 template <typename T>
-struct CudaTanhFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaTanhFunctor : public BaseActivationFunctor<T> {
   using CT = typename details::MPTypeTrait<T>::Type;
+  // tanh(x) = tanh(x)
+  // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     CT x = static_cast<CT>(args[0]);
     return T(tanh(x));
@@ -443,8 +477,11 @@ struct CudaTanhFunctor : public BaseCudaActiveFunctor<T> {
 };
 
 template <typename T>
-struct CudaTanhGradFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaTanhGradFunctor : public BaseActivationFunctor<T> {
   T one = static_cast<T>(1.0f);
+  // dx = dout * (1 - out^2)
+  // Inputs: args[0], the input dout
+  //         args[1], the input out
   __device__ __forceinline__ T operator()(const T* args) const {
     T dout = static_cast<T>(args[0]);
     T out = static_cast<T>(args[1]);
@@ -453,31 +490,34 @@ struct CudaTanhGradFunctor : public BaseCudaActiveFunctor<T> {
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };
-/********************Tanh End********************/
 
-/********************Reciprocal Begin********************/
 template <typename T>
-struct CudaReciprocalFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaReciprocalFunctor : public BaseActivationFunctor<T> {
   T one = static_cast<T>(1.0f);
+  // reciprocal(x) = 1 / x
+  // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     return one / args[0];
   }
 };
 
 template <typename T>
-struct CudaReciprocalGradFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaReciprocalGradFunctor : public BaseActivationFunctor<T> {
+  // dx = -dout * out^2
+  // Inputs: args[0], the input dout
+  //         args[1], the input out
   __device__ __forceinline__ T operator()(const T* args) const {
     return -args[0] * args[1] * args[1];
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };
-/********************Reciprocal End********************/
 
-/********************Exp Begin********************/
 template <typename T>
-struct CudaExpFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaExpFunctor : public BaseActivationFunctor<T> {
   using CT = typename details::MPTypeTrait<T>::Type;
+  // exp(x) = exp(x)
+  // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     CT x = static_cast<CT>(args[0]);
     return T(exp(x));
@@ -485,19 +525,22 @@ struct CudaExpFunctor : public BaseCudaActiveFunctor<T> {
 };
 
 template <typename T>
-struct CudaExpGradFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaExpGradFunctor : public BaseActivationFunctor<T> {
+  // dx = dout * out
+  // Inputs: args[0], the input dout
+  //         args[1], the input out
   __device__ __forceinline__ T operator()(const T* args) const {
     return args[0] * args[1];
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };
-/********************Exp End********************/
 
-/********************Log Begin********************/
 template <typename T>
-struct CudaLogFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaLogFunctor : public BaseActivationFunctor<T> {
   using CT = typename details::MPTypeTrait<T>::Type;
+  // log(x) = log(x)
+  // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     CT x = static_cast<CT>(args[0]);
     return T(log(x));
@@ -505,27 +548,30 @@ struct CudaLogFunctor : public BaseCudaActiveFunctor<T> {
 };
 
 template <typename T>
-struct CudaLogGradFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaLogGradFunctor : public BaseActivationFunctor<T> {
+  // dx = dout / x
+  // Inputs: args[0], the input dout
+  //         args[1], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     return args[0] / args[1];
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
-/********************Log End********************/
 
-/********************ELU Begin********************/
 template <typename T>
-struct CudaELUFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaELUFunctor : public BaseActivationFunctor<T> {
   using CT = typename details::MPTypeTrait<T>::Type;
   CT zero = static_cast<CT>(0.0f);
   CT one = static_cast<CT>(1.0f);
   float alpha;
 
-  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
     return {{"alpha", &alpha}};
   }
 
+  // elu(x) = x >= 0 ? x : alpha * (exp(x) - 1)
+  // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     CT x = static_cast<CT>(args[0]);
     return x >= zero ? args[0] : T(static_cast<CT>(alpha) * (exp(x) - one));
@@ -533,16 +579,19 @@ struct CudaELUFunctor : public BaseCudaActiveFunctor<T> {
 };
 
 template <typename T>
-struct CudaELUGradFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaELUGradFunctor : public BaseActivationFunctor<T> {
   using CT = typename details::MPTypeTrait<T>::Type;
   CT zero = static_cast<CT>(0.0f);
   CT one = static_cast<CT>(1.0f);
   float alpha;
 
-  typename BaseCudaActiveFunctor<T>::AttrPair GetAttrs() {
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
     return {{"alpha", &alpha}};
   }
 
+  // dx = x >= 0 ? dout : dout * alpha * exp(x)
+  // Inputs: args[0], the input dout
+  //         args[1], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     CT dout = static_cast<CT>(args[0]);
     CT x = static_cast<CT>(args[1]);
@@ -551,31 +600,34 @@ struct CudaELUGradFunctor : public BaseCudaActiveFunctor<T> {
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
-/********************ELU End********************/
 
-/********************Square Begin********************/
 template <typename T>
-struct CudaSquareFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaSquareFunctor : public BaseActivationFunctor<T> {
+  // square(x) = x * x
+  // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     return args[0] * args[0];
   }
 };
 
 template <typename T>
-struct CudaSquareGradFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaSquareGradFunctor : public BaseActivationFunctor<T> {
   T two = static_cast<T>(2.0f);
+  // dx = dout * 2 * x
+  // Inputs: args[0], the input dout
+  //         args[1], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     return args[0] * two * args[1];
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
-/********************Square End********************/
 
-/********************Sqrt Begin********************/
 template <typename T>
-struct CudaSqrtFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaSqrtFunctor : public BaseActivationFunctor<T> {
   using CT = typename details::MPTypeTrait<T>::Type;
+  // sqrt(x) = sqrt(x)
+  // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     CT x = static_cast<CT>(args[0]);
     return T(sqrt(x));
@@ -583,20 +635,23 @@ struct CudaSqrtFunctor : public BaseCudaActiveFunctor<T> {
 };
 
 template <typename T>
-struct CudaSqrtGradFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaSqrtGradFunctor : public BaseActivationFunctor<T> {
   T one_half = static_cast<T>(0.5f);
+  // dx = dout * 0.5 / out
+  // Inputs: args[0], the input dout
+  //         args[1], the input out
   __device__ __forceinline__ T operator()(const T* args) const {
     return one_half * args[0] / args[1];
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };
-/********************Sqrt End********************/
 
-/********************Rsqrt Begin********************/
 template <typename T>
-struct CudaRsqrtFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaRsqrtFunctor : public BaseActivationFunctor<T> {
   using CT = typename details::MPTypeTrait<T>::Type;
+  // rsqrt(x) = rsqrt(x)
+  // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
     CT x = static_cast<CT>(args[0]);
     return T(rsqrt(x));
@@ -604,8 +659,11 @@ struct CudaRsqrtFunctor : public BaseCudaActiveFunctor<T> {
 };
 
 template <typename T>
-struct CudaRsqrtGradFunctor : public BaseCudaActiveFunctor<T> {
+struct CudaRsqrtGradFunctor : public BaseActivationFunctor<T> {
   T minus_one_half = static_cast<T>(-0.5f);
+  // dx = dout * -0.5 / out^3
+  // Inputs: args[0], the input dout
+  //         args[1], the input out
   __device__ __forceinline__ T operator()(const T* args) const {
     T out = args[1];
     return minus_one_half * args[0] * out * out * out;
@@ -613,7 +671,6 @@ struct CudaRsqrtGradFunctor : public BaseCudaActiveFunctor<T> {
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };
-/********************Rsqrt End********************/
 
 template <typename DeviceContext, typename Functor>
 class ActivationCudaKernel

From 63e938dd701e795a07957474716aa0aae6a21b61 Mon Sep 17 00:00:00 2001
From: ZzSean <18818272991@163.com>
Date: Tue, 27 Apr 2021 03:30:42 +0000
Subject: [PATCH 10/11] Rename CT to MPType, tidy grad functors, drop CudaELU functors

---
 paddle/fluid/operators/activation_op.cu | 297 ++++++++++++------------
 1 file changed, 144 insertions(+), 153 deletions(-)

diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index 3db64136425dcf..23d309357d5705 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -80,15 +80,14 @@ struct CudaLeakyReluGradFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct CudaSigmoidFunctor : public BaseActivationFunctor<T> {
-  // CT means Compute Type
-  using CT = typename details::MPTypeTrait<T>::Type;
-  CT one = static_cast<CT>(1.0f);
+  using MPType = typename details::MPTypeTrait<T>::Type;
+  MPType one = static_cast<MPType>(1.0f);
 
   // sigmoid(x) = 1 / (1 + exp(-x))
   // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
-    CT x = static_cast<CT>(args[0]);
-    return T(one / (one + exp(-x)));
+    MPType x = static_cast<MPType>(args[0]);
+    return static_cast<T>(one / (one + exp(-x)));
   }
 };
 
@@ -108,32 +107,31 @@ struct CudaSigmoidGradFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct CudaSiluFunctor : public BaseActivationFunctor<T> {
-  // CT means Compute Type
-  using CT = typename details::MPTypeTrait<T>::Type;
-  CT one = static_cast<CT>(1.0f);
+  // MPType means Compute Type
+  using MPType = typename details::MPTypeTrait<T>::Type;
+  MPType one = static_cast<MPType>(1.0f);
 
   // silu(x) = x / (1 + exp(-x))
   // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
-    CT x = static_cast<CT>(args[0]);
-    return T(x / (one + exp(-x)));
+    MPType x = static_cast<MPType>(args[0]);
+    return static_cast<T>(x / (one + exp(-x)));
   }
 };
 
 template <typename T>
 struct CudaSiluGradFunctor : public BaseActivationFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
-  CT one = static_cast<CT>(1.0f);
+  using MPType = typename details::MPTypeTrait<T>::Type;
+  MPType one = static_cast<MPType>(1.0f);
 
   // dx = dout * (1 + exp(-x) + x * exp(-x) / (1 + exp(-x))^2)
   // Inputs: args[0], the input dout
   //         args[1], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
-    CT dout = static_cast<CT>(args[0]);
-    CT x = static_cast<CT>(args[1]);
-    CT temp1 = one + exp(-x);
-    CT temp2 = x * exp(-x);
-    return T(dout * ((one / temp1) * (one + temp2 / temp1)));
+    MPType dout = static_cast<MPType>(args[0]);
+    MPType x = static_cast<MPType>(args[1]);
+    MPType temp = one / (one + exp(-x));
+    return static_cast<T>(dout * (temp * (one + x * (one - temp))));
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
@@ -141,31 +139,38 @@ struct CudaSiluGradFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct CudaLogSigmoidFunctor : public BaseActivationFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
-  CT zero = static_cast<CT>(0.0f);
+  using MPType = typename details::MPTypeTrait<T>::Type;
+  MPType zero = static_cast<MPType>(0.0f);
 
   // logsigmoid(x) = log(1 / (1 + exp(-x)))
+  // For numerical stability,
+  // logsigmoid(x) =
+  //          - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0))))
   // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
-    CT x = static_cast<CT>(args[0]);
-    CT temp = x > zero ? zero : -x;
-    return T(-temp - log(exp(-temp) + exp(-x - temp)));
+    MPType x = static_cast<MPType>(args[0]);
+    MPType temp = x > zero ? zero : -x;
+    return static_cast<T>(-temp - log(exp(-temp) + exp(-x - temp)));
   }
 };
 
 template <typename T>
 struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
-  CT zero = static_cast<CT>(0.0f);
+  using MPType = typename details::MPTypeTrait<T>::Type;
+  MPType zero = static_cast<MPType>(0.0f);
 
   // dx = dout * exp(-x) / (1 + exp(-x))
+  // For numerical stability:
+  // dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x,
+  // 0)))
   // Inputs: args[0], the input dout
   //         args[1], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
-    CT dout = static_cast<CT>(args[0]);
-    CT x = static_cast<CT>(args[1]);
-    CT temp = x > zero ? zero : -x;
-    return T(dout * (exp(-x - temp) / (exp(-temp) + exp(-x - temp))));
+    MPType dout = static_cast<MPType>(args[0]);
+    MPType x = static_cast<MPType>(args[1]);
+    MPType temp1 = x > zero ? zero : -x;
+    MPType temp2 = exp(-x - temp);
+    return static_cast<T>(dout * (temp2 / (exp(-temp1) + temp2)));
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
@@ -173,18 +178,20 @@ struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct CudaAtanFunctor : public BaseActivationFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
+  using MPType = typename details::MPTypeTrait<T>::Type;
+
   // atan(x) = atan(x)
   // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
-    CT x = static_cast<CT>(args[0]);
-    return T(atan(x));
+    MPType x = static_cast<MPType>(args[0]);
+    return static_cast<T>(atan(x));
   }
 };
 
 template <typename T>
 struct CudaAtanGradFunctor : public BaseActivationFunctor<T> {
   T one = static_cast<T>(1.0f);
+
   // dx = dout / (1 + x^2)
   // Inputs: args[0], the input dout
   //         args[1], the input x
@@ -218,6 +225,7 @@ struct CudaSoftShrinkFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor<T> {
+  T zero = static_cast<T>(0.0f);
   float lambda;
 
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
@@ -230,9 +238,7 @@ struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor<T> {
   __device__ __forceinline__ T operator()(const T* args) const {
     T x = args[1];
     T l = static_cast<T>(lambda);
-    T temp1 = static_cast<T>(x > l);
-    T temp2 = static_cast<T>(x < -l);
-    return args[0] * (temp1 + temp2);
+    return (x >= -l && x <= l) ? zero : args[0];
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
@@ -240,38 +246,41 @@ struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct CudaCeilFunctor : public BaseActivationFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
+  using MPType = typename details::MPTypeTrait<T>::Type;
+
   // ceil(x) = ceil(x)
   // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
-    CT x = static_cast<CT>(args[0]);
-    return T(ceil(x));
+    MPType x = static_cast<MPType>(args[0]);
+    return static_cast<T>(ceil(x));
   }
 };
 
 template <typename T>
 struct CudaFloorFunctor : public BaseActivationFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
+  using MPType = typename details::MPTypeTrait<T>::Type;
+
   // floor(x) = floor(x)
   // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
-    CT x = static_cast<CT>(args[0]);
-    return T(floor(x));
+    MPType x = static_cast<MPType>(args[0]);
+    return static_cast<T>(floor(x));
   }
 };
 
 template <typename T>
 struct CudaRoundFunctor : public BaseActivationFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
+  using MPType = typename details::MPTypeTrait<T>::Type;
+
   // round(x) = round(x)
   // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
-    CT x = static_cast<CT>(args[0]);
-    return T(round(x));
+    MPType x = static_cast<MPType>(args[0]);
+    return static_cast<T>(round(x));
   }
 };
 
-// grad functor for ceil. floor and round
+// grad functor for ceil, floor and round
 template <typename T>
 struct CudaZeroGradFunctor : public BaseActivationFunctor<T> {
   __device__ __forceinline__ T operator()(const T* args) const {
@@ -283,25 +292,27 @@ struct CudaZeroGradFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct CudaCosFunctor : public BaseActivationFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
+  using MPType = typename details::MPTypeTrait<T>::Type;
+
   // cos(x) = cos(x)
   // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
-    CT x = static_cast<CT>(args[0]);
-    return T(cos(x));
+    MPType x = static_cast<MPType>(args[0]);
+    return static_cast<T>(cos(x));
   }
 };
 
 template <typename T>
 struct CudaCosGradFunctor : public BaseActivationFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
+  using MPType = typename details::MPTypeTrait<T>::Type;
+
   // dx = dout * (-sin(x))
   // Inputs: args[0], the input dout
   //         args[1], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
-    CT dout = static_cast<CT>(args[0]);
-    CT x = static_cast<CT>(args[1]);
-    return T(-dout * sin(x));
+    MPType dout = static_cast<MPType>(args[0]);
+    MPType x = static_cast<MPType>(args[1]);
+    return static_cast<T>(-dout * sin(x));
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
@@ -309,25 +320,27 @@ struct CudaCosGradFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct CudaSinFunctor : public BaseActivationFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
+  using MPType = typename details::MPTypeTrait<T>::Type;
+
   // sin(x) = sin(x)
   // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
-    CT x = static_cast<CT>(args[0]);
-    return T(sin(x));
+    MPType x = static_cast<MPType>(args[0]);
+    return static_cast<T>(sin(x));
   }
 };
 
 template <typename T>
 struct CudaSinGradFunctor : public BaseActivationFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
+  using MPType = typename details::MPTypeTrait<T>::Type;
+
   // dx = dout * cos(x)
   // Inputs: args[0], the input dout
   //         args[1], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
-    CT dout = static_cast<CT>(args[0]);
-    CT x = static_cast<CT>(args[1]);
-    return T(dout * cos(x));
+    MPType dout = static_cast<MPType>(args[0]);
+    MPType x = static_cast<MPType>(args[1]);
+    return static_cast<T>(dout * cos(x));
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
@@ -335,25 +348,27 @@ struct CudaSinGradFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct CudaTanFunctor : public BaseActivationFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
+  using MPType = typename details::MPTypeTrait<T>::Type;
+
   // tan(x) = tan(x)
   // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
-    CT x = static_cast<CT>(args[0]);
-    return T(tan(x));
+    MPType x = static_cast<MPType>(args[0]);
+    return static_cast<T>(tan(x));
   }
 };
 
 template <typename T>
 struct CudaTanGradFunctor : public BaseActivationFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
+  using MPType = typename details::MPTypeTrait<T>::Type;
+
   // dx = dout / cos(x)^2
   // Inputs: args[0], the input dout
   //         args[1], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
-    CT dout = static_cast<CT>(args[0]);
-    CT x = static_cast<CT>(args[1]);
-    return T(dout / (cos(x) * cos(x)));
+    MPType dout = static_cast<MPType>(args[0]);
+    MPType x = static_cast<MPType>(args[1]);
+    return static_cast<T>(dout / (cos(x) * cos(x)));
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
@@ -361,26 +376,28 @@ struct CudaTanGradFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct CudaAsinFunctor : public BaseActivationFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
+  using MPType = typename details::MPTypeTrait<T>::Type;
+
   // asin(x) = asin(x)
   // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
-    CT x = static_cast<CT>(args[0]);
-    return T(asin(x));
+    MPType x = static_cast<MPType>(args[0]);
+    return static_cast<T>(asin(x));
   }
 };
 
 template <typename T>
 struct CudaAsinGradFunctor : public BaseActivationFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
-  CT one = static_cast<CT>(1.0f);
+  using MPType = typename details::MPTypeTrait<T>::Type;
+  MPType one = static_cast<MPType>(1.0f);
+
   // dx = dout / sqrt(1 - x^2)
   // Inputs: args[0], the input dout
   //         args[1], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
-    CT dout = static_cast<CT>(args[0]);
-    CT x = static_cast<CT>(args[1]);
-    return T(dout / sqrt(one - x * x));
+    MPType dout = static_cast<MPType>(args[0]);
+    MPType x = static_cast<MPType>(args[1]);
+    return static_cast<T>(dout / sqrt(one - x * x));
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
@@ -388,26 +405,28 @@ struct CudaAsinGradFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct CudaAcosFunctor : public BaseActivationFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
+  using MPType = typename details::MPTypeTrait<T>::Type;
+
   // acos(x) = acos(x)
   // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
-    CT x = static_cast<CT>(args[0]);
-    return T(acos(x));
+    MPType x = static_cast<MPType>(args[0]);
+    return static_cast<T>(acos(x));
   }
 };
 
 template <typename T>
 struct CudaAcosGradFunctor : public BaseActivationFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
-  CT one = static_cast<CT>(1.0f);
+  using MPType = typename details::MPTypeTrait<T>::Type;
+  MPType one = static_cast<MPType>(1.0f);
+
   // dx = -dout / sqrt(1 - x^2)
   // Inputs: args[0], the input dout
   //         args[1], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
-    CT dout = static_cast<CT>(args[0]);
-    CT x = static_cast<CT>(args[1]);
-    return T(-dout / sqrt(one - x * x));
+    MPType dout = static_cast<MPType>(args[0]);
+    MPType x = static_cast<MPType>(args[1]);
+    return static_cast<T>(-dout / sqrt(one - x * x));
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
@@ -415,25 +434,27 @@ struct CudaAcosGradFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct CudaCoshFunctor : public BaseActivationFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
+  using MPType = typename details::MPTypeTrait<T>::Type;
+
   // cosh(x) = cosh(x)
   // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
-    CT x = static_cast<CT>(args[0]);
-    return T(cosh(x));
+    MPType x = static_cast<MPType>(args[0]);
+    return static_cast<T>(cosh(x));
   }
 };
 
 template <typename T>
 struct CudaCoshGradFunctor : public BaseActivationFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
+  using MPType = typename details::MPTypeTrait<T>::Type;
+
   // dx = dout * sinh(x)
   // Inputs: args[0], the input dout
   //         args[1], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
-    CT dout = static_cast<CT>(args[0]);
-    CT x = static_cast<CT>(args[1]);
-    return T(dout * sinh(x));
+    MPType dout = static_cast<MPType>(args[0]);
+    MPType x = static_cast<MPType>(args[1]);
+    return static_cast<T>(dout * sinh(x));
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
@@ -441,25 +462,27 @@ struct CudaCoshGradFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct CudaSinhFunctor : public BaseActivationFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
+  using MPType = typename details::MPTypeTrait<T>::Type;
+
   // sinh(x) = sinh(x)
   // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
-    CT x = static_cast<CT>(args[0]);
-    return T(sinh(x));
+    MPType x = static_cast<MPType>(args[0]);
+    return static_cast<T>(sinh(x));
   }
 };
 
 template <typename T>
 struct CudaSinhGradFunctor : public BaseActivationFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
+  using MPType = typename details::MPTypeTrait<T>::Type;
+
   // dx = dout * cosh(x)
   // Inputs: args[0], the input dout
   //         args[1], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
-    CT dout = static_cast<CT>(args[0]);
-    CT x = static_cast<CT>(args[1]);
-    return T(dout * cosh(x));
+    MPType dout = static_cast<MPType>(args[0]);
+    MPType x = static_cast<MPType>(args[1]);
+    return static_cast<T>(dout * cosh(x));
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
@@ -467,18 +490,20 @@ struct CudaSinhGradFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct CudaTanhFunctor : public BaseActivationFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
+  using MPType = typename details::MPTypeTrait<T>::Type;
+
   // tanh(x) = tanh(x)
   // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
-    CT x = static_cast<CT>(args[0]);
-    return T(tanh(x));
+    MPType x = static_cast<MPType>(args[0]);
+    return static_cast<T>(tanh(x));
   }
 };
 
 template <typename T>
 struct CudaTanhGradFunctor : public BaseActivationFunctor<T> {
   T one = static_cast<T>(1.0f);
+
   // dx = dout * (1 - out^2)
   // Inputs: args[0], the input dout
   //         args[1], the input out
@@ -494,6 +519,7 @@ struct CudaTanhGradFunctor : public BaseActivationFunctor<T> {
 template <typename T>
 struct CudaReciprocalFunctor : public BaseActivationFunctor<T> {
   T one = static_cast<T>(1.0f);
+
   // reciprocal(x) = 1 / x
   // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
@@ -515,12 +541,13 @@ struct CudaReciprocalGradFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct CudaExpFunctor : public BaseActivationFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
+  using MPType = typename details::MPTypeTrait<T>::Type;
+
   // exp(x) = exp(x)
   // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
-    CT x = static_cast<CT>(args[0]);
-    return T(exp(x));
+    MPType x = static_cast<MPType>(args[0]);
+    return static_cast<T>(exp(x));
   }
 };
 
@@ -538,12 +565,13 @@ struct CudaExpGradFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct CudaLogFunctor : public BaseActivationFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
+  using MPType = typename details::MPTypeTrait<T>::Type;
+
   // log(x) = log(x)
   // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
-    CT x = static_cast<CT>(args[0]);
-    return T(log(x));
+    MPType x = static_cast<MPType>(args[0]);
+    return static_cast<T>(log(x));
   }
 };
 
@@ -559,48 +587,6 @@ struct CudaLogGradFunctor : public BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
 
-template <typename T>
-struct CudaELUFunctor : public BaseActivationFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
-  CT zero = static_cast<CT>(0.0f);
-  CT one = static_cast<CT>(1.0f);
-  float alpha;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-
-  // elu(x) = x >= 0 ? x : alpha * (exp(x) - 1)
-  // Inputs: args[0], the input x
-  __device__ __forceinline__ T operator()(const T* args) const {
-    CT x = static_cast<CT>(args[0]);
-    return x >= zero ? args[0] : T(static_cast<CT>(alpha) * (exp(x) - one));
-  }
-};
-
-template <typename T>
-struct CudaELUGradFunctor : public BaseActivationFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
-  CT zero = static_cast<CT>(0.0f);
-  CT one = static_cast<CT>(1.0f);
-  float alpha;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-
-  // dx = x >= 0 ? dout : dout * alpha * exp(x)
-  // Inputs: args[0], the input dout
-  //         args[1], the input x
-  __device__ __forceinline__ T operator()(const T* args) const {
-    CT dout = static_cast<CT>(args[0]);
-    CT x = static_cast<CT>(args[1]);
-    return x >= zero ? args[0] : T(dout * static_cast<CT>(alpha) * exp(x));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
-};
-
 template <typename T>
 struct CudaSquareFunctor : public BaseActivationFunctor<T> {
   // square(x) = x * x
@@ -613,6 +599,7 @@ struct CudaSquareFunctor : public BaseActivationFunctor<T> {
 template <typename T>
 struct CudaSquareGradFunctor : public BaseActivationFunctor<T> {
   T two = static_cast<T>(2.0f);
+
   // dx = dout * 2 * x
   // Inputs: args[0], the input dout
   //         args[1], the input x
@@ -625,18 +612,20 @@ struct CudaSquareGradFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct CudaSqrtFunctor : public BaseActivationFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
+  using MPType = typename details::MPTypeTrait<T>::Type;
+
   // sqrt(x) = sqrt(x)
   // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
-    CT x = static_cast<CT>(args[0]);
-    return T(sqrt(x));
+    MPType x = static_cast<MPType>(args[0]);
+    return static_cast<T>(sqrt(x));
   }
 };
 
 template <typename T>
 struct CudaSqrtGradFunctor : public BaseActivationFunctor<T> {
   T one_half = static_cast<T>(0.5f);
+
   // dx = dout * 0.5 / out
   // Inputs: args[0], the input dout
   //         args[1], the input out
@@ -649,18 +638,20 @@ struct CudaSqrtGradFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct CudaRsqrtFunctor : public BaseActivationFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
+  using MPType = typename details::MPTypeTrait<T>::Type;
+
   // rsqrt(x) = rsqrt(x)
   // Inputs: args[0], the input x
   __device__ __forceinline__ T operator()(const T* args) const {
-    CT x = static_cast<CT>(args[0]);
-    return T(rsqrt(x));
+    MPType x = static_cast<MPType>(args[0]);
+    return static_cast<T>(rsqrt(x));
   }
 };
 
 template <typename T>
 struct CudaRsqrtGradFunctor : public BaseActivationFunctor<T> {
   T minus_one_half = static_cast<T>(-0.5f);
+
   // dx = dout * -0.5 / out^3
   // Inputs: args[0], the input dout
   //         args[1], the input out
@@ -791,7 +782,7 @@ REGISTER_OP_CUDA_KERNEL(
 /* ========================================================================== */
 
 /* ======================== elu register  ============================ */
-REGISTER_ACTIVATION_CUDA_KERNEL(elu, ELU, CudaELUFunctor, CudaELUGradFunctor);
+REGISTER_ACTIVATION_GPU_KERNEL(elu, ELU, ELUFunctor, ELUGradFunctor);
 
 REGISTER_OP_CUDA_KERNEL(
     elu_grad_grad, ops::ELUDoubleGradKernel<plat::CUDADeviceContext,

From 31665c6731842a92bf65f23b9e98de8723144971 Mon Sep 17 00:00:00 2001
From: ZzSean <18818272991@163.com>
Date: Tue, 27 Apr 2021 03:32:25 +0000
Subject: [PATCH 11/11] Use temp1 instead of undeclared temp in CudaLogSigmoidGradFunctor

---
 paddle/fluid/operators/activation_op.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index 23d309357d5705..836c5fa06f6dfe 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -169,7 +169,7 @@ struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor<T> {
     MPType dout = static_cast<MPType>(args[0]);
     MPType x = static_cast<MPType>(args[1]);
     MPType temp1 = x > zero ? zero : -x;
-    MPType temp2 = exp(-x - temp);
+    MPType temp2 = exp(-x - temp1);
     return static_cast<T>(dout * (temp2 / (exp(-temp1) + temp2)));
   }