[ARM] Add silu op (#9280)

wz1qqx · web-flow · commit 663ed1641198 · 2022-11-17T12:41:29.000+08:00
* add silu op and Python unit test
diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc
@@ -52,6 +52,7 @@ const std::string& ActivationTypeToStr(ActivationType act) {
                                            "PRelu",
                                            "LeakyRelu",
                                            "Sigmoid",
+                                           "Silu",
                                            "Tanh",
                                            "Swish",
                                            "Exp",
diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h
@@ -143,7 +143,8 @@ enum class ActivationType : int {
   kSign = 20,
   kSoftPlus = 21,
   kMish = 22,
-  NUM = 23,
+  kSilu = 23,
+  NUM = 24,
 };
 
 static size_t PrecisionTypeLength(PrecisionType type) {
diff --git a/lite/backends/arm/math/activation.cc b/lite/backends/arm/math/activation.cc
@@ -1118,6 +1118,50 @@ void mish(const float* din, float* dout, int size, float threshold) {
     dout[i] = x * std::tanh(sp);
   }
 }
+
+template <>
+void act_silu<float>(const float* din, float* dout, int size, int threads) {
+  int nums_per_thread = size / threads;  // NOTE(review): assumes threads > 0
+  int remain = size - threads * nums_per_thread;
+  int neon_loop_cnt_dim4 = nums_per_thread >> 2;
+  int neon_loop_remain_dim4 = nums_per_thread - (neon_loop_cnt_dim4 << 2);
+  // SiLU: dout[k] = din[k] / (1 + exp(-din[k])). Chunk i starts at offset
+  // i * nums_per_thread: 4-wide NEON loop first, then a scalar tail.
+  LITE_PARALLEL_BEGIN(i, tid, threads) {
+    float32x4_t x_vec = vdupq_n_f32(0.0f);
+    float32x4_t exp_vec = vdupq_n_f32(0.0f);
+    float32x4_t recip = vdupq_n_f32(0.0f);
+    const float* ptr_in_thread = din + i * nums_per_thread;
+    float* ptr_out_thread = dout + i * nums_per_thread;
+    for (int k = 0; k < neon_loop_cnt_dim4; ++k) {
+      x_vec = vld1q_f32(ptr_in_thread);
+      exp_vec = exp_ps(vnegq_f32(x_vec));
+      exp_vec = vaddq_f32(exp_vec, vdupq_n_f32(1.0f));
+      recip = vrecpeq_f32(exp_vec);
+      // Two Newton-Raphson steps refine the vrecpeq_f32 reciprocal estimate.
+      recip = vmulq_f32(vrecpsq_f32(exp_vec, recip), recip);
+      recip = vmulq_f32(vrecpsq_f32(exp_vec, recip), recip);
+      recip = vmulq_f32(x_vec, recip);
+      vst1q_f32(ptr_out_thread, recip);
+      ptr_out_thread += 4;
+      ptr_in_thread += 4;
+    }
+    for (int j = 0; j < neon_loop_remain_dim4; ++j) {
+      ptr_out_thread[0] = ptr_in_thread[0] / (1 + expf(-ptr_in_thread[0]));
+      ptr_in_thread++;
+      ptr_out_thread++;
+    }
+  }
+  LITE_PARALLEL_END();
+  float* ptr_out = dout + threads * nums_per_thread;
+  const float* ptr_in = din + threads * nums_per_thread;
+  for (int j = 0; j < remain; ++j) {  // leftover after the even split
+    ptr_out[0] = ptr_in[0] / (1 + expf(-ptr_in[0]));
+    ptr_in++;
+    ptr_out++;
+  }
+}
+
 }  // namespace math
 }  // namespace arm
 }  // namespace lite
diff --git a/lite/backends/arm/math/activation.h b/lite/backends/arm/math/activation.h
@@ -111,6 +111,9 @@ void softplus(const T* din, T* dout, int size, float beta, int threads);
 template <typename T>
 void mish(const T* din, T* dout, int size, float threshold);
 
+template <typename T>  // SiLU: dout[i] = din[i] / (1 + exp(-din[i]))
+void act_silu(const T* din, T* dout, int size, int threads);
+
 }  // namespace math
 }  // namespace arm
 }  // namespace lite
diff --git a/lite/kernels/arm/activation_compute.cc b/lite/kernels/arm/activation_compute.cc
@@ -169,6 +169,16 @@ void EluCompute::Run() {
       x_data, output_data, x_dims.production(), alpha, ctx.threads());
 }
 
+void SiluCompute::Run() {
+  // ARM SiLU: pass the flattened float buffers of X and Out to act_silu.
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->template As<ARMContext>();
+  const auto numel = param.X->dims().production();
+  const auto* src = param.X->data<float>();
+  auto* dst = param.Out->mutable_data<float>();
+  lite::arm::math::act_silu<float>(src, dst, numel, ctx.threads());
+}
+
 }  // namespace arm
 }  // namespace kernels
 }  // namespace lite
@@ -276,3 +286,8 @@ REGISTER_LITE_KERNEL(
     .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
     .Finalize();
+REGISTER_LITE_KERNEL(  // registers SiluCompute for "silu": kARM/kFloat/kNCHW
+    silu, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::SiluCompute, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
+    .Finalize();
diff --git a/lite/kernels/arm/activation_compute.h b/lite/kernels/arm/activation_compute.h
@@ -98,6 +98,15 @@ class EluCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
   virtual ~EluCompute() = default;
 };
 
+class SiluCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::ActivationParam;  // params read by Run()
+  // Writes Out = X / (1 + exp(-X)) element-wise via arm::math::act_silu.
+  void Run() override;
+
+  virtual ~SiluCompute() = default;
+};
+
 }  // namespace arm
 }  // namespace kernels
 }  // namespace lite
diff --git a/lite/kernels/host/activation_compute.cc b/lite/kernels/host/activation_compute.cc
@@ -289,6 +289,17 @@ void SoftplusCompute::Run() {
   }
 }
 
+void SiluCompute::Run() {  // SiLU: Out[i] = X[i] / (1 + exp(-X[i]))
+  auto& param = this->Param<param_t>();
+  CHECK(param.X);
+  const int64_t numel = param.X->dims().production();  // hoisted, 64-bit
+  const float* x_data = param.X->data<float>();
+  float* output_data = param.Out->mutable_data<float>();
+  for (int64_t i = 0; i < numel; i++) {
+    output_data[i] = x_data[i] / (1 + std::exp(-x_data[i]));
+  }
+}
+
 }  // namespace host
 }  // namespace kernels
 }  // namespace lite
@@ -435,3 +446,8 @@ REGISTER_LITE_KERNEL(softplus,
     .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
     .Finalize();
+REGISTER_LITE_KERNEL(  // registers SiluCompute for "silu": kHost/kFloat/kNCHW
+    silu, kHost, kFloat, kNCHW, paddle::lite::kernels::host::SiluCompute, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
+    .Finalize();
diff --git a/lite/kernels/host/activation_compute.h b/lite/kernels/host/activation_compute.h
@@ -203,6 +203,15 @@ class SoftplusCompute : public KernelLite<TARGET(kHost), PRECISION(kFloat)> {
   virtual ~SoftplusCompute() = default;
 };
 
+class SiluCompute : public KernelLite<TARGET(kHost), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::ActivationParam;  // params read by Run()
+  // Writes Out = X / (1 + exp(-X)) element-wise with a plain scalar loop.
+  void Run() override;
+
+  virtual ~SiluCompute() = default;
+};
+
 }  // namespace host
 }  // namespace kernels
 }  // namespace lite
diff --git a/lite/operators/activation_ops.cc b/lite/operators/activation_ops.cc
@@ -59,6 +59,8 @@ bool ActivationOp::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) {
     param_.hard_sigmoid_offset = opdesc.GetAttr<float>("offset");
   } else if (opdesc.Type() == "sigmoid") {
     param_.active_type = lite_api::ActivationType::kSigmoid;
+  } else if (opdesc.Type() == "silu") {
+    param_.active_type = lite_api::ActivationType::kSilu;
   } else if (opdesc.Type() == "tanh") {
     param_.active_type = lite_api::ActivationType::kTanh;
   } else if (opdesc.Type() == "exp") {
@@ -140,3 +142,4 @@ REGISTER_LITE_OP(thresholded_relu, paddle::lite::operators::ActivationOp);
 REGISTER_LITE_OP(elu, paddle::lite::operators::ActivationOp);
 REGISTER_LITE_OP(erf, paddle::lite::operators::ActivationOp);
 REGISTER_LITE_OP(softplus, paddle::lite::operators::ActivationOp);
+REGISTER_LITE_OP(silu, paddle::lite::operators::ActivationOp);
diff --git a/lite/operators/activation_ops.h b/lite/operators/activation_ops.h
@@ -109,6 +109,9 @@ class ActivationOp : public OpLite {
       case lite_api::ActivationType::kSoftPlus:
         ch->macs = param_.X->numel();
         break;
+      case lite_api::ActivationType::kSilu:
+        ch->macs = param_.X->numel();
+        break;
       default:
         LOG(FATAL) << "This Type of Activation:"
                    << static_cast<int>(param_.active_type)
diff --git a/lite/tests/kernels/activation_compute_test.cc b/lite/tests/kernels/activation_compute_test.cc
@@ -29,6 +29,7 @@ enum activation_type_test {
   RELU_CLIPPED,
   PRELU,
   SIGMOID,
+  SILU,
   TANH,
   SWISH,
   RELU6,
@@ -165,6 +166,12 @@ class ActivationComputeTester : public arena::TestCase {
         }
         break;
       }
+      case SILU: {
+        for (int i = 0; i < dims_.production(); i++) {
+          output_data[i] = x_data[i] / (1.f + std::exp(-x_data[i]));
+        }
+        break;
+      }
       case TANH: {
         for (int i = 0; i < dims_.production(); i++) {
           output_data[i] = (std::exp(x_data[i]) - std::exp(-x_data[i])) /
@@ -674,6 +681,32 @@ TEST(Activation_sigmoid, precision) {
   }
 }
 
+TEST(Activation_silu, precision) {
+  // Checks the "silu" kernel against the SILU reference; ARM builds only.
+  const float abs_error = 2e-5;
+  const std::vector<std::vector<int64_t>> test_dims{
+      {1, 3, 2, 4}, {2, 3, 4}, {5, 4}, {8}};
+  Place place;
+#if defined(LITE_WITH_ARM)
+  place = TARGET(kARM);
+#else
+  return;
+#endif
+  for (const auto& dims : test_dims) {
+    TestAct(place,
+            "def",
+            0.01,
+            6.,
+            "all",
+            0.,
+            1.0,
+            DDim(dims),
+            "silu",
+            SILU,
+            abs_error);
+  }
+}
+
 TEST(Activation_tanh, precision) {
   Place place;
   float abs_error = 2e-5;
diff --git a/lite/tests/unittest_py/op/test_silu_op.py b/lite/tests/unittest_py/op/test_silu_op.py
@@ -0,0 +1,101 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+sys.path.append('../')
+
+from auto_scan_test import AutoScanTest, IgnoreReasons
+from program_config import TensorConfig, ProgramConfig, OpConfig, CxxConfig, TargetType, PrecisionType, DataLayoutType, Place
+import unittest
+from functools import partial
+import hypothesis
+from hypothesis import given, settings, seed, example, assume
+import hypothesis.strategies as st
+import numpy as np
+
+
+class TestSiluOp(AutoScanTest):
+    def __init__(self, *args, **kwargs):
+        AutoScanTest.__init__(self, *args, **kwargs)
+        self.enable_testing_on_place(
+            TargetType.ARM, [PrecisionType.FP32],
+            DataLayoutType.NCHW,
+            thread=[1, 4])
+
+    def is_program_valid(self,
+                         program_config: ProgramConfig,
+                         predictor_config: CxxConfig) -> bool:
+        return True
+
+    def sample_program_configs(self, draw):
+        in_shape = draw(
+            st.lists(
+                st.integers(
+                    min_value=1, max_value=64), min_size=1, max_size=4))
+
+        def generate_input(*args, **kwargs):
+            return np.random.normal(-1, 1.0, in_shape).astype(np.float32)
+
+        ops_config = OpConfig(
+            type="silu",
+            inputs={"X": ["input_data"]},
+            outputs={"Out": ["output_data"]},
+            attrs={})
+
+        program_config = ProgramConfig(
+            ops=[ops_config],
+            weights={},
+            inputs={
+                "input_data": TensorConfig(data_gen=partial(generate_input))
+            },
+            outputs=["output_data"])
+
+        return program_config
+
+    def sample_predictor_configs(self):
+        atol, rtol = 1e-5, 1e-5
+        target_str = self.get_target()
+        if target_str == "Metal":
+            atol, rtol = 5e-4, 5e-4
+        return self.get_predictor_configs(), ["silu"], (atol, rtol)
+
+    def add_ignore_pass_case(self):
+        def teller1(program_config, predictor_config):
+            x_shape = list(program_config.inputs["input_data"].shape)
+            if predictor_config.target() == TargetType.Metal:
+                if len(x_shape) != 4:
+                    return True
+
+        self.add_ignore_check_case(
+            teller1, IgnoreReasons.PADDLELITE_NOT_SUPPORT,
+            "Lite does not support this op in a specific case on metal. We need to fix it as soon as possible."
+        )
+
+        def teller2(program_config, predictor_config):
+            if "nvidia_tensorrt" in self.get_nnadapter_device_name():
+                in_shape = program_config.inputs["input_data"].shape
+                if len(in_shape) == 1:
+                    return True
+
+        self.add_ignore_check_case(
+            teller2, IgnoreReasons.PADDLELITE_NOT_SUPPORT,
+            "Lite does not support 'in_shape_size == 1' on nvidia_tensorrt.")
+
+    def test(self, *args, **kwargs):
+        target_str = self.get_target()
+        self.run_and_statis(quant=False, max_examples=25)
+
+
+if __name__ == "__main__":
+    unittest.main(argv=[''])  # explicit argv: unittest does not read sys.argv