PaddlePaddle · hong19860320 · Jun 3, 2019 · May 31, 2019 · Jun 3, 2019 · Jun 3, 2019
diff --git a/paddle/fluid/lite/arm/math/CMakeLists.txt b/paddle/fluid/lite/arm/math/CMakeLists.txt
@@ -6,4 +6,4 @@ if(NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM))
   return()
 endif()
 
-cc_library(math_arm SRCS funcs.cc packed_sgemm.cc softmax.cc DEPS ${lite_kernel_deps} eigen3)
+cc_library(math_arm SRCS funcs.cc packed_sgemm.cc softmax.cc scale.cc DEPS ${lite_kernel_deps} eigen3)
diff --git a/paddle/fluid/lite/arm/math/funcs.h b/paddle/fluid/lite/arm/math/funcs.h
@@ -18,6 +18,7 @@
 #include <cmath>
 
 #include "paddle/fluid/lite/arm/math/packed_sgemm.h"
+#include "paddle/fluid/lite/arm/math/scale.h"
 #include "paddle/fluid/lite/arm/math/softmax.h"
 
 namespace paddle {

diff --git a/paddle/fluid/lite/arm/math/scale.cc b/paddle/fluid/lite/arm/math/scale.cc
@@ -0,0 +1,64 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/arm/math/scale.h"
+#include "paddle/fluid/lite/arm/math/funcs.h"
+
+namespace paddle {
+namespace lite {
+namespace arm {
+namespace math {
+
+template <>
+void scale<float>(const float* din, float* dout, int num, float scale,
+                  float bias) {
+  int cnt = num >> 4;
+  int remain = num % 16;
+  float32x4_t vscale = vdupq_n_f32(scale);
+  float32x4_t vbias = vdupq_n_f32(bias);
+#pragma omp parallel for
+  for (int i = 0; i < cnt; i++) {
+    const float* din_ptr = din + (i << 4);
+    float* dout_ptr = dout + (i << 4);
+
+    float32x4_t din0 = vld1q_f32(din_ptr);
+    float32x4_t din1 = vld1q_f32(din_ptr + 4);
+    float32x4_t din2 = vld1q_f32(din_ptr + 8);
+    float32x4_t din3 = vld1q_f32(din_ptr + 12);
+
+    float32x4_t vsum1 = vmlaq_f32(vbias, din0, vscale);
+    float32x4_t vsum2 = vmlaq_f32(vbias, din1, vscale);
+    float32x4_t vsum3 = vmlaq_f32(vbias, din2, vscale);
+    float32x4_t vsum4 = vmlaq_f32(vbias, din3, vscale);
+
+    vst1q_f32(dout_ptr, vsum1);
+    vst1q_f32(dout_ptr + 4, vsum2);
+    vst1q_f32(dout_ptr + 8, vsum3);
+    vst1q_f32(dout_ptr + 12, vsum4);
+  }
+  if (remain > 0) {
+    const float* din_ptr = din + (cnt << 4);
+    float* dout_ptr = dout + (cnt << 4);
+    for (int i = 0; i < remain; i++) {
+      *dout_ptr = *din_ptr * scale + bias;
+      dout_ptr++;
+      din_ptr++;
+    }
+  }
+}
+
+}  // namespace math
+}  // namespace arm
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/arm/math/scale.h b/paddle/fluid/lite/arm/math/scale.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+namespace paddle {
+namespace lite {
+namespace arm {
+namespace math {
+
+template <typename T>
+void scale(const T* din, T* dout, int num, float scale, float bias);
+
+}  // namespace math
+}  // namespace arm
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/core/hvy_tensor.h b/paddle/fluid/lite/core/hvy_tensor.h
@@ -102,7 +102,8 @@ class TensorHvy : public TensorBase<TensorHvy> {
     data_.ShareDataWith(other.data_);
   }
   void CopyDataFrom(const TensorHvy& other) {
-    data_.ShareDataWith(other.data_);
+    data_.mutable_data(other.data_.place(), other.data_.type());
+    TensorCopySync(other.data_, data_.place(), &data_);
   }
 
   DDimT dims() const { return DDimT(framework::vectorize(data_.dims())); }

diff --git a/paddle/fluid/lite/core/lite_tensor.h b/paddle/fluid/lite/core/lite_tensor.h
@@ -37,7 +37,7 @@ class DDimLite : public DDimBase<DDimLite> {
   void ConstructFrom(const std::vector<value_type> &x) { data_ = x; }
 
   value_type operator[](int offset) const { return data_[offset]; }
-  std::vector<int64_t> Vectorize() { return data_; }
+  std::vector<int64_t> Vectorize() const { return data_; }
 
   size_t size() const { return data_.size(); }
   bool empty() const { return data_.empty(); }

diff --git a/paddle/fluid/lite/core/tensor.h b/paddle/fluid/lite/core/tensor.h
@@ -48,7 +48,7 @@ class DDimBase {
 
   explicit DDimBase(const std::vector<int64_t> &x) { self()->ConstructFrom(x); }
   value_type operator[](int offset) const { return (*self())[offset]; }
-  std::vector<int64_t> Vectorize() { return self()->Vectorize(); }
+  std::vector<int64_t> Vectorize() const { return self()->Vectorize(); }
   size_t size() const { return const_self()->size(); }
   bool empty() const { return const_self()->empty(); }
 

diff --git a/paddle/fluid/lite/kernels/arm/CMakeLists.txt b/paddle/fluid/lite/kernels/arm/CMakeLists.txt
@@ -7,10 +7,11 @@ message(STATUS "compile with lite ARM kernels")
 cc_library(fc_compute_arm SRCS fc_compute.cc DEPS ${lite_kernel_deps} math_arm)
 cc_library(relu_compute_arm SRCS relu_compute.cc DEPS ${lite_kernel_deps})
 cc_library(mul_compute_arm SRCS mul_compute.cc DEPS ${lite_kernel_deps} eigen3)
-cc_library(scale_compute_arm SRCS scale_compute.cc DEPS ${lite_kernel_deps} eigen3)
+cc_library(scale_compute_arm SRCS scale_compute.cc DEPS ${lite_kernel_deps} math_arm)
 cc_library(softmax_compute_arm SRCS softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
 
 lite_cc_test(test_fc_compute_arm SRCS fc_compute_test.cc DEPS fc_compute_arm math_arm)
+lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm)
 lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm)
 
 set(arm_kernels

diff --git a/paddle/fluid/lite/kernels/arm/scale_compute.cc b/paddle/fluid/lite/kernels/arm/scale_compute.cc
@@ -12,39 +12,28 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <Eigen/Core>
-#include "paddle/fluid/lite/core/kernel.h"
-#include "paddle/fluid/lite/core/op_registry.h"
-#include "paddle/fluid/lite/core/types.h"
+#include "paddle/fluid/lite/kernels/arm/scale_compute.h"
+#include "paddle/fluid/lite/arm/math/funcs.h"
 
 namespace paddle {
 namespace lite {
 namespace kernels {
 namespace arm {
 
-template <typename T>
-void scale_compute(const T* x, T* out, int size, float scale, float bias,
-                   bool bias_before) {
-  if (bias_before) bias *= scale;
-  for (int i = 0; i < size; i++) {
-    out[i] = x[i] * scale + bias;
+void ScaleCompute::Run() {
+  auto& param = Param<operators::ScaleParam>();
+  const float* x_data = param.x->data<float>();
+  float* output_data = param.output->mutable_data<float>();
+  DDim x_dims = param.x->dims();
+  bool bias_after_scale = param.bias_after_scale;
+  float scale = param.scale;
+  float bias = param.bias;
+  if (!bias_after_scale) {
+    bias *= scale;
   }
+  lite::arm::math::scale(x_data, output_data, x_dims.production(), scale, bias);
 }
 
-class ScaleCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
- public:
-  using param_t = operators::MulParam;
-
-  void Run() override {
-    auto& param = Param<operators::ScaleParam>();
-    scale_compute(param.x->data<float>(), param.output->mutable_data<float>(),
-                  param.x->dims().production(), param.scale, param.bias,
-                  param.bias_after_scale);
-  }
-
-  virtual ~ScaleCompute() = default;
-};
-
 }  // namespace arm
 }  // namespace kernels
 }  // namespace lite

diff --git a/paddle/fluid/lite/kernels/arm/scale_compute.h b/paddle/fluid/lite/kernels/arm/scale_compute.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/fluid/lite/core/kernel.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+
+class ScaleCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+ public:
+  void Run() override;
+
+  virtual ~ScaleCompute() = default;
+};
+
+}  // namespace arm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/kernels/arm/scale_compute_test.cc b/paddle/fluid/lite/kernels/arm/scale_compute_test.cc
@@ -0,0 +1,106 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/kernels/arm/scale_compute.h"
+#include <gtest/gtest.h>
+#include <vector>
+#include "paddle/fluid/lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+
+template <typename dtype>
+void scale_compute_ref(const operators::ScaleParam& param) {
+  const dtype* x_data = param.x->mutable_data<const dtype>();
+  dtype* output_data = param.output->mutable_data<dtype>();
+  DDim x_dims = param.x->dims();
+  DDim output_dims = param.output->dims();
+  ASSERT_EQ(x_dims.data(), output_dims.data());
+  bool bias_after_scale = param.bias_after_scale;
+  float scale = param.scale;
+  float bias = param.bias;
+  if (!bias_after_scale) {
+    bias *= scale;
+  }
+  for (int i = 0; i < output_dims.production(); i++) {
+    output_data[i] = x_data[i] * scale + bias;
+  }
+}
+
+TEST(scale_arm, init) {
+  ScaleCompute scale;
+  ASSERT_EQ(scale.precision(), PRECISION(kFloat));
+  ASSERT_EQ(scale.target(), TARGET(kARM));
+}
+
+TEST(scale_arm, compute) {
+  ScaleCompute scale;
+  operators::ScaleParam param;
+
+  lite::Tensor x;
+  lite::Tensor output;
+  lite::Tensor output_ref;
+
+  for (auto n : {1, 3, 4, 11}) {
+    for (auto c : {1, 3, 11, 4}) {
+      for (auto h : {3, 1, 11, 4}) {
+        for (auto w : {1, 3, 4, 12}) {
+          for (auto bias_after_scale : {true, false}) {
+            for (auto s : {-100.25f, -1.0f, 0.13f, 3840.975f}) {
+              for (auto b : {-3075.495f, -15.f, 0.11234f, 128.15f}) {
+                x.Resize(DDim(std::vector<int64_t>({n, c, h, w})));
+                output.Resize(DDim(std::vector<int64_t>({n, c, h, w})));
+                output_ref.Resize(DDim(std::vector<int64_t>({n, c, h, w})));
+                auto* x_data = x.mutable_data<float>();
+                auto* output_data = output.mutable_data<float>();
+                auto* output_ref_data = output_ref.mutable_data<float>();
+                for (int i = 0; i < x.dims().production(); i++) {
+                  x_data[i] = i;
+                }
+                param.x = &x;
+                param.output = &output;
+                param.bias_after_scale = bias_after_scale;
+                param.scale = s;
+                param.bias = b;
+                scale.SetParam(param);
+                scale.Run();
+                param.output = &output_ref;
+                scale_compute_ref<float>(param);
+                for (int i = 0; i < output.dims().production(); i++) {
+                  EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST(scale, retrive_op) {
+  auto scale =
+      KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kFloat)>("scale");
+  ASSERT_FALSE(scale.empty());
+  ASSERT_TRUE(scale.front());
+}
+
+}  // namespace arm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
diff --git a/paddle/fluid/lite/kernels/arm/softmax_compute.cc b/paddle/fluid/lite/kernels/arm/softmax_compute.cc
@@ -24,15 +24,15 @@ void SoftmaxCompute::Run() {
   auto& param = Param<operators::SoftmaxParam>();
   const float* din = param.x->data<float>();
   float* dout = param.output->mutable_data<float>();
-  auto dim_x = param.x->dims();
-  auto rank_x = dim_x.size();
+  auto x_dims = param.x->dims();
+  auto x_rank = x_dims.size();
   int axis = param.axis;
   if (axis < 0) {
-    axis += rank_x;
+    axis += x_rank;
   }
-  int outer_num = dim_x.Slice(0, axis).production();
-  int inner_num = dim_x.Slice(axis + 1, rank_x).production();
-  int axis_size = dim_x[axis];
+  int outer_num = x_dims.Slice(0, axis).production();
+  int inner_num = x_dims.Slice(axis + 1, x_rank).production();
+  int axis_size = x_dims[axis];
   if (inner_num == 1) {
     if (axis_size >= 4) {
       lite::arm::math::softmax_inner1_large_axis(din, dout, outer_num,
@@ -64,10 +64,6 @@ void SoftmaxCompute::Run() {
   }
 }
 
-TargetType SoftmaxCompute::target() const { return TARGET(kARM); }
-
-PrecisionType SoftmaxCompute::precision() const { return PRECISION(kFloat); }
-
 }  // namespace arm
 }  // namespace kernels
 }  // namespace lite

diff --git a/paddle/fluid/lite/kernels/arm/softmax_compute.h b/paddle/fluid/lite/kernels/arm/softmax_compute.h
@@ -26,9 +26,6 @@ class SoftmaxCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
  public:
   void Run() override;
 
-  TargetType target() const override;
-  PrecisionType precision() const override;
-
   virtual ~SoftmaxCompute() = default;
 };