Follow review comments: move ARM math to lite/arm/math and rename ARMContext methods

tensor-tang · tensor-tang · commit 86d6b148b58a · 2019-05-28T07:11:34.000Z
diff --git a/paddle/fluid/lite/CMakeLists.txt b/paddle/fluid/lite/CMakeLists.txt
@@ -118,6 +118,7 @@ endfunction()
 
 add_subdirectory(core)
 add_subdirectory(x86)
+add_subdirectory(arm)
 add_subdirectory(host)
 add_subdirectory(cuda)
 add_subdirectory(operators)
diff --git a/paddle/fluid/lite/arm/CMakeLists.txt b/paddle/fluid/lite/arm/CMakeLists.txt
@@ -0,0 +1,2 @@
+
+add_subdirectory(math)
diff --git a/paddle/fluid/lite/arm/math/CMakeLists.txt b/paddle/fluid/lite/arm/math/CMakeLists.txt
@@ -1,2 +1,2 @@
 
-cc_library(math_arm SRCS funcs.cc packed_sgemm.cc DEPS ${lite_kernel_deps})
+cc_library(math_arm SRCS funcs.cc packed_sgemm.cc DEPS ${lite_kernel_deps} eigen3)
diff --git a/paddle/fluid/lite/arm/math/funcs.cc b/paddle/fluid/lite/arm/math/funcs.cc
@@ -12,12 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/lite/kernels/arm/math/funcs.h"
+#include "paddle/fluid/lite/arm/math/funcs.h"
 #include <arm_neon.h>
 
 namespace paddle {
 namespace lite {
-namespace kernels {
 namespace arm {
 namespace math {
 
@@ -153,6 +152,5 @@ void fill_bias_fc<int>(int *tensor, const int *bias, const int num,
 
 }  // namespace math
 }  // namespace arm
-}  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
diff --git a/paddle/fluid/lite/arm/math/funcs.h b/paddle/fluid/lite/arm/math/funcs.h
@@ -14,20 +14,40 @@
 
 #pragma once
 
+#include <Eigen/Core>
 #include <cmath>
-#include "paddle/fluid/lite/kernels/arm/math/packed_sgemm.h"
+
+#include "paddle/fluid/lite/arm/math/packed_sgemm.h"
 
 namespace paddle {
 namespace lite {
-namespace kernels {
 namespace arm {
 namespace math {
 
 template <typename T>
 void fill_bias_fc(T* tensor, const T* bias, const int num, const int channel);
 
+template <typename T>  // Eigen FC: out = x * w, plus b on every row if b set
+void fc_compute_eigen(const T* x, int x_h, int x_w,  // input, x_h x x_w
+                      const T* w, int w_h, int w_w,  // weights, w_h x w_w
+                      const T* b,                    // optional bias, w_w
+                      T* out) {                      // result, x_h x w_w
+  using matrix_t =
+      Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+  // View the caller's buffers as row-major matrices.
+  Eigen::Map<const matrix_t> X(x, x_h, x_w);
+  Eigen::Map<const matrix_t> W(w, w_h, w_w);
+  Eigen::Map<matrix_t> Out(out, x_h, w_w);
+  // Matrix product; assumes x_w == w_h, which is not checked here.
+  Out = X * W;
+  // Bias is optional: when given, add it to each row of the product.
+  if (b) {
+    Eigen::Map<const Eigen::Matrix<T, 1, Eigen::Dynamic>> B(b, w_w);
+    Out = Out.array().rowwise() + B.array();
+  }
+}
+
 }  // namespace math
 }  // namespace arm
-}  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
diff --git a/paddle/fluid/lite/arm/math/packed_sgemm.cc b/paddle/fluid/lite/arm/math/packed_sgemm.cc
@@ -12,12 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/lite/kernels/arm/math/packed_sgemm.h"
+#include "paddle/fluid/lite/arm/math/packed_sgemm.h"
 #include <arm_neon.h>
 
 namespace paddle {
 namespace lite {
-namespace kernels {
 namespace arm {
 namespace math {
 
@@ -68,7 +67,7 @@ void prepackA(float *out, const float *in, const int ldin, const int m0,
     prepackA_8x12(out, in, ldin, m0, mmax, k0, kmax);
   }
 #else
-  if (ctx->get_arch() == kA73) {
+  if (ctx->arch() == kA73) {
     if (is_trans) {
       prepackA_trans_4x8(out, in, ldin, m0, mmax, k0, kmax);
     } else {
@@ -86,7 +85,7 @@ void prepackA(float *out, const float *in, const int ldin, const int m0,
 
 void prepackA(TensorLite *tout, const TensorLite &tin, int m, int k, int group,
               bool is_trans, ARMContext *ctx) {
-  int hblock = get_hblock(ctx->get_arch());
+  int hblock = get_hblock(ctx->arch());
   int m_roundup = hblock * ((m + hblock - 1) / hblock);
   int group_size_round_up = ((m_roundup * k + 15) / 16) * 16;
   if (tout->numel() < group_size_round_up * group) {
@@ -112,7 +111,7 @@ void sgemm_prepack(const float *A_packed, const float *B, const float *bias,
   sgemm_conv_8x12(A_packed, B, bias, C, M, N, K, is_bias, is_relu, is_transB,
                   ctx);
 #else   // armv7
-  if (ctx->get_arch() == kA73) {
+  if (ctx->arch() == kA73) {
     sgemm_conv_4x8(A_packed, B, bias, C, M, N, K, is_bias, is_relu, is_transB,
                    ctx);
   } else {
@@ -1521,8 +1520,8 @@ void sgemm_conv_8x12(const float *A_packed, const float *B, const float *bias,
                      bool transB, ARMContext *ctx) {
   size_t l2_cache =
       ctx->l2_cache_size() > 0 ? ctx->l2_cache_size() : 512 * 1024;
-  float *workspace = ctx->get_workspace_data<float>();
-  int threads = ctx->get_threads();
+  float *workspace = ctx->workspace_data<float>();
+  int threads = ctx->threads();
   //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2
   int x_block = (l2_cache - (MBLOCK * K)) / (sizeof(float) * (K + MBLOCK));
   x_block /= NBLOCK;
@@ -2359,8 +2358,8 @@ void sgemm_conv_6x8(const float* A_packed, const float* B, const float* bias,
                     bool transB, ARMContext* ctx) {
   size_t l2_cache =
       ctx->l2_cache_size() > 0 ? ctx->l2_cache_size() : 512 * 1024;
-  auto* workspace = ctx->get_workspace_data<float>();
-  int threads = ctx->get_threads();
+  auto* workspace = ctx->workspace_data<float>();
+  int threads = ctx->threads();
   //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2
   int x_block =
       (l2_cache - (MBLOCK_OTH * K)) / (sizeof(float) * (K + MBLOCK_OTH));
@@ -2753,7 +2752,7 @@ void sgemm_conv_4x8(const float* A_packed, const float* B, const float* bias,
   size_t l2_cache =
       ctx->l2_cache_size() > 0 ? ctx->l2_cache_size() : 512 * 1024;
   void* workspace = ctx->get_work_space();
-  int threads = ctx->get_threads();
+  int threads = ctx->threads();
   //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2
   int x_block =
       (l2_cache - (MBLOCK_A73 * K)) / (sizeof(float) * (K + MBLOCK_A73));
@@ -3046,6 +3045,5 @@ void sgemm_conv_4x8(const float* A_packed, const float* B, const float* bias,
 
 }  // namespace math
 }  // namespace arm
-}  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
diff --git a/paddle/fluid/lite/arm/math/packed_sgemm.h b/paddle/fluid/lite/arm/math/packed_sgemm.h
@@ -21,7 +21,6 @@
 
 namespace paddle {
 namespace lite {
-namespace kernels {
 namespace arm {
 namespace math {
 
@@ -57,6 +56,5 @@ void sgemm_prepack(const float* A_packed, const float* B, const float* bias,
 
 }  // namespace math
 }  // namespace arm
-}  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
diff --git a/paddle/fluid/lite/core/context.cc b/paddle/fluid/lite/core/context.cc
@@ -33,7 +33,7 @@ namespace lite {
 
 #ifdef LITE_WITH_ARM
 
-void ARMContext::set_cache(int l1size, int l2size, int l3size) {
+void ARMContext::SetCache(int l1size, int l2size, int l3size) {
   DeviceInfo& dev = DeviceInfo::Global();
   int cpu_count = arm_get_cpucount();
   dev.L1_cache_.resize(cpu_count);
@@ -62,9 +62,9 @@ ARMContext::ARMContext() {
 #endif
 }
 
-PowerMode ARMContext::get_mode() const { return mode_; }
+PowerMode ARMContext::mode() const { return mode_; }
 
-int ARMContext::get_threads() const { return active_ids_.size(); }
+int ARMContext::threads() const { return active_ids_.size(); }
 
 ARMContext::ARMContext(const ARMContext& ctx) {
   mode_ = ctx.mode_;
@@ -83,7 +83,7 @@ ARMContext& ARMContext::operator=(const ARMContext& ctx) {
   return *this;
 }
 
-void ARMContext::bind_dev() {
+void ARMContext::BindDev() {
 #ifdef USE_OPENMP
   int num_threads = active_ids_.size();
   omp_set_num_threads(num_threads);
@@ -116,7 +116,7 @@ void ARMContext::bind_dev() {
 #endif  // USE_OPENMP
 }
 
-void ARMContext::set_run_mode(PowerMode mode, int threads) {
+void ARMContext::SetRunMode(PowerMode mode, int threads) {
   DeviceInfo& dev = DeviceInfo::Global();
   int big_core_size = dev.big_core_ids_.size();
   int small_core_size = dev.little_core_ids_.size();
@@ -271,7 +271,7 @@ void ARMContext::set_run_mode(PowerMode mode, int threads) {
     omp_set_num_threads(threads);
   } else {
     if (check_online(active_ids_)) {
-      bind_dev();
+      BindDev();
     } else {
       LOG(ERROR) << "core id " << active_ids_[0]
                  << " is offline, switch to NO BIND MODE";
@@ -293,9 +293,9 @@ void ARMContext::set_run_mode(PowerMode mode, int threads) {
   arch_ = DeviceInfo::Global().archs_[active_ids_[0]];
 }
 
-ARMArch ARMContext::get_arch() const { return arch_; }
+ARMArch ARMContext::arch() const { return arch_; }
 
-void ARMContext::set_arch(ARMArch arch) { arch_ = arch; }
+void ARMContext::SetArch(ARMArch arch) { arch_ = arch; }
 
 int ARMContext::l1_cache_size() const {
   DeviceInfo& dev = DeviceInfo::Global();
@@ -312,7 +312,7 @@ int ARMContext::l3_cache_size() const {
   return dev.L3_cache_[active_ids_[0]];
 }
 
-bool ARMContext::workspace_extend(DDimLite dims) {
+bool ARMContext::ExtendWorkspace(DDimLite dims) {
   auto count = dims.product();
   auto old = workspace_.dims();
   if (count == old.product()) {
diff --git a/paddle/fluid/lite/core/context.h b/paddle/fluid/lite/core/context.h
@@ -45,26 +45,29 @@ struct ARMContext {
 
   ARMContext& operator=(const ARMContext& ctx);
 
-  void set_run_mode(PowerMode mode, int threads);
-  void bind_dev();
-  PowerMode get_mode() const;
-  int get_threads() const;
-  void set_cache(int l1size, int l2size, int l3size);
+  void SetRunMode(PowerMode mode, int threads);
+  void SetCache(int l1size, int l2size, int l3size);
+  void SetArch(ARMArch arch);
+  void BindDev();
+
+  PowerMode mode() const;
+  int threads() const;
+  ARMArch arch() const;
+
   template <typename T>
-  T* get_workspace_data() {
+  T* workspace_data() {
     return workspace_.mutable_data<T>();
   }
-  ARMArch get_arch() const;
-  void set_arch(ARMArch arch);
+
   int l1_cache_size() const;
   int l2_cache_size() const;
   int l3_cache_size() const;
-  bool workspace_extend(DDimLite dims);
+  bool ExtendWorkspace(DDimLite dims);
 
  private:
-  //! LITE_POWER_HIGH stands for using big cores,
-  //! LITE_POWER_LOW stands for using small core,
-  //! LITE_POWER_FULL stands for using all cores
+  // PowerMode values (see mode_ below): LITE_POWER_HIGH uses the big cores,
+  // LITE_POWER_LOW uses the small cores,
+  // LITE_POWER_FULL uses all cores.
   ARMArch arch_;
   PowerMode mode_;
   std::vector<int> active_ids_;
diff --git a/paddle/fluid/lite/kernels/arm/CMakeLists.txt b/paddle/fluid/lite/kernels/arm/CMakeLists.txt
@@ -4,8 +4,6 @@ endif()
 
 message(STATUS "compile with lite ARM kernels")
 
-add_subdirectory(math)
-
 cc_library(fc_compute_arm SRCS fc_compute.cc DEPS ${lite_kernel_deps} math_arm)
 cc_library(relu_compute_arm SRCS relu_compute.cc DEPS ${lite_kernel_deps})
 cc_library(mul_compute_arm SRCS mul_compute.cc DEPS ${lite_kernel_deps} eigen3)
diff --git a/paddle/fluid/lite/kernels/arm/fc_compute.cc b/paddle/fluid/lite/kernels/arm/fc_compute.cc
@@ -13,9 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/lite/kernels/arm/fc_compute.h"
+#include "paddle/fluid/lite/arm/math/funcs.h"
 #include "paddle/fluid/lite/core/op_registry.h"
 #include "paddle/fluid/lite/core/type_system.h"
-#include "paddle/fluid/lite/kernels/arm/math/funcs.h"
 
 namespace paddle {
 namespace lite {
@@ -42,15 +42,16 @@ void FcCompute::Run() {
   CHECK_EQ(x_w, static_cast<int>(w_dims[0]));
   auto& ctx = this->ctx_->template As<ARMContext>();
   if (x_h > 1) {
-    float* packed_in = static_cast<float*>(ctx.get_workspace_data<float>()) +
+    float* packed_in = static_cast<float*>(ctx.workspace_data<float>()) +
                        ctx.l2_cache_size() / sizeof(float);
-    math::prepackA(packed_in, i_data, x_w, 0, x_h, 0, x_w, false, &ctx);
-    math::sgemm_prepack(packed_in, w_data, b_data, o_data, x_h, n, x_w, false,
-                        false, false, &ctx);
+    lite::arm::math::prepackA(packed_in, i_data, x_w, 0, x_h, 0, x_w, false,
+                              &ctx);
+    lite::arm::math::sgemm_prepack(packed_in, w_data, b_data, o_data, x_h, n,
+                                   x_w, false, false, false, &ctx);
 
     if (param.bias) {
       CHECK_EQ(param.bias->numel(), n);
-      math::fill_bias_fc(o_data, b_data, x_h, n);
+      lite::arm::math::fill_bias_fc(o_data, b_data, x_h, n);
     }
   } else {
     // use sgemmv
diff --git a/paddle/fluid/lite/kernels/arm/fc_compute_test.cc b/paddle/fluid/lite/kernels/arm/fc_compute_test.cc
@@ -13,37 +13,16 @@
 // limitations under the License.
 
 #include "paddle/fluid/lite/kernels/arm/fc_compute.h"
-#include <Eigen/Core>  // move to math
-
 #include <gtest/gtest.h>
 #include <vector>
+#include "paddle/fluid/lite/arm/math/funcs.h"
 #include "paddle/fluid/lite/core/op_registry.h"
 
 namespace paddle {
 namespace lite {
 namespace kernels {
 namespace arm {
 
-template <typename T>
-void fc_compute_eigen(const T* x, int x_h, int x_w,  //
-                      const T* w, int w_h, int w_w,  //
-                      const T* b,                    //
-                      T* out) {
-  using matrix_t =
-      Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
-
-  Eigen::Map<const matrix_t> X(x, x_h, x_w);
-  Eigen::Map<const matrix_t> W(w, w_h, w_w);
-  Eigen::Map<matrix_t> Out(out, x_h, w_w);
-
-  Out = X * W;
-
-  if (b) {
-    Eigen::Map<const Eigen::Matrix<T, 1, Eigen::Dynamic>> B(b, w_w);
-    Out = Out.array().rowwise() + B.array();
-  }
-}
-
 TEST(fc_arm, retrive_op) {
   auto fc =
       KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kFloat)>("fc");
@@ -84,9 +63,9 @@ TEST(fc_arm, compare_test) {
 
   // TODO(TJ): enable bias soon
   b_data = nullptr;
-  fc_compute_eigen(x_data, batch_size, 3,  //
-                   w_data, 3, 4,           //
-                   b_data, ref_data);
+  lite::arm::math::fc_compute_eigen(x_data, batch_size, 3,  //
+                                    w_data, 3, 4,           //
+                                    b_data, ref_data);
 
   // fc compute kernel
   FcCompute fc;

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`
`2`		`-cc_library(math_arm SRCS funcs.cc packed_sgemm.cc DEPS ${lite_kernel_deps})`
	`2`	`+cc_library(math_arm SRCS funcs.cc packed_sgemm.cc DEPS ${lite_kernel_deps} eigen3)`