PaddlePaddle · tensor-tang · May 28, 2019 · May 26, 2019 · May 26, 2019 · May 27, 2019
diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake
@@ -16,6 +16,8 @@ if(NOT ANDROID)
     return()
 endif()
 
+add_definitions(-DLITE_WITH_ANDROID)
+
 if(NOT DEFINED ANDROID_NDK)
     set(ANDROID_NDK $ENV{NDK_ROOT})
     if(NOT ANDROID_NDK)

diff --git a/paddle/fluid/lite/CMakeLists.txt b/paddle/fluid/lite/CMakeLists.txt
@@ -118,6 +118,7 @@ endfunction()
 
 add_subdirectory(core)
 add_subdirectory(x86)
+add_subdirectory(arm)
 add_subdirectory(host)
 add_subdirectory(cuda)
 add_subdirectory(operators)

diff --git a/paddle/fluid/lite/arm/CMakeLists.txt b/paddle/fluid/lite/arm/CMakeLists.txt
@@ -0,0 +1,2 @@
+
+# Descend into math/, which declares the math_arm library target.
+add_subdirectory(math)
diff --git a/paddle/fluid/lite/arm/math/CMakeLists.txt b/paddle/fluid/lite/arm/math/CMakeLists.txt
@@ -0,0 +1,2 @@
+
+# Math routines for the ARM backend (funcs.cc uses NEON intrinsics).
+# NOTE(review): funcs.cc includes <arm_neon.h>, and this target is declared
+# without any architecture guard in this file -- confirm it is never compiled
+# for non-ARM hosts, or wrap it in the appropriate option check.
+cc_library(math_arm SRCS funcs.cc packed_sgemm.cc DEPS ${lite_kernel_deps} eigen3)
diff --git a/paddle/fluid/lite/arm/math/funcs.cc b/paddle/fluid/lite/arm/math/funcs.cc
@@ -0,0 +1,156 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/arm/math/funcs.h"
+#include <arm_neon.h>
+
+namespace paddle {
+namespace lite {
+namespace arm {
+namespace math {
+
+// Adds a per-channel bias, in place, to every row of a row-major
+// [num x channel] float buffer: tensor[j * channel + c] += bias[c].
+//
+// tensor  : in/out buffer; row j starts at tensor + j * channel.
+// bias    : bias values, re-read from the beginning for each row.
+// num     : number of rows.
+// channel : number of elements per row (and of bias values consumed per row).
+template <>
+void fill_bias_fc<float>(float *tensor, const float *bias, const int num,
+                         const int channel) {
+  // Each row is handled as `cnt` blocks of 16 floats (four NEON q-registers)
+  // followed by a scalar tail of `remain` (< 16) elements.
+  const int cnt = channel >> 4;
+  const int remain = channel & 15;
+
+  for (int j = 0; j < num; ++j) {
+    const float *ptr_bias = bias;
+    float *ptr_out = tensor + j * channel;
+
+    for (int i = 0; i < cnt; ++i) {
+      const float32x4_t vin1 = vld1q_f32(ptr_out);
+      const float32x4_t vb1 = vld1q_f32(ptr_bias);
+
+      const float32x4_t vin2 = vld1q_f32(ptr_out + 4);
+      const float32x4_t vb2 = vld1q_f32(ptr_bias + 4);
+
+      const float32x4_t vin3 = vld1q_f32(ptr_out + 8);
+      const float32x4_t vb3 = vld1q_f32(ptr_bias + 8);
+
+      const float32x4_t vin4 = vld1q_f32(ptr_out + 12);
+      const float32x4_t vb4 = vld1q_f32(ptr_bias + 12);
+
+      vst1q_f32(ptr_out, vaddq_f32(vin1, vb1));
+      vst1q_f32(ptr_out + 4, vaddq_f32(vin2, vb2));
+      vst1q_f32(ptr_out + 8, vaddq_f32(vin3, vb3));
+      vst1q_f32(ptr_out + 12, vaddq_f32(vin4, vb4));
+
+      ptr_out += 16;
+      ptr_bias += 16;
+    }
+
+    // Use a fresh counter for every row. Counting `remain` itself down to
+    // zero would leave nothing for the following rows, so their trailing
+    // (channel % 16) elements would never receive the bias.
+    for (int r = 0; r < remain; ++r) {
+      *(ptr_out++) += *(ptr_bias++);
+    }
+  }
+}
+
+// Adds a per-channel bias, in place, to every row of a row-major
+// [num x channel] int buffer: tensor[j * channel + c] += bias[c].
+//
+// tensor  : in/out buffer; row j starts at tensor + j * channel.
+// bias    : bias values, re-read from the beginning for each row.
+// num     : number of rows.
+// channel : number of elements per row (and of bias values consumed per row).
+template <>
+void fill_bias_fc<int>(int *tensor, const int *bias, const int num,
+                       const int channel) {
+  // Each row is handled as `cnt` blocks of 16 ints (four NEON q-registers)
+  // followed by a scalar tail of `remain` (< 16) elements.
+  const int cnt = channel >> 4;
+  const int remain = channel & 15;
+
+  for (int j = 0; j < num; ++j) {
+    const int *ptr_bias = bias;
+    int *ptr_out = tensor + j * channel;
+
+    for (int i = 0; i < cnt; ++i) {
+      const int32x4_t vin1 = vld1q_s32(ptr_out);
+      const int32x4_t vb1 = vld1q_s32(ptr_bias);
+
+      const int32x4_t vin2 = vld1q_s32(ptr_out + 4);
+      const int32x4_t vb2 = vld1q_s32(ptr_bias + 4);
+
+      const int32x4_t vin3 = vld1q_s32(ptr_out + 8);
+      const int32x4_t vb3 = vld1q_s32(ptr_bias + 8);
+
+      const int32x4_t vin4 = vld1q_s32(ptr_out + 12);
+      const int32x4_t vb4 = vld1q_s32(ptr_bias + 12);
+
+      vst1q_s32(ptr_out, vaddq_s32(vin1, vb1));
+      vst1q_s32(ptr_out + 4, vaddq_s32(vin2, vb2));
+      vst1q_s32(ptr_out + 8, vaddq_s32(vin3, vb3));
+      vst1q_s32(ptr_out + 12, vaddq_s32(vin4, vb4));
+
+      ptr_out += 16;
+      ptr_bias += 16;
+    }
+
+    // Use a fresh counter for every row. Counting `remain` itself down to
+    // zero would leave nothing for the following rows, so their trailing
+    // (channel % 16) elements would never receive the bias.
+    for (int r = 0; r < remain; ++r) {
+      *(ptr_out++) += *(ptr_bias++);
+    }
+  }
+}
+
+}  // namespace math
+}  // namespace arm
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/arm/math/funcs.h b/paddle/fluid/lite/arm/math/funcs.h
@@ -0,0 +1,53 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <Eigen/Core>
+#include <cmath>
+
+#include "paddle/fluid/lite/arm/math/packed_sgemm.h"
+
+namespace paddle {
+namespace lite {
+namespace arm {
+namespace math {
+
+// In-place bias add over a row-major [num x channel] buffer: intended to add
+// bias[c] to tensor[j * channel + c] for each row j and channel c.
+// Only declared here; explicit specializations for float and int are defined
+// in funcs.cc.
+template <typename T>
+void fill_bias_fc(T* tensor, const T* bias, const int num, const int channel);
+
+// Fully-connected forward pass via Eigen: out = x * w, with b added to every
+// output row when b is non-null.
+//
+// x   : row-major [x_h, x_w] input, mapped in place.
+// w   : row-major [w_h, w_w] weights, mapped in place.
+// b   : optional bias of w_w elements; a null pointer skips the bias add.
+// out : row-major [x_h, w_w] destination, overwritten with the result.
+//
+// NOTE(review): there is no explicit check here that x_w == w_h; presumably
+// the caller guarantees compatible shapes -- confirm.
+template <typename T>
+void fc_compute_eigen(const T* x, int x_h, int x_w,  //
+                      const T* w, int w_h, int w_w,  //
+                      const T* b,                    //
+                      T* out) {
+  using RowMajorMat =
+      Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+  using RowVec = Eigen::Matrix<T, 1, Eigen::Dynamic>;
+
+  // Non-owning views over the caller's buffers.
+  Eigen::Map<const RowMajorMat> input(x, x_h, x_w);
+  Eigen::Map<const RowMajorMat> weight(w, w_h, w_w);
+  Eigen::Map<RowMajorMat> output(out, x_h, w_w);
+
+  output = input * weight;
+
+  if (b == nullptr) {
+    return;
+  }
+
+  // Broadcast the bias row vector across all rows of the product.
+  Eigen::Map<const RowVec> bias(b, w_w);
+  output = output.array().rowwise() + bias.array();
+}
+
+}  // namespace math
+}  // namespace arm
+}  // namespace lite
+}  // namespace paddle
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@

		cc_library(math_arm SRCS funcs.cc packed_sgemm.cc DEPS ${lite_kernel_deps} eigen3)