Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions lite/backends/opencl/cl_kernel/cl_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ limitations under the License. */
#define GET_VEC_TYPE(type__, size__) type__##size__
#define VECTORIZED_TYPE(type__, size__) GET_VEC_TYPE(type__, size__)
#define CL_DTYPE4 VECTORIZED_TYPE(CL_DTYPE, 4)
#define CL_DTYPE16 VECTORIZED_TYPE(CL_DTYPE, 16)
#define CL_COMPUTE_DTYPE4 VECTORIZED_TYPE(CL_COMPUTE_DTYPE, 4)

/////////////////////////////////
Expand Down
78 changes: 75 additions & 3 deletions lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#include <cl_common.h>

inline elt_fuse_func_wrapper(__read_only image2d_t second_input_image,
const int2 pos,
CL_DTYPE4 *value_p) {
inline void elt_fuse_func_wrapper(__read_only image2d_t second_input_image,
const int2 pos,
CL_DTYPE4 *value_p) {
CL_DTYPE4 second_val =
READ_IMG_TYPE(CL_DTYPE_CHAR, second_input_image, SAMPLER, pos);
*value_p += second_val;
Expand Down Expand Up @@ -2413,3 +2413,75 @@ __kernel void conv2d_1x1_mali_h2w2c2(
}
}
}

// 1x1 conv degenerated to a fully-connected layer: both input and output
// have 1x1 spatial extent, so each output channel block is the dot product
// of the entire input channel vector with one column block of `weights`.
//
// `weights` is a host-repacked buffer of 4x4 tiles: the CL_DTYPE16 at
// [c * out_c_blks + out_c] holds 4 input channels x 4 output channels,
// where w.s0123 are the 4 output channels for input channel 4*c+0,
// w.s4567 for 4*c+1, etc. (layout produced by ConvImageCompute::OI2IOO4I4).
//
// Expected local work size is 32x4, matching the `temp[32][4]` scratch:
// global id 0 indexes the output channel block, and the 4 work-items along
// local dim 1 each accumulate a strided quarter of the input channel blocks
// before a local-memory reduction.
__kernel void conv2d_1x1_fc(__read_only image2d_t input,
                            __write_only image2d_t output,
                            __global CL_DTYPE16 *weights,
#ifdef BIASE_CH
                            __read_only image2d_t biases,
#endif // BIASE_CH
#ifdef PRELU
                            __read_only image2d_t prelu_alpha,
#endif // PRELU
#ifdef ELT_FUSE
                            __read_only image2d_t second_input_image,
#endif // ELT_FUSE
                            int in_c_blks,
                            int out_c_blks) {
  int out_c = get_global_id(0);
  int2 tid = (int2)(get_local_id(0), get_local_id(1));
  CL_DTYPE4 s = (CL_DTYPE4)(0.0f);

  if (out_c < out_c_blks) {
    // Each work-item along local dim 1 handles every 4th input channel
    // block; partial sums are merged through `temp` below.
    for (int c = tid.y; c < in_c_blks; c += 4) {
      CL_DTYPE4 v = READ_IMG_TYPE(CL_DTYPE_CHAR, input, SAMPLER, (int2)(c, 0));
      CL_DTYPE16 w = weights[c * out_c_blks + out_c];
      // v.{x,y,z,w} are 4 consecutive input channels; each multiplies the
      // matching 4-output-channel row of the 4x4 weight tile.
      CL_DTYPE4 partial = v.x * w.s0123;
      partial += v.y * w.s4567;
      partial += v.z * w.s89ab;
      partial += v.w * w.scdef;
      s += partial;
    }
  }
  // Per-work-item reduction scratch for the 32x4 work-group.
  __local CL_DTYPE4 temp[32][4];
  temp[tid.x][tid.y] = s;
  // Every work-item of the group (including out-of-range ones) must reach
  // this barrier, which is why the early return is placed after it.
  barrier(CLK_LOCAL_MEM_FENCE);

  if (out_c >= out_c_blks) {
    return;
  }
  // Only the tid.y == 0 work-item completes the reduction and writes out.
  if (tid.y == 0) {
    s += temp[tid.x][1];
    s += temp[tid.x][2];
    s += temp[tid.x][3];
    int2 output_pos0 = (int2)(out_c, 0);

#ifdef BIASE_CH
    // Per-output-channel bias read at the same channel-block position.
    CL_DTYPE4 output0 =
        s + READ_IMG_TYPE(CL_DTYPE_CHAR, biases, SAMPLER, output_pos0);
#else
    CL_DTYPE4 output0 = s;
#endif

    CL_DTYPE4 alpha0;
#ifdef PRELU_CH
    alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos0);
#elif defined(PRELU_ELE)
    alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos0);
#elif defined(PRELU_ALL)
    // One shared alpha: broadcast the first component to all four lanes.
    alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0));
    alpha0.y = alpha0.x;
    alpha0.z = alpha0.x;
    alpha0.w = alpha0.x;
#endif
    output0 = activation_type4(output0, alpha0);
#ifdef SCALE_ACTIVATION
    output0 = fuse_scale(output0, 1.f, 0.f, 0.f);
#endif

#ifdef ELT_FUSE
    // Fused elementwise add: accumulate the second input at this position.
    elt_fuse_func_wrapper(second_input_image, output_pos0, &output0);
#endif
    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos0, output0);
  }
}
123 changes: 114 additions & 9 deletions lite/kernels/opencl/conv_image_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,35 @@ void ConvImageCompute::PrepareForRun() {
// filter_gpu_buffer_.get());

// impl_ = &ConvImageCompute::Conv2d1x1Mali;
if (is_mali_ && filter_tensor_h_ == 1 && filter_tensor_w_ == 1) {

if (UseFcReplaceConv()) {
kernel_func_names_.push_back("conv2d_1x1_fc");
kernel_func_paths_.push_back("image/conv2d_1x1_opt_kernel.cl");

filter_gpu_image_ = std::unique_ptr<Tensor>(new Tensor);
auto tensor_hold_filter_buffer = std::unique_ptr<Tensor>(new Tensor);
tensor_hold_bias_image_ = std::unique_ptr<Tensor>(new Tensor);
auto filter_ext_dims = filter_dims;
filter_ext_dims[0] = ROUND_UP(filter_dims[0], 4);
filter_ext_dims[1] = ROUND_UP(filter_dims[1], 4);
tensor_hold_filter_buffer->Resize(filter_ext_dims);
auto* filter_buffer_data =
MUTABLE_DATA_CPU(tensor_hold_filter_buffer.get());
size_t buf_size = tensor_hold_filter_buffer->memory_size();

std::memset(filter_buffer_data, 0, buf_size);  // can be removed later
OI2IOO4I4(filter_cpu, filter_buffer_data, filter_dims[0], filter_dims[1]);

filter_gpu_buffer_ = std::unique_ptr<Tensor>(new Tensor);
auto* filter_gpu_data = filter_gpu_buffer_->mutable_data(
TARGET(kOpenCL), tensor_hold_filter_buffer->memory_size());
TargetWrapperCL::MemcpySync(filter_gpu_data,
tensor_hold_filter_buffer->raw_data(),
tensor_hold_filter_buffer->memory_size(),
IoDirection::HtoD);
filter_buffer_p_ = GET_BUFFER_GPU(filter_gpu_buffer_);
impl_ = &ConvImageCompute::Conv2d1x1FC;
} else if (is_mali_ && filter_tensor_h_ == 1 && filter_tensor_w_ == 1) {
filter_gpu_image_ = std::unique_ptr<Tensor>(new Tensor);
tensor_hold_filter_image_ = std::unique_ptr<Tensor>(new Tensor);
tensor_hold_bias_image_ = std::unique_ptr<Tensor>(new Tensor);
Expand Down Expand Up @@ -765,7 +793,14 @@ void ConvImageCompute::PrepareForRun() {
#define SHOW_EACH_LWS_TIME
#undef SHOW_EACH_LWS_TIME
void ConvImageCompute::SetLocalWorkSize(size_t repeats /*=4*/) {
if (kernel_func_names_[0] == "conv2d_1x1_h1w4c1") {
if (kernel_func_names_[0] == "conv2d_1x1_fc") {
auto& context = ctx_->As<OpenCLContext>();
std::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0] << time_stamp_;
kernel_ = context.cl_context()->GetKernel(kernel_key.str());

local_work_size_ = cl::NDRange(32, 4, 1);
} else if (kernel_func_names_[0] == "conv2d_1x1_h1w4c1") {
auto tuned_map_key = GenerateTunedKey();
cl::NDRange lws_in_map = cl::NullRange;
// if (CLRuntime::Global()->HasTunedLocalWorkSizeMap(tuned_map_key,
Expand Down Expand Up @@ -1161,7 +1196,12 @@ void ConvImageCompute::SetGlobalWorkSize() {
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};

if (kernel_func_names_[0] == "conv2d_1x1_mali") {
if (kernel_func_names_[0] == "conv2d_1x1_fc") {
c_blk_ = ROUND_UP(global_work_size_[0], 32);
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
4 * static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
} else if (kernel_func_names_[0] == "conv2d_1x1_mali") {
global_work_size_ =
cl::NDRange{static_cast<size_t>(c_blk_ * UP_DIV(w_blk_, 4)),
static_cast<size_t>(nh_blk_)};
Expand Down Expand Up @@ -1304,12 +1344,38 @@ void ConvImageCompute::SetGlobalWorkSize() {
static_cast<size_t>(nh_blk_)};
input_c_block_ = static_cast<const int>((input_tensor_c_ + 3) / 4);
}
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
VLOG(4) << "local_work_size_[3D]: {" << local_work_size_[0] << ","
<< local_work_size_[1] << "," << local_work_size_[2] << "}";
for (auto i = 0; i < global_work_size_.dimensions(); i++) {
VLOG(4) << "global_work_size[" << i << "]: " << global_work_size_[i];
}

// Repacks an OI-major fp32 weight matrix (row = output channel O, col =
// input channel I) into the tiled layout consumed by the conv2d_1x1_fc
// kernel. The destination is treated as an array with extents
// [padded_I/4][padded_O/4][4][4]: for each (input-block, output-block)
// pair a 4x4 tile stored input-channel-major, so each run of 4 consecutive
// elements covers 4 output channels for one input channel.
//
// @param src  fp32 weights, O x I, row-major.
// @param dst  destination buffer sized for padded_O x padded_I elements of
//             the runtime precision (fp16 or fp32).
// @param O    number of output channels (padded up to a multiple of 4).
// @param I    number of input channels (padded up to a multiple of 4).
// Padding positions beyond the real O x I extent are zero-filled.
void ConvImageCompute::OI2IOO4I4(void* src, void* dst, size_t O, size_t I) {
  // Destination element type follows the configured CL precision.
  const bool fp16_support =
      CLRuntime::Global()->get_precision() == lite_api::CL_PRECISION_FP16;
  const size_t padded_I = ROUND_UP(I, 4);
  const size_t padded_O = ROUND_UP(O, 4);

  float* dst_fp32 = static_cast<float*>(dst);
  half_t* dst_fp16 = static_cast<half_t*>(dst);
  const float* p_src = static_cast<const float*>(src);

  // size_t indices throughout to avoid signed/unsigned comparisons with
  // the padded extents.
  for (size_t block_y = 0; 4 * block_y < padded_O; block_y++) {
    for (size_t y_in_block = 0; y_in_block < 4; y_in_block++) {
      for (size_t block_x = 0; 4 * block_x < padded_I; block_x++) {
        for (size_t x_in_block = 0; x_in_block < 4; x_in_block++) {
          const size_t y = 4 * block_y + y_in_block;  // output channel
          const size_t x = 4 * block_x + x_in_block;  // input channel
          // Linear offset into the [padded_I/4][padded_O/4][4][4] layout.
          const size_t dst_index = block_x * padded_O * 4 + block_y * 16 +
                                   x_in_block * 4 + y_in_block;
          // Zero-fill the padding region outside the real O x I matrix.
          const float value = (x < I && y < O) ? p_src[I * y + x] : 0.f;
          if (fp16_support) {
            dst_fp16[dst_index] = Float2Half(value);
          } else {
            dst_fp32[dst_index] = value;
          }
        }
      }
    }
  }
}

Expand Down Expand Up @@ -1387,6 +1453,32 @@ void ConvImageCompute::Conv2d1x1Mali() {
CL_CHECK_FATAL(status_);
}

// Binds the argument list of the conv2d_1x1_fc kernel in declaration
// order. The optional images (bias, prelu alpha, eltwise second input)
// exist in the kernel signature only when the matching build option was
// passed, so they are bound under the same conditions here to keep the
// argument indices aligned with the kernel's parameter list.
void ConvImageCompute::Conv2d1x1FC() {
  int arg_idx = 0;
  status_ = kernel_.setArg(arg_idx++, *input_image_p_);
  CL_CHECK_FATAL(status_);
  status_ = kernel_.setArg(arg_idx++, *output_image_p_);
  CL_CHECK_FATAL(status_);
  status_ = kernel_.setArg(arg_idx++, *filter_buffer_p_);
  CL_CHECK_FATAL(status_);
  // Bias image only exists when the kernel was built with -DBIASE_CH.
  if (has_bias_) {
    status_ = kernel_.setArg(arg_idx++, *bias_image_p_);
    CL_CHECK_FATAL(status_);
  }
  // Alpha image only exists when a -DPRELU* option was passed.
  if (build_options_[0].find("-DPRELU") != std::string::npos) {
    status_ = kernel_.setArg(arg_idx++, *alpha_image_p_);
    CL_CHECK_FATAL(status_);
  }
  // Second input only exists when an elementwise op is fused (-DELT_FUSE).
  if (!fuse_eltwise_op_type_.empty()) {
    status_ = kernel_.setArg(arg_idx++, *second_input_image_p_);
    CL_CHECK_FATAL(status_);
  }
  // Channel-block counts: UP_DIV(c, 4) matches the image2d packing.
  status_ = kernel_.setArg(arg_idx++, UP_DIV(input_tensor_c_, 4));
  CL_CHECK_FATAL(status_);
  status_ = kernel_.setArg(arg_idx++, UP_DIV(output_tensor_c_, 4));
  CL_CHECK_FATAL(status_);
}

void ConvImageCompute::Conv2d1x1opt() {
status_ = kernel_.setArg(0, default_c_blk_);
CL_CHECK_FATAL(status_);
Expand Down Expand Up @@ -2011,6 +2103,19 @@ void ConvImageCompute::Run() {
}
}

// Returns true when this conv is equivalent to a fully-connected layer:
// 1x1 spatial input/output, 1x1 filter, unit stride and dilation, and no
// padding — in which case the dedicated conv2d_1x1_fc path applies.
bool ConvImageCompute::UseFcReplaceConv() {
  const auto in_dims = conv_param_->x->dims();
  const auto o_dims = conv_param_->output->dims();

  // Both the input and the output must have 1x1 spatial extent (NCHW).
  const bool spatial_is_1x1 = in_dims[2] == 1 && in_dims[3] == 1 &&
                              o_dims[2] == 1 && o_dims[3] == 1;
  if (!spatial_is_1x1) {
    return false;
  }

  const bool unit_filter = filter_tensor_h_ == 1 && filter_tensor_w_ == 1;
  const bool unit_stride = stride_h_ == 1 && stride_w_ == 1;
  const bool no_padding = pad_up_ == 0 && pad_down_ == 0 && pad_left_ == 0 &&
                          pad_right_ == 0;
  const bool unit_dilation = dilation_h_ == 1 && dilation_w_ == 1;
  return unit_filter && unit_stride && no_padding && unit_dilation;
}

void ConvImageCompute::PrintConvInfo() {
const bool is_element_wise_bias =
has_bias_ && conv_param_->output->dims() == conv_param_->bias->dims();
Expand Down
3 changes: 3 additions & 0 deletions lite/kernels/opencl/conv_image_compute.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,12 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
void DepthwiseConv2d();
void Conv2dCommon();
void Conv2d1x1Mali();
void Conv2d1x1FC();
void OIHW2OHWIO4I4(
void* src, void* dst, size_t O, size_t I, size_t H, size_t W);
void OI2IOO4I4(void* src, void* dst, size_t O, size_t I);
void AssignDataFromCPUToGPU(const Tensor* tensor_cpu_p, Tensor* tensor_gpu_p);
bool UseFcReplaceConv();

param_t* conv_param_{nullptr};

Expand Down