diff --git a/lite/backends/opencl/cl_kernel/cl_common.h b/lite/backends/opencl/cl_kernel/cl_common.h
index ef263545339..f92bfd5d6e7 100644
--- a/lite/backends/opencl/cl_kernel/cl_common.h
+++ b/lite/backends/opencl/cl_kernel/cl_common.h
@@ -53,6 +53,7 @@ limitations under the License. */
 #define GET_VEC_TYPE(type__, size__) type__##size__
 #define VECTORIZED_TYPE(type__, size__) GET_VEC_TYPE(type__, size__)
 #define CL_DTYPE4 VECTORIZED_TYPE(CL_DTYPE, 4)
+#define CL_DTYPE16 VECTORIZED_TYPE(CL_DTYPE, 16)
 #define CL_COMPUTE_DTYPE4 VECTORIZED_TYPE(CL_COMPUTE_DTYPE, 4)
 
 /////////////////////////////////
diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl
index 663c04b44bc..b57dcbd4f07 100644
--- a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl
@@ -1,8 +1,8 @@
 #include <cl_common.h>
 
-inline elt_fuse_func_wrapper(__read_only image2d_t second_input_image,
-                             const int2 pos,
-                             CL_DTYPE4 *value_p) {
+inline void elt_fuse_func_wrapper(__read_only image2d_t second_input_image,
+                                  const int2 pos,
+                                  CL_DTYPE4 *value_p) {
   CL_DTYPE4 second_val =
       READ_IMG_TYPE(CL_DTYPE_CHAR, second_input_image, SAMPLER, pos);
   *value_p += second_val;
@@ -2413,3 +2413,75 @@ __kernel void conv2d_1x1_mali_h2w2c2(
     }
   }
 }
+
+__kernel void conv2d_1x1_fc(__read_only image2d_t input,
+                            __write_only image2d_t output,
+                            __global CL_DTYPE16 *weights,
+#ifdef BIASE_CH
+                            __read_only image2d_t biases,
+#endif  // BIASE_CH
+#ifdef PRELU
+                            __read_only image2d_t prelu_alpha,
+#endif  // PRELU
+#ifdef ELT_FUSE
+                            __read_only image2d_t second_input_image,
+#endif  // ELT_FUSE
+                            int in_c_blks,
+                            int out_c_blks) {
+  int out_c = get_global_id(0);
+  int2 tid = (int2)(get_local_id(0), get_local_id(1));
+  CL_DTYPE4 s = (CL_DTYPE4)(0.0f);
+
+  if (out_c < out_c_blks) {
+    for (int c = tid.y; c < in_c_blks; c += 4) {
+      CL_DTYPE4 v = READ_IMG_TYPE(CL_DTYPE_CHAR, input, SAMPLER, (int2)(c, 0));
+      CL_DTYPE16 w = weights[c * out_c_blks + out_c];
+      CL_DTYPE4 partial = v.x * w.s0123;
+      partial += v.y * w.s4567;
+      partial += v.z * w.s89ab;
+      partial += v.w * w.scdef;
+      s += partial;
+    }
+  }
+  __local CL_DTYPE4 temp[32][4];
+  temp[tid.x][tid.y] = s;
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  if (out_c >= out_c_blks) {
+    return;
+  }
+  if (tid.y == 0) {
+    s += temp[tid.x][1];
+    s += temp[tid.x][2];
+    s += temp[tid.x][3];
+    int2 output_pos0 = (int2)(out_c, 0);
+
+#ifdef BIASE_CH
+    CL_DTYPE4 output0 =
+        s + READ_IMG_TYPE(CL_DTYPE_CHAR, biases, SAMPLER, output_pos0);
+#else
+    CL_DTYPE4 output0 = s;
+#endif
+
+    CL_DTYPE4 alpha0;
+#ifdef PRELU_CH
+    alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos0);
+#elif defined(PRELU_ELE)
+    alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos0);
+#elif defined(PRELU_ALL)
+    alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0));
+    alpha0.y = alpha0.x;
+    alpha0.z = alpha0.x;
+    alpha0.w = alpha0.x;
+#endif
+    output0 = activation_type4(output0, alpha0);
+#ifdef SCALE_ACTIVATION
+    output0 = fuse_scale(output0, 1.f, 0.f, 0.f);
+#endif
+
+#ifdef ELT_FUSE
+    elt_fuse_func_wrapper(second_input_image, output_pos0, &output0);
+#endif
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos0, output0);
+  }
+}
diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc
index bf1131667ca..eece25896b5 100644
--- a/lite/kernels/opencl/conv_image_compute.cc
+++ b/lite/kernels/opencl/conv_image_compute.cc
@@ -150,7 +150,35 @@ void ConvImageCompute::PrepareForRun() {
   //                             filter_gpu_buffer_.get());
   //   impl_ = &ConvImageCompute::Conv2d1x1Mali;
 
-  if (is_mali_ && filter_tensor_h_ == 1 && filter_tensor_w_ == 1) {
+
+  if (UseFcReplaceConv()) {
+    kernel_func_names_.push_back("conv2d_1x1_fc");
+    kernel_func_paths_.push_back("image/conv2d_1x1_opt_kernel.cl");
+
+    filter_gpu_image_ = std::unique_ptr<Tensor>(new Tensor);
+    auto tensor_hold_filter_buffer =
+        std::unique_ptr<Tensor>(new Tensor);
+    tensor_hold_bias_image_ = std::unique_ptr<Tensor>(new Tensor);
+    auto filter_ext_dims = filter_dims;
+    filter_ext_dims[0] = ROUND_UP(filter_dims[0], 4);
+    filter_ext_dims[1] = ROUND_UP(filter_dims[1], 4);
+    tensor_hold_filter_buffer->Resize(filter_ext_dims);
+    auto* filter_buffer_data =
+        MUTABLE_DATA_CPU(tensor_hold_filter_buffer.get());
+    size_t buf_size = tensor_hold_filter_buffer->memory_size();
+
+    std::memset(filter_buffer_data, 0, buf_size);  // can be removed later
+    OI2IOO4I4(filter_cpu, filter_buffer_data, filter_dims[0], filter_dims[1]);
+
+    filter_gpu_buffer_ = std::unique_ptr<Tensor>(new Tensor);
+    auto* filter_gpu_data = filter_gpu_buffer_->mutable_data(
+        TARGET(kOpenCL), tensor_hold_filter_buffer->memory_size());
+    TargetWrapperCL::MemcpySync(filter_gpu_data,
+                                tensor_hold_filter_buffer->raw_data(),
+                                tensor_hold_filter_buffer->memory_size(),
+                                IoDirection::HtoD);
+    filter_buffer_p_ = GET_BUFFER_GPU(filter_gpu_buffer_);
+    impl_ = &ConvImageCompute::Conv2d1x1FC;
+  } else if (is_mali_ && filter_tensor_h_ == 1 && filter_tensor_w_ == 1) {
     filter_gpu_image_ = std::unique_ptr<Tensor>(new Tensor);
     tensor_hold_filter_image_ = std::unique_ptr<Tensor>(new Tensor);
     tensor_hold_bias_image_ = std::unique_ptr<Tensor>(new Tensor);
@@ -765,7 +793,14 @@ void ConvImageCompute::PrepareForRun() {
 #define SHOW_EACH_LWS_TIME
 #undef SHOW_EACH_LWS_TIME
 void ConvImageCompute::SetLocalWorkSize(size_t repeats /*=4*/) {
-  if (kernel_func_names_[0] == "conv2d_1x1_h1w4c1") {
+  if (kernel_func_names_[0] == "conv2d_1x1_fc") {
+    auto& context = ctx_->As<OpenCLContext>();
+    std::stringstream kernel_key;
+    kernel_key << kernel_func_names_[0] << build_options_[0] << time_stamp_;
+    kernel_ = context.cl_context()->GetKernel(kernel_key.str());
+
+    local_work_size_ = cl::NDRange(32, 4, 1);
+  } else if (kernel_func_names_[0] == "conv2d_1x1_h1w4c1") {
     auto tuned_map_key = GenerateTunedKey();
     cl::NDRange lws_in_map = cl::NullRange;
     // if (CLRuntime::Global()->HasTunedLocalWorkSizeMap(tuned_map_key,
@@ -1161,7 +1196,12 @@ void ConvImageCompute::SetGlobalWorkSize() {
                                  static_cast<size_t>(w_blk_),
                                  static_cast<size_t>(nh_blk_)};
 
-  if (kernel_func_names_[0] == "conv2d_1x1_mali") {
+  if (kernel_func_names_[0] == "conv2d_1x1_fc") {
+    c_blk_ = ROUND_UP(global_work_size_[0], 32);
+    global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
+                                    4 * static_cast<size_t>(w_blk_),
+                                    static_cast<size_t>(nh_blk_)};
+  } else if (kernel_func_names_[0] == "conv2d_1x1_mali") {
     global_work_size_ =
         cl::NDRange{static_cast<size_t>(c_blk_ * UP_DIV(w_blk_, 4)),
                     static_cast<size_t>(nh_blk_)};
@@ -1304,12 +1344,38 @@ void ConvImageCompute::SetGlobalWorkSize() {
                                    static_cast<size_t>(nh_blk_)};
     input_c_block_ = static_cast<int>((input_tensor_c_ + 3) / 4);
   }
-  VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
-          << global_work_size_[1] << "," << global_work_size_[2] << "}";
-  VLOG(4) << "local_work_size_[3D]: {" << local_work_size_[0] << ","
-          << local_work_size_[1] << "," << local_work_size_[2] << "}";
-  for (auto i = 0; i < global_work_size_.dimensions(); i++) {
-    VLOG(4) << "global_work_size[" << i << "]: " << global_work_size_[i];
+}
+
+void ConvImageCompute::OI2IOO4I4(void* src, void* dst, size_t O, size_t I) {
+  bool fp16_support =
+      CLRuntime::Global()->get_precision() == lite_api::CL_PRECISION_FP16;
+  size_t padded_I = ROUND_UP(I, 4);
+  size_t padded_O = ROUND_UP(O, 4);
+
+  float* dst_fp32 = static_cast<float*>(dst);
+  half_t* dst_fp16 = static_cast<half_t*>(dst);
+  float* p_src = static_cast<float*>(src);
+
+  for (int block_y = 0; 4 * block_y < padded_O; block_y++) {
+    for (int y_in_block = 0; y_in_block < 4; y_in_block++) {
+      for (int block_x = 0; 4 * block_x < padded_I; block_x++) {
+        for (int x_in_block = 0; x_in_block < 4; x_in_block++) {
+          int y = 4 * block_y + y_in_block;
+          int x = 4 * block_x + x_in_block;
+          // Consider destination as an array with extents
+          // [padded_src_channels/4][padded_dst_channels/4][4][4]
+          int dst_index = block_x * padded_O * 4 + block_y * 16 +
+                          x_in_block * 4 + y_in_block;
+          if (x < I && y < O) {
+            fp16_support ? dst_fp16[dst_index] = Float2Half(p_src[I * y + x])
+                         : dst_fp32[dst_index] = p_src[I * y + x];
+          } else {
+            fp16_support ? dst_fp16[dst_index] = Float2Half(0.f)
+                         : dst_fp32[dst_index] = 0.f;
+          }
+        }
+      }
+    }
   }
 }
 
@@ -1387,6 +1453,32 @@ void ConvImageCompute::Conv2d1x1Mali() {
   CL_CHECK_FATAL(status_);
 }
 
+void ConvImageCompute::Conv2d1x1FC() {
+  int cnt = 0;
+  status_ = kernel_.setArg(cnt++, *input_image_p_);
+  CL_CHECK_FATAL(status_);
+  status_ = kernel_.setArg(cnt++, *output_image_p_);
+  CL_CHECK_FATAL(status_);
+  status_ = kernel_.setArg(cnt++, *filter_buffer_p_);
+  CL_CHECK_FATAL(status_);
+  if (has_bias_) {
+    status_ = kernel_.setArg(cnt++, *bias_image_p_);
+    CL_CHECK_FATAL(status_);
+  }
+  if (build_options_[0].find("-DPRELU") != std::string::npos) {
+    status_ = kernel_.setArg(cnt++, *alpha_image_p_);
+    CL_CHECK_FATAL(status_);
+  }
+  if (!fuse_eltwise_op_type_.empty()) {
+    status_ = kernel_.setArg(cnt++, *second_input_image_p_);
+    CL_CHECK_FATAL(status_);
+  }
+  status_ = kernel_.setArg(cnt++, UP_DIV(input_tensor_c_, 4));
+  CL_CHECK_FATAL(status_);
+  status_ = kernel_.setArg(cnt++, UP_DIV(output_tensor_c_, 4));
+  CL_CHECK_FATAL(status_);
+}
+
 void ConvImageCompute::Conv2d1x1opt() {
   status_ = kernel_.setArg(0, default_c_blk_);
   CL_CHECK_FATAL(status_);
@@ -2011,6 +2103,19 @@ void ConvImageCompute::Run() {
   }
 }
 
+bool ConvImageCompute::UseFcReplaceConv() {
+  auto x_dims = conv_param_->x->dims();
+  auto out_dims = conv_param_->output->dims();
+
+  bool hw_is_1 =
+      x_dims[2] == 1 && x_dims[3] == 1 && out_dims[2] == 1 && out_dims[3] == 1;
+  bool attr_valid = filter_tensor_h_ == 1 && filter_tensor_w_ == 1 &&
+                    stride_h_ == 1 && stride_w_ == 1 && pad_up_ == 0 &&
+                    pad_down_ == 0 && pad_left_ == 0 && pad_right_ == 0 &&
+                    dilation_h_ == 1 && dilation_w_ == 1;
+  return hw_is_1 && attr_valid;
+}
+
 void ConvImageCompute::PrintConvInfo() {
   const bool is_element_wise_bias =
       has_bias_ && conv_param_->output->dims() == conv_param_->bias->dims();
diff --git a/lite/kernels/opencl/conv_image_compute.h b/lite/kernels/opencl/conv_image_compute.h
index 1e38df983fa..2f5eef8046e 100644
--- a/lite/kernels/opencl/conv_image_compute.h
+++ b/lite/kernels/opencl/conv_image_compute.h
@@ -76,9 +76,12 @@ class ConvImageCompute : public KernelLite