Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions lite/backends/opencl/cl_kernel/cl_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ limitations under the License. */
#define GET_VEC_TYPE(type__, size__) type__##size__
#define VECTORIZED_TYPE(type__, size__) GET_VEC_TYPE(type__, size__)
#define CL_DTYPE4 VECTORIZED_TYPE(CL_DTYPE, 4)
#define CL_DTYPE16 VECTORIZED_TYPE(CL_DTYPE, 16)
#define CL_COMPUTE_DTYPE4 VECTORIZED_TYPE(CL_COMPUTE_DTYPE, 4)

/////////////////////////////////
Expand Down
78 changes: 75 additions & 3 deletions lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#include <cl_common.h>

inline elt_fuse_func_wrapper(__read_only image2d_t second_input_image,
const int2 pos,
CL_DTYPE4 *value_p) {
inline void elt_fuse_func_wrapper(__read_only image2d_t second_input_image,
const int2 pos,
CL_DTYPE4 *value_p) {
CL_DTYPE4 second_val =
READ_IMG_TYPE(CL_DTYPE_CHAR, second_input_image, SAMPLER, pos);
*value_p += second_val;
Expand Down Expand Up @@ -2413,3 +2413,75 @@ __kernel void conv2d_1x1_mali_h2w2c2(
}
}
}

// 1x1 conv degenerated to a fully-connected layer: both input and output
// have 1x1 spatial extent, so each output channel block is the dot product
// of the entire input channel vector with one column block of `weights`.
//
// `weights` is a host-repacked buffer of 4x4 tiles: the CL_DTYPE16 at
// [c * out_c_blks + out_c] holds 4 input channels x 4 output channels,
// where w.s0123 are the 4 output channels for input channel 4*c+0,
// w.s4567 for 4*c+1, etc. (layout produced by ConvImageCompute::OI2IOO4I4).
//
// Expected local work size is 32x4, matching the `temp[32][4]` scratch:
// global id 0 indexes the output channel block, and the 4 work-items along
// local dim 1 each accumulate a strided quarter of the input channel blocks
// before a local-memory reduction.
__kernel void conv2d_1x1_fc(__read_only image2d_t input,
                            __write_only image2d_t output,
                            __global CL_DTYPE16 *weights,
#ifdef BIASE_CH
                            __read_only image2d_t biases,
#endif // BIASE_CH
#ifdef PRELU
                            __read_only image2d_t prelu_alpha,
#endif // PRELU
#ifdef ELT_FUSE
                            __read_only image2d_t second_input_image,
#endif // ELT_FUSE
                            int in_c_blks,
                            int out_c_blks) {
  int out_c = get_global_id(0);
  int2 tid = (int2)(get_local_id(0), get_local_id(1));
  CL_DTYPE4 s = (CL_DTYPE4)(0.0f);

  if (out_c < out_c_blks) {
    // Each work-item along local dim 1 handles every 4th input channel
    // block; partial sums are merged through `temp` below.
    for (int c = tid.y; c < in_c_blks; c += 4) {
      CL_DTYPE4 v = READ_IMG_TYPE(CL_DTYPE_CHAR, input, SAMPLER, (int2)(c, 0));
      CL_DTYPE16 w = weights[c * out_c_blks + out_c];
      // v.{x,y,z,w} are 4 consecutive input channels; each multiplies the
      // matching 4-output-channel row of the 4x4 weight tile.
      CL_DTYPE4 partial = v.x * w.s0123;
      partial += v.y * w.s4567;
      partial += v.z * w.s89ab;
      partial += v.w * w.scdef;
      s += partial;
    }
  }
  // Per-work-item reduction scratch for the 32x4 work-group.
  __local CL_DTYPE4 temp[32][4];
  temp[tid.x][tid.y] = s;
  // Every work-item of the group (including out-of-range ones) must reach
  // this barrier, which is why the early return is placed after it.
  barrier(CLK_LOCAL_MEM_FENCE);

  if (out_c >= out_c_blks) {
    return;
  }
  // Only the tid.y == 0 work-item completes the reduction and writes out.
  if (tid.y == 0) {
    s += temp[tid.x][1];
    s += temp[tid.x][2];
    s += temp[tid.x][3];
    int2 output_pos0 = (int2)(out_c, 0);

#ifdef BIASE_CH
    // Per-output-channel bias read at the same channel-block position.
    CL_DTYPE4 output0 =
        s + READ_IMG_TYPE(CL_DTYPE_CHAR, biases, SAMPLER, output_pos0);
#else
    CL_DTYPE4 output0 = s;
#endif

    CL_DTYPE4 alpha0;
#ifdef PRELU_CH
    alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos0);
#elif defined(PRELU_ELE)
    alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos0);
#elif defined(PRELU_ALL)
    // One shared alpha: broadcast the first component to all four lanes.
    alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0));
    alpha0.y = alpha0.x;
    alpha0.z = alpha0.x;
    alpha0.w = alpha0.x;
#endif
    output0 = activation_type4(output0, alpha0);
#ifdef SCALE_ACTIVATION
    output0 = fuse_scale(output0, 1.f, 0.f, 0.f);
#endif

#ifdef ELT_FUSE
    // Fused elementwise add: accumulate the second input at this position.
    elt_fuse_func_wrapper(second_input_image, output_pos0, &output0);
#endif
    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos0, output0);
  }
}
123 changes: 114 additions & 9 deletions lite/kernels/opencl/conv_image_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,35 @@ void ConvImageCompute::PrepareForRun() {
// filter_gpu_buffer_.get());

// impl_ = &ConvImageCompute::Conv2d1x1Mali;
if (is_mali_ && filter_tensor_h_ == 1 && filter_tensor_w_ == 1) {

if (UseFcReplaceConv()) {
kernel_func_names_.push_back("conv2d_1x1_fc");
kernel_func_paths_.push_back("image/conv2d_1x1_opt_kernel.cl");

filter_gpu_image_ = std::unique_ptr<Tensor>(new Tensor);
auto tensor_hold_filter_buffer = std::unique_ptr<Tensor>(new Tensor);
tensor_hold_bias_image_ = std::unique_ptr<Tensor>(new Tensor);
auto filter_ext_dims = filter_dims;
filter_ext_dims[0] = ROUND_UP(filter_dims[0], 4);
filter_ext_dims[1] = ROUND_UP(filter_dims[1], 4);
tensor_hold_filter_buffer->Resize(filter_ext_dims);
auto* filter_buffer_data =
MUTABLE_DATA_CPU(tensor_hold_filter_buffer.get());
size_t buf_size = tensor_hold_filter_buffer->memory_size();

std::memset(filter_buffer_data, 0, buf_size);  // can be removed later
OI2IOO4I4(filter_cpu, filter_buffer_data, filter_dims[0], filter_dims[1]);

filter_gpu_buffer_ = std::unique_ptr<Tensor>(new Tensor);
auto* filter_gpu_data = filter_gpu_buffer_->mutable_data(
TARGET(kOpenCL), tensor_hold_filter_buffer->memory_size());
TargetWrapperCL::MemcpySync(filter_gpu_data,
tensor_hold_filter_buffer->raw_data(),
tensor_hold_filter_buffer->memory_size(),
IoDirection::HtoD);
filter_buffer_p_ = GET_BUFFER_GPU(filter_gpu_buffer_);
impl_ = &ConvImageCompute::Conv2d1x1FC;
} else if (is_mali_ && filter_tensor_h_ == 1 && filter_tensor_w_ == 1) {
filter_gpu_image_ = std::unique_ptr<Tensor>(new Tensor);
tensor_hold_filter_image_ = std::unique_ptr<Tensor>(new Tensor);
tensor_hold_bias_image_ = std::unique_ptr<Tensor>(new Tensor);
Expand Down Expand Up @@ -765,7 +793,14 @@ void ConvImageCompute::PrepareForRun() {
#define SHOW_EACH_LWS_TIME
#undef SHOW_EACH_LWS_TIME
void ConvImageCompute::SetLocalWorkSize(size_t repeats /*=4*/) {
if (kernel_func_names_[0] == "conv2d_1x1_h1w4c1") {
if (kernel_func_names_[0] == "conv2d_1x1_fc") {
auto& context = ctx_->As<OpenCLContext>();
std::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0] << time_stamp_;
kernel_ = context.cl_context()->GetKernel(kernel_key.str());

local_work_size_ = cl::NDRange(32, 4, 1);
} else if (kernel_func_names_[0] == "conv2d_1x1_h1w4c1") {
auto tuned_map_key = GenerateTunedKey();
cl::NDRange lws_in_map = cl::NullRange;
// if (CLRuntime::Global()->HasTunedLocalWorkSizeMap(tuned_map_key,
Expand Down Expand Up @@ -1161,7 +1196,12 @@ void ConvImageCompute::SetGlobalWorkSize() {
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};

if (kernel_func_names_[0] == "conv2d_1x1_mali") {
if (kernel_func_names_[0] == "conv2d_1x1_fc") {
c_blk_ = ROUND_UP(global_work_size_[0], 32);
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
4 * static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
} else if (kernel_func_names_[0] == "conv2d_1x1_mali") {
global_work_size_ =
cl::NDRange{static_cast<size_t>(c_blk_ * UP_DIV(w_blk_, 4)),
static_cast<size_t>(nh_blk_)};
Expand Down Expand Up @@ -1304,12 +1344,38 @@ void ConvImageCompute::SetGlobalWorkSize() {
static_cast<size_t>(nh_blk_)};
input_c_block_ = static_cast<const int>((input_tensor_c_ + 3) / 4);
}
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
VLOG(4) << "local_work_size_[3D]: {" << local_work_size_[0] << ","
<< local_work_size_[1] << "," << local_work_size_[2] << "}";
for (auto i = 0; i < global_work_size_.dimensions(); i++) {
VLOG(4) << "global_work_size[" << i << "]: " << global_work_size_[i];
}

// Repacks an OI-major fp32 weight matrix (row = output channel O, col =
// input channel I) into the tiled layout consumed by the conv2d_1x1_fc
// kernel. The destination is treated as an array with extents
// [padded_I/4][padded_O/4][4][4]: for each (input-block, output-block)
// pair a 4x4 tile stored input-channel-major, so each run of 4 consecutive
// elements covers 4 output channels for one input channel.
//
// @param src  fp32 weights, O x I, row-major.
// @param dst  destination buffer sized for padded_O x padded_I elements of
//             the runtime precision (fp16 or fp32).
// @param O    number of output channels (padded up to a multiple of 4).
// @param I    number of input channels (padded up to a multiple of 4).
// Padding positions beyond the real O x I extent are zero-filled.
void ConvImageCompute::OI2IOO4I4(void* src, void* dst, size_t O, size_t I) {
  // Destination element type follows the configured CL precision.
  const bool fp16_support =
      CLRuntime::Global()->get_precision() == lite_api::CL_PRECISION_FP16;
  const size_t padded_I = ROUND_UP(I, 4);
  const size_t padded_O = ROUND_UP(O, 4);

  float* dst_fp32 = static_cast<float*>(dst);
  half_t* dst_fp16 = static_cast<half_t*>(dst);
  const float* p_src = static_cast<const float*>(src);

  // size_t indices throughout to avoid signed/unsigned comparisons with
  // the padded extents.
  for (size_t block_y = 0; 4 * block_y < padded_O; block_y++) {
    for (size_t y_in_block = 0; y_in_block < 4; y_in_block++) {
      for (size_t block_x = 0; 4 * block_x < padded_I; block_x++) {
        for (size_t x_in_block = 0; x_in_block < 4; x_in_block++) {
          const size_t y = 4 * block_y + y_in_block;  // output channel
          const size_t x = 4 * block_x + x_in_block;  // input channel
          // Linear offset into the [padded_I/4][padded_O/4][4][4] layout.
          const size_t dst_index = block_x * padded_O * 4 + block_y * 16 +
                                   x_in_block * 4 + y_in_block;
          // Zero-fill the padding region outside the real O x I matrix.
          const float value = (x < I && y < O) ? p_src[I * y + x] : 0.f;
          if (fp16_support) {
            dst_fp16[dst_index] = Float2Half(value);
          } else {
            dst_fp32[dst_index] = value;
          }
        }
      }
    }
  }
}

Expand Down Expand Up @@ -1387,6 +1453,32 @@ void ConvImageCompute::Conv2d1x1Mali() {
CL_CHECK_FATAL(status_);
}

// Binds the argument list of the conv2d_1x1_fc kernel in declaration
// order. The optional images (bias, prelu alpha, eltwise second input)
// exist in the kernel signature only when the matching build option was
// passed, so they are bound under the same conditions here to keep the
// argument indices aligned with the kernel's parameter list.
void ConvImageCompute::Conv2d1x1FC() {
  int arg_idx = 0;
  status_ = kernel_.setArg(arg_idx++, *input_image_p_);
  CL_CHECK_FATAL(status_);
  status_ = kernel_.setArg(arg_idx++, *output_image_p_);
  CL_CHECK_FATAL(status_);
  status_ = kernel_.setArg(arg_idx++, *filter_buffer_p_);
  CL_CHECK_FATAL(status_);
  // Bias image only exists when the kernel was built with -DBIASE_CH.
  if (has_bias_) {
    status_ = kernel_.setArg(arg_idx++, *bias_image_p_);
    CL_CHECK_FATAL(status_);
  }
  // Alpha image only exists when a -DPRELU* option was passed.
  if (build_options_[0].find("-DPRELU") != std::string::npos) {
    status_ = kernel_.setArg(arg_idx++, *alpha_image_p_);
    CL_CHECK_FATAL(status_);
  }
  // Second input only exists when an elementwise op is fused (-DELT_FUSE).
  if (!fuse_eltwise_op_type_.empty()) {
    status_ = kernel_.setArg(arg_idx++, *second_input_image_p_);
    CL_CHECK_FATAL(status_);
  }
  // Channel-block counts: UP_DIV(c, 4) matches the image2d packing.
  status_ = kernel_.setArg(arg_idx++, UP_DIV(input_tensor_c_, 4));
  CL_CHECK_FATAL(status_);
  status_ = kernel_.setArg(arg_idx++, UP_DIV(output_tensor_c_, 4));
  CL_CHECK_FATAL(status_);
}

void ConvImageCompute::Conv2d1x1opt() {
status_ = kernel_.setArg(0, default_c_blk_);
CL_CHECK_FATAL(status_);
Expand Down Expand Up @@ -2011,6 +2103,19 @@ void ConvImageCompute::Run() {
}
}

// Returns true when this conv is equivalent to a fully-connected layer:
// 1x1 spatial input/output, 1x1 filter, unit stride and dilation, and no
// padding — in which case the dedicated conv2d_1x1_fc path applies.
bool ConvImageCompute::UseFcReplaceConv() {
  const auto in_dims = conv_param_->x->dims();
  const auto o_dims = conv_param_->output->dims();

  // Both the input and the output must have 1x1 spatial extent (NCHW).
  const bool spatial_is_1x1 = in_dims[2] == 1 && in_dims[3] == 1 &&
                              o_dims[2] == 1 && o_dims[3] == 1;
  if (!spatial_is_1x1) {
    return false;
  }

  const bool unit_filter = filter_tensor_h_ == 1 && filter_tensor_w_ == 1;
  const bool unit_stride = stride_h_ == 1 && stride_w_ == 1;
  const bool no_padding = pad_up_ == 0 && pad_down_ == 0 && pad_left_ == 0 &&
                          pad_right_ == 0;
  const bool unit_dilation = dilation_h_ == 1 && dilation_w_ == 1;
  return unit_filter && unit_stride && no_padding && unit_dilation;
}

void ConvImageCompute::PrintConvInfo() {
const bool is_element_wise_bias =
has_bias_ && conv_param_->output->dims() == conv_param_->bias->dims();
Expand Down
3 changes: 3 additions & 0 deletions lite/kernels/opencl/conv_image_compute.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,12 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
void DepthwiseConv2d();
void Conv2dCommon();
void Conv2d1x1Mali();
void Conv2d1x1FC();
void OIHW2OHWIO4I4(
void* src, void* dst, size_t O, size_t I, size_t H, size_t W);
void OI2IOO4I4(void* src, void* dst, size_t O, size_t I);
void AssignDataFromCPUToGPU(const Tensor* tensor_cpu_p, Tensor* tensor_gpu_p);
bool UseFcReplaceConv();

param_t* conv_param_{nullptr};

Expand Down