Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions lite/backends/opencl/cl_kernel/image/matmul_kernel.cl
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,50 @@ __kernel void matmul_xdim3_ydim1(__read_only image2d_t input,
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, out_pos3, out3 * s);
}

__kernel void matmul_xdim2_ydim1(__read_only image2d_t input,
                                 __write_only image2d_t output,
                                 __global const CL_COMPUTE_DTYPE4 *weights,
                                 int M,
                                 int C,
                                 int H,
                                 int W,
                                 float scale) {
  // out[h] = dot(X[h, :], y) * scale for a 2-D x (H x W) times a 1-D y of
  // length W, where y has been pre-packed by the host into float4 blocks.
  // Each work-item owns 4 consecutive rows; their 4 results are packed into
  // one RGBA texel written at (hblk_id, 0).
  // NOTE(review): assumes the weights buffer is padded to a multiple of 4
  // (host uses ROUND_UP(y_dims[0], 4)) with a zeroed tail — confirm, since
  // the last loop iteration reads the full padded block.
  // M and C are unused here; they keep the signature uniform with the other
  // matmul_*_ydim1 kernels so the host can set arguments the same way.
  const int hblk_id = get_global_id(0);
  const int row0 = hblk_id << 2;       // first of the 4 rows this item owns
  const int num_wblks = (W + 3) >> 2;  // W packed 4-wide into texels

  // Per-row dot-product accumulators, one lane per row.
  CL_COMPUTE_DTYPE4 acc = (CL_COMPUTE_DTYPE4)(0.0f);
  for (int wb = 0; wb < num_wblks; ++wb) {
    CL_COMPUTE_DTYPE4 wv = weights[wb];
    acc.x += dot(READ_IMG_TYPE(
                     CL_COMPUTE_DTYPE_CHAR, input, SAMPLER, (int2)(wb, row0)),
                 wv);
    acc.y += dot(
        READ_IMG_TYPE(
            CL_COMPUTE_DTYPE_CHAR, input, SAMPLER, (int2)(wb, row0 + 1)),
        wv);
    acc.z += dot(
        READ_IMG_TYPE(
            CL_COMPUTE_DTYPE_CHAR, input, SAMPLER, (int2)(wb, row0 + 2)),
        wv);
    acc.w += dot(
        READ_IMG_TYPE(
            CL_COMPUTE_DTYPE_CHAR, input, SAMPLER, (int2)(wb, row0 + 3)),
        wv);
  }

  // Convert the accumulators to storage precision, apply the scale, write.
  CL_DTYPE4 out0;
  out0.x = CONVERT_TYPE_TO(acc.x, CL_DTYPE);
  out0.y = CONVERT_TYPE_TO(acc.y, CL_DTYPE);
  out0.z = CONVERT_TYPE_TO(acc.z, CL_DTYPE);
  out0.w = CONVERT_TYPE_TO(acc.w, CL_DTYPE);
  CL_DTYPE s = CONVERT_TYPE_TO(scale, CL_DTYPE);
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(hblk_id, 0), out0 * s);
}

__kernel void matmul_highdimx_ydim2(__read_only image2d_t input,
__write_only image2d_t output,
__global const CL_COMPUTE_DTYPE4 *weights,
Expand Down
45 changes: 38 additions & 7 deletions lite/kernels/opencl/matmul_image_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,9 @@ class MatMulV2ImageCompute : public KernelLite<TARGET(kOpenCL),
y_dims.size() == 1
? DDim(std::vector<DDim::value_type>{1, 4, 1, y_dims[0]})
: DDim(std::vector<DDim::value_type>{1, 4, k_y, n_});
} else if (x_dims.size() == 2 && y_dims.size() == 1) {
y_ext_dims =
DDim(std::vector<DDim::value_type>{ROUND_UP(y_dims[0], 4)});
}

auto y_cpu_t = std::unique_ptr<Tensor>(new Tensor);
Expand All @@ -181,6 +184,20 @@ class MatMulV2ImageCompute : public KernelLite<TARGET(kOpenCL),
? DDim(std::vector<DDim::value_type>{1, 1, 1, y_dims[0]})
: DDim(std::vector<DDim::value_type>{1, 1, k_y, n_});
convert(y_cpu, y_buffer_data, tmp_dim);
} else if (x_dims.size() == 2 && y_dims.size() == 1) {
batch_ = x_dims.count(0, x_dims.size() - y_dims.size());
DDim tmp_dim =
y_dims.size() == 1
? DDim(std::vector<DDim::value_type>{1, 1, 1, y_dims[0]})
: DDim(std::vector<DDim::value_type>{1, 1, k_y, n_});
float* image_fp32 = static_cast<float*>(y_buffer_data);
half_t* image_fp16 = static_cast<half_t*>(y_buffer_data);
bool fp16_support =
CLRuntime::Global()->get_precision() == lite_api::CL_PRECISION_FP16;
for (int i = 0; i < tmp_dim.production(); i++) {
fp16_support ? image_fp16[i] = Float2Half(y_cpu[i]) : image_fp32[i] =
y_cpu[i];
}
} else {
VLOG(4) << "y_ext_dims: " << y_ext_dims;
RearrangeByBlk4x4(y_cpu, y_buffer_data, k_y, n_);
Expand Down Expand Up @@ -268,16 +285,20 @@ class MatMulV2ImageCompute : public KernelLite<TARGET(kOpenCL),
k_ = 1;
kernel_func_name_ = "matmul_transpose_x";
kernel_file_name_ = "image/matmul_xtranspose_kernel.cl";
} else if (x_dims.size() > 2 && y_dims.size() == 1 &&
} else if (x_dims.size() >= 2 && y_dims.size() == 1 &&
x_dims[x_dims.size() - 1] == y_dims[0]) {
m_ = 1, n_ = 1;
k_ = y_dims[0];
N = x_dims.size() == 4 ? x_dims[0] : 1;
C = x_dims.size() == 4 ? x_dims[1] : x_dims[0];
C = x_dims.size() == 4 ? x_dims[1]
: (x_dims.size() == 3 ? x_dims[0] : 1);
H = x_dims[x_dims.size() - 2], W = x_dims[x_dims.size() - 1];
c_blks_ = UP_DIV(x_dims[x_dims.size() - 3], 4);
kernel_func_name_ =
x_dims.size() == 4 ? "matmul_xdim4_ydim1" : "matmul_xdim3_ydim1";
c_blks_ =
x_dims.size() == 2 ? 1 : UP_DIV(x_dims[x_dims.size() - 3], 4);
kernel_func_name_ = x_dims.size() == 4
? "matmul_xdim4_ydim1"
: (x_dims.size() == 3 ? "matmul_xdim3_ydim1"
: "matmul_xdim2_ydim1");
kernel_file_name_ = "image/matmul_kernel.cl";
} else if (x_dims.size() > 2 && y_dims.size() == 2) {
N = x_dims.size() == 4 ? x_dims[0] : 1;
Expand Down Expand Up @@ -404,7 +425,10 @@ class MatMulV2ImageCompute : public KernelLite<TARGET(kOpenCL),
out_img_shape = folder_converter->InitImageDimInfoWith(out_dims);

if (matmul_v2_param_->Y->persistable()) {
if (x_dims.size() <= 2 && y_dims.size() <= 2) {
if (x_dims.size() == 2 && y_dims.size() == 1) {
local_work_size_ = cl::NDRange(1, 1);
global_work_size_ = cl::NDRange(UP_DIV(H, 4), c_blks_);
} else if (x_dims.size() <= 2 && y_dims.size() <= 2) {
if (transpose_x_) {
local_work_size_ = cl::NDRange(32, 4, 1);
global_work_size_ =
Expand Down Expand Up @@ -494,7 +518,14 @@ class MatMulV2ImageCompute : public KernelLite<TARGET(kOpenCL),
}
status = kernel.setArg(arg_idx++, m_);
CL_CHECK_FATAL(status);
if (x_dims.size() <= 2 && y_dims.size() <= 2) {
if (x_dims.size() == 2 && y_dims.size() == 1) {
status = kernel.setArg(arg_idx++, C);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, H);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, W);
CL_CHECK_FATAL(status);
} else if (x_dims.size() <= 2 && y_dims.size() <= 2) {
status = kernel.setArg(arg_idx++, k_blks_);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, n_blks_);
Expand Down