Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
232 changes: 227 additions & 5 deletions lite/backends/opencl/cl_kernel/image/pool_kernel.cl
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,231 @@ limitations under the License. */

#include <cl_common.h>

__kernel void pool(__read_only image2d_t input,
__write_only image2d_t output,
__private const int in_height,
__private const int in_width,
__private const int out_height,
__private const int out_width,
__private const int ksize_h,
__private const int ksize_w,
__private const int stride_h,
__private const int stride_w,
__private const int pad_top,
__private const int pad_left) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const int out_n = out_nh / out_height;
const int out_h = out_nh % out_height;

int start_h = out_h * stride_h - pad_top;
int end_h = min(start_h + ksize_h, in_height);
start_h = max(start_h, 0);

int start_w = out_w * stride_w - pad_left;
int end_w = min(start_w + ksize_w, in_width);
start_w = max(start_w, 0);

const int pos_in_x = out_c * in_width;
const int pos_in_y = out_n * in_height;
const int pos_out_x = mad24(out_c, out_width, out_w);

#ifdef POOL_AVG

CL_DTYPE4 res = (CL_DTYPE4)(0.0f);
int div;
#ifdef EXCLUSIVE
div = (end_h - start_h) * (end_w - start_w);
#else
div = ksize_w * ksize_h;
#endif // EXCLUSIVE

#ifdef GLOBAL
// pool_avg_global: force to use fp32 to avoid the loss of accuracy
float4 res_f32 = 0.f;
for (int y = start_h; y < end_h; ++y) {
for (int x = start_w; x < end_w; ++x) {
res_f32 += read_imagef(input, SAMPLER, (int2)(pos_in_x + x, pos_in_y + y));
}
}
res_f32 /= (float)div;
#ifdef CL_DTYPE_half
res = convert_half4(res_f32);
#else
res = res_f32;
#endif

#else
// pool_avg: use default precision
for (int y = start_h; y < end_h; ++y) {
for (int x = start_w; x < end_w; ++x) {
res += READ_IMG_TYPE(
CL_DTYPE_CHAR, input, SAMPLER, (int2)(pos_in_x + x, pos_in_y + y));
}
}
res /= (CL_DTYPE)div;
#endif // GLOBAL

#else

// POOL_MAX
CL_DTYPE4 res = (CL_DTYPE4)(-FLT_MAX);
for (int y = start_h; y < end_h; ++y) {
for (int x = start_w; x < end_w; ++x) {
CL_DTYPE4 tmp = READ_IMG_TYPE(
CL_DTYPE_CHAR, input, SAMPLER, (int2)(pos_in_x + x, pos_in_y + y));
res = max(res, tmp);
}
}

#endif // POOL_AVG

WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(pos_out_x, out_nh), res);
}

__kernel void pool_local(__read_only image2d_t input,
__write_only image2d_t output,
__private const int in_height,
__private const int in_width,
__private const int out_height,
__private const int out_width,
__private const int ksize_h,
__private const int ksize_w,
__private const int stride_h,
__private const int stride_w,
__private const int pad_top,
__private const int pad_left,
__private const int local_block_size,
__private const int2 local_block_size_wh,
__private const int2 local_block_count_wh,
__local CL_DTYPE4* local_output) {
const int out_c = get_global_id(0) / local_block_size;
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const int out_n = out_nh / out_height;
// const int out_h = out_nh % out_height;
const int out_h = out_nh - mul24(out_h, out_height);

const int local_id = get_local_id(0);
const int local_width_id = local_id % local_block_size_wh.x;
const int local_height_id = local_id / local_block_size_wh.x;

const int input_start = mul24(out_n, in_height);
const int input_channel_start = mul24(out_c, in_width);
const int input_height_start = mad24(out_h, stride_h, -pad_top);
const int input_width_start = mad24(out_w, stride_w, -pad_left);

#ifdef POOL_AVG
__local float4* avg_output = (__local float4*)local_output;
avg_output[local_id] = (float4)0;
int pos_h = local_height_id;

for (int local_h_block_id = 0; local_h_block_id < local_block_count_wh.y; local_h_block_id++) {
if (pos_h >= ksize_h) break;
int pos_w = local_width_id;
int input_height_idx = input_height_start + pos_h;
input_height_idx =
select(input_start + input_height_idx, -1, (input_height_idx < 0 || input_height_idx >= in_height));
for (int local_w_block_id = 0; local_w_block_id < local_block_count_wh.x; local_w_block_id++) {
if (pos_w >= ksize_w) break;
int input_width_idx = input_width_start + pos_w;
input_width_idx =
select(input_channel_start + input_width_idx, -1, (input_width_idx < 0 || input_width_idx >= in_width));
float4 input_data = read_imagef(input, SAMPLER, (int2)(input_width_idx, input_height_idx));
avg_output[local_id] += input_data;
pos_w += local_block_size_wh.x;
}
pos_h += local_block_size_wh.y;
}

barrier(CLK_LOCAL_MEM_FENCE);

for (int stride_h = (local_block_size_wh.y >> 1); stride_h > 0; stride_h >>= 1) {
if (local_height_id < stride_h) {
avg_output[local_id] += avg_output[local_id + stride_h * local_block_size_wh.x];
}
barrier(CLK_LOCAL_MEM_FENCE);
}

for (int stride_w = (local_block_size_wh.x >> 1); stride_w > 0; stride_w >>= 1) {
if (local_height_id == 0 && local_width_id < stride_w) {
avg_output[local_id] += avg_output[local_id + stride_w];
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) {
const int kernel_height_start = max(0, input_height_start);
const int kernel_width_start = max(0, input_width_start);
const int kernel_height_end = min(input_height_start + ksize_h, in_height);
const int kernel_width_end = min(input_width_start + ksize_w, in_width);
#ifdef EXCLUSIVE
const int block_size = mul24((kernel_height_end - kernel_height_start), (kernel_width_end - kernel_width_start));
#else
const int block_size = ksize_w * ksize_h;
#endif // EXCLUSIVE
avg_output[local_id] = avg_output[local_id] / (float)block_size;

const int output_channel_width_idx = mad24(out_c, out_width, out_w);
#ifdef CL_DTYPE_half
CL_DTYPE4 res = convert_half4(avg_output[local_id]);
#else
CL_DTYPE4 res = avg_output[local_id];
#endif
WRITE_IMG_TYPE(
CL_DTYPE_CHAR, output, (int2)(output_channel_width_idx, out_nh), res);
}
#else
local_output[local_id] = (CL_DTYPE4)(-FLT_MAX);
int pos_h = local_height_id;

for (int local_h_block_id = 0; local_h_block_id < local_block_count_wh.y; local_h_block_id++) {
if (pos_h >= ksize_h) break;
int pos_w = local_width_id;
int input_height_idx = input_height_start + pos_h;
input_height_idx =
select(input_start + input_height_idx, -1, (input_height_idx < 0 || input_height_idx >= in_height));
if (input_height_idx != -1) {
for (int local_w_block_id = 0; local_w_block_id < local_block_count_wh.x; local_w_block_id++) {
if (pos_w >= ksize_w) break;
int input_width_idx = input_width_start + pos_w;
input_width_idx =
select(input_channel_start + input_width_idx, -1, (input_width_idx < 0 || input_width_idx >= in_width));

if (input_width_idx != -1) {
CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, SAMPLER, (int2)(input_width_idx, input_height_idx));
local_output[local_id] = fmax(input_data, local_output[local_id]);
}
pos_w += local_block_size_wh.x;
}
}
pos_h += local_block_size_wh.y;
}

barrier(CLK_LOCAL_MEM_FENCE);

for (int stride_h = (local_block_size_wh.y >> 1); stride_h > 0; stride_h >>= 1) {
if (local_height_id < stride_h) {
local_output[local_id] = fmax(local_output[local_id + stride_h * local_block_size_wh.x], local_output[local_id]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}

for (int stride_w = (local_block_size_wh.x >> 1); stride_w > 0; stride_w >>= 1) {
if (local_height_id == 0 && local_width_id < stride_w) {
local_output[local_id] = fmax(local_output[local_id + stride_w], local_output[local_id]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) {
const int output_channel_width_idx = mad24(out_c, out_width, out_w);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(output_channel_width_idx, out_nh), local_output[local_id]);
}
#endif // POOL_AVG
}

__kernel void pool_max(__read_only image2d_t input,
__write_only image2d_t output,
__private const int in_height,
Expand Down Expand Up @@ -96,7 +321,7 @@ __kernel void pool_avg(__read_only image2d_t input,
div = (CL_DTYPE)((end_h - start_h)*(end_w - start_w));
#else
div = (CL_DTYPE)(ksize_w * ksize_h);
#endif
#endif
CL_DTYPE4 avg = sum / div;
const int pos_out_x = mad24(out_c, out_width, out_w);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(pos_out_x, out_nh), avg);
Expand Down Expand Up @@ -132,10 +357,7 @@ __kernel void pool_avg_global(__read_only image2d_t input,
CL_DTYPE4 tmp = READ_IMG_TYPE(
CL_DTYPE_CHAR, input, SAMPLER, (int2)(pos_in_x + x, pos_in_y + y));

sum.x = convert_float(tmp.x) + sum.x;
sum.y = convert_float(tmp.y) + sum.y;
sum.z = convert_float(tmp.z) + sum.z;
sum.w = convert_float(tmp.w) + sum.w;
sum = convert_float4(tmp) + sum;
}
}
const float global_size_div = 1.0f / (in_height * in_width);
Expand Down
8 changes: 8 additions & 0 deletions lite/backends/opencl/cl_runtime.cc
Original file line number Diff line number Diff line change
Expand Up @@ -830,6 +830,14 @@ void CLRuntime::GetAdrenoContextProperties(
properties->push_back(0);
}

uint64_t CLRuntime::GetMaxWorkGroupSize(const cl::Kernel& kernel) {
uint64_t max_workgroup_size = 0;
int ret = kernel.getWorkGroupInfo(
*device_, CL_KERNEL_WORK_GROUP_SIZE, &max_workgroup_size);
if (ret != 0) max_workgroup_size = 0;
return max_workgroup_size;
}

void CLRuntime::set_auto_tune(lite_api::CLTuneMode tune_mode,
const std::string& path,
const std::string& name,
Expand Down
4 changes: 4 additions & 0 deletions lite/backends/opencl/cl_runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,10 @@ class CLRuntime {
return device_->getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
}

// Query the maximum work-group size that can be used to execute a kernel on a
// specific device
uint64_t GetMaxWorkGroupSize(const cl::Kernel& kernel);

double GetCommandTime(const cl::Event& event);

double GetQueuedTime(const cl::Event& event);
Expand Down
2 changes: 1 addition & 1 deletion lite/core/profile/profiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -81,12 +81,12 @@ void Profiler::StartTiming(Type type, const int index, KernelContext* ctx) {
void Profiler::StopTiming(Type type, const int index, KernelContext* ctx) {
CHECK_LT(index, units_.size())
<< "The timer index in the profiler is out of range.";
units_[index].Timer(type)->Stop(ctx);
#ifdef LITE_WITH_OPENCL
units_[index].Timer(type)->CLStop(units_[index].character.op_type,
units_[index].character.io_duration,
units_[index].character.cl_event);
#endif
units_[index].Timer(type)->Stop(ctx);
}

int Profiler::GetKernelFuncCalledTimes(const std::string& op_type,
Expand Down
23 changes: 14 additions & 9 deletions lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -58,17 +58,22 @@ void RunModel() {
config.set_model_dir(FLAGS_model_dir);
config.set_power_mode((paddle::lite_api::PowerMode)FLAGS_power_mode);
config.set_threads(FLAGS_threads);
if (FLAGS_use_gpu) {
std::vector<Place> valid_places{
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kInt32), DATALAYOUT(kNCHW)},
Place{TARGET(kARM)}};

std::vector<Place> valid_places;
if (FLAGS_use_gpu) {
valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)});
valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)});
valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)});
valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)});
valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kInt32), DATALAYOUT(kNCHW)});
valid_places.emplace_back(Place{TARGET(kARM)});
} else {
std::vector<Place> valid_places{Place{TARGET(kARM), PRECISION(kFloat)}};
valid_places.emplace_back(Place{TARGET(kARM), PRECISION(kFloat)});
}

if (FLAGS_prefer_int8_kernel) {
Expand Down
Loading