Skip to content

Commit a3b1bc7

Browse files
authored
[OpenCL] Fix conv PrepareForRun to reduce first frame time (#8576)
* fix conv PrepareForRun (test=develop)
* fix conv bias flag bug (test=develop)
1 parent 9c744e8 commit a3b1bc7

1 file changed

Lines changed: 74 additions & 66 deletions

File tree

lite/kernels/opencl/conv_image_compute.cc

Lines changed: 74 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ namespace opencl {
3030
void ConvImageCompute::PrepareForRun() {
3131
ReInitWhenNeeded();
3232

33+
bool bias_buffer_flag = false;
3334
auto& context = ctx_->As<OpenCLContext>();
3435
CHECK(context.cl_context() != nullptr);
3536
is_mali_ = context.cl_context()->IsArmMali();
@@ -243,6 +244,7 @@ void ConvImageCompute::PrepareForRun() {
243244
if (task_size <= threshold_2) {
244245
CLImageConverterNBlock converter;
245246
kernel_func_names_.push_back("conv2d_1x1_mali_h1w2c1");
247+
bias_buffer_flag = true;
246248
const DDim& filter_image_dims =
247249
converter.InitImageDimInfoWith(filter_dims);
248250
filter_image_h_ = filter_image_dims[1];
@@ -259,14 +261,10 @@ void ConvImageCompute::PrepareForRun() {
259261
tensor_hold_filter_image_->raw_data(),
260262
tensor_hold_filter_image_->memory_size(),
261263
IoDirection::HtoD);
262-
263-
MUTABLE_DATA_GPU(filter_gpu_image_,
264-
filter_image_w_,
265-
filter_image_h_,
266-
filter_image_data);
267264
} else if (task_size <= threshold_4) {
268265
CLImageConverterN2Block converter;
269266
kernel_func_names_.push_back("conv2d_1x1_mali_h1w2c2");
267+
bias_buffer_flag = true;
270268
const DDim& filter_image_dims =
271269
converter.InitImageDimInfoWith(filter_dims);
272270
filter_image_h_ = filter_image_dims[1];
@@ -283,14 +281,10 @@ void ConvImageCompute::PrepareForRun() {
283281
tensor_hold_filter_image_->raw_data(),
284282
tensor_hold_filter_image_->memory_size(),
285283
IoDirection::HtoD);
286-
287-
MUTABLE_DATA_GPU(filter_gpu_image_,
288-
filter_image_w_,
289-
filter_image_h_,
290-
filter_image_data);
291284
} else {
292285
CLImageConverterN2Block converter;
293286
kernel_func_names_.push_back("conv2d_1x1_mali_h2w2c2");
287+
bias_buffer_flag = true;
294288
const DDim& filter_image_dims =
295289
converter.InitImageDimInfoWith(filter_dims);
296290
filter_image_h_ = filter_image_dims[1];
@@ -306,11 +300,6 @@ void ConvImageCompute::PrepareForRun() {
306300
tensor_hold_filter_image_->raw_data(),
307301
tensor_hold_filter_image_->memory_size(),
308302
IoDirection::HtoD);
309-
310-
MUTABLE_DATA_GPU(filter_gpu_image_,
311-
filter_image_w_,
312-
filter_image_h_,
313-
filter_image_data);
314303
}
315304
kernel_func_paths_.push_back("image/conv2d_1x1_default_mali_kernel.cl");
316305
impl_ = &ConvImageCompute::Conv2d1x1opt;
@@ -385,6 +374,7 @@ void ConvImageCompute::PrepareForRun() {
385374
if (is_mali_) {
386375
kernel_func_names_.push_back("matrix_inner_product_mali");
387376
kernel_func_names_.push_back("transform_to_output_mali");
377+
bias_buffer_flag = true;
388378
} else {
389379
kernel_func_names_.push_back("matrix_inner_product");
390380
kernel_func_names_.push_back("transform_to_output");
@@ -403,21 +393,24 @@ void ConvImageCompute::PrepareForRun() {
403393
converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims);
404394

405395
// for mali
406-
w_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
407-
auto* w_gpu_data = w_gpu_t_->mutable_data(
408-
TARGET(kOpenCL), tensor_hold_filter_image_->memory_size());
409-
TargetWrapperCL::MemcpySync(w_gpu_data,
410-
tensor_hold_filter_image_->raw_data(),
411-
tensor_hold_filter_image_->memory_size(),
412-
IoDirection::HtoD);
413-
414-
MUTABLE_DATA_GPU(filter_gpu_image_,
415-
filter_image_w_,
416-
filter_image_h_,
417-
filter_image_data);
396+
if (is_mali_) {
397+
w_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
398+
auto* w_gpu_data = w_gpu_t_->mutable_data(
399+
TARGET(kOpenCL), tensor_hold_filter_image_->memory_size());
400+
TargetWrapperCL::MemcpySync(w_gpu_data,
401+
tensor_hold_filter_image_->raw_data(),
402+
tensor_hold_filter_image_->memory_size(),
403+
IoDirection::HtoD);
404+
} else {
405+
MUTABLE_DATA_GPU(filter_gpu_image_,
406+
filter_image_w_,
407+
filter_image_h_,
408+
filter_image_data);
409+
}
418410
} else if (groups_ == 1) {
419411
if (is_mali_ && input_tensor_n_ == 1) {
420412
kernel_func_names_.push_back("conv2d_3x3_opt_mali");
413+
bias_buffer_flag = true;
421414
} else {
422415
kernel_func_names_.push_back(
423416
input_tensor_n_ > 1 ? "conv2d_3x3_multi_batch" : "conv2d_3x3_opt");
@@ -439,17 +432,20 @@ void ConvImageCompute::PrepareForRun() {
439432
auto* filter_image_data = MUTABLE_DATA_CPU(tensor_hold_filter_image_);
440433
converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims);
441434

442-
w_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
443-
auto* w_gpu_data = w_gpu_t_->mutable_data(
444-
TARGET(kOpenCL), tensor_hold_filter_image_->memory_size());
445-
TargetWrapperCL::MemcpySync(w_gpu_data,
446-
tensor_hold_filter_image_->raw_data(),
447-
tensor_hold_filter_image_->memory_size(),
448-
IoDirection::HtoD);
449-
MUTABLE_DATA_GPU(filter_gpu_image_,
450-
filter_image_w_,
451-
filter_image_h_,
452-
filter_image_data);
435+
if (is_mali_ && input_tensor_n_ == 1) {
436+
w_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
437+
auto* w_gpu_data = w_gpu_t_->mutable_data(
438+
TARGET(kOpenCL), tensor_hold_filter_image_->memory_size());
439+
TargetWrapperCL::MemcpySync(w_gpu_data,
440+
tensor_hold_filter_image_->raw_data(),
441+
tensor_hold_filter_image_->memory_size(),
442+
IoDirection::HtoD);
443+
} else {
444+
MUTABLE_DATA_GPU(filter_gpu_image_,
445+
filter_image_w_,
446+
filter_image_h_,
447+
filter_image_data);
448+
}
453449
} else { // groups_ > 1
454450
kernel_func_names_.push_back("conv2d_3x3");
455451
kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl");
@@ -538,6 +534,7 @@ void ConvImageCompute::PrepareForRun() {
538534
// conv2d_7x7
539535
if (is_mali_ && input_tensor_n_ == 1) {
540536
kernel_func_names_.push_back("conv2d_7x7_opt_mali");
537+
bias_buffer_flag = true;
541538
} else {
542539
kernel_func_names_.push_back(
543540
input_tensor_n_ > 1 ? "conv2d_7x7_multi_batch" : "conv2d_7x7_opt");
@@ -552,15 +549,20 @@ void ConvImageCompute::PrepareForRun() {
552549

553550
auto* filter_image_data = MUTABLE_DATA_CPU(tensor_hold_filter_image_);
554551
converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims);
555-
w_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
556-
auto* w_gpu_data = w_gpu_t_->mutable_data(
557-
TARGET(kOpenCL), tensor_hold_filter_image_->memory_size());
558-
TargetWrapperCL::MemcpySync(w_gpu_data,
559-
tensor_hold_filter_image_->raw_data(),
560-
tensor_hold_filter_image_->memory_size(),
561-
IoDirection::HtoD);
562-
MUTABLE_DATA_GPU(
563-
filter_gpu_image_, filter_image_w_, filter_image_h_, filter_image_data);
552+
if (is_mali_ && input_tensor_n_ == 1) {
553+
w_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
554+
auto* w_gpu_data = w_gpu_t_->mutable_data(
555+
TARGET(kOpenCL), tensor_hold_filter_image_->memory_size());
556+
TargetWrapperCL::MemcpySync(w_gpu_data,
557+
tensor_hold_filter_image_->raw_data(),
558+
tensor_hold_filter_image_->memory_size(),
559+
IoDirection::HtoD);
560+
} else {
561+
MUTABLE_DATA_GPU(filter_gpu_image_,
562+
filter_image_w_,
563+
filter_image_h_,
564+
filter_image_data);
565+
}
564566

565567
impl_ = &ConvImageCompute::Conv2d7x7opt;
566568
#endif
@@ -766,31 +768,37 @@ void ConvImageCompute::PrepareForRun() {
766768
bias_converter.NCHWToImage(
767769
bias_cpu_data, bias_image_data, conv_param_->bias->dims());
768770

769-
bias_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
770-
auto* f_gpu_data = bias_gpu_t_->mutable_data(
771-
TARGET(kOpenCL), tensor_hold_bias_image_->memory_size());
772-
TargetWrapperCL::MemcpySync(f_gpu_data,
773-
tensor_hold_bias_image_->raw_data(),
774-
tensor_hold_bias_image_->memory_size(),
775-
IoDirection::HtoD);
776-
MUTABLE_DATA_GPU(bias_gpu_image_,
777-
bias_image_dims[0],
778-
bias_image_dims[1],
779-
bias_image_data);
771+
if (bias_buffer_flag) {
772+
bias_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
773+
auto* f_gpu_data = bias_gpu_t_->mutable_data(
774+
TARGET(kOpenCL), tensor_hold_bias_image_->memory_size());
775+
TargetWrapperCL::MemcpySync(f_gpu_data,
776+
tensor_hold_bias_image_->raw_data(),
777+
tensor_hold_bias_image_->memory_size(),
778+
IoDirection::HtoD);
779+
} else {
780+
MUTABLE_DATA_GPU(bias_gpu_image_,
781+
bias_image_dims[0],
782+
bias_image_dims[1],
783+
bias_image_data);
784+
}
780785
} else {
781786
bias_gpu_image_ = std::unique_ptr<Tensor>(new Tensor);
782787
CLImageConverterFolder bias_converter;
783788
tensor_hold_bias_image_->Resize({1, 1, 1, 4});
784789
auto* bias_image_data = DATA_GPU(tensor_hold_bias_image_);
785790

786-
bias_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
787-
auto* f_gpu_data = bias_gpu_t_->mutable_data(
788-
TARGET(kOpenCL), tensor_hold_bias_image_->memory_size());
789-
TargetWrapperCL::MemcpySync(f_gpu_data,
790-
tensor_hold_bias_image_->raw_data(),
791-
tensor_hold_bias_image_->memory_size(),
792-
IoDirection::HtoD);
793-
MUTABLE_DATA_GPU(bias_gpu_image_, 1, 1, bias_image_data);
791+
if (bias_buffer_flag) {
792+
bias_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
793+
auto* f_gpu_data = bias_gpu_t_->mutable_data(
794+
TARGET(kOpenCL), tensor_hold_bias_image_->memory_size());
795+
TargetWrapperCL::MemcpySync(f_gpu_data,
796+
tensor_hold_bias_image_->raw_data(),
797+
tensor_hold_bias_image_->memory_size(),
798+
IoDirection::HtoD);
799+
} else {
800+
MUTABLE_DATA_GPU(bias_gpu_image_, 1, 1, bias_image_data);
801+
}
794802
}
795803

796804
// scale options

0 commit comments

Comments (0)