@@ -30,6 +30,7 @@ namespace opencl {
3030void ConvImageCompute::PrepareForRun () {
3131 ReInitWhenNeeded ();
3232
33+ bool bias_buffer_flag = false ;
3334 auto & context = ctx_->As <OpenCLContext>();
3435 CHECK (context.cl_context () != nullptr );
3536 is_mali_ = context.cl_context ()->IsArmMali ();
@@ -243,6 +244,7 @@ void ConvImageCompute::PrepareForRun() {
243244 if (task_size <= threshold_2) {
244245 CLImageConverterNBlock converter;
245246 kernel_func_names_.push_back (" conv2d_1x1_mali_h1w2c1" );
247+ bias_buffer_flag = true ;
246248 const DDim& filter_image_dims =
247249 converter.InitImageDimInfoWith (filter_dims);
248250 filter_image_h_ = filter_image_dims[1 ];
@@ -259,14 +261,10 @@ void ConvImageCompute::PrepareForRun() {
259261 tensor_hold_filter_image_->raw_data (),
260262 tensor_hold_filter_image_->memory_size (),
261263 IoDirection::HtoD);
262-
263- MUTABLE_DATA_GPU (filter_gpu_image_,
264- filter_image_w_,
265- filter_image_h_,
266- filter_image_data);
267264 } else if (task_size <= threshold_4) {
268265 CLImageConverterN2Block converter;
269266 kernel_func_names_.push_back (" conv2d_1x1_mali_h1w2c2" );
267+ bias_buffer_flag = true ;
270268 const DDim& filter_image_dims =
271269 converter.InitImageDimInfoWith (filter_dims);
272270 filter_image_h_ = filter_image_dims[1 ];
@@ -283,14 +281,10 @@ void ConvImageCompute::PrepareForRun() {
283281 tensor_hold_filter_image_->raw_data (),
284282 tensor_hold_filter_image_->memory_size (),
285283 IoDirection::HtoD);
286-
287- MUTABLE_DATA_GPU (filter_gpu_image_,
288- filter_image_w_,
289- filter_image_h_,
290- filter_image_data);
291284 } else {
292285 CLImageConverterN2Block converter;
293286 kernel_func_names_.push_back (" conv2d_1x1_mali_h2w2c2" );
287+ bias_buffer_flag = true ;
294288 const DDim& filter_image_dims =
295289 converter.InitImageDimInfoWith (filter_dims);
296290 filter_image_h_ = filter_image_dims[1 ];
@@ -306,11 +300,6 @@ void ConvImageCompute::PrepareForRun() {
306300 tensor_hold_filter_image_->raw_data (),
307301 tensor_hold_filter_image_->memory_size (),
308302 IoDirection::HtoD);
309-
310- MUTABLE_DATA_GPU (filter_gpu_image_,
311- filter_image_w_,
312- filter_image_h_,
313- filter_image_data);
314303 }
315304 kernel_func_paths_.push_back (" image/conv2d_1x1_default_mali_kernel.cl" );
316305 impl_ = &ConvImageCompute::Conv2d1x1opt;
@@ -385,6 +374,7 @@ void ConvImageCompute::PrepareForRun() {
385374 if (is_mali_) {
386375 kernel_func_names_.push_back (" matrix_inner_product_mali" );
387376 kernel_func_names_.push_back (" transform_to_output_mali" );
377+ bias_buffer_flag = true ;
388378 } else {
389379 kernel_func_names_.push_back (" matrix_inner_product" );
390380 kernel_func_names_.push_back (" transform_to_output" );
@@ -403,21 +393,24 @@ void ConvImageCompute::PrepareForRun() {
403393 converter.NCHWToImage (filter_cpu, filter_image_data, filter_dims);
404394
405395 // for mali
406- w_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
407- auto * w_gpu_data = w_gpu_t_->mutable_data (
408- TARGET (kOpenCL ), tensor_hold_filter_image_->memory_size ());
409- TargetWrapperCL::MemcpySync (w_gpu_data,
410- tensor_hold_filter_image_->raw_data (),
411- tensor_hold_filter_image_->memory_size (),
412- IoDirection::HtoD);
413-
414- MUTABLE_DATA_GPU (filter_gpu_image_,
415- filter_image_w_,
416- filter_image_h_,
417- filter_image_data);
396+ if (is_mali_) {
397+ w_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
398+ auto * w_gpu_data = w_gpu_t_->mutable_data (
399+ TARGET (kOpenCL ), tensor_hold_filter_image_->memory_size ());
400+ TargetWrapperCL::MemcpySync (w_gpu_data,
401+ tensor_hold_filter_image_->raw_data (),
402+ tensor_hold_filter_image_->memory_size (),
403+ IoDirection::HtoD);
404+ } else {
405+ MUTABLE_DATA_GPU (filter_gpu_image_,
406+ filter_image_w_,
407+ filter_image_h_,
408+ filter_image_data);
409+ }
418410 } else if (groups_ == 1 ) {
419411 if (is_mali_ && input_tensor_n_ == 1 ) {
420412 kernel_func_names_.push_back (" conv2d_3x3_opt_mali" );
413+ bias_buffer_flag = true ;
421414 } else {
422415 kernel_func_names_.push_back (
423416 input_tensor_n_ > 1 ? " conv2d_3x3_multi_batch" : " conv2d_3x3_opt" );
@@ -439,17 +432,20 @@ void ConvImageCompute::PrepareForRun() {
439432 auto * filter_image_data = MUTABLE_DATA_CPU (tensor_hold_filter_image_);
440433 converter.NCHWToImage (filter_cpu, filter_image_data, filter_dims);
441434
442- w_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
443- auto * w_gpu_data = w_gpu_t_->mutable_data (
444- TARGET (kOpenCL ), tensor_hold_filter_image_->memory_size ());
445- TargetWrapperCL::MemcpySync (w_gpu_data,
446- tensor_hold_filter_image_->raw_data (),
447- tensor_hold_filter_image_->memory_size (),
448- IoDirection::HtoD);
449- MUTABLE_DATA_GPU (filter_gpu_image_,
450- filter_image_w_,
451- filter_image_h_,
452- filter_image_data);
435+ if (is_mali_ && input_tensor_n_ == 1 ) {
436+ w_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
437+ auto * w_gpu_data = w_gpu_t_->mutable_data (
438+ TARGET (kOpenCL ), tensor_hold_filter_image_->memory_size ());
439+ TargetWrapperCL::MemcpySync (w_gpu_data,
440+ tensor_hold_filter_image_->raw_data (),
441+ tensor_hold_filter_image_->memory_size (),
442+ IoDirection::HtoD);
443+ } else {
444+ MUTABLE_DATA_GPU (filter_gpu_image_,
445+ filter_image_w_,
446+ filter_image_h_,
447+ filter_image_data);
448+ }
453449 } else { // groups_ > 1
454450 kernel_func_names_.push_back (" conv2d_3x3" );
455451 kernel_func_paths_.push_back (" image/conv2d_3x3_kernel.cl" );
@@ -538,6 +534,7 @@ void ConvImageCompute::PrepareForRun() {
538534 // conv2d_7x7
539535 if (is_mali_ && input_tensor_n_ == 1 ) {
540536 kernel_func_names_.push_back (" conv2d_7x7_opt_mali" );
537+ bias_buffer_flag = true ;
541538 } else {
542539 kernel_func_names_.push_back (
543540 input_tensor_n_ > 1 ? " conv2d_7x7_multi_batch" : " conv2d_7x7_opt" );
@@ -552,15 +549,20 @@ void ConvImageCompute::PrepareForRun() {
552549
553550 auto * filter_image_data = MUTABLE_DATA_CPU (tensor_hold_filter_image_);
554551 converter.NCHWToImage (filter_cpu, filter_image_data, filter_dims);
555- w_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
556- auto * w_gpu_data = w_gpu_t_->mutable_data (
557- TARGET (kOpenCL ), tensor_hold_filter_image_->memory_size ());
558- TargetWrapperCL::MemcpySync (w_gpu_data,
559- tensor_hold_filter_image_->raw_data (),
560- tensor_hold_filter_image_->memory_size (),
561- IoDirection::HtoD);
562- MUTABLE_DATA_GPU (
563- filter_gpu_image_, filter_image_w_, filter_image_h_, filter_image_data);
552+ if (is_mali_ && input_tensor_n_ == 1 ) {
553+ w_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
554+ auto * w_gpu_data = w_gpu_t_->mutable_data (
555+ TARGET (kOpenCL ), tensor_hold_filter_image_->memory_size ());
556+ TargetWrapperCL::MemcpySync (w_gpu_data,
557+ tensor_hold_filter_image_->raw_data (),
558+ tensor_hold_filter_image_->memory_size (),
559+ IoDirection::HtoD);
560+ } else {
561+ MUTABLE_DATA_GPU (filter_gpu_image_,
562+ filter_image_w_,
563+ filter_image_h_,
564+ filter_image_data);
565+ }
564566
565567 impl_ = &ConvImageCompute::Conv2d7x7opt;
566568#endif
@@ -766,31 +768,37 @@ void ConvImageCompute::PrepareForRun() {
766768 bias_converter.NCHWToImage (
767769 bias_cpu_data, bias_image_data, conv_param_->bias ->dims ());
768770
769- bias_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
770- auto * f_gpu_data = bias_gpu_t_->mutable_data (
771- TARGET (kOpenCL ), tensor_hold_bias_image_->memory_size ());
772- TargetWrapperCL::MemcpySync (f_gpu_data,
773- tensor_hold_bias_image_->raw_data (),
774- tensor_hold_bias_image_->memory_size (),
775- IoDirection::HtoD);
776- MUTABLE_DATA_GPU (bias_gpu_image_,
777- bias_image_dims[0 ],
778- bias_image_dims[1 ],
779- bias_image_data);
771+ if (bias_buffer_flag) {
772+ bias_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
773+ auto * f_gpu_data = bias_gpu_t_->mutable_data (
774+ TARGET (kOpenCL ), tensor_hold_bias_image_->memory_size ());
775+ TargetWrapperCL::MemcpySync (f_gpu_data,
776+ tensor_hold_bias_image_->raw_data (),
777+ tensor_hold_bias_image_->memory_size (),
778+ IoDirection::HtoD);
779+ } else {
780+ MUTABLE_DATA_GPU (bias_gpu_image_,
781+ bias_image_dims[0 ],
782+ bias_image_dims[1 ],
783+ bias_image_data);
784+ }
780785 } else {
781786 bias_gpu_image_ = std::unique_ptr<Tensor>(new Tensor);
782787 CLImageConverterFolder bias_converter;
783788 tensor_hold_bias_image_->Resize ({1 , 1 , 1 , 4 });
784789 auto * bias_image_data = DATA_GPU (tensor_hold_bias_image_);
785790
786- bias_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
787- auto * f_gpu_data = bias_gpu_t_->mutable_data (
788- TARGET (kOpenCL ), tensor_hold_bias_image_->memory_size ());
789- TargetWrapperCL::MemcpySync (f_gpu_data,
790- tensor_hold_bias_image_->raw_data (),
791- tensor_hold_bias_image_->memory_size (),
792- IoDirection::HtoD);
793- MUTABLE_DATA_GPU (bias_gpu_image_, 1 , 1 , bias_image_data);
791+ if (bias_buffer_flag) {
792+ bias_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
793+ auto * f_gpu_data = bias_gpu_t_->mutable_data (
794+ TARGET (kOpenCL ), tensor_hold_bias_image_->memory_size ());
795+ TargetWrapperCL::MemcpySync (f_gpu_data,
796+ tensor_hold_bias_image_->raw_data (),
797+ tensor_hold_bias_image_->memory_size (),
798+ IoDirection::HtoD);
799+ } else {
800+ MUTABLE_DATA_GPU (bias_gpu_image_, 1 , 1 , bias_image_data);
801+ }
794802 }
795803
796804 // scale options
0 commit comments