From b881367dd1104f899cb3b240670d9701891dd84a Mon Sep 17 00:00:00 2001 From: Zjq9409 <15205085056@163.com> Date: Tue, 3 Aug 2021 12:17:45 +0000 Subject: [PATCH 1/5] optimization channel --- paddle/fluid/operators/batch_norm_op.cc | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index b2cffc3f9063c1..3ef5fefde39218 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -295,8 +295,7 @@ class BatchNormKernel bool global_stats = test_mode || use_global_stats; const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); + DataLayout data_layout = framework::StringToDataLayout(data_layout_str); const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); @@ -353,6 +352,9 @@ class BatchNormKernel return; } + if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) { + data_layout = DataLayout::kNHWC; + } switch (data_layout) { case DataLayout::kNCHW: { ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); @@ -578,8 +580,7 @@ class BatchNormGradKernel bool use_global_stats = ctx.Attr("use_global_stats"); const bool is_test = ctx.Attr("is_test"); const float epsilon = ctx.Attr("epsilon"); - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); + DataLayout data_layout = framework::StringToDataLayout(data_layout_str); auto *d_x = ctx.Output(framework::GradVarName("X")); auto *d_scale = ctx.Output(framework::GradVarName("Scale")); @@ -703,6 +704,9 @@ class BatchNormGradKernel dy_sum_arr.setZero(); dy_mul_x_sub_mean_mul_invstd_sum_arr.setZero(); + if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) { + data_layout = DataLayout::kNHWC; + } // inplace calculation // Y: ((x - est_mean) * (inv_var) * scale + bias // formula transform ====> From c51c122b12818cbf33655a56265a93a196afdb60 Mon Sep 17 
00:00:00 2001 From: Zjq9409 <15205085056@163.com> Date: Wed, 4 Aug 2021 11:49:34 +0000 Subject: [PATCH 2/5] support NCHW multi-threading --- paddle/fluid/operators/batch_norm_op.cc | 48 ++++++++++++++++++------- 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 3ef5fefde39218..298446c94ba76f 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -352,16 +352,25 @@ class BatchNormKernel return; } + // input dimension is 2 and the format is NCHW. The input can be regarded + // as NHWC format if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) { data_layout = DataLayout::kNHWC; } + switch (data_layout) { case DataLayout::kNCHW: { ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif for (int nc = 0; nc < N * C; ++nc) { saved_mean_e(nc % C) += x_arr.col(nc).sum(); } saved_mean_e /= N * sample_size; +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif for (int nc = 0; nc < N * C; ++nc) { saved_variance_e(nc % C) += (x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm(); @@ -434,6 +443,9 @@ class BatchNormKernel EigenArrayMap y_arr(y->mutable_data(ctx.GetPlace()), sample_size, N * C); ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif for (int nc = 0; nc < N * C; ++nc) { y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C); } @@ -704,9 +716,12 @@ class BatchNormGradKernel dy_sum_arr.setZero(); dy_mul_x_sub_mean_mul_invstd_sum_arr.setZero(); + // input dimension is 2 and the format is NCHW. 
The input can be regarded as + // NHWC format if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) { data_layout = DataLayout::kNHWC; } + // inplace calculation // Y: ((x - est_mean) * (inv_var) * scale + bias // formula transform ====> @@ -721,6 +736,9 @@ class BatchNormGradKernel EigenArrayMap x_data(px.mutable_data(ctx.GetPlace()), sample_size, N * C); ConstEigenArrayMap y_data(x->data(), sample_size, N * C); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif for (int nc = 0; nc < N * C; ++nc) { x_data.col(nc) = (y_data.col(nc) - bias_arr(nc % C)) / scale_inv_var_nhw(nc % C) / scale_coefff + @@ -729,12 +747,14 @@ class BatchNormGradKernel } ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); ConstEigenArrayMap d_y_arr(d_y->data(), sample_size, N * C); - +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif for (int nc = 0; nc < N * C; ++nc) { - int c = nc % C; - dy_sum_arr(c) += d_y_arr.col(nc).sum(); - dy_mul_x_sub_mean_mul_invstd_sum_arr(c) += - ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * d_y_arr.col(nc)) + dy_sum_arr(nc % C) += d_y_arr.col(nc).sum(); + dy_mul_x_sub_mean_mul_invstd_sum_arr(nc % C) += + ((x_arr.col(nc) - mean_arr(nc % C)) * inv_var_arr(nc % C) * + d_y_arr.col(nc)) .sum(); } @@ -747,14 +767,16 @@ class BatchNormGradKernel EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), sample_size, N * C); if (!use_global_stats) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif for (int nc = 0; nc < N * C; ++nc) { - int c = nc % C; d_x_arr.col(nc) = - scale_inv_var_nhw(c) * - (d_y_arr.col(nc) * N * sample_size - dy_sum_arr(c) - - (x_arr.col(nc) - mean_arr[c]) * - dy_mul_x_sub_mean_mul_invstd_sum_arr(c) * - inv_var_arr(c)); + scale_inv_var_nhw(nc % C) * + (d_y_arr.col(nc) * N * sample_size - dy_sum_arr(nc % C) - + (x_arr.col(nc) - mean_arr[nc % C]) * + dy_mul_x_sub_mean_mul_invstd_sum_arr(nc % C) * + inv_var_arr(nc % C)); } } else { for (int nc = 0; nc < N * C; ++nc) { @@ -779,7 +801,6 @@ class 
BatchNormGradKernel } ConstEigenArrayMap x_arr(x->data(), C, N * sample_size); ConstEigenArrayMap d_y_arr(d_y->data(), C, N * sample_size); - for (int nhw = 0; nhw < N * sample_size; ++nhw) { dy_sum_arr += d_y_arr.col(nhw); dy_mul_x_sub_mean_mul_invstd_sum_arr += @@ -795,6 +816,9 @@ class BatchNormGradKernel EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), C, N * sample_size); if (!use_global_stats) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif for (int nhw = 0; nhw < N * sample_size; ++nhw) { d_x_arr.col(nhw) = scale_inv_var_nhw * From 60cc4ebe9e64c7f342bddc0f9e734bac7e0eb09e Mon Sep 17 00:00:00 2001 From: Zjq9409 <15205085056@163.com> Date: Thu, 5 Aug 2021 08:59:08 +0000 Subject: [PATCH 3/5] test --- paddle/fluid/operators/batch_norm_op.cc | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 298446c94ba76f..908f3b8e1505da 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -361,16 +361,11 @@ class BatchNormKernel switch (data_layout) { case DataLayout::kNCHW: { ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif + for (int nc = 0; nc < N * C; ++nc) { saved_mean_e(nc % C) += x_arr.col(nc).sum(); } saved_mean_e /= N * sample_size; -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif for (int nc = 0; nc < N * C; ++nc) { saved_variance_e(nc % C) += (x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm(); @@ -736,9 +731,6 @@ class BatchNormGradKernel EigenArrayMap x_data(px.mutable_data(ctx.GetPlace()), sample_size, N * C); ConstEigenArrayMap y_data(x->data(), sample_size, N * C); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif for (int nc = 0; nc < N * C; ++nc) { x_data.col(nc) = (y_data.col(nc) - bias_arr(nc % C)) / scale_inv_var_nhw(nc % C) / scale_coefff + @@ -747,9 +739,7 @@ class 
BatchNormGradKernel } ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); ConstEigenArrayMap d_y_arr(d_y->data(), sample_size, N * C); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif + for (int nc = 0; nc < N * C; ++nc) { dy_sum_arr(nc % C) += d_y_arr.col(nc).sum(); dy_mul_x_sub_mean_mul_invstd_sum_arr(nc % C) += From 37c3599ce2af961e360c7779ca32fe9821f2d1dc Mon Sep 17 00:00:00 2001 From: Zjq9409 <15205085056@163.com> Date: Thu, 5 Aug 2021 11:10:27 +0000 Subject: [PATCH 4/5] add multi-threading --- paddle/fluid/operators/batch_norm_op.cc | 66 ++++++++++++++++++++++--- 1 file changed, 58 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 908f3b8e1505da..f64bdd8a69d840 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -362,15 +362,65 @@ class BatchNormKernel case DataLayout::kNCHW: { ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); - for (int nc = 0; nc < N * C; ++nc) { - saved_mean_e(nc % C) += x_arr.col(nc).sum(); - } - saved_mean_e /= N * sample_size; - for (int nc = 0; nc < N * C; ++nc) { - saved_variance_e(nc % C) += - (x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm(); + auto x_data = x->data(); + auto mean = saved_mean->mutable_data(ctx.GetPlace()); + auto variance = saved_variance->mutable_data(ctx.GetPlace()); + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int nc = 0; nc < C; ++nc) { + double mean_sum = 0.0f; + auto mean_data = + x_data + nc * sample_size; // mean value of each channel +#ifdef PADDLE_WITH_MKLML +#pragma omp simd +#endif + for (auto i = 0; i < N; ++i) { + for (auto j = 0; j < sample_size; ++j) { + mean_sum += mean_data[j]; // add data for each channel + } + mean_data = mean_data + + C * sample_size; // jump to the same channel index + } + mean[nc] = mean_sum / (N * sample_size); + + double var_sum = 0.0f; + auto var_data = + x_data + nc * sample_size; // 
variance value of each channel +#ifdef PADDLE_WITH_MKLML +#pragma omp simd +#endif + for (auto i = 0; i < N; ++i) { + for (auto j = 0; j < sample_size; ++j) { + var_sum += static_cast((var_data[j] - mean[nc]) * + (var_data[j] - mean[nc])); + } + var_data = var_data + C * sample_size; + } + variance[nc] = var_sum / (N * sample_size); } - saved_variance_e /= N * sample_size; + + // for (int nc = 0; nc < N * C; ++nc) { + // saved_mean_e(nc % C) += x_arr.col(nc).sum(); + // } + // // int stride = C * sample_size; + // // auto x_data = x->data(); + + // // for(int nc=0; nc x_e(x_data, sample_size, C); + // // saved_mean_e += x_e.colwise().sum(); + // // x_data = x_data + stride; + // // } + + // saved_mean_e /= N * sample_size; + + // for (int nc = 0; nc < N * C; ++nc) { + // saved_variance_e(nc % C) += + // (x_arr.col(nc) - saved_mean_e(nc % + // C)).matrix().squaredNorm(); + // } + // saved_variance_e /= N * sample_size; break; } case DataLayout::kNHWC: { From 0f3c528d85e352feecfe6ca4c1fb0ac54d9092c9 Mon Sep 17 00:00:00 2001 From: Zjq9409 <15205085056@163.com> Date: Fri, 6 Aug 2021 07:18:17 +0000 Subject: [PATCH 5/5] optimization two dim NCHW format --- paddle/fluid/operators/batch_norm_op.cc | 120 ++++++------------------ 1 file changed, 31 insertions(+), 89 deletions(-) diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index f64bdd8a69d840..be17bf9a03fc19 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -331,6 +331,12 @@ class BatchNormKernel saved_mean->mutable_data(ctx.GetPlace()); saved_variance->mutable_data(ctx.GetPlace()); + // input dimension is 2 and the format is NCHW.
The input can be regarded + // as NHWC format + if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) { + data_layout = DataLayout::kNHWC; + } + if (!global_stats) { // saved_xx is use just in this batch of data EigenVectorArrayMap saved_mean_e( @@ -352,75 +358,18 @@ class BatchNormKernel return; } - // input dimension is 2 and the format is NCHW. The input can be regarded - // as NHWC format - if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) { - data_layout = DataLayout::kNHWC; - } - switch (data_layout) { case DataLayout::kNCHW: { ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); - - auto x_data = x->data(); - auto mean = saved_mean->mutable_data(ctx.GetPlace()); - auto variance = saved_variance->mutable_data(ctx.GetPlace()); - -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int nc = 0; nc < C; ++nc) { - double mean_sum = 0.0f; - auto mean_data = - x_data + nc * sample_size; // mean value of each channel -#ifdef PADDLE_WITH_MKLML -#pragma omp simd -#endif - for (auto i = 0; i < N; ++i) { - for (auto j = 0; j < sample_size; ++j) { - mean_sum += mean_data[j]; // add data for each channel - } - mean_data = mean_data + - C * sample_size; // jump to the same channel index - } - mean[nc] = mean_sum / (N * sample_size); - - double var_sum = 0.0f; - auto var_data = - x_data + nc * sample_size; // variance value of each channel -#ifdef PADDLE_WITH_MKLML -#pragma omp simd -#endif - for (auto i = 0; i < N; ++i) { - for (auto j = 0; j < sample_size; ++j) { - var_sum += static_cast((var_data[j] - mean[nc]) * - (var_data[j] - mean[nc])); - } - var_data = var_data + C * sample_size; - } - variance[nc] = var_sum / (N * sample_size); + for (int nc = 0; nc < N * C; ++nc) { + saved_mean_e(nc % C) += x_arr.col(nc).sum(); } - - // for (int nc = 0; nc < N * C; ++nc) { - // saved_mean_e(nc % C) += x_arr.col(nc).sum(); - // } - // // int stride = C * sample_size; - // // auto x_data = x->data(); - - // // for(int nc=0; nc x_e(x_data, 
sample_size, C); - // // saved_mean_e += x_e.colwise().sum(); - // // x_data = x_data + stride; - // // } - - // saved_mean_e /= N * sample_size; - - // for (int nc = 0; nc < N * C; ++nc) { - // saved_variance_e(nc % C) += - // (x_arr.col(nc) - saved_mean_e(nc % - // C)).matrix().squaredNorm(); - // } - // saved_variance_e /= N * sample_size; + saved_mean_e /= N * sample_size; + for (int nc = 0; nc < N * C; ++nc) { + saved_variance_e(nc % C) += + (x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm(); + } + saved_variance_e /= N * sample_size; break; } case DataLayout::kNHWC: { @@ -488,9 +437,6 @@ class BatchNormKernel EigenArrayMap y_arr(y->mutable_data(ctx.GetPlace()), sample_size, N * C); ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif for (int nc = 0; nc < N * C; ++nc) { y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C); } @@ -691,6 +637,12 @@ class BatchNormGradKernel : x_dims[x_dims.size() - 1]); const int sample_size = x->numel() / N / C; + // input dimension is 2 and the format is NCHW. The input can be regarded as + // NHWC format + if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) { + data_layout = DataLayout::kNHWC; + } + // init output if (d_x) { d_x->mutable_data(ctx.GetPlace()); @@ -761,12 +713,6 @@ class BatchNormGradKernel dy_sum_arr.setZero(); dy_mul_x_sub_mean_mul_invstd_sum_arr.setZero(); - // input dimension is 2 and the format is NCHW. 
The input can be regarded as - // NHWC format - if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) { - data_layout = DataLayout::kNHWC; - } - // inplace calculation // Y: ((x - est_mean) * (inv_var) * scale + bias // formula transform ====> @@ -791,10 +737,10 @@ class BatchNormGradKernel ConstEigenArrayMap d_y_arr(d_y->data(), sample_size, N * C); for (int nc = 0; nc < N * C; ++nc) { - dy_sum_arr(nc % C) += d_y_arr.col(nc).sum(); - dy_mul_x_sub_mean_mul_invstd_sum_arr(nc % C) += - ((x_arr.col(nc) - mean_arr(nc % C)) * inv_var_arr(nc % C) * - d_y_arr.col(nc)) + int c = nc % C; + dy_sum_arr(c) += d_y_arr.col(nc).sum(); + dy_mul_x_sub_mean_mul_invstd_sum_arr(c) += + ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * d_y_arr.col(nc)) .sum(); } @@ -807,16 +753,14 @@ class BatchNormGradKernel EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), sample_size, N * C); if (!use_global_stats) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif for (int nc = 0; nc < N * C; ++nc) { + int c = nc % C; d_x_arr.col(nc) = - scale_inv_var_nhw(nc % C) * - (d_y_arr.col(nc) * N * sample_size - dy_sum_arr(nc % C) - - (x_arr.col(nc) - mean_arr[nc % C]) * - dy_mul_x_sub_mean_mul_invstd_sum_arr(nc % C) * - inv_var_arr(nc % C)); + scale_inv_var_nhw(c) * + (d_y_arr.col(nc) * N * sample_size - dy_sum_arr(c) - + (x_arr.col(nc) - mean_arr[c]) * + dy_mul_x_sub_mean_mul_invstd_sum_arr(c) * + inv_var_arr(c)); } } else { for (int nc = 0; nc < N * C; ++nc) { @@ -841,6 +785,7 @@ class BatchNormGradKernel } ConstEigenArrayMap x_arr(x->data(), C, N * sample_size); ConstEigenArrayMap d_y_arr(d_y->data(), C, N * sample_size); + for (int nhw = 0; nhw < N * sample_size; ++nhw) { dy_sum_arr += d_y_arr.col(nhw); dy_mul_x_sub_mean_mul_invstd_sum_arr += @@ -856,9 +801,6 @@ class BatchNormGradKernel EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), C, N * sample_size); if (!use_global_stats) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif for (int nhw = 0; nhw < N * 
sample_size; ++nhw) { d_x_arr.col(nhw) = scale_inv_var_nhw *