From 3ec5ea0cdbbcd02d5880a2a97b8c96c85869ae40 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Wed, 30 Nov 2022 07:11:09 +0000 Subject: [PATCH 1/2] tm --- .../phi/kernels/gpu/batch_norm_grad_kernel.cu | 77 ++++++++++++------- 1 file changed, 49 insertions(+), 28 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index fd6e92b2ffe06d..daa53b77beee59 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -842,31 +842,36 @@ void BatchNormGradRawKernel(const Context &ctx, ctx.template Alloc>(d_bias)); } -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenBatchNormalizationBackward( -// dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), -// CudnnDataType::kZero(), CudnnDataType::kOne(), -// CudnnDataType::kZero(), data_desc_, -// transformed_x.template data(), data_desc_, -// transformed_d_y.template data(), data_desc_, -// transformed_d_x.template mutable_data(ctx.GetPlace()), -// bn_param_desc_, scale->template data>(), -// d_scale->template mutable_data>( -// ctx.GetPlace()), -// d_bias->template mutable_data>( -// ctx.GetPlace()), -// epsilon, saved_mean_data, saved_var_data)); + // TODO(wangran16): wait for MIOpen to improve the performance of BN + // PADDLE_ENFORCE_GPU_SUCCESS( + // platform::dynload::miopenBatchNormalizationBackward( + // dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), + // CudnnDataType::kZero(), CudnnDataType::kOne(), + // CudnnDataType::kZero(), data_desc_, + // transformed_x.template data(), data_desc_, + // transformed_d_y.template data(), data_desc_, + // transformed_d_x.template mutable_data(ctx.GetPlace()), + // bn_param_desc_, scale->template data>(), + // d_scale->template mutable_data>( + // ctx.GetPlace()), + // d_bias->template mutable_data>( + // ctx.GetPlace()), + // epsilon, saved_mean_data, saved_var_data)); #else - // CUDNN only support small batch size - // const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070; - const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 10240; - const size_t CUDNN_SPATIAL_THRESHOLD = 880801; - const bool use_native_kernel = - ((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) || - (x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD)); - if (use_native_kernel) { - if (x_dims.size() == 2) { + } + // CUDNN only support small batch size + // const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070; + const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 10240; + const size_t CUDNN_SPATIAL_THRESHOLD = 880801; + bool use_native_nhwc = + d_x ? (x_dims.size() == 4 && compute_format == DataLayout::kNHWC) + : false; + const bool use_native_kernel = + ((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) || + (x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD)); + if (use_native_nhwc || (d_x && d_scale && d_bias)) { + if (use_native_kernel || use_native_nhwc) { + if (x_dims.size() == 2 || use_native_nhwc) { dim3 block; dim3 grid; const int block_size = 512; @@ -937,6 +942,21 @@ void BatchNormGradRawKernel(const Context &ctx, flag_ptr); } // 2. reduce_sum(x, dy, mean) => dscale, dbias + BatchNormParamType *dscale = nullptr; + BatchNormParamType *dbias = nullptr; + bool with_scale = false; + if (d_scale && d_bias) { + dscale = ctx.template Alloc>(d_scale); + dbias = ctx.template Alloc>(d_bias); + } else { + DenseTensor dscale_mem = + phi::Empty, Context>(ctx, {C}); + DenseTensor dbias_mem = + phi::Empty, Context>(ctx, {C}); + dscale = dscale_mem.data>(); + dbias = dbias_mem.data>(); + } + BNBackward2DChannelLastStage2 <<>>( transformed_d_y.template data(), @@ -948,8 +968,8 @@ void BatchNormGradRawKernel(const Context &ctx, H * W * D, epsilon, block_data_ptr, - ctx.template Alloc>(d_scale), - ctx.template Alloc>(d_bias), + dscale, + dbias, flag_ptr); // 3. elementwise_mul(scale, mean, inv_var, dy, dscale, dbias) => dx @@ -958,8 +978,8 @@ void BatchNormGradRawKernel(const Context &ctx, transformed_d_y.template data(), transformed_x.template data(), scale.template data>(), - d_scale->data>(), - d_bias->data>(), + dscale, + dbias, mean_ptr, variance_ptr, C, @@ -1169,6 +1189,7 @@ void BatchNormGradRawKernel(const Context &ctx, paddle::platform::dynload::cudnnDestroyTensorDescriptor( bn_param_desc_)); #endif + } else { const auto *running_mean = mean.get_ptr(); const auto *running_var = variance.get_ptr(); From e4fbe2cf19cc5e8cd3bbcfba8dfd0093b32b79c0 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Wed, 30 Nov 2022 07:12:50 +0000 Subject: [PATCH 2/2] tp --- .../phi/kernels/gpu/batch_norm_grad_kernel.cu | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index daa53b77beee59..828c5b29bdb9da 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -842,21 +842,21 @@ void BatchNormGradRawKernel(const Context &ctx, ctx.template Alloc>(d_bias)); } - // TODO(wangran16): wait for MIOpen to improve the performance of BN - // PADDLE_ENFORCE_GPU_SUCCESS( - // platform::dynload::miopenBatchNormalizationBackward( - // dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), - // CudnnDataType::kZero(), CudnnDataType::kOne(), - // CudnnDataType::kZero(), data_desc_, - // transformed_x.template data(), data_desc_, - // transformed_d_y.template data(), data_desc_, - // transformed_d_x.template mutable_data(ctx.GetPlace()), - // bn_param_desc_, scale->template data>(), - // d_scale->template mutable_data>( - // ctx.GetPlace()), - // d_bias->template mutable_data>( - // ctx.GetPlace()), - // epsilon, saved_mean_data, saved_var_data)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenBatchNormalizationBackward( +// dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), +// CudnnDataType::kZero(), CudnnDataType::kOne(), +// CudnnDataType::kZero(), data_desc_, +// transformed_x.template data(), data_desc_, +// transformed_d_y.template data(), data_desc_, +// transformed_d_x.template mutable_data(ctx.GetPlace()), +// bn_param_desc_, scale->template data>(), +// d_scale->template mutable_data>( +// ctx.GetPlace()), +// d_bias->template mutable_data>( +// ctx.GetPlace()), +// epsilon, saved_mean_data, saved_var_data)); #else } // CUDNN only support small batch size