From 8be79366ea058dbf1352369dd7dd5bca05d8725d Mon Sep 17 00:00:00 2001
From: Zou Gaoyuan
Date: Wed, 23 Jul 2025 18:04:02 +0800
Subject: [PATCH 1/6] fix bug:vector_norm test=develop

---
 paddle/phi/kernels/gpu/p_norm_grad_kernel.cu | 66 +++++++++++++-------
 paddle/phi/kernels/gpu/reduce_kernel.cu      |  4 +-
 2 files changed, 45 insertions(+), 25 deletions(-)

diff --git a/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu
index 5efd6a36a5399f..4efd1124f129fe 100644
--- a/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu
@@ -14,32 +14,21 @@
 
 #include "paddle/phi/kernels/p_norm_grad_kernel.h"
 
+#include <vector>
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/abs_kernel.h"
+#include "paddle/phi/kernels/elementwise_multiply_kernel.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/funcs/reduce_grad_functions.h"
+#include "paddle/phi/kernels/reduce_amax_grad_kernel.h"
+#include "paddle/phi/kernels/sign_kernel.h"
 
 namespace phi {
 
-template <typename T>
-struct AbsMaxAndMinGradFunctor {
-  template <typename Context,
-            typename X,
-            typename Y,
-            typename DX,
-            typename DY,
-            typename Dim>
-  void operator()(const Context& place,
-                  X* x,
-                  Y* y,
-                  DX* dx,
-                  DY* dy,
-                  const Dim& dim,
-                  int size) {
-    dx->device(place) = dy->broadcast(dim) * (*x).sign() *
-                        ((*x).abs() == y->broadcast(dim)).template cast<T>();
-  }
-};
-
 template <typename T>
 struct PNormGradFunctor {
   using MT = typename phi::dtype::MPTypeTrait<T>::Type;
@@ -109,17 +98,45 @@ void PNormGradKernel(const Context& dev_ctx,
   auto xdim = in_x->dims();
   bool reduce_all = (in_norm->numel() == 1);
-  if (axis < 0) axis = xdim.size() + axis;
+  if (axis < 0) {
+    axis = xdim.size() + axis;
+  }
   const std::vector<int> dims = {axis};
 
   if (porder == 0) {
     phi::funcs::SetConstant<Context, T> set_zero;
     set_zero(dev_ctx, out_dx, static_cast<T>(0));
   } else if (porder == INFINITY || porder == -INFINITY) {
-    AbsMaxAndMinGradFunctor<T> functor;
-    funcs::LaunchReduceGradKernel<Context, T, AbsMaxAndMinGradFunctor<T>>(
-        dev_ctx, in_x, in_norm, in_norm_dy, out_dx, functor, dims, reduce_all);
+    std::vector<int64_t> dims_for_amax;
+    if (reduce_all) {
+      dims_for_amax.resize(xdim.size());
+      for (int i = 0; i < xdim.size(); ++i) dims_for_amax[i] = i;
+    } else {
+      dims_for_amax.push_back(axis);
+    }
+
+    DenseTensor x_abs;
+    x_abs.Resize(in_x->dims());
+    dev_ctx.template Alloc<T>(&x_abs);
+    phi::AbsKernel<T, Context>(dev_ctx, *in_x, &x_abs);
+    DenseTensor amax_grad_out;
+    amax_grad_out.Resize(in_x->dims());
+    dev_ctx.template Alloc<T>(&amax_grad_out);
+    phi::ReduceAMaxGradKernel<T, Context>(dev_ctx,
+                                          x_abs,
+                                          *in_norm,
+                                          *in_norm_dy,
+                                          dims_for_amax,
+                                          keepdim,
+                                          reduce_all,
+                                          &amax_grad_out);
+    DenseTensor x_sign;
+    x_sign.Resize(in_x->dims());
+    dev_ctx.template Alloc<T>(&x_sign);
+    phi::SignKernel<T, Context>(dev_ctx, *in_x, &x_sign);
+
+    phi::MultiplyKernel<T, Context>(dev_ctx, amax_grad_out, x_sign, out_dx);
   } else {
     auto functor = PNormGradFunctor<T>(porder, epsilon);
     funcs::LaunchReduceGradKernel<Context, T, PNormGradFunctor<T>>(
@@ -127,6 +144,7 @@ void PNormGradKernel(const Context& dev_ctx,
   }
 }
 }  // namespace phi
+
 PD_REGISTER_KERNEL(p_norm_grad,
                    GPU,
                    ALL_LAYOUT,
diff --git a/paddle/phi/kernels/gpu/reduce_kernel.cu b/paddle/phi/kernels/gpu/reduce_kernel.cu
index 95132d09e2cc22..d06e976c4eb0c5 100644
--- a/paddle/phi/kernels/gpu/reduce_kernel.cu
+++ b/paddle/phi/kernels/gpu/reduce_kernel.cu
@@ -262,7 +262,9 @@ PD_REGISTER_KERNEL(amax_grad,
                    float,
                    double,
                    int,
-                   int64_t) {}
+                   int64_t,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 
 PD_REGISTER_KERNEL(amin_grad,
                    GPU,
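Patch 1 above replaces the hand-rolled Eigen functor for the infinity-norm gradient with a composition of existing kernels (AbsKernel, ReduceAMaxGradKernel, SignKernel, MultiplyKernel) and registers float16/bfloat16 for amax_grad so the composition covers those dtypes as well. For reference, a minimal CPU sketch of the identity being computed, for the 1-D reduce-all case; the function name is made up for illustration, and the routing below sends the gradient to every tied maximum as the old functor did (Paddle's amax_grad may split ties differently):

#include <cmath>
#include <cstdio>
#include <vector>

// dL/dx for y = ||x||_inf, given y from the forward pass and dL/dy.
// d||x||_inf/dx is zero except where |x_i| attains the maximum, where it
// equals sign(x_i) * dL/dy: amax-grad over |x|, re-signed by sign(x).
std::vector<float> pnorm_inf_grad(const std::vector<float>& x,
                                  float norm,        // ||x||_inf
                                  float grad_out) {  // upstream dL/dy
  std::vector<float> dx(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    // amax-grad step: route grad_out to positions where |x_i| hits the max
    float routed = (std::fabs(x[i]) == norm) ? grad_out : 0.0f;
    // sign step: restore the sign that the abs discarded
    float sign =
        static_cast<float>(x[i] > 0.0f) - static_cast<float>(x[i] < 0.0f);
    dx[i] = routed * sign;
  }
  return dx;
}

int main() {
  std::vector<float> x = {1.0f, -3.0f, 2.0f};
  std::vector<float> dx = pnorm_inf_grad(x, 3.0f, 1.0f);  // expect 0 -1 0
  for (float v : dx) std::printf("%g ", v);
  std::printf("\n");
  return 0;
}

The point of the decomposition is reuse: each stage already has a tested, dtype-complete GPU kernel, so the infinity-norm gradient inherits fp16/bf16 support instead of duplicating Eigen logic.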
From 2231fa5c3aa0daa664db9684bcb77b7ae81846af Mon Sep 17 00:00:00 2001
From: Zou Gaoyuan
Date: Thu, 21 Aug 2025 12:11:34 +0800
Subject: [PATCH 2/6] bugfix:p_norm test=develop

---
 paddle/phi/kernels/gpu/p_norm_grad_kernel.cu | 29 +++++---------------
 paddle/phi/kernels/gpu/p_norm_kernel.cu      | 23 ++++++++++++++--
 2 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu
index 4efd1124f129fe..1e9cbd364371b3 100644
--- a/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu
@@ -50,29 +50,14 @@ struct PNormGradFunctor {
                   DY* dy,
                   const Dim& dim,
                   int size) {
-    auto x_mt = x->template cast<MT>();
-    auto y_mt = y->template cast<MT>();
-    auto dy_mt = dy->template cast<MT>();
-
-    auto norm_pow = y_mt.pow(-this->porder);
-    auto mask_norm_nonzero = (y_mt != static_cast<MT>(0)).template cast<MT>();
-
-    // Set to 0 where porder < 0 and x == 0
-    MT zero = static_cast<MT>(0);
-    auto mask_x_zero = (x_mt == zero).template cast<MT>();
-
-    MT is_porder_negative =
-        this->porder < zero ? static_cast<MT>(1) : static_cast<MT>(0);
-    auto invalid_mask = (mask_x_zero * is_porder_negative);
-    auto safe_pow =
-        x_mt.abs().pow(this->porder) * (static_cast<MT>(1) - invalid_mask);
-
+    auto unstable_term = (*x).abs().pow(this->porder);
+    auto mask = (*x) == x->constant(static_cast<T>(0));
+    auto stable_term =
+        mask.select(x->constant(static_cast<T>(0)), unstable_term);
+    auto self_scaled = (*x).sign() * stable_term;
+    auto norm_term = (*y).pow(-this->porder);
     dx->device(place) =
-        (safe_pow * x_mt.sign() * dy_mt.broadcast(dim) *
-         norm_pow.broadcast(dim) *
-         mask_norm_nonzero.broadcast(dim)  // Mask out positions where norm == 0
-         )
-            .template cast<T>();
+        self_scaled * dy->broadcast(dim) * norm_term.broadcast(dim);
   }
diff --git a/paddle/phi/kernels/gpu/p_norm_kernel.cu b/paddle/phi/kernels/gpu/p_norm_kernel.cu
index 8809b082b7a826..8a91a9a84c132e 100644
--- a/paddle/phi/kernels/gpu/p_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/p_norm_kernel.cu
@@ -22,6 +22,9 @@
 #include "paddle/phi/kernels/funcs/reduce_function.h"
 #include "paddle/phi/kernels/gpu/reduce.h"
 
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/phi/kernels/activation_kernel.h"
+
 namespace phi {
 template <typename T>
 struct NonzeroFunctor {
@@ -134,8 +137,24 @@ void PNormKernel(const Context& dev_ctx,
       dev_ctx, *in_x, out_norm, FabsFunctor<T>(), reduce_axis);
   } else if (porder == 2.0) {
     // fast 2-norm
-    phi::funcs::ReduceKernel<T, T, kps::AddFunctor, SquareFunctor<T>>(
-        dev_ctx, *in_x, &out_temp, SquareFunctor<T>(), reduce_axis);
+    using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+    phi::DenseTensor temp_sum_of_squares_hp;
+    temp_sum_of_squares_hp.Resize(out_norm->dims());
+    dev_ctx.template Alloc<MT>(&temp_sum_of_squares_hp);
+    phi::funcs::ReduceKernel<T, MT, kps::AddFunctor, SquareFunctor<T, MT>>(
+        dev_ctx,
+        *in_x,
+        &temp_sum_of_squares_hp,
+        SquareFunctor<T, MT>(),
+        reduce_axis);
+
+    phi::DenseTensor temp_norm_hp;
+    temp_norm_hp.Resize(out_norm->dims());
+    dev_ctx.template Alloc<MT>(&temp_norm_hp);
+    phi::SqrtKernel<MT, Context>(dev_ctx, temp_sum_of_squares_hp, &temp_norm_hp);
+    phi::CastKernel<MT, Context>(dev_ctx, temp_norm_hp, out_norm->dtype(), out_norm);
+    return;
+
   } else if (porder == 3.0) {
     // fast 3-norm
     phi::funcs::ReduceKernel<T, T, kps::AddFunctor, FabsCubicFunctor<T>>(
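Patch 2's gradient rewrite rests on one numerical fact: the functor's porder member holds p - 1, so for p < 1 it is negative, |x|^(p-1) evaluates to inf at x == 0, and the subsequent multiply by sign(0) == 0 yields NaN instead of the intended 0; mask.select substitutes an exact zero at those positions. The forward-pass change in the same patch accumulates the fast 2-norm in the higher-precision MT type (float for float16/bfloat16 inputs), takes the sqrt there, and casts back once, so the running sum of squares cannot overflow half precision. A standalone scalar rendering of the guarded gradient expression (hypothetical helper name, not the Paddle API):

#include <cmath>
#include <cstdio>

// One element of dL/dx for y = ||x||_p:
//   sign(x) * |x|^(p-1) * y^(1-p) * dL/dy
float pnorm_grad_term(float x, float norm, float grad_out, float p) {
  float unstable = std::pow(std::fabs(x), p - 1.0f);  // inf at x==0 when p<1
  float stable = (x == 0.0f) ? 0.0f : unstable;       // the mask.select guard
  float sign = static_cast<float>(x > 0.0f) - static_cast<float>(x < 0.0f);
  return sign * stable * grad_out * std::pow(norm, 1.0f - p);
}

int main() {
  // Unguarded, p = 0.5 gives pow(0, -0.5) = inf and sign(0) * inf = nan.
  std::printf("x=0: %g\n", pnorm_grad_term(0.0f, 2.0f, 1.0f, 0.5f));  // 0
  std::printf("x=2: %g\n", pnorm_grad_term(2.0f, 4.0f, 1.0f, 2.0f));  // 0.5
  return 0;
}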
From 55c2649aa884f69123d8089d76413d502af33636 Mon Sep 17 00:00:00 2001
From: Zou Gaoyuan
Date: Thu, 21 Aug 2025 16:54:34 +0800
Subject: [PATCH 3/6] bugfix:p_norm test=develop

---
 paddle/phi/kernels/gpu/p_norm_grad_kernel.cu | 55 ++++++++++++++++++--
 1 file changed, 51 insertions(+), 4 deletions(-)

diff --git a/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu
index 1e9cbd364371b3..0e8eeb44241702 100644
--- a/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu
@@ -33,7 +33,7 @@ template <typename T>
 struct PNormGradFunctor {
   using MT = typename phi::dtype::MPTypeTrait<T>::Type;
   HOSTDEVICE explicit inline PNormGradFunctor(float porder, float eps) {
-    this->porder = static_cast<MT>(porder - 1.);
+    this->porder = static_cast<MT>(porder - 1.0f);
     this->eps = static_cast<MT>(eps);
   }
 
@@ -50,12 +50,16 @@ struct PNormGradFunctor {
                   DY* dy,
                   const Dim& dim,
                   int size) {
-    auto unstable_term = (*x).abs().pow(this->porder);
+    auto unstable_term =
+        (*x).abs().template cast<MT>().pow(this->porder).template cast<T>();
+
     auto mask = (*x) == x->constant(static_cast<T>(0));
     auto stable_term =
         mask.select(x->constant(static_cast<T>(0)), unstable_term);
     auto self_scaled = (*x).sign() * stable_term;
-    auto norm_term = (*y).pow(-this->porder);
+
+    auto norm_term =
+        (*y).template cast<MT>().pow(-this->porder).template cast<T>();
     dx->device(place) =
         self_scaled * dy->broadcast(dim) * norm_term.broadcast(dim);
   }
@@ -64,6 +68,50 @@ struct PNormGradFunctor {
   MT porder;
   MT eps;
 };
+// template <typename Context,
+//           typename X,
+//           typename Y,
+//           typename DX,
+//           typename DY,
+//           typename Dim>
+// void operator()(const Context& place,
+//                 X* x,
+//                 Y* y,
+//                 DX* dx,
+//                 DY* dy,
+//                 const Dim& dim,
+//                 int size) {
+//   auto x_mt = x->template cast<MT>();
+//   auto y_mt = y->template cast<MT>();
+//   auto dy_mt = dy->template cast<MT>();
+
+//   auto norm_pow = y_mt.pow(-this->porder);
+//   auto mask_norm_nonzero = (y_mt != static_cast<MT>(0)).template
+//   cast<MT>();
+
+//   // Set to 0 where porder < 0 and x == 0
+//   MT zero = static_cast<MT>(0);
+//   auto mask_x_zero = (x_mt == zero).template cast<MT>();
+
+//   MT is_porder_negative =
+//       this->porder < zero ? static_cast<MT>(1) : static_cast<MT>(0);
+//   auto invalid_mask = (mask_x_zero * is_porder_negative);
+//   auto safe_pow =
+//       x_mt.abs().pow(this->porder) * (static_cast<MT>(1) - invalid_mask);
+
+//   dx->device(place) =
+//       (safe_pow * x_mt.sign() * dy_mt.broadcast(dim) *
+//        norm_pow.broadcast(dim) *
+//        mask_norm_nonzero.broadcast(dim)  // Mask out positions where norm
+//        == 0
+//       )
+//           .template cast<T>();
+// }
+
+// MT porder;
+// MT eps;
+// };
+
 template <typename T, typename Context>
 void PNormGradKernel(const Context& dev_ctx,
                      const DenseTensor& x,
@@ -118,8 +166,7 @@ void PNormGradKernel(const Context& dev_ctx,
     x_sign.Resize(in_x->dims());
     dev_ctx.template Alloc<T>(&x_sign);
     phi::SignKernel<T, Context>(dev_ctx, *in_x, &x_sign);
-
     phi::MultiplyKernel<T, Context>(dev_ctx, amax_grad_out, x_sign, out_dx);
   } else {
     auto functor = PNormGradFunctor<T>(porder, epsilon);
     funcs::LaunchReduceGradKernel<Context, T, PNormGradFunctor<T>>(

From 7828c52e1d25b28c9d898f87c2a1d9ba7f21fea3 Mon Sep 17 00:00:00 2001
From: Zou Gaoyuan
Date: Thu, 21 Aug 2025 17:13:48 +0800
Subject: [PATCH 4/6] bugfix:p_norm test=develop

---
 paddle/phi/kernels/gpu/p_norm_grad_kernel.cu | 2 --
 1 file changed, 2 deletions(-)

diff --git a/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu
index 0e8eeb44241702..4c4b10d13dd3d2 100644
--- a/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu
@@ -52,12 +52,10 @@ struct PNormGradFunctor {
                   int size) {
     auto unstable_term =
         (*x).abs().template cast<MT>().pow(this->porder).template cast<T>();
-
     auto mask = (*x) == x->constant(static_cast<T>(0));
     auto stable_term =
         mask.select(x->constant(static_cast<T>(0)), unstable_term);
     auto self_scaled = (*x).sign() * stable_term;
-
     auto norm_term =
         (*y).template cast<MT>().pow(-this->porder).template cast<T>();
     dx->device(place) =
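Patches 3 and 4 push the two pow calls through the higher-precision compute type MT (float when T is float16 or bfloat16) and cast back to T afterwards, since half-precision pow both loses accuracy and overflows easily. A scalar sketch of that cast-pow-cast pattern, shown one precision level up (T = float, MT = double) because standard C++ has no half type; the function name is an illustrative assumption:

#include <cmath>
#include <cstdio>

using T = float;
using MT = double;  // stand-in for phi::dtype::MPTypeTrait<T>::Type

// Mirrors (*x).abs().template cast<MT>().pow(e).template cast<T>()
T abs_pow_in_mt(T x, MT exponent) {
  MT widened = std::fabs(static_cast<MT>(x));  // cast to MT, then abs
  return static_cast<T>(std::pow(widened, exponent));  // pow in MT, one cast
}

int main() {
  T x = 1.23456789f;
  MT e = -0.7;  // this->porder, i.e. p - 1, for p = 0.3
  T narrow = std::pow(std::fabs(x), static_cast<T>(e));  // pow done in T
  T wide = abs_pow_in_mt(x, e);                          // pow done in MT
  // The two can differ in the final bits; only the wide version rounds once.
  std::printf("narrow=%.9g wide=%.9g\n", narrow, wide);
  return 0;
}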
From 6faabd53add6fa8a73431b9e651412943ae0ee49 Mon Sep 17 00:00:00 2001
From: Zou Gaoyuan
Date: Thu, 21 Aug 2025 17:15:57 +0800
Subject: [PATCH 5/6] bugfix:p_norm test=develop

---
 paddle/phi/kernels/gpu/p_norm_grad_kernel.cu | 44 --------------------
 1 file changed, 44 deletions(-)

diff --git a/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu
index 4c4b10d13dd3d2..341989a475da81 100644
--- a/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu
@@ -66,50 +66,6 @@ struct PNormGradFunctor {
   MT porder;
   MT eps;
 };
-// template <typename Context,
-//           typename X,
-//           typename Y,
-//           typename DX,
-//           typename DY,
-//           typename Dim>
-// void operator()(const Context& place,
-//                 X* x,
-//                 Y* y,
-//                 DX* dx,
-//                 DY* dy,
-//                 const Dim& dim,
-//                 int size) {
-//   auto x_mt = x->template cast<MT>();
-//   auto y_mt = y->template cast<MT>();
-//   auto dy_mt = dy->template cast<MT>();
-
-//   auto norm_pow = y_mt.pow(-this->porder);
-//   auto mask_norm_nonzero = (y_mt != static_cast<MT>(0)).template
-//   cast<MT>();
-
-//   // Set to 0 where porder < 0 and x == 0
-//   MT zero = static_cast<MT>(0);
-//   auto mask_x_zero = (x_mt == zero).template cast<MT>();
-
-//   MT is_porder_negative =
-//       this->porder < zero ? static_cast<MT>(1) : static_cast<MT>(0);
-//   auto invalid_mask = (mask_x_zero * is_porder_negative);
-//   auto safe_pow =
-//       x_mt.abs().pow(this->porder) * (static_cast<MT>(1) - invalid_mask);
-
-//   dx->device(place) =
-//       (safe_pow * x_mt.sign() * dy_mt.broadcast(dim) *
-//        norm_pow.broadcast(dim) *
-//        mask_norm_nonzero.broadcast(dim)  // Mask out positions where norm
-//        == 0
-//       )
-//           .template cast<T>();
-// }
-
-// MT porder;
-// MT eps;
-// };
-
 template <typename T, typename Context>
 void PNormGradKernel(const Context& dev_ctx,
                      const DenseTensor& x,

From be1019d4461d191bd0c8db773ec6c8f45104ff9a Mon Sep 17 00:00:00 2001
From: Zhan Rongrui <2742392377@qq.com>
Date: Tue, 26 Aug 2025 10:18:58 +0800
Subject: [PATCH 6/6] improve

---
 paddle/phi/kernels/gpu/p_norm_kernel.cu | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/paddle/phi/kernels/gpu/p_norm_kernel.cu b/paddle/phi/kernels/gpu/p_norm_kernel.cu
index 8a91a9a84c132e..eaa8d51281ed10 100644
--- a/paddle/phi/kernels/gpu/p_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/p_norm_kernel.cu
@@ -22,7 +22,6 @@
 #include "paddle/phi/kernels/funcs/reduce_function.h"
 #include "paddle/phi/kernels/gpu/reduce.h"
 
-#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/phi/kernels/activation_kernel.h"
 
 namespace phi {
@@ -135,6 +134,7 @@ void PNormKernel(const Context& dev_ctx,
     // fast 1-norm
     phi::funcs::ReduceKernel<T, T, kps::AddFunctor, FabsFunctor<T>>(
         dev_ctx, *in_x, out_norm, FabsFunctor<T>(), reduce_axis);
+    return;
   } else if (porder == 2.0) {
     // fast 2-norm
     using MT = typename phi::dtype::MPTypeTrait<T>::Type;
@@ -154,7 +154,6 @@ void PNormKernel(const Context& dev_ctx,
     phi::SqrtKernel<MT, Context>(dev_ctx, temp_sum_of_squares_hp, &temp_norm_hp);
     phi::CastKernel<MT, Context>(dev_ctx, temp_norm_hp, out_norm->dtype(), out_norm);
     return;
-
   } else if (porder == 3.0) {
     // fast 3-norm
     phi::funcs::ReduceKernel<T, T, kps::AddFunctor, FabsCubicFunctor<T>>(
@@ -168,14 +167,11 @@ void PNormKernel(const Context& dev_ctx,
         UnsignedPowFunctor<T>(porder),
         reduce_axis);
   }
-
-  if (porder != 1.0) {
-    std::vector<const DenseTensor*> ins = {&out_temp};
-    std::vector<DenseTensor*> outs = {out_norm};
-    MT p_order_ = static_cast<MT>(1.f / porder);
-    phi::funcs::ElementwiseKernel<T>(
-        dev_ctx, ins, &outs, UnsignedPowFunctor<T>(p_order_));
-  }
+  std::vector<const DenseTensor*> ins = {&out_temp};
+  std::vector<DenseTensor*> outs = {out_norm};
+  MT p_order_ = static_cast<MT>(1.f / porder);
+  phi::funcs::ElementwiseKernel<T>(
+      dev_ctx, ins, &outs, UnsignedPowFunctor<T>(p_order_));
 #endif
 }
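With patch 6 every fast path in PNormKernel returns early, so the trailing 1/porder power now runs only on the general path and the old if (porder != 1.0) guard becomes dead weight. A reference scalar version of the resulting control flow (reduce-all, CPU, purely illustrative):

#include <cmath>
#include <cstdio>
#include <vector>

double p_norm(const std::vector<double>& x, double p) {
  if (p == 1.0) {  // fast 1-norm: sum of |x|, no final pow, early return
    double s = 0.0;
    for (double v : x) s += std::fabs(v);
    return s;
  }
  if (p == 2.0) {  // fast 2-norm: sqrt of sum of squares, early return
    double s = 0.0;
    for (double v : x) s += v * v;
    return std::sqrt(s);
  }
  double s = 0.0;  // general path: sum of |x|^p ...
  for (double v : x) s += std::pow(std::fabs(v), p);
  return std::pow(s, 1.0 / p);  // ... then the now-unconditional 1/p power
}

int main() {
  std::vector<double> x = {3.0, -4.0};
  // expect 7, 5, and 91^(1/3) = 4.49794
  std::printf("%g %g %g\n", p_norm(x, 1), p_norm(x, 2), p_norm(x, 3));
  return 0;
}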