From 1dc91dd232663ee0e5d502e92de01f20dce1ba69 Mon Sep 17 00:00:00 2001
From: co63oc
Date: Fri, 8 Aug 2025 13:30:11 +0800
Subject: [PATCH] Fix

---
 paddle/phi/kernels/autotune/auto_tune_base.h  |  30 +++--
 paddle/phi/kernels/funcs/cross_entropy.cc     |   4 +-
 .../kernels/funcs/gather_scatter_functor.cc   | 121 +++++++++++-------
 .../phi/kernels/funcs/math/cos_sim_functor.cc |   2 +-
 .../phi/kernels/funcs/math/cos_sim_functor.cu |   4 +-
 .../phi/kernels/funcs/math/cos_sim_functor.h  |   2 +-
 paddle/phi/kernels/funcs/scatter.cu.h         |   4 +-
 paddle/phi/kernels/gpu/cuda_gemm_kernel.h     |   2 +-
 .../index_elementwise_get_grad_kernel.h       |   2 +-
 paddle/phi/kernels/onednn/gaussian_kernel.cc  |   6 +-
 10 files changed, 106 insertions(+), 71 deletions(-)

diff --git a/paddle/phi/kernels/autotune/auto_tune_base.h b/paddle/phi/kernels/autotune/auto_tune_base.h
index 80d7028a3082f9..a4a858ff8eaf8b 100644
--- a/paddle/phi/kernels/autotune/auto_tune_base.h
+++ b/paddle/phi/kernels/autotune/auto_tune_base.h
@@ -63,7 +63,7 @@ class AutoTuneBase {
   }
 
   template <typename Context, typename... Args>
-  void Run(const Context& ctx,
+  void Run(const Context& dev_ctx,
            const AlgorithmType& algo,
            const size_t key,
            Args&&... args) {
@@ -78,7 +78,7 @@ if (use_autotune) {
         // All available kernels have ran while picking the best kernel,
         // so there may be no need for another kernel run.
-        auto best_idx = PickBestKernel(ctx, args...);
+        auto best_idx = PickBestKernel(dev_ctx, args...);
         cache.Set(key, best_idx);
       } else {
         kernels_[0].Run(args...);
       }
@@ -100,14 +100,14 @@
   }
 
   template <typename Context, typename... Args>
-  size_t PickBestKernel(const Context& ctx, Args&&... args) {
+  size_t PickBestKernel(const Context& dev_ctx, Args&&... args) {
     std::lock_guard<std::mutex> lock(mutex_);
     size_t best_idx = 0;
     float min_time = std::numeric_limits<float>::max();
 
     // Time cost test established in default stream.
     for (size_t i = 0; i < kernels_.size(); ++i) {
-      auto time = RunAndMeasureKernel(ctx, i, args...);
+      auto time = RunAndMeasureKernel(dev_ctx, i, args...);
       if (time < min_time) {
         min_time = time;
         best_idx = i;
@@ -118,15 +118,17 @@
   }
 
   template <typename Context, typename... Args>
-  float RunAndMeasureKernel(const Context& ctx, const int idx, Args&&... args) {
+  float RunAndMeasureKernel(const Context& dev_ctx,
+                            const int idx,
+                            Args&&... args) {
     // Regard 1st run as warmup, judge the compare result by the time cost
     // of rest cycles.
     constexpr int repeats = 11;
     phi::GpuTimer timer;
     float time_cost = 0;
-    const auto& stream = ctx.stream();
+    const auto& stream = dev_ctx.stream();
 
-    ctx.Wait();
+    dev_ctx.Wait();
     for (int i = 0; i < repeats; ++i) {
       timer.Start(stream);
       kernels_[idx].Run(args...);
@@ -158,7 +160,7 @@ class MatmulAutoTuner
   }
 
   template <typename Context, typename... Args>
-  void Run(const Context& ctx, const size_t key, Args... args) {
+  void Run(const Context& dev_ctx, const size_t key, Args... args) {
     this->is_init_ = true;
    this->CheckKernelSize();
     auto& cache = AutoTuneCache::Instance().GetMatmul();
@@ -168,7 +170,7 @@
     } else {
       bool use_autotune = AutoTuneStatus::Instance().UseAutoTune();
       if (use_autotune) {
-        auto best_idx = this->PickBestKernel(ctx, args...);
+        auto best_idx = this->PickBestKernel(dev_ctx, args...);
         cache.Set(key, best_idx);
       } else {
         this->kernels_[0].Run(args...);
@@ -210,7 +212,7 @@ class GatherGemmScatterAutoTuner
     return instance.get();
   }
 
-  void Run(const phi::GPUContext& ctx,
+  void Run(const phi::GPUContext& dev_ctx,
           const size_t key,
           T const alpha,
           T const beta,
@@ -227,15 +229,15 @@
    } else {
      // Set alpha to 0 and beta to 1 to avoid changing the value of d when
      // picking the best kernel
-      auto best_idx =
-          PickBestKernel(ctx, static_cast<T>(0), static_cast<T>(1), args...);
+      auto best_idx = PickBestKernel(
+          dev_ctx, static_cast<T>(0), static_cast<T>(1), args...);
      cache.Set(key, best_idx);
      this->kernels_[best_idx].Run(alpha, beta, args...);
    }
  }
 
 protected:
-  size_t PickBestKernel(const phi::GPUContext& ctx,
+  size_t PickBestKernel(const phi::GPUContext& dev_ctx,
                         const T& alpha,
                         const T& beta,
                         Args&... args) {
@@ -250,7 +252,7 @@
       // Some kernels may require more shared memory than available, skip these
       // kernels.
       try {
-        time = this->RunAndMeasureKernel(ctx, i, alpha, beta, args...);
+        time = this->RunAndMeasureKernel(dev_ctx, i, alpha, beta, args...);
         if (time < min_time) {
           min_time = time;
           best_idx = i;
diff --git a/paddle/phi/kernels/funcs/cross_entropy.cc b/paddle/phi/kernels/funcs/cross_entropy.cc
index 6616f07e68a10c..9fb68c155402f5 100644
--- a/paddle/phi/kernels/funcs/cross_entropy.cc
+++ b/paddle/phi/kernels/funcs/cross_entropy.cc
@@ -93,7 +93,7 @@ struct HardLabelCrossEntropyCPUFunctorImpl {
 
 template <typename DeviceContext, typename T>
 void CrossEntropyFunctor<DeviceContext, T>::operator()(
-    const DeviceContext& ctx,
+    const DeviceContext& dev_ctx,
     phi::DenseTensor* out,
     const phi::DenseTensor* prob,
     const phi::DenseTensor* labels,
@@ -110,7 +110,7 @@ void CrossEntropyFunctor<DeviceContext, T>::operator()(
     auto lbl = EigenMatrix<T>::From(*labels);
     auto loss = EigenMatrix<T>::From(*out);
 
-    loss.device(*ctx.eigen_device()) =
+    loss.device(*dev_ctx.eigen_device()) =
         -((lbl * in.log().unaryExpr(phi::funcs::TolerableValue<T>()))
               .reshape(batch_axis_remain)
               .sum(Eigen::DSizes<int, 1>(1)));
diff --git a/paddle/phi/kernels/funcs/gather_scatter_functor.cc b/paddle/phi/kernels/funcs/gather_scatter_functor.cc
index 95c9f69a2abfd8..f7274faebd6f08 100644
--- a/paddle/phi/kernels/funcs/gather_scatter_functor.cc
+++ b/paddle/phi/kernels/funcs/gather_scatter_functor.cc
@@ -77,7 +77,7 @@ struct cpu_gather_scatter_functor {
                   const std::string& method_name,
                   const func_t& reduce_op,
                   bool include_self,
-                  const phi::DeviceContext& ctx UNUSED) {
+                  const phi::DeviceContext& dev_ctx UNUSED) {
     if (index.numel() == 0) {
       return;
     }
@@ -237,7 +237,7 @@ void cpu_gather_kernel(phi::DenseTensor self,
                        const phi::DenseTensor& index,
                        phi::DenseTensor result,
                        bool include_self,
-                       const phi::DeviceContext& ctx) {
+                       const phi::DeviceContext& dev_ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
                              /*is_scatter_like=*/false>()(result,
@@ -247,7 +247,7 @@ void cpu_gather_kernel(phi::DenseTensor self,
                                                           "gather_out_cpu",
                                                           tensor_assign,
                                                           include_self,
-                                                          ctx);
+                                                          dev_ctx);
 }
 
 template <typename tensor_t, typename index_t>
@@ -256,7 +256,7 @@ void cpu_scatter_assign_kernel(phi::DenseTensor self,
                                const phi::DenseTensor& index,
                                phi::DenseTensor src,
                                bool include_self,
-                               const phi::DeviceContext& ctx) {
+                               const phi::DeviceContext& dev_ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
                              /*is_scatter_like=*/true>()(self,
@@ -266,7 +266,7 @@ void cpu_scatter_assign_kernel(phi::DenseTensor self,
                                                          "scatter_assign_cpu",
                                                          tensor_assign,
                                                          include_self,
-                                                         ctx);
+                                                         dev_ctx);
 }
 
 template <typename tensor_t, typename index_t>
@@ -275,11 +275,17 @@ void cpu_scatter_add_kernel(phi::DenseTensor self,
                             const phi::DenseTensor& index,
                             phi::DenseTensor src,
                             bool include_self,
-                            const phi::DeviceContext& ctx) {
+                            const phi::DeviceContext& dev_ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
-                             /*is_scatter_like=*/true>()(
-      self, dim, index, src, "scatter_add_cpu", reduce_add, include_self, ctx);
+                             /*is_scatter_like=*/true>()(self,
+                                                         dim,
+                                                         index,
+                                                         src,
+                                                         "scatter_add_cpu",
+                                                         reduce_add,
+                                                         include_self,
+                                                         dev_ctx);
 }
 
 template <typename tensor_t, typename index_t>
@@ -288,11 +294,17 @@ void cpu_scatter_mul_kernel(phi::DenseTensor self,
                             const phi::DenseTensor& index,
                             phi::DenseTensor src,
                             bool include_self,
-                            const phi::DeviceContext& ctx) {
+                            const phi::DeviceContext& dev_ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
-                             /*is_scatter_like=*/true>()(
-      self, dim, index, src, "scatter_mul_cpu", reduce_mul, include_self, ctx);
+                             /*is_scatter_like=*/true>()(self,
+                                                         dim,
+                                                         index,
+                                                         src,
+                                                         "scatter_mul_cpu",
+                                                         reduce_mul,
+                                                         include_self,
+                                                         dev_ctx);
 }
 
 template <typename tensor_t, typename index_t>
@@ -301,11 +313,17 @@ void cpu_scatter_mean_kernel(phi::DenseTensor self,
                              const phi::DenseTensor& index,
                              phi::DenseTensor src,
                              bool include_self,
-                             const phi::DeviceContext& ctx) {
+                             const phi::DeviceContext& dev_ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
-                             /*is_scatter_like=*/true>()(
-      self, dim, index, src, "scatter_mean_cpu", reduce_add, include_self, ctx);
+                             /*is_scatter_like=*/true>()(self,
+                                                         dim,
+                                                         index,
+                                                         src,
+                                                         "scatter_mean_cpu",
+                                                         reduce_add,
+                                                         include_self,
+                                                         dev_ctx);
 }
 
 template <typename tensor_t, typename index_t>
@@ -314,11 +332,17 @@ void cpu_scatter_max_kernel(phi::DenseTensor self,
                             const phi::DenseTensor& index,
                             phi::DenseTensor src,
                             bool include_self,
-                            const phi::DeviceContext& ctx) {
+                            const phi::DeviceContext& dev_ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
-                             /*is_scatter_like=*/true>()(
-      self, dim, index, src, "scatter_max_cpu", reduce_max, include_self, ctx);
+                             /*is_scatter_like=*/true>()(self,
+                                                         dim,
+                                                         index,
+                                                         src,
+                                                         "scatter_max_cpu",
+                                                         reduce_max,
+                                                         include_self,
+                                                         dev_ctx);
 }
 
 template <typename tensor_t, typename index_t>
@@ -327,11 +351,17 @@ void cpu_scatter_min_kernel(phi::DenseTensor self,
                             const phi::DenseTensor& index,
                             phi::DenseTensor src,
                             bool include_self,
-                            const phi::DeviceContext& ctx) {
+                            const phi::DeviceContext& dev_ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
-                             /*is_scatter_like=*/true>()(
-      self, dim, index, src, "scatter_min_cpu", reduce_min, include_self, ctx);
+                             /*is_scatter_like=*/true>()(self,
+                                                         dim,
+                                                         index,
+                                                         src,
+                                                         "scatter_min_cpu",
+                                                         reduce_min,
+                                                         include_self,
+                                                         dev_ctx);
 }
 
 template <typename tensor_t, typename index_t>
@@ -340,7 +370,7 @@ void cpu_scatter_input_grad_kernel(phi::DenseTensor self UNUSED,
                                    const phi::DenseTensor& index,
                                    phi::DenseTensor grad,
                                    bool include_self UNUSED,
-                                   const phi::DeviceContext& ctx UNUSED) {
+                                   const phi::DeviceContext& dev_ctx UNUSED) {
   auto* index_data = index.data<index_t>();
   auto* grad_data = grad.data<tensor_t>();
 
@@ -376,16 +406,17 @@
 }
 
 template <typename tensor_t, typename index_t>
-void cpu_scatter_mul_min_max_input_grad_kernel(phi::DenseTensor self UNUSED,
-                                               int dim,
-                                               const phi::DenseTensor& index,
-                                               const phi::DenseTensor& out,
-                                               const phi::DenseTensor& x,
-                                               const phi::DenseTensor& value,
-                                               phi::DenseTensor grad,
-                                               const std::string& reduce,
-                                               bool include_self UNUSED,
-                                               const phi::DeviceContext& ctx) {
+void cpu_scatter_mul_min_max_input_grad_kernel(
+    phi::DenseTensor self UNUSED,
+    int dim,
+    const phi::DenseTensor& index,
+    const phi::DenseTensor& out,
+    const phi::DenseTensor& x,
+    const phi::DenseTensor& value,
+    phi::DenseTensor grad,
+    const std::string& reduce,
+    bool include_self UNUSED,
+    const phi::DeviceContext& dev_ctx) {
   auto* index_data = index.data<index_t>();
   auto* grad_data = grad.data<tensor_t>();
   auto* out_data = out.data<tensor_t>();
@@ -457,7 +488,8 @@ void cpu_scatter_mean_input_grad_kernel(phi::DenseTensor self UNUSED,
                                         const phi::DenseTensor& index,
                                         phi::DenseTensor grad,
                                         bool include_self UNUSED,
-                                        const phi::DeviceContext& ctx UNUSED) {
+                                        const phi::DeviceContext& dev_ctx
+                                            UNUSED) {
   auto* index_data = index.data<index_t>();
   auto* grad_data = grad.data<tensor_t>();
 
@@ -504,7 +536,7 @@ void cpu_scatter_value_grad_kernel(phi::DenseTensor self,
                                    const phi::DenseTensor& index,
                                    phi::DenseTensor grad,
                                    bool include_self UNUSED,
-                                   const phi::DeviceContext& ctx UNUSED) {
+                                   const phi::DeviceContext& dev_ctx UNUSED) {
   auto* self_data = self.data<tensor_t>();
   auto* index_data = index.data<index_t>();
   auto* grad_data = grad.data<tensor_t>();
@@ -564,7 +596,7 @@ void cpu_scatter_add_mean_value_grad_kernel(
     phi::DenseTensor grad,
     const std::string& reduce,
     bool include_self,
-    const phi::DeviceContext& ctx UNUSED) {
+    const phi::DeviceContext& dev_ctx UNUSED) {
   auto* self_data = self.data<tensor_t>();
   auto* index_data = index.data<index_t>();
   auto* grad_data = grad.data<tensor_t>();
@@ -643,16 +675,17 @@ void cpu_scatter_add_mean_value_grad_kernel(
 }
 
 template <typename tensor_t, typename index_t>
-void cpu_scatter_mul_min_max_value_grad_kernel(phi::DenseTensor self,
-                                               int dim,
-                                               const phi::DenseTensor& index,
-                                               const phi::DenseTensor& out,
-                                               const phi::DenseTensor& x,
-                                               const phi::DenseTensor& value,
-                                               phi::DenseTensor grad,
-                                               const std::string& reduce,
-                                               bool include_self,
-                                               const phi::DeviceContext& ctx) {
+void cpu_scatter_mul_min_max_value_grad_kernel(
+    phi::DenseTensor self,
+    int dim,
+    const phi::DenseTensor& index,
+    const phi::DenseTensor& out,
+    const phi::DenseTensor& x,
+    const phi::DenseTensor& value,
+    phi::DenseTensor grad,
+    const std::string& reduce,
+    bool include_self,
+    const phi::DeviceContext& dev_ctx) {
   auto* self_data = self.data<tensor_t>();
   auto* index_data = index.data<index_t>();
   auto* grad_data = grad.data<tensor_t>();
diff --git a/paddle/phi/kernels/funcs/math/cos_sim_functor.cc b/paddle/phi/kernels/funcs/math/cos_sim_functor.cc
index 60f1b388de3ad0..cba26a884e72c2 100644
--- a/paddle/phi/kernels/funcs/math/cos_sim_functor.cc
+++ b/paddle/phi/kernels/funcs/math/cos_sim_functor.cc
@@ -18,7 +18,7 @@ namespace phi {
 namespace math {
 template <typename T>
 struct CosSimDyFunctor<phi::CPUContext, T> {
-  void operator()(const phi::CPUContext& ctx,
+  void operator()(const phi::CPUContext& dev_ctx,
                   const T* x_norm,
                   const T* y_norm,
                   const T* x,
diff --git a/paddle/phi/kernels/funcs/math/cos_sim_functor.cu b/paddle/phi/kernels/funcs/math/cos_sim_functor.cu
index 762178b4a9d613..f37fd91ee87efd 100644
--- a/paddle/phi/kernels/funcs/math/cos_sim_functor.cu
+++ b/paddle/phi/kernels/funcs/math/cos_sim_functor.cu
@@ -50,7 +50,7 @@ __global__ void CosSimDyKernel(const T* x_norm,
 
 template <typename T>
 struct CosSimDyFunctor<phi::GPUContext, T> {
-  void operator()(const phi::GPUContext& ctx,
+  void operator()(const phi::GPUContext& dev_ctx,
                   const T* x_norm,
                   const T* y_norm,
                   const T* x,
@@ -63,7 +63,7 @@ struct CosSimDyFunctor<phi::GPUContext, T> {
     const int block_size = 512;
     dim3 threads(block_size, 1);
     dim3 grid((rows + block_size - 1) / block_size, 1);
-    CosSimDyKernel<T><<<grid, threads, 0, ctx.stream()>>>(
+    CosSimDyKernel<T><<<grid, threads, 0, dev_ctx.stream()>>>(
         x_norm, y_norm, x, y, z, dz, rows, cols, dy);
   }
 };
diff --git a/paddle/phi/kernels/funcs/math/cos_sim_functor.h b/paddle/phi/kernels/funcs/math/cos_sim_functor.h
index e01af90df4d4e1..ed2e71f8af8d8f 100644
--- a/paddle/phi/kernels/funcs/math/cos_sim_functor.h
+++ b/paddle/phi/kernels/funcs/math/cos_sim_functor.h
@@ -174,7 +174,7 @@ struct CosSimDxFunctor {
 
 template <typename DeviceContext, typename T>
 struct CosSimDyFunctor {
-  void operator()(const DeviceContext& ctx,
+  void operator()(const DeviceContext& dev_ctx,
                   const T* x_norm,
                   const T* y_norm,
                   const T* x,
diff --git a/paddle/phi/kernels/funcs/scatter.cu.h b/paddle/phi/kernels/funcs/scatter.cu.h
index 32afd7fdaa1b5d..a1da63a3ab9628 100644
--- a/paddle/phi/kernels/funcs/scatter.cu.h
+++ b/paddle/phi/kernels/funcs/scatter.cu.h
@@ -404,7 +404,7 @@ __global__ void scatter_gather_elementwise_kernel(int N, func_t f) {
 }
 
 template <typename T, typename IndexT = int>
-void GPUScatterAdd(const phi::GPUContext& ctx,
+void GPUScatterAdd(const phi::GPUContext& dev_ctx,
                    const DenseTensor& src,
                    const DenseTensor& index,
                    DenseTensor* output,
@@ -483,7 +483,7 @@ void GPUScatterAdd(const phi::GPUContext& ctx,
   constexpr int vt = 8;
   const dim3 block(nt);
   const dim3 grid((N + block.x * vt - 1) / (block.x * vt));
-  auto stream = ctx.stream();
+  auto stream = dev_ctx.stream();
 
   scatter_gather_elementwise_kernel<nt, vt>
       <<<grid, block, 0, stream>>>(N, reduce_add);
diff --git a/paddle/phi/kernels/gpu/cuda_gemm_kernel.h b/paddle/phi/kernels/gpu/cuda_gemm_kernel.h
index f13831bc25034b..0efe77d7817dc0 100644
--- a/paddle/phi/kernels/gpu/cuda_gemm_kernel.h
+++ b/paddle/phi/kernels/gpu/cuda_gemm_kernel.h
@@ -26,7 +26,7 @@ typedef struct {
 } GemmParams;
 
 template <typename T, typename Context>
-void CudaGemm(const Context& ctx,
+void CudaGemm(const Context& dev_ctx,
               const DenseTensor& input,
               const DenseTensor& w,
               DenseTensor* output);
diff --git a/paddle/phi/kernels/index_elementwise_get_grad_kernel.h b/paddle/phi/kernels/index_elementwise_get_grad_kernel.h
index 42550bbc08de70..f5d9c3a2847d05 100644
--- a/paddle/phi/kernels/index_elementwise_get_grad_kernel.h
+++ b/paddle/phi/kernels/index_elementwise_get_grad_kernel.h
@@ -20,7 +20,7 @@
 namespace phi {
 
 template <typename T, typename Context>
-void IndexElementwiseGetGradKernel(const Context& ctx,
+void IndexElementwiseGetGradKernel(const Context& dev_ctx,
                                    const DenseTensor& x,
                                    const std::vector<const DenseTensor*>& index,
                                    const DenseTensor& out_grad,
diff --git a/paddle/phi/kernels/onednn/gaussian_kernel.cc b/paddle/phi/kernels/onednn/gaussian_kernel.cc
index 98197961a9df6b..61cdb580008611 100644
--- a/paddle/phi/kernels/onednn/gaussian_kernel.cc
+++ b/paddle/phi/kernels/onednn/gaussian_kernel.cc
@@ -20,7 +20,7 @@
 namespace phi {
 
 template <typename T, typename Context>
-void GaussianKernel(const Context& ctx,
+void GaussianKernel(const Context& dev_ctx,
                     const IntArray& shape,
                     float mean,
                     float std,
@@ -33,10 +33,10 @@ void GaussianKernel(const Context& ctx,
     engine = std::make_shared<std::mt19937_64>();
     engine->seed(seed);
   } else {
-    engine = ctx.GetGenerator()->GetCPUEngine();
+    engine = dev_ctx.GetGenerator()->GetCPUEngine();
   }
 
-  T* data = ctx.template Alloc<T>(out);
+  T* data = dev_ctx.template Alloc<T>(out);
   for (int64_t i = 0; i < out->numel(); ++i) {
     data[i] = dist(*engine);
   }
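
Reviewer note (not part of the patch): the changes above are a mechanical rename of "ctx" to "dev_ctx" plus clang-format re-wrapping; no behavior changes. For readers unfamiliar with the autotune code being touched, the selection logic that PickBestKernel and RunAndMeasureKernel implement is: time every registered kernel on the default stream, treat the first of 11 runs as warm-up, and cache the index of the fastest candidate under a shape-derived key. Below is a minimal standalone sketch of that pattern written against the plain CUDA runtime event API rather than Paddle's phi::GpuTimer and kernel containers; all names in it (RunAndMeasure, PickBest, launch, candidates) are illustrative, not identifiers from the patch.

#include <cuda_runtime.h>

#include <functional>
#include <limits>
#include <vector>

// Times one candidate on the given stream. Mirrors RunAndMeasureKernel
// above: 11 timed launches, the first treated as warm-up and discarded,
// the remaining ten summed into the reported cost.
inline float RunAndMeasure(const std::function<void()>& launch,
                           cudaStream_t stream) {
  constexpr int repeats = 11;
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaStreamSynchronize(stream);  // plays the role of dev_ctx.Wait()
  float time_cost = 0.f;
  for (int i = 0; i < repeats; ++i) {
    cudaEventRecord(start, stream);
    launch();  // candidate kernel launch on the same stream
    cudaEventRecord(stop, stream);
    cudaEventSynchronize(stop);
    float ms = 0.f;
    cudaEventElapsedTime(&ms, start, stop);
    if (i > 0) time_cost += ms;  // drop the warm-up iteration
  }
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return time_cost;
}

// Mirrors PickBestKernel above: measure every candidate and return the
// index of the fastest; the caller would then memoize that index, as the
// patched code does with cache.Set(key, best_idx).
inline size_t PickBest(const std::vector<std::function<void()>>& candidates,
                       cudaStream_t stream) {
  size_t best_idx = 0;
  float min_time = std::numeric_limits<float>::max();
  for (size_t i = 0; i < candidates.size(); ++i) {
    const float t = RunAndMeasure(candidates[i], stream);
    if (t < min_time) {
      min_time = t;
      best_idx = i;
    }
  }
  return best_idx;
}

Summing the ten post-warm-up times rather than averaging them (as the patched RunAndMeasureKernel also does) changes nothing about which kernel wins the comparison, since every candidate is measured over the same number of iterations.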