From 1dc91dd232663ee0e5d502e92de01f20dce1ba69 Mon Sep 17 00:00:00 2001
From: co63oc
Date: Fri, 8 Aug 2025 13:30:11 +0800
Subject: [PATCH] Fix

---
 paddle/phi/kernels/autotune/auto_tune_base.h  |  30 +++--
 paddle/phi/kernels/funcs/cross_entropy.cc     |   4 +-
 .../kernels/funcs/gather_scatter_functor.cc   | 121 +++++++++++-------
 .../phi/kernels/funcs/math/cos_sim_functor.cc |   2 +-
 .../phi/kernels/funcs/math/cos_sim_functor.cu |   4 +-
 .../phi/kernels/funcs/math/cos_sim_functor.h  |   2 +-
 paddle/phi/kernels/funcs/scatter.cu.h         |   4 +-
 paddle/phi/kernels/gpu/cuda_gemm_kernel.h     |   2 +-
 .../index_elementwise_get_grad_kernel.h       |   2 +-
 paddle/phi/kernels/onednn/gaussian_kernel.cc  |   6 +-
 10 files changed, 106 insertions(+), 71 deletions(-)

diff --git a/paddle/phi/kernels/autotune/auto_tune_base.h b/paddle/phi/kernels/autotune/auto_tune_base.h
index 80d7028a3082f9..a4a858ff8eaf8b 100644
--- a/paddle/phi/kernels/autotune/auto_tune_base.h
+++ b/paddle/phi/kernels/autotune/auto_tune_base.h
@@ -63,7 +63,7 @@ class AutoTuneBase {
   }
 
   template <typename Context, typename... Args>
-  void Run(const Context& ctx,
+  void Run(const Context& dev_ctx,
            const AlgorithmType& algo,
            const size_t key,
            Args&&... args) {
@@ -78,7 +78,7 @@ if (use_autotune) {
         // All available kernels have ran while picking the best kernel,
         // so there may be no need for another kernel run.
-        auto best_idx = PickBestKernel(ctx, args...);
+        auto best_idx = PickBestKernel(dev_ctx, args...);
         cache.Set(key, best_idx);
       } else {
         kernels_[0].Run(args...);
       }
@@ -100,14 +100,14 @@
   }
 
   template <typename Context, typename... Args>
-  size_t PickBestKernel(const Context& ctx, Args&&... args) {
+  size_t PickBestKernel(const Context& dev_ctx, Args&&... args) {
     std::lock_guard<std::mutex> lock(mutex_);
     size_t best_idx = 0;
     float min_time = std::numeric_limits<float>::max();
 
     // Time cost test established in default stream.
     for (size_t i = 0; i < kernels_.size(); ++i) {
-      auto time = RunAndMeasureKernel(ctx, i, args...);
+      auto time = RunAndMeasureKernel(dev_ctx, i, args...);
       if (time < min_time) {
         min_time = time;
         best_idx = i;
@@ -118,15 +118,17 @@
   }
 
   template <typename Context, typename... Args>
-  float RunAndMeasureKernel(const Context& ctx, const int idx, Args&&... args) {
+  float RunAndMeasureKernel(const Context& dev_ctx,
+                            const int idx,
+                            Args&&... args) {
     // Regard 1st run as warmup, judge the compare result by the time cost
     // of rest cycles.
     constexpr int repeats = 11;
     phi::GpuTimer timer;
     float time_cost = 0;
-    const auto& stream = ctx.stream();
+    const auto& stream = dev_ctx.stream();
 
-    ctx.Wait();
+    dev_ctx.Wait();
     for (int i = 0; i < repeats; ++i) {
       timer.Start(stream);
       kernels_[idx].Run(args...);
@@ -158,7 +160,7 @@ class MatmulAutoTuner
   }
 
   template <typename Context, typename... Args>
-  void Run(const Context& ctx, const size_t key, Args... args) {
+  void Run(const Context& dev_ctx, const size_t key, Args... args) {
     this->is_init_ = true;
    this->CheckKernelSize();
     auto& cache = AutoTuneCache::Instance().GetMatmul();
@@ -168,7 +170,7 @@
     } else {
       bool use_autotune = AutoTuneStatus::Instance().UseAutoTune();
       if (use_autotune) {
-        auto best_idx = this->PickBestKernel(ctx, args...);
+        auto best_idx = this->PickBestKernel(dev_ctx, args...);
         cache.Set(key, best_idx);
       } else {
         this->kernels_[0].Run(args...);
@@ -210,7 +212,7 @@ class GatherGemmScatterAutoTuner
     return instance.get();
   }
 
-  void Run(const phi::GPUContext& ctx,
+  void Run(const phi::GPUContext& dev_ctx,
           const size_t key,
           T const alpha,
           T const beta,
@@ -227,15 +229,15 @@
    } else {
      // Set alpha to 0 and beta to 1 to avoid changing the value of d when
      // picking the best kernel
-      auto best_idx =
-          PickBestKernel(ctx, static_cast<T>(0), static_cast<T>(1), args...);
+      auto best_idx = PickBestKernel(
+          dev_ctx, static_cast<T>(0), static_cast<T>(1), args...);
      cache.Set(key, best_idx);
      this->kernels_[best_idx].Run(alpha, beta, args...);
    }
  }
 
 protected:
-  size_t PickBestKernel(const phi::GPUContext& ctx,
+  size_t PickBestKernel(const phi::GPUContext& dev_ctx,
                         const T& alpha,
                         const T& beta,
                         Args&... args) {
@@ -250,7 +252,7 @@
       // Some kernels may require more shared memory than available, skip these
       // kernels.
       try {
-        time = this->RunAndMeasureKernel(ctx, i, alpha, beta, args...);
+        time = this->RunAndMeasureKernel(dev_ctx, i, alpha, beta, args...);
         if (time < min_time) {
           min_time = time;
           best_idx = i;
diff --git a/paddle/phi/kernels/funcs/cross_entropy.cc b/paddle/phi/kernels/funcs/cross_entropy.cc
index 6616f07e68a10c..9fb68c155402f5 100644
--- a/paddle/phi/kernels/funcs/cross_entropy.cc
+++ b/paddle/phi/kernels/funcs/cross_entropy.cc
@@ -93,7 +93,7 @@ struct HardLabelCrossEntropyCPUFunctorImpl {
 
 template <typename DeviceContext, typename T>
 void CrossEntropyFunctor<DeviceContext, T>::operator()(
-    const DeviceContext& ctx,
+    const DeviceContext& dev_ctx,
     phi::DenseTensor* out,
     const phi::DenseTensor* prob,
     const phi::DenseTensor* labels,
@@ -110,7 +110,7 @@ void CrossEntropyFunctor<DeviceContext, T>::operator()(
     auto lbl = EigenMatrix<T>::From(*labels);
     auto loss = EigenMatrix<T>::From(*out);
 
-    loss.device(*ctx.eigen_device()) =
+    loss.device(*dev_ctx.eigen_device()) =
         -((lbl * in.log().unaryExpr(phi::funcs::TolerableValue<T>()))
               .reshape(batch_axis_remain)
               .sum(Eigen::DSizes<int, 1>(1)));
diff --git a/paddle/phi/kernels/funcs/gather_scatter_functor.cc b/paddle/phi/kernels/funcs/gather_scatter_functor.cc
index 95c9f69a2abfd8..f7274faebd6f08 100644
--- a/paddle/phi/kernels/funcs/gather_scatter_functor.cc
+++ b/paddle/phi/kernels/funcs/gather_scatter_functor.cc
@@ -77,7 +77,7 @@ struct cpu_gather_scatter_functor {
                   const std::string& method_name,
                   const func_t& reduce_op,
                   bool include_self,
-                  const phi::DeviceContext& ctx UNUSED) {
+                  const phi::DeviceContext& dev_ctx UNUSED) {
     if (index.numel() == 0) {
       return;
     }
@@ -237,7 +237,7 @@ void cpu_gather_kernel(phi::DenseTensor self,
                        const phi::DenseTensor& index,
                        phi::DenseTensor result,
                        bool include_self,
-                       const phi::DeviceContext& ctx) {
+                       const phi::DeviceContext& dev_ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
                              /*is_scatter_like=*/false>()(result,
@@ -247,7 +247,7 @@ void cpu_gather_kernel(phi::DenseTensor self,
                                                           "gather_out_cpu",
                                                           tensor_assign,
                                                           include_self,
-                                                          ctx);
+                                                          dev_ctx);
 }
 
 template <typename tensor_t, typename index_t>
@@ -256,7 +256,7 @@ void cpu_scatter_assign_kernel(phi::DenseTensor self,
                                const phi::DenseTensor& index,
                                phi::DenseTensor src,
                                bool include_self,
-                               const phi::DeviceContext& ctx) {
+                               const phi::DeviceContext& dev_ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
                              /*is_scatter_like=*/true>()(self,
@@ -266,7 +266,7 @@ void cpu_scatter_assign_kernel(phi::DenseTensor self,
                                                          "scatter_assign_cpu",
                                                          tensor_assign,
                                                          include_self,
-                                                         ctx);
+                                                         dev_ctx);
 }
 
 template <typename tensor_t, typename index_t>
@@ -275,11 +275,17 @@ void cpu_scatter_add_kernel(phi::DenseTensor self,
                             const phi::DenseTensor& index,
                             phi::DenseTensor src,
                             bool include_self,
-                            const phi::DeviceContext& ctx) {
+                            const phi::DeviceContext& dev_ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
-                             /*is_scatter_like=*/true>()(
-      self, dim, index, src, "scatter_add_cpu", reduce_add, include_self, ctx);
+                             /*is_scatter_like=*/true>()(self,
+                                                         dim,
+                                                         index,
+                                                         src,
+                                                         "scatter_add_cpu",
+                                                         reduce_add,
+                                                         include_self,
+                                                         dev_ctx);
 }
 
 template <typename tensor_t, typename index_t>
@@ -288,11 +294,17 @@ void cpu_scatter_mul_kernel(phi::DenseTensor self,
                             const phi::DenseTensor& index,
                             phi::DenseTensor src,
                             bool include_self,
-                            const phi::DeviceContext& ctx) {
+                            const phi::DeviceContext& dev_ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
-                             /*is_scatter_like=*/true>()(
-      self, dim, index, src, "scatter_mul_cpu", reduce_mul, include_self, ctx);
+                             /*is_scatter_like=*/true>()(self,
+                                                         dim,
+                                                         index,
+                                                         src,
+                                                         "scatter_mul_cpu",
+                                                         reduce_mul,
+                                                         include_self,
+                                                         dev_ctx);
 }
 
 template <typename tensor_t, typename index_t>
@@ -301,11 +313,17 @@ void cpu_scatter_mean_kernel(phi::DenseTensor self,
                              const phi::DenseTensor& index,
                              phi::DenseTensor src,
                              bool include_self,
-                             const phi::DeviceContext& ctx) {
+                             const phi::DeviceContext& dev_ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
-                             /*is_scatter_like=*/true>()(
-      self, dim, index, src, "scatter_mean_cpu", reduce_add, include_self, ctx);
+                             /*is_scatter_like=*/true>()(self,
+                                                         dim,
+                                                         index,
+                                                         src,
+                                                         "scatter_mean_cpu",
+                                                         reduce_add,
+                                                         include_self,
+                                                         dev_ctx);
 }
 
 template <typename tensor_t, typename index_t>
@@ -314,11 +332,17 @@ void cpu_scatter_max_kernel(phi::DenseTensor self,
                             const phi::DenseTensor& index,
                             phi::DenseTensor src,
                             bool include_self,
-                            const phi::DeviceContext& ctx) {
+                            const phi::DeviceContext& dev_ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
-                             /*is_scatter_like=*/true>()(
-      self, dim, index, src, "scatter_max_cpu", reduce_max, include_self, ctx);
+                             /*is_scatter_like=*/true>()(self,
+                                                         dim,
+                                                         index,
+                                                         src,
+                                                         "scatter_max_cpu",
+                                                         reduce_max,
+                                                         include_self,
+                                                         dev_ctx);
 }
 
 template <typename tensor_t, typename index_t>
@@ -327,11 +351,17 @@ void cpu_scatter_min_kernel(phi::DenseTensor self,
                             const phi::DenseTensor& index,
                             phi::DenseTensor src,
                             bool include_self,
-                            const phi::DeviceContext& ctx) {
+                            const phi::DeviceContext& dev_ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
-                             /*is_scatter_like=*/true>()(
-      self, dim, index, src, "scatter_min_cpu", reduce_min, include_self, ctx);
+                             /*is_scatter_like=*/true>()(self,
+                                                         dim,
+                                                         index,
+                                                         src,
+                                                         "scatter_min_cpu",
+                                                         reduce_min,
+                                                         include_self,
+                                                         dev_ctx);
 }
 
 template <typename tensor_t, typename index_t>
@@ -340,7 +370,7 @@ void cpu_scatter_input_grad_kernel(phi::DenseTensor self UNUSED,
                                    const phi::DenseTensor& index,
                                    phi::DenseTensor grad,
                                    bool include_self UNUSED,
-                                   const phi::DeviceContext& ctx UNUSED) {
+                                   const phi::DeviceContext& dev_ctx UNUSED) {
   auto* index_data = index.data<index_t>();
   auto* grad_data = grad.data<tensor_t>();
 
@@ -376,16 +406,17 @@
 }
 
 template <typename tensor_t, typename index_t>
-void cpu_scatter_mul_min_max_input_grad_kernel(phi::DenseTensor self UNUSED,
-                                               int dim,
-                                               const phi::DenseTensor& index,
-                                               const phi::DenseTensor& out,
-                                               const phi::DenseTensor& x,
-                                               const phi::DenseTensor& value,
-                                               phi::DenseTensor grad,
-                                               const std::string& reduce,
-                                               bool include_self UNUSED,
-                                               const phi::DeviceContext& ctx) {
+void cpu_scatter_mul_min_max_input_grad_kernel(
+    phi::DenseTensor self UNUSED,
+    int dim,
+    const phi::DenseTensor& index,
+    const phi::DenseTensor& out,
+    const phi::DenseTensor& x,
+    const phi::DenseTensor& value,
+    phi::DenseTensor grad,
+    const std::string& reduce,
+    bool include_self UNUSED,
+    const phi::DeviceContext& dev_ctx) {
   auto* index_data = index.data<index_t>();
   auto* grad_data = grad.data<tensor_t>();
   auto* out_data = out.data<tensor_t>();
@@ -457,7 +488,8 @@ void cpu_scatter_mean_input_grad_kernel(phi::DenseTensor self UNUSED,
                                         const phi::DenseTensor& index,
                                         phi::DenseTensor grad,
                                         bool include_self UNUSED,
-                                        const phi::DeviceContext& ctx UNUSED) {
+                                        const phi::DeviceContext& dev_ctx
+                                            UNUSED) {
   auto* index_data = index.data<index_t>();
   auto* grad_data = grad.data<tensor_t>();
 
@@ -504,7 +536,7 @@ void cpu_scatter_value_grad_kernel(phi::DenseTensor self,
                                    const phi::DenseTensor& index,
                                    phi::DenseTensor grad,
                                    bool include_self UNUSED,
-                                   const phi::DeviceContext& ctx UNUSED) {
+                                   const phi::DeviceContext& dev_ctx UNUSED) {
   auto* self_data = self.data<tensor_t>();
   auto* index_data = index.data<index_t>();
   auto* grad_data = grad.data<tensor_t>();
@@ -564,7 +596,7 @@ void cpu_scatter_add_mean_value_grad_kernel(
     phi::DenseTensor grad,
     const std::string& reduce,
     bool include_self,
-    const phi::DeviceContext& ctx UNUSED) {
+    const phi::DeviceContext& dev_ctx UNUSED) {
   auto* self_data = self.data<tensor_t>();
   auto* index_data = index.data<index_t>();
   auto* grad_data = grad.data<tensor_t>();
@@ -643,16 +675,17 @@ void cpu_scatter_add_mean_value_grad_kernel(
 }
 
 template <typename tensor_t, typename index_t>
-void cpu_scatter_mul_min_max_value_grad_kernel(phi::DenseTensor self,
-                                               int dim,
-                                               const phi::DenseTensor& index,
-                                               const phi::DenseTensor& out,
-                                               const phi::DenseTensor& x,
-                                               const phi::DenseTensor& value,
-                                               phi::DenseTensor grad,
-                                               const std::string& reduce,
-                                               bool include_self,
-                                               const phi::DeviceContext& ctx) {
+void cpu_scatter_mul_min_max_value_grad_kernel(
+    phi::DenseTensor self,
+    int dim,
+    const phi::DenseTensor& index,
+    const phi::DenseTensor& out,
+    const phi::DenseTensor& x,
+    const phi::DenseTensor& value,
+    phi::DenseTensor grad,
+    const std::string& reduce,
+    bool include_self,
+    const phi::DeviceContext& dev_ctx) {
   auto* self_data = self.data<tensor_t>();
   auto* index_data = index.data<index_t>();
   auto* grad_data = grad.data<tensor_t>();
diff --git a/paddle/phi/kernels/funcs/math/cos_sim_functor.cc b/paddle/phi/kernels/funcs/math/cos_sim_functor.cc
index 60f1b388de3ad0..cba26a884e72c2 100644
--- a/paddle/phi/kernels/funcs/math/cos_sim_functor.cc
+++ b/paddle/phi/kernels/funcs/math/cos_sim_functor.cc
@@ -18,7 +18,7 @@ namespace phi {
 namespace math {
 template <typename T>
 struct CosSimDyFunctor<phi::CPUContext, T> {
-  void operator()(const phi::CPUContext& ctx,
+  void operator()(const phi::CPUContext& dev_ctx,
                   const T* x_norm,
                   const T* y_norm,
                   const T* x,
diff --git a/paddle/phi/kernels/funcs/math/cos_sim_functor.cu b/paddle/phi/kernels/funcs/math/cos_sim_functor.cu
index 762178b4a9d613..f37fd91ee87efd 100644
--- a/paddle/phi/kernels/funcs/math/cos_sim_functor.cu
+++ b/paddle/phi/kernels/funcs/math/cos_sim_functor.cu
@@ -50,7 +50,7 @@ __global__ void CosSimDyKernel(const T* x_norm,
 
 template <typename T>
 struct CosSimDyFunctor<phi::GPUContext, T> {
-  void operator()(const phi::GPUContext& ctx,
+  void operator()(const phi::GPUContext& dev_ctx,
                   const T* x_norm,
                   const T* y_norm,
                   const T* x,
@@ -63,7 +63,7 @@ struct CosSimDyFunctor<phi::GPUContext, T> {
     const int block_size = 512;
     dim3 threads(block_size, 1);
     dim3 grid((rows + block_size - 1) / block_size, 1);
-    CosSimDyKernel<T><<<grid, threads, 0, ctx.stream()>>>(
+    CosSimDyKernel<T><<<grid, threads, 0, dev_ctx.stream()>>>(
         x_norm, y_norm, x, y, z, dz, rows, cols, dy);
   }
 };
diff --git a/paddle/phi/kernels/funcs/math/cos_sim_functor.h b/paddle/phi/kernels/funcs/math/cos_sim_functor.h
index e01af90df4d4e1..ed2e71f8af8d8f 100644
--- a/paddle/phi/kernels/funcs/math/cos_sim_functor.h
+++ b/paddle/phi/kernels/funcs/math/cos_sim_functor.h
@@ -174,7 +174,7 @@ struct CosSimDxFunctor {
 
 template <typename DeviceContext, typename T>
 struct CosSimDyFunctor {
-  void operator()(const DeviceContext& ctx,
+  void operator()(const DeviceContext& dev_ctx,
                   const T* x_norm,
                   const T* y_norm,
                   const T* x,
diff --git a/paddle/phi/kernels/funcs/scatter.cu.h b/paddle/phi/kernels/funcs/scatter.cu.h
index 32afd7fdaa1b5d..a1da63a3ab9628 100644
--- a/paddle/phi/kernels/funcs/scatter.cu.h
+++ b/paddle/phi/kernels/funcs/scatter.cu.h
@@ -404,7 +404,7 @@ __global__ void scatter_gather_elementwise_kernel(int N, func_t f) {
 }
 
 template <typename T, typename IndexT = int>
-void GPUScatterAdd(const phi::GPUContext& ctx,
+void GPUScatterAdd(const phi::GPUContext& dev_ctx,
                    const DenseTensor& src,
                    const DenseTensor& index,
                    DenseTensor* output,
@@ -483,7 +483,7 @@ void GPUScatterAdd(const phi::GPUContext& ctx,
   constexpr int vt = 8;
   const dim3 block(nt);
   const dim3 grid((N + block.x * vt - 1) / (block.x * vt));
-  auto stream = ctx.stream();
+  auto stream = dev_ctx.stream();
 
   scatter_gather_elementwise_kernel<nt, vt>
       <<<grid, block, 0, stream>>>(N, reduce_add);
diff --git a/paddle/phi/kernels/gpu/cuda_gemm_kernel.h b/paddle/phi/kernels/gpu/cuda_gemm_kernel.h
index f13831bc25034b..0efe77d7817dc0 100644
--- a/paddle/phi/kernels/gpu/cuda_gemm_kernel.h
+++ b/paddle/phi/kernels/gpu/cuda_gemm_kernel.h
@@ -26,7 +26,7 @@ typedef struct {
 } GemmParams;
 
 template <typename T, typename Context>
-void CudaGemm(const Context& ctx,
+void CudaGemm(const Context& dev_ctx,
               const DenseTensor& input,
               const DenseTensor& w,
               DenseTensor* output);
diff --git a/paddle/phi/kernels/index_elementwise_get_grad_kernel.h b/paddle/phi/kernels/index_elementwise_get_grad_kernel.h
index 42550bbc08de70..f5d9c3a2847d05 100644
--- a/paddle/phi/kernels/index_elementwise_get_grad_kernel.h
+++ b/paddle/phi/kernels/index_elementwise_get_grad_kernel.h
@@ -20,7 +20,7 @@
 namespace phi {
 
 template <typename T, typename Context>
-void IndexElementwiseGetGradKernel(const Context& ctx,
+void IndexElementwiseGetGradKernel(const Context& dev_ctx,
                                    const DenseTensor& x,
                                    const std::vector<const DenseTensor*>& index,
                                    const DenseTensor& out_grad,
diff --git a/paddle/phi/kernels/onednn/gaussian_kernel.cc b/paddle/phi/kernels/onednn/gaussian_kernel.cc
index 98197961a9df6b..61cdb580008611 100644
--- a/paddle/phi/kernels/onednn/gaussian_kernel.cc
+++ b/paddle/phi/kernels/onednn/gaussian_kernel.cc
@@ -20,7 +20,7 @@
 namespace phi {
 
 template <typename T, typename Context>
-void GaussianKernel(const Context& ctx,
+void GaussianKernel(const Context& dev_ctx,
                     const IntArray& shape,
                     float mean,
                     float std,
@@ -33,10 +33,10 @@ void GaussianKernel(const Context& ctx,
     engine = std::make_shared<std::mt19937_64>();
     engine->seed(seed);
   } else {
-    engine = ctx.GetGenerator()->GetCPUEngine();
+    engine = dev_ctx.GetGenerator()->GetCPUEngine();
   }
 
-  T* data = ctx.template Alloc<T>(out);
+  T* data = dev_ctx.template Alloc<T>(out);
   for (int64_t i = 0; i < out->numel(); ++i) {
     data[i] = dist(*engine);
   }
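
Reviewer note (not part of the patch): the changes above are a mechanical rename of "ctx" to "dev_ctx" plus clang-format re-wrapping; no behavior changes. For readers unfamiliar with the autotune code being touched, the selection logic that PickBestKernel and RunAndMeasureKernel implement is: time every registered kernel on the default stream, treat the first of 11 runs as warm-up, and cache the index of the fastest candidate under a shape-derived key. Below is a minimal standalone sketch of that pattern written against the plain CUDA runtime event API rather than Paddle's phi::GpuTimer and kernel containers; all names in it (RunAndMeasure, PickBest, launch, candidates) are illustrative, not identifiers from the patch.

#include <cuda_runtime.h>

#include <functional>
#include <limits>
#include <vector>

// Times one candidate on the given stream. Mirrors RunAndMeasureKernel
// above: 11 timed launches, the first treated as warm-up and discarded,
// the remaining ten summed into the reported cost.
inline float RunAndMeasure(const std::function<void()>& launch,
                           cudaStream_t stream) {
  constexpr int repeats = 11;
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaStreamSynchronize(stream);  // plays the role of dev_ctx.Wait()
  float time_cost = 0.f;
  for (int i = 0; i < repeats; ++i) {
    cudaEventRecord(start, stream);
    launch();  // candidate kernel launch on the same stream
    cudaEventRecord(stop, stream);
    cudaEventSynchronize(stop);
    float ms = 0.f;
    cudaEventElapsedTime(&ms, start, stop);
    if (i > 0) time_cost += ms;  // drop the warm-up iteration
  }
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return time_cost;
}

// Mirrors PickBestKernel above: measure every candidate and return the
// index of the fastest; the caller would then memoize that index, as the
// patched code does with cache.Set(key, best_idx).
inline size_t PickBest(const std::vector<std::function<void()>>& candidates,
                       cudaStream_t stream) {
  size_t best_idx = 0;
  float min_time = std::numeric_limits<float>::max();
  for (size_t i = 0; i < candidates.size(); ++i) {
    const float t = RunAndMeasure(candidates[i], stream);
    if (t < min_time) {
      min_time = t;
      best_idx = i;
    }
  }
  return best_idx;
}

Summing the ten post-warm-up times rather than averaging them (as the patched RunAndMeasureKernel also does) changes nothing about which kernel wins the comparison, since every candidate is measured over the same number of iterations.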