30 changes: 16 additions & 14 deletions paddle/phi/kernels/autotune/auto_tune_base.h
@@ -63,7 +63,7 @@ class AutoTuneBase {
   }
 
   template <typename Context, typename... Args>
-  void Run(const Context& ctx,
+  void Run(const Context& dev_ctx,
            const AlgorithmType& algo,
            const size_t key,
            Args&&... args) {
@@ -78,7 +78,7 @@
     if (use_autotune) {
       // All available kernels have ran while picking the best kernel,
       // so there may be no need for another kernel run.
-      auto best_idx = PickBestKernel(ctx, args...);
+      auto best_idx = PickBestKernel(dev_ctx, args...);
       cache.Set(key, best_idx);
     } else {
       kernels_[0].Run(args...);
@@ -100,14 +100,14 @@
   }
 
   template <typename Context, typename... Args>
-  size_t PickBestKernel(const Context& ctx, Args&&... args) {
+  size_t PickBestKernel(const Context& dev_ctx, Args&&... args) {
     std::lock_guard<std::mutex> lock(mutex_);
     size_t best_idx = 0;
     float min_time = std::numeric_limits<float>::max();
 
     // Time cost test established in default stream.
     for (size_t i = 0; i < kernels_.size(); ++i) {
-      auto time = RunAndMeasureKernel<Context>(ctx, i, args...);
+      auto time = RunAndMeasureKernel<Context>(dev_ctx, i, args...);
       if (time < min_time) {
         min_time = time;
         best_idx = i;
@@ -118,15 +118,17 @@
   }
 
   template <typename Context, typename... Args>
-  float RunAndMeasureKernel(const Context& ctx, const int idx, Args&&... args) {
+  float RunAndMeasureKernel(const Context& dev_ctx,
+                            const int idx,
+                            Args&&... args) {
     // Regard 1st run as warmup, judge the compare result by the time cost
     // of rest cycles.
     constexpr int repeats = 11;
     phi::GpuTimer timer;
     float time_cost = 0;
-    const auto& stream = ctx.stream();
+    const auto& stream = dev_ctx.stream();
 
-    ctx.Wait();
+    dev_ctx.Wait();
     for (int i = 0; i < repeats; ++i) {
       timer.Start(stream);
       kernels_[idx].Run(args...);
@@ -158,7 +160,7 @@ class MatmulAutoTuner
   }
 
   template <typename Context>
-  void Run(const Context& ctx, const size_t key, Args... args) {
+  void Run(const Context& dev_ctx, const size_t key, Args... args) {
     this->is_init_ = true;
     this->CheckKernelSize();
     auto& cache = AutoTuneCache::Instance().GetMatmul();
@@ -168,7 +170,7 @@
     } else {
       bool use_autotune = AutoTuneStatus::Instance().UseAutoTune();
       if (use_autotune) {
-        auto best_idx = this->PickBestKernel(ctx, args...);
+        auto best_idx = this->PickBestKernel(dev_ctx, args...);
         cache.Set(key, best_idx);
       } else {
         this->kernels_[0].Run(args...);
@@ -210,7 +212,7 @@ class GatherGemmScatterAutoTuner
     return instance.get();
   }
 
-  void Run(const phi::GPUContext& ctx,
+  void Run(const phi::GPUContext& dev_ctx,
           const size_t key,
           T const alpha,
           T const beta,
@@ -227,15 +229,15 @@
     } else {
       // Set alpha to 0 and beta to 1 to avoid changing the value of d when
       // picking the best kernel
-      auto best_idx =
-          PickBestKernel(ctx, static_cast<T>(0), static_cast<T>(1), args...);
+      auto best_idx = PickBestKernel(
+          dev_ctx, static_cast<T>(0), static_cast<T>(1), args...);
       cache.Set(key, best_idx);
       this->kernels_[best_idx].Run(alpha, beta, args...);
     }
   }
 
 protected:
-  size_t PickBestKernel(const phi::GPUContext& ctx,
+  size_t PickBestKernel(const phi::GPUContext& dev_ctx,
                         const T& alpha,
                         const T& beta,
                         Args&... args) {
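For context on the "alpha to 0 and beta to 1" comment in the hunk above: assuming these gather-gemm-scatter kernels apply the usual GEMM-style epilogue (a sketch inferred from the comment, not taken from the kernel source),

    % Assumed epilogue form of the gather-gemm-scatter kernels:
    d \leftarrow \alpha \cdot \mathrm{op}(a)\,\mathrm{op}(b) + \beta \cdot d
    % With the benchmarking constants \alpha = 0, \beta = 1:
    d \leftarrow 0 \cdot \mathrm{op}(a)\,\mathrm{op}(b) + 1 \cdot d = d

so the output operand d is left unchanged while every candidate kernel is timed, and the real alpha/beta are only used for the final cached-kernel run.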
@@ -250,7 +252,7 @@
       // Some kernels may require more shared memory than available, skip these
       // kernels.
       try {
-        time = this->RunAndMeasureKernel(ctx, i, alpha, beta, args...);
+        time = this->RunAndMeasureKernel(dev_ctx, i, alpha, beta, args...);
         if (time < min_time) {
           min_time = time;
           best_idx = i;
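Note on the selection loop in this file: each candidate kernel is launched repeatedly on the default stream, the first run is discarded as warmup, and the index with the lowest accumulated time is cached under the shape key. A minimal self-contained sketch of that logic, with illustrative stand-ins (PickBestBySampling and the std::function kernels are hypothetical names, and a wall clock replaces phi::GpuTimer so the snippet runs anywhere):

    #include <chrono>
    #include <cstddef>
    #include <functional>
    #include <limits>
    #include <vector>

    // Stand-in for the PickBestKernel + RunAndMeasureKernel pair: time every
    // candidate, treat the first run as warmup, keep the fastest index.
    std::size_t PickBestBySampling(
        const std::vector<std::function<void()>>& kernels) {
      constexpr int kRepeats = 11;  // 1 warmup run + 10 measured runs
      std::size_t best_idx = 0;
      float min_time = std::numeric_limits<float>::max();
      for (std::size_t i = 0; i < kernels.size(); ++i) {
        float time_cost = 0.f;
        for (int r = 0; r < kRepeats; ++r) {
          const auto start = std::chrono::steady_clock::now();
          kernels[i]();  // run candidate kernel i
          const auto stop = std::chrono::steady_clock::now();
          if (r > 0) {  // regard the 1st run as warmup, count only the rest
            time_cost +=
                std::chrono::duration<float, std::milli>(stop - start).count();
          }
        }
        if (time_cost < min_time) {
          min_time = time_cost;
          best_idx = i;
        }
      }
      return best_idx;  // the caller caches this index under the shape key
    }

The real tuner measures with GPU events on dev_ctx.stream() rather than a wall clock, which is why it calls dev_ctx.Wait() before starting the timing loop.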
4 changes: 2 additions & 2 deletions paddle/phi/kernels/funcs/cross_entropy.cc
@@ -93,7 +93,7 @@ struct HardLabelCrossEntropyCPUFunctorImpl {
 
 template <typename DeviceContext, typename T>
 void CrossEntropyFunctor<DeviceContext, T>::operator()(
-    const DeviceContext& ctx,
+    const DeviceContext& dev_ctx,
     phi::DenseTensor* out,
     const phi::DenseTensor* prob,
     const phi::DenseTensor* labels,
@@ -110,7 +110,7 @@ void CrossEntropyFunctor<DeviceContext, T>::operator()(
     auto lbl = EigenMatrix<T>::From(*labels);
     auto loss = EigenMatrix<T>::From(*out);
 
-    loss.device(*ctx.eigen_device()) =
+    loss.device(*dev_ctx.eigen_device()) =
         -((lbl * in.log().unaryExpr(phi::funcs::TolerableValue<T>()))
              .reshape(batch_axis_remain)
              .sum(Eigen::DSizes<int, 1>(1)));
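For reference, the Eigen expression in the hunk above computes the standard soft-label cross entropy, with TolerableValue<T> clamping non-finite logarithms; per batch row i over classes j:

    % Soft-label cross entropy realized by the reshape/sum pair,
    % where p is prob and y is labels:
    \mathrm{loss}_i = -\sum_{j} y_{ij} \, \log p_{ij}

The sum over Eigen::DSizes<int, 1>(1) is the reduction over the class axis after reshaping to batch_axis_remain.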
121 changes: 77 additions & 44 deletions paddle/phi/kernels/funcs/gather_scatter_functor.cc
@@ -77,7 +77,7 @@ struct cpu_gather_scatter_functor {
                   const std::string& method_name,
                   const func_t& reduce_op,
                   bool include_self,
-                  const phi::DeviceContext& ctx UNUSED) {
+                  const phi::DeviceContext& dev_ctx UNUSED) {
     if (index.numel() == 0) {
       return;
     }
@@ -237,7 +237,7 @@ void cpu_gather_kernel(phi::DenseTensor self,
                        const phi::DenseTensor& index,
                        phi::DenseTensor result,
                        bool include_self,
-                       const phi::DeviceContext& ctx) {
+                       const phi::DeviceContext& dev_ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
                              /*is_scatter_like=*/false>()(result,
@@ -247,7 +247,7 @@ void cpu_gather_kernel(phi::DenseTensor self,
                                                           "gather_out_cpu",
                                                           tensor_assign,
                                                           include_self,
-                                                          ctx);
+                                                          dev_ctx);
 }
 
 template <typename tensor_t, typename index_t>
@@ -256,7 +256,7 @@ void cpu_scatter_assign_kernel(phi::DenseTensor self,
                                const phi::DenseTensor& index,
                                phi::DenseTensor src,
                                bool include_self,
-                               const phi::DeviceContext& ctx) {
+                               const phi::DeviceContext& dev_ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
                              /*is_scatter_like=*/true>()(self,
@@ -266,7 +266,7 @@ void cpu_scatter_assign_kernel(phi::DenseTensor self,
                                                          "scatter_assign_cpu",
                                                          tensor_assign,
                                                          include_self,
-                                                         ctx);
+                                                         dev_ctx);
 }
 
 template <typename tensor_t, typename index_t>
@@ -275,11 +275,17 @@ void cpu_scatter_add_kernel(phi::DenseTensor self,
                             const phi::DenseTensor& index,
                             phi::DenseTensor src,
                             bool include_self,
-                            const phi::DeviceContext& ctx) {
+                            const phi::DeviceContext& dev_ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
-                             /*is_scatter_like=*/true>()(
-      self, dim, index, src, "scatter_add_cpu", reduce_add, include_self, ctx);
+                             /*is_scatter_like=*/true>()(self,
+                                                         dim,
+                                                         index,
+                                                         src,
+                                                         "scatter_add_cpu",
+                                                         reduce_add,
+                                                         include_self,
+                                                         dev_ctx);
 }
 
 template <typename tensor_t, typename index_t>
@@ -288,11 +294,17 @@ void cpu_scatter_mul_kernel(phi::DenseTensor self,
                            const phi::DenseTensor& index,
                            phi::DenseTensor src,
                            bool include_self,
-                           const phi::DeviceContext& ctx) {
+                           const phi::DeviceContext& dev_ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
-                             /*is_scatter_like=*/true>()(
-      self, dim, index, src, "scatter_mul_cpu", reduce_mul, include_self, ctx);
+                             /*is_scatter_like=*/true>()(self,
+                                                         dim,
+                                                         index,
+                                                         src,
+                                                         "scatter_mul_cpu",
+                                                         reduce_mul,
+                                                         include_self,
+                                                         dev_ctx);
 }
 
 template <typename tensor_t, typename index_t>
@@ -301,11 +313,17 @@ void cpu_scatter_mean_kernel(phi::DenseTensor self,
                             const phi::DenseTensor& index,
                             phi::DenseTensor src,
                             bool include_self,
-                            const phi::DeviceContext& ctx) {
+                            const phi::DeviceContext& dev_ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
-                             /*is_scatter_like=*/true>()(
-      self, dim, index, src, "scatter_mean_cpu", reduce_add, include_self, ctx);
+                             /*is_scatter_like=*/true>()(self,
+                                                         dim,
+                                                         index,
+                                                         src,
+                                                         "scatter_mean_cpu",
+                                                         reduce_add,
+                                                         include_self,
+                                                         dev_ctx);
 }
 
 template <typename tensor_t, typename index_t>
@@ -314,11 +332,17 @@ void cpu_scatter_max_kernel(phi::DenseTensor self,
                            const phi::DenseTensor& index,
                            phi::DenseTensor src,
                            bool include_self,
-                           const phi::DeviceContext& ctx) {
+                           const phi::DeviceContext& dev_ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
-                             /*is_scatter_like=*/true>()(
-      self, dim, index, src, "scatter_max_cpu", reduce_max, include_self, ctx);
+                             /*is_scatter_like=*/true>()(self,
+                                                         dim,
+                                                         index,
+                                                         src,
+                                                         "scatter_max_cpu",
+                                                         reduce_max,
+                                                         include_self,
+                                                         dev_ctx);
 }
 
 template <typename tensor_t, typename index_t>
@@ -327,11 +351,17 @@ void cpu_scatter_min_kernel(phi::DenseTensor self,
                            const phi::DenseTensor& index,
                            phi::DenseTensor src,
                            bool include_self,
-                           const phi::DeviceContext& ctx) {
+                           const phi::DeviceContext& dev_ctx) {
   cpu_gather_scatter_functor<tensor_t,
                              index_t,
-                             /*is_scatter_like=*/true>()(
-      self, dim, index, src, "scatter_min_cpu", reduce_min, include_self, ctx);
+                             /*is_scatter_like=*/true>()(self,
+                                                         dim,
+                                                         index,
+                                                         src,
+                                                         "scatter_min_cpu",
+                                                         reduce_min,
+                                                         include_self,
+                                                         dev_ctx);
 }
 
 template <typename tensor_t, typename index_t>
@@ -340,7 +370,7 @@ void cpu_scatter_input_grad_kernel(phi::DenseTensor self UNUSED,
                                    const phi::DenseTensor& index,
                                    phi::DenseTensor grad,
                                    bool include_self UNUSED,
-                                   const phi::DeviceContext& ctx UNUSED) {
+                                   const phi::DeviceContext& dev_ctx UNUSED) {
   auto* index_data = index.data<index_t>();
   auto* grad_data = grad.data<tensor_t>();
 
@@ -376,16 +406,17 @@ void cpu_scatter_input_grad_kernel(phi::DenseTensor self UNUSED,
 }
 
 template <typename tensor_t, typename index_t>
-void cpu_scatter_mul_min_max_input_grad_kernel(phi::DenseTensor self UNUSED,
-                                               int dim,
-                                               const phi::DenseTensor& index,
-                                               const phi::DenseTensor& out,
-                                               const phi::DenseTensor& x,
-                                               const phi::DenseTensor& value,
-                                               phi::DenseTensor grad,
-                                               const std::string& reduce,
-                                               bool include_self UNUSED,
-                                               const phi::DeviceContext& ctx) {
+void cpu_scatter_mul_min_max_input_grad_kernel(
+    phi::DenseTensor self UNUSED,
+    int dim,
+    const phi::DenseTensor& index,
+    const phi::DenseTensor& out,
+    const phi::DenseTensor& x,
+    const phi::DenseTensor& value,
+    phi::DenseTensor grad,
+    const std::string& reduce,
+    bool include_self UNUSED,
+    const phi::DeviceContext& dev_ctx) {
   auto* index_data = index.data<index_t>();
   auto* grad_data = grad.data<tensor_t>();
   auto* out_data = out.data<tensor_t>();
@@ -457,7 +488,8 @@ void cpu_scatter_mean_input_grad_kernel(phi::DenseTensor self UNUSED,
                                         const phi::DenseTensor& index,
                                         phi::DenseTensor grad,
                                         bool include_self UNUSED,
-                                        const phi::DeviceContext& ctx UNUSED) {
+                                        const phi::DeviceContext& dev_ctx
+                                            UNUSED) {
   auto* index_data = index.data<index_t>();
   auto* grad_data = grad.data<tensor_t>();
 
@@ -504,7 +536,7 @@ void cpu_scatter_value_grad_kernel(phi::DenseTensor self,
                                    const phi::DenseTensor& index,
                                    phi::DenseTensor grad,
                                    bool include_self UNUSED,
-                                   const phi::DeviceContext& ctx UNUSED) {
+                                   const phi::DeviceContext& dev_ctx UNUSED) {
   auto* self_data = self.data<tensor_t>();
   auto* index_data = index.data<index_t>();
   auto* grad_data = grad.data<tensor_t>();
@@ -564,7 +596,7 @@ void cpu_scatter_add_mean_value_grad_kernel(
     phi::DenseTensor grad,
     const std::string& reduce,
     bool include_self,
-    const phi::DeviceContext& ctx UNUSED) {
+    const phi::DeviceContext& dev_ctx UNUSED) {
   auto* self_data = self.data<tensor_t>();
   auto* index_data = index.data<index_t>();
   auto* grad_data = grad.data<tensor_t>();
@@ -643,16 +675,17 @@ void cpu_scatter_add_mean_value_grad_kernel(
 }
 
 template <typename tensor_t, typename index_t>
-void cpu_scatter_mul_min_max_value_grad_kernel(phi::DenseTensor self,
-                                               int dim,
-                                               const phi::DenseTensor& index,
-                                               const phi::DenseTensor& out,
-                                               const phi::DenseTensor& x,
-                                               const phi::DenseTensor& value,
-                                               phi::DenseTensor grad,
-                                               const std::string& reduce,
-                                               bool include_self,
-                                               const phi::DeviceContext& ctx) {
+void cpu_scatter_mul_min_max_value_grad_kernel(
+    phi::DenseTensor self,
+    int dim,
+    const phi::DenseTensor& index,
+    const phi::DenseTensor& out,
+    const phi::DenseTensor& x,
+    const phi::DenseTensor& value,
+    phi::DenseTensor grad,
+    const std::string& reduce,
+    bool include_self,
+    const phi::DeviceContext& dev_ctx) {
   auto* self_data = self.data<tensor_t>();
   auto* index_data = index.data<index_t>();
   auto* grad_data = grad.data<tensor_t>();
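As a reference for what the scatter kernels in this file compute, here is a minimal sketch of scatter-add along dim 0 for row-major 2-D data (a hypothetical free function over plain arrays, not the Paddle API; the real cpu_gather_scatter_functor additionally handles arbitrary dim, the assign/mul/mean/max/min reductions, and the include_self flag):

    #include <cstdint>
    #include <vector>

    // Scatter-add along dim 0: for every element of src,
    //   self[index[i][j]][j] += src[i][j]
    // self has shape [self_rows, cols]; index and src have shape [rows, cols].
    void ScatterAddDim0(std::vector<float>& self,
                        const std::vector<int64_t>& index,
                        const std::vector<float>& src,
                        int64_t rows,
                        int64_t cols) {
      for (int64_t i = 0; i < rows; ++i) {
        for (int64_t j = 0; j < cols; ++j) {
          const int64_t target_row = index[i * cols + j];  // destination row
          self[target_row * cols + j] += src[i * cols + j];
        }
      }
    }

Swapping += for =, *=, max, or min gives the sibling kernels above; the mean variant accumulates like add and divides by the per-slot hit count afterwards.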
2 changes: 1 addition & 1 deletion paddle/phi/kernels/funcs/math/cos_sim_functor.cc
@@ -18,7 +18,7 @@ namespace phi {
 namespace math {
 template <typename T>
 struct CosSimDyFunctor<phi::CPUContext, T> {
-  void operator()(const phi::CPUContext& ctx,
+  void operator()(const phi::CPUContext& dev_ctx,
                   const T* x_norm,
                   const T* y_norm,
                   const T* x,
4 changes: 2 additions & 2 deletions paddle/phi/kernels/funcs/math/cos_sim_functor.cu
@@ -50,7 +50,7 @@ __global__ void CosSimDyKernel(const T* x_norm,
 
 template <typename T>
 struct CosSimDyFunctor<phi::GPUContext, T> {
-  void operator()(const phi::GPUContext& ctx,
+  void operator()(const phi::GPUContext& dev_ctx,
                   const T* x_norm,
                   const T* y_norm,
                   const T* x,
@@ -63,7 +63,7 @@ struct CosSimDyFunctor<phi::GPUContext, T> {
     const int block_size = 512;
     dim3 threads(block_size, 1);
     dim3 grid((rows + block_size - 1) / block_size, 1);
-    CosSimDyKernel<T><<<grid, threads, 0, ctx.stream()>>>(
+    CosSimDyKernel<T><<<grid, threads, 0, dev_ctx.stream()>>>(
         x_norm, y_norm, x, y, z, dz, rows, cols, dy);
   }
 };
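The grid dimension in the last hunk is the usual ceiling division: enough block_size-wide blocks to cover rows items, with the overshoot typically guarded by a bounds check inside the kernel. A sketch of the arithmetic (kBlockSize and NumBlocks are illustrative names, not from this file):

    // Ceil-division launch arithmetic: rows = 1000, kBlockSize = 512
    // -> NumBlocks = 2, i.e. 1024 threads; threads past `rows` must be
    // masked off by a `row < rows` check in the kernel body.
    constexpr int kBlockSize = 512;
    inline int NumBlocks(int rows) {
      return (rows + kBlockSize - 1) / kBlockSize;
    }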