Skip to content

Commit a8d35d9

Browse files
committed
Fix
1 parent 976dd60 commit a8d35d9

File tree

7 files changed

+159
-159
lines changed

7 files changed

+159
-159
lines changed

paddle/phi/kernels/funcs/concat_and_split_functor.cu

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ limitations under the License. */
2424
namespace phi {
2525
namespace funcs {
2626

27-
static inline void GetBlockDims(const phi::GPUContext& context,
27+
static inline void GetBlockDims(const phi::GPUContext& dev_ctx,
2828
int64_t num_rows,
2929
int64_t num_cols,
3030
dim3* block_dims,
@@ -39,7 +39,7 @@ static inline void GetBlockDims(const phi::GPUContext& context,
3939
*block_dims = dim3(block_cols, block_rows, 1);
4040

4141
constexpr int waves = 1;
42-
int max_threads = context.GetMaxPhysicalThreadCount() * waves;
42+
int max_threads = dev_ctx.GetMaxPhysicalThreadCount() * waves;
4343
int64_t max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
4444

4545
int grid_cols =
@@ -605,14 +605,14 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& dev_ctx,
605605

606606
template <typename T>
607607
struct ConcatFunctor<phi::GPUContext, T> {
608-
void operator()(const phi::GPUContext& context,
608+
void operator()(const phi::GPUContext& dev_ctx,
609609
const std::vector<phi::DenseTensor>& input,
610610
int axis,
611611
phi::DenseTensor* output) {
612612
if (output->numel() < std::numeric_limits<int32_t>::max()) {
613-
ConcatFunctorWithIndexType<T, int32_t>(context, input, axis, output);
613+
ConcatFunctorWithIndexType<T, int32_t>(dev_ctx, input, axis, output);
614614
} else {
615-
ConcatFunctorWithIndexType<T, int64_t>(context, input, axis, output);
615+
ConcatFunctorWithIndexType<T, int64_t>(dev_ctx, input, axis, output);
616616
}
617617
}
618618
};
@@ -805,7 +805,7 @@ void SplitFunctorDispatchWithIndexType(
805805
template <typename T>
806806
class SplitFunctor<phi::GPUContext, T> {
807807
public:
808-
void operator()(const phi::GPUContext& context,
808+
void operator()(const phi::GPUContext& dev_ctx,
809809
const phi::DenseTensor& input,
810810
const std::vector<const phi::DenseTensor*>& ref_inputs,
811811
int axis,
@@ -819,10 +819,10 @@ class SplitFunctor<phi::GPUContext, T> {
819819

820820
if (numel < std::numeric_limits<int32_t>::max()) {
821821
SplitFunctorDispatchWithIndexType<T, int32_t>(
822-
context, axis, input, ref_inputs, outputs);
822+
dev_ctx, axis, input, ref_inputs, outputs);
823823
} else {
824824
SplitFunctorDispatchWithIndexType<T, int64_t>(
825-
context, axis, input, ref_inputs, outputs);
825+
dev_ctx, axis, input, ref_inputs, outputs);
826826
}
827827
}
828828
};

paddle/phi/kernels/funcs/fc_functor.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ namespace phi {
2222
namespace funcs {
2323

2424
template <typename DeviceContext, typename T>
25-
void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
25+
void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& dev_ctx,
2626
const int M,
2727
const int N,
2828
const int K,
@@ -32,18 +32,18 @@ void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
3232
const T* B,
3333
bool relu,
3434
bool padding_weights) {
35-
auto blas = GetBlas<DeviceContext, T>(context);
35+
auto blas = GetBlas<DeviceContext, T>(dev_ctx);
3636
phi::DenseTensor Y1;
3737
T* Y1_data = nullptr;
3838
if (padding_weights) {
3939
const int NN = N + 4;
4040
const int KK = K + 4;
4141
phi::DenseTensor X1;
4242
X1.Resize({M * KK});
43-
T* X1_data = context.template HostAlloc<T>(&X1);
43+
T* X1_data = dev_ctx.template HostAlloc<T>(&X1);
4444

4545
Y1.Resize({M * (N + 4)});
46-
Y1_data = context.template HostAlloc<T>(&Y1);
46+
Y1_data = dev_ctx.template HostAlloc<T>(&Y1);
4747
#ifdef PADDLE_WITH_MKLML
4848
#pragma omp parallel for
4949
#endif

paddle/phi/kernels/funcs/fc_functor.cu

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -336,7 +336,7 @@ void AddReluKernel(gpuStream_t stream,
336336
#endif
337337

338338
template <typename DeviceContext, typename T>
339-
void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
339+
void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& dev_ctx,
340340
const int M,
341341
const int N,
342342
const int K,
@@ -350,7 +350,7 @@ void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
350350
false,
351351
errors::PermissionDenied(
352352
"Weight padding in fc can not be used in GPU scope."));
353-
auto blas = phi::funcs::GetBlas<DeviceContext, T>(context);
353+
auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
354354
blas.GEMM(CblasNoTrans,
355355
CblasNoTrans,
356356
M,
@@ -366,7 +366,7 @@ void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
366366
}
367367

368368
// M * N
369-
AddReluKernel(context.stream(), M, N, Y, B, relu);
369+
AddReluKernel(dev_ctx.stream(), M, N, Y, B, relu);
370370
}
371371

372372
template class FCFunctor<GPUContext, float16>;
@@ -375,7 +375,7 @@ template class FCFunctor<GPUContext, double>;
375375

376376
template <typename DeviceContext, typename T>
377377
void FCInt8Functor<DeviceContext, T>::operator()(
378-
const DeviceContext& context,
378+
const DeviceContext& dev_ctx,
379379
const int M,
380380
const int N,
381381
const int K,
@@ -399,9 +399,9 @@ void FCInt8Functor<DeviceContext, T>::operator()(
399399
DenseTensor quant_x_tensor, quant_y_tensor;
400400
quant_x_tensor.Resize(common::make_ddim({M, K}));
401401
quant_y_tensor.Resize(common::make_ddim({M, N}));
402-
context.template Alloc<int8_t>(&quant_x_tensor,
402+
dev_ctx.template Alloc<int8_t>(&quant_x_tensor,
403403
quant_x_tensor.numel() * sizeof(int8_t));
404-
context.template Alloc<int32_t>(&quant_y_tensor,
404+
dev_ctx.template Alloc<int32_t>(&quant_y_tensor,
405405
quant_y_tensor.numel() * sizeof(int32_t));
406406
LaunchQuantKernelWithVecSize<T>(X,
407407
quant_x_tensor.data<int8_t>(),
@@ -411,14 +411,14 @@ void FCInt8Functor<DeviceContext, T>::operator()(
411411
quant_round_type,
412412
quant_max_bound,
413413
quant_min_bound,
414-
context.stream());
414+
dev_ctx.stream());
415415

416416
MatmulKernel<int8_t, GPUContext>(
417-
context, quant_x_tensor, *w_tensor, false, false, &quant_y_tensor);
417+
dev_ctx, quant_x_tensor, *w_tensor, false, false, &quant_y_tensor);
418418

419419
DenseTensor scale_weights_dev;
420420
scale_weights_dev.Resize(common::make_ddim({N}));
421-
context.template Alloc<float>(&scale_weights_dev,
421+
dev_ctx.template Alloc<float>(&scale_weights_dev,
422422
scale_weights_dev.numel() * sizeof(float));
423423
float* scale_weights_dev_ptr = scale_weights_dev.data<float>();
424424
#ifdef PADDLE_WITH_HIP
@@ -436,15 +436,15 @@ void FCInt8Functor<DeviceContext, T>::operator()(
436436
phi::backends::gpu::GpuLaunchConfig config;
437437
if (N % DequantKernelVecSize == 0) {
438438
config = phi::backends::gpu::GetGpuLaunchConfig1D(
439-
context, M * N, DequantKernelVecSize);
439+
dev_ctx, M * N, DequantKernelVecSize);
440440
} else {
441-
config = phi::backends::gpu::GetGpuLaunchConfig1D(context, M * N, 1);
441+
config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, M * N, 1);
442442
}
443443
LaunchDequantKernelWithScaleOfInputAndWeight(quant_y_tensor.data<int32_t>(),
444444
Y,
445445
M,
446446
N,
447-
context.stream(),
447+
dev_ctx.stream(),
448448
&config,
449449
scale_in,
450450
scale_weights_dev_ptr,
@@ -455,7 +455,7 @@ void FCInt8Functor<DeviceContext, T>::operator()(
455455
}
456456

457457
// M * N
458-
AddReluKernel(context.stream(), M, N, Y, B, relu);
458+
AddReluKernel(dev_ctx.stream(), M, N, Y, B, relu);
459459
}
460460

461461
template class FCInt8Functor<GPUContext, float16>;

paddle/phi/kernels/funcs/math_function.cu

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ __global__ void TransposeNormalKernel(const T* in_ptr,
235235

236236
template <typename DeviceContext, typename T>
237237
void TransposeNormal<DeviceContext, T>::operator()(
238-
const DeviceContext& context,
238+
const DeviceContext& dev_ctx,
239239
const phi::DenseTensor& in,
240240
phi::DenseTensor* out,
241241
const std::vector<int>& axis) {
@@ -246,7 +246,7 @@ void TransposeNormal<DeviceContext, T>::operator()(
246246
auto* out_ptr = out->data<T>();
247247

248248
// copy in_stride, out_stride, axis to gpu device
249-
const phi::Place& cuda_place = context.GetPlace();
249+
const phi::Place& cuda_place = dev_ctx.GetPlace();
250250
phi::CPUPlace cpu_place = phi::CPUPlace();
251251
size_t size = 3 * rank * sizeof(int64_t);
252252
auto cpu_buf_holder = phi::memory_utils::Alloc(cpu_place, size);
@@ -259,26 +259,26 @@ void TransposeNormal<DeviceContext, T>::operator()(
259259
cpu_buf[2 * rank + i] = axis[i];
260260
}
261261
memory_utils::Copy(
262-
cuda_place, cuda_buf, cpu_place, cpu_buf, size, context.stream());
262+
cuda_place, cuda_buf, cpu_place, cpu_buf, size, dev_ctx.stream());
263263
REINTERPRET(const int64_t, in_stride_ptr, cuda_buf);
264264
REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank);
265265
REINTERPRET(const int64_t, axis_ptr, cuda_buf + 2 * rank);
266266

267-
const int MAX_BLOCK_DIM = context.GetMaxThreadsPerBlock();
268-
const int MAX_GRID_DIM = context.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM;
267+
const int MAX_BLOCK_DIM = dev_ctx.GetMaxThreadsPerBlock();
268+
const int MAX_GRID_DIM = dev_ctx.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM;
269269
int64_t elements = in.numel();
270270
int block_size = (elements >= MAX_BLOCK_DIM)
271271
? MAX_BLOCK_DIM
272272
: (1 << static_cast<int>(std::log2(elements)));
273273
int grid_size = elements / block_size;
274274
grid_size = (grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : grid_size;
275-
TransposeNormalKernel<T><<<grid_size, block_size, 0, context.stream()>>>(
275+
TransposeNormalKernel<T><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
276276
in_ptr, out_ptr, elements, in_stride_ptr, out_stride_ptr, axis_ptr, rank);
277277
}
278278

279279
template <typename T>
280280
struct TransposeNormal<phi::GPUContext, T> {
281-
void operator()(const phi::GPUContext& context,
281+
void operator()(const phi::GPUContext& dev_ctx,
282282
const DenseTensor& in,
283283
DenseTensor* out,
284284
const std::vector<int>& axis) {
@@ -289,7 +289,7 @@ struct TransposeNormal<phi::GPUContext, T> {
289289
auto* out_ptr = out->data<T>();
290290

291291
// copy in_stride, out_stride, axis to gpu device
292-
const phi::Place& cuda_place = context.GetPlace();
292+
const phi::Place& cuda_place = dev_ctx.GetPlace();
293293
phi::CPUPlace cpu_place = phi::CPUPlace();
294294
size_t size = 3 * rank * sizeof(int64_t);
295295
auto cpu_buf_holder = phi::memory_utils::Alloc(cpu_place, size);
@@ -302,22 +302,22 @@ struct TransposeNormal<phi::GPUContext, T> {
302302
cpu_buf[2 * rank + i] = axis[i];
303303
}
304304
memory_utils::Copy(
305-
cuda_place, cuda_buf, cpu_place, cpu_buf, size, context.stream());
305+
cuda_place, cuda_buf, cpu_place, cpu_buf, size, dev_ctx.stream());
306306
REINTERPRET(const int64_t, in_stride_ptr, cuda_buf);
307307
REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank);
308308
REINTERPRET(const int64_t, axis_ptr, cuda_buf + 2 * rank);
309309

310-
const int MAX_BLOCK_DIM = context.GetMaxThreadsPerBlock();
310+
const int MAX_BLOCK_DIM = dev_ctx.GetMaxThreadsPerBlock();
311311
const int MAX_GRID_DIM =
312-
context.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM;
312+
dev_ctx.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM;
313313
int64_t elements = in.numel();
314314
int block_size = (elements >= MAX_BLOCK_DIM)
315315
? MAX_BLOCK_DIM
316316
: (1 << static_cast<int>(std::log2(elements)));
317317
int grid_size = elements / block_size;
318318
grid_size = (grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : grid_size;
319319
TransposeNormalKernel<T>
320-
<<<grid_size, block_size, 0, context.stream()>>>(in_ptr,
320+
<<<grid_size, block_size, 0, dev_ctx.stream()>>>(in_ptr,
321321
out_ptr,
322322
elements,
323323
in_stride_ptr,
@@ -347,30 +347,30 @@ DEFINE_GPU_TRANS_NORMAL(phi::dtype::complex<float>);
347347
DEFINE_GPU_TRANS_NORMAL(phi::dtype::complex<double>);
348348

349349
struct TensorSetConstantGPU {
350-
TensorSetConstantGPU(const phi::DeviceContext& context,
350+
TensorSetConstantGPU(const phi::DeviceContext& dev_ctx,
351351
phi::DenseTensor* tensor,
352352
float value)
353-
: context_(context), tensor_(tensor), value_(value) {}
353+
: dev_ctx_(dev_ctx), tensor_(tensor), value_(value) {}
354354

355355
template <typename T>
356356
void apply() const {
357357
SetConstant<phi::GPUContext, T> functor;
358-
functor(reinterpret_cast<const phi::GPUContext&>(context_),
358+
functor(reinterpret_cast<const phi::GPUContext&>(dev_ctx_),
359359
tensor_,
360360
static_cast<T>(value_));
361361
}
362362

363-
const phi::DeviceContext& context_;
363+
const phi::DeviceContext& dev_ctx_;
364364
phi::DenseTensor* tensor_;
365365
float value_;
366366
};
367367

368368
template <>
369-
void set_constant_with_place<phi::GPUPlace>(const phi::DeviceContext& context,
369+
void set_constant_with_place<phi::GPUPlace>(const phi::DeviceContext& dev_ctx,
370370
phi::DenseTensor* tensor,
371371
float value) {
372372
phi::VisitDataType(tensor->dtype(),
373-
TensorSetConstantGPU(context, tensor, value));
373+
TensorSetConstantGPU(dev_ctx, tensor, value));
374374
}
375375

376376
template <typename T>
@@ -386,7 +386,7 @@ __global__ void RowwiseAddKernel(
386386

387387
template <typename T>
388388
struct RowwiseAdd<phi::GPUContext, T> {
389-
void operator()(const phi::GPUContext& context,
389+
void operator()(const phi::GPUContext& dev_ctx,
390390
const phi::DenseTensor& input,
391391
const phi::DenseTensor& vector,
392392
phi::DenseTensor* output) {
@@ -415,7 +415,7 @@ struct RowwiseAdd<phi::GPUContext, T> {
415415
out_dims_cstr));
416416
int blocks = 512;
417417
int grids = (input.numel() + blocks - 1) / blocks;
418-
RowwiseAddKernel<T><<<grids, blocks, 0, context.stream()>>>(
418+
RowwiseAddKernel<T><<<grids, blocks, 0, dev_ctx.stream()>>>(
419419
input.data<T>(),
420420
vector.data<T>(),
421421
output->data<T>(),

paddle/phi/kernels/funcs/math_function_blas_impl.h

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ namespace funcs {
3232
// and only failed for this case. So reimplemented it.
3333
template <>
3434
void ColwiseSum<phi::GPUContext, double>::operator()(
35-
const phi::GPUContext& context,
35+
const phi::GPUContext& dev_ctx,
3636
const phi::DenseTensor& input,
3737
phi::DenseTensor* vector) {
3838
auto in_dims = input.dims();
@@ -47,11 +47,11 @@ void ColwiseSum<phi::GPUContext, double>::operator()(
4747
vector->numel()));
4848
phi::DenseTensor one;
4949
one.Resize({in_dims[0]});
50-
context.template Alloc<double>(&one);
50+
dev_ctx.template Alloc<double>(&one);
5151

5252
SetConstant<phi::GPUContext, double> set;
53-
set(context, &one, static_cast<double>(1.0));
54-
phi::funcs::GetBlas<phi::GPUContext, double>(context).GEMV(
53+
set(dev_ctx, &one, static_cast<double>(1.0));
54+
phi::funcs::GetBlas<phi::GPUContext, double>(dev_ctx).GEMV(
5555
true,
5656
static_cast<int>(in_dims[0]),
5757
static_cast<int>(in_dims[1]),
@@ -68,7 +68,7 @@ void ColwiseSum<phi::GPUContext, double>::operator()(
6868
// mode,
6969
template <>
7070
void RowwiseSum<phi::GPUContext, double>::operator()(
71-
const phi::GPUContext& context,
71+
const phi::GPUContext& dev_ctx,
7272
const phi::DenseTensor& input,
7373
phi::DenseTensor* vector) {
7474
auto in_dims = input.dims();
@@ -83,11 +83,11 @@ void RowwiseSum<phi::GPUContext, double>::operator()(
8383
vector->numel()));
8484
phi::DenseTensor one;
8585
one.Resize({size});
86-
context.template Alloc<double>(&one);
86+
dev_ctx.template Alloc<double>(&one);
8787

8888
SetConstant<phi::GPUContext, double> set;
89-
set(context, &one, static_cast<double>(1.0));
90-
phi::funcs::GetBlas<phi::GPUContext, double>(context).GEMV(
89+
set(dev_ctx, &one, static_cast<double>(1.0));
90+
phi::funcs::GetBlas<phi::GPUContext, double>(dev_ctx).GEMV(
9191
true,
9292
static_cast<int>(in_dims[1]),
9393
static_cast<int>(in_dims[0]),

0 commit comments

Comments (0)