Commit 18cd5f2

fix expand_as kernel for big tensor
1 parent 7415b76 commit 18cd5f2

8 files changed, +50 -31 lines changed
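
For context, the old int attribute cannot hold a dimension larger than INT32_MAX, which is the "big tensor" case the commit title refers to. A minimal standalone sketch (plain C++, not Paddle code; the 3,000,000,000 dimension is an illustrative assumption) of the truncation that the int -> int64_t widening avoids:

```cpp
#include <cstdint>
#include <iostream>
#include <limits>

int main() {
  // Hypothetical target dimension of a "big" tensor: larger than INT32_MAX.
  int64_t dim = 3000000000LL;
  // Storing it in the old 32-bit attribute type silently corrupts the value.
  int32_t as_int32 = static_cast<int32_t>(dim);
  std::cout << "int64_t dim : " << dim << "\n";
  std::cout << "as int32_t  : " << as_int32 << "\n";  // wrapped, no longer the real size
  std::cout << "fits int32? : "
            << (dim <= static_cast<int64_t>(std::numeric_limits<int32_t>::max()))
            << "\n";
}
```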

paddle/phi/infermeta/binary.cc

Lines changed: 1 addition & 1 deletion
@@ -1816,7 +1816,7 @@ void CEmbeddingInferMeta(const MetaTensor& weight,
 
 void ExpandAsInferMeta(const MetaTensor& x,
                        const MetaTensor& y,
-                       const std::vector<int>& target_shape,
+                       const std::vector<int64_t>& target_shape,
                        MetaTensor* out) {
 #define MAX_RANK_SUPPORTED 8
   auto x_dims = x.dims();

paddle/phi/infermeta/binary.h

Lines changed: 1 addition & 1 deletion
@@ -348,7 +348,7 @@ void CEmbeddingInferMeta(const MetaTensor& weight,
 
 void ExpandAsInferMeta(const MetaTensor& x,
                        const MetaTensor& y,
-                       const std::vector<int>& target_shape,
+                       const std::vector<int64_t>& target_shape,
                        MetaTensor* out);
 
 void FakeDequantizeMaxAbsInferMeta(const MetaTensor& x,

paddle/phi/kernels/expand_as_kernel.h

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ template <typename T, typename Context>
 void ExpandAsKernel(const Context& ctx,
                     const DenseTensor& x,
                     const paddle::optional<DenseTensor>& y,
-                    const std::vector<int>& target_shape,
+                    const std::vector<int64_t>& target_shape,
                     DenseTensor* out);
 
 }  // namespace phi

paddle/phi/kernels/gpu/expand_as_kernel.cu

Lines changed: 37 additions & 9 deletions
@@ -23,21 +23,21 @@
 
 namespace phi {
 
-template <typename T, typename Context>
-void ExpandAsKernel(const Context& ctx,
-                    const DenseTensor& x,
-                    const paddle::optional<DenseTensor>& y,
-                    const std::vector<int>& target_shape_t,
-                    DenseTensor* out) {
-  std::vector<int> target_shape = target_shape_t;
+template <typename T, typename Context, typename ShapeType>
+void ExpandAsKernelImpl(const Context& ctx,
+                        const DenseTensor& x,
+                        const paddle::optional<DenseTensor>& y,
+                        const std::vector<ShapeType>& target_shape_t,
+                        DenseTensor* out) {
+  std::vector<ShapeType> target_shape = target_shape_t;
 
   if (y.get_ptr()) {
-    target_shape = phi::vectorize<int>(y.get_ptr()->dims());
+    target_shape = phi::vectorize<ShapeType>(y.get_ptr()->dims());
   }
 
   int rank = x.dims().size();
   int target_rank = static_cast<int>(target_shape.size());
-  auto vec_in_dims = common::vectorize<int>(x.dims());
+  auto vec_in_dims = common::vectorize<ShapeType>(x.dims());
 
   unsigned int diff = target_rank - rank;
   vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
@@ -80,6 +80,34 @@ void ExpandAsKernel(const Context& ctx,
   ExpandKernel<T, Context>(ctx, x, target_shape, out);
 }
 
+static inline std::vector<int> convert_to_int_vec(std::vector<int64_t> a) {
+  std::vector<int> ret;
+  for (size_t i = 0; i < a.size(); i++) {
+    ret.emplace_back(static_cast<int>(a[i]));
+  }
+
+  return ret;
+}
+
+template <typename T, typename Context>
+void ExpandAsKernel(const Context& ctx,
+                    const DenseTensor& x,
+                    const paddle::optional<DenseTensor>& y,
+                    const std::vector<int64_t>& target_shape_t,
+                    DenseTensor* out) {
+  bool use_int64 =
+      std::any_of(target_shape_t.begin(), target_shape_t.end(), [](int64_t v) {
+        return v > static_cast<int64_t>(std::numeric_limits<int32_t>::max());
+      });
+
+  if (use_int64) {
+    ExpandAsKernelImpl<T, Context, int64_t>(ctx, x, y, target_shape_t, out);
+  } else {
+    ExpandAsKernelImpl<T, Context, int32_t>(
+        ctx, x, y, convert_to_int_vec(target_shape_t), out);
+  }
+}
+
 }  // namespace phi
 
 PD_REGISTER_KERNEL(expand_as,
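
The added dispatch keeps the existing 32-bit path and only switches to 64-bit shape arithmetic when some target dimension exceeds INT32_MAX. A self-contained sketch of the same selection logic (a stand-in ApplyExpandAs function replaces ExpandAsKernelImpl; the helper names here are assumptions, not the actual Paddle API):

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <limits>
#include <vector>

// Stand-in for ExpandAsKernelImpl<T, Context, ShapeType>: just reports
// which shape type was selected.
template <typename ShapeType>
void ApplyExpandAs(const std::vector<ShapeType>& shape) {
  std::cout << "expand_as dispatched with " << sizeof(ShapeType) * 8
            << "-bit dims, rank " << shape.size() << "\n";
}

// Mirrors convert_to_int_vec: narrow int64_t dims once we know they all fit.
static std::vector<int32_t> ToIntVec(const std::vector<int64_t>& a) {
  return std::vector<int32_t>(a.begin(), a.end());
}

void Dispatch(const std::vector<int64_t>& target_shape) {
  // Use the 64-bit path only if some dimension cannot be represented as int32_t.
  bool use_int64 = std::any_of(
      target_shape.begin(), target_shape.end(), [](int64_t v) {
        return v > static_cast<int64_t>(std::numeric_limits<int32_t>::max());
      });
  if (use_int64) {
    ApplyExpandAs<int64_t>(target_shape);
  } else {
    ApplyExpandAs<int32_t>(ToIntVec(target_shape));
  }
}

int main() {
  Dispatch({2, 3, 4});                    // small dims -> 32-bit path
  Dispatch({1, (int64_t{1} << 31) + 5});  // oversized dim -> 64-bit path
}
```

On the 32-bit path behaviour is unchanged from before the commit; the narrowing is safe only because the any_of check has already rejected oversized dims.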

paddle/phi/kernels/impl/expand_as_kernel_impl.h

Lines changed: 5 additions & 5 deletions
@@ -27,13 +27,13 @@ namespace phi {
 template <typename Context, typename T, int Rank>
 void ExpandAs(const Context& context,
               const DenseTensor& x,
-              const std::vector<int>& target_shape,
+              const std::vector<int64_t>& target_shape,
               DenseTensor* out) {
   auto in_dims = x.dims();
   auto vec_in_dims = common::vectorize<int>(in_dims);
   auto diff = target_shape.size() - vec_in_dims.size();
   vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
-  std::vector<int> repeat_times(vec_in_dims.size());
+  std::vector<int64_t> repeat_times(vec_in_dims.size());
   if (Rank == 0) {
     phi::Copy<Context>(context, x, context.GetPlace(), false, out);
     return;
@@ -98,7 +98,7 @@ template <typename T, typename Context>
 void ExpandAsKernel(const Context& ctx,
                     const DenseTensor& x,
                     const paddle::optional<DenseTensor>& y,
-                    const std::vector<int>& target_shape,
+                    const std::vector<int64_t>& target_shape,
                     DenseTensor* out) {
   auto rank = x.dims().size();
   auto target_rank = target_shape.size();
@@ -124,12 +124,12 @@ void ExpandAsKernel(const Context& ctx,
                         target_rank,
                         MAX_RANK_SUPPORTED));
 
-  std::vector<int> real_target_shape = target_shape;
+  std::vector<int64_t> real_target_shape = target_shape;
   for (size_t i = 0; i < target_shape.size(); ++i) {
     if (target_shape[i] == -1) {
       if (y) {
         if (y->IsInitialized()) {
-          real_target_shape = common::vectorize<int>(y->dims());
+          real_target_shape = common::vectorize<int64_t>(y->dims());
         }
       }
       break;
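
The -1 handling touched by the last hunk can be read in isolation: when any target_shape entry is -1 and an initialized y is present, the whole shape is taken from y's dims. A small sketch of that resolution (a plain pointer stands in for the optional y tensor; the helper name is hypothetical):

```cpp
#include <cstdint>
#include <vector>

// Hypothetical helper mirroring the loop in ExpandAsKernel: resolve a
// target_shape that contains -1 by falling back to y's dims when available.
std::vector<int64_t> ResolveTargetShape(const std::vector<int64_t>& target_shape,
                                        const std::vector<int64_t>* y_dims) {
  std::vector<int64_t> real_target_shape = target_shape;
  for (int64_t d : target_shape) {
    if (d == -1) {
      if (y_dims != nullptr) {
        // Stands in for common::vectorize<int64_t>(y->dims()).
        real_target_shape = *y_dims;
      }
      break;  // only the first -1 matters; the shape is replaced wholesale
    }
  }
  return real_target_shape;
}
```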

paddle/phi/kernels/impl/solve_kernel_impl.h

Lines changed: 2 additions & 11 deletions
@@ -76,15 +76,6 @@ static std::vector<int64_t> get_broadcast_batch_portion(
   return batchPortion;
 }
 
-static inline std::vector<int> convert_to_int_vec(std::vector<int64_t> a) {
-  std::vector<int> ret;
-  for (size_t i = 0; i < a.size(); i++) {
-    ret.emplace_back(static_cast<int>(a[i]));
-  }
-
-  return ret;
-}
-
 // broadcast the batch dimensions of tensor x and tensor y.
 static inline std::tuple<std::vector<int64_t>, std::vector<int64_t>>
 get_broadcast_dims(const Tensor& x, const Tensor& y) {
@@ -150,11 +141,11 @@ static void linalg_solve(const Context& dev_ctx,
   Tensor tmp_x_bc;
 
   phi::ExpandAsKernel<T, Context>(
-      dev_ctx, tmp_x, nullptr, convert_to_int_vec(x_broadcast_dims), &tmp_x_bc);
+      dev_ctx, tmp_x, nullptr, x_broadcast_dims, &tmp_x_bc);
 
   Tensor tmp_y_bc;
   phi::ExpandAsKernel<T, Context>(
-      dev_ctx, tmp_y, nullptr, convert_to_int_vec(y_broadcast_dims), &tmp_y_bc);
+      dev_ctx, tmp_y, nullptr, y_broadcast_dims, &tmp_y_bc);
 
   auto x_dim = x.dims();
   auto y_dim = y.dims();

paddle/phi/ops/yaml/backward.yaml

Lines changed: 2 additions & 2 deletions
@@ -986,8 +986,8 @@
   composite : exp_grad(out, out_grad, x_grad)
 
 - backward_op : expand_as_grad
-  forward : expand_as (Tensor x, Tensor y, int[] target_shape = {}) -> Tensor(out)
-  args : (Tensor x, Tensor out_grad, int[] target_shape)
+  forward : expand_as (Tensor x, Tensor y, int64_t[] target_shape = {}) -> Tensor(out)
+  args : (Tensor x, Tensor out_grad, int64_t[] target_shape)
   output : Tensor(x_grad)
   infer_meta :
     func : UnchangedInferMeta

paddle/phi/ops/yaml/ops.yaml

Lines changed: 1 addition & 1 deletion
@@ -1754,7 +1754,7 @@
   backward : expand_grad
 
 - op : expand_as
-  args : (Tensor x, Tensor y, int[] target_shape = {})
+  args : (Tensor x, Tensor y, int64_t[] target_shape = {})
   output : Tensor(out)
   infer_meta :
     func : ExpandAsInferMeta
