From 162f3417e537ab51e1c8615606365afb3ea04361 Mon Sep 17 00:00:00 2001
From: Hu Shenwei
Date: Thu, 14 Aug 2025 22:27:19 +0800
Subject: [PATCH 1/4] fix(math.py, unary.cc): fix output type diff for cumsum
 kernel

---
 paddle/phi/infermeta/unary.cc | 10 ++++++++--
 python/paddle/tensor/math.py  |  2 ++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc
index a30e9fd2f035e4..6c23946bb35c15 100644
--- a/paddle/phi/infermeta/unary.cc
+++ b/paddle/phi/infermeta/unary.cc
@@ -635,9 +635,15 @@ void CumInferMeta(const MetaTensor& x,
                   bool reverse,
                   MetaTensor* out) {
   auto x_dims = x.dims();
+  auto x_dtype = x.dtype();
+  auto out_dtype =
+      (x_dtype == phi::DataType::UINT8 || x_dtype == phi::DataType::INT8 ||
+       x_dtype == phi::DataType::INT16 || x_dtype == phi::DataType::INT32)
+          ? phi::DataType::INT64
+          : x_dtype;
   if (flatten) {
     out->set_dims(common::make_ddim({common::product(x_dims)}));
-    out->set_dtype(x.dtype());
+    out->set_dtype(out_dtype);
   } else {
     if (x_dims.size() > 0) {
       PADDLE_ENFORCE_GE(
@@ -667,7 +673,7 @@ void CumInferMeta(const MetaTensor& x,
                       axis));
     }
     out->set_dims(x_dims);
-    out->set_dtype(x.dtype());
+    out->set_dtype(out_dtype);
   }
 
   out->share_lod(x);
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index fe6b21e8a543ae..8de3cf18466379 100644
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -4350,6 +4350,8 @@ def cumsum(
         flatten = False
     if dtype is not None and x.dtype != convert_np_dtype_to_dtype_(dtype):
         x = cast(x, dtype)
+    elif x.dtype in [paddle.uint8, paddle.int8, paddle.int16, paddle.int32]:
+        x = cast(x, "int64")
 
     if in_dynamic_or_pir_mode():
         if axis is None:
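
A minimal sketch of the behavior PATCH 1/4 targets, assuming a Paddle build
with the patch applied (the expected dtypes are stated by the patch itself,
not taken from CI output):

    import numpy as np
    import paddle

    x = paddle.to_tensor([100, 100, 100], dtype='int32')
    y = paddle.cumsum(x)

    # NumPy promotes integer inputs narrower than the platform int before
    # accumulating, so the patched kernel should agree with it on both
    # dtype and values.
    z = np.cumsum(np.array([100, 100, 100], dtype=np.int32))
    print(y.dtype)  # expected: paddle.int64
    np.testing.assert_array_equal(z, y.numpy())
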
From b7ad9c5a8c1f762d9915fe711ede18fa539cb865 Mon Sep 17 00:00:00 2001
From: Hu Shenwei
Date: Fri, 15 Aug 2025 00:53:05 +0800
Subject: [PATCH 2/4] fix(math.py): fix output type diff for cumsum kernel

---
 python/paddle/tensor/math.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index 8de3cf18466379..0fac4dae52f84e 100644
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -4350,7 +4350,12 @@ def cumsum(
         flatten = False
     if dtype is not None and x.dtype != convert_np_dtype_to_dtype_(dtype):
         x = cast(x, dtype)
-    elif x.dtype in [paddle.uint8, paddle.int8, paddle.int16, paddle.int32]:
+    elif isinstance(x, paddle.Tensor) and x.dtype in [
+        paddle.uint8,
+        paddle.int8,
+        paddle.int16,
+        paddle.int32,
+    ]:
         x = cast(x, "int64")
 
     if in_dynamic_or_pir_mode():

From a0165e647eef0b0c38dc60a18ad6d9f8fbb47397 Mon Sep 17 00:00:00 2001
From: Hu Shenwei
Date: Fri, 15 Aug 2025 15:23:49 +0800
Subject: [PATCH 3/4] fix(math.py): fix `cumsum` documentation

---
 python/paddle/tensor/math.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index 0fac4dae52f84e..0408e07f4b131c 100644
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -4308,7 +4308,7 @@ def cumsum(
     Args:
         x (Tensor): The input tensor needed to be cumsumed.
         axis (int, optional): The dimension to accumulate along. -1 means the last dimension. The default (None) is to compute the cumsum over the flattened array.
-        dtype (str|paddle.dtype|np.dtype|None, optional): The data type of the output tensor, can be bfloat16, float16, float32, float64, int32, int64, complex64, complex128. If specified, the input tensor is casted to dtype before the operation is performed. This is useful for preventing data type overflows. The default value is None.
+        dtype (str|paddle.dtype|np.dtype|None, optional): The data type of the output tensor, can be bfloat16, float16, float32, float64, int32, int64, complex64, complex128. If specified, the input tensor is cast to dtype before the operation is performed, which is useful for preventing data type overflows. The default value is None; in that case, the output dtype is int64 if the input x is uint8/int8/int16/int32, and the same as x otherwise.
         name (str|None, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
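
A short usage sketch of the contract documented in PATCH 3/4, again assuming
the patched build: with dtype=None, narrow integer inputs are promoted to
int64; with an explicit dtype, the input is cast before accumulation.

    import paddle

    a = paddle.to_tensor([1, 2, 3], dtype='int16')

    # dtype=None: the narrow integer input accumulates in int64.
    print(paddle.cumsum(a).dtype)  # expected: paddle.int64

    # Explicit dtype: the input is cast to float64 first, the documented
    # way to sidestep integer overflow entirely.
    print(paddle.cumsum(a, dtype='float64').dtype)  # expected: paddle.float64
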
From 013def00efb1d08017809ca1e00ce85f48f06fa6 Mon Sep 17 00:00:00 2001
From: Hu Shenwei
Date: Mon, 18 Aug 2025 16:28:46 +0800
Subject: [PATCH 4/4] fix(cum/cum_grad.cc/cu, test_cumsum_op.py): fix output
 type diff for cumsum kernel and add unit test

---
 paddle/phi/infermeta/unary.cc             |  10 +-
 paddle/phi/kernels/cpu/cum_grad_kernel.cc |   2 +
 paddle/phi/kernels/cpu/cum_kernel.cc      |   2 +
 paddle/phi/kernels/gpu/cum_grad_kernel.cu |   2 +
 paddle/phi/kernels/gpu/cum_kernel.cu      |   2 +
 test/legacy_test/test_cumsum_op.py        | 213 ++++++++++++++++++++++
 6 files changed, 223 insertions(+), 8 deletions(-)

diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc
index 6c23946bb35c15..a30e9fd2f035e4 100644
--- a/paddle/phi/infermeta/unary.cc
+++ b/paddle/phi/infermeta/unary.cc
@@ -635,15 +635,9 @@ void CumInferMeta(const MetaTensor& x,
                   bool reverse,
                   MetaTensor* out) {
   auto x_dims = x.dims();
-  auto x_dtype = x.dtype();
-  auto out_dtype =
-      (x_dtype == phi::DataType::UINT8 || x_dtype == phi::DataType::INT8 ||
-       x_dtype == phi::DataType::INT16 || x_dtype == phi::DataType::INT32)
-          ? phi::DataType::INT64
-          : x_dtype;
   if (flatten) {
     out->set_dims(common::make_ddim({common::product(x_dims)}));
-    out->set_dtype(out_dtype);
+    out->set_dtype(x.dtype());
   } else {
     if (x_dims.size() > 0) {
       PADDLE_ENFORCE_GE(
@@ -673,7 +667,7 @@ void CumInferMeta(const MetaTensor& x,
                       axis));
     }
     out->set_dims(x_dims);
-    out->set_dtype(out_dtype);
+    out->set_dtype(x.dtype());
   }
 
   out->share_lod(x);
diff --git a/paddle/phi/kernels/cpu/cum_grad_kernel.cc b/paddle/phi/kernels/cpu/cum_grad_kernel.cc
index 0f5cf47c822bd4..9fbc51b5f4232b 100644
--- a/paddle/phi/kernels/cpu/cum_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/cum_grad_kernel.cc
@@ -54,6 +54,8 @@ PD_REGISTER_KERNEL(cumsum_grad,
                    phi::CumsumGradKernel,
                    float,
                    double,
+                   uint8_t,
+                   int8_t,
                    int16_t,
                    int,
                    int64_t,
diff --git a/paddle/phi/kernels/cpu/cum_kernel.cc b/paddle/phi/kernels/cpu/cum_kernel.cc
index 69578a27cff314..190b16a9c22e7d 100644
--- a/paddle/phi/kernels/cpu/cum_kernel.cc
+++ b/paddle/phi/kernels/cpu/cum_kernel.cc
@@ -273,6 +273,8 @@ PD_REGISTER_KERNEL(cumsum,
                    phi::CumsumKernel,
                    float,
                    double,
+                   uint8_t,
+                   int8_t,
                    int16_t,
                    int,
                    int64_t,
diff --git a/paddle/phi/kernels/gpu/cum_grad_kernel.cu b/paddle/phi/kernels/gpu/cum_grad_kernel.cu
index 8f1d5c43940e15..91bcb70a17a81e 100644
--- a/paddle/phi/kernels/gpu/cum_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/cum_grad_kernel.cu
@@ -81,6 +81,8 @@ PD_REGISTER_KERNEL(cumsum_grad,
                    phi::CumsumGradKernel,
                    float,
                    double,
+                   uint8_t,
+                   int8_t,
                    int16_t,
                    int,
                    int64_t,
diff --git a/paddle/phi/kernels/gpu/cum_kernel.cu b/paddle/phi/kernels/gpu/cum_kernel.cu
index 279b48312746bd..c11cc538a033e0 100644
--- a/paddle/phi/kernels/gpu/cum_kernel.cu
+++ b/paddle/phi/kernels/gpu/cum_kernel.cu
@@ -508,6 +508,8 @@ PD_REGISTER_KERNEL(cumsum,
                    phi::CumsumKernel,
                    float,
                    double,
+                   uint8_t,
+                   int8_t,
                    int16_t,
                    int,
                    int64_t,
diff --git a/test/legacy_test/test_cumsum_op.py b/test/legacy_test/test_cumsum_op.py
index f218892447978e..32e76d0c424a61 100644
--- a/test/legacy_test/test_cumsum_op.py
+++ b/test/legacy_test/test_cumsum_op.py
@@ -124,6 +124,219 @@ def test_name(self):
         self.assertTrue('out' in y.name)
 
 
+class TestCumsumOp_INT(unittest.TestCase):
+    def run_cases(self):
+        data_np = np.arange(12).reshape(3, 4).astype(np.uint8)
+        data = paddle.to_tensor(data_np)
+        y = paddle.cumsum(data)
+        z = np.cumsum(data_np)
+        np.testing.assert_array_equal(z, y.numpy())
+        y = paddle.cumsum(data, axis=0)
+        z = np.cumsum(data_np, axis=0)
+        np.testing.assert_array_equal(z, y.numpy())
+        y = paddle.cumsum(data, axis=-1)
+        z = np.cumsum(data_np, axis=-1)
+        np.testing.assert_array_equal(z, y.numpy())
+        y = paddle.cumsum(data, axis=-2)
+        z = np.cumsum(data_np, axis=-2)
+        np.testing.assert_array_equal(z, y.numpy())
+
+        data_np = np.arange(12).reshape(3, 4).astype(np.int8)
+        data = paddle.to_tensor(data_np)
+        y = paddle.cumsum(data)
+        z = np.cumsum(data_np)
+        np.testing.assert_array_equal(z, y.numpy())
+        y = paddle.cumsum(data, axis=0)
+        z = np.cumsum(data_np, axis=0)
+        np.testing.assert_array_equal(z, y.numpy())
+        y = paddle.cumsum(data, axis=-1)
+        z = np.cumsum(data_np, axis=-1)
+        np.testing.assert_array_equal(z, y.numpy())
+        y = paddle.cumsum(data, axis=-2)
+        z = np.cumsum(data_np, axis=-2)
+        np.testing.assert_array_equal(z, y.numpy())
+
+        data_np = np.arange(12).reshape(3, 4).astype(np.int16)
+        data = paddle.to_tensor(data_np)
+        y = paddle.cumsum(data)
+        z = np.cumsum(data_np)
+        np.testing.assert_array_equal(z, y.numpy())
+        y = paddle.cumsum(data, axis=0)
+        z = np.cumsum(data_np, axis=0)
+        np.testing.assert_array_equal(z, y.numpy())
+        y = paddle.cumsum(data, axis=-1)
+        z = np.cumsum(data_np, axis=-1)
+        np.testing.assert_array_equal(z, y.numpy())
+        y = paddle.cumsum(data, axis=-2)
+        z = np.cumsum(data_np, axis=-2)
+        np.testing.assert_array_equal(z, y.numpy())
+
+        data_np = np.arange(12).reshape(3, 4).astype(np.int32)
+        data = paddle.to_tensor(data_np)
+        y = paddle.cumsum(data)
+        z = np.cumsum(data_np)
+        np.testing.assert_array_equal(z, y.numpy())
+        y = paddle.cumsum(data, axis=0)
+        z = np.cumsum(data_np, axis=0)
+        np.testing.assert_array_equal(z, y.numpy())
+        y = paddle.cumsum(data, axis=-1)
+        z = np.cumsum(data_np, axis=-1)
+        np.testing.assert_array_equal(z, y.numpy())
+        y = paddle.cumsum(data, axis=-2)
+        z = np.cumsum(data_np, axis=-2)
+        np.testing.assert_array_equal(z, y.numpy())
+
+    def run_static_uint8(self, use_gpu=False):
+        with paddle.static.program_guard(paddle.static.Program()):
+            data_np = np.random.randint(0, 10, (100, 100)).astype(np.uint8)
+            x = paddle.static.data('X', [100, 100], dtype='uint8')
+            y = paddle.cumsum(x)
+            y2 = paddle.cumsum(x, axis=0)
+            y3 = paddle.cumsum(x, axis=-1)
+            y4 = paddle.cumsum(x, axis=-2)
+            place = base.CUDAPlace(0) if use_gpu else base.CPUPlace()
+            exe = base.Executor(place)
+            exe.run(paddle.static.default_startup_program())
+            out = exe.run(
+                feed={'X': data_np},
+                fetch_list=[
+                    y,
+                    y2,
+                    y3,
+                    y4,
+                ],
+            )
+            z = np.cumsum(data_np)
+            np.testing.assert_allclose(z, out[0], rtol=1e-05)
+            z = np.cumsum(data_np, axis=0)
+            np.testing.assert_allclose(z, out[1], rtol=1e-05)
+            z = np.cumsum(data_np, axis=-1)
+            np.testing.assert_allclose(z, out[2], rtol=1e-05)
+            z = np.cumsum(data_np, axis=-2)
+            np.testing.assert_allclose(z, out[3], rtol=1e-05)
+
+    def run_static_int8(self, use_gpu=False):
+        with paddle.static.program_guard(paddle.static.Program()):
+            data_np = np.random.randint(0, 10, (100, 100)).astype(np.int8)
+            x = paddle.static.data('X', [100, 100], dtype='int8')
+            y = paddle.cumsum(x)
+            y2 = paddle.cumsum(x, axis=0)
+            y3 = paddle.cumsum(x, axis=-1)
+            y4 = paddle.cumsum(x, axis=-2)
+            place = base.CUDAPlace(0) if use_gpu else base.CPUPlace()
+            exe = base.Executor(place)
+            exe.run(paddle.static.default_startup_program())
+            out = exe.run(
+                feed={'X': data_np},
+                fetch_list=[
+                    y,
+                    y2,
+                    y3,
+                    y4,
+                ],
+            )
+            z = np.cumsum(data_np)
+            np.testing.assert_allclose(z, out[0], rtol=1e-05)
+            z = np.cumsum(data_np, axis=0)
+            np.testing.assert_allclose(z, out[1], rtol=1e-05)
+            z = np.cumsum(data_np, axis=-1)
+            np.testing.assert_allclose(z, out[2], rtol=1e-05)
+            z = np.cumsum(data_np, axis=-2)
+            np.testing.assert_allclose(z, out[3], rtol=1e-05)
+
+    def run_static_int16(self, use_gpu=False):
+        with paddle.static.program_guard(paddle.static.Program()):
+            data_np = np.random.randint(0, 10, (100, 100)).astype(np.int16)
+            x = paddle.static.data('X', [100, 100], dtype='int16')
+            y = paddle.cumsum(x)
+            y2 = paddle.cumsum(x, axis=0)
+            y3 = paddle.cumsum(x, axis=-1)
+            y4 = paddle.cumsum(x, axis=-2)
+            place = base.CUDAPlace(0) if use_gpu else base.CPUPlace()
+            exe = base.Executor(place)
+            exe.run(paddle.static.default_startup_program())
+            out = exe.run(
+                feed={'X': data_np},
+                fetch_list=[
+                    y,
+                    y2,
+                    y3,
+                    y4,
+                ],
+            )
+            z = np.cumsum(data_np)
+            np.testing.assert_allclose(z, out[0], rtol=1e-05)
+            z = np.cumsum(data_np, axis=0)
+            np.testing.assert_allclose(z, out[1], rtol=1e-05)
+            z = np.cumsum(data_np, axis=-1)
+            np.testing.assert_allclose(z, out[2], rtol=1e-05)
+            z = np.cumsum(data_np, axis=-2)
+            np.testing.assert_allclose(z, out[3], rtol=1e-05)
+
+    def run_static_uint16(self, use_gpu=False):
+        with paddle.static.program_guard(paddle.static.Program()):
+            data_np = np.random.random((100, 100)).astype(np.uint16)
+            x = paddle.static.data('X', [100, 100], dtype='uint16')
+            y = paddle.cumsum(x)
+            y2 = paddle.cumsum(x, axis=0)
+            y3 = paddle.cumsum(x, axis=-1)
+            y4 = paddle.cumsum(x, axis=-2)
+            place = base.CUDAPlace(0) if use_gpu else base.CPUPlace()
+            exe = base.Executor(place)
+            exe.run(paddle.static.default_startup_program())
+            out = exe.run(
+                feed={'X': data_np},
+                fetch_list=[
+                    y,
+                    y2,
+                    y3,
+                    y4,
+                ],
+            )
+            z = np.cumsum(data_np)
+            np.testing.assert_allclose(z, out[0], rtol=1e-05)
+            z = np.cumsum(data_np, axis=0)
+            np.testing.assert_allclose(z, out[1], rtol=1e-05)
+            z = np.cumsum(data_np, axis=-1)
+            np.testing.assert_allclose(z, out[2], rtol=1e-05)
+            z = np.cumsum(data_np, axis=-2)
+            np.testing.assert_allclose(z, out[3], rtol=1e-05)
+
+    def test_cpu_dygraph(self):
+        paddle.disable_static(paddle.base.CPUPlace())
+        self.run_cases()
+        paddle.enable_static()
+
+    def test_cpu_static(self):
+        self.run_static_uint8()
+        self.run_static_int8()
+        self.run_static_int16()
+
+    def test_gpu_dygraph(self):
+        if not base.core.is_compiled_with_cuda():
+            return
+        paddle.disable_static(paddle.base.CUDAPlace(0))
+        self.run_cases()
+        paddle.enable_static()
+
+    def test_gpu_static(self):
+        if not base.core.is_compiled_with_cuda():
+            return
+        self.run_static_uint8(use_gpu=True)
+        self.run_static_int8(use_gpu=True)
+        self.run_static_uint16(use_gpu=True)
+        self.run_static_int16(use_gpu=True)
+
+    def test_name(self):
+        with (
+            paddle.pir_utils.OldIrGuard(),
+            base.program_guard(base.Program()),
+        ):
+            x = paddle.static.data('x', [3, 4])
+            y = paddle.cumsum(x, name='out')
+            self.assertTrue('out' in y.name)
+
+
 def cumsum_wrapper(x, axis=-1, flatten=False, exclusive=False, reverse=False):
     return paddle._C_ops.cumsum(x, axis, flatten, exclusive, reverse)
 
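
The four run_static_* helpers in PATCH 4/4 differ only in the element type. A
parameterized variant, sketched below under the same imports as
test_cumsum_op.py and not part of the patch, would keep the coverage without
the duplication (the 'uint16' case is left aside here, since Paddle stores
bfloat16 values in uint16 buffers and the NumPy comparison would not be
meaningful):

    def _run_static_case(np_dtype, use_gpu=False):
        # Shared body for the per-dtype static-graph checks; the element
        # type is the only thing that varied across the four helpers.
        with paddle.static.program_guard(paddle.static.Program()):
            data_np = np.random.randint(0, 10, (100, 100)).astype(np_dtype)
            x = paddle.static.data(
                'X', [100, 100], dtype=np.dtype(np_dtype).name
            )
            fetches = [
                paddle.cumsum(x),
                paddle.cumsum(x, axis=0),
                paddle.cumsum(x, axis=-1),
                paddle.cumsum(x, axis=-2),
            ]
            place = base.CUDAPlace(0) if use_gpu else base.CPUPlace()
            exe = base.Executor(place)
            exe.run(paddle.static.default_startup_program())
            out = exe.run(feed={'X': data_np}, fetch_list=fetches)
        for got, axis in zip(out, [None, 0, -1, -2]):
            np.testing.assert_allclose(
                np.cumsum(data_np, axis=axis), got, rtol=1e-05
            )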