From 7bb6b61aeb9f3026e01102054e0a822b24e1a5fb Mon Sep 17 00:00:00 2001
From: xingmingyyj <zxm_3791@163.com>
Date: Tue, 15 Jul 2025 21:56:26 +0800
Subject: [PATCH 1/8] fix deform_conv2d api

---
 .../plugin/deformable_conv_op_plugin.cu       | 170 +++++++--------
 .../cpu/deformable_conv_grad_kernel.cc        | 167 +++++++--------
 .../kernels/funcs/deformable_conv_functor.cc  |  70 +++---
 .../kernels/funcs/deformable_conv_functor.cu  |  79 +++----
 .../gpu/deformable_conv_grad_kernel.cu        | 199 +++++++++---------
 .../impl/deformable_conv_grad_kernel_impl.h   |  14 +-
 .../impl/deformable_conv_kernel_impl.h        |  13 +-
 7 files changed, 359 insertions(+), 353 deletions(-)
diff --git a/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu
index 93c9c221b26392..fa9214d990d835 100644
--- a/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu
@@ -34,9 +34,9 @@ namespace plugin {
 static constexpr int kNumCUDAThreads = 512;
 static constexpr int kNumMaximumNumBlocks = 4096;
 
-static inline int NumBlocks(const int N) {
-  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
-                  kNumMaximumNumBlocks);
+static inline int NumBlocks(const int64_t N) {
+  return std::min<int64_t>((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
+                           static_cast<int64_t>(kNumMaximumNumBlocks));
 }
 
 static inline int ConvOutputSize(
@@ -367,66 +367,66 @@ __device__ half DmcnIm2colBilinear<half>(const half* bottom_data,
 
 template <typename T>
 __global__ void ModulatedDeformableIm2colGpuKernel(
-    const int nthreads,
+    const int64_t nthreads,
     const T* data_im,
     const T* data_offset,
     const T* data_mask,
-    const int height,
-    const int width,
-    const int kernel_h,
-    const int kernel_w,
-    const int pad_h,
-    const int pad_w,
-    const int stride_h,
-    const int stride_w,
-    const int dilation_h,
-    const int dilation_w,
-    const int channel_per_deformable_group,
-    const int batch_size,
-    const int num_channels,
-    const int deformable_group,
-    const int height_col,
-    const int width_col,
+    const int64_t height,
+    const int64_t width,
+    const int64_t kernel_h,
+    const int64_t kernel_w,
+    const int64_t pad_h,
+    const int64_t pad_w,
+    const int64_t stride_h,
+    const int64_t stride_w,
+    const int64_t dilation_h,
+    const int64_t dilation_w,
+    const int64_t channel_per_deformable_group,
+    const int64_t batch_size,
+    const int64_t num_channels,
+    const int64_t deformable_group,
+    const int64_t height_col,
+    const int64_t width_col,
     T* data_col);
 
 template <>
 __global__ void ModulatedDeformableIm2colGpuKernel<float>(
-    const int nthreads,
+    const int64_t nthreads,
     const float* data_im,
     const float* data_offset,
     const float* data_mask,
-    const int height,
-    const int width,
-    const int kernel_h,
-    const int kernel_w,
-    const int pad_h,
-    const int pad_w,
-    const int stride_h,
-    const int stride_w,
-    const int dilation_h,
-    const int dilation_w,
-    const int channel_per_deformable_group,
-    const int batch_size,
-    const int num_channels,
-    const int deformable_group,
-    const int height_col,
-    const int width_col,
+    const int64_t height,
+    const int64_t width,
+    const int64_t kernel_h,
+    const int64_t kernel_w,
+    const int64_t pad_h,
+    const int64_t pad_w,
+    const int64_t stride_h,
+    const int64_t stride_w,
+    const int64_t dilation_h,
+    const int64_t dilation_w,
+    const int64_t channel_per_deformable_group,
+    const int64_t batch_size,
+    const int64_t num_channels,
+    const int64_t deformable_group,
+    const int64_t height_col,
+    const int64_t width_col,
     float* data_col) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
+  int64_t index = blockIdx.x * blockDim.x + threadIdx.x;
+  int64_t offset = blockDim.x * gridDim.x;
 
   float minus_one = -1.0f, height_t = height, width_t = width;
-  for (size_t i = index; i < nthreads; i += offset) {
-    const int w_col = i % width_col;
-    const int h_col = (i / width_col) % height_col;
-    const int b_col = (i / width_col) / height_col % batch_size;
-    const int c_im = (i / width_col / height_col) / batch_size;
-    const int c_col = c_im * kernel_h * kernel_w;
+  for (int64_t i = index; i < nthreads; i += offset) {
+    const int64_t w_col = i % width_col;
+    const int64_t h_col = (i / width_col) % height_col;
+    const int64_t b_col = (i / width_col) / height_col % batch_size;
+    const int64_t c_im = (i / width_col / height_col) / batch_size;
+    const int64_t c_col = c_im * kernel_h * kernel_w;
 
-    const int deformable_group_index = c_im / channel_per_deformable_group;
+    const int64_t deformable_group_index = c_im / channel_per_deformable_group;
 
-    const int h_in = h_col * stride_h - pad_h;
-    const int w_in = w_col * stride_w - pad_w;
+    const int64_t h_in = h_col * stride_h - pad_h;
+    const int64_t w_in = w_col * stride_w - pad_w;
 
     float* data_col_ptr =
         data_col +
@@ -440,14 +440,14 @@ __global__ void ModulatedDeformableIm2colGpuKernel<float>(
         data_mask + (b_col * deformable_group + deformable_group_index) *
                         kernel_h * kernel_w * height_col * width_col;
 
-    for (int i = 0; i < kernel_h; ++i) {
-      for (int j = 0; j < kernel_w; ++j) {
-        const int data_offset_h_ptr =
+    for (int64_t i = 0; i < kernel_h; ++i) {
+      for (int64_t j = 0; j < kernel_w; ++j) {
+        const int64_t data_offset_h_ptr =
             ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
-        const int data_offset_w_ptr =
+        const int64_t data_offset_w_ptr =
             ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
             w_col;
-        const int data_mask_hw_ptr =
+        const int64_t data_mask_hw_ptr =
             ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
 
         const float offset_h = data_offset_ptr[data_offset_h_ptr];
@@ -471,43 +471,43 @@ __global__ void ModulatedDeformableIm2colGpuKernel<float>(
 
 template <>
 __global__ void ModulatedDeformableIm2colGpuKernel<half>(
-    const int nthreads,
+    const int64_t nthreads,
     const half* data_im,
     const half* data_offset,
     const half* data_mask,
-    const int height,
-    const int width,
-    const int kernel_h,
-    const int kernel_w,
-    const int pad_h,
-    const int pad_w,
-    const int stride_h,
-    const int stride_w,
-    const int dilation_h,
-    const int dilation_w,
-    const int channel_per_deformable_group,
-    const int batch_size,
-    const int num_channels,
-    const int deformable_group,
-    const int height_col,
-    const int width_col,
+    const int64_t height,
+    const int64_t width,
+    const int64_t kernel_h,
+    const int64_t kernel_w,
+    const int64_t pad_h,
+    const int64_t pad_w,
+    const int64_t stride_h,
+    const int64_t stride_w,
+    const int64_t dilation_h,
+    const int64_t dilation_w,
+    const int64_t channel_per_deformable_group,
+    const int64_t batch_size,
+    const int64_t num_channels,
+    const int64_t deformable_group,
+    const int64_t height_col,
+    const int64_t width_col,
     half* data_col) {
 #if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
+  int64_t index = blockIdx.x * blockDim.x + threadIdx.x;
+  int64_t offset = blockDim.x * gridDim.x;
 
   half minus_one = -1.0f, height_t = height, width_t = width;
   for (size_t i = index; i < nthreads; i += offset) {
-    const int w_col = i % width_col;
-    const int h_col = (i / width_col) % height_col;
-    const int b_col = (i / width_col) / height_col % batch_size;
-    const int c_im = (i / width_col / height_col) / batch_size;
-    const int c_col = c_im * kernel_h * kernel_w;
+    const int64_t w_col = i % width_col;
+    const int64_t h_col = (i / width_col) % height_col;
+    const int64_t b_col = (i / width_col) / height_col % batch_size;
+    const int64_t c_im = (i / width_col / height_col) / batch_size;
+    const int64_t c_col = c_im * kernel_h * kernel_w;
 
-    const int deformable_group_index = c_im / channel_per_deformable_group;
+    const int64_t deformable_group_index = c_im / channel_per_deformable_group;
 
-    const int h_in = h_col * stride_h - pad_h;
-    const int w_in = w_col * stride_w - pad_w;
+    const int64_t h_in = h_col * stride_h - pad_h;
+    const int64_t w_in = w_col * stride_w - pad_w;
 
     half* data_col_ptr =
         data_col +
@@ -521,14 +521,14 @@ __global__ void ModulatedDeformableIm2colGpuKernel<half>(
         data_mask + (b_col * deformable_group + deformable_group_index) *
                         kernel_h * kernel_w * height_col * width_col;
 
-    for (int i = 0; i < kernel_h; ++i) {
-      for (int j = 0; j < kernel_w; ++j) {
-        const int data_offset_h_ptr =
+    for (int64_t i = 0; i < kernel_h; ++i) {
+      for (int64_t j = 0; j < kernel_w; ++j) {
+        const int64_t data_offset_h_ptr =
             ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
-        const int data_offset_w_ptr =
+        const int64_t data_offset_w_ptr =
             ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
             w_col;
-        const int data_mask_hw_ptr =
+        const int64_t data_mask_hw_ptr =
             ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
 
         const half offset_h = data_offset_ptr[data_offset_h_ptr];
diff --git a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
index a8889d09aa757e..9608636aca6907 100644
--- a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
@@ -22,50 +22,50 @@ namespace phi {
 
 template <typename T>
 inline void ModulatedDeformableCol2imCPUKernel(
-    const int num_kernels,
+    const int64_t num_kernels,
     const T* data_col,
     const T* data_offset,
     const T* data_mask,
-    const int channels,
-    const int height,
-    const int width,
-    const int kernel_h,
-    const int kernel_w,
-    const int pad_h,
-    const int pad_w,
-    const int stride_h,
-    const int stride_w,
-    const int dilation_h,
-    const int dilation_w,
-    const int channel_per_deformable_group,
-    const int batch_size,
-    const int deformable_group,
-    const int height_col,
-    const int width_col,
+    const int64_t channels,
+    const int64_t height,
+    const int64_t width,
+    const int64_t kernel_h,
+    const int64_t kernel_w,
+    const int64_t pad_h,
+    const int64_t pad_w,
+    const int64_t stride_h,
+    const int64_t stride_w,
+    const int64_t dilation_h,
+    const int64_t dilation_w,
+    const int64_t channel_per_deformable_group,
+    const int64_t batch_size,
+    const int64_t deformable_group,
+    const int64_t height_col,
+    const int64_t width_col,
     T* grad_im) {
-  for (int thread = 0; thread < num_kernels; thread++) {
-    const int j = (thread / width_col / height_col / batch_size) % kernel_w;
-    const int i =
+  for (int64_t thread = 0; thread < num_kernels; thread++) {
+    const int64_t j = (thread / width_col / height_col / batch_size) % kernel_w;
+    const int64_t i =
         (thread / width_col / height_col / batch_size / kernel_w) % kernel_h;
-    const int c =
+    const int64_t c =
         thread / width_col / height_col / batch_size / kernel_w / kernel_h;
 
-    const int deformable_group_index = c / channel_per_deformable_group;
+    const int64_t deformable_group_index = c / channel_per_deformable_group;
 
-    int w_out = thread % width_col;
-    int h_out = (thread / width_col) % height_col;
-    int b = (thread / width_col / height_col) % batch_size;
-    int w_in = w_out * stride_w - pad_w;
-    int h_in = h_out * stride_h - pad_h;
+    int64_t w_out = thread % width_col;
+    int64_t h_out = (thread / width_col) % height_col;
+    int64_t b = (thread / width_col / height_col) % batch_size;
+    int64_t w_in = w_out * stride_w - pad_w;
+    int64_t h_in = h_out * stride_h - pad_h;
 
     const T* data_offset_ptr =
         data_offset + (b * deformable_group + deformable_group_index) * 2 *
                           kernel_h * kernel_w * height_col * width_col;
-    const int data_offset_h_ptr =
+    const int64_t data_offset_h_ptr =
         ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
-    const int data_offset_w_ptr =
+    const int64_t data_offset_w_ptr =
         ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
-    const int data_mask_hw_ptr =
+    const int64_t data_mask_hw_ptr =
         ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
     const T offset_h = data_offset_ptr[data_offset_h_ptr];
     const T offset_w = data_offset_ptr[data_offset_w_ptr];
@@ -80,14 +80,14 @@ inline void ModulatedDeformableCol2imCPUKernel(
       const T mask = data_mask_ptr[data_mask_hw_ptr];
       cur_top_grad *= mask;
     }
-    const int cur_h = static_cast<int>(cur_inv_h_data);
-    const int cur_w = static_cast<int>(cur_inv_w_data);
-    for (int dy = -2; dy <= 2; dy++) {
-      for (int dx = -2; dx <= 2; dx++) {
+    const int64_t cur_h = static_cast<int64_t>(cur_inv_h_data);
+    const int64_t cur_w = static_cast<int64_t>(cur_inv_w_data);
+    for (int64_t dy = -2; dy <= 2; dy++) {
+      for (int64_t dx = -2; dx <= 2; dx++) {
         if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
             cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
             abs(cur_inv_w_data - (cur_w + dx)) < 1) {
-          int cur_bottom_grad_pos =
+          int64_t cur_bottom_grad_pos =
               ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
           T weight = DmcnGetGradientWeight(cur_inv_h_data,
                                            cur_inv_w_data,
@@ -117,10 +117,10 @@ void ModulatedDeformableCol2im(const Context& dev_ctx,
                                const std::vector<int>& dilation,
                                const int deformable_group,
                                T* grad_im) {
-  int channel_per_deformable_group =
-      static_cast<int>(im_shape[0] / deformable_group);
-  int num_kernels = static_cast<int>(col_shape[0] * col_shape[1] *
-                                     col_shape[2] * col_shape[3]);
+  int64_t channel_per_deformable_group =
+      static_cast<int64_t>(im_shape[0] / deformable_group);
+  int64_t num_kernels = static_cast<int64_t>(col_shape[0] * col_shape[1] *
+                                             col_shape[2] * col_shape[3]);
 
   ModulatedDeformableCol2imCPUKernel(num_kernels,
                                      data_col,
@@ -147,40 +147,40 @@ void ModulatedDeformableCol2im(const Context& dev_ctx,
 
 template <typename T>
 void ModulatedDeformableCol2imCoordCPUKernel(
-    const int num_kernels,
+    const int64_t num_kernels,
     const T* data_col,
     const T* data_im,
     const T* data_offset,
     const T* data_mask,
-    const int channels,
-    const int height,
-    const int width,
-    const int kernel_h,
-    const int kernel_w,
-    const int pad_h,
-    const int pad_w,
-    const int stride_h,
-    const int stride_w,
-    const int dilation_h,
-    const int dilation_w,
-    const int channel_per_deformable_group,
-    const int batch_size,
-    const int offset_channels,
-    const int deformable_group,
-    const int height_col,
-    const int width_col,
+    const int64_t channels,
+    const int64_t height,
+    const int64_t width,
+    const int64_t kernel_h,
+    const int64_t kernel_w,
+    const int64_t pad_h,
+    const int64_t pad_w,
+    const int64_t stride_h,
+    const int64_t stride_w,
+    const int64_t dilation_h,
+    const int64_t dilation_w,
+    const int64_t channel_per_deformable_group,
+    const int64_t batch_size,
+    const int64_t offset_channels,
+    const int64_t deformable_group,
+    const int64_t height_col,
+    const int64_t width_col,
     T* grad_offset,
     T* grad_mask) {
-  for (int i = 0; i < num_kernels; i++) {
+  for (int64_t i = 0; i < num_kernels; i++) {
     T val = 0, mval = 0;
-    const int w = i % width_col;
-    const int h = (i / width_col) % height_col;
-    const int c = (i / width_col / height_col) % offset_channels;
-    const int b = (i / width_col / height_col) / offset_channels;
+    const int64_t w = i % width_col;
+    const int64_t h = (i / width_col) % height_col;
+    const int64_t c = (i / width_col / height_col) % offset_channels;
+    const int64_t b = (i / width_col / height_col) / offset_channels;
 
-    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
-    const int col_step = kernel_h * kernel_w;
-    int cnt = 0;
+    const int64_t deformable_group_index = c / (2 * kernel_h * kernel_w);
+    const int64_t col_step = kernel_h * kernel_w;
+    int64_t cnt = 0;
     const T* data_col_ptr = data_col + deformable_group_index *
                                            channel_per_deformable_group *
                                            batch_size * width_col * height_col;
@@ -197,24 +197,25 @@ void ModulatedDeformableCol2imCoordCPUKernel(
                               kernel_h * kernel_w * height_col * width_col
             : nullptr;
 
-    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
+    const int64_t offset_c =
+        c - deformable_group_index * 2 * kernel_h * kernel_w;
 
-    for (int col_c = offset_c / 2; col_c < channel_per_deformable_group;
+    for (int64_t col_c = offset_c / 2; col_c < channel_per_deformable_group;
          col_c += col_step) {
-      const int col_pos =
+      const int64_t col_pos =
           (((col_c * batch_size + b) * height_col) + h) * width_col + w;
-      const int bp_dir = offset_c % 2;
+      const int64_t bp_dir = offset_c % 2;
 
-      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
-      int i =
+      int64_t j = (col_pos / width_col / height_col / batch_size) % kernel_w;
+      int64_t i =
           (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
-      int w_out = col_pos % width_col;
-      int h_out = (col_pos / width_col) % height_col;
-      int w_in = w_out * stride_w - pad_w;
-      int h_in = h_out * stride_h - pad_h;
-      const int data_offset_h_ptr =
+      int64_t w_out = col_pos % width_col;
+      int64_t h_out = (col_pos / width_col) % height_col;
+      int64_t w_in = w_out * stride_w - pad_w;
+      int64_t h_in = h_out * stride_h - pad_h;
+      const int64_t data_offset_h_ptr =
           (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
-      const int data_offset_w_ptr =
+      const int64_t data_offset_w_ptr =
           (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
            w_out);
       const T offset_h = data_offset_ptr[data_offset_h_ptr];
@@ -241,7 +242,7 @@ void ModulatedDeformableCol2imCoordCPUKernel(
                                   width,
                                   bp_dir);
       if (data_mask_ptr) {
-        const int data_mask_hw_ptr =
+        const int64_t data_mask_hw_ptr =
             (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
         const T mask = data_mask_ptr[data_mask_hw_ptr];
         val += weight * data_col_ptr[col_pos] * mask;
@@ -312,13 +313,13 @@ void ModulatedDeformableCol2imCoord(const Context& dev_ctx,
 
 template <typename T, typename Context>
 void FilterGradAddup(const Context& dev_ctx,
-                     const int nthreads,
-                     const int n,
-                     const int height,
-                     const int width,
+                     const int64_t nthreads,
+                     const int64_t n,
+                     const int64_t height,
+                     const int64_t width,
                      const T* dweight_3d,
                      T* filter_grad) {
-  for (int i = 0; i < nthreads; i++) {
+  for (int64_t i = 0; i < nthreads; i++) {
     filter_grad[i] = filter_grad[i] + dweight_3d[i];
   }
 }
diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cc b/paddle/phi/kernels/funcs/deformable_conv_functor.cc
index e028b51e3ce7b9..8a3f09f8ae85b2 100644
--- a/paddle/phi/kernels/funcs/deformable_conv_functor.cc
+++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cc
@@ -20,38 +20,38 @@ namespace phi::funcs {
 
 template <typename T>
 inline void ModulatedDeformableIm2colCPUKernel(
-    const int num_kernels,
+    const int64_t num_kernels,
     const T* data_im,
     const T* data_offset,
     const T* data_mask,
-    const int height,
-    const int width,
-    const int kernel_h,
-    const int kernel_w,
-    const int pad_h,
-    const int pad_w,
-    const int stride_h,
-    const int stride_w,
-    const int dilation_h,
-    const int dilation_w,
-    const int channel_per_deformable_group,
-    const int batch_size,
-    const int num_channels,
-    const int deformable_group,
-    const int height_col,
-    const int width_col,
+    const int64_t height,
+    const int64_t width,
+    const int64_t kernel_h,
+    const int64_t kernel_w,
+    const int64_t pad_h,
+    const int64_t pad_w,
+    const int64_t stride_h,
+    const int64_t stride_w,
+    const int64_t dilation_h,
+    const int64_t dilation_w,
+    const int64_t channel_per_deformable_group,
+    const int64_t batch_size,
+    const int64_t num_channels,
+    const int64_t deformable_group,
+    const int64_t height_col,
+    const int64_t width_col,
     T* data_col) {
-  for (int i = 0; i < num_kernels; i++) {
-    const int w_col = i % width_col;
-    const int h_col = (i / width_col) % height_col;
-    const int b_col = (i / width_col) / height_col % batch_size;
-    const int c_im = (i / width_col / height_col) / batch_size;
-    const int c_col = c_im * kernel_h * kernel_w;
+  for (int64_t i = 0; i < num_kernels; i++) {
+    const int64_t w_col = i % width_col;
+    const int64_t h_col = (i / width_col) % height_col;
+    const int64_t b_col = (i / width_col) / height_col % batch_size;
+    const int64_t c_im = (i / width_col / height_col) / batch_size;
+    const int64_t c_col = c_im * kernel_h * kernel_w;
 
-    const int deformable_group_index = c_im / channel_per_deformable_group;
+    const int64_t deformable_group_index = c_im / channel_per_deformable_group;
 
-    const int h_in = h_col * stride_h - pad_h;
-    const int w_in = w_col * stride_w - pad_w;
+    const int64_t h_in = h_col * stride_h - pad_h;
+    const int64_t w_in = w_col * stride_w - pad_w;
 
     T* data_col_ptr =
         data_col +
@@ -67,11 +67,11 @@ inline void ModulatedDeformableIm2colCPUKernel(
                               kernel_h * kernel_w * height_col * width_col
             : nullptr;
 
-    for (int i = 0; i < kernel_h; ++i) {
-      for (int j = 0; j < kernel_w; ++j) {
-        const int data_offset_h_ptr =
+    for (int64_t i = 0; i < kernel_h; ++i) {
+      for (int64_t j = 0; j < kernel_w; ++j) {
+        const int64_t data_offset_h_ptr =
             ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
-        const int data_offset_w_ptr =
+        const int64_t data_offset_w_ptr =
             ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
             w_col;
 
@@ -86,7 +86,7 @@ inline void ModulatedDeformableIm2colCPUKernel(
         }
         *data_col_ptr = val;
         if (data_mask_ptr) {
-          const int data_mask_hw_ptr =
+          const int64_t data_mask_hw_ptr =
               ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
           const T mask = data_mask_ptr[data_mask_hw_ptr];
           *data_col_ptr *= mask;
@@ -110,10 +110,10 @@ void ModulatedDeformableIm2col(const Context& dev_ctx UNUSED,
                                const std::vector<int>& dilations,
                                const int deformable_groups,
                                T* data_col) {
-  int channel_per_deformable_group =
-      static_cast<int>(im_shape[0] / deformable_groups);
-  int num_kernels = static_cast<int>(im_shape[0] * col_shape[1] * col_shape[2] *
-                                     col_shape[3]);
+  int64_t channel_per_deformable_group =
+      static_cast<int64_t>(im_shape[0] / deformable_groups);
+  int64_t num_kernels = static_cast<int64_t>(im_shape[0] * col_shape[1] *
+                                             col_shape[2] * col_shape[3]);
 
   // get outputs of im2col with offset by bilinear interpolation
   ModulatedDeformableIm2colCPUKernel(num_kernels,
diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cu b/paddle/phi/kernels/funcs/deformable_conv_functor.cu
index 48105d1f517e9b..c1ca11eb33e5cc 100644
--- a/paddle/phi/kernels/funcs/deformable_conv_functor.cu
+++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cu
@@ -21,47 +21,47 @@ namespace funcs {
 static constexpr int kNumCUDAThreads = 512;
 static constexpr int kNumMaximumNumBlocks = 4096;
 
-static inline int NumBlocks(const int N) {
-  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
-                  kNumMaximumNumBlocks);
+static inline int NumBlocks(const int64_t N) {
+  return std::min<int64_t>((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
+                           static_cast<int64_t>(kNumMaximumNumBlocks));
 }
 
 template <typename T>
 __global__ void ModulatedDeformableIm2colGpuKernel(
-    const int nthreads,
+    const int64_t nthreads,
     const T* data_im,
     const T* data_offset,
     const T* data_mask,
-    const int height,
-    const int width,
-    const int kernel_h,
-    const int kernel_w,
-    const int pad_h,
-    const int pad_w,
-    const int stride_h,
-    const int stride_w,
-    const int dilation_h,
-    const int dilation_w,
-    const int channel_per_deformable_group,
-    const int batch_size,
-    const int num_channels,
-    const int deformable_group,
-    const int height_col,
-    const int width_col,
+    const int64_t height,
+    const int64_t width,
+    const int64_t kernel_h,
+    const int64_t kernel_w,
+    const int64_t pad_h,
+    const int64_t pad_w,
+    const int64_t stride_h,
+    const int64_t stride_w,
+    const int64_t dilation_h,
+    const int64_t dilation_w,
+    const int64_t channel_per_deformable_group,
+    const int64_t batch_size,
+    const int64_t num_channels,
+    const int64_t deformable_group,
+    const int64_t height_col,
+    const int64_t width_col,
     T* data_col) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
+  int64_t index = blockIdx.x * blockDim.x + threadIdx.x;
+  int64_t offset = blockDim.x * gridDim.x;
   for (size_t i = index; i < nthreads; i += offset) {
-    const int w_col = i % width_col;
-    const int h_col = (i / width_col) % height_col;
-    const int b_col = (i / width_col) / height_col % batch_size;
-    const int c_im = (i / width_col / height_col) / batch_size;
-    const int c_col = c_im * kernel_h * kernel_w;
+    const int64_t w_col = i % width_col;
+    const int64_t h_col = (i / width_col) % height_col;
+    const int64_t b_col = (i / width_col) / height_col % batch_size;
+    const int64_t c_im = (i / width_col / height_col) / batch_size;
+    const int64_t c_col = c_im * kernel_h * kernel_w;
 
-    const int deformable_group_index = c_im / channel_per_deformable_group;
+    const int64_t deformable_group_index = c_im / channel_per_deformable_group;
 
-    const int h_in = h_col * stride_h - pad_h;
-    const int w_in = w_col * stride_w - pad_w;
+    const int64_t h_in = h_col * stride_h - pad_h;
+    const int64_t w_in = w_col * stride_w - pad_w;
 
     T* data_col_ptr =
         data_col +
@@ -77,11 +77,11 @@ __global__ void ModulatedDeformableIm2colGpuKernel(
                               kernel_h * kernel_w * height_col * width_col
             : nullptr;
 
-    for (int i = 0; i < kernel_h; ++i) {
-      for (int j = 0; j < kernel_w; ++j) {
-        const int data_offset_h_ptr =
+    for (int64_t i = 0; i < kernel_h; ++i) {
+      for (int64_t j = 0; j < kernel_w; ++j) {
+        const int64_t data_offset_h_ptr =
             ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
-        const int data_offset_w_ptr =
+        const int64_t data_offset_w_ptr =
             ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
             w_col;
 
@@ -96,7 +96,7 @@ __global__ void ModulatedDeformableIm2colGpuKernel(
         }
         *data_col_ptr = val;
         if (data_mask_ptr) {
-          const int data_mask_hw_ptr =
+          const int64_t data_mask_hw_ptr =
               ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
           const T mask = data_mask_ptr[data_mask_hw_ptr];
           *data_col_ptr *= mask;
@@ -120,11 +120,12 @@ void ModulatedDeformableIm2col(const Context& dev_ctx,
                                const std::vector<int>& dilations,
                                const int deformable_groups,
                                T* data_col) {
-  int channel_per_deformable_group = im_shape[0] / deformable_groups;
-  int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
+  int64_t channel_per_deformable_group = im_shape[0] / deformable_groups;
+  int64_t num_kernels =
+      im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
 
-  int blocks = NumBlocks(num_kernels);
-  int threads = kNumCUDAThreads;
+  int64_t blocks = NumBlocks(num_kernels);
+  int64_t threads = kNumCUDAThreads;
 
   ModulatedDeformableIm2colGpuKernel<T>
       <<<blocks, threads, 0, dev_ctx.stream()>>>(num_kernels,
diff --git a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
index 55c8a9f96fd818..947a8a4654fc58 100644
--- a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
@@ -31,52 +31,52 @@ static inline int NumBlocks(const int N) {
 
 template <typename T>
 __global__ void ModulatedDeformableCol2imGpuKernel(
-    const int nthreads,
+    const int64_t nthreads,
     const T* data_col,
     const T* data_offset,
     const T* data_mask,
-    const int channels,
-    const int height,
-    const int width,
-    const int kernel_h,
-    const int kernel_w,
-    const int pad_h,
-    const int pad_w,
-    const int stride_h,
-    const int stride_w,
-    const int dilation_h,
-    const int dilation_w,
-    const int channel_per_deformable_group,
-    const int batch_size,
-    const int deformable_group,
-    const int height_col,
-    const int width_col,
+    const int64_t channels,
+    const int64_t height,
+    const int64_t width,
+    const int64_t kernel_h,
+    const int64_t kernel_w,
+    const int64_t pad_h,
+    const int64_t pad_w,
+    const int64_t stride_h,
+    const int64_t stride_w,
+    const int64_t dilation_h,
+    const int64_t dilation_w,
+    const int64_t channel_per_deformable_group,
+    const int64_t batch_size,
+    const int64_t deformable_group,
+    const int64_t height_col,
+    const int64_t width_col,
     T* grad_im) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
+  int64_t index = blockIdx.x * blockDim.x + threadIdx.x;
+  int64_t offset = blockDim.x * gridDim.x;
   for (size_t thread = index; thread < nthreads; thread += offset) {
-    const int j = (thread / width_col / height_col / batch_size) % kernel_w;
-    const int i =
+    const int64_t j = (thread / width_col / height_col / batch_size) % kernel_w;
+    const int64_t i =
         (thread / width_col / height_col / batch_size / kernel_w) % kernel_h;
-    const int c =
+    const int64_t c =
         thread / width_col / height_col / batch_size / kernel_w / kernel_h;
 
-    const int deformable_group_index = c / channel_per_deformable_group;
+    const int64_t deformable_group_index = c / channel_per_deformable_group;
 
-    int w_out = thread % width_col;
-    int h_out = (thread / width_col) % height_col;
-    int b = (thread / width_col / height_col) % batch_size;
-    int w_in = w_out * stride_w - pad_w;
-    int h_in = h_out * stride_h - pad_h;
+    int64_t w_out = thread % width_col;
+    int64_t h_out = (thread / width_col) % height_col;
+    int64_t b = (thread / width_col / height_col) % batch_size;
+    int64_t w_in = w_out * stride_w - pad_w;
+    int64_t h_in = h_out * stride_h - pad_h;
 
     const T* data_offset_ptr =
         data_offset + (b * deformable_group + deformable_group_index) * 2 *
                           kernel_h * kernel_w * height_col * width_col;
-    const int data_offset_h_ptr =
+    const int64_t data_offset_h_ptr =
         ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
-    const int data_offset_w_ptr =
+    const int64_t data_offset_w_ptr =
         ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
-    const int data_mask_hw_ptr =
+    const int64_t data_mask_hw_ptr =
         ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
     const T offset_h = data_offset_ptr[data_offset_h_ptr];
     const T offset_w = data_offset_ptr[data_offset_w_ptr];
@@ -91,14 +91,14 @@ __global__ void ModulatedDeformableCol2imGpuKernel(
       const T mask = data_mask_ptr[data_mask_hw_ptr];
       cur_top_grad *= mask;
     }
-    const int cur_h = static_cast<int>(cur_inv_h_data);
-    const int cur_w = static_cast<int>(cur_inv_w_data);
-    for (int dy = -2; dy <= 2; dy++) {
-      for (int dx = -2; dx <= 2; dx++) {
+    const int64_t cur_h = static_cast<int64_t>(cur_inv_h_data);
+    const int64_t cur_w = static_cast<int64_t>(cur_inv_w_data);
+    for (int64_t dy = -2; dy <= 2; dy++) {
+      for (int64_t dx = -2; dx <= 2; dx++) {
         if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
             cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
             abs(cur_inv_w_data - (cur_w + dx)) < 1) {
-          int cur_bottom_grad_pos =
+          int64_t cur_bottom_grad_pos =
               ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
           T weight = DmcnGetGradientWeight(cur_inv_h_data,
                                            cur_inv_w_data,
@@ -128,10 +128,11 @@ void ModulatedDeformableCol2im(const Context& dev_ctx,
                                const std::vector<int>& dilation,
                                const int deformable_group,
                                T* grad_im) {
-  int channel_per_deformable_group = im_shape[0] / deformable_group;
-  int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
-  int blocks = NumBlocks(num_kernels);
-  int threads = kNumCUDAThreads;
+  int64_t channel_per_deformable_group = im_shape[0] / deformable_group;
+  int64_t num_kernels =
+      col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
+  int64_t blocks = NumBlocks(num_kernels);
+  int64_t threads = kNumCUDAThreads;
 
   ModulatedDeformableCol2imGpuKernel<T>
       <<<blocks, threads, 0, dev_ctx.stream()>>>(num_kernels,
@@ -159,42 +160,42 @@ void ModulatedDeformableCol2im(const Context& dev_ctx,
 
 template <typename T>
 __global__ void ModulatedDeformableCol2imCoordGpuKernel(
-    const int nthreads,
+    const int64_t nthreads,
     const T* data_col,
     const T* data_im,
     const T* data_offset,
     const T* data_mask,
-    const int channels,
-    const int height,
-    const int width,
-    const int kernel_h,
-    const int kernel_w,
-    const int pad_h,
-    const int pad_w,
-    const int stride_h,
-    const int stride_w,
-    const int dilation_h,
-    const int dilation_w,
-    const int channel_per_deformable_group,
-    const int batch_size,
-    const int offset_channels,
-    const int deformable_group,
-    const int height_col,
-    const int width_col,
+    const int64_t channels,
+    const int64_t height,
+    const int64_t width,
+    const int64_t kernel_h,
+    const int64_t kernel_w,
+    const int64_t pad_h,
+    const int64_t pad_w,
+    const int64_t stride_h,
+    const int64_t stride_w,
+    const int64_t dilation_h,
+    const int64_t dilation_w,
+    const int64_t channel_per_deformable_group,
+    const int64_t batch_size,
+    const int64_t offset_channels,
+    const int64_t deformable_group,
+    const int64_t height_col,
+    const int64_t width_col,
     T* grad_offset,
     T* grad_mask) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
+  int64_t index = blockIdx.x * blockDim.x + threadIdx.x;
+  int64_t offset = blockDim.x * gridDim.x;
   for (size_t i = index; i < nthreads; i += offset) {
     T val = 0, mval = 0;
-    const int w = i % width_col;
-    const int h = (i / width_col) % height_col;
-    const int c = (i / width_col / height_col) % offset_channels;
-    const int b = (i / width_col / height_col) / offset_channels;
+    const int64_t w = i % width_col;
+    const int64_t h = (i / width_col) % height_col;
+    const int64_t c = (i / width_col / height_col) % offset_channels;
+    const int64_t b = (i / width_col / height_col) / offset_channels;
 
-    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
-    const int col_step = kernel_h * kernel_w;
-    int cnt = 0;
+    const int64_t deformable_group_index = c / (2 * kernel_h * kernel_w);
+    const int64_t col_step = kernel_h * kernel_w;
+    int64_t cnt = 0;
     const T* data_col_ptr = data_col + deformable_group_index *
                                            channel_per_deformable_group *
                                            batch_size * width_col * height_col;
@@ -211,24 +212,25 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel(
                               kernel_h * kernel_w * height_col * width_col
             : nullptr;
 
-    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
+    const int64_t offset_c =
+        c - deformable_group_index * 2 * kernel_h * kernel_w;
 
-    for (int col_c = offset_c / 2; col_c < channel_per_deformable_group;
+    for (int64_t col_c = offset_c / 2; col_c < channel_per_deformable_group;
          col_c += col_step) {
-      const int col_pos =
+      const int64_t col_pos =
           (((col_c * batch_size + b) * height_col) + h) * width_col + w;
-      const int bp_dir = offset_c % 2;
+      const int64_t bp_dir = offset_c % 2;
 
-      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
-      int i =
+      int64_t j = (col_pos / width_col / height_col / batch_size) % kernel_w;
+      int64_t i =
           (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
-      int w_out = col_pos % width_col;
-      int h_out = (col_pos / width_col) % height_col;
-      int w_in = w_out * stride_w - pad_w;
-      int h_in = h_out * stride_h - pad_h;
-      const int data_offset_h_ptr =
+      int64_t w_out = col_pos % width_col;
+      int64_t h_out = (col_pos / width_col) % height_col;
+      int64_t w_in = w_out * stride_w - pad_w;
+      int64_t h_in = h_out * stride_h - pad_h;
+      const int64_t data_offset_h_ptr =
           (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
-      const int data_offset_w_ptr =
+      const int64_t data_offset_w_ptr =
           (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
            w_out);
       const T offset_h = data_offset_ptr[data_offset_h_ptr];
@@ -255,7 +257,7 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel(
                                   width,
                                   bp_dir);
       if (data_mask_ptr) {
-        const int data_mask_hw_ptr =
+        const int64_t data_mask_hw_ptr =
             (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
         const T mask = data_mask_ptr[data_mask_hw_ptr];
         val += weight * data_col_ptr[col_pos] * mask;
@@ -291,11 +293,11 @@ void ModulatedDeformableCol2imCoord(const Context& dev_ctx,
                                     const int deformable_groups,
                                     T* grad_offset,
                                     T* grad_mask) {
-  int num_kernels = 2 * kernel_shape[2] * kernel_shape[3] * col_shape[1] *
-                    col_shape[2] * col_shape[3] * deformable_groups;
-  int channel_per_deformable_group = col_shape[0] / deformable_groups;
-  int blocks = NumBlocks(num_kernels);
-  int threads = kNumCUDAThreads;
+  int64_t num_kernels = 2 * kernel_shape[2] * kernel_shape[3] * col_shape[1] *
+                        col_shape[2] * col_shape[3] * deformable_groups;
+  int64_t channel_per_deformable_group = col_shape[0] / deformable_groups;
+  int64_t blocks = NumBlocks(num_kernels);
+  int64_t threads = kNumCUDAThreads;
 
   ModulatedDeformableCol2imCoordGpuKernel<T>
       <<<blocks, threads, 0, dev_ctx.stream()>>>(
@@ -326,29 +328,32 @@ void ModulatedDeformableCol2imCoord(const Context& dev_ctx,
 }
 
 template <typename T>
-__global__ void FilterGradAddupGpuKernel(const int nthreads,
-                                         const int n,
-                                         const int height,
-                                         const int width,
+__global__ void FilterGradAddupGpuKernel(const int64_t nthreads,
+                                         const int64_t n,
+                                         const int64_t height,
+                                         const int64_t width,
                                          const T* dweight_3d,
                                          T* filter_grad) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
-  for (size_t i = index; i < nthreads; i += offset) {
+  int64_t index = blockIdx.x * blockDim.x + threadIdx.x;
+  int64_t offset = blockDim.x * gridDim.x;
+  for (int64_t i = index; i < nthreads; i += offset) {
     filter_grad[i] = filter_grad[i] + dweight_3d[i];
   }
 }
 
 template <typename T, typename Context>
 void FilterGradAddup(const Context& dev_ctx,
-                     const int nthreads,
-                     const int n,
-                     const int height,
-                     const int width,
+                     const int64_t nthreads,
+                     const int64_t n,
+                     const int64_t height,
+                     const int64_t width,
                      const T* dweight_3d,
                      T* filter_grad) {
+  const int64_t max_grid_x = dev_ctx.GetCUDAMaxGridDimSize()[0];
+  const int64_t grid_size = std::min<int64_t>(
+      (nthreads + kNumCUDAThreads - 1) / kNumCUDAThreads, max_grid_x);
   FilterGradAddupGpuKernel<T>
-      <<<NumBlocks(nthreads), kNumCUDAThreads, 0, dev_ctx.stream()>>>(
+      <<<grid_size, kNumCUDAThreads, 0, dev_ctx.stream()>>>(
           nthreads, n, height, width, dweight_3d, filter_grad);
 }
 
diff --git a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
index b9931a89978307..a7e3b3d7edc659 100644
--- a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
@@ -151,10 +151,10 @@ void ModulatedDeformableCol2im(const Context& dev_ctx,
 
 template <typename T, typename Context>
 void FilterGradAddup(const Context& dev_ctx,
-                     const int nthreads,
-                     const int n,
-                     const int height,
-                     const int width,
+                     const int64_t nthreads,
+                     const int64_t n,
+                     const int64_t height,
+                     const int64_t width,
                      const T* dweight_3d,
                      T* filter_grad);
 
@@ -217,9 +217,9 @@ void DeformableConvGradKernel(const Context& dev_ctx,
   phi::funcs::SetConstant<Context, T> set_zero;
   auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
 
-  int input_dim = x.numel() / x.dims()[0];
-  int input_offset_dim = offset.numel() / offset.dims()[0];
-  int input_mask_dim = mask ? mask->numel() / mask->dims()[0] : 0;
+  int64_t input_dim = x.numel() / x.dims()[0];
+  int64_t input_offset_dim = offset.numel() / offset.dims()[0];
+  int64_t input_mask_dim = mask ? mask->numel() / mask->dims()[0] : 0;
 
   if (filter_grad) {
     Full<T>(dev_ctx,
diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h
index 9d3d66ab2dbe65..2446741c6e6784 100644
--- a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h
+++ b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h
@@ -37,9 +37,9 @@ void DeformableConvKernel(const Context& dev_ctx,
                           int groups,
                           int im2col_step,
                           DenseTensor* out) {
-  const int batch_size = static_cast<int>(x.dims()[0]);
+  const int64_t batch_size = static_cast<int64_t>(x.dims()[0]);
 
-  int temp_step = std::min(64, batch_size);
+  int64_t temp_step = std::min<int64_t>(64, batch_size);
   if (batch_size % temp_step == 0) {
     im2col_step = temp_step;
   }
@@ -80,9 +80,9 @@ void DeformableConvKernel(const Context& dev_ctx,
   DDim input_shape = common::slice_ddim(x.dims(), 1, x.dims().size());
   std::vector<int64_t> input_shape_vec = common::vectorize(input_shape);
 
-  int input_dim = x.numel() / x.dims()[0];
-  int input_offset_dim = offset.numel() / offset.dims()[0];
-  int input_mask_dim = mask ? mask->numel() / mask->dims()[0] : 0;
+  int64_t input_dim = x.numel() / x.dims()[0];
+  int64_t input_offset_dim = offset.numel() / offset.dims()[0];
+  int64_t input_mask_dim = mask ? mask->numel() / mask->dims()[0] : 0;
 
   const T* input_ptr = x.data<T>();
   const T* offset_ptr = offset.data<T>();
@@ -91,7 +91,7 @@ void DeformableConvKernel(const Context& dev_ctx,
 
   auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
 
-  for (int i = 0; i < batch_size / im2col_step; ++i) {
+  for (int64_t i = 0; i < batch_size / im2col_step; ++i) {
     const T* temp_mask_ptr =
         mask_ptr ? mask_ptr + i * im2col_step * input_mask_dim : nullptr;
     funcs::ModulatedDeformableIm2col(
@@ -133,7 +133,6 @@ void DeformableConvKernel(const Context& dev_ctx,
                   T(0.0));
     }
   }
-
   //  swap axis to get the right result when im2col_step is greater than 1
   if (im2col_step > 1) {
     std::vector<int> axis(4);

From 22afabfe18f71e7af8a3187813a280f292ab2dae Mon Sep 17 00:00:00 2001
From: xingmingyyj <zxm_3791@163.com>
Date: Sun, 20 Jul 2025 15:02:48 +0800
Subject: [PATCH 2/8] fix precision diff

---
 .../plugin/deformable_conv_op_plugin.cu       |  8 ++---
 .../cpu/deformable_conv_grad_kernel.cc        |  4 +--
 .../gpu/deformable_conv_grad_kernel.cu        | 14 ++++-----
 .../impl/deformable_conv_grad_kernel_impl.h   | 29 +++++++++++--------
 4 files changed, 28 insertions(+), 27 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu
index fa9214d990d835..3549e0ced380ec 100644
--- a/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu
@@ -412,8 +412,8 @@ __global__ void ModulatedDeformableIm2colGpuKernel<float>(
     const int64_t height_col,
     const int64_t width_col,
     float* data_col) {
-  int64_t index = blockIdx.x * blockDim.x + threadIdx.x;
-  int64_t offset = blockDim.x * gridDim.x;
+  int64_t index = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  int64_t offset = blockDim.x * static_cast<int64_t>(gridDim.x);
 
   float minus_one = -1.0f, height_t = height, width_t = width;
   for (int64_t i = index; i < nthreads; i += offset) {
@@ -493,8 +493,8 @@ __global__ void ModulatedDeformableIm2colGpuKernel<half>(
     const int64_t width_col,
     half* data_col) {
 #if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
-  int64_t index = blockIdx.x * blockDim.x + threadIdx.x;
-  int64_t offset = blockDim.x * gridDim.x;
+  int64_t index = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  int64_t offset = blockDim.x * static_cast<int64_t>(gridDim.x);
 
   half minus_one = -1.0f, height_t = height, width_t = width;
   for (size_t i = index; i < nthreads; i += offset) {
diff --git a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
index 9608636aca6907..a5a5e007fa3965 100644
--- a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
@@ -222,9 +222,7 @@ void ModulatedDeformableCol2imCoordCPUKernel(
       const T offset_w = data_offset_ptr[data_offset_w_ptr];
       T inv_h = h_in + i * dilation_h + offset_h;
       T inv_w = w_in + j * dilation_w + offset_w;
-      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
-        inv_h = inv_w = -2;
-      } else {
+      if (!(inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)) {
         mval += data_col_ptr[col_pos] *
                 funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width,
                                           width,
diff --git a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
index 947a8a4654fc58..1012b82c89fbfe 100644
--- a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
@@ -184,9 +184,9 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel(
     const int64_t width_col,
     T* grad_offset,
     T* grad_mask) {
-  int64_t index = blockIdx.x * blockDim.x + threadIdx.x;
-  int64_t offset = blockDim.x * gridDim.x;
-  for (size_t i = index; i < nthreads; i += offset) {
+  int64_t index = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  int64_t offset = blockDim.x * static_cast<int64_t>(gridDim.x);
+  for (int64_t i = index; i < nthreads; i += offset) {
     T val = 0, mval = 0;
     const int64_t w = i % width_col;
     const int64_t h = (i / width_col) % height_col;
@@ -237,9 +237,7 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel(
       const T offset_w = data_offset_ptr[data_offset_w_ptr];
       T inv_h = h_in + i * dilation_h + offset_h;
       T inv_w = w_in + j * dilation_w + offset_w;
-      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
-        inv_h = inv_w = -2;
-      } else {
+      if (!(inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)) {
         mval += data_col_ptr[col_pos] *
                 funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width,
                                           width,
@@ -334,8 +332,8 @@ __global__ void FilterGradAddupGpuKernel(const int64_t nthreads,
                                          const int64_t width,
                                          const T* dweight_3d,
                                          T* filter_grad) {
-  int64_t index = blockIdx.x * blockDim.x + threadIdx.x;
-  int64_t offset = blockDim.x * gridDim.x;
+  int64_t index = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  int64_t offset = blockDim.x * static_cast<int64_t>(gridDim.x);
   for (int64_t i = index; i < nthreads; i += offset) {
     filter_grad[i] = filter_grad[i] + dweight_3d[i];
   }
diff --git a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
index 4b0f7e5e8a180c..cfd282481b44c4 100644
--- a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
@@ -66,51 +66,56 @@ HOSTDEVICE T DmcnGetCoordinateWeight(T argmax_h,
                                      const T* im_data,
                                      const int data_width,
                                      const int bp_dir) {
-  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
-      argmax_w >= width) {
-    return 0;
-  }
+  // if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
+  //     argmax_w >= width) {
+  //   return 0;
+  // }
 
   int argmax_h_low = floor(argmax_h);
   int argmax_w_low = floor(argmax_w);
   int argmax_h_high = argmax_h_low + 1;
   int argmax_w_high = argmax_w_low + 1;
 
+  bool valid_argmax_h_low = (argmax_h_low >= 0) && (argmax_h_low < height);
+  bool valid_argmax_h_high = (argmax_h_high >= 0) && (argmax_h_high < height);
+  bool valid_argmax_w_low = (argmax_w_low >= 0) && (argmax_w_low < width);
+  bool valid_argmax_w_high = (argmax_w_high >= 0) && (argmax_w_high < width);
+
   T weight = 0;
 
   if (bp_dir == 0) {
-    weight += (argmax_h_low >= 0 && argmax_w_low >= 0)
+    weight += (valid_argmax_h_low && valid_argmax_w_low)
                   ? -1 * (argmax_w_low + 1 - argmax_w) *
                         im_data[argmax_h_low * data_width + argmax_w_low]
                   : 0;
 
-    weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+    weight += (valid_argmax_h_low && valid_argmax_w_high)
                   ? -1 * (argmax_w - argmax_w_low) *
                         im_data[argmax_h_low * data_width + argmax_w_high]
                   : 0;
 
-    weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+    weight += (valid_argmax_h_high && valid_argmax_w_low)
                   ? (argmax_w_low + 1 - argmax_w) *
                         im_data[argmax_h_high * data_width + argmax_w_low]
                   : 0;
-    weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+    weight += (valid_argmax_h_high && valid_argmax_w_high)
                   ? (argmax_w - argmax_w_low) *
                         im_data[argmax_h_high * data_width + argmax_w_high]
                   : 0;
   } else if (bp_dir == 1) {
-    weight += (argmax_h_low >= 0 && argmax_w_low >= 0)
+    weight += (valid_argmax_h_low && valid_argmax_w_low)
                   ? -1 * (argmax_h_low + 1 - argmax_h) *
                         im_data[argmax_h_low * data_width + argmax_w_low]
                   : 0;
-    weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+    weight += (valid_argmax_h_low && valid_argmax_w_high)
                   ? (argmax_h_low + 1 - argmax_h) *
                         im_data[argmax_h_low * data_width + argmax_w_high]
                   : 0;
-    weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+    weight += (valid_argmax_h_high && valid_argmax_w_low)
                   ? -1 * (argmax_h - argmax_h_low) *
                         im_data[argmax_h_high * data_width + argmax_w_low]
                   : 0;
-    weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+    weight += (valid_argmax_h_high && valid_argmax_w_high)
                   ? (argmax_h - argmax_h_low) *
                         im_data[argmax_h_high * data_width + argmax_w_high]
                   : 0;

From fa70e8fb2f5d8f08c44e393894ea5f3fd1851f7c Mon Sep 17 00:00:00 2001
From: xingmingyyj <zxm_3791@163.com>
Date: Mon, 21 Jul 2025 20:12:30 +0800
Subject: [PATCH 3/8] fix

---
 .../inference/tensorrt/plugin/deformable_conv_op_plugin.cu     | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu
index 3549e0ced380ec..6cb22199475a61 100644
--- a/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu
@@ -496,7 +496,8 @@ __global__ void ModulatedDeformableIm2colGpuKernel<half>(
   int64_t index = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
   int64_t offset = blockDim.x * static_cast<int64_t>(gridDim.x);
 
-  half minus_one = -1.0f, height_t = height, width_t = width;
+  half minus_one = -1.0f, height_t = static_cast<half>(height),
+       width_t = static_cast<half>(width);
   for (size_t i = index; i < nthreads; i += offset) {
     const int64_t w_col = i % width_col;
     const int64_t h_col = (i / width_col) % height_col;

From d982ce5e7401c1a8d40f68ce7adbe456ea30b100 Mon Sep 17 00:00:00 2001
From: xingmingyyj <zxm_3791@163.com>
Date: Mon, 21 Jul 2025 20:15:20 +0800
Subject: [PATCH 4/8] fix

---
 paddle/phi/kernels/impl/deformable_conv_kernel_impl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h
index 878c5ffbcb8576..43d68a85380f3f 100644
--- a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h
+++ b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h
@@ -39,7 +39,7 @@ void DeformableConvKernel(const Context& dev_ctx,
                           int im2col_step,
                           DenseTensor* out) {
   const int64_t batch_size = static_cast<int64_t>(x.dims()[0]);
-  
+
   if (x.numel() == 0 || filter.numel() == 0) {
     phi::Full<T, Context>(
         dev_ctx, phi::IntArray(common::vectorize(out->dims())), 0, out);

From b42d267ce833cd616bdb2537e26e5e508e36ed34 Mon Sep 17 00:00:00 2001
From: xingmingyyj <zxm_3791@163.com>
Date: Mon, 21 Jul 2025 20:28:25 +0800
Subject: [PATCH 5/8] fix

---
 paddle/phi/kernels/funcs/deformable_conv_functor.cc        | 7 +++----
 paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu      | 4 ++--
 paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h | 5 -----
 3 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cc b/paddle/phi/kernels/funcs/deformable_conv_functor.cc
index 8a3f09f8ae85b2..49016f148fb62b 100644
--- a/paddle/phi/kernels/funcs/deformable_conv_functor.cc
+++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cc
@@ -110,10 +110,9 @@ void ModulatedDeformableIm2col(const Context& dev_ctx UNUSED,
                                const std::vector<int>& dilations,
                                const int deformable_groups,
                                T* data_col) {
-  int64_t channel_per_deformable_group =
-      static_cast<int64_t>(im_shape[0] / deformable_groups);
-  int64_t num_kernels = static_cast<int64_t>(im_shape[0] * col_shape[1] *
-                                             col_shape[2] * col_shape[3]);
+  int64_t channel_per_deformable_group = im_shape[0] / deformable_groups;
+  int64_t num_kernels =
+      im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
 
   // get outputs of im2col with offset by bilinear interpolation
   ModulatedDeformableIm2colCPUKernel(num_kernels,
diff --git a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
index 1012b82c89fbfe..f4fda49c1e09e3 100644
--- a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
@@ -52,8 +52,8 @@ __global__ void ModulatedDeformableCol2imGpuKernel(
     const int64_t height_col,
     const int64_t width_col,
     T* grad_im) {
-  int64_t index = blockIdx.x * blockDim.x + threadIdx.x;
-  int64_t offset = blockDim.x * gridDim.x;
+  int64_t index = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  int64_t offset = static_cast<int64_t>(blockDim.x) * gridDim.x;
   for (size_t thread = index; thread < nthreads; thread += offset) {
     const int64_t j = (thread / width_col / height_col / batch_size) % kernel_w;
     const int64_t i =
diff --git a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
index cfd282481b44c4..8f78c99b4176f7 100644
--- a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
@@ -66,11 +66,6 @@ HOSTDEVICE T DmcnGetCoordinateWeight(T argmax_h,
                                      const T* im_data,
                                      const int data_width,
                                      const int bp_dir) {
-  // if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
-  //     argmax_w >= width) {
-  //   return 0;
-  // }
-
   int argmax_h_low = floor(argmax_h);
   int argmax_w_low = floor(argmax_w);
   int argmax_h_high = argmax_h_low + 1;

From 609cf26bbc78455ce64d7eb776ab355c0e334dbf Mon Sep 17 00:00:00 2001
From: xingmingyyj <zxm_3791@163.com>
Date: Mon, 21 Jul 2025 22:35:29 +0800
Subject: [PATCH 6/8] fix

---
 .../inference/tensorrt/plugin/deformable_conv_op_plugin.cu   | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu
index 6cb22199475a61..b8ad760ddcb35c 100644
--- a/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu
@@ -496,8 +496,9 @@ __global__ void ModulatedDeformableIm2colGpuKernel<half>(
   int64_t index = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
   int64_t offset = blockDim.x * static_cast<int64_t>(gridDim.x);
 
-  half minus_one = -1.0f, height_t = static_cast<half>(height),
-       width_t = static_cast<half>(width);
+  half minus_one = -1.0f,
+       height_t = static_cast<half>(static_cast<float>(height)),
+       width_t = static_cast<half>(static_cast<float>(width));
   for (size_t i = index; i < nthreads; i += offset) {
     const int64_t w_col = i % width_col;
     const int64_t h_col = (i / width_col) % height_col;

From be77a6c7957bd844de0015759c595d1a925e9084 Mon Sep 17 00:00:00 2001
From: xingmingyyj <zxm_3791@163.com>
Date: Tue, 22 Jul 2025 14:00:50 +0800
Subject: [PATCH 7/8] fix

---
 .../inference/tensorrt/plugin/deformable_conv_op_plugin.cu    | 3 ++-
 paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc         | 4 ++--
 paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu         | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu
index b8ad760ddcb35c..978bb4f01e9f83 100644
--- a/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu
@@ -537,7 +537,8 @@ __global__ void ModulatedDeformableIm2colGpuKernel<half>(
         const half offset_w = data_offset_ptr[data_offset_w_ptr];
         const half mask = data_mask_ptr[data_mask_hw_ptr];
         half val = 0;
-        half h_im_t = h_in + i * dilation_h, w_im_t = w_in + j * dilation_w;
+        half h_im_t = static_cast<float>(h_in) + i * dilation_h,
+             w_im_t = static_cast<float>(w_in) + j * dilation_w;
         const half h_im = h_im_t + offset_h;
         const half w_im = w_im_t + offset_w;
         if (h_im > minus_one && w_im > minus_one && h_im < height_t &&
diff --git a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
index a5a5e007fa3965..6037ff08ad1d6c 100644
--- a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
@@ -119,8 +119,8 @@ void ModulatedDeformableCol2im(const Context& dev_ctx,
                                T* grad_im) {
   int64_t channel_per_deformable_group =
       static_cast<int64_t>(im_shape[0] / deformable_group);
-  int64_t num_kernels = static_cast<int64_t>(col_shape[0] * col_shape[1] *
-                                             col_shape[2] * col_shape[3]);
+  int64_t num_kernels =
+      col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
 
   ModulatedDeformableCol2imCPUKernel(num_kernels,
                                      data_col,
diff --git a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
index f4fda49c1e09e3..d82d83f7e0311b 100644
--- a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
@@ -54,7 +54,7 @@ __global__ void ModulatedDeformableCol2imGpuKernel(
     T* grad_im) {
   int64_t index = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
   int64_t offset = static_cast<int64_t>(blockDim.x) * gridDim.x;
-  for (size_t thread = index; thread < nthreads; thread += offset) {
+  for (int64_t thread = index; thread < nthreads; thread += offset) {
     const int64_t j = (thread / width_col / height_col / batch_size) % kernel_w;
     const int64_t i =
         (thread / width_col / height_col / batch_size / kernel_w) % kernel_h;

From 52aff6677cafed4eed707c7e6f4a474b493cd1c8 Mon Sep 17 00:00:00 2001
From: xingmingyyj <zxm_3791@163.com>
Date: Wed, 23 Jul 2025 21:01:19 +0800
Subject: [PATCH 8/8] fix

---
 .../cpu/deformable_conv_grad_kernel.cc        |  4 ++-
 .../gpu/deformable_conv_grad_kernel.cu        |  4 ++-
 .../impl/deformable_conv_grad_kernel_impl.h   | 26 +++++++++----------
 3 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
index 6037ff08ad1d6c..12df9d18d11465 100644
--- a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
@@ -222,7 +222,9 @@ void ModulatedDeformableCol2imCoordCPUKernel(
       const T offset_w = data_offset_ptr[data_offset_w_ptr];
       T inv_h = h_in + i * dilation_h + offset_h;
       T inv_w = w_in + j * dilation_w + offset_w;
-      if (!(inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)) {
+      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
+        inv_h = inv_w = -2;
+      } else {
         mval += data_col_ptr[col_pos] *
                 funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width,
                                           width,
diff --git a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
index d82d83f7e0311b..c6e54b070a6ab5 100644
--- a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
@@ -237,7 +237,9 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel(
       const T offset_w = data_offset_ptr[data_offset_w_ptr];
       T inv_h = h_in + i * dilation_h + offset_h;
       T inv_w = w_in + j * dilation_w + offset_w;
-      if (!(inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)) {
+      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
+        inv_h = inv_w = -2;
+      } else {
         mval += data_col_ptr[col_pos] *
                 funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width,
                                           width,
diff --git a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
index 8f78c99b4176f7..4b0f7e5e8a180c 100644
--- a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
@@ -66,51 +66,51 @@ HOSTDEVICE T DmcnGetCoordinateWeight(T argmax_h,
                                      const T* im_data,
                                      const int data_width,
                                      const int bp_dir) {
+  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
+      argmax_w >= width) {
+    return 0;
+  }
+
   int argmax_h_low = floor(argmax_h);
   int argmax_w_low = floor(argmax_w);
   int argmax_h_high = argmax_h_low + 1;
   int argmax_w_high = argmax_w_low + 1;
 
-  bool valid_argmax_h_low = (argmax_h_low >= 0) && (argmax_h_low < height);
-  bool valid_argmax_h_high = (argmax_h_high >= 0) && (argmax_h_high < height);
-  bool valid_argmax_w_low = (argmax_w_low >= 0) && (argmax_w_low < width);
-  bool valid_argmax_w_high = (argmax_w_high >= 0) && (argmax_w_high < width);
-
   T weight = 0;
 
   if (bp_dir == 0) {
-    weight += (valid_argmax_h_low && valid_argmax_w_low)
+    weight += (argmax_h_low >= 0 && argmax_w_low >= 0)
                   ? -1 * (argmax_w_low + 1 - argmax_w) *
                         im_data[argmax_h_low * data_width + argmax_w_low]
                   : 0;
 
-    weight += (valid_argmax_h_low && valid_argmax_w_high)
+    weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1)
                   ? -1 * (argmax_w - argmax_w_low) *
                         im_data[argmax_h_low * data_width + argmax_w_high]
                   : 0;
 
-    weight += (valid_argmax_h_high && valid_argmax_w_low)
+    weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0)
                   ? (argmax_w_low + 1 - argmax_w) *
                         im_data[argmax_h_high * data_width + argmax_w_low]
                   : 0;
-    weight += (valid_argmax_h_high && valid_argmax_w_high)
+    weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
                   ? (argmax_w - argmax_w_low) *
                         im_data[argmax_h_high * data_width + argmax_w_high]
                   : 0;
   } else if (bp_dir == 1) {
-    weight += (valid_argmax_h_low && valid_argmax_w_low)
+    weight += (argmax_h_low >= 0 && argmax_w_low >= 0)
                   ? -1 * (argmax_h_low + 1 - argmax_h) *
                         im_data[argmax_h_low * data_width + argmax_w_low]
                   : 0;
-    weight += (valid_argmax_h_low && valid_argmax_w_high)
+    weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1)
                   ? (argmax_h_low + 1 - argmax_h) *
                         im_data[argmax_h_low * data_width + argmax_w_high]
                   : 0;
-    weight += (valid_argmax_h_high && valid_argmax_w_low)
+    weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0)
                   ? -1 * (argmax_h - argmax_h_low) *
                         im_data[argmax_h_high * data_width + argmax_w_low]
                   : 0;
-    weight += (valid_argmax_h_high && valid_argmax_w_high)
+    weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
                   ? (argmax_h - argmax_h_low) *
                         im_data[argmax_h_high * data_width + argmax_w_high]
                   : 0;