@@ -34,9 +34,9 @@ namespace plugin {
// Launch-configuration constants shared by the kernel launch helpers below.
static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaximumNumBlocks = 4096;

// Computes the 1-D grid size for launching a kernel over N elements:
// ceil(N / kNumCUDAThreads), capped at kNumMaximumNumBlocks. Kernels in this
// file iterate with a grid-stride loop, so capping the grid is safe — excess
// elements are covered by extra loop iterations per thread.
//
// N is int64_t so that element counts above 2^31 (large batched tensors) do
// not overflow in the (N + kNumCUDAThreads - 1) rounding expression.
// std::min needs the explicit <int64_t> argument because the two operands
// would otherwise deduce different types (int64_t vs int).
// The result is at most kNumMaximumNumBlocks (4096), so the narrowing back
// to int is provably lossless; the cast makes that explicit.
static inline int NumBlocks(const int64_t N) {
  return static_cast<int>(
      std::min<int64_t>((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
                        static_cast<int64_t>(kNumMaximumNumBlocks)));
}
4141
4242static inline int ConvOutputSize (
@@ -367,66 +367,66 @@ __device__ half DmcnIm2colBilinear<half>(const half* bottom_data,
367367
// Im2col for modulated deformable convolution: expands the input image
// `data_im` into the column buffer `data_col`, one output element per
// grid-stride-loop iteration over `nthreads` total elements. For each kernel
// tap, the sampling position is displaced by the learned per-position
// offsets in `data_offset` (and a per-tap `data_mask` value is read;
// presumably it modulates the sampled value — confirm in the specialized
// bodies). Explicit specializations for float and half follow.
//
// All index-like parameters are int64_t so products such as
// batch_size * num_channels * height_col * width_col cannot overflow 32 bits
// for large tensors.
template <typename T>
__global__ void ModulatedDeformableIm2colGpuKernel(
    const int64_t nthreads,
    const T* data_im,
    const T* data_offset,
    const T* data_mask,
    const int64_t height,
    const int64_t width,
    const int64_t kernel_h,
    const int64_t kernel_w,
    const int64_t pad_h,
    const int64_t pad_w,
    const int64_t stride_h,
    const int64_t stride_w,
    const int64_t dilation_h,
    const int64_t dilation_w,
    const int64_t channel_per_deformable_group,
    const int64_t batch_size,
    const int64_t num_channels,
    const int64_t deformable_group,
    const int64_t height_col,
    const int64_t width_col,
    T* data_col);
391391
392392template <>
393393__global__ void ModulatedDeformableIm2colGpuKernel<float >(
394- const int nthreads,
394+ const int64_t nthreads,
395395 const float * data_im,
396396 const float * data_offset,
397397 const float * data_mask,
398- const int height,
399- const int width,
400- const int kernel_h,
401- const int kernel_w,
402- const int pad_h,
403- const int pad_w,
404- const int stride_h,
405- const int stride_w,
406- const int dilation_h,
407- const int dilation_w,
408- const int channel_per_deformable_group,
409- const int batch_size,
410- const int num_channels,
411- const int deformable_group,
412- const int height_col,
413- const int width_col,
398+ const int64_t height,
399+ const int64_t width,
400+ const int64_t kernel_h,
401+ const int64_t kernel_w,
402+ const int64_t pad_h,
403+ const int64_t pad_w,
404+ const int64_t stride_h,
405+ const int64_t stride_w,
406+ const int64_t dilation_h,
407+ const int64_t dilation_w,
408+ const int64_t channel_per_deformable_group,
409+ const int64_t batch_size,
410+ const int64_t num_channels,
411+ const int64_t deformable_group,
412+ const int64_t height_col,
413+ const int64_t width_col,
414414 float * data_col) {
415- int index = blockIdx .x * blockDim .x + threadIdx .x ;
416- int offset = blockDim .x * gridDim .x ;
415+ int64_t index = static_cast < int64_t >( blockIdx .x ) * blockDim .x + threadIdx .x ;
416+ int64_t offset = blockDim .x * static_cast < int64_t >( gridDim .x ) ;
417417
418418 float minus_one = -1 .0f , height_t = height, width_t = width;
419- for (size_t i = index; i < nthreads; i += offset) {
420- const int w_col = i % width_col;
421- const int h_col = (i / width_col) % height_col;
422- const int b_col = (i / width_col) / height_col % batch_size;
423- const int c_im = (i / width_col / height_col) / batch_size;
424- const int c_col = c_im * kernel_h * kernel_w;
419+ for (int64_t i = index; i < nthreads; i += offset) {
420+ const int64_t w_col = i % width_col;
421+ const int64_t h_col = (i / width_col) % height_col;
422+ const int64_t b_col = (i / width_col) / height_col % batch_size;
423+ const int64_t c_im = (i / width_col / height_col) / batch_size;
424+ const int64_t c_col = c_im * kernel_h * kernel_w;
425425
426- const int deformable_group_index = c_im / channel_per_deformable_group;
426+ const int64_t deformable_group_index = c_im / channel_per_deformable_group;
427427
428- const int h_in = h_col * stride_h - pad_h;
429- const int w_in = w_col * stride_w - pad_w;
428+ const int64_t h_in = h_col * stride_h - pad_h;
429+ const int64_t w_in = w_col * stride_w - pad_w;
430430
431431 float * data_col_ptr =
432432 data_col +
@@ -440,14 +440,14 @@ __global__ void ModulatedDeformableIm2colGpuKernel<float>(
440440 data_mask + (b_col * deformable_group + deformable_group_index) *
441441 kernel_h * kernel_w * height_col * width_col;
442442
443- for (int i = 0 ; i < kernel_h; ++i) {
444- for (int j = 0 ; j < kernel_w; ++j) {
445- const int data_offset_h_ptr =
443+ for (int64_t i = 0 ; i < kernel_h; ++i) {
444+ for (int64_t j = 0 ; j < kernel_w; ++j) {
445+ const int64_t data_offset_h_ptr =
446446 ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
447- const int data_offset_w_ptr =
447+ const int64_t data_offset_w_ptr =
448448 ((2 * (i * kernel_w + j) + 1 ) * height_col + h_col) * width_col +
449449 w_col;
450- const int data_mask_hw_ptr =
450+ const int64_t data_mask_hw_ptr =
451451 ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
452452
453453 const float offset_h = data_offset_ptr[data_offset_h_ptr];
@@ -471,43 +471,45 @@ __global__ void ModulatedDeformableIm2colGpuKernel<float>(
471471
472472template <>
473473__global__ void ModulatedDeformableIm2colGpuKernel<half>(
474- const int nthreads,
474+ const int64_t nthreads,
475475 const half* data_im,
476476 const half* data_offset,
477477 const half* data_mask,
478- const int height,
479- const int width,
480- const int kernel_h,
481- const int kernel_w,
482- const int pad_h,
483- const int pad_w,
484- const int stride_h,
485- const int stride_w,
486- const int dilation_h,
487- const int dilation_w,
488- const int channel_per_deformable_group,
489- const int batch_size,
490- const int num_channels,
491- const int deformable_group,
492- const int height_col,
493- const int width_col,
478+ const int64_t height,
479+ const int64_t width,
480+ const int64_t kernel_h,
481+ const int64_t kernel_w,
482+ const int64_t pad_h,
483+ const int64_t pad_w,
484+ const int64_t stride_h,
485+ const int64_t stride_w,
486+ const int64_t dilation_h,
487+ const int64_t dilation_w,
488+ const int64_t channel_per_deformable_group,
489+ const int64_t batch_size,
490+ const int64_t num_channels,
491+ const int64_t deformable_group,
492+ const int64_t height_col,
493+ const int64_t width_col,
494494 half* data_col) {
495495#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
496- int index = blockIdx .x * blockDim .x + threadIdx .x ;
497- int offset = blockDim .x * gridDim .x ;
496+ int64_t index = static_cast < int64_t >( blockIdx .x ) * blockDim .x + threadIdx .x ;
497+ int64_t offset = blockDim .x * static_cast < int64_t >( gridDim .x ) ;
498498
499- half minus_one = -1 .0f , height_t = height, width_t = width;
499+ half minus_one = -1 .0f ,
500+ height_t = static_cast <half>(static_cast <float >(height)),
501+ width_t = static_cast <half>(static_cast <float >(width));
500502 for (size_t i = index; i < nthreads; i += offset) {
501- const int w_col = i % width_col;
502- const int h_col = (i / width_col) % height_col;
503- const int b_col = (i / width_col) / height_col % batch_size;
504- const int c_im = (i / width_col / height_col) / batch_size;
505- const int c_col = c_im * kernel_h * kernel_w;
503+ const int64_t w_col = i % width_col;
504+ const int64_t h_col = (i / width_col) % height_col;
505+ const int64_t b_col = (i / width_col) / height_col % batch_size;
506+ const int64_t c_im = (i / width_col / height_col) / batch_size;
507+ const int64_t c_col = c_im * kernel_h * kernel_w;
506508
507- const int deformable_group_index = c_im / channel_per_deformable_group;
509+ const int64_t deformable_group_index = c_im / channel_per_deformable_group;
508510
509- const int h_in = h_col * stride_h - pad_h;
510- const int w_in = w_col * stride_w - pad_w;
511+ const int64_t h_in = h_col * stride_h - pad_h;
512+ const int64_t w_in = w_col * stride_w - pad_w;
511513
512514 half* data_col_ptr =
513515 data_col +
@@ -521,21 +523,22 @@ __global__ void ModulatedDeformableIm2colGpuKernel<half>(
521523 data_mask + (b_col * deformable_group + deformable_group_index) *
522524 kernel_h * kernel_w * height_col * width_col;
523525
524- for (int i = 0 ; i < kernel_h; ++i) {
525- for (int j = 0 ; j < kernel_w; ++j) {
526- const int data_offset_h_ptr =
526+ for (int64_t i = 0 ; i < kernel_h; ++i) {
527+ for (int64_t j = 0 ; j < kernel_w; ++j) {
528+ const int64_t data_offset_h_ptr =
527529 ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
528- const int data_offset_w_ptr =
530+ const int64_t data_offset_w_ptr =
529531 ((2 * (i * kernel_w + j) + 1 ) * height_col + h_col) * width_col +
530532 w_col;
531- const int data_mask_hw_ptr =
533+ const int64_t data_mask_hw_ptr =
532534 ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
533535
534536 const half offset_h = data_offset_ptr[data_offset_h_ptr];
535537 const half offset_w = data_offset_ptr[data_offset_w_ptr];
536538 const half mask = data_mask_ptr[data_mask_hw_ptr];
537539 half val = 0 ;
538- half h_im_t = h_in + i * dilation_h, w_im_t = w_in + j * dilation_w;
540+ half h_im_t = static_cast <float >(h_in) + i * dilation_h,
541+ w_im_t = static_cast <float >(w_in) + j * dilation_w;
539542 const half h_im = h_im_t + offset_h;
540543 const half w_im = w_im_t + offset_w;
541544 if (h_im > minus_one && w_im > minus_one && h_im < height_t &&
0 commit comments