Skip to content

Commit 463c72c

Browse files
refine gpu kernel config for Paddle (#28085)
1 parent 2cb1ecb commit 463c72c

File tree

14 files changed

+173
-208
lines changed

14 files changed

+173
-208
lines changed

paddle/fluid/operators/bce_loss_op.cu

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -70,7 +70,7 @@ class BCELossCUDAKernel : public framework::OpKernel<T> {
7070
auto x_numel = x->numel();
7171

7272
platform::GpuLaunchConfig config =
73-
platform::getGpuLaunchConfig(x_numel, ctx);
73+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), x_numel);
7474

7575
Tensor x_cpu;
7676
framework::TensorCopy(*x, platform::CPUPlace(), &x_cpu);
@@ -89,9 +89,9 @@ class BCELossCUDAKernel : public framework::OpKernel<T> {
8989

9090
auto& dev_ctx = ctx.cuda_device_context();
9191

92-
GPUBCELossForward<
93-
T><<<config.blocks, config.threads, 0, dev_ctx.stream()>>>(
94-
x_data, labels->data<T>(), out_data, x_numel);
92+
GPUBCELossForward<T><<<config.block_per_grid, config.thread_per_block, 0,
93+
dev_ctx.stream()>>>(x_data, labels->data<T>(),
94+
out_data, x_numel);
9595
}
9696
};
9797

@@ -106,12 +106,12 @@ class BCELossGradCUDAKernel : public framework::OpKernel<T> {
106106
auto dx_data = dx->mutable_data<T>(ctx.GetPlace());
107107

108108
int x_numel = x->numel();
109-
platform::GpuLaunchConfig config =
110-
platform::getGpuLaunchConfig(x_numel, ctx);
111109
auto& dev_ctx = ctx.cuda_device_context();
110+
platform::GpuLaunchConfig config =
111+
platform::GetGpuLaunchConfig1D(dev_ctx, x_numel);
112112

113-
GPUBCELossBackward<
114-
T><<<config.blocks, config.threads, 0, dev_ctx.stream()>>>(
113+
GPUBCELossBackward<T><<<config.block_per_grid, config.thread_per_block, 0,
114+
dev_ctx.stream()>>>(
115115
x->data<T>(), labels->data<T>(), dout->data<T>(), dx_data, x_numel);
116116
}
117117
};

paddle/fluid/operators/bilateral_slice_op.cu

Lines changed: 18 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -165,10 +165,11 @@ class BilateralSliceOpCUDAKernel : public framework::OpKernel<T> {
165165
int total_count = batch_size * h * w * output_dims[1];
166166

167167
platform::GpuLaunchConfig config =
168-
platform::getGpuLaunchConfig(total_count, ctx);
168+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), total_count);
169169

170-
BilateralSliceCudaForwardKernel<T><<<config.blocks, config.threads, 0,
171-
ctx.cuda_device_context().stream()>>>(
170+
BilateralSliceCudaForwardKernel<
171+
T><<<config.block_per_grid, config.thread_per_block, 0,
172+
ctx.cuda_device_context().stream()>>>(
172173
output_data, grid_data, guide_data, input_data, grid_sizes, has_offset,
173174
total_count, output_dims[1]);
174175
}
@@ -472,24 +473,29 @@ class BilateralSliceGradOpCUDAKernel : public framework::OpKernel<T> {
472473
grid_sizes.input_chans = input_chans;
473474

474475
platform::GpuLaunchConfig config =
475-
platform::getGpuLaunchConfig(grid_count, ctx, 512);
476+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), grid_count);
476477

477-
BilateralSliceCudaGridGradKernel<T><<<config.blocks, config.threads, 0,
478-
ctx.cuda_device_context().stream()>>>(
478+
BilateralSliceCudaGridGradKernel<
479+
T><<<config.block_per_grid, config.thread_per_block, 0,
480+
ctx.cuda_device_context().stream()>>>(
479481
grid_grad_data, output_grad_data, guide_data, input_data, grid_sizes,
480482
has_offset, grid_count, output_chans);
481483

482-
config = platform::getGpuLaunchConfig(guide_count, ctx, 512);
484+
config =
485+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), guide_count);
483486

484-
BilateralSliceCudaGuideGradKernel<T><<<
485-
config.blocks, config.threads, 0, ctx.cuda_device_context().stream()>>>(
487+
BilateralSliceCudaGuideGradKernel<
488+
T><<<config.block_per_grid, config.thread_per_block, 0,
489+
ctx.cuda_device_context().stream()>>>(
486490
guide_grad_data, output_grad_data, grid_data, guide_data, input_data,
487491
grid_sizes, has_offset, guide_count, output_chans);
488492

489-
config = platform::getGpuLaunchConfig(input_count, ctx, 512);
493+
config =
494+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), input_count);
490495

491-
BilateralSliceCudaInputGradKernel<T><<<
492-
config.blocks, config.threads, 0, ctx.cuda_device_context().stream()>>>(
496+
BilateralSliceCudaInputGradKernel<
497+
T><<<config.block_per_grid, config.thread_per_block, 0,
498+
ctx.cuda_device_context().stream()>>>(
493499
input_grad_data, output_grad_data, grid_data, guide_data, grid_sizes,
494500
has_offset, input_count, output_chans);
495501
}

paddle/fluid/operators/cumsum_op.cu

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@ limitations under the License. */
1717
#include <thrust/reverse.h>
1818
#include <thrust/scan.h>
1919
#include "paddle/fluid/operators/cum_op.h"
20-
#include "paddle/fluid/platform/gpu_launch_param_config.h"
20+
#include "paddle/fluid/platform/gpu_launch_config.h"
2121

2222
using Tensor = paddle::framework::Tensor;
2323
using LoDTensor = paddle::framework::LoDTensor;

paddle/fluid/operators/interpolate_op.cu

Lines changed: 22 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -887,10 +887,10 @@ static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx,
887887
int pixelNum = n * out_cw;
888888

889889
platform::GpuLaunchConfig config =
890-
platform::getGpuLaunchConfig(pixelNum, ctx);
890+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum);
891891

892892
if ("linear" == interp_method) {
893-
KeLinearInterpFw<T><<<config.blocks, config.threads, 0,
893+
KeLinearInterpFw<T><<<config.block_per_grid, config.thread_per_block, 0,
894894
ctx.cuda_device_context().stream()>>>(
895895
input_data, in_w, in_cw, output_data, out_w, n, out_cw, c, ratio_w,
896896
align_corners, align_mode, data_layout);
@@ -981,21 +981,22 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx,
981981
int pixelNum = n * out_chw;
982982

983983
platform::GpuLaunchConfig config =
984-
platform::getGpuLaunchConfig(pixelNum, ctx);
984+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum);
985985

986986
if ("nearest" == interp_method) {
987-
KeNearestNeighborInterpFw<T><<<config.blocks, config.threads, 0,
988-
ctx.cuda_device_context().stream()>>>(
987+
KeNearestNeighborInterpFw<
988+
T><<<config.block_per_grid, config.thread_per_block, 0,
989+
ctx.cuda_device_context().stream()>>>(
989990
input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
990991
out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
991992
} else if ("bilinear" == interp_method) {
992-
KeBilinearInterpFw<T><<<config.blocks, config.threads, 0,
993+
KeBilinearInterpFw<T><<<config.block_per_grid, config.thread_per_block, 0,
993994
ctx.cuda_device_context().stream()>>>(
994995
input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
995996
out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_layout);
996997
} else if ("bicubic" == interp_method) {
997-
KeBicubicInterpFw<
998-
T><<<config.blocks, 512, 0, ctx.cuda_device_context().stream()>>>(
998+
KeBicubicInterpFw<T><<<config.block_per_grid, 512, 0,
999+
ctx.cuda_device_context().stream()>>>(
9991000
input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
10001001
out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
10011002
}
@@ -1097,10 +1098,10 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx,
10971098
int pixelNum = n * out_cdhw;
10981099

10991100
platform::GpuLaunchConfig config =
1100-
platform::getGpuLaunchConfig(pixelNum, ctx);
1101+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum);
11011102

11021103
if ("trilinear" == interp_method) {
1103-
KeTrilinearInterpFw<T><<<config.blocks, config.threads, 0,
1104+
KeTrilinearInterpFw<T><<<config.block_per_grid, config.thread_per_block, 0,
11041105
ctx.cuda_device_context().stream()>>>(
11051106
input_data, in_d, in_h, in_w, n, in_cdhw, output_data, out_d, out_h,
11061107
out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners,
@@ -1176,10 +1177,10 @@ static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx,
11761177
int pixelNum = n * out_cw;
11771178

11781179
platform::GpuLaunchConfig config =
1179-
platform::getGpuLaunchConfig(pixelNum, ctx);
1180+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum);
11801181

11811182
if ("linear" == interp_method) {
1182-
KeLinearInterpBw<T><<<config.blocks, config.threads, 0,
1183+
KeLinearInterpBw<T><<<config.block_per_grid, config.thread_per_block, 0,
11831184
ctx.cuda_device_context().stream()>>>(
11841185
input_grad_data, in_w, in_cw, output_grad_data, out_w, n, out_cw, c,
11851186
ratio_w, align_corners, align_mode, data_layout);
@@ -1267,22 +1268,23 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx,
12671268
int pixelNum = n * out_chw;
12681269

12691270
platform::GpuLaunchConfig config =
1270-
platform::getGpuLaunchConfig(pixelNum, ctx);
1271+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum);
12711272

12721273
if ("nearest" == interp_method) {
1273-
KeNearestNeighborInterpBw<T><<<config.blocks, config.threads, 0,
1274-
ctx.cuda_device_context().stream()>>>(
1274+
KeNearestNeighborInterpBw<
1275+
T><<<config.block_per_grid, config.thread_per_block, 0,
1276+
ctx.cuda_device_context().stream()>>>(
12751277
input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
12761278
n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
12771279
} else if ("bilinear" == interp_method) {
1278-
KeBilinearInterpBw<T><<<config.blocks, config.threads, 0,
1280+
KeBilinearInterpBw<T><<<config.block_per_grid, config.thread_per_block, 0,
12791281
ctx.cuda_device_context().stream()>>>(
12801282
input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
12811283
n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode,
12821284
data_layout);
12831285
} else if ("bicubic" == interp_method) {
1284-
KeBicubicInterpBw<
1285-
T><<<config.blocks, 512, 0, ctx.cuda_device_context().stream()>>>(
1286+
KeBicubicInterpBw<T><<<config.block_per_grid, 512, 0,
1287+
ctx.cuda_device_context().stream()>>>(
12861288
input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
12871289
n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
12881290
}
@@ -1378,10 +1380,10 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx,
13781380
int pixelNum = n * out_cdhw;
13791381

13801382
platform::GpuLaunchConfig config =
1381-
platform::getGpuLaunchConfig(pixelNum, ctx);
1383+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum);
13821384

13831385
if ("trilinear" == interp_method) {
1384-
KeTrilinearInterpBw<T><<<config.blocks, config.threads, 0,
1386+
KeTrilinearInterpBw<T><<<config.block_per_grid, config.thread_per_block, 0,
13851387
ctx.cuda_device_context().stream()>>>(
13861388
input_grad_data, in_d, in_h, in_w, n, in_cdhw, output_grad_data, out_d,
13871389
out_h, out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners,

paddle/fluid/operators/interpolate_v2_op.cu

Lines changed: 22 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -899,10 +899,10 @@ static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx,
899899
int pixelNum = n * out_cw;
900900

901901
platform::GpuLaunchConfig config =
902-
platform::getGpuLaunchConfig(pixelNum, ctx);
902+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum);
903903

904904
if ("linear" == interp_method) {
905-
KeLinearInterpFw<T><<<config.blocks, config.threads, 0,
905+
KeLinearInterpFw<T><<<config.block_per_grid, config.thread_per_block, 0,
906906
ctx.cuda_device_context().stream()>>>(
907907
input_data, in_w, in_cw, output_data, out_w, n, out_cw, c, ratio_w,
908908
align_corners, align_mode, data_layout);
@@ -1018,21 +1018,22 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx,
10181018
int pixelNum = n * out_chw;
10191019

10201020
platform::GpuLaunchConfig config =
1021-
platform::getGpuLaunchConfig(pixelNum, ctx);
1021+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum);
10221022

10231023
if ("nearest" == interp_method) {
1024-
KeNearestNeighborInterpFw<T><<<config.blocks, config.threads, 0,
1025-
ctx.cuda_device_context().stream()>>>(
1024+
KeNearestNeighborInterpFw<
1025+
T><<<config.block_per_grid, config.thread_per_block, 0,
1026+
ctx.cuda_device_context().stream()>>>(
10261027
input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
10271028
out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
10281029
} else if ("bilinear" == interp_method) {
1029-
KeBilinearInterpFw<T><<<config.blocks, config.threads, 0,
1030+
KeBilinearInterpFw<T><<<config.block_per_grid, config.thread_per_block, 0,
10301031
ctx.cuda_device_context().stream()>>>(
10311032
input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
10321033
out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_layout);
10331034
} else if ("bicubic" == interp_method) {
1034-
KeBicubicInterpFw<
1035-
T><<<config.blocks, 512, 0, ctx.cuda_device_context().stream()>>>(
1035+
KeBicubicInterpFw<T><<<config.block_per_grid, 512, 0,
1036+
ctx.cuda_device_context().stream()>>>(
10361037
input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
10371038
out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
10381039
}
@@ -1167,10 +1168,10 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx,
11671168
int pixelNum = n * out_cdhw;
11681169

11691170
platform::GpuLaunchConfig config =
1170-
platform::getGpuLaunchConfig(pixelNum, ctx);
1171+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum);
11711172

11721173
if ("trilinear" == interp_method) {
1173-
KeTrilinearInterpFw<T><<<config.blocks, config.threads, 0,
1174+
KeTrilinearInterpFw<T><<<config.block_per_grid, config.thread_per_block, 0,
11741175
ctx.cuda_device_context().stream()>>>(
11751176
input_data, in_d, in_h, in_w, n, in_cdhw, output_data, out_d, out_h,
11761177
out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners,
@@ -1259,10 +1260,10 @@ static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx,
12591260
int pixelNum = n * out_cw;
12601261

12611262
platform::GpuLaunchConfig config =
1262-
platform::getGpuLaunchConfig(pixelNum, ctx);
1263+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum);
12631264

12641265
if ("linear" == interp_method) {
1265-
KeLinearInterpBw<T><<<config.blocks, config.threads, 0,
1266+
KeLinearInterpBw<T><<<config.block_per_grid, config.thread_per_block, 0,
12661267
ctx.cuda_device_context().stream()>>>(
12671268
input_grad_data, in_w, in_cw, output_grad_data, out_w, n, out_cw, c,
12681269
ratio_w, align_corners, align_mode, data_layout);
@@ -1376,22 +1377,23 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx,
13761377
int pixelNum = n * out_chw;
13771378

13781379
platform::GpuLaunchConfig config =
1379-
platform::getGpuLaunchConfig(pixelNum, ctx);
1380+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum);
13801381

13811382
if ("nearest" == interp_method) {
1382-
KeNearestNeighborInterpBw<T><<<config.blocks, config.threads, 0,
1383-
ctx.cuda_device_context().stream()>>>(
1383+
KeNearestNeighborInterpBw<
1384+
T><<<config.block_per_grid, config.thread_per_block, 0,
1385+
ctx.cuda_device_context().stream()>>>(
13841386
input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
13851387
n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
13861388
} else if ("bilinear" == interp_method) {
1387-
KeBilinearInterpBw<T><<<config.blocks, config.threads, 0,
1389+
KeBilinearInterpBw<T><<<config.block_per_grid, config.thread_per_block, 0,
13881390
ctx.cuda_device_context().stream()>>>(
13891391
input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
13901392
n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode,
13911393
data_layout);
13921394
} else if ("bicubic" == interp_method) {
1393-
KeBicubicInterpBw<
1394-
T><<<config.blocks, 512, 0, ctx.cuda_device_context().stream()>>>(
1395+
KeBicubicInterpBw<T><<<config.block_per_grid, 512, 0,
1396+
ctx.cuda_device_context().stream()>>>(
13951397
input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
13961398
n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
13971399
}
@@ -1520,10 +1522,10 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx,
15201522
int pixelNum = n * out_cdhw;
15211523

15221524
platform::GpuLaunchConfig config =
1523-
platform::getGpuLaunchConfig(pixelNum, ctx);
1525+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum);
15241526

15251527
if ("trilinear" == interp_method) {
1526-
KeTrilinearInterpBw<T><<<config.blocks, config.threads, 0,
1528+
KeTrilinearInterpBw<T><<<config.block_per_grid, config.thread_per_block, 0,
15271529
ctx.cuda_device_context().stream()>>>(
15281530
input_grad_data, in_d, in_h, in_w, n, in_cdhw, output_grad_data, out_d,
15291531
out_h, out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners,

paddle/fluid/operators/math/segment_pooling.cu

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@ limitations under the License. */
1717
#include "paddle/fluid/operators/math/math_function.h"
1818
#include "paddle/fluid/operators/math/segment_pooling.h"
1919
#include "paddle/fluid/platform/cuda_primitives.h"
20-
#include "paddle/fluid/platform/gpu_launch_param_config.h"
20+
#include "paddle/fluid/platform/gpu_launch_config.h"
2121

2222
namespace paddle {
2323
namespace operators {

paddle/fluid/operators/mish_op.cu

Lines changed: 12 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -87,8 +87,9 @@ class MishCUDAKernel : public framework::OpKernel<T> {
8787

8888
const int numel = x->numel();
8989

90-
platform::GpuLaunchConfig config = platform::getGpuLaunchConfig(numel, ctx);
91-
KeMishFw<T><<<config.blocks, config.threads, 0,
90+
platform::GpuLaunchConfig config =
91+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), numel);
92+
KeMishFw<T><<<config.block_per_grid, config.thread_per_block, 0,
9293
ctx.cuda_device_context().stream()>>>(x_data, out_data, numel,
9394
threshold);
9495
}
@@ -108,8 +109,9 @@ class MishFP32CUDAKernel : public framework::OpKernel<float> {
108109

109110
const int numel = x->numel();
110111

111-
platform::GpuLaunchConfig config = platform::getGpuLaunchConfig(numel, ctx);
112-
KeMishFwFP32<<<config.blocks, config.threads, 0,
112+
platform::GpuLaunchConfig config =
113+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), numel);
114+
KeMishFwFP32<<<config.block_per_grid, config.thread_per_block, 0,
113115
ctx.cuda_device_context().stream()>>>(x_data, out_data,
114116
numel, threshold);
115117
}
@@ -131,8 +133,9 @@ class MishGradCUDAKernel : public framework::OpKernel<T> {
131133

132134
const int numel = x->numel();
133135

134-
platform::GpuLaunchConfig config = platform::getGpuLaunchConfig(numel, ctx);
135-
KeMishBw<T><<<config.blocks, config.threads, 0,
136+
platform::GpuLaunchConfig config =
137+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), numel);
138+
KeMishBw<T><<<config.block_per_grid, config.thread_per_block, 0,
136139
ctx.cuda_device_context().stream()>>>(
137140
x_data, dout_data, dx_data, numel, threshold);
138141
}
@@ -154,8 +157,9 @@ class MishGradFP32CUDAKernel : public framework::OpKernel<float> {
154157

155158
const int numel = x->numel();
156159

157-
platform::GpuLaunchConfig config = platform::getGpuLaunchConfig(numel, ctx);
158-
KeMishBwFP32<<<config.blocks, config.threads, 0,
160+
platform::GpuLaunchConfig config =
161+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), numel);
162+
KeMishBwFP32<<<config.block_per_grid, config.thread_per_block, 0,
159163
ctx.cuda_device_context().stream()>>>(
160164
x_data, dout_data, dx_data, numel, threshold);
161165
}

paddle/fluid/operators/mv_op.cu

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
1313
limitations under the License. */
1414

1515
#include "paddle/fluid/operators/mv_op.h"
16-
#include "paddle/fluid/platform/gpu_launch_param_config.h"
16+
#include "paddle/fluid/platform/gpu_launch_config.h"
1717

1818
namespace paddle {
1919
namespace operators {

paddle/fluid/operators/segment_pool_op.cu

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ limitations under the License. */
1515
#include "paddle/fluid/operators/gather.cu.h"
1616
#include "paddle/fluid/operators/segment_pool_op.h"
1717
#include "paddle/fluid/platform/cuda_primitives.h"
18-
#include "paddle/fluid/platform/gpu_launch_param_config.h"
18+
#include "paddle/fluid/platform/gpu_launch_config.h"
1919

2020
namespace ops = paddle::operators;
2121
REGISTER_OP_CUDA_KERNEL(

0 commit comments

Comments (0)