Skip to content

Commit 463c72c

Browse files
refine gpu kernel config for Paddle (#28085)
1 parent 2cb1ecb commit 463c72c

File tree

14 files changed

+173
-208
lines changed

14 files changed

+173
-208
lines changed

paddle/fluid/operators/bce_loss_op.cu

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -70,7 +70,7 @@ class BCELossCUDAKernel : public framework::OpKernel<T> {
7070
auto x_numel = x->numel();
7171

7272
platform::GpuLaunchConfig config =
73-
platform::getGpuLaunchConfig(x_numel, ctx);
73+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), x_numel);
7474

7575
Tensor x_cpu;
7676
framework::TensorCopy(*x, platform::CPUPlace(), &x_cpu);
@@ -89,9 +89,9 @@ class BCELossCUDAKernel : public framework::OpKernel<T> {
8989

9090
auto& dev_ctx = ctx.cuda_device_context();
9191

92-
GPUBCELossForward<
93-
T><<<config.blocks, config.threads, 0, dev_ctx.stream()>>>(
94-
x_data, labels->data<T>(), out_data, x_numel);
92+
GPUBCELossForward<T><<<config.block_per_grid, config.thread_per_block, 0,
93+
dev_ctx.stream()>>>(x_data, labels->data<T>(),
94+
out_data, x_numel);
9595
}
9696
};
9797

@@ -106,12 +106,12 @@ class BCELossGradCUDAKernel : public framework::OpKernel<T> {
106106
auto dx_data = dx->mutable_data<T>(ctx.GetPlace());
107107

108108
int x_numel = x->numel();
109-
platform::GpuLaunchConfig config =
110-
platform::getGpuLaunchConfig(x_numel, ctx);
111109
auto& dev_ctx = ctx.cuda_device_context();
110+
platform::GpuLaunchConfig config =
111+
platform::GetGpuLaunchConfig1D(dev_ctx, x_numel);
112112

113-
GPUBCELossBackward<
114-
T><<<config.blocks, config.threads, 0, dev_ctx.stream()>>>(
113+
GPUBCELossBackward<T><<<config.block_per_grid, config.thread_per_block, 0,
114+
dev_ctx.stream()>>>(
115115
x->data<T>(), labels->data<T>(), dout->data<T>(), dx_data, x_numel);
116116
}
117117
};

paddle/fluid/operators/bilateral_slice_op.cu

Lines changed: 18 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -165,10 +165,11 @@ class BilateralSliceOpCUDAKernel : public framework::OpKernel<T> {
165165
int total_count = batch_size * h * w * output_dims[1];
166166

167167
platform::GpuLaunchConfig config =
168-
platform::getGpuLaunchConfig(total_count, ctx);
168+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), total_count);
169169

170-
BilateralSliceCudaForwardKernel<T><<<config.blocks, config.threads, 0,
171-
ctx.cuda_device_context().stream()>>>(
170+
BilateralSliceCudaForwardKernel<
171+
T><<<config.block_per_grid, config.thread_per_block, 0,
172+
ctx.cuda_device_context().stream()>>>(
172173
output_data, grid_data, guide_data, input_data, grid_sizes, has_offset,
173174
total_count, output_dims[1]);
174175
}
@@ -472,24 +473,29 @@ class BilateralSliceGradOpCUDAKernel : public framework::OpKernel<T> {
472473
grid_sizes.input_chans = input_chans;
473474

474475
platform::GpuLaunchConfig config =
475-
platform::getGpuLaunchConfig(grid_count, ctx, 512);
476+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), grid_count);
476477

477-
BilateralSliceCudaGridGradKernel<T><<<config.blocks, config.threads, 0,
478-
ctx.cuda_device_context().stream()>>>(
478+
BilateralSliceCudaGridGradKernel<
479+
T><<<config.block_per_grid, config.thread_per_block, 0,
480+
ctx.cuda_device_context().stream()>>>(
479481
grid_grad_data, output_grad_data, guide_data, input_data, grid_sizes,
480482
has_offset, grid_count, output_chans);
481483

482-
config = platform::getGpuLaunchConfig(guide_count, ctx, 512);
484+
config =
485+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), guide_count);
483486

484-
BilateralSliceCudaGuideGradKernel<T><<<
485-
config.blocks, config.threads, 0, ctx.cuda_device_context().stream()>>>(
487+
BilateralSliceCudaGuideGradKernel<
488+
T><<<config.block_per_grid, config.thread_per_block, 0,
489+
ctx.cuda_device_context().stream()>>>(
486490
guide_grad_data, output_grad_data, grid_data, guide_data, input_data,
487491
grid_sizes, has_offset, guide_count, output_chans);
488492

489-
config = platform::getGpuLaunchConfig(input_count, ctx, 512);
493+
config =
494+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), input_count);
490495

491-
BilateralSliceCudaInputGradKernel<T><<<
492-
config.blocks, config.threads, 0, ctx.cuda_device_context().stream()>>>(
496+
BilateralSliceCudaInputGradKernel<
497+
T><<<config.block_per_grid, config.thread_per_block, 0,
498+
ctx.cuda_device_context().stream()>>>(
493499
input_grad_data, output_grad_data, grid_data, guide_data, grid_sizes,
494500
has_offset, input_count, output_chans);
495501
}

paddle/fluid/operators/cumsum_op.cu

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@ limitations under the License. */
1717
#include <thrust/reverse.h>
1818
#include <thrust/scan.h>
1919
#include "paddle/fluid/operators/cum_op.h"
20-
#include "paddle/fluid/platform/gpu_launch_param_config.h"
20+
#include "paddle/fluid/platform/gpu_launch_config.h"
2121

2222
using Tensor = paddle::framework::Tensor;
2323
using LoDTensor = paddle::framework::LoDTensor;

paddle/fluid/operators/interpolate_op.cu

Lines changed: 22 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -887,10 +887,10 @@ static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx,
887887
int pixelNum = n * out_cw;
888888

889889
platform::GpuLaunchConfig config =
890-
platform::getGpuLaunchConfig(pixelNum, ctx);
890+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum);
891891

892892
if ("linear" == interp_method) {
893-
KeLinearInterpFw<T><<<config.blocks, config.threads, 0,
893+
KeLinearInterpFw<T><<<config.block_per_grid, config.thread_per_block, 0,
894894
ctx.cuda_device_context().stream()>>>(
895895
input_data, in_w, in_cw, output_data, out_w, n, out_cw, c, ratio_w,
896896
align_corners, align_mode, data_layout);
@@ -981,21 +981,22 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx,
981981
int pixelNum = n * out_chw;
982982

983983
platform::GpuLaunchConfig config =
984-
platform::getGpuLaunchConfig(pixelNum, ctx);
984+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum);
985985

986986
if ("nearest" == interp_method) {
987-
KeNearestNeighborInterpFw<T><<<config.blocks, config.threads, 0,
988-
ctx.cuda_device_context().stream()>>>(
987+
KeNearestNeighborInterpFw<
988+
T><<<config.block_per_grid, config.thread_per_block, 0,
989+
ctx.cuda_device_context().stream()>>>(
989990
input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
990991
out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
991992
} else if ("bilinear" == interp_method) {
992-
KeBilinearInterpFw<T><<<config.blocks, config.threads, 0,
993+
KeBilinearInterpFw<T><<<config.block_per_grid, config.thread_per_block, 0,
993994
ctx.cuda_device_context().stream()>>>(
994995
input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
995996
out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_layout);
996997
} else if ("bicubic" == interp_method) {
997-
KeBicubicInterpFw<
998-
T><<<config.blocks, 512, 0, ctx.cuda_device_context().stream()>>>(
998+
KeBicubicInterpFw<T><<<config.block_per_grid, 512, 0,
999+
ctx.cuda_device_context().stream()>>>(
9991000
input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
10001001
out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
10011002
}
@@ -1097,10 +1098,10 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx,
10971098
int pixelNum = n * out_cdhw;
10981099

10991100
platform::GpuLaunchConfig config =
1100-
platform::getGpuLaunchConfig(pixelNum, ctx);
1101+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum);
11011102

11021103
if ("trilinear" == interp_method) {
1103-
KeTrilinearInterpFw<T><<<config.blocks, config.threads, 0,
1104+
KeTrilinearInterpFw<T><<<config.block_per_grid, config.thread_per_block, 0,
11041105
ctx.cuda_device_context().stream()>>>(
11051106
input_data, in_d, in_h, in_w, n, in_cdhw, output_data, out_d, out_h,
11061107
out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners,
@@ -1176,10 +1177,10 @@ static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx,
11761177
int pixelNum = n * out_cw;
11771178

11781179
platform::GpuLaunchConfig config =
1179-
platform::getGpuLaunchConfig(pixelNum, ctx);
1180+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum);
11801181

11811182
if ("linear" == interp_method) {
1182-
KeLinearInterpBw<T><<<config.blocks, config.threads, 0,
1183+
KeLinearInterpBw<T><<<config.block_per_grid, config.thread_per_block, 0,
11831184
ctx.cuda_device_context().stream()>>>(
11841185
input_grad_data, in_w, in_cw, output_grad_data, out_w, n, out_cw, c,
11851186
ratio_w, align_corners, align_mode, data_layout);
@@ -1267,22 +1268,23 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx,
12671268
int pixelNum = n * out_chw;
12681269

12691270
platform::GpuLaunchConfig config =
1270-
platform::getGpuLaunchConfig(pixelNum, ctx);
1271+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum);
12711272

12721273
if ("nearest" == interp_method) {
1273-
KeNearestNeighborInterpBw<T><<<config.blocks, config.threads, 0,
1274-
ctx.cuda_device_context().stream()>>>(
1274+
KeNearestNeighborInterpBw<
1275+
T><<<config.block_per_grid, config.thread_per_block, 0,
1276+
ctx.cuda_device_context().stream()>>>(
12751277
input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
12761278
n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
12771279
} else if ("bilinear" == interp_method) {
1278-
KeBilinearInterpBw<T><<<config.blocks, config.threads, 0,
1280+
KeBilinearInterpBw<T><<<config.block_per_grid, config.thread_per_block, 0,
12791281
ctx.cuda_device_context().stream()>>>(
12801282
input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
12811283
n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode,
12821284
data_layout);
12831285
} else if ("bicubic" == interp_method) {
1284-
KeBicubicInterpBw<
1285-
T><<<config.blocks, 512, 0, ctx.cuda_device_context().stream()>>>(
1286+
KeBicubicInterpBw<T><<<config.block_per_grid, 512, 0,
1287+
ctx.cuda_device_context().stream()>>>(
12861288
input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
12871289
n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
12881290
}
@@ -1378,10 +1380,10 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx,
13781380
int pixelNum = n * out_cdhw;
13791381

13801382
platform::GpuLaunchConfig config =
1381-
platform::getGpuLaunchConfig(pixelNum, ctx);
1383+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum);
13821384

13831385
if ("trilinear" == interp_method) {
1384-
KeTrilinearInterpBw<T><<<config.blocks, config.threads, 0,
1386+
KeTrilinearInterpBw<T><<<config.block_per_grid, config.thread_per_block, 0,
13851387
ctx.cuda_device_context().stream()>>>(
13861388
input_grad_data, in_d, in_h, in_w, n, in_cdhw, output_grad_data, out_d,
13871389
out_h, out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners,

paddle/fluid/operators/interpolate_v2_op.cu

Lines changed: 22 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -899,10 +899,10 @@ static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx,
899899
int pixelNum = n * out_cw;
900900

901901
platform::GpuLaunchConfig config =
902-
platform::getGpuLaunchConfig(pixelNum, ctx);
902+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum);
903903

904904
if ("linear" == interp_method) {
905-
KeLinearInterpFw<T><<<config.blocks, config.threads, 0,
905+
KeLinearInterpFw<T><<<config.block_per_grid, config.thread_per_block, 0,
906906
ctx.cuda_device_context().stream()>>>(
907907
input_data, in_w, in_cw, output_data, out_w, n, out_cw, c, ratio_w,
908908
align_corners, align_mode, data_layout);
@@ -1018,21 +1018,22 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx,
10181018
int pixelNum = n * out_chw;
10191019

10201020
platform::GpuLaunchConfig config =
1021-
platform::getGpuLaunchConfig(pixelNum, ctx);
1021+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum);
10221022

10231023
if ("nearest" == interp_method) {
1024-
KeNearestNeighborInterpFw<T><<<config.blocks, config.threads, 0,
1025-
ctx.cuda_device_context().stream()>>>(
1024+
KeNearestNeighborInterpFw<
1025+
T><<<config.block_per_grid, config.thread_per_block, 0,
1026+
ctx.cuda_device_context().stream()>>>(
10261027
input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
10271028
out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
10281029
} else if ("bilinear" == interp_method) {
1029-
KeBilinearInterpFw<T><<<config.blocks, config.threads, 0,
1030+
KeBilinearInterpFw<T><<<config.block_per_grid, config.thread_per_block, 0,
10301031
ctx.cuda_device_context().stream()>>>(
10311032
input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
10321033
out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_layout);
10331034
} else if ("bicubic" == interp_method) {
1034-
KeBicubicInterpFw<
1035-
T><<<config.blocks, 512, 0, ctx.cuda_device_context().stream()>>>(
1035+
KeBicubicInterpFw<T><<<config.block_per_grid, 512, 0,
1036+
ctx.cuda_device_context().stream()>>>(
10361037
input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
10371038
out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
10381039
}
@@ -1167,10 +1168,10 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx,
11671168
int pixelNum = n * out_cdhw;
11681169

11691170
platform::GpuLaunchConfig config =
1170-
platform::getGpuLaunchConfig(pixelNum, ctx);
1171+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum);
11711172

11721173
if ("trilinear" == interp_method) {
1173-
KeTrilinearInterpFw<T><<<config.blocks, config.threads, 0,
1174+
KeTrilinearInterpFw<T><<<config.block_per_grid, config.thread_per_block, 0,
11741175
ctx.cuda_device_context().stream()>>>(
11751176
input_data, in_d, in_h, in_w, n, in_cdhw, output_data, out_d, out_h,
11761177
out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners,
@@ -1259,10 +1260,10 @@ static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx,
12591260
int pixelNum = n * out_cw;
12601261

12611262
platform::GpuLaunchConfig config =
1262-
platform::getGpuLaunchConfig(pixelNum, ctx);
1263+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum);
12631264

12641265
if ("linear" == interp_method) {
1265-
KeLinearInterpBw<T><<<config.blocks, config.threads, 0,
1266+
KeLinearInterpBw<T><<<config.block_per_grid, config.thread_per_block, 0,
12661267
ctx.cuda_device_context().stream()>>>(
12671268
input_grad_data, in_w, in_cw, output_grad_data, out_w, n, out_cw, c,
12681269
ratio_w, align_corners, align_mode, data_layout);
@@ -1376,22 +1377,23 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx,
13761377
int pixelNum = n * out_chw;
13771378

13781379
platform::GpuLaunchConfig config =
1379-
platform::getGpuLaunchConfig(pixelNum, ctx);
1380+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum);
13801381

13811382
if ("nearest" == interp_method) {
1382-
KeNearestNeighborInterpBw<T><<<config.blocks, config.threads, 0,
1383-
ctx.cuda_device_context().stream()>>>(
1383+
KeNearestNeighborInterpBw<
1384+
T><<<config.block_per_grid, config.thread_per_block, 0,
1385+
ctx.cuda_device_context().stream()>>>(
13841386
input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
13851387
n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
13861388
} else if ("bilinear" == interp_method) {
1387-
KeBilinearInterpBw<T><<<config.blocks, config.threads, 0,
1389+
KeBilinearInterpBw<T><<<config.block_per_grid, config.thread_per_block, 0,
13881390
ctx.cuda_device_context().stream()>>>(
13891391
input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
13901392
n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode,
13911393
data_layout);
13921394
} else if ("bicubic" == interp_method) {
1393-
KeBicubicInterpBw<
1394-
T><<<config.blocks, 512, 0, ctx.cuda_device_context().stream()>>>(
1395+
KeBicubicInterpBw<T><<<config.block_per_grid, 512, 0,
1396+
ctx.cuda_device_context().stream()>>>(
13951397
input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
13961398
n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
13971399
}
@@ -1520,10 +1522,10 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx,
15201522
int pixelNum = n * out_cdhw;
15211523

15221524
platform::GpuLaunchConfig config =
1523-
platform::getGpuLaunchConfig(pixelNum, ctx);
1525+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum);
15241526

15251527
if ("trilinear" == interp_method) {
1526-
KeTrilinearInterpBw<T><<<config.blocks, config.threads, 0,
1528+
KeTrilinearInterpBw<T><<<config.block_per_grid, config.thread_per_block, 0,
15271529
ctx.cuda_device_context().stream()>>>(
15281530
input_grad_data, in_d, in_h, in_w, n, in_cdhw, output_grad_data, out_d,
15291531
out_h, out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners,

paddle/fluid/operators/math/segment_pooling.cu

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@ limitations under the License. */
1717
#include "paddle/fluid/operators/math/math_function.h"
1818
#include "paddle/fluid/operators/math/segment_pooling.h"
1919
#include "paddle/fluid/platform/cuda_primitives.h"
20-
#include "paddle/fluid/platform/gpu_launch_param_config.h"
20+
#include "paddle/fluid/platform/gpu_launch_config.h"
2121

2222
namespace paddle {
2323
namespace operators {

paddle/fluid/operators/mish_op.cu

Lines changed: 12 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -87,8 +87,9 @@ class MishCUDAKernel : public framework::OpKernel<T> {
8787

8888
const int numel = x->numel();
8989

90-
platform::GpuLaunchConfig config = platform::getGpuLaunchConfig(numel, ctx);
91-
KeMishFw<T><<<config.blocks, config.threads, 0,
90+
platform::GpuLaunchConfig config =
91+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), numel);
92+
KeMishFw<T><<<config.block_per_grid, config.thread_per_block, 0,
9293
ctx.cuda_device_context().stream()>>>(x_data, out_data, numel,
9394
threshold);
9495
}
@@ -108,8 +109,9 @@ class MishFP32CUDAKernel : public framework::OpKernel<float> {
108109

109110
const int numel = x->numel();
110111

111-
platform::GpuLaunchConfig config = platform::getGpuLaunchConfig(numel, ctx);
112-
KeMishFwFP32<<<config.blocks, config.threads, 0,
112+
platform::GpuLaunchConfig config =
113+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), numel);
114+
KeMishFwFP32<<<config.block_per_grid, config.thread_per_block, 0,
113115
ctx.cuda_device_context().stream()>>>(x_data, out_data,
114116
numel, threshold);
115117
}
@@ -131,8 +133,9 @@ class MishGradCUDAKernel : public framework::OpKernel<T> {
131133

132134
const int numel = x->numel();
133135

134-
platform::GpuLaunchConfig config = platform::getGpuLaunchConfig(numel, ctx);
135-
KeMishBw<T><<<config.blocks, config.threads, 0,
136+
platform::GpuLaunchConfig config =
137+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), numel);
138+
KeMishBw<T><<<config.block_per_grid, config.thread_per_block, 0,
136139
ctx.cuda_device_context().stream()>>>(
137140
x_data, dout_data, dx_data, numel, threshold);
138141
}
@@ -154,8 +157,9 @@ class MishGradFP32CUDAKernel : public framework::OpKernel<float> {
154157

155158
const int numel = x->numel();
156159

157-
platform::GpuLaunchConfig config = platform::getGpuLaunchConfig(numel, ctx);
158-
KeMishBwFP32<<<config.blocks, config.threads, 0,
160+
platform::GpuLaunchConfig config =
161+
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), numel);
162+
KeMishBwFP32<<<config.block_per_grid, config.thread_per_block, 0,
159163
ctx.cuda_device_context().stream()>>>(
160164
x_data, dout_data, dx_data, numel, threshold);
161165
}

paddle/fluid/operators/mv_op.cu

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
1313
limitations under the License. */
1414

1515
#include "paddle/fluid/operators/mv_op.h"
16-
#include "paddle/fluid/platform/gpu_launch_param_config.h"
16+
#include "paddle/fluid/platform/gpu_launch_config.h"
1717

1818
namespace paddle {
1919
namespace operators {

paddle/fluid/operators/segment_pool_op.cu

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ limitations under the License. */
1515
#include "paddle/fluid/operators/gather.cu.h"
1616
#include "paddle/fluid/operators/segment_pool_op.h"
1717
#include "paddle/fluid/platform/cuda_primitives.h"
18-
#include "paddle/fluid/platform/gpu_launch_param_config.h"
18+
#include "paddle/fluid/platform/gpu_launch_config.h"
1919

2020
namespace ops = paddle::operators;
2121
REGISTER_OP_CUDA_KERNEL(

0 commit comments

Comments (0)