Skip to content

Commit a8d35d9

Browse files
committed
Fix
1 parent 976dd60 commit a8d35d9

File tree

7 files changed

+159
-159
lines changed

7 files changed

+159
-159
lines changed

paddle/phi/kernels/funcs/concat_and_split_functor.cu

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ limitations under the License. */
2424
namespace phi {
2525
namespace funcs {
2626

27-
static inline void GetBlockDims(const phi::GPUContext& context,
27+
static inline void GetBlockDims(const phi::GPUContext& dev_ctx,
2828
int64_t num_rows,
2929
int64_t num_cols,
3030
dim3* block_dims,
@@ -39,7 +39,7 @@ static inline void GetBlockDims(const phi::GPUContext& context,
3939
*block_dims = dim3(block_cols, block_rows, 1);
4040

4141
constexpr int waves = 1;
42-
int max_threads = context.GetMaxPhysicalThreadCount() * waves;
42+
int max_threads = dev_ctx.GetMaxPhysicalThreadCount() * waves;
4343
int64_t max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
4444

4545
int grid_cols =
@@ -605,14 +605,14 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& dev_ctx,
605605

606606
template <typename T>
607607
struct ConcatFunctor<phi::GPUContext, T> {
608-
void operator()(const phi::GPUContext& context,
608+
void operator()(const phi::GPUContext& dev_ctx,
609609
const std::vector<phi::DenseTensor>& input,
610610
int axis,
611611
phi::DenseTensor* output) {
612612
if (output->numel() < std::numeric_limits<int32_t>::max()) {
613-
ConcatFunctorWithIndexType<T, int32_t>(context, input, axis, output);
613+
ConcatFunctorWithIndexType<T, int32_t>(dev_ctx, input, axis, output);
614614
} else {
615-
ConcatFunctorWithIndexType<T, int64_t>(context, input, axis, output);
615+
ConcatFunctorWithIndexType<T, int64_t>(dev_ctx, input, axis, output);
616616
}
617617
}
618618
};
@@ -805,7 +805,7 @@ void SplitFunctorDispatchWithIndexType(
805805
template <typename T>
806806
class SplitFunctor<phi::GPUContext, T> {
807807
public:
808-
void operator()(const phi::GPUContext& context,
808+
void operator()(const phi::GPUContext& dev_ctx,
809809
const phi::DenseTensor& input,
810810
const std::vector<const phi::DenseTensor*>& ref_inputs,
811811
int axis,
@@ -819,10 +819,10 @@ class SplitFunctor<phi::GPUContext, T> {
819819

820820
if (numel < std::numeric_limits<int32_t>::max()) {
821821
SplitFunctorDispatchWithIndexType<T, int32_t>(
822-
context, axis, input, ref_inputs, outputs);
822+
dev_ctx, axis, input, ref_inputs, outputs);
823823
} else {
824824
SplitFunctorDispatchWithIndexType<T, int64_t>(
825-
context, axis, input, ref_inputs, outputs);
825+
dev_ctx, axis, input, ref_inputs, outputs);
826826
}
827827
}
828828
};

paddle/phi/kernels/funcs/fc_functor.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ namespace phi {
2222
namespace funcs {
2323

2424
template <typename DeviceContext, typename T>
25-
void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
25+
void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& dev_ctx,
2626
const int M,
2727
const int N,
2828
const int K,
@@ -32,18 +32,18 @@ void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
3232
const T* B,
3333
bool relu,
3434
bool padding_weights) {
35-
auto blas = GetBlas<DeviceContext, T>(context);
35+
auto blas = GetBlas<DeviceContext, T>(dev_ctx);
3636
phi::DenseTensor Y1;
3737
T* Y1_data = nullptr;
3838
if (padding_weights) {
3939
const int NN = N + 4;
4040
const int KK = K + 4;
4141
phi::DenseTensor X1;
4242
X1.Resize({M * KK});
43-
T* X1_data = context.template HostAlloc<T>(&X1);
43+
T* X1_data = dev_ctx.template HostAlloc<T>(&X1);
4444

4545
Y1.Resize({M * (N + 4)});
46-
Y1_data = context.template HostAlloc<T>(&Y1);
46+
Y1_data = dev_ctx.template HostAlloc<T>(&Y1);
4747
#ifdef PADDLE_WITH_MKLML
4848
#pragma omp parallel for
4949
#endif

paddle/phi/kernels/funcs/fc_functor.cu

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -336,7 +336,7 @@ void AddReluKernel(gpuStream_t stream,
336336
#endif
337337

338338
template <typename DeviceContext, typename T>
339-
void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
339+
void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& dev_ctx,
340340
const int M,
341341
const int N,
342342
const int K,
@@ -350,7 +350,7 @@ void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
350350
false,
351351
errors::PermissionDenied(
352352
"Weight padding in fc can not be used in GPU scope."));
353-
auto blas = phi::funcs::GetBlas<DeviceContext, T>(context);
353+
auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
354354
blas.GEMM(CblasNoTrans,
355355
CblasNoTrans,
356356
M,
@@ -366,7 +366,7 @@ void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
366366
}
367367

368368
// M * N
369-
AddReluKernel(context.stream(), M, N, Y, B, relu);
369+
AddReluKernel(dev_ctx.stream(), M, N, Y, B, relu);
370370
}
371371

372372
template class FCFunctor<GPUContext, float16>;
@@ -375,7 +375,7 @@ template class FCFunctor<GPUContext, double>;
375375

376376
template <typename DeviceContext, typename T>
377377
void FCInt8Functor<DeviceContext, T>::operator()(
378-
const DeviceContext& context,
378+
const DeviceContext& dev_ctx,
379379
const int M,
380380
const int N,
381381
const int K,
@@ -399,9 +399,9 @@ void FCInt8Functor<DeviceContext, T>::operator()(
399399
DenseTensor quant_x_tensor, quant_y_tensor;
400400
quant_x_tensor.Resize(common::make_ddim({M, K}));
401401
quant_y_tensor.Resize(common::make_ddim({M, N}));
402-
context.template Alloc<int8_t>(&quant_x_tensor,
402+
dev_ctx.template Alloc<int8_t>(&quant_x_tensor,
403403
quant_x_tensor.numel() * sizeof(int8_t));
404-
context.template Alloc<int32_t>(&quant_y_tensor,
404+
dev_ctx.template Alloc<int32_t>(&quant_y_tensor,
405405
quant_y_tensor.numel() * sizeof(int32_t));
406406
LaunchQuantKernelWithVecSize<T>(X,
407407
quant_x_tensor.data<int8_t>(),
@@ -411,14 +411,14 @@ void FCInt8Functor<DeviceContext, T>::operator()(
411411
quant_round_type,
412412
quant_max_bound,
413413
quant_min_bound,
414-
context.stream());
414+
dev_ctx.stream());
415415

416416
MatmulKernel<int8_t, GPUContext>(
417-
context, quant_x_tensor, *w_tensor, false, false, &quant_y_tensor);
417+
dev_ctx, quant_x_tensor, *w_tensor, false, false, &quant_y_tensor);
418418

419419
DenseTensor scale_weights_dev;
420420
scale_weights_dev.Resize(common::make_ddim({N}));
421-
context.template Alloc<float>(&scale_weights_dev,
421+
dev_ctx.template Alloc<float>(&scale_weights_dev,
422422
scale_weights_dev.numel() * sizeof(float));
423423
float* scale_weights_dev_ptr = scale_weights_dev.data<float>();
424424
#ifdef PADDLE_WITH_HIP
@@ -436,15 +436,15 @@ void FCInt8Functor<DeviceContext, T>::operator()(
436436
phi::backends::gpu::GpuLaunchConfig config;
437437
if (N % DequantKernelVecSize == 0) {
438438
config = phi::backends::gpu::GetGpuLaunchConfig1D(
439-
context, M * N, DequantKernelVecSize);
439+
dev_ctx, M * N, DequantKernelVecSize);
440440
} else {
441-
config = phi::backends::gpu::GetGpuLaunchConfig1D(context, M * N, 1);
441+
config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, M * N, 1);
442442
}
443443
LaunchDequantKernelWithScaleOfInputAndWeight(quant_y_tensor.data<int32_t>(),
444444
Y,
445445
M,
446446
N,
447-
context.stream(),
447+
dev_ctx.stream(),
448448
&config,
449449
scale_in,
450450
scale_weights_dev_ptr,
@@ -455,7 +455,7 @@ void FCInt8Functor<DeviceContext, T>::operator()(
455455
}
456456

457457
// M * N
458-
AddReluKernel(context.stream(), M, N, Y, B, relu);
458+
AddReluKernel(dev_ctx.stream(), M, N, Y, B, relu);
459459
}
460460

461461
template class FCInt8Functor<GPUContext, float16>;

paddle/phi/kernels/funcs/math_function.cu

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ __global__ void TransposeNormalKernel(const T* in_ptr,
235235

236236
template <typename DeviceContext, typename T>
237237
void TransposeNormal<DeviceContext, T>::operator()(
238-
const DeviceContext& context,
238+
const DeviceContext& dev_ctx,
239239
const phi::DenseTensor& in,
240240
phi::DenseTensor* out,
241241
const std::vector<int>& axis) {
@@ -246,7 +246,7 @@ void TransposeNormal<DeviceContext, T>::operator()(
246246
auto* out_ptr = out->data<T>();
247247

248248
// copy in_stride, out_stride, axis to gpu device
249-
const phi::Place& cuda_place = context.GetPlace();
249+
const phi::Place& cuda_place = dev_ctx.GetPlace();
250250
phi::CPUPlace cpu_place = phi::CPUPlace();
251251
size_t size = 3 * rank * sizeof(int64_t);
252252
auto cpu_buf_holder = phi::memory_utils::Alloc(cpu_place, size);
@@ -259,26 +259,26 @@ void TransposeNormal<DeviceContext, T>::operator()(
259259
cpu_buf[2 * rank + i] = axis[i];
260260
}
261261
memory_utils::Copy(
262-
cuda_place, cuda_buf, cpu_place, cpu_buf, size, context.stream());
262+
cuda_place, cuda_buf, cpu_place, cpu_buf, size, dev_ctx.stream());
263263
REINTERPRET(const int64_t, in_stride_ptr, cuda_buf);
264264
REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank);
265265
REINTERPRET(const int64_t, axis_ptr, cuda_buf + 2 * rank);
266266

267-
const int MAX_BLOCK_DIM = context.GetMaxThreadsPerBlock();
268-
const int MAX_GRID_DIM = context.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM;
267+
const int MAX_BLOCK_DIM = dev_ctx.GetMaxThreadsPerBlock();
268+
const int MAX_GRID_DIM = dev_ctx.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM;
269269
int64_t elements = in.numel();
270270
int block_size = (elements >= MAX_BLOCK_DIM)
271271
? MAX_BLOCK_DIM
272272
: (1 << static_cast<int>(std::log2(elements)));
273273
int grid_size = elements / block_size;
274274
grid_size = (grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : grid_size;
275-
TransposeNormalKernel<T><<<grid_size, block_size, 0, context.stream()>>>(
275+
TransposeNormalKernel<T><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
276276
in_ptr, out_ptr, elements, in_stride_ptr, out_stride_ptr, axis_ptr, rank);
277277
}
278278

279279
template <typename T>
280280
struct TransposeNormal<phi::GPUContext, T> {
281-
void operator()(const phi::GPUContext& context,
281+
void operator()(const phi::GPUContext& dev_ctx,
282282
const DenseTensor& in,
283283
DenseTensor* out,
284284
const std::vector<int>& axis) {
@@ -289,7 +289,7 @@ struct TransposeNormal<phi::GPUContext, T> {
289289
auto* out_ptr = out->data<T>();
290290

291291
// copy in_stride, out_stride, axis to gpu device
292-
const phi::Place& cuda_place = context.GetPlace();
292+
const phi::Place& cuda_place = dev_ctx.GetPlace();
293293
phi::CPUPlace cpu_place = phi::CPUPlace();
294294
size_t size = 3 * rank * sizeof(int64_t);
295295
auto cpu_buf_holder = phi::memory_utils::Alloc(cpu_place, size);
@@ -302,22 +302,22 @@ struct TransposeNormal<phi::GPUContext, T> {
302302
cpu_buf[2 * rank + i] = axis[i];
303303
}
304304
memory_utils::Copy(
305-
cuda_place, cuda_buf, cpu_place, cpu_buf, size, context.stream());
305+
cuda_place, cuda_buf, cpu_place, cpu_buf, size, dev_ctx.stream());
306306
REINTERPRET(const int64_t, in_stride_ptr, cuda_buf);
307307
REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank);
308308
REINTERPRET(const int64_t, axis_ptr, cuda_buf + 2 * rank);
309309

310-
const int MAX_BLOCK_DIM = context.GetMaxThreadsPerBlock();
310+
const int MAX_BLOCK_DIM = dev_ctx.GetMaxThreadsPerBlock();
311311
const int MAX_GRID_DIM =
312-
context.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM;
312+
dev_ctx.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM;
313313
int64_t elements = in.numel();
314314
int block_size = (elements >= MAX_BLOCK_DIM)
315315
? MAX_BLOCK_DIM
316316
: (1 << static_cast<int>(std::log2(elements)));
317317
int grid_size = elements / block_size;
318318
grid_size = (grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : grid_size;
319319
TransposeNormalKernel<T>
320-
<<<grid_size, block_size, 0, context.stream()>>>(in_ptr,
320+
<<<grid_size, block_size, 0, dev_ctx.stream()>>>(in_ptr,
321321
out_ptr,
322322
elements,
323323
in_stride_ptr,
@@ -347,30 +347,30 @@ DEFINE_GPU_TRANS_NORMAL(phi::dtype::complex<float>);
347347
DEFINE_GPU_TRANS_NORMAL(phi::dtype::complex<double>);
348348

349349
struct TensorSetConstantGPU {
350-
TensorSetConstantGPU(const phi::DeviceContext& context,
350+
TensorSetConstantGPU(const phi::DeviceContext& dev_ctx,
351351
phi::DenseTensor* tensor,
352352
float value)
353-
: context_(context), tensor_(tensor), value_(value) {}
353+
: dev_ctx_(dev_ctx), tensor_(tensor), value_(value) {}
354354

355355
template <typename T>
356356
void apply() const {
357357
SetConstant<phi::GPUContext, T> functor;
358-
functor(reinterpret_cast<const phi::GPUContext&>(context_),
358+
functor(reinterpret_cast<const phi::GPUContext&>(dev_ctx_),
359359
tensor_,
360360
static_cast<T>(value_));
361361
}
362362

363-
const phi::DeviceContext& context_;
363+
const phi::DeviceContext& dev_ctx_;
364364
phi::DenseTensor* tensor_;
365365
float value_;
366366
};
367367

368368
template <>
369-
void set_constant_with_place<phi::GPUPlace>(const phi::DeviceContext& context,
369+
void set_constant_with_place<phi::GPUPlace>(const phi::DeviceContext& dev_ctx,
370370
phi::DenseTensor* tensor,
371371
float value) {
372372
phi::VisitDataType(tensor->dtype(),
373-
TensorSetConstantGPU(context, tensor, value));
373+
TensorSetConstantGPU(dev_ctx, tensor, value));
374374
}
375375

376376
template <typename T>
@@ -386,7 +386,7 @@ __global__ void RowwiseAddKernel(
386386

387387
template <typename T>
388388
struct RowwiseAdd<phi::GPUContext, T> {
389-
void operator()(const phi::GPUContext& context,
389+
void operator()(const phi::GPUContext& dev_ctx,
390390
const phi::DenseTensor& input,
391391
const phi::DenseTensor& vector,
392392
phi::DenseTensor* output) {
@@ -415,7 +415,7 @@ struct RowwiseAdd<phi::GPUContext, T> {
415415
out_dims_cstr));
416416
int blocks = 512;
417417
int grids = (input.numel() + blocks - 1) / blocks;
418-
RowwiseAddKernel<T><<<grids, blocks, 0, context.stream()>>>(
418+
RowwiseAddKernel<T><<<grids, blocks, 0, dev_ctx.stream()>>>(
419419
input.data<T>(),
420420
vector.data<T>(),
421421
output->data<T>(),

paddle/phi/kernels/funcs/math_function_blas_impl.h

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ namespace funcs {
3232
// and only failed for this case. So reimplemented it.
3333
template <>
3434
void ColwiseSum<phi::GPUContext, double>::operator()(
35-
const phi::GPUContext& context,
35+
const phi::GPUContext& dev_ctx,
3636
const phi::DenseTensor& input,
3737
phi::DenseTensor* vector) {
3838
auto in_dims = input.dims();
@@ -47,11 +47,11 @@ void ColwiseSum<phi::GPUContext, double>::operator()(
4747
vector->numel()));
4848
phi::DenseTensor one;
4949
one.Resize({in_dims[0]});
50-
context.template Alloc<double>(&one);
50+
dev_ctx.template Alloc<double>(&one);
5151

5252
SetConstant<phi::GPUContext, double> set;
53-
set(context, &one, static_cast<double>(1.0));
54-
phi::funcs::GetBlas<phi::GPUContext, double>(context).GEMV(
53+
set(dev_ctx, &one, static_cast<double>(1.0));
54+
phi::funcs::GetBlas<phi::GPUContext, double>(dev_ctx).GEMV(
5555
true,
5656
static_cast<int>(in_dims[0]),
5757
static_cast<int>(in_dims[1]),
@@ -68,7 +68,7 @@ void ColwiseSum<phi::GPUContext, double>::operator()(
6868
// mode,
6969
template <>
7070
void RowwiseSum<phi::GPUContext, double>::operator()(
71-
const phi::GPUContext& context,
71+
const phi::GPUContext& dev_ctx,
7272
const phi::DenseTensor& input,
7373
phi::DenseTensor* vector) {
7474
auto in_dims = input.dims();
@@ -83,11 +83,11 @@ void RowwiseSum<phi::GPUContext, double>::operator()(
8383
vector->numel()));
8484
phi::DenseTensor one;
8585
one.Resize({size});
86-
context.template Alloc<double>(&one);
86+
dev_ctx.template Alloc<double>(&one);
8787

8888
SetConstant<phi::GPUContext, double> set;
89-
set(context, &one, static_cast<double>(1.0));
90-
phi::funcs::GetBlas<phi::GPUContext, double>(context).GEMV(
89+
set(dev_ctx, &one, static_cast<double>(1.0));
90+
phi::funcs::GetBlas<phi::GPUContext, double>(dev_ctx).GEMV(
9191
true,
9292
static_cast<int>(in_dims[1]),
9393
static_cast<int>(in_dims[0]),

0 commit comments

Comments (0)