Merged

27 commits (changes shown from 21 commits)
7d58b91
Merge pull request #1 from PaddlePaddle/develop
AnnaTrainingG Mar 25, 2021
1021e08
Merge pull request #2 from PaddlePaddle/develop
AnnaTrainingG Mar 29, 2021
43f53fe
Merge pull request #3 from PaddlePaddle/develop
AnnaTrainingG Apr 19, 2021
d25ab26
Merge pull request #4 from PaddlePaddle/develop
AnnaTrainingG May 7, 2021
8c8717f
Merge pull request #5 from PaddlePaddle/develop
AnnaTrainingG May 25, 2021
9ddf5e8
Merge pull request #6 from PaddlePaddle/develop
AnnaTrainingG May 26, 2021
b0cbcca
Merge pull request #9 from PaddlePaddle/develop
AnnaTrainingG Jun 1, 2021
cdecaf0
Merge pull request #14 from PaddlePaddle/develop
AnnaTrainingG Jun 11, 2021
0da14c9
Merge pull request #16 from PaddlePaddle/develop
AnnaTrainingG Jun 15, 2021
ca95763
Merge pull request #17 from PaddlePaddle/develop
AnnaTrainingG Jun 22, 2021
25ba21c
Merge pull request #18 from PaddlePaddle/develop
AnnaTrainingG Jul 5, 2021
3ce9983
Merge pull request #19 from PaddlePaddle/develop
AnnaTrainingG Jul 6, 2021
61842ed
Merge pull request #20 from PaddlePaddle/develop
AnnaTrainingG Jul 12, 2021
0e2c73b
Merge pull request #21 from PaddlePaddle/develop
AnnaTrainingG Jul 28, 2021
c1e59cf
Merge pull request #22 from PaddlePaddle/develop
AnnaTrainingG Aug 2, 2021
3a54149
Merge pull request #23 from PaddlePaddle/develop
AnnaTrainingG Aug 4, 2021
7addd79
Merge pull request #24 from PaddlePaddle/develop
AnnaTrainingG Aug 11, 2021
1e843d1
Merge pull request #25 from PaddlePaddle/develop
AnnaTrainingG Aug 23, 2021
0ee3411
add ElementwiseTernary, Reduce, ReadDataStride
AnnaTrainingG Aug 23, 2021
f763e02
delete divFunctor
AnnaTrainingG Aug 23, 2021
b0c3dcd
add writedataBase
AnnaTrainingG Aug 23, 2021
3b74aaa
delete cast and remove cast
AnnaTrainingG Aug 23, 2021
cdcfcda
update
AnnaTrainingG Aug 23, 2021
c476bba
update
AnnaTrainingG Aug 25, 2021
4c28141
add notes
AnnaTrainingG Aug 27, 2021
acbe8e6
add notes and change the name of expFunctor
AnnaTrainingG Aug 31, 2021
6c6ea8c
update
AnnaTrainingG Aug 31, 2021
155 changes: 132 additions & 23 deletions paddle/fluid/operators/kernel_primitives/compute_primitives.h
@@ -22,13 +22,22 @@
#endif

#include <algorithm>
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/float16.h"

namespace paddle {
namespace operators {
namespace kernel_primitives {
namespace details {

#ifdef __HIPCC__
constexpr int kMaxThread = 256;
constexpr int kWarpSize = 64;
#else
constexpr int kMaxThread = 128;
constexpr int kWarpSize = 32;
#endif

template <typename T>
class MPTypeTrait {
public:
@@ -43,24 +52,7 @@ class MPTypeTrait<platform::float16> {

} // namespace details

/*************************** Compute Functor****************************/
template <typename T, typename Enable = void>
struct DivFunctor {
inline HOSTDEVICE T operator()(const T* args) const {
return args[0] / args[1];
}
};

template <typename T>
struct DivFunctor<T, typename std::enable_if_t<std::is_integral<T>::value>> {
inline HOSTDEVICE T operator()(const T* args) const {
PADDLE_ENFORCE(args[1] != 0,
platform::errors::InvalidArgument(
"Invalid Argument Error: Integer division by zero "
"encountered in divide. Please check the input value."));
return args[0] / args[1];
}
};
enum ReduceMode { GlobalMode, LocalMode };

/*************************** Compute Function****************************/

@@ -88,7 +80,7 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const T* in1,
}

/**
* @brief fma eg: a * b + c, in1 in2, in3 and out has the same shape
* @brief e.g. a * b + c; in1, in2, in3 and out have the same shape
* @param:
* T : the type of in1 and in2, in3
* NX: the row of in1, in2 and in3
@@ -97,12 +89,16 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const T* in1,
*/
template <typename T, typename OutT, int NX, int NY, int BlockSize,
class OpFunc>
__device__ __forceinline__ void ElementwiseFma(OutT* out, const T* in1,
const T* in2, const T* in3,
OpFunc compute) {
__device__ __forceinline__ void ElementwiseTernary(OutT* out, const T* in1,
const T* in2, const T* in3,
OpFunc compute) {
T args[3];
#pragma unroll
for (int idx = 0; idx < NX * NY; ++idx) {
out[idx] = static_cast<OutT>(compute(in1[idx], in2[idx], in3[idx]));
args[0] = in1[idx];
args[1] = in2[idx];
args[2] = in3[idx];
out[idx] = static_cast<OutT>(compute(args));
}
}
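For reference, a minimal sketch of the ternary calling convention introduced here: the functor receives a pointer to a three-element args array. MulAddFunctor and FusedMulAdd are illustrative names, not part of this change.

#include "paddle/fluid/operators/kernel_primitives/compute_primitives.h"

namespace kps = paddle::operators::kernel_primitives;

// Hypothetical functor: computes args[0] * args[1] + args[2].
template <typename T>
struct MulAddFunctor {
  inline __device__ T operator()(const T* args) const {
    return args[0] * args[1] + args[2];
  }
};

// Each thread holds NX * NY register elements of a, b and c.
template <typename T, int NX, int NY, int BlockSize>
__device__ void FusedMulAdd(T* out, const T* a, const T* b, const T* c) {
  kps::ElementwiseTernary<T, T, NX, NY, BlockSize, MulAddFunctor<T>>(
      out, a, b, c, MulAddFunctor<T>());
}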

@@ -148,6 +144,119 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const T* in,
}
}

__device__ __forceinline__ int SharedMemoryIndex(int index) {
return (threadIdx.y + index) * blockDim.x + threadIdx.x;
}

template <typename T, typename ReduceOp>
__device__ __forceinline__ T WarpReduce(T val, ReduceOp reducer) {
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, true);
for (int stride = details::kWarpSize / 2; stride > 0; stride >>= 1) {
T temp = paddle::platform::CudaShuffleDownSync(mask, val, stride);
val = reducer(val, temp);
}
return val;
}
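A small usage sketch of WarpReduce, assuming the header above is included; AddFunctor and WarpSumKernel are illustrative names. After the shuffle loop, lane 0 of each warp holds the warp-wide sum.

#include "paddle/fluid/operators/kernel_primitives/compute_primitives.h"

namespace kps = paddle::operators::kernel_primitives;

template <typename T>
struct AddFunctor {
  inline __device__ T operator()(const T& a, const T& b) const { return a + b; }
};

__global__ void WarpSumKernel(const float* in, float* warp_sums, int n) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  float val = tid < n ? in[tid] : 0.0f;
  // Lane 0 of every warp receives the sum over the warp's kWarpSize values.
  val = kps::WarpReduce<float, AddFunctor<float>>(val, AddFunctor<float>());
  if (threadIdx.x % kps::details::kWarpSize == 0) {
    warp_sums[tid / kps::details::kWarpSize] = val;  // one partial sum per warp
  }
}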

/* e.g.
* |---------block---------|
* |warp0|warp1|warp2|warp3|
* |0~31|32~63|64~95|96~127| ---->blockDim.x = 128
* \|/ \|/ \|/ \|/ ---->1. First WarpReduce in each warp
* res0 res1 res2 res3 ---->2. Store result of each warp to shared memory
* \ \ / / ---->3. Load the result above from shared memory
* res to warp0 and process the second WarpReduce
*/
template <typename T, typename ReduceOp>
__device__ __forceinline__ T BlockXReduce(T val, ReduceOp reducer) {
__syncthreads();
using details::kWarpSize;
__shared__ T shared[2 * kWarpSize];
int block_dim_x = blockDim.x;
if (blockDim.x > kWarpSize) {
block_dim_x = blockDim.x / kWarpSize;
int lane = threadIdx.x % kWarpSize;
int tid = threadIdx.y * blockDim.x + threadIdx.x;
int wid = tid / kWarpSize;
int bid = threadIdx.y;
val = WarpReduce(val, reducer);
if (lane == 0) {
shared[wid] = val;
}
__syncthreads();
val = shared[bid * block_dim_x + lane];
}

unsigned mask = 0u;
CREATE_SHFL_MASK(mask, true);
for (int stride = 1; stride < block_dim_x; stride <<= 1) {
T temp = paddle::platform::CudaShuffleDownSync(mask, val, stride);
val = reducer(val, temp);
}
return val;
}
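A sketch of the two-stage pattern drawn above; MaxFunctor and BlockMaxKernel are illustrative. With blockDim = (128, 1), thread (0, 0) ends up holding the block-wide maximum.

#include "paddle/fluid/operators/kernel_primitives/compute_primitives.h"

namespace kps = paddle::operators::kernel_primitives;

template <typename T>
struct MaxFunctor {
  inline __device__ T operator()(const T& a, const T& b) const {
    return a > b ? a : b;
  }
};

// Launch with blockDim = (128, 1); assumes gridDim.x * blockDim.x covers the
// input exactly. One result per block.
__global__ void BlockMaxKernel(const float* in, float* block_max) {
  float val = in[blockIdx.x * blockDim.x + threadIdx.x];
  val = kps::BlockXReduce<float, MaxFunctor<float>>(val, MaxFunctor<float>());
  if (threadIdx.x == 0 && threadIdx.y == 0) {
    block_max[blockIdx.x] = val;
  }
}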

template <typename T, typename ReduceOp>
__device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) {
__shared__ T shared_memory[details::kMaxThread];
shared_memory[SharedMemoryIndex(0)] = val;
for (int stride = blockDim.y / 2; stride > 0; stride >>= 1) {
__syncthreads();
if (threadIdx.y < stride && threadIdx.y + stride < blockDim.y) {
T temp = shared_memory[SharedMemoryIndex(stride)];
val = reducer(val, temp);
}
shared_memory[SharedMemoryIndex(0)] = val;
}
return val;
}
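And the complementary case; SumFunctor and ColumnSumKernel are illustrative. With blockDim = (32, 4), BlockYReduce folds the four values that share a threadIdx.x into the threads at threadIdx.y == 0.

#include "paddle/fluid/operators/kernel_primitives/compute_primitives.h"

namespace kps = paddle::operators::kernel_primitives;

template <typename T>
struct SumFunctor {
  inline __device__ T operator()(const T& a, const T& b) const { return a + b; }
};

// Launch with blockDim = (32, 4) over a 4 x width tile; one sum per column.
__global__ void ColumnSumKernel(const float* in, float* col_sums, int width) {
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  float val = (col < width) ? in[threadIdx.y * width + col] : 0.0f;
  val = kps::BlockYReduce<float, SumFunctor<float>>(val, SumFunctor<float>());
  if (threadIdx.y == 0 && col < width) {
    col_sums[col] = val;
  }
}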

template <typename Tx, typename Ty, int SIZE>
__device__ __forceinline__ void Cast(Ty* out, const Tx* in) {
#pragma unroll
for (int i = 0; i < SIZE; i++) {
out[i] = static_cast<Ty>(in[i]);
}
}

// in[NY][NX] -> in[NY]
template <typename T, int NX, int NY, int BlockSize, class OpFunc,
int Mode>
__device__ __forceinline__ void Reduce(T* out, const T* in, OpFunc reducer,
bool reduce_lastDim) {
// GlobalMode: combine each thread's partial results across the block;
// LocalMode: fold the NX values held in this thread's registers.
// Note: the template parameter is named Mode so that it does not shadow the
// ReduceMode enum used below.
if (Mode == ReduceMode::GlobalMode) {
bool block_reduce_y = (!reduce_lastDim) && (blockDim.y > 1);
// blockYReduce
if (block_reduce_y) {
#pragma unroll
for (int i = 0; i < NY; i++) {
out[i] = BlockYReduce<T, OpFunc>(out[i], reducer);
}
}

// blockXReduce
if (reduce_lastDim) {
#pragma unroll
for (int i = 0; i < NY; i++) {
out[i] = BlockXReduce<T, OpFunc>(out[i], reducer);
}
}
} else { // else LocalMode
#pragma unroll
for (int i = 0; i < NY; ++i) {
#pragma unroll
for (int j = 0; j < NX; ++j) {
out[i] = reducer(out[i], in[i * NX + j]);
}
}
}
}
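A sketch of how the two modes are meant to compose; ReduceRegistersThenBlock is an illustrative helper, not part of this diff. LocalMode first folds each thread's NX register values, then GlobalMode combines the per-thread partials across the block.

#include "paddle/fluid/operators/kernel_primitives/compute_primitives.h"

namespace kps = paddle::operators::kernel_primitives;

template <typename T, int NX, int BlockSize, typename OpFunc>
__device__ void ReduceRegistersThenBlock(T* out, const T* in, OpFunc reducer,
                                         bool reduce_lastdim) {
  // Step 1: fold the NX values in this thread's registers into out[0].
  kps::Reduce<T, NX, 1, BlockSize, OpFunc, kps::ReduceMode::LocalMode>(
      out, in, reducer, reduce_lastdim);
  // Step 2: combine out[0] across the block (along x or y, depending on
  // reduce_lastdim and blockDim).
  kps::Reduce<T, 1, 1, BlockSize, OpFunc, kps::ReduceMode::GlobalMode>(
      out, out, reducer, reduce_lastdim);
}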

} // namespace kernel_primitives
} // namespace operators
} // namespace paddle
150 changes: 145 additions & 5 deletions paddle/fluid/operators/kernel_primitives/datamover_primitives.h
@@ -117,6 +117,67 @@ __device__ __forceinline__ void ReadDataBase(T* dst, const T* __restrict__ src,
}
}

// dst[NY][NX];
template <typename Tx, typename Ty, int NX, int NY, int BlockSize>
__device__ __forceinline__ void ReadDataStride(Ty* dst,
const Tx* __restrict__ src,
int stride_nx, int stride_ny) {
if (NY == 1 && NX == 1) {
dst[0] = static_cast<Ty>(src[threadIdx.x]);
} else if (NX == 1) {
int dx = threadIdx.x;
#pragma unroll
for (int idy = 0; idy < NY; ++idy) {
dst[idy] = static_cast<Ty>(src[dx + idy * stride_ny]);
}
} else if (NY == 1) {
#pragma unroll
for (int idx = 0; idx < NX; ++idx) {
dst[idx] = static_cast<Ty>(src[idx * stride_nx]);
}
} else {
int dx = threadIdx.x * NX;
#pragma unroll
for (int idx = 0; idx < NX; ++idx) {
#pragma unroll
for (int idy = 0; idy < NY; ++idy) {
dst[idy * NX + idx] =
static_cast<Ty>(src[idx * stride_nx + dx + idy * stride_ny]);
}
}
}
}

// dst[NY][NX];
template <typename Tx, typename Ty, int NX, int NY, int BlockSize>
__device__ __forceinline__ void ReadDataStride(Ty* dst,
const Tx* __restrict__ src,
int size_nx, int size_ny,
int stride_nx, int stride_ny) {
int dx = threadIdx.x * NX;
int size = size_nx - dx;
#pragma unroll
for (int idx = 0; idx < NX; ++idx) {
if (idx >= size) {
break;
}
#pragma unroll
for (int idy = 0; idy < NY; ++idy) {
if (idy >= size_ny) {
break;
}
dst[idy * NX + idx] =
static_cast<Ty>(src[idx * stride_nx + dx + idy * stride_ny]);
}
}
}
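An illustrative call of the boundary-checked overload; LoadTile, tile_origin, width and height are hypothetical caller-side names. Each thread loads an NY x NX register tile from a row-major buffer.

#include "paddle/fluid/operators/kernel_primitives/datamover_primitives.h"

namespace kps = paddle::operators::kernel_primitives;

template <typename T, int NX, int NY, int BlockSize>
__device__ void LoadTile(T* reg, const T* tile_origin, int width, int height) {
  // stride_nx = 1: the NX elements per row are contiguous;
  // stride_ny = width: stepping to the next row skips a full row.
  kps::ReadDataStride<T, T, NX, NY, BlockSize>(
      reg, tile_origin, /*size_nx=*/width, /*size_ny=*/height,
      /*stride_nx=*/1, /*stride_ny=*/width);
}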
template <typename T, int NX>
__device__ __forceinline__ void Init(T* dst, T init_data) {
#pragma unroll
for (int i = 0; i < NX; i++) {
dst[i] = init_data;
}
}
template <typename T, int NX, int NY, int BlockSize>
__device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src,
int size) {
@@ -125,14 +186,14 @@ __device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src,

// Vector per thread
if (blockDim.x * NX > size) {
ReadDataBase<T, NX, NY, BlockSize>(dst, src, size);
ReadDataStride<T, T, NX, NY, BlockSize>(dst, src, size, NY, 1, 1);
} else {
// Vector type
using VecType = details::VectorType<T, VECTOR_SIZE>;
VecType vec_temp[VECTORS_PER_THREAD];
const VecType* vec_input = reinterpret_cast<const VecType*>(src);
ReadDataBase<VecType, VECTORS_PER_THREAD, NY, BlockSize>(
vec_temp, vec_input, VECTORS_PER_THREAD * blockDim.x);
ReadDataStride<VecType, VecType, VECTORS_PER_THREAD, NY, BlockSize>(
vec_temp, vec_input, 1, 1);
#pragma unroll
for (int idx = 0; idx < NX; ++idx) {
dst[idx] = *(reinterpret_cast<T*>(vec_temp) + idx);
@@ -176,6 +237,75 @@ __device__ __forceinline__ void ReadDataBc(
}
}

// Read data for reduce (no boundary check); index_cal maps a logical index to
// a memory offset.
template <typename T, int NX, int NY, int BlockSize, int ShapeSize,
typename IndexCal>
__device__ __forceinline__ void ReadDataReduce(
T* dst, const T* __restrict__ src, int fix, const IndexCal& index_cal,
int stride_nx, int stride_ny, bool reduce_lastdim) {
int base_offset = fix;
if (reduce_lastdim) {
base_offset += threadIdx.x;
} else {
base_offset += threadIdx.y;
}

if (NX == 1) {
#pragma unroll
for (int ny = 0; ny < NY; ++ny) {
int idx = base_offset + ny * stride_ny;
uint32_t offset = index_cal(idx);
dst[ny] = src[offset];
}
} else {
#pragma unroll
for (int nx = 0; nx < NX; ++nx) {
#pragma unroll
for (int ny = 0; ny < NY; ++ny) {
int idx = base_offset + ny * stride_ny + nx * stride_nx;
uint32_t offset = index_cal(idx);
dst[ny] = src[offset];
}
}
}
}

// Read data for reduce with boundary checks on size_nx / size_ny.
template <typename T, int NX, int NY, int BlockSize, int ShapeSize,
typename IndexCal>
__device__ __forceinline__ void ReadDataReduce(
T* dst, const T* __restrict__ src, int fix, const IndexCal& index_cal,
int size_nx, int size_ny, int stride_nx, int stride_ny,
bool reduce_lastdim) {
int base_offset = fix;
if (reduce_lastdim) {
base_offset += threadIdx.x;
} else {
base_offset += threadIdx.y;
}

if (NX == 1) {
#pragma unroll
for (int ny = 0; ny < NY; ++ny) {
if (base_offset >= size_ny) break;
uint32_t offset = index_cal(base_offset);
dst[ny] = src[offset];
base_offset += stride_ny;
}
} else {
#pragma unroll
for (int nx = 0; nx < NX; ++nx) {
if (nx * stride_nx >= size_nx) break;
#pragma unroll
for (int ny = 0; ny < NY; ++ny) {
if (base_offset >= size_ny) break;
uint32_t offset = index_cal(base_offset);
dst[nx + ny * NX] = src[offset];
base_offset += stride_ny;
}
}
}
}
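A sketch of the IndexCal contract; IdentityIndex and LoadReduceColumn are hypothetical. index_cal maps a logical index along the reduce dimension to a flat memory offset, so a trivial calculator just returns its input.

#include <cstdint>
#include "paddle/fluid/operators/kernel_primitives/datamover_primitives.h"

namespace kps = paddle::operators::kernel_primitives;

// Hypothetical index calculator: logical index == memory offset.
struct IdentityIndex {
  __device__ uint32_t operator()(int logical_idx) const {
    return static_cast<uint32_t>(logical_idx);
  }
};

template <typename T, int NY, int BlockSize>
__device__ void LoadReduceColumn(T* reg, const T* src, int fix, int size_ny,
                                 int stride_ny, bool reduce_lastdim) {
  // NX = 1: each thread walks NY logical indices spaced by stride_ny;
  // ShapeSize is not used by the identity mapping, so 1 is passed.
  kps::ReadDataReduce<T, 1, NY, BlockSize, /*ShapeSize=*/1, IdentityIndex>(
      reg, src, fix, IdentityIndex(), /*size_nx=*/1, size_ny,
      /*stride_nx=*/1, stride_ny, reduce_lastdim);
}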
template <typename T, int NX, int NY, int BlockSize>
__device__ __forceinline__ void WriteDataBase(T* dst, const T* __restrict__ src,
int size) {
@@ -189,6 +319,16 @@ __device__ __forceinline__ void WriteDataBase(T* dst, const T* __restrict__ src,
}
}

template <typename T, int NX, int NY, int BlockSize>
__device__ __forceinline__ void WriteDataBase(T* dst,
const T* __restrict__ src) {
int dx = threadIdx.x * NX;
#pragma unroll
for (int idx = 0; idx < NX; ++idx) {
dst[idx + dx] = src[idx];
}
}

template <typename T, int NX, int NY, int BlockSize>
__device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src,
int size) {
@@ -207,8 +347,8 @@ __device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src,
vec_temp[idx] = *(reinterpret_cast<VecType*>(src) + idx);
}
VecType* vec_dst = reinterpret_cast<VecType*>(dst);
WriteDataBase<VecType, VECTORS_PER_THREAD, NY, BlockSize>(
vec_dst, vec_temp, VECTORS_PER_THREAD * blockDim.x);
WriteDataBase<VecType, VECTORS_PER_THREAD, NY, BlockSize>(vec_dst,
vec_temp);
}
}
