Skip to content
Merged
Changes from 16 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
8f532b0
Merge pull request #1 from PaddlePaddle/develop
AshburnLee Sep 8, 2020
5b5804d
Merge pull request #2 from PaddlePaddle/develop
AshburnLee Sep 17, 2020
cee2470
Merge pull request #3 from PaddlePaddle/develop
AshburnLee Sep 30, 2020
5be3a45
Merge pull request #4 from PaddlePaddle/develop
AshburnLee Oct 13, 2020
a1d92b7
Merge pull request #5 from PaddlePaddle/develop
AshburnLee Oct 20, 2020
e674a5d
Merge pull request #6 from PaddlePaddle/develop
AshburnLee Nov 15, 2020
855d00b
Merge pull request #7 from PaddlePaddle/develop
AshburnLee Nov 18, 2020
20a37a8
Merge branch 'develop' of https://github.com/PaddlePaddle/paddle into…
AshburnLee Mar 15, 2021
82328a7
temporary PR for log_softmax
AshburnLee Mar 15, 2021
f6ece4d
Logsoftmax forward case#1: axis=-1
AshburnLee Mar 16, 2021
0f56b5e
Merge branch 'develop' of https://github.com/PaddlePaddle/paddle into…
AshburnLee Mar 16, 2021
4d5533b
Changed copyright
AshburnLee Mar 16, 2021
060953b
Made modifications according to PR reviewers
AshburnLee Mar 17, 2021
eb14185
Merge branch 'develop' of https://github.com/PaddlePaddle/paddle into…
AshburnLee Mar 17, 2021
302f08d
Dealt with unittest precision errors
AshburnLee Mar 18, 2021
844b880
Merge branch 'develop' of https://github.com/PaddlePaddle/paddle into…
AshburnLee Mar 18, 2021
26e1850
change launch configuration and code style
AshburnLee Mar 23, 2021
66c48ae
Merge branch 'develop' of https://github.com/PaddlePaddle/paddle into…
AshburnLee Mar 23, 2021
f2a2f2e
Removed header file cuda_runtime.h for HIP support
AshburnLee Mar 23, 2021
ab96a80
Merge branch 'develop' of https://github.com/PaddlePaddle/paddle into…
AshburnLee Mar 23, 2021
bf320c7
Modified code according to review comments
AshburnLee Mar 24, 2021
c5404ce
Merge branch 'develop' of https://github.com/PaddlePaddle/paddle into…
AshburnLee Mar 24, 2021
c7d785e
Reply to review comments
AshburnLee Apr 8, 2021
480a52f
Merge branch 'develop' of https://github.com/PaddlePaddle/paddle into…
AshburnLee Apr 8, 2021
0c1aec6
cudaStream_t -> gpuStream_t
AshburnLee Apr 9, 2021
24cd730
Merge branch 'develop' of https://github.com/PaddlePaddle/paddle into…
AshburnLee Apr 9, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
193 changes: 193 additions & 0 deletions paddle/fluid/operators/log_softmax_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,200 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include <cuda_runtime.h>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

HIP上会找不到cuda_runtime.h,可以试试看删掉这个头文件应该也可以运行,或者写成

#ifdef __HIPCC__
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

#include <limits>
#include "paddle/fluid/operators/log_softmax_op.h"
#include "paddle/fluid/platform/cuda_device_function.h"

namespace paddle {
namespace operators {

// Upper bound on the number of lanes that cooperate on one softmax row; the
// kernel below clamps its per-row lane count to this value.
#define WARP_SIZE 32

// Expands to one `case` of the dispatch switch in the host launcher: when the
// switch value equals L2E it launches WarpLogSoftmaxForward instantiated with
// log2_elements == L2E and then breaks out of the switch.
// The expansion site must have `T`, `blocks`, `threads`, `dst`, `src`,
// `batch_count`, `softmax_elements_stride` and `softmax_elements` in scope.
// The accumulation type is hard-coded to double here, and the launch requests
// 0 bytes of dynamic shared memory.
// NOTE(review): no stream argument is passed to the launch -- confirm whether
// the device context's stream should be used instead.
// (No comments may be placed inside the macro body: every line ends with a
// line-continuation backslash.)
#define LAUNCH_SOFTMAX_WARP_FORWARD(L2E) \
case L2E: \
WarpLogSoftmaxForward<T, double, L2E><<<blocks, threads, 0>>>( \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

不要都用double,double速度会很慢。

dst, src, batch_count, softmax_elements_stride, softmax_elements); \
break;

// Returns the smallest non-negative exponent e such that (1 << e) >= value,
// i.e. ceil(log2(value)) for value >= 1. Any value <= 1 (including zero and
// negatives) yields 0.
//
// The power of two is evaluated in 64-bit arithmetic: with a plain `int`
// shift, inputs above 2^30 would require computing 1 << 31 (and beyond),
// which overflows a 32-bit int and is undefined behaviour. With the wide
// shift the loop always terminates, and INT_MAX maps to 31.
int LogTwoCeil(int value) {
  int log2_value = 0;
  while ((1LL << log2_value) < value) {
    ++log2_value;
  }
  return log2_value;
}

// Sums each of the NumBatch per-thread partial values in `sum` across a group
// of KernelWarpSize lanes, in place. The partner lane's value is fetched with
// an XOR shuffle at offsets KernelWarpSize/2, KernelWarpSize/4, ..., 1, so
// after the last step every lane of the group holds the same total.
// All shuffles are issued with the full 0xFFFFFFFF participant mask.
template <typename T, int NumBatch, int KernelWarpSize>
__device__ __forceinline__ void ReduceSumForWarpBatch(T* sum) {
#pragma unroll
  for (int lane_mask = KernelWarpSize / 2; lane_mask > 0; lane_mask /= 2) {
#pragma unroll
    for (int batch = 0; batch < NumBatch; ++batch) {
      // Value currently held by the lane whose id differs in `lane_mask`.
      const T partner =
          platform::CudaShuffleXorSync(0xFFFFFFFF, sum[batch], lane_mask);
      sum[batch] = sum[batch] + partner;
    }
  }
}

// Replaces each of the NumBatch per-thread values in `sum` with the maximum
// over a group of KernelWarpSize lanes, in place. Despite its name, `sum`
// holds running maxima here. The partner lane's value is fetched with an XOR
// shuffle at offsets KernelWarpSize/2, KernelWarpSize/4, ..., 1, so after the
// last step every lane of the group holds the same maximum.
// All shuffles are issued with the full 0xFFFFFFFF participant mask.
template <typename T, int NumBatch, int KernelWarpSize>
__device__ __forceinline__ void ReduceMaxForWarpBatch(T* sum) {
#pragma unroll
  for (int lane_mask = KernelWarpSize / 2; lane_mask > 0; lane_mask /= 2) {
#pragma unroll
    for (int batch = 0; batch < NumBatch; ++batch) {
      // Value currently held by the lane whose id differs in `lane_mask`.
      const T partner =
          platform::CudaShuffleXorSync(0xFFFFFFFF, sum[batch], lane_mask);
      sum[batch] = max(sum[batch], partner);
    }
  }
}

// Log-softmax forward kernel for rows of at most (1 << log2_elements) values.
// For every row it writes x - max(x) - log(sum(exp(x - max(x)))) to dst.
//
// Template parameters:
//   T             - element type of src/dst.
//   AccT          - type used for the per-thread copies, max and sum.
//   log2_elements - compile-time exponent; per-thread storage is sized for
//                   rows of (1 << log2_elements) elements.
// Parameters:
//   dst, src      - output / input buffers.
//   batch_size    - number of rows; rows at or beyond it are neither loaded
//                   nor stored.
//   stride        - row pitch used when offsetting src/dst to this thread's
//                   first row.
//   element_count - number of valid elements per row.
//
// Thread layout read by the code: threadIdx.x is the lane within a row,
// threadIdx.y together with blockIdx.x selects the group of num_batch rows.
// The reductions exchange values between lanes with shuffles, so each row is
// handled by kernel_warp_size lanes and no shared memory is used.
template <typename T, typename AccT, int log2_elements>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

模板里面的变量其实是常量,命名用AxxBxx形式,以跟函数里面的变量区分。

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

__global__ void WarpLogSoftmaxForward(T* dst, const T* src, int batch_size,
int stride, int element_count) {
// Compile-time tiling: lanes per row, elements per lane, rows per thread.
// The host launcher derives the same lane and rows-per-thread values.
constexpr int next_power_of_two = 1 << log2_elements;
constexpr int kernel_warp_size =
(next_power_of_two < WARP_SIZE) ? next_power_of_two : WARP_SIZE;
constexpr int warp_iterations = next_power_of_two / kernel_warp_size;
constexpr int num_batch = (next_power_of_two <= 128) ? 2 : 1;

// First row owned by this thread, and how many of its num_batch rows really
// exist. local_batches may be zero or negative for threads past the end; the
// loops below then treat every row as empty.
int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * num_batch;
int local_batches = batch_size - first_batch;
if (local_batches > num_batch) local_batches = num_batch;

// Advance both pointers to this lane's first element of its first row.
int local_idx = threadIdx.x;
src += first_batch * stride + local_idx;
dst += first_batch * stride + local_idx;

// 1.load data from global memory
AccT elements[num_batch][warp_iterations];
// NOTE(review): idx is not read anywhere in the visible kernel body.
int idx = threadIdx.x + blockDim.x * threadIdx.y;

// NOTE(review): rows after the first are addressed with element_count while
// the base offset above uses stride -- confirm the two are always equal.
// NOTE(review): the load below casts to double rather than AccT.
for (int i = 0; i < num_batch; ++i) {
// Rows beyond the batch get a count of 0 so that nothing is loaded for them.
int batch_element_count = (i >= local_batches) ? 0 : element_count;
for (int it = 0; it < warp_iterations; ++it) {
int element_index = local_idx + it * kernel_warp_size;
if (element_index < batch_element_count) {
elements[i][it] =
static_cast<double>(src[i * element_count + it * kernel_warp_size]);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

不要都用double

} else {
// Padding slots hold -inf: they never win the max and add exp(-inf) = 0 to
// the sum.
elements[i][it] = -std::numeric_limits<AccT>::infinity();
}
}
}

// 2.compute max_value
// Per-thread maximum first, then combined across the lanes of the row.
AccT max_value[num_batch];
#pragma unroll
for (int i = 0; i < num_batch; ++i) {
max_value[i] = elements[i][0];
#pragma unroll
for (int it = 1; it < warp_iterations; ++it) {
max_value[i] =
(max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it];
}
}
ReduceMaxForWarpBatch<AccT, num_batch, kernel_warp_size>(max_value);

// Sum of exp(x - max): per-thread partial sums, then combined across lanes.
AccT sum[num_batch]{0.0f};
#pragma unroll
for (int i = 0; i < num_batch; ++i) {
#pragma unroll
for (int it = 0; it < warp_iterations; ++it) {
sum[i] += std::exp(elements[i][it] - max_value[i]);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

float16的时候会有问题吗?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

是因为__shfl_xor_sync&__shfl_xor不支持fp16。应该是可以处理的

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done,已处理。

}
}
ReduceSumForWarpBatch<AccT, num_batch, kernel_warp_size>(sum);

// 3.store result
#pragma unroll
for (int i = 0; i < num_batch; ++i) {
// Rows beyond the batch were never loaded, so nothing is written for them.
if (i >= local_batches) break;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这种if语句分行写,并且都加上{}。

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

// From here on sum[i] holds log(sum(exp(x - max))) for row i.
sum[i] = std::log(sum[i]);
#pragma unroll
for (int it = 0; it < warp_iterations; ++it) {
int element_index = local_idx + it * kernel_warp_size;
if (element_index < element_count) {
dst[i * element_count + it * kernel_warp_size] =
elements[i][it] - max_value[i] - sum[i];
} else {
// element_index only grows with `it`, so later iterations are out of range
// as well.
break;
}
}
}
}

// Host-side launcher for WarpLogSoftmaxForward.
// Computes ceil(log2(softmax_elements)), derives the block/grid shape from it
// and dispatches to the kernel instantiation for that exponent.
//
// Parameters:
//   dst, src                - output / input buffers handed to the kernel.
//   softmax_elements        - number of elements per row.
//   softmax_elements_stride - row pitch forwarded to the kernel.
//   batch_count             - number of rows.
//
// Only exponents 0..10 (rows of up to 1024 elements) have a kernel case.
// NOTE(review): for larger rows the switch falls into `default` and returns
// without launching anything or reporting an error.
template <typename T>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

模板设置:

template <typename T, typename AccT>
void LaunchSoftmaxForwardForLastAxis(....) {
    ...
}

外层调用:LaunchSoftmaxForwardForLastAxis<T, MPTypeTrait<T>::Type>(...),即可解决模板调用中的double。MPTypeTrait的定义见:

template <typename T>
class MPTypeTrait {
public:
using Type = T;
};
template <>
class MPTypeTrait<platform::float16> {
public:
using Type = float;
};

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. 谢谢提供的解决方案!

void LaunchSoftmaxForwardForLastAxis(T* dst, const T* src, int softmax_elements,
int softmax_elements_stride,
int batch_count) {
// These values mirror the compile-time constants computed inside the kernel
// (lanes per row and rows per thread) and must stay in sync with them.
int log2_elements = LogTwoCeil(softmax_elements);
const int next_power_of_two = 1 << log2_elements;
int warp_size =
(next_power_of_two < WARP_SIZE) ? next_power_of_two : WARP_SIZE;
int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;

// use 128 threads per block to maximize gpu utilization
constexpr int threads_per_block = 128;
int warps_per_block = (threads_per_block / warp_size);
int batches_per_block = warps_per_block * batches_per_warp;
// Ceil-division so that a trailing partial block still covers the last rows.
int blocks = (batch_count + batches_per_block - 1) / batches_per_block;
// x: lanes cooperating on a row; y: row groups per block.
dim3 threads(warp_size, warps_per_block, 1);

// Each macro line expands to `case N: <kernel launch>; break;`.
switch (log2_elements) {
LAUNCH_SOFTMAX_WARP_FORWARD(0); // 1
LAUNCH_SOFTMAX_WARP_FORWARD(1); // 2
LAUNCH_SOFTMAX_WARP_FORWARD(2); // 4
LAUNCH_SOFTMAX_WARP_FORWARD(3); // 8
LAUNCH_SOFTMAX_WARP_FORWARD(4); // 16
LAUNCH_SOFTMAX_WARP_FORWARD(5); // 32
LAUNCH_SOFTMAX_WARP_FORWARD(6); // 64
LAUNCH_SOFTMAX_WARP_FORWARD(7); // 128
LAUNCH_SOFTMAX_WARP_FORWARD(8); // 256
LAUNCH_SOFTMAX_WARP_FORWARD(9); // 512
LAUNCH_SOFTMAX_WARP_FORWARD(10); // 1024
default:
break;
}
}

// CUDA specialization of the log_softmax operator kernel.
// Compute() reads input tensor "X" and attribute "axis", writes output tensor
// "Out", and picks one of two implementations: the warp-level CUDA kernel in
// this file for small rows along the innermost axis, or LogSoftmaxFunctor for
// everything else.
template <typename T>
class LogSoftmaxKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
// Runs the operator once for the tensors and attributes held by `context`.
// Raises InvalidArgument (via PADDLE_ENFORCE_GT) when X has no elements.
void Compute(const framework::ExecutionContext& context) const override {
const auto* X = context.Input<framework::Tensor>("X");
auto* Out = context.Output<framework::Tensor>("Out");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

变量名命名:axx_bxx

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

变量名都改为了这种形式。

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

X、Out还没改。

const auto* input_data = X->data<T>();
auto* output_data = Out->mutable_data<T>(context.GetPlace());

// NOTE(review): this emptiness check runs after the data pointers above have
// already been requested -- confirm that order is intended.
PADDLE_ENFORCE_GT(X->numel(), 0, platform::errors::InvalidArgument(
"Expected number of elements > 0. But "
"received number of elements is %d.",
X->numel()));
const int rank = X->dims().size();
// presumably maps a negative axis into [0, rank) -- CanonicalAxis is defined
// outside this file; confirm.
const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);

// dim_size: extent of the softmax axis; inner_size: product of the extents
// after it.
// NOTE(review): extents are stored in int -- confirm they cannot exceed the
// int range.
int dim_size = X->dims()[axis];
int inner_size = 1;
for (int i = axis + 1; i < X->dims().size(); i++)
inner_size *= X->dims()[i];
// presumably the product of the extents before `axis` -- SizeToAxis is
// defined outside this file; confirm. The initial value 1 is overwritten
// immediately.
int outer_size = 1;
outer_size = SizeToAxis(axis, X->dims());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

191和192可以合成1行。

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.


// Fast path: nothing follows the softmax axis (inner_size == 1), the row has
// at most 1024 elements (the largest case the launcher dispatches), and a row
// occupies at most 4096 bytes -- for an 8-byte T that caps dim_size at 512.
if (inner_size == 1 && dim_size <= 1024 && dim_size * sizeof(T) <= 4096) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if里面为什么要加&& dim_size * sizeof(T) <= 4096这个判断呢?不支持double吗?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

支持double。当把&& dim_size * sizeof(T) <= 4096删去,可以正确执行,但是一致性的diff从0.0 变为1.0728e-6(atol=1.00e-6)。

&& dim_size <= 1024是必要的。

当outer_size=128,dim_size=1024时,有config<<<32, (32, 4)>>>,warp_iter=32,正确执行。
当outer_size=128,dim_size=1025时,有config<<<32, (32, 4)>>>,warp_iter=64,不能得到结果。

warp_iter表示一个thread使用到的寄存器,应该是warp_iter=64超过硬件限制了。

// execute CUDA kernel
// dim_size is passed both as the row length and as the row stride;
// outer_size is the number of rows.
LaunchSoftmaxForwardForLastAxis<T>(output_data, input_data, dim_size,
dim_size, outer_size);
} else {
// execute Eigen kernel
LogSoftmaxFunctor<platform::CUDADeviceContext, T>()(
context.template device_context<platform::CUDADeviceContext>(), X,
Out, axis);
}
}
};

} // operators
} // paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;
Expand Down