From bf318b8b5481cd994e0d1c0dc29482fc8e12da8b Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 23 Aug 2021 08:16:17 +0000 Subject: [PATCH 01/19] add a fusion op: fused_residual_dropout_bias --- paddle/fluid/operators/fused/CMakeLists.txt | 5 + .../fused/fused_residual_dropout_bias.h | 558 ++++++++++++++++++ .../fused/test_fused_residual_dropout_bias.cu | 441 ++++++++++++++ 3 files changed, 1004 insertions(+) create mode 100644 paddle/fluid/operators/fused/fused_residual_dropout_bias.h create mode 100644 paddle/fluid/operators/fused/test_fused_residual_dropout_bias.cu diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 541e5afdf9b71e..525f6504f9fa61 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -71,4 +71,9 @@ if (WITH_GPU OR WITH_ROCM) op_library(fused_bn_add_activation_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_bn_add_activation);\n") endif() + # fused_dropout + # only support CUDA + if(NOT WITH_ROCM) + nv_test(test_fused_residual_dropout_bias SRCS test_fused_residual_dropout_bias.cu DEPS tensor op_registry elementwise_add_op dropout_op device_context generator) + endif() endif() diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h new file mode 100644 index 00000000000000..e0b51be9e909e3 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -0,0 +1,558 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include +#include + +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/float16.h" + +const int VecSize = 4; + +namespace paddle { +namespace operators { + +namespace platform = paddle::platform; + +inline std::pair GetResidualDropoutThreads( + const platform::CUDADeviceContext &ctx, const uint64_t n) { + const uint64_t tmp_n = n / VecSize; + int threads = std::max( + (uint64_t)32, std::min(tmp_n, (uint64_t)ctx.GetMaxThreadsPerBlock())); + int blocks = std::max((uint64_t)1, (tmp_n + threads - 1) / threads); + return std::pair{threads, blocks}; +} + +inline std::pair GetResidualDropoutBiasThreads( + const platform::CUDADeviceContext &ctx, const uint32_t rows, + const uint32_t cols) { + const uint32_t tmp_cols = cols / VecSize; + int threads = std::max( + (uint32_t)32, std::min(tmp_cols, (uint32_t)ctx.GetMaxThreadsPerBlock())); + int blocks_x = std::max((uint32_t)1, (tmp_cols + threads - 1) / threads); + int blocks_y = std::max((uint32_t)1, rows); + dim3 block_dim(threads, 1, 1); + dim3 grid_dim(blocks_x, blocks_y, 1); + return std::pair{block_dim, grid_dim}; +} + +/********Forward**************/ +// aligned vector generates vectorized load/store on CUDA +template +struct alignas(sizeof(T) * Size) AlignedVector { + T val[Size]; +}; + +template +inline int VectorizedSize(const T *pointer) { + uint64_t address = reinterpret_cast(pointer); + constexpr int vec4 = std::alignment_of>::value; // NOLINT + if (address % vec4 == 0) { + return 4; + } + return 1; +} + +/** + * dst = residual + 
dropout(src + bias); + */ +template +__global__ void FusedResidualDropoutBias(const size_t rows, const size_t cols, + uint64_t seed, + const float dropout_prob, + const bool is_upscale_in_train, + const T *src, const T *residual, + const T *bias, MaskType *mask_data, + T *dst, uint64_t increment) { + int col_id = blockDim.x * blockIdx.x + threadIdx.x; + int row_id = blockIdx.y; + int idx = row_id * cols + col_id; + curandStatePhilox4_32_10_t state; + curand_init(seed, idx, increment, &state); + + T factor = static_cast(1.0f / (1.0f - dropout_prob)); + const int tmp_cols = cols / VecSize * VecSize; + for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { + for (int i = col_id * VecSize; i < tmp_cols; + i += blockDim.x * gridDim.x * VecSize) { + float4 rand = curand_uniform4(&state); + float *rand_data = &(rand.x); + MaskType mask[VecSize]; + T bias_vec[VecSize]; +#pragma unroll + for (int j = 0; j < VecSize; j++) { + mask[j] = (MaskType)(rand_data[j] > dropout_prob); + bias_vec[j] = bias != nullptr ? bias[i + j] : static_cast(0); + } +#pragma unroll + for (int j = 0; j < VecSize; j++) { + mask_data[r * cols + i + j] = mask[j]; + } + + if (is_upscale_in_train) { +#pragma unroll + for (int j = 0; j < VecSize; j++) { + dst[r * cols + i + j] = (src[r * cols + i + j] + bias_vec[j]) * + static_cast(mask[j]) * factor + + residual[r * cols + i + j]; + } + } else { +#pragma unroll + for (int j = 0; j < VecSize; j++) { + dst[r * cols + i + j] = + (src[r * cols + i + j] + bias_vec[j]) * static_cast(mask[j]) + + residual[r * cols + i + j]; + } + } + } + + int high_index = tmp_cols + col_id; + if (high_index < cols) { + float4 rand = curand_uniform4(&state); + float *rand_data = &(rand.x); + int k = 0; + if (is_upscale_in_train) { + for (int i = high_index; i < cols; i++) { + MaskType m = (MaskType)(rand_data[k++] > dropout_prob); + mask_data[r * cols + i] = m; + dst[r * cols + i] = + (src[r * cols + i] + + (bias != nullptr ? 
bias[i] : static_cast(0.0))) * + static_cast(m) * factor + + residual[r * cols + i]; + } + } else { + for (int i = high_index; i < cols; i++) { + MaskType m = (MaskType)(rand_data[k++] > dropout_prob); + mask_data[r * cols + i] = m; + dst[r * cols + i] = + (src[r * cols + i] + + (bias != nullptr ? bias[i] : static_cast(0.0))) * + static_cast(m) + + residual[r * cols + i]; + } + } + } + } +} + +template +__global__ void FusedResidualDropoutBiasVec(const size_t rows, + const size_t cols, uint64_t seed, + const float dropout_prob, + const bool is_upscale_in_train, + const T *src, const T *residual, + const T *bias, MaskType *mask_data, + T *dst, uint64_t increment) { + int col_id = blockDim.x * blockIdx.x + threadIdx.x; + int row_id = blockIdx.y; + int idx = row_id * cols + col_id; + curandStatePhilox4_32_10_t state; + curand_init(seed, idx, increment, &state); + + T dest; + MaskType mask; + T factor = static_cast(1.0f / (1.0f - dropout_prob)); + using LoadT = AlignedVector; + using MaskLoadT = AlignedVector; + for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { + for (int i = col_id * VecSize; i < cols; + i += blockDim.x * gridDim.x * VecSize) { + T src_vec[VecSize]; + T residual_vec[VecSize]; + T bias_vec[VecSize]; +#pragma unroll + for (int ii = 0; ii < VecSize; ii++) { + bias_vec[ii] = static_cast(0); + } + LoadT *value = reinterpret_cast(&src_vec); + LoadT *residual_value = reinterpret_cast(&residual_vec); + *value = *reinterpret_cast(&src[r * cols + i]); + *residual_value = + *reinterpret_cast(&residual[r * cols + i]); + + LoadT *bias_value = + bias != nullptr ? 
reinterpret_cast(&bias_vec) : nullptr; + if (bias != nullptr) + *bias_value = *reinterpret_cast(&bias[i]); + + float4 rand = curand_uniform4(&state); + T dest_vec[VecSize]; + MaskType mask_vec[VecSize]; + +#pragma unroll + for (int ii = 0; ii < VecSize; ii++) { + mask_vec[ii] = (MaskType)((&rand.x)[ii] >= dropout_prob); + } + + if (is_upscale_in_train) { +#pragma unroll + for (int ii = 0; ii < VecSize; ii++) { + dest_vec[ii] = (src_vec[ii] + bias_vec[ii]) * + static_cast(mask_vec[ii]) * factor + + residual_vec[ii]; + } + } else { +#pragma unroll + for (int ii = 0; ii < VecSize; ii++) { + dest_vec[ii] = + (src_vec[ii] + bias_vec[ii]) * static_cast(mask_vec[ii]) + + residual_vec[ii]; + } + } + *(reinterpret_cast(&dst[r * cols + i])) = + *reinterpret_cast(&dest_vec[0]); + *(reinterpret_cast(&mask_data[r * cols + i])) = + *reinterpret_cast(&mask_vec[0]); + } + } +} + +template +__global__ void FusedResidualDropoutBiasTest(const size_t rows, + const size_t cols, + const float dropout_prob, + const bool is_upscale_in_train, + const T *src, const T *residual, + const T *bias, T *dst) { + int col_id = blockDim.x * blockIdx.x + threadIdx.x; + int row_id = blockIdx.y; + int idx = row_id * cols + col_id; + + T factor = static_cast(1.0f - dropout_prob); + const int tmp_cols = cols / VecSize * VecSize; + for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { + for (int i = col_id * VecSize; i < tmp_cols; + i += blockDim.x * gridDim.x * VecSize) { + if (is_upscale_in_train) { +#pragma unroll + for (int j = 0; j < VecSize; j++) { + dst[r * cols + i + j] = + (src[r * cols + i + j] + + (bias != nullptr ? bias[i + j] : static_cast(0.0))) + + residual[r * cols + i + j]; + } + } else { +#pragma unroll + for (int j = 0; j < VecSize; j++) { + dst[r * cols + i + j] = + (src[r * cols + i + j] + + (bias != nullptr ? 
bias[i + j] : static_cast(0.0))) * + factor + + residual[r * cols + i + j]; + } + } + } + + int high_index = tmp_cols + col_id; + if (high_index < cols) { + if (is_upscale_in_train) { + for (int i = high_index; i < cols; i++) { + dst[r * cols + i] = + (src[r * cols + i] + + (bias != nullptr ? bias[i] : static_cast(0.0))) + + residual[r * cols + i]; + } + } else { + for (int i = high_index; i < cols; i++) { + dst[r * cols + i] = + (src[r * cols + i] + + (bias != nullptr ? bias[i] : static_cast(0.0))) * + factor + + residual[r * cols + i]; + } + } + } + } +} + +/** + * dst = residual + dropout(src + bias); + */ +template +void LaunchResidualDropoutBias(const uint32_t rows, const uint32_t cols, + const int increment, uint64_t seed, + const float dropout_prob, + bool is_upscale_in_train, const T *src, + const T *residual, const T *bias, + MaskType *mask_data, T *dst, + const platform::CUDADeviceContext &ctx) { + if (std::abs(dropout_prob - 1.0) < 1e-5) { + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaMemcpyAsync(dst, residual, rows * cols * sizeof(T), + cudaMemcpyDeviceToDevice, ctx.stream())); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync( + mask_data, 0, rows * cols * sizeof(MaskType), ctx.stream())); + return; + } + + auto threads = GetResidualDropoutBiasThreads(ctx, rows, cols); + if (cols % VecSize != 0) + FusedResidualDropoutBias< + T, uint8_t, + VecSize><<>>( + rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, + bias, mask_data, dst, increment); + else + FusedResidualDropoutBiasVec< + T, uint8_t, + VecSize><<>>( + rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, + bias, mask_data, dst, increment); +} + +template +void LaunchResidualDropoutBiasTest(const uint32_t rows, const uint32_t cols, + const float dropout_prob, + bool is_upscale_in_train, const T *src, + const T *residual, const T *bias, T *dst, + const platform::CUDADeviceContext &ctx) { + if (std::abs(dropout_prob - 1.0) < 1e-5) { + PADDLE_ENFORCE_CUDA_SUCCESS( + 
cudaMemcpyAsync(dst, residual, rows * cols * sizeof(T), + cudaMemcpyDeviceToDevice, ctx.stream())); + return; + } + auto threads = GetResidualDropoutBiasThreads(ctx, rows, cols); + FusedResidualDropoutBiasTest< + T, VecSize><<>>( + rows, cols, dropout_prob, is_upscale_in_train, src, residual, bias, dst); +} + +/********Backward**************/ +template +__global__ void FusedResidualDropoutGrad(const T *dout, const MaskType *mask, + const T factor, const int64_t size, + T *dx, bool is_upscale_in_train) { + int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; + + int tmp_size = size / VecSize * VecSize; + for (int i = idx * VecSize; i < tmp_size; + i += blockDim.x * gridDim.x * VecSize) { +#pragma unroll + for (int ii = 0; ii < VecSize; ii++) { + dx[i + ii] = dout[i + ii] * static_cast(mask[i + ii]) * factor; + } + } + + int high_index = tmp_size + idx; + if (size > high_index) { + for (int i = high_index; i < size; i++) { + if (is_upscale_in_train) + dx[i] = dout[i] * static_cast(mask[i]) * factor; + else + dx[i] = dout[i] * static_cast(mask[i]); + } + } +} + +template +__global__ void FusedResidualDropoutGradVec(const T *dout, const MaskType *mask, + const T factor, const int64_t size, + T *dx, bool is_upscale_in_train) { + int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; + + using LoadT = AlignedVector; + using MaskLoadT = AlignedVector; + for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) { + T dout_vec[VecSize]; + MaskType mask_vec[VecSize]; + LoadT *dout_value = reinterpret_cast(&dout_vec); + MaskLoadT *mask_value = reinterpret_cast(&mask_vec); + *dout_value = *reinterpret_cast(&dout[i]); + *mask_value = *reinterpret_cast(&mask[i]); + + T dx_vec[VecSize]; +#pragma unroll + for (int ii = 0; ii < VecSize; ii++) { + dx_vec[ii] = dout_vec[ii] * static_cast(mask_vec[ii]) * factor; + } + *(reinterpret_cast(&dx[i])) = + *reinterpret_cast(&dx_vec[0]); + } +} + +template +__global__ void FusedResidualDropoutBiasGrad( + const T *dout, const 
MaskType *mask, const T factor, const int64_t rows, + const int64_t cols, T *dx, T *dbias, bool is_upscale_in_train) { + int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x; + + __shared__ T cache[BSX][BSY]; + T tmp_sum = static_cast(0); + if (col_id < cols) { + for (int row_id = threadIdx.y; row_id < rows; row_id += blockDim.y) { + int index = row_id * cols + col_id; + T out_value = dout[index]; + if (is_upscale_in_train) + dx[index] = out_value * static_cast(mask[index]) * factor; + else + dx[index] = out_value * static_cast(mask[index]); + tmp_sum += out_value; + } + } + cache[threadIdx.x][threadIdx.y] = tmp_sum; + __syncthreads(); + + // reduce sum + // TODO(zhangkaihuo) : Replace with ModuleAPI + T sum = static_cast(0); + int tid = threadIdx.y * blockDim.x + threadIdx.x; + int x = tid / BSY; + int y = tid & (BSY - 1); + + int s = BSY / 2; + while (s > 0) { + if (y < s) { + cache[x][y] += cache[x][y + s]; + } + s /= 2; + __syncthreads(); + } + + if (threadIdx.y == 0 && col_id < cols) { + dbias[col_id] = cache[threadIdx.x][0]; + } +} + +template +__global__ void FusedResidualDropoutBiasGradVec( + const T *dout, const MaskType *mask, const T factor, const int64_t rows, + const int64_t cols, T *dx, T *dbias, bool is_upscale_in_train) { + int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x; + + using LoadT = AlignedVector; + using MaskLoadT = AlignedVector; + + T tmp_sum[VecSize] = {static_cast(0)}; + if (col_id * 4 < cols) { + for (int row_id = threadIdx.y; row_id < rows; row_id += blockDim.y) { + int index = row_id * cols + col_id * 4; + T out_vec[VecSize]; + MaskType mask_vec[VecSize]; + T dx_vec[VecSize]; + LoadT *out_value = reinterpret_cast(&out_vec); + MaskLoadT *mask_value = reinterpret_cast(&mask_vec); + LoadT *dx_value = reinterpret_cast(&dx_vec); + *out_value = *reinterpret_cast(&dout[index]); + *mask_value = *reinterpret_cast(&mask[index]); + + if (is_upscale_in_train) { +#pragma unroll + for (int i = 0; i < VecSize; i++) { + dx_vec[i] = 
out_vec[i] * static_cast(mask_vec[i]) * factor; + tmp_sum[i] += out_vec[i]; + } + } else { +#pragma unroll + for (int i = 0; i < VecSize; i++) { + dx_vec[i] = out_vec[i] * static_cast(mask_vec[i]); + tmp_sum[i] += out_vec[i]; + } + } + + *(reinterpret_cast(&dx[index])) = + *reinterpret_cast(&dx_vec[0]); + } + } + + __shared__ T cache[BSX * VecSize][BSY]; + for (int i = 0; i < VecSize; i++) + cache[threadIdx.x * VecSize + i][threadIdx.y] = tmp_sum[i]; + __syncthreads(); + + // reduce sum + // TODO(zhangkaihuo) : Replace with ModuleAPI + T sum = static_cast(0); + int tid = threadIdx.y * blockDim.x + threadIdx.x; + int x = tid / BSY; + int y = tid & (BSY - 1); + + int s = BSY / 2; + while (s > 0) { + if (y < s) { + for (int i = 0; i < VecSize; i++) { + cache[x * VecSize + i][y] += cache[x * VecSize + i][y + s]; + } + } + s /= 2; + __syncthreads(); + } + + if (threadIdx.y == 0 && col_id * VecSize < cols) { + for (int i = 0; i < VecSize; i++) + dbias[col_id * VecSize + i] = cache[threadIdx.x * VecSize + i][0]; + } +} + +template +void LaunchResidualDropoutBiasGrad(const T *dout, const MaskType *mask, + const float dropout_prob, + const bool is_upscale_in_train, + const uint32_t rows, const uint32_t cols, + T *dx, T *dbias, + const platform::CUDADeviceContext &ctx) { + const T zero = static_cast(0.0); + auto factor = dropout_prob == static_cast(1.0) + ? 
zero + : static_cast(1.0 / (1.0 - dropout_prob)); + + if (dbias != nullptr) { + if (cols % 4 == 0) { + auto threads = std::min(cols / VecSize, static_cast(8)); + auto blocks = std::max((uint32_t)1, + std::min((cols / VecSize + threads - 1) / threads, + (uint32_t)ctx.GetSMCount())); + dim3 block_dim(threads, 128, 1); + dim3 grid_dim(blocks, 1, 1); + FusedResidualDropoutBiasGradVec< + T, MaskType, 8, 128, + VecSize><<>>( + dout, mask, factor, rows, cols, dx, dbias, is_upscale_in_train); + + } else { + auto threads = std::min(cols, static_cast(8)); + auto blocks = std::max( + (uint32_t)1, + std::min((cols + threads - 1) / threads, (uint32_t)ctx.GetSMCount())); + dim3 block_dim(threads, 128, 1); + dim3 grid_dim(blocks, 1, 1); + FusedResidualDropoutBiasGrad< + T, MaskType, 8, 128><<>>( + dout, mask, factor, rows, cols, dx, dbias, is_upscale_in_train); + } + } else { + const uint64_t n = rows * cols; + auto threads = GetResidualDropoutThreads(ctx, n); + if (n % 4 == 0) { + FusedResidualDropoutGradVec< + T, MaskType, + VecSize><<>>( + dout, mask, factor, n, dx, is_upscale_in_train); + } else { + FusedResidualDropoutGrad< + T, MaskType><<>>( + dout, mask, factor, n, dx, is_upscale_in_train); + } + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/test_fused_residual_dropout_bias.cu b/paddle/fluid/operators/fused/test_fused_residual_dropout_bias.cu new file mode 100644 index 00000000000000..c8a1485ab7e0fa --- /dev/null +++ b/paddle/fluid/operators/fused/test_fused_residual_dropout_bias.cu @@ -0,0 +1,441 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/fused/fused_residual_dropout_bias.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; + +USE_OP(elementwise_add); +USE_OP(dropout); + +template +void Dropout(const std::vector &x, const framework::DDim &x_dim, + std::vector *out, std::vector *mask, + const platform::CUDADeviceContext &ctx, uint64_t seed, + float dropout_prob, bool is_upscale_in_train, bool is_test) { + framework::Scope scope; + auto var_x = scope.Var("X"); + auto tensor_x = var_x->GetMutable(); + tensor_x->Resize(x_dim); + tensor_x->mutable_data(ctx.GetPlace()); + cudaMemcpy(tensor_x->data(), x.data(), x_dim[0] * x_dim[1] * sizeof(T), + cudaMemcpyHostToDevice); + + auto var_out = scope.Var("Out"); + auto tensor_out = var_out->GetMutable(); + + auto var_mask = scope.Var("Mask"); + auto tensor_mask = var_mask->GetMutable(); + + framework::AttributeMap attrs; + attrs.insert({"fix_seed", 1}); + attrs.insert({"seed", static_cast(seed)}); + attrs.insert({"dropout_prob", dropout_prob}); + if (is_upscale_in_train) { + attrs.insert({"dropout_implementation", std::string("upscale_in_train")}); + } + if (is_test) { + attrs.insert({"is_test", 1}); + } + + auto op = framework::OpRegistry::CreateOp( + 
"dropout", {{"X", {"X"}}}, {{"Out", {"Out"}}, {"Mask", {"Mask"}}}, attrs); + op->Run(scope, ctx.GetPlace()); + cudaMemcpy((*out).data(), tensor_out->data(), + x_dim[0] * x_dim[1] * sizeof(T), cudaMemcpyDeviceToHost); + if (!is_test) { + cudaMemcpy((*mask).data(), tensor_mask->data(), + x_dim[0] * x_dim[1] * sizeof(uint8_t), cudaMemcpyDeviceToHost); + } + ctx.Wait(); +} + +template +void DropoutGrad(std::vector *dx, const framework::DDim &x_dim, + const std::vector &dout, const std::vector &mask, + const platform::CUDADeviceContext &ctx, float dropout_prob, + bool is_upscale_in_train) { + framework::Scope scope; + const size_t n = x_dim[0] * x_dim[1]; + auto var_out = scope.Var("DOut"); + auto tensor_out = var_out->GetMutable(); + tensor_out->Resize(x_dim); + tensor_out->mutable_data(ctx.GetPlace()); + cudaMemcpy(tensor_out->data(), dout.data(), n * sizeof(T), + cudaMemcpyHostToDevice); + + auto var_mask = scope.Var("Mask"); + auto tensor_mask = var_mask->GetMutable(); + tensor_mask->Resize(x_dim); + tensor_mask->mutable_data(ctx.GetPlace()); + cudaMemcpy(tensor_mask->data(), mask.data(), n * sizeof(uint8_t), + cudaMemcpyHostToDevice); + + auto var_dx = scope.Var("DX"); + auto tensor_dx = var_dx->GetMutable(); + + framework::AttributeMap attrs; + attrs.insert({"dropout_prob", dropout_prob}); + attrs.insert({"is_test", 0}); + if (is_upscale_in_train) { + attrs.insert({"dropout_implementation", std::string("upscale_in_train")}); + } else { + attrs.insert({"dropout_implementation", std::string("downgrade_in_infer")}); + } + + auto op = framework::OpRegistry::CreateOp( + "dropout_grad", {{"Out@GRAD", {"DOut"}}, {"Mask", {"Mask"}}}, + {{"X@GRAD", {"DX"}}}, attrs); + op->Run(scope, ctx.GetPlace()); + + cudaMemcpy((*dx).data(), tensor_dx->data(), + x_dim[0] * x_dim[1] * sizeof(T), cudaMemcpyDeviceToHost); + ctx.Wait(); +} + +template +struct TestFusedResidualDropoutBias { + uint32_t _rows; + uint32_t _cols; + uint64_t _seed; + float _dropout_prob; + bool 
_is_upscale_in_train; + bool _is_test; // default false, Set to true for inference only + bool _has_bias = true; + framework::Tensor _src, _residual, _bias, _out, _mask; + framework::Tensor _dsrc, _dbias; + + std::vector _src_vec, _residual_vec, _bias_vec, _out_vec, _mask_vec; + std::vector _correct_out, _correct_dsrc, _correct_dbias; + std::vector _correct_mask; + + platform::CUDAPlace _place; + platform::CUDADeviceContext *_ctx; + + TestFusedResidualDropoutBias() { + _rows = 32; + _cols = 32; + _seed = 0; + _dropout_prob = 0.0; + _is_upscale_in_train = false; + _is_test = false; + _has_bias = true; + _ctx = new platform::CUDADeviceContext(_place); + } + + TestFusedResidualDropoutBias(int rows, int cols, uint64_t seed = 0, + float dropout_prob = 0.0, + bool is_upscale_in_train = false, + bool is_test = false) { + _rows = rows; + _cols = cols; + _seed = seed; + _dropout_prob = dropout_prob; + _is_upscale_in_train = is_upscale_in_train; + _is_test = is_test; + _has_bias = true; + _ctx = new platform::CUDADeviceContext(_place); + } + + ~TestFusedResidualDropoutBias() { delete _ctx; } + + void SetUp() { + const int n = _rows * _cols; + _correct_out.resize(n); + _correct_mask.resize(n); + _correct_dsrc.resize(n); + _correct_dbias.resize(_cols); + + _src_vec.resize(n); + _residual_vec.resize(n); + _bias_vec.resize(_cols); + std::default_random_engine random(time(NULL)); + std::uniform_real_distribution dis(0.0, 1.0); + + for (int i = 0; i < _rows; i++) { + for (int j = 0; j < _cols; j++) { + _src_vec[i * _cols + j] = static_cast(dis(random)); + _residual_vec[i * _cols + j] = static_cast(dis(random)); + if (i == 0) _bias_vec[j] = dis(random); + } + } + + framework::TensorFromVector(_src_vec, *_ctx, &_src); + _src.Resize({_rows, _cols}); + framework::TensorFromVector(_residual_vec, *_ctx, &_residual); + _residual.Resize({_rows, _cols}); + if (_has_bias) { + framework::TensorFromVector(_bias_vec, *_ctx, &_bias); + _bias.Resize({_cols}); + } + + { + _out.Resize({_rows, 
_cols}); + _out.mutable_data(_place); + _mask.Resize({_rows, _cols}); + _mask.mutable_data(_place); + _dsrc.Resize({_rows, _cols}); + _dsrc.mutable_data(_place); + + if (_has_bias) { + _dbias.Resize({_cols}); + _dbias.mutable_data(_place); + } + } + } + + void BaseForward() { + std::vector out1(_rows * _cols), out2(_rows * _cols); + if (_has_bias) { + for (int i = 0; i < _rows; i++) { + for (int j = 0; j < _cols; j++) { + out1[i * _cols + j] = _src_vec[i * _cols + j] + _bias_vec[j]; + } + } + Dropout(out1, _src.dims(), &out2, &_correct_mask, *_ctx, _seed, + _dropout_prob, _is_upscale_in_train, _is_test); + } else { + Dropout(_src_vec, _src.dims(), &out2, &_correct_mask, *_ctx, _seed, + _dropout_prob, _is_upscale_in_train, _is_test); + } + for (int i = 0; i < _rows; i++) { + for (int j = 0; j < _cols; j++) { + _correct_out[i * _cols + j] = + _residual_vec[i * _cols + j] + out2[i * _cols + j]; + } + } + _ctx->Wait(); + } + + void BaseBackward() { + if (!_is_upscale_in_train) { + for (int i = 0; i < _rows * _cols; i++) { + _correct_dsrc[i] = _correct_out[i] * static_cast(_correct_mask[i]); + } + } else { + DropoutGrad(&_correct_dsrc, _src.dims(), _correct_out, _correct_mask, + *_ctx, _dropout_prob, _is_upscale_in_train); + } + memset(&_correct_dbias[0], 0, _cols * sizeof(T)); + for (int i = 0; i < _rows; i++) { + for (int j = 0; j < _cols; j++) { + _correct_dbias[j] += _correct_out[i * _cols + j]; + } + } + } + + void FusedForward() { + auto threads = paddle::operators::GetResidualDropoutBiasThreads( + *_ctx, (uint64_t)_rows, (uint64_t)_cols); + const int increment = + ((_cols - 1) / (threads.first.x * threads.second.x * VecSize) + 1) * + VecSize; + + T *bias_ptr = nullptr; + if (_has_bias) { + bias_ptr = _bias.data(); + } + if (_is_test) { + paddle::operators::LaunchResidualDropoutBiasTest( + _rows, _cols, _dropout_prob, _is_upscale_in_train, _src.data(), + _residual.data(), bias_ptr, _out.data(), *_ctx); + } else { + paddle::operators::LaunchResidualDropoutBias( + 
_rows, _cols, increment, _seed, _dropout_prob, _is_upscale_in_train, + _src.data(), _residual.data(), bias_ptr, _mask.data(), + _out.data(), *_ctx); + } + _ctx->Wait(); + } + + void FusedBackward() { + if (_is_test) return; + + T *bias_ptr = nullptr; + if (_has_bias) { + bias_ptr = _dbias.data(); + } + paddle::operators::LaunchResidualDropoutBiasGrad( + _out.data(), _mask.data(), _dropout_prob, + _is_upscale_in_train, _rows, _cols, _dsrc.data(), bias_ptr, *_ctx); + } + + void Run() { + SetUp(); + BaseForward(); + FusedForward(); + BaseBackward(); + FusedBackward(); + } + + void CheckOut(const T diff) { + const int n = _rows * _cols; + std::vector out(n); + std::vector mask(n); + cudaMemcpy(out.data(), _out.data(), _rows * _cols * sizeof(T), + cudaMemcpyDeviceToHost); + if (!_is_test) { + cudaMemcpy(mask.data(), _mask.data(), + _rows * _cols * sizeof(uint8_t), cudaMemcpyDeviceToHost); + } + _ctx->Wait(); + + for (int i = 0; i < n; i++) { + EXPECT_LT(std::abs(out[i] - _correct_out[i]), diff); + if (!_is_test) EXPECT_EQ(mask[i], _correct_mask[i]); + } + } + + void CheckGrad(const T diff) { + if (_is_test) return; + + const int n = _rows * _cols; + + std::vector dsrc(n); + cudaMemcpy(dsrc.data(), _dsrc.data(), _rows * _cols * sizeof(T), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < n; i++) { + EXPECT_LT(std::abs(dsrc[i] - _correct_dsrc[i]), diff); + } + + if (_has_bias) { + std::vector dbias(_cols); + cudaMemcpy(dbias.data(), _dbias.data(), _cols * sizeof(T), + cudaMemcpyDeviceToHost); + _ctx->Wait(); + for (int i = 0; i < _cols; i++) { + EXPECT_LT(std::abs(dbias[i] - _correct_dbias[i]), diff); + } + } + } +}; + +TEST(FusedDropout, GPUFusedRedisualDorpoutBias) { + const int rows = 16; + const int cols = 16; + TestFusedResidualDropoutBias test(rows, cols); + test.Run(); + test.CheckOut(static_cast(1e-5)); + test.CheckGrad(static_cast(1e-5)); +} + +TEST(FusedDropout, GPUFusedRedisualDorpoutBiasDouble) { + const int rows = 16; + const int cols = 16; + 
TestFusedResidualDropoutBias test(rows, cols); + test.Run(); + test.CheckOut(static_cast(1e-5)); + test.CheckGrad(static_cast(1e-5)); +} + +TEST(FusedDropout, GPUFusedRedisualDorpoutBiasFp16) { + const int rows = 16; + const int cols = 16; + TestFusedResidualDropoutBias test(rows, cols); + test.Run(); + test.CheckOut(static_cast(1e-2)); + // For inference, check_grad is not required. ref: test_dropout_op.py + // test.CheckGrad((platform::float16)1e-2); +} + +// test no bias and cols % 4 == 0 +TEST(FusedDropout, GPUFusedRedisualDorpoutBiasNoBias) { + const int rows = 16; + const int cols = 16; + TestFusedResidualDropoutBias test(rows, cols); + test._has_bias = false; + test.Run(); + test.CheckOut(static_cast(1e-5)); + test.CheckGrad(static_cast(1e-5)); +} + +// test no bias and cols % 4 != 0 +TEST(FusedDropout, GPUFusedRedisualDorpoutBiasNoBias2) { + const int rows = 16; + const int cols = 17; + TestFusedResidualDropoutBias test(rows, cols); + test._has_bias = false; + test.Run(); + test.CheckOut(static_cast(1e-5)); + test.CheckGrad(static_cast(1e-5)); +} + +// test add bias and cols % 4 != 0 +TEST(FusedDropout, GPUFusedRedisualDorpoutBias2) { + const int rows = 16; + const int cols = 17; + TestFusedResidualDropoutBias test(rows, cols); + test.Run(); + test.CheckOut(static_cast(1e-5)); + test.CheckGrad(static_cast(1e-5)); +} + +TEST(FusedDropout, GPUFusedRedisualDorpoutBias3) { + const int rows = 16; + const int cols = 16; + TestFusedResidualDropoutBias test(rows, cols, 0, 1.0, false, false); + test.Run(); + test.CheckOut(static_cast(1e-5)); + test.CheckGrad(static_cast(1e-5)); +} + +TEST(FusedDropout, GPUFusedRedisualDorpoutBias4) { + const int rows = 16; + const int cols = 16; + TestFusedResidualDropoutBias test(rows, cols, 0, 1.0, false, false); + test.Run(); + test.CheckOut(static_cast(1e-5)); + test.CheckGrad(static_cast(1e-5)); +} + +TEST(FusedDropout, GPUFusedRedisualDorpoutBias5) { + const int rows = 16; + const int cols = 16; + TestFusedResidualDropoutBias 
test(rows, cols, 0, 1.0, true, false); + test.Run(); + test.CheckOut(static_cast(1e-5)); + test.CheckGrad(static_cast(1e-5)); +} + +TEST(FusedDropout, GPUFusedRedisualDorpoutBias6) { + const int rows = 16; + const int cols = 16; + TestFusedResidualDropoutBias test(rows, cols, 0, 0.35, true, true); + test.Run(); + test.CheckOut(static_cast(1e-5)); + test.CheckGrad(static_cast(1e-5)); +} + +TEST(FusedDropout, GPUFusedRedisualDorpoutBias7) { + const int rows = 16; + const int cols = 16; + TestFusedResidualDropoutBias test(rows, cols, 125, 0.0, false, false); + test.Run(); + test.CheckOut(static_cast(1e-5)); + test.CheckGrad(static_cast(1e-5)); +} From 507117a86e68dd4dc572ff167f286b39d7c84416 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 23 Aug 2021 13:22:26 +0000 Subject: [PATCH 02/19] simplify the code, andd opt reduce sum --- .../fused/fused_residual_dropout_bias.h | 320 +++++------------- .../fused/test_fused_residual_dropout_bias.cu | 52 ++- 2 files changed, 101 insertions(+), 271 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index e0b51be9e909e3..2d0de22952c88a 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -21,6 +21,7 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" @@ -30,6 +31,7 @@ namespace paddle { namespace operators { namespace platform = paddle::platform; +namespace cg = cooperative_groups; inline std::pair GetResidualDropoutThreads( const platform::CUDADeviceContext &ctx, const uint64_t n) { @@ -73,86 +75,6 @@ inline int VectorizedSize(const T *pointer) { /** * dst = residual + dropout(src + bias); */ -template -__global__ void FusedResidualDropoutBias(const size_t rows, const size_t cols, - uint64_t seed, - const float dropout_prob, - const bool is_upscale_in_train, - const T *src, const T *residual, - const T *bias, MaskType *mask_data, - T *dst, uint64_t increment) { - int col_id = blockDim.x * blockIdx.x + threadIdx.x; - int row_id = blockIdx.y; - int idx = row_id * cols + col_id; - curandStatePhilox4_32_10_t state; - curand_init(seed, idx, increment, &state); - - T factor = static_cast(1.0f / (1.0f - dropout_prob)); - const int tmp_cols = cols / VecSize * VecSize; - for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { - for (int i = col_id * VecSize; i < tmp_cols; - i += blockDim.x * gridDim.x * VecSize) { - float4 rand = curand_uniform4(&state); - float *rand_data = &(rand.x); - MaskType mask[VecSize]; - T bias_vec[VecSize]; -#pragma unroll - for (int j = 0; j < VecSize; j++) { - mask[j] = (MaskType)(rand_data[j] > dropout_prob); - bias_vec[j] = bias != nullptr ? 
bias[i + j] : static_cast(0); - } -#pragma unroll - for (int j = 0; j < VecSize; j++) { - mask_data[r * cols + i + j] = mask[j]; - } - - if (is_upscale_in_train) { -#pragma unroll - for (int j = 0; j < VecSize; j++) { - dst[r * cols + i + j] = (src[r * cols + i + j] + bias_vec[j]) * - static_cast(mask[j]) * factor + - residual[r * cols + i + j]; - } - } else { -#pragma unroll - for (int j = 0; j < VecSize; j++) { - dst[r * cols + i + j] = - (src[r * cols + i + j] + bias_vec[j]) * static_cast(mask[j]) + - residual[r * cols + i + j]; - } - } - } - - int high_index = tmp_cols + col_id; - if (high_index < cols) { - float4 rand = curand_uniform4(&state); - float *rand_data = &(rand.x); - int k = 0; - if (is_upscale_in_train) { - for (int i = high_index; i < cols; i++) { - MaskType m = (MaskType)(rand_data[k++] > dropout_prob); - mask_data[r * cols + i] = m; - dst[r * cols + i] = - (src[r * cols + i] + - (bias != nullptr ? bias[i] : static_cast(0.0))) * - static_cast(m) * factor + - residual[r * cols + i]; - } - } else { - for (int i = high_index; i < cols; i++) { - MaskType m = (MaskType)(rand_data[k++] > dropout_prob); - mask_data[r * cols + i] = m; - dst[r * cols + i] = - (src[r * cols + i] + - (bias != nullptr ? 
bias[i] : static_cast(0.0))) * - static_cast(m) + - residual[r * cols + i]; - } - } - } - } -} - template __global__ void FusedResidualDropoutBiasVec(const size_t rows, const size_t cols, uint64_t seed, @@ -170,6 +92,9 @@ __global__ void FusedResidualDropoutBiasVec(const size_t rows, T dest; MaskType mask; T factor = static_cast(1.0f / (1.0f - dropout_prob)); + if (!is_upscale_in_train) { + factor = static_cast(1.0); + } using LoadT = AlignedVector; using MaskLoadT = AlignedVector; for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { @@ -202,20 +127,11 @@ __global__ void FusedResidualDropoutBiasVec(const size_t rows, mask_vec[ii] = (MaskType)((&rand.x)[ii] >= dropout_prob); } - if (is_upscale_in_train) { #pragma unroll - for (int ii = 0; ii < VecSize; ii++) { - dest_vec[ii] = (src_vec[ii] + bias_vec[ii]) * - static_cast(mask_vec[ii]) * factor + - residual_vec[ii]; - } - } else { -#pragma unroll - for (int ii = 0; ii < VecSize; ii++) { - dest_vec[ii] = - (src_vec[ii] + bias_vec[ii]) * static_cast(mask_vec[ii]) + - residual_vec[ii]; - } + for (int ii = 0; ii < VecSize; ii++) { + dest_vec[ii] = (src_vec[ii] + bias_vec[ii]) * + static_cast(mask_vec[ii]) * factor + + residual_vec[ii]; } *(reinterpret_cast(&dst[r * cols + i])) = *reinterpret_cast(&dest_vec[0]); @@ -237,47 +153,31 @@ __global__ void FusedResidualDropoutBiasTest(const size_t rows, int idx = row_id * cols + col_id; T factor = static_cast(1.0f - dropout_prob); + if (is_upscale_in_train) { + factor = static_cast(1.0); + } const int tmp_cols = cols / VecSize * VecSize; for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { for (int i = col_id * VecSize; i < tmp_cols; i += blockDim.x * gridDim.x * VecSize) { - if (is_upscale_in_train) { #pragma unroll - for (int j = 0; j < VecSize; j++) { - dst[r * cols + i + j] = - (src[r * cols + i + j] + - (bias != nullptr ? 
bias[i + j] : static_cast(0.0))) + - residual[r * cols + i + j]; - } - } else { -#pragma unroll - for (int j = 0; j < VecSize; j++) { - dst[r * cols + i + j] = - (src[r * cols + i + j] + - (bias != nullptr ? bias[i + j] : static_cast(0.0))) * - factor + - residual[r * cols + i + j]; - } + for (int j = 0; j < VecSize; j++) { + dst[r * cols + i + j] = + (src[r * cols + i + j] + + (bias != nullptr ? bias[i + j] : static_cast(0.0))) * + factor + + residual[r * cols + i + j]; } } int high_index = tmp_cols + col_id; if (high_index < cols) { - if (is_upscale_in_train) { - for (int i = high_index; i < cols; i++) { - dst[r * cols + i] = - (src[r * cols + i] + - (bias != nullptr ? bias[i] : static_cast(0.0))) + - residual[r * cols + i]; - } - } else { - for (int i = high_index; i < cols; i++) { - dst[r * cols + i] = - (src[r * cols + i] + - (bias != nullptr ? bias[i] : static_cast(0.0))) * - factor + - residual[r * cols + i]; - } + for (int i = high_index; i < cols; i++) { + dst[r * cols + i] = + (src[r * cols + i] + + (bias != nullptr ? 
bias[i] : static_cast(0.0))) * + factor + + residual[r * cols + i]; } } } @@ -305,9 +205,8 @@ void LaunchResidualDropoutBias(const uint32_t rows, const uint32_t cols, auto threads = GetResidualDropoutBiasThreads(ctx, rows, cols); if (cols % VecSize != 0) - FusedResidualDropoutBias< - T, uint8_t, - VecSize><<>>( + FusedResidualDropoutBiasVec< + T, uint8_t, 1><<>>( rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, bias, mask_data, dst, increment); else @@ -337,36 +236,10 @@ void LaunchResidualDropoutBiasTest(const uint32_t rows, const uint32_t cols, } /********Backward**************/ -template -__global__ void FusedResidualDropoutGrad(const T *dout, const MaskType *mask, - const T factor, const int64_t size, - T *dx, bool is_upscale_in_train) { - int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; - - int tmp_size = size / VecSize * VecSize; - for (int i = idx * VecSize; i < tmp_size; - i += blockDim.x * gridDim.x * VecSize) { -#pragma unroll - for (int ii = 0; ii < VecSize; ii++) { - dx[i + ii] = dout[i + ii] * static_cast(mask[i + ii]) * factor; - } - } - - int high_index = tmp_size + idx; - if (size > high_index) { - for (int i = high_index; i < size; i++) { - if (is_upscale_in_train) - dx[i] = dout[i] * static_cast(mask[i]) * factor; - else - dx[i] = dout[i] * static_cast(mask[i]); - } - } -} - template __global__ void FusedResidualDropoutGradVec(const T *dout, const MaskType *mask, const T factor, const int64_t size, - T *dx, bool is_upscale_in_train) { + T *dx) { int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; using LoadT = AlignedVector; @@ -389,62 +262,33 @@ __global__ void FusedResidualDropoutGradVec(const T *dout, const MaskType *mask, } } -template -__global__ void FusedResidualDropoutBiasGrad( - const T *dout, const MaskType *mask, const T factor, const int64_t rows, - const int64_t cols, T *dx, T *dbias, bool is_upscale_in_train) { - int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x; +template +__device__ void reduce_sum(T 
cache[BSX * VecSize][BSY]) {} - __shared__ T cache[BSX][BSY]; - T tmp_sum = static_cast(0); - if (col_id < cols) { - for (int row_id = threadIdx.y; row_id < rows; row_id += blockDim.y) { - int index = row_id * cols + col_id; - T out_value = dout[index]; - if (is_upscale_in_train) - dx[index] = out_value * static_cast(mask[index]) * factor; - else - dx[index] = out_value * static_cast(mask[index]); - tmp_sum += out_value; - } - } - cache[threadIdx.x][threadIdx.y] = tmp_sum; - __syncthreads(); - - // reduce sum - // TODO(zhangkaihuo) : Replace with ModuleAPI - T sum = static_cast(0); - int tid = threadIdx.y * blockDim.x + threadIdx.x; - int x = tid / BSY; - int y = tid & (BSY - 1); - - int s = BSY / 2; - while (s > 0) { - if (y < s) { - cache[x][y] += cache[x][y + s]; - } - s /= 2; - __syncthreads(); - } - - if (threadIdx.y == 0 && col_id < cols) { - dbias[col_id] = cache[threadIdx.x][0]; +template +static __forceinline__ __device__ U WarpReduceSum(U val) { + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, true); + const int warpSize = 32; + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + val += paddle::platform::CudaShuffleDownSync(mask, val, offset); } + return val; } template __global__ void FusedResidualDropoutBiasGradVec( const T *dout, const MaskType *mask, const T factor, const int64_t rows, - const int64_t cols, T *dx, T *dbias, bool is_upscale_in_train) { + const int64_t cols, T *dx, T *dbias) { int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x; using LoadT = AlignedVector; using MaskLoadT = AlignedVector; T tmp_sum[VecSize] = {static_cast(0)}; - if (col_id * 4 < cols) { + if (col_id * VecSize < cols) { for (int row_id = threadIdx.y; row_id < rows; row_id += blockDim.y) { - int index = row_id * cols + col_id * 4; + int index = row_id * cols + col_id * VecSize; T out_vec[VecSize]; MaskType mask_vec[VecSize]; T dx_vec[VecSize]; @@ -454,18 +298,10 @@ __global__ void FusedResidualDropoutBiasGradVec( *out_value = *reinterpret_cast(&dout[index]); 
*mask_value = *reinterpret_cast(&mask[index]); - if (is_upscale_in_train) { -#pragma unroll - for (int i = 0; i < VecSize; i++) { - dx_vec[i] = out_vec[i] * static_cast(mask_vec[i]) * factor; - tmp_sum[i] += out_vec[i]; - } - } else { #pragma unroll - for (int i = 0; i < VecSize; i++) { - dx_vec[i] = out_vec[i] * static_cast(mask_vec[i]); - tmp_sum[i] += out_vec[i]; - } + for (int i = 0; i < VecSize; i++) { + dx_vec[i] = out_vec[i] * static_cast(mask_vec[i]) * factor; + tmp_sum[i] += out_vec[i]; } *(reinterpret_cast(&dx[index])) = @@ -479,26 +315,23 @@ __global__ void FusedResidualDropoutBiasGradVec( __syncthreads(); // reduce sum - // TODO(zhangkaihuo) : Replace with ModuleAPI T sum = static_cast(0); int tid = threadIdx.y * blockDim.x + threadIdx.x; - int x = tid / BSY; - int y = tid & (BSY - 1); + int x = tid >> 5; + int y = tid & 31; - int s = BSY / 2; - while (s > 0) { - if (y < s) { - for (int i = 0; i < VecSize; i++) { - cache[x * VecSize + i][y] += cache[x * VecSize + i][y + s]; - } + if (x < BSX * VecSize) { +#pragma unroll + for (int i = 0; i < (BSY >> 5); i++) { + sum += cache[x][y + i * 32]; } - s /= 2; - __syncthreads(); } - if (threadIdx.y == 0 && col_id * VecSize < cols) { - for (int i = 0; i < VecSize; i++) - dbias[col_id * VecSize + i] = cache[threadIdx.x * VecSize + i][0]; + sum = WarpReduceSum(sum); + + int bias_id = blockIdx.x * blockDim.x * VecSize + x; + if (y == 0 && x < VecSize * BSX && bias_id < cols) { + dbias[bias_id] = sum; } } @@ -513,43 +346,42 @@ void LaunchResidualDropoutBiasGrad(const T *dout, const MaskType *mask, auto factor = dropout_prob == static_cast(1.0) ? 
zero : static_cast(1.0 / (1.0 - dropout_prob)); + if (!is_upscale_in_train) { + factor = static_cast(1.0); + } if (dbias != nullptr) { - if (cols % 4 == 0) { - auto threads = std::min(cols / VecSize, static_cast(8)); - auto blocks = std::max((uint32_t)1, - std::min((cols / VecSize + threads - 1) / threads, - (uint32_t)ctx.GetSMCount())); - dim3 block_dim(threads, 128, 1); - dim3 grid_dim(blocks, 1, 1); + int real_vec_size = VecSize; + if (cols % VecSize != 0) real_vec_size = 1; + auto threads = std::min(cols / real_vec_size, static_cast(8)); + auto blocks = std::max( + (uint32_t)1, std::min((cols / real_vec_size + threads - 1) / threads, + (uint32_t)ctx.GetSMCount())); + dim3 block_dim(threads, 128, 1); + dim3 grid_dim(blocks, 1, 1); + + if (cols % VecSize == 0) { FusedResidualDropoutBiasGradVec< T, MaskType, 8, 128, VecSize><<>>( - dout, mask, factor, rows, cols, dx, dbias, is_upscale_in_train); - + dout, mask, factor, rows, cols, dx, dbias); } else { - auto threads = std::min(cols, static_cast(8)); - auto blocks = std::max( - (uint32_t)1, - std::min((cols + threads - 1) / threads, (uint32_t)ctx.GetSMCount())); - dim3 block_dim(threads, 128, 1); - dim3 grid_dim(blocks, 1, 1); - FusedResidualDropoutBiasGrad< - T, MaskType, 8, 128><<>>( - dout, mask, factor, rows, cols, dx, dbias, is_upscale_in_train); + FusedResidualDropoutBiasGradVec< + T, MaskType, 8, 128, 1><<>>( + dout, mask, factor, rows, cols, dx, dbias); } } else { const uint64_t n = rows * cols; auto threads = GetResidualDropoutThreads(ctx, n); - if (n % 4 == 0) { + if (n % VecSize == 0) { FusedResidualDropoutGradVec< T, MaskType, VecSize><<>>( - dout, mask, factor, n, dx, is_upscale_in_train); + dout, mask, factor, n, dx); } else { - FusedResidualDropoutGrad< - T, MaskType><<>>( - dout, mask, factor, n, dx, is_upscale_in_train); + FusedResidualDropoutGradVec< + T, MaskType, 1><<>>( + dout, mask, factor, n, dx); } } } diff --git a/paddle/fluid/operators/fused/test_fused_residual_dropout_bias.cu 
b/paddle/fluid/operators/fused/test_fused_residual_dropout_bias.cu index c8a1485ab7e0fa..12c2fd6be68360 100644 --- a/paddle/fluid/operators/fused/test_fused_residual_dropout_bias.cu +++ b/paddle/fluid/operators/fused/test_fused_residual_dropout_bias.cu @@ -33,16 +33,16 @@ USE_OP(elementwise_add); USE_OP(dropout); template -void Dropout(const std::vector &x, const framework::DDim &x_dim, - std::vector *out, std::vector *mask, - const platform::CUDADeviceContext &ctx, uint64_t seed, - float dropout_prob, bool is_upscale_in_train, bool is_test) { +void Dropout(const T *x, const framework::DDim &x_dim, T *out, + std::vector *mask, const platform::CUDADeviceContext &ctx, + uint64_t seed, float dropout_prob, bool is_upscale_in_train, + bool is_test) { framework::Scope scope; auto var_x = scope.Var("X"); auto tensor_x = var_x->GetMutable(); tensor_x->Resize(x_dim); tensor_x->mutable_data(ctx.GetPlace()); - cudaMemcpy(tensor_x->data(), x.data(), x_dim[0] * x_dim[1] * sizeof(T), + cudaMemcpy(tensor_x->data(), x, x_dim[0] * x_dim[1] * sizeof(T), cudaMemcpyHostToDevice); auto var_out = scope.Var("Out"); @@ -65,8 +65,8 @@ void Dropout(const std::vector &x, const framework::DDim &x_dim, auto op = framework::OpRegistry::CreateOp( "dropout", {{"X", {"X"}}}, {{"Out", {"Out"}}, {"Mask", {"Mask"}}}, attrs); op->Run(scope, ctx.GetPlace()); - cudaMemcpy((*out).data(), tensor_out->data(), - x_dim[0] * x_dim[1] * sizeof(T), cudaMemcpyDeviceToHost); + cudaMemcpy(out, tensor_out->data(), x_dim[0] * x_dim[1] * sizeof(T), + cudaMemcpyDeviceToHost); if (!is_test) { cudaMemcpy((*mask).data(), tensor_mask->data(), x_dim[0] * x_dim[1] * sizeof(uint8_t), cudaMemcpyDeviceToHost); @@ -75,24 +75,23 @@ void Dropout(const std::vector &x, const framework::DDim &x_dim, } template -void DropoutGrad(std::vector *dx, const framework::DDim &x_dim, - const std::vector &dout, const std::vector &mask, - const platform::CUDADeviceContext &ctx, float dropout_prob, - bool is_upscale_in_train) { +void 
DropoutGrad(T *dx, const framework::DDim &x_dim, const T *dout, + const uint8_t *mask, const platform::CUDADeviceContext &ctx, + float dropout_prob, bool is_upscale_in_train) { framework::Scope scope; const size_t n = x_dim[0] * x_dim[1]; auto var_out = scope.Var("DOut"); auto tensor_out = var_out->GetMutable(); tensor_out->Resize(x_dim); tensor_out->mutable_data(ctx.GetPlace()); - cudaMemcpy(tensor_out->data(), dout.data(), n * sizeof(T), + cudaMemcpy(tensor_out->data(), dout, n * sizeof(T), cudaMemcpyHostToDevice); auto var_mask = scope.Var("Mask"); auto tensor_mask = var_mask->GetMutable(); tensor_mask->Resize(x_dim); tensor_mask->mutable_data(ctx.GetPlace()); - cudaMemcpy(tensor_mask->data(), mask.data(), n * sizeof(uint8_t), + cudaMemcpy(tensor_mask->data(), mask, n * sizeof(uint8_t), cudaMemcpyHostToDevice); auto var_dx = scope.Var("DX"); @@ -112,8 +111,8 @@ void DropoutGrad(std::vector *dx, const framework::DDim &x_dim, {{"X@GRAD", {"DX"}}}, attrs); op->Run(scope, ctx.GetPlace()); - cudaMemcpy((*dx).data(), tensor_dx->data(), - x_dim[0] * x_dim[1] * sizeof(T), cudaMemcpyDeviceToHost); + cudaMemcpy(dx, tensor_dx->data(), x_dim[0] * x_dim[1] * sizeof(T), + cudaMemcpyDeviceToHost); ctx.Wait(); } @@ -211,17 +210,20 @@ struct TestFusedResidualDropoutBias { void BaseForward() { std::vector out1(_rows * _cols), out2(_rows * _cols); if (_has_bias) { + // add bias for (int i = 0; i < _rows; i++) { for (int j = 0; j < _cols; j++) { out1[i * _cols + j] = _src_vec[i * _cols + j] + _bias_vec[j]; } } - Dropout(out1, _src.dims(), &out2, &_correct_mask, *_ctx, _seed, - _dropout_prob, _is_upscale_in_train, _is_test); + // call dropout + Dropout(out1.data(), _src.dims(), out2.data(), &_correct_mask, *_ctx, + _seed, _dropout_prob, _is_upscale_in_train, _is_test); } else { - Dropout(_src_vec, _src.dims(), &out2, &_correct_mask, *_ctx, _seed, - _dropout_prob, _is_upscale_in_train, _is_test); + Dropout(_src_vec.data(), _src.dims(), out2.data(), &_correct_mask, + *_ctx, _seed, 
_dropout_prob, _is_upscale_in_train, _is_test); } + // add residual for (int i = 0; i < _rows; i++) { for (int j = 0; j < _cols; j++) { _correct_out[i * _cols + j] = @@ -232,14 +234,10 @@ struct TestFusedResidualDropoutBias { } void BaseBackward() { - if (!_is_upscale_in_train) { - for (int i = 0; i < _rows * _cols; i++) { - _correct_dsrc[i] = _correct_out[i] * static_cast(_correct_mask[i]); - } - } else { - DropoutGrad(&_correct_dsrc, _src.dims(), _correct_out, _correct_mask, - *_ctx, _dropout_prob, _is_upscale_in_train); - } + DropoutGrad(_correct_dsrc.data(), _src.dims(), _correct_out.data(), + _correct_mask.data(), *_ctx, _dropout_prob, + _is_upscale_in_train); + // calc dbias memset(&_correct_dbias[0], 0, _cols * sizeof(T)); for (int i = 0; i < _rows; i++) { for (int j = 0; j < _cols; j++) { From 462caa1f3226012289c5245b9c478294f9951a91 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 24 Aug 2021 04:46:40 +0000 Subject: [PATCH 03/19] resolve review comments and add comments to the code --- paddle/fluid/operators/fused/CMakeLists.txt | 2 +- paddle/fluid/operators/fused/fused_dropout.h | 70 ++++++ .../fused/fused_residual_dropout_bias.h | 204 +++++++++--------- ...cu => fused_residual_dropout_bias_test.cu} | 25 ++- 4 files changed, 192 insertions(+), 109 deletions(-) create mode 100644 paddle/fluid/operators/fused/fused_dropout.h rename paddle/fluid/operators/fused/{test_fused_residual_dropout_bias.cu => fused_residual_dropout_bias_test.cu} (95%) diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 525f6504f9fa61..78ff136c4d1038 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -74,6 +74,6 @@ if (WITH_GPU OR WITH_ROCM) # fused_dropout # only support CUDA if(NOT WITH_ROCM) - nv_test(test_fused_residual_dropout_bias SRCS test_fused_residual_dropout_bias.cu DEPS tensor op_registry elementwise_add_op dropout_op device_context generator) + 
nv_test(test_fused_residual_dropout_bias SRCS fused_residual_dropout_bias_test.cu DEPS tensor op_registry elementwise_add_op dropout_op device_context generator) endif() endif() diff --git a/paddle/fluid/operators/fused/fused_dropout.h b/paddle/fluid/operators/fused/fused_dropout.h new file mode 100644 index 00000000000000..bd6a4122f5830d --- /dev/null +++ b/paddle/fluid/operators/fused/fused_dropout.h @@ -0,0 +1,70 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include +#include + +#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +/** + * get 1D threads and blocks + */ +template +inline std::pair Get1DThreadsAndBlocks( + const platform::CUDADeviceContext &ctx, const uint64_t n) { + const uint64_t tmp_n = n / VecSize; + int threads = std::max( + (uint64_t)32, std::min(tmp_n, (uint64_t)ctx.GetMaxThreadsPerBlock())); + int blocks = std::max((uint64_t)1, (tmp_n + threads - 1) / threads); + return std::pair{threads, blocks}; +} + +/** + * get the threads for fused_residual_dropout_bias: + * 1D blocks: blockDim.x = cols + * 2D grids: gridDim.y = rows + */ +template +inline std::pair Get1DBlocksAnd2DGrids( + const platform::CUDADeviceContext &ctx, const uint32_t rows, + const uint32_t cols) { + const uint32_t tmp_cols = cols / VecSize; + int threads = std::max( + (uint32_t)32, std::min(tmp_cols, (uint32_t)ctx.GetMaxThreadsPerBlock())); + int blocks_x = std::max((uint32_t)1, (tmp_cols + threads - 1) / threads); + int blocks_y = std::max((uint32_t)1, rows); + dim3 block_dim(threads, 1, 1); + dim3 grid_dim(blocks_x, blocks_y, 1); + return std::pair{block_dim, grid_dim}; +} + +// aligned vector generates vectorized load/store on CUDA +template +struct alignas(sizeof(T) * VecSize) AlignedVector { + T val[VecSize]; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index 2d0de22952c88a..16747d7739be1e 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,18 +14,7 @@ limitations under the License. */ #pragma once -#include -#include -#include - -#include -#include - -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/float16.h" - -const int VecSize = 4; +#include "paddle/fluid/operators/fused/fused_dropout.h" namespace paddle { namespace operators { @@ -33,47 +22,11 @@ namespace operators { namespace platform = paddle::platform; namespace cg = cooperative_groups; -inline std::pair GetResidualDropoutThreads( - const platform::CUDADeviceContext &ctx, const uint64_t n) { - const uint64_t tmp_n = n / VecSize; - int threads = std::max( - (uint64_t)32, std::min(tmp_n, (uint64_t)ctx.GetMaxThreadsPerBlock())); - int blocks = std::max((uint64_t)1, (tmp_n + threads - 1) / threads); - return std::pair{threads, blocks}; -} - -inline std::pair GetResidualDropoutBiasThreads( - const platform::CUDADeviceContext &ctx, const uint32_t rows, - const uint32_t cols) { - const uint32_t tmp_cols = cols / VecSize; - int threads = std::max( - (uint32_t)32, std::min(tmp_cols, (uint32_t)ctx.GetMaxThreadsPerBlock())); - int blocks_x = std::max((uint32_t)1, (tmp_cols + threads - 1) / threads); - int blocks_y = std::max((uint32_t)1, rows); - dim3 block_dim(threads, 1, 1); - dim3 grid_dim(blocks_x, blocks_y, 1); - return std::pair{block_dim, grid_dim}; -} - /********Forward**************/ -// aligned vector generates vectorized load/store on CUDA -template -struct alignas(sizeof(T) * Size) AlignedVector { - T val[Size]; -}; - -template -inline int VectorizedSize(const T *pointer) { - uint64_t address = reinterpret_cast(pointer); - constexpr int vec4 = std::alignment_of>::value; // NOLINT - if (address % vec4 == 0) { - return 4; - } - return 1; -} - /** - * dst = residual + dropout(src + bias); + * @brief dst = residual + 
dropout(src + bias); + * the src, residual, mask and dst shape is (rows, cols) + * the bias shape is (1, cols) */ template __global__ void FusedResidualDropoutBiasVec(const size_t rows, @@ -81,7 +34,7 @@ __global__ void FusedResidualDropoutBiasVec(const size_t rows, const float dropout_prob, const bool is_upscale_in_train, const T *src, const T *residual, - const T *bias, MaskType *mask_data, + const T *bias, MaskType *mask, T *dst, uint64_t increment) { int col_id = blockDim.x * blockIdx.x + threadIdx.x; int row_id = blockIdx.y; @@ -89,11 +42,9 @@ __global__ void FusedResidualDropoutBiasVec(const size_t rows, curandStatePhilox4_32_10_t state; curand_init(seed, idx, increment, &state); - T dest; - MaskType mask; T factor = static_cast(1.0f / (1.0f - dropout_prob)); if (!is_upscale_in_train) { - factor = static_cast(1.0); + factor = static_cast(1.0f); } using LoadT = AlignedVector; using MaskLoadT = AlignedVector; @@ -107,6 +58,7 @@ __global__ void FusedResidualDropoutBiasVec(const size_t rows, for (int ii = 0; ii < VecSize; ii++) { bias_vec[ii] = static_cast(0); } + // vectorize load data from global LoadT *value = reinterpret_cast(&src_vec); LoadT *residual_value = reinterpret_cast(&residual_vec); *value = *reinterpret_cast(&src[r * cols + i]); @@ -133,58 +85,77 @@ __global__ void FusedResidualDropoutBiasVec(const size_t rows, static_cast(mask_vec[ii]) * factor + residual_vec[ii]; } + + // store result to global *(reinterpret_cast(&dst[r * cols + i])) = *reinterpret_cast(&dest_vec[0]); - *(reinterpret_cast(&mask_data[r * cols + i])) = + *(reinterpret_cast(&mask[r * cols + i])) = *reinterpret_cast(&mask_vec[0]); } } } +/** + * @brief for dropout's param is_test = true + * the src, residual and dst shape is (rows, cols) + * the bias shape is (1, cols) + */ template -__global__ void FusedResidualDropoutBiasTest(const size_t rows, - const size_t cols, - const float dropout_prob, - const bool is_upscale_in_train, - const T *src, const T *residual, - const T *bias, T 
*dst) { +__global__ void FusedResidualDropoutBiasIsTest(const size_t rows, + const size_t cols, + const float dropout_prob, + const bool is_upscale_in_train, + const T *src, const T *residual, + const T *bias, T *dst) { int col_id = blockDim.x * blockIdx.x + threadIdx.x; int row_id = blockIdx.y; int idx = row_id * cols + col_id; T factor = static_cast(1.0f - dropout_prob); if (is_upscale_in_train) { - factor = static_cast(1.0); + factor = static_cast(1.0f); } + + using LoadT = AlignedVector; + const int tmp_cols = cols / VecSize * VecSize; for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { for (int i = col_id * VecSize; i < tmp_cols; i += blockDim.x * gridDim.x * VecSize) { + T src_vec[VecSize]; + T residual_vec[VecSize]; + T bias_vec[VecSize]; #pragma unroll - for (int j = 0; j < VecSize; j++) { - dst[r * cols + i + j] = - (src[r * cols + i + j] + - (bias != nullptr ? bias[i + j] : static_cast(0.0))) * - factor + - residual[r * cols + i + j]; + for (int ii = 0; ii < VecSize; ii++) { + bias_vec[ii] = static_cast(0); } - } + // vectorize load data from global + LoadT *value = reinterpret_cast(&src_vec); + LoadT *residual_value = reinterpret_cast(&residual_vec); + *value = *reinterpret_cast(&src[r * cols + i]); + *residual_value = + *reinterpret_cast(&residual[r * cols + i]); - int high_index = tmp_cols + col_id; - if (high_index < cols) { - for (int i = high_index; i < cols; i++) { - dst[r * cols + i] = - (src[r * cols + i] + - (bias != nullptr ? bias[i] : static_cast(0.0))) * - factor + - residual[r * cols + i]; + LoadT *bias_value = + bias != nullptr ? 
reinterpret_cast(&bias_vec) : nullptr; + if (bias != nullptr) + *bias_value = *reinterpret_cast(&bias[i]); + + T dest_vec[VecSize]; +#pragma unroll + for (int ii = 0; ii < VecSize; ii++) { + dest_vec[ii] = (src_vec[ii] + bias_vec[ii]) * factor + residual_vec[ii]; } + + // store result to global + *(reinterpret_cast(&dst[r * cols + i])) = + *reinterpret_cast(&dest_vec[0]); } } } /** - * dst = residual + dropout(src + bias); + * @brief dst = residual + dropout(src + bias); */ template void LaunchResidualDropoutBias(const uint32_t rows, const uint32_t cols, @@ -194,7 +165,8 @@ void LaunchResidualDropoutBias(const uint32_t rows, const uint32_t cols, const T *residual, const T *bias, MaskType *mask_data, T *dst, const platform::CUDADeviceContext &ctx) { - if (std::abs(dropout_prob - 1.0) < 1e-5) { + // dropout_prob == 1.0f + if (std::abs(dropout_prob - 1.0f) < 1e-5) { PADDLE_ENFORCE_CUDA_SUCCESS( cudaMemcpyAsync(dst, residual, rows * cols * sizeof(T), cudaMemcpyDeviceToDevice, ctx.stream())); @@ -203,7 +175,8 @@ void LaunchResidualDropoutBias(const uint32_t rows, const uint32_t cols, return; } - auto threads = GetResidualDropoutBiasThreads(ctx, rows, cols); + const int VecSize = 4; + auto threads = Get1DBlocksAnd2DGrids(ctx, rows, cols); if (cols % VecSize != 0) FusedResidualDropoutBiasVec< T, uint8_t, 1><<>>( @@ -217,25 +190,39 @@ void LaunchResidualDropoutBias(const uint32_t rows, const uint32_t cols, bias, mask_data, dst, increment); } +/** + *@brief to launch kernel FusedResidualDropoutBiasIsTest + */ template -void LaunchResidualDropoutBiasTest(const uint32_t rows, const uint32_t cols, - const float dropout_prob, - bool is_upscale_in_train, const T *src, - const T *residual, const T *bias, T *dst, - const platform::CUDADeviceContext &ctx) { - if (std::abs(dropout_prob - 1.0) < 1e-5) { +void LaunchResidualDropoutBiasIsTest(const uint32_t rows, const uint32_t cols, + const float dropout_prob, + bool is_upscale_in_train, const T *src, + const T *residual, const T 
*bias, T *dst, + const platform::CUDADeviceContext &ctx) { + if (std::abs(dropout_prob - 1.0f) < 1e-5) { PADDLE_ENFORCE_CUDA_SUCCESS( cudaMemcpyAsync(dst, residual, rows * cols * sizeof(T), cudaMemcpyDeviceToDevice, ctx.stream())); return; } - auto threads = GetResidualDropoutBiasThreads(ctx, rows, cols); - FusedResidualDropoutBiasTest< - T, VecSize><<>>( - rows, cols, dropout_prob, is_upscale_in_train, src, residual, bias, dst); + const int VecSize = 4; + auto threads = Get1DBlocksAnd2DGrids(ctx, rows, cols); + if (cols % VecSize != 0) + FusedResidualDropoutBiasIsTest< + T, 1><<>>( + rows, cols, dropout_prob, is_upscale_in_train, src, residual, bias, + dst); + else + FusedResidualDropoutBiasIsTest< + T, VecSize><<>>( + rows, cols, dropout_prob, is_upscale_in_train, src, residual, bias, + dst); } /********Backward**************/ +/* + * @brief calculate the grad of no bias + */ template __global__ void FusedResidualDropoutGradVec(const T *dout, const MaskType *mask, const T factor, const int64_t size, @@ -262,9 +249,6 @@ __global__ void FusedResidualDropoutGradVec(const T *dout, const MaskType *mask, } } -template -__device__ void reduce_sum(T cache[BSX * VecSize][BSY]) {} - template static __forceinline__ __device__ U WarpReduceSum(U val) { unsigned mask = 0u; @@ -276,6 +260,12 @@ static __forceinline__ __device__ U WarpReduceSum(U val) { return val; } +/** + * blocks(128 * 8) + * 1. calculate the dx and reduce total rows to 128 rows + * 2. save 128*8 temporary sum in 8*128 shared memory + * 3. 
reduce the sum of 128 rows data by 8*VecSize warps + */ template __global__ void FusedResidualDropoutBiasGradVec( const T *dout, const MaskType *mask, const T factor, const int64_t rows, @@ -286,6 +276,7 @@ __global__ void FusedResidualDropoutBiasGradVec( using MaskLoadT = AlignedVector; T tmp_sum[VecSize] = {static_cast(0)}; + // calculate the dx and temporary sum if (col_id * VecSize < cols) { for (int row_id = threadIdx.y; row_id < rows; row_id += blockDim.y) { int index = row_id * cols + col_id * VecSize; @@ -309,6 +300,7 @@ __global__ void FusedResidualDropoutBiasGradVec( } } + // save temporary sum to cache and do transpose __shared__ T cache[BSX * VecSize][BSY]; for (int i = 0; i < VecSize; i++) cache[threadIdx.x * VecSize + i][threadIdx.y] = tmp_sum[i]; @@ -317,24 +309,31 @@ __global__ void FusedResidualDropoutBiasGradVec( // reduce sum T sum = static_cast(0); int tid = threadIdx.y * blockDim.x + threadIdx.x; - int x = tid >> 5; - int y = tid & 31; + int x = tid >> 5; // warp id + int y = tid & 31; // thread id on warp 0~31 + // need BSX * VecSize warps if (x < BSX * VecSize) { +// reduce 128 to 32 #pragma unroll for (int i = 0; i < (BSY >> 5); i++) { sum += cache[x][y + i * 32]; } } + // reduce 32 to 1 sum = WarpReduceSum(sum); + // save sum to dbias int bias_id = blockIdx.x * blockDim.x * VecSize + x; if (y == 0 && x < VecSize * BSX && bias_id < cols) { dbias[bias_id] = sum; } } +/** + * @brief to launch kernel FusedResidualDropoutBiasGradVec + */ template void LaunchResidualDropoutBiasGrad(const T *dout, const MaskType *mask, const float dropout_prob, @@ -342,14 +341,15 @@ void LaunchResidualDropoutBiasGrad(const T *dout, const MaskType *mask, const uint32_t rows, const uint32_t cols, T *dx, T *dbias, const platform::CUDADeviceContext &ctx) { - const T zero = static_cast(0.0); - auto factor = dropout_prob == static_cast(1.0) + const T zero = static_cast(0.0f); + auto factor = dropout_prob == static_cast(1.0f) ? 
zero - : static_cast(1.0 / (1.0 - dropout_prob)); + : static_cast(1.0f / (1.0f - dropout_prob)); if (!is_upscale_in_train) { - factor = static_cast(1.0); + factor = static_cast(1.0f); } + const int VecSize = 4; if (dbias != nullptr) { int real_vec_size = VecSize; if (cols % VecSize != 0) real_vec_size = 1; @@ -372,7 +372,7 @@ void LaunchResidualDropoutBiasGrad(const T *dout, const MaskType *mask, } } else { const uint64_t n = rows * cols; - auto threads = GetResidualDropoutThreads(ctx, n); + auto threads = Get1DThreadsAndBlocks(ctx, n); if (n % VecSize == 0) { FusedResidualDropoutGradVec< T, MaskType, diff --git a/paddle/fluid/operators/fused/test_fused_residual_dropout_bias.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu similarity index 95% rename from paddle/fluid/operators/fused/test_fused_residual_dropout_bias.cu rename to paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu index 12c2fd6be68360..5cd20dce57855b 100644 --- a/paddle/fluid/operators/fused/test_fused_residual_dropout_bias.cu +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -29,9 +29,19 @@ limitations under the License. */ namespace framework = paddle::framework; namespace platform = paddle::platform; -USE_OP(elementwise_add); USE_OP(dropout); +/** + * @brief the unittest of fused_residual_dropout_bias + * 1. random input data + * 2. add bias, call paddle dropout op, add residual, and get the base result + * 3. call FusedResidualDropoutBias function get fused result + * 4. 
compare ther base result and fused result + */ + +/** + * @brief call paddle dropout op + */ template void Dropout(const T *x, const framework::DDim &x_dim, T *out, std::vector *mask, const platform::CUDADeviceContext &ctx, @@ -74,6 +84,9 @@ void Dropout(const T *x, const framework::DDim &x_dim, T *out, ctx.Wait(); } +/** + * @brief call paddle dropout_grad op + */ template void DropoutGrad(T *dx, const framework::DDim &x_dim, const T *dout, const uint8_t *mask, const platform::CUDADeviceContext &ctx, @@ -247,8 +260,9 @@ struct TestFusedResidualDropoutBias { } void FusedForward() { - auto threads = paddle::operators::GetResidualDropoutBiasThreads( + auto threads = paddle::operators::Get1DBlocksAnd2DGrids( *_ctx, (uint64_t)_rows, (uint64_t)_cols); + const int VecSize = 4; const int increment = ((_cols - 1) / (threads.first.x * threads.second.x * VecSize) + 1) * VecSize; @@ -258,7 +272,7 @@ struct TestFusedResidualDropoutBias { bias_ptr = _bias.data(); } if (_is_test) { - paddle::operators::LaunchResidualDropoutBiasTest( + paddle::operators::LaunchResidualDropoutBiasIsTest( _rows, _cols, _dropout_prob, _is_upscale_in_train, _src.data(), _residual.data(), bias_ptr, _out.data(), *_ctx); } else { @@ -351,14 +365,13 @@ TEST(FusedDropout, GPUFusedRedisualDorpoutBiasDouble) { test.CheckGrad(static_cast(1e-5)); } +// test fp16, For inference, check_grad is not required. ref: test_dropout_op.py TEST(FusedDropout, GPUFusedRedisualDorpoutBiasFp16) { const int rows = 16; const int cols = 16; TestFusedResidualDropoutBias test(rows, cols); test.Run(); test.CheckOut(static_cast(1e-2)); - // For inference, check_grad is not required. 
ref: test_dropout_op.py - // test.CheckGrad((platform::float16)1e-2); } // test no bias and cols % 4 == 0 From 93e063864f56f15e2e88cae77e6a5b5b633ab3e5 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 24 Aug 2021 12:35:20 +0000 Subject: [PATCH 04/19] fused_dropout: optimize code structure to facilitate reuse --- paddle/fluid/operators/fused/CMakeLists.txt | 2 +- paddle/fluid/operators/fused/fused_dropout.h | 12 ++ .../operators/fused/fused_dropout_test.h | 121 ++++++++++++++++++ .../fused/fused_residual_dropout_bias.h | 15 +-- .../fused/fused_residual_dropout_bias_test.cu | 100 +-------------- 5 files changed, 136 insertions(+), 114 deletions(-) create mode 100644 paddle/fluid/operators/fused/fused_dropout_test.h diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 78ff136c4d1038..f3035cddcba020 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -74,6 +74,6 @@ if (WITH_GPU OR WITH_ROCM) # fused_dropout # only support CUDA if(NOT WITH_ROCM) - nv_test(test_fused_residual_dropout_bias SRCS fused_residual_dropout_bias_test.cu DEPS tensor op_registry elementwise_add_op dropout_op device_context generator) + nv_test(test_fused_residual_dropout_bias SRCS fused_residual_dropout_bias_test.cu DEPS tensor op_registry dropout_op device_context generator) endif() endif() diff --git a/paddle/fluid/operators/fused/fused_dropout.h b/paddle/fluid/operators/fused/fused_dropout.h index bd6a4122f5830d..4188d935b9e458 100644 --- a/paddle/fluid/operators/fused/fused_dropout.h +++ b/paddle/fluid/operators/fused/fused_dropout.h @@ -66,5 +66,17 @@ struct alignas(sizeof(T) * VecSize) AlignedVector { T val[VecSize]; }; +// reduce sum by a warp +template +static __forceinline__ __device__ U WarpReduceSum(U val) { + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, true); + const int warpSize = 32; + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + val += 
paddle::platform::CudaShuffleDownSync(mask, val, offset); + } + return val; +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h new file mode 100644 index 00000000000000..6cb8cd19b608d1 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_dropout_test.h @@ -0,0 +1,121 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; + +USE_OP(dropout); + +/** + * @brief call paddle dropout op + */ +template +void Dropout(const T *x, const framework::DDim &x_dim, T *out, + std::vector *mask, const platform::CUDADeviceContext &ctx, + uint64_t seed, float dropout_prob, bool is_upscale_in_train, + bool is_test) { + framework::Scope scope; + auto var_x = scope.Var("X"); + auto tensor_x = var_x->GetMutable(); + tensor_x->Resize(x_dim); + tensor_x->mutable_data(ctx.GetPlace()); + cudaMemcpy(tensor_x->data(), x, x_dim[0] * x_dim[1] * sizeof(T), + cudaMemcpyHostToDevice); + + auto var_out = 
scope.Var("Out"); + auto tensor_out = var_out->GetMutable(); + + auto var_mask = scope.Var("Mask"); + auto tensor_mask = var_mask->GetMutable(); + + framework::AttributeMap attrs; + attrs.insert({"fix_seed", 1}); + attrs.insert({"seed", static_cast(seed)}); + attrs.insert({"dropout_prob", dropout_prob}); + if (is_upscale_in_train) { + attrs.insert({"dropout_implementation", std::string("upscale_in_train")}); + } + if (is_test) { + attrs.insert({"is_test", 1}); + } + + auto op = framework::OpRegistry::CreateOp( + "dropout", {{"X", {"X"}}}, {{"Out", {"Out"}}, {"Mask", {"Mask"}}}, attrs); + op->Run(scope, ctx.GetPlace()); + cudaMemcpy(out, tensor_out->data(), x_dim[0] * x_dim[1] * sizeof(T), + cudaMemcpyDeviceToHost); + if (!is_test) { + cudaMemcpy((*mask).data(), tensor_mask->data(), + x_dim[0] * x_dim[1] * sizeof(uint8_t), cudaMemcpyDeviceToHost); + } + ctx.Wait(); +} + +/** + * @brief call paddle dropout_grad op + */ +template +void DropoutGrad(T *dx, const framework::DDim &x_dim, const T *dout, + const uint8_t *mask, const platform::CUDADeviceContext &ctx, + float dropout_prob, bool is_upscale_in_train) { + framework::Scope scope; + const size_t n = x_dim[0] * x_dim[1]; + auto var_out = scope.Var("DOut"); + auto tensor_out = var_out->GetMutable(); + tensor_out->Resize(x_dim); + tensor_out->mutable_data(ctx.GetPlace()); + cudaMemcpy(tensor_out->data(), dout, n * sizeof(T), + cudaMemcpyHostToDevice); + + auto var_mask = scope.Var("Mask"); + auto tensor_mask = var_mask->GetMutable(); + tensor_mask->Resize(x_dim); + tensor_mask->mutable_data(ctx.GetPlace()); + cudaMemcpy(tensor_mask->data(), mask, n * sizeof(uint8_t), + cudaMemcpyHostToDevice); + + auto var_dx = scope.Var("DX"); + auto tensor_dx = var_dx->GetMutable(); + + framework::AttributeMap attrs; + attrs.insert({"dropout_prob", dropout_prob}); + attrs.insert({"is_test", 0}); + if (is_upscale_in_train) { + attrs.insert({"dropout_implementation", std::string("upscale_in_train")}); + } else { + 
attrs.insert({"dropout_implementation", std::string("downgrade_in_infer")}); + } + + auto op = framework::OpRegistry::CreateOp( + "dropout_grad", {{"Out@GRAD", {"DOut"}}, {"Mask", {"Mask"}}}, + {{"X@GRAD", {"DX"}}}, attrs); + op->Run(scope, ctx.GetPlace()); + + cudaMemcpy(dx, tensor_dx->data(), x_dim[0] * x_dim[1] * sizeof(T), + cudaMemcpyDeviceToHost); + ctx.Wait(); +} diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index 16747d7739be1e..ce9273dff0a993 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -118,9 +118,8 @@ __global__ void FusedResidualDropoutBiasIsTest(const size_t rows, using LoadT = AlignedVector; - const int tmp_cols = cols / VecSize * VecSize; for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { - for (int i = col_id * VecSize; i < tmp_cols; + for (int i = col_id * VecSize; i < cols; i += blockDim.x * gridDim.x * VecSize) { T src_vec[VecSize]; T residual_vec[VecSize]; @@ -249,17 +248,6 @@ __global__ void FusedResidualDropoutGradVec(const T *dout, const MaskType *mask, } } -template -static __forceinline__ __device__ U WarpReduceSum(U val) { - unsigned mask = 0u; - CREATE_SHFL_MASK(mask, true); - const int warpSize = 32; - for (int offset = warpSize / 2; offset > 0; offset /= 2) { - val += paddle::platform::CudaShuffleDownSync(mask, val, offset); - } - return val; -} - /** * blocks(128 * 8) * 1. 
calculate the dx and reduce total rows to 128 rows @@ -285,7 +273,6 @@ __global__ void FusedResidualDropoutBiasGradVec( T dx_vec[VecSize]; LoadT *out_value = reinterpret_cast(&out_vec); MaskLoadT *mask_value = reinterpret_cast(&mask_vec); - LoadT *dx_value = reinterpret_cast(&dx_vec); *out_value = *reinterpret_cast(&dout[index]); *mask_value = *reinterpret_cast(&mask[index]); diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu index 5cd20dce57855b..fa119f1132e8f6 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu @@ -17,20 +17,12 @@ limitations under the License. */ #include #include -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/fused/fused_dropout_test.h" #include "paddle/fluid/operators/fused/fused_residual_dropout_bias.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/string/printf.h" namespace framework = paddle::framework; namespace platform = paddle::platform; -USE_OP(dropout); - /** * @brief the unittest of fused_residual_dropout_bias * 1. random input data @@ -39,96 +31,6 @@ USE_OP(dropout); * 4. 
compare ther base result and fused result */ -/** - * @brief call paddle dropout op - */ -template -void Dropout(const T *x, const framework::DDim &x_dim, T *out, - std::vector *mask, const platform::CUDADeviceContext &ctx, - uint64_t seed, float dropout_prob, bool is_upscale_in_train, - bool is_test) { - framework::Scope scope; - auto var_x = scope.Var("X"); - auto tensor_x = var_x->GetMutable(); - tensor_x->Resize(x_dim); - tensor_x->mutable_data(ctx.GetPlace()); - cudaMemcpy(tensor_x->data(), x, x_dim[0] * x_dim[1] * sizeof(T), - cudaMemcpyHostToDevice); - - auto var_out = scope.Var("Out"); - auto tensor_out = var_out->GetMutable(); - - auto var_mask = scope.Var("Mask"); - auto tensor_mask = var_mask->GetMutable(); - - framework::AttributeMap attrs; - attrs.insert({"fix_seed", 1}); - attrs.insert({"seed", static_cast(seed)}); - attrs.insert({"dropout_prob", dropout_prob}); - if (is_upscale_in_train) { - attrs.insert({"dropout_implementation", std::string("upscale_in_train")}); - } - if (is_test) { - attrs.insert({"is_test", 1}); - } - - auto op = framework::OpRegistry::CreateOp( - "dropout", {{"X", {"X"}}}, {{"Out", {"Out"}}, {"Mask", {"Mask"}}}, attrs); - op->Run(scope, ctx.GetPlace()); - cudaMemcpy(out, tensor_out->data(), x_dim[0] * x_dim[1] * sizeof(T), - cudaMemcpyDeviceToHost); - if (!is_test) { - cudaMemcpy((*mask).data(), tensor_mask->data(), - x_dim[0] * x_dim[1] * sizeof(uint8_t), cudaMemcpyDeviceToHost); - } - ctx.Wait(); -} - -/** - * @brief call paddle dropout_grad op - */ -template -void DropoutGrad(T *dx, const framework::DDim &x_dim, const T *dout, - const uint8_t *mask, const platform::CUDADeviceContext &ctx, - float dropout_prob, bool is_upscale_in_train) { - framework::Scope scope; - const size_t n = x_dim[0] * x_dim[1]; - auto var_out = scope.Var("DOut"); - auto tensor_out = var_out->GetMutable(); - tensor_out->Resize(x_dim); - tensor_out->mutable_data(ctx.GetPlace()); - cudaMemcpy(tensor_out->data(), dout, n * sizeof(T), - 
cudaMemcpyHostToDevice); - - auto var_mask = scope.Var("Mask"); - auto tensor_mask = var_mask->GetMutable(); - tensor_mask->Resize(x_dim); - tensor_mask->mutable_data(ctx.GetPlace()); - cudaMemcpy(tensor_mask->data(), mask, n * sizeof(uint8_t), - cudaMemcpyHostToDevice); - - auto var_dx = scope.Var("DX"); - auto tensor_dx = var_dx->GetMutable(); - - framework::AttributeMap attrs; - attrs.insert({"dropout_prob", dropout_prob}); - attrs.insert({"is_test", 0}); - if (is_upscale_in_train) { - attrs.insert({"dropout_implementation", std::string("upscale_in_train")}); - } else { - attrs.insert({"dropout_implementation", std::string("downgrade_in_infer")}); - } - - auto op = framework::OpRegistry::CreateOp( - "dropout_grad", {{"Out@GRAD", {"DOut"}}, {"Mask", {"Mask"}}}, - {{"X@GRAD", {"DX"}}}, attrs); - op->Run(scope, ctx.GetPlace()); - - cudaMemcpy(dx, tensor_dx->data(), x_dim[0] * x_dim[1] * sizeof(T), - cudaMemcpyDeviceToHost); - ctx.Wait(); -} - template struct TestFusedResidualDropoutBias { uint32_t _rows; From 036b4307f58ddfbdeb7b815b138cd59cb09cd600 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 25 Aug 2021 11:59:17 +0000 Subject: [PATCH 05/19] optimize code structure to facilitate reuse --- paddle/fluid/operators/fused/fused_dropout.h | 12 - .../fused/fused_residual_dropout_bias.h | 258 ++++++++++-------- .../fused/fused_residual_dropout_bias_test.cu | 38 ++- 3 files changed, 159 insertions(+), 149 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_dropout.h b/paddle/fluid/operators/fused/fused_dropout.h index 4188d935b9e458..bd6a4122f5830d 100644 --- a/paddle/fluid/operators/fused/fused_dropout.h +++ b/paddle/fluid/operators/fused/fused_dropout.h @@ -66,17 +66,5 @@ struct alignas(sizeof(T) * VecSize) AlignedVector { T val[VecSize]; }; -// reduce sum by a warp -template -static __forceinline__ __device__ U WarpReduceSum(U val) { - unsigned mask = 0u; - CREATE_SHFL_MASK(mask, true); - const int warpSize = 32; - for (int offset = warpSize / 2; offset > 
0; offset /= 2) { - val += paddle::platform::CudaShuffleDownSync(mask, val, offset); - } - return val; -} - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index ce9273dff0a993..0a263635e46029 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/operators/fused/fused_dropout.h" +#include "paddle/fluid/operators/layer_norm_kernel.cu.h" namespace paddle { namespace operators { @@ -22,7 +23,71 @@ namespace operators { namespace platform = paddle::platform; namespace cg = cooperative_groups; +/** + * @brief fused the add_bias, dropout, add residual into one operators + * + */ + /********Forward**************/ +/** + * @brief the fused function called by every thread + */ +template +__forceinline__ __device__ void FusedResidualDropoutBiasVecOneThread( + const int row_id, const int col_id, const int cols, + curandStatePhilox4_32_10_t *state, const float dropout_prob, const T factor, + const T *src, const T *residual, const T *bias, T *dst, MaskType *mask, + U *mean_val, U *var_val) { + using LoadT = AlignedVector; + using MaskLoadT = AlignedVector; + T src_vec[VecSize]; + T residual_vec[VecSize]; + T bias_vec[VecSize]; +#pragma unroll + for (int ii = 0; ii < VecSize; ii++) { + bias_vec[ii] = static_cast(0); + } + // vectorize load data from global + LoadT *value = reinterpret_cast(&src_vec); + LoadT *residual_value = reinterpret_cast(&residual_vec); + *value = *reinterpret_cast(&src[row_id * cols + col_id]); + *residual_value = + *reinterpret_cast(&residual[row_id * cols + col_id]); + + LoadT *bias_value = + bias != nullptr ? 
reinterpret_cast(&bias_vec) : nullptr; + if (bias != nullptr) + *bias_value = *reinterpret_cast(&bias[col_id]); + + float4 rand = curand_uniform4(state); + T dest_vec[VecSize]; + MaskType mask_vec[VecSize]; + +#pragma unroll + for (int ii = 0; ii < VecSize; ii++) { + mask_vec[ii] = (MaskType)((&rand.x)[ii] >= dropout_prob); + } + +#pragma unroll + for (int ii = 0; ii < VecSize; ii++) { + dest_vec[ii] = + (src_vec[ii] + bias_vec[ii]) * static_cast(mask_vec[ii]) * factor + + residual_vec[ii]; + if (layer_norm) { + U tmp = static_cast(dest_vec[ii]); + *mean_val += tmp; + *var_val += (tmp * tmp); + } + } + + // store result to global + *(reinterpret_cast(&dst[row_id * cols + col_id])) = + *reinterpret_cast(&dest_vec[0]); + *(reinterpret_cast(&mask[row_id * cols + col_id])) = + *reinterpret_cast(&mask_vec[0]); +} + /** * @brief dst = residual + dropout(src + bias); * the src, residual, mask and dst shape is (rows, cols) @@ -46,67 +111,71 @@ __global__ void FusedResidualDropoutBiasVec(const size_t rows, if (!is_upscale_in_train) { factor = static_cast(1.0f); } - using LoadT = AlignedVector; - using MaskLoadT = AlignedVector; for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { for (int i = col_id * VecSize; i < cols; i += blockDim.x * gridDim.x * VecSize) { - T src_vec[VecSize]; - T residual_vec[VecSize]; - T bias_vec[VecSize]; -#pragma unroll - for (int ii = 0; ii < VecSize; ii++) { - bias_vec[ii] = static_cast(0); - } - // vectorize load data from global - LoadT *value = reinterpret_cast(&src_vec); - LoadT *residual_value = reinterpret_cast(&residual_vec); - *value = *reinterpret_cast(&src[r * cols + i]); - *residual_value = - *reinterpret_cast(&residual[r * cols + i]); - - LoadT *bias_value = - bias != nullptr ? 
reinterpret_cast(&bias_vec) : nullptr; - if (bias != nullptr) - *bias_value = *reinterpret_cast(&bias[i]); - - float4 rand = curand_uniform4(&state); - T dest_vec[VecSize]; - MaskType mask_vec[VecSize]; + FusedResidualDropoutBiasVecOneThread( + r, i, cols, &state, dropout_prob, factor, src, residual, bias, dst, + mask, NULL, NULL); + } + } +} +/** + * @brief the fused function called by every thread + */ +template +__forceinline__ __device__ void FusedResidualDropoutBiasOnlyInferVecOneThread( + const int row_id, const int col_id, const int cols, + const float dropout_prob, const T factor, const T *src, const T *residual, + const T *bias, T *dst, U *mean_val, U *var_val) { + using LoadT = AlignedVector; + T src_vec[VecSize]; + T residual_vec[VecSize]; + T bias_vec[VecSize]; #pragma unroll - for (int ii = 0; ii < VecSize; ii++) { - mask_vec[ii] = (MaskType)((&rand.x)[ii] >= dropout_prob); - } + for (int ii = 0; ii < VecSize; ii++) { + bias_vec[ii] = static_cast(0); + } + // vectorize load data from global + LoadT *value = reinterpret_cast(&src_vec); + LoadT *residual_value = reinterpret_cast(&residual_vec); + *value = *reinterpret_cast(&src[row_id * cols + col_id]); + *residual_value = + *reinterpret_cast(&residual[row_id * cols + col_id]); -#pragma unroll - for (int ii = 0; ii < VecSize; ii++) { - dest_vec[ii] = (src_vec[ii] + bias_vec[ii]) * - static_cast(mask_vec[ii]) * factor + - residual_vec[ii]; - } + LoadT *bias_value = + bias != nullptr ? 
reinterpret_cast(&bias_vec) : nullptr; + if (bias != nullptr) + *bias_value = *reinterpret_cast(&bias[col_id]); + + T dest_vec[VecSize]; - // store result to global - *(reinterpret_cast(&dst[r * cols + i])) = - *reinterpret_cast(&dest_vec[0]); - *(reinterpret_cast(&mask[r * cols + i])) = - *reinterpret_cast(&mask_vec[0]); +#pragma unroll + for (int ii = 0; ii < VecSize; ii++) { + dest_vec[ii] = (src_vec[ii] + bias_vec[ii]) * factor + residual_vec[ii]; + if (layer_norm) { + U tmp = static_cast(dest_vec[ii]); + *mean_val += tmp; + *var_val += (tmp * tmp); } } + + // store result to global + *(reinterpret_cast(&dst[row_id * cols + col_id])) = + *reinterpret_cast(&dest_vec[0]); } /** - * @brief for dropout's param is_test = true + * @brief for dropout's param is_test = true, only used in inference * the src, residual and dst shape is (rows, cols) * the bias shape is (1, cols) */ template -__global__ void FusedResidualDropoutBiasIsTest(const size_t rows, - const size_t cols, - const float dropout_prob, - const bool is_upscale_in_train, - const T *src, const T *residual, - const T *bias, T *dst) { +__global__ void FusedResidualDropoutBiasOnlyInferVec( + const size_t rows, const size_t cols, const float dropout_prob, + const bool is_upscale_in_train, const T *src, const T *residual, + const T *bias, T *dst) { int col_id = blockDim.x * blockIdx.x + threadIdx.x; int row_id = blockIdx.y; int idx = row_id * cols + col_id; @@ -116,39 +185,12 @@ __global__ void FusedResidualDropoutBiasIsTest(const size_t rows, factor = static_cast(1.0f); } - using LoadT = AlignedVector; - for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { for (int i = col_id * VecSize; i < cols; i += blockDim.x * gridDim.x * VecSize) { - T src_vec[VecSize]; - T residual_vec[VecSize]; - T bias_vec[VecSize]; -#pragma unroll - for (int ii = 0; ii < VecSize; ii++) { - bias_vec[ii] = static_cast(0); - } - // vectorize load data from global - LoadT *value = reinterpret_cast(&src_vec); - LoadT 
*residual_value = reinterpret_cast(&residual_vec); - *value = *reinterpret_cast(&src[r * cols + i]); - *residual_value = - *reinterpret_cast(&residual[r * cols + i]); - - LoadT *bias_value = - bias != nullptr ? reinterpret_cast(&bias_vec) : nullptr; - if (bias != nullptr) - *bias_value = *reinterpret_cast(&bias[i]); - - T dest_vec[VecSize]; -#pragma unroll - for (int ii = 0; ii < VecSize; ii++) { - dest_vec[ii] = (src_vec[ii] + bias_vec[ii]) * factor + residual_vec[ii]; - } - - // store result to global - *(reinterpret_cast(&dst[r * cols + i])) = - *reinterpret_cast(&dest_vec[0]); + FusedResidualDropoutBiasOnlyInferVecOneThread( + r, i, cols, dropout_prob, factor, src, residual, bias, dst, nullptr, + nullptr); } } } @@ -159,7 +201,7 @@ __global__ void FusedResidualDropoutBiasIsTest(const size_t rows, template void LaunchResidualDropoutBias(const uint32_t rows, const uint32_t cols, const int increment, uint64_t seed, - const float dropout_prob, + const float dropout_prob, const bool is_test, bool is_upscale_in_train, const T *src, const T *residual, const T *bias, MaskType *mask_data, T *dst, @@ -176,46 +218,32 @@ void LaunchResidualDropoutBias(const uint32_t rows, const uint32_t cols, const int VecSize = 4; auto threads = Get1DBlocksAnd2DGrids(ctx, rows, cols); - if (cols % VecSize != 0) - FusedResidualDropoutBiasVec< - T, uint8_t, 1><<>>( - rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, - bias, mask_data, dst, increment); - else - FusedResidualDropoutBiasVec< - T, uint8_t, - VecSize><<>>( - rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, - bias, mask_data, dst, increment); -} - -/** - *@brief to launch kernel FusedResidualDropoutBiasIsTest - */ -template -void LaunchResidualDropoutBiasIsTest(const uint32_t rows, const uint32_t cols, - const float dropout_prob, - bool is_upscale_in_train, const T *src, - const T *residual, const T *bias, T *dst, - const platform::CUDADeviceContext &ctx) { - if (std::abs(dropout_prob - 1.0f) 
< 1e-5) { - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMemcpyAsync(dst, residual, rows * cols * sizeof(T), - cudaMemcpyDeviceToDevice, ctx.stream())); - return; + if (cols % VecSize != 0) { + if (!is_test) { + FusedResidualDropoutBiasVec< + T, uint8_t, 1><<>>( + rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, + bias, mask_data, dst, increment); + } else { + FusedResidualDropoutBiasOnlyInferVec< + T, 1><<>>( + rows, cols, dropout_prob, is_upscale_in_train, src, residual, bias, + dst); + } + } else { + if (!is_test) { + FusedResidualDropoutBiasVec< + T, uint8_t, + VecSize><<>>( + rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, + bias, mask_data, dst, increment); + } else { + FusedResidualDropoutBiasOnlyInferVec< + T, VecSize><<>>( + rows, cols, dropout_prob, is_upscale_in_train, src, residual, bias, + dst); + } } - const int VecSize = 4; - auto threads = Get1DBlocksAnd2DGrids(ctx, rows, cols); - if (cols % VecSize != 0) - FusedResidualDropoutBiasIsTest< - T, 1><<>>( - rows, cols, dropout_prob, is_upscale_in_train, src, residual, bias, - dst); - else - FusedResidualDropoutBiasIsTest< - T, VecSize><<>>( - rows, cols, dropout_prob, is_upscale_in_train, src, residual, bias, - dst); } /********Backward**************/ diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu index fa119f1132e8f6..d5377194934ff6 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu @@ -43,7 +43,7 @@ struct TestFusedResidualDropoutBias { framework::Tensor _src, _residual, _bias, _out, _mask; framework::Tensor _dsrc, _dbias; - std::vector _src_vec, _residual_vec, _bias_vec, _out_vec, _mask_vec; + std::vector _src_vec, _residual_vec, _bias_vec; std::vector _correct_out, _correct_dsrc, _correct_dbias; std::vector _correct_mask; @@ -173,16 +173,10 @@ struct TestFusedResidualDropoutBias 
{ if (_has_bias) { bias_ptr = _bias.data(); } - if (_is_test) { - paddle::operators::LaunchResidualDropoutBiasIsTest( - _rows, _cols, _dropout_prob, _is_upscale_in_train, _src.data(), - _residual.data(), bias_ptr, _out.data(), *_ctx); - } else { - paddle::operators::LaunchResidualDropoutBias( - _rows, _cols, increment, _seed, _dropout_prob, _is_upscale_in_train, - _src.data(), _residual.data(), bias_ptr, _mask.data(), - _out.data(), *_ctx); - } + paddle::operators::LaunchResidualDropoutBias( + _rows, _cols, increment, _seed, _dropout_prob, _is_test, + _is_upscale_in_train, _src.data(), _residual.data(), bias_ptr, + _mask.data(), _out.data(), *_ctx); _ctx->Wait(); } @@ -249,7 +243,7 @@ struct TestFusedResidualDropoutBias { } }; -TEST(FusedDropout, GPUFusedRedisualDorpoutBias) { +TEST(FusedDropout, GPUFusedResidualDropoutBias) { const int rows = 16; const int cols = 16; TestFusedResidualDropoutBias test(rows, cols); @@ -258,7 +252,7 @@ TEST(FusedDropout, GPUFusedRedisualDorpoutBias) { test.CheckGrad(static_cast(1e-5)); } -TEST(FusedDropout, GPUFusedRedisualDorpoutBiasDouble) { +TEST(FusedDropout, GPUFusedResidualDropoutBiasDouble) { const int rows = 16; const int cols = 16; TestFusedResidualDropoutBias test(rows, cols); @@ -268,7 +262,7 @@ TEST(FusedDropout, GPUFusedRedisualDorpoutBiasDouble) { } // test fp16, For inference, check_grad is not required. 
ref: test_dropout_op.py -TEST(FusedDropout, GPUFusedRedisualDorpoutBiasFp16) { +TEST(FusedDropout, GPUFusedResidualDropoutBiasFp16) { const int rows = 16; const int cols = 16; TestFusedResidualDropoutBias test(rows, cols); @@ -277,7 +271,7 @@ TEST(FusedDropout, GPUFusedRedisualDorpoutBiasFp16) { } // test no bias and cols % 4 == 0 -TEST(FusedDropout, GPUFusedRedisualDorpoutBiasNoBias) { +TEST(FusedDropout, GPUFusedResidualDropoutBiasNoBias) { const int rows = 16; const int cols = 16; TestFusedResidualDropoutBias test(rows, cols); @@ -288,7 +282,7 @@ TEST(FusedDropout, GPUFusedRedisualDorpoutBiasNoBias) { } // test no bias and cols % 4 != 0 -TEST(FusedDropout, GPUFusedRedisualDorpoutBiasNoBias2) { +TEST(FusedDropout, GPUFusedResidualDropoutBiasNoBias2) { const int rows = 16; const int cols = 17; TestFusedResidualDropoutBias test(rows, cols); @@ -299,7 +293,7 @@ TEST(FusedDropout, GPUFusedRedisualDorpoutBiasNoBias2) { } // test add bias and cols % 4 != 0 -TEST(FusedDropout, GPUFusedRedisualDorpoutBias2) { +TEST(FusedDropout, GPUFusedResidualDropoutBias2) { const int rows = 16; const int cols = 17; TestFusedResidualDropoutBias test(rows, cols); @@ -308,7 +302,7 @@ TEST(FusedDropout, GPUFusedRedisualDorpoutBias2) { test.CheckGrad(static_cast(1e-5)); } -TEST(FusedDropout, GPUFusedRedisualDorpoutBias3) { +TEST(FusedDropout, GPUFusedResidualDropoutBias3) { const int rows = 16; const int cols = 16; TestFusedResidualDropoutBias test(rows, cols, 0, 1.0, false, false); @@ -317,7 +311,7 @@ TEST(FusedDropout, GPUFusedRedisualDorpoutBias3) { test.CheckGrad(static_cast(1e-5)); } -TEST(FusedDropout, GPUFusedRedisualDorpoutBias4) { +TEST(FusedDropout, GPUFusedResidualDropoutBias4) { const int rows = 16; const int cols = 16; TestFusedResidualDropoutBias test(rows, cols, 0, 1.0, false, false); @@ -326,7 +320,7 @@ TEST(FusedDropout, GPUFusedRedisualDorpoutBias4) { test.CheckGrad(static_cast(1e-5)); } -TEST(FusedDropout, GPUFusedRedisualDorpoutBias5) { +TEST(FusedDropout, 
GPUFusedResidualDropoutBias5) { const int rows = 16; const int cols = 16; TestFusedResidualDropoutBias test(rows, cols, 0, 1.0, true, false); @@ -335,7 +329,7 @@ TEST(FusedDropout, GPUFusedRedisualDorpoutBias5) { test.CheckGrad(static_cast(1e-5)); } -TEST(FusedDropout, GPUFusedRedisualDorpoutBias6) { +TEST(FusedDropout, GPUFusedResidualDropoutBias6) { const int rows = 16; const int cols = 16; TestFusedResidualDropoutBias test(rows, cols, 0, 0.35, true, true); @@ -344,7 +338,7 @@ TEST(FusedDropout, GPUFusedRedisualDorpoutBias6) { test.CheckGrad(static_cast(1e-5)); } -TEST(FusedDropout, GPUFusedRedisualDorpoutBias7) { +TEST(FusedDropout, GPUFusedResidualDropoutBias7) { const int rows = 16; const int cols = 16; TestFusedResidualDropoutBias test(rows, cols, 125, 0.0, false, false); From 6755aeacb50ab656174159344c6a0d838cdba3a1 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Fri, 27 Aug 2021 11:37:43 +0000 Subject: [PATCH 06/19] Add a new op: paddle.linalg.multi_dot --- paddle/fluid/operators/multi_dot_op.cc | 658 ++++++++++++++++++ python/paddle/__init__.py | 1 + .../tests/unittests/test_multi_dot_op.py | 298 ++++++++ .../white_list/check_shape_white_list.py | 1 + .../white_list/no_grad_set_white_list.py | 3 +- .../white_list/op_accuracy_white_list.py | 3 +- python/paddle/linalg.py | 4 +- python/paddle/tensor/linalg.py | 68 ++ 8 files changed, 1033 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/operators/multi_dot_op.cc create mode 100644 python/paddle/fluid/tests/unittests/test_multi_dot_op.py diff --git a/paddle/fluid/operators/multi_dot_op.cc b/paddle/fluid/operators/multi_dot_op.cc new file mode 100644 index 00000000000000..59c43ea2f95b6e --- /dev/null +++ b/paddle/fluid/operators/multi_dot_op.cc @@ -0,0 +1,658 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/strided_memcpy.h" +#include "paddle/fluid/operators/utils.h" + +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +/** + * @brief compute the output shape and check the input shape valid or not + */ +inline framework::DDim ComputeAndCheckShape( + const bool is_runtime, const std::vector& inputs_dims) { + const size_t n = inputs_dims.size(); + auto first_dim = inputs_dims[0]; + + bool is_vector = false; + framework::DDim out_dim; + + if (first_dim.size() > 2) { + PADDLE_THROW(platform::errors::InvalidArgument( + "multi_dot: the first input tensor must be 1D or 2D but got[%d]!", + static_cast(first_dim.size()))); + } + + // If the first tensor is 1D of size n view it as a row vector (1, n) + if (first_dim.size() == 1) { + first_dim = framework::make_ddim({1, static_cast(first_dim[0])}); + is_vector = true; + } + + auto last_dim = inputs_dims[n - 1]; + if (last_dim.size() > 2) { + PADDLE_THROW(platform::errors::InvalidArgument( + "the last input tensor of multi_dot op must be 1D or 2D but got[%d]!", + static_cast(last_dim.size()))); + } + + // If the last tensor is 1D of size n view it as a column vector (n, 1) + if (last_dim.size() == 1) { + last_dim = framework::make_ddim({static_cast(last_dim[0]), 1}); + if (is_vector) { + out_dim = 
framework::make_ddim({1}); + } else { + out_dim = framework::make_ddim({first_dim[0]}); + } + } else { + if (is_vector) { + out_dim = framework::make_ddim({last_dim[1]}); + } else { + out_dim = framework::make_ddim({first_dim[0], last_dim[1]}); + } + } + + auto width = first_dim[1]; + for (size_t i = 1; i < n - 1; i++) { + PADDLE_ENFORCE_EQ(inputs_dims[i].size(), static_cast(2), + platform::errors::InvalidArgument( + "the input tensor of multi_dot op must be 2D.")); + + const auto& tmp_dim = inputs_dims[i]; + PADDLE_ENFORCE_EQ(tmp_dim[0], width, + platform::errors::InvalidArgument( + "the input tensor of multi_dot op must be 2D.")); + width = tmp_dim[1]; + } + PADDLE_ENFORCE_EQ(last_dim[0], width, + platform::errors::InvalidArgument( + "the input tensor of multi_dot op must be 2D.")); + + return out_dim; +} + +/** + * @brief the matrix multiplication + */ +template +inline framework::Tensor MatMul(const framework::ExecutionContext& ctx, + const framework::Tensor& matrix_a, + const framework::Tensor& matrix_b, + const framework::DDim& a_dim, + const framework::DDim& b_dim) { + auto place = ctx.GetPlace(); + auto blas = math::GetBlas(ctx); + + framework::Tensor matrix_c; + framework::DDim c_dim = framework::make_ddim({a_dim[0], b_dim[1]}); + matrix_c.mutable_data(place, c_dim[0] * c_dim[1] * sizeof(T)); + matrix_c.Resize(c_dim); + + auto mat_dim_a = math::CreateMatrixDescriptor(a_dim, 0, false); + auto mat_dim_b = math::CreateMatrixDescriptor(b_dim, 0, false); + const T alpha = static_cast(1.0); + blas.MatMul(matrix_a, mat_dim_a, matrix_b, mat_dim_b, alpha, &matrix_c, T(0)); + return matrix_c; +} + +/** + * @brief multi matrix dot by a chain order + * @param + * ins: the input tensors + * ins_dims: the shape of ins after reshape + * order: the optimal order + * i: the left of sub chain + * j: the righe of sub chain + * save_result: set true by backward + * results: save the intermediate result during backward + */ +template +inline framework::Tensor MatChainMul( + 
const framework::ExecutionContext& ctx, + const std::vector& ins, + const std::vector& ins_dims, + const std::vector& order, const uint64_t i, const uint64_t j, + const bool save_result, std::vector* results) { + if (i == j) { + return *ins[i]; + } + + const auto A = MatChainMul(ctx, ins, ins_dims, order, i, + order[i * ins.size() + j], + save_result, results); + framework::DDim a_dim = A.dims(); + if (i == order[i * ins.size() + j]) { + a_dim = ins_dims[i]; + } + + const auto B = MatChainMul(ctx, ins, ins_dims, order, + order[i * ins.size() + j] + 1, j, + save_result, results); + framework::DDim b_dim = B.dims(); + if (j == order[i * ins.size() + j] + 1) { + b_dim = ins_dims[j]; + } + + auto result = MatMul(ctx, A, B, a_dim, b_dim); + if (save_result) { + (*results)[i * ins.size() + j] = result; + } + return result; +} + +/** + * @brief get the optimal order + */ +std::vector GetOrder(const std::vector& ins, + const std::vector& ins_dims) { + auto n = ins.size(); + std::vector p(n + 1); + for (uint64_t i = 0; i < n; i++) { + p[i] = ins_dims[i][0]; + } + p[n] = ins_dims[n - 1][1]; + + std::vector m(n * n, 0); + std::vector order(n * n); + + for (uint64_t l = 1; l < n; l++) { + for (uint64_t i = 0; i < n - l; i++) { + auto j = i + l; + m[i * n + j] = 0xffffffff; + for (uint64_t k = i; k < j; k++) { + uint64_t q = + m[i * n + k] + m[(k + 1) * n + j] + p[i] * p[k + 1] * p[j + 1]; + if (q < m[i * n + j]) { + m[i * n + j] = q; + order[i * n + j] = k; + } + } + } + } + return order; +} + +template +static inline framework::Tensor MultiDotMatChainOrder( + const framework::ExecutionContext& ctx, + const std::vector& ins, + const std::vector& ins_dims, const bool save_result, + std::vector* results) { + auto order = GetOrder(ins, ins_dims); + auto n = ins.size(); + return MatChainMul(ctx, ins, ins_dims, order, 0, n - 1, + save_result, results); +} + +inline void GetDims(const std::vector& ins, + std::vector* ins_dims) { + const auto n = ins.size(); + std::vector real_ins; + 
for (size_t i = 0; i < n; i++) { + (*ins_dims)[i] = ins[i]->dims(); + if (i == 0 && (*ins_dims)[i].size() == 1) { + (*ins_dims)[i] = framework::make_ddim({1, (*ins_dims)[i][0]}); + } else if (i == n - 1 && (*ins_dims)[i].size() == 1) { + (*ins_dims)[i] = framework::make_ddim({(*ins_dims)[i][0], 1}); + } + } +} + +class MultiDotOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input tensors of multi_dot operator.").AsDuplicable(); + AddOutput("Out", "The output tensor of multi_dot operator"); + AddAttr( + "use_mkldnn", + "(bool, default false) Indicates if MKL-DNN kernel will be used") + .SetDefault(false); + AddComment(R"DOC( +Compute the dot product of two or more arrays in a single function call, while automatically selecting the fastest evaluation order. + +multi_dot chains MatMul and uses optimal parenthesization of the matrices [1] [2]. Depending on the shapes of the matrices, this can speed up the multiplication a lot. + +If the first argument is 1-D it is treated as a row vector. If the last argument is 1-D it is treated as a column vector. The other arguments must be 2-D. 
+ )DOC"); + } +}; + +class MultiDotOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "multi_dot"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "multi_dot"); + + auto inputs_dims = ctx->GetInputsDim("X"); + + const size_t inputs_num = inputs_dims.size(); + PADDLE_ENFORCE_GT( + inputs_num, static_cast(1), + platform::errors::InvalidArgument( + "The number of input tensors in multi_dot op should > 1.")); + auto out_dims = ComputeAndCheckShape(ctx->IsRuntime(), inputs_dims); + ctx->SetOutputDim("Out", out_dims); + ctx->ShareLoD("X", "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto inputs = ctx.MultiInput("X"); + auto input_data_type = framework::proto::VarType::Type(0); + bool flag = 1; + for (auto* input : inputs) { + if (!input->IsInitialized() || input->numel() == 0) { + flag = 0; + break; + } + } + if (flag == 0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "All Inputs of multi_dot OP are Empty!")); + } + input_data_type = inputs[0]->type(); + +#ifdef PADDLE_WITH_MKLDNN + using mkldnn::memory; + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const { + if (framework::IsComplexType(expected_kernel_type.data_type_)) { + // only promote inputs’s types when contains complex input + return framework::OpKernelType(tensor.type(), tensor.place(), + tensor.layout()); + } else { + return 
framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } + } +}; + +template +class MultiDotKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto ins = ctx.MultiInput("X"); + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + out->mutable_data(place); + + auto blas = math::GetBlas(ctx); + + auto n = ins.size(); + std::vector ins_dims(n); + GetDims(ins, &ins_dims); + + const T scale = static_cast(1.0); + if (n == 2) { + auto mat_dim_a = math::CreateMatrixDescriptor(ins_dims[0], 0, false); + auto mat_dim_b = math::CreateMatrixDescriptor(ins_dims[1], 0, false); + blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, out, T(0)); + } else if (n == 3) { + const auto Ma = ins_dims[0][0]; + const auto Ka = ins_dims[0][1]; + const auto Nb = ins_dims[1][1]; + const auto Nc = ins_dims[2][1]; + const uint64_t cost1 = + Ma * Nb * (Ka + Nc); // Ma * Ka * Nb + Ma * Nb * Nc; + const uint64_t cost2 = + Ka * Nc * (Nb + Ma); // Ka * Nb * Nc + Ma * Ka * Nc; + auto mat_dim_a = math::CreateMatrixDescriptor(ins_dims[0], 0, false); + auto mat_dim_b = math::CreateMatrixDescriptor(ins_dims[1], 0, false); + auto mat_dim_c = math::CreateMatrixDescriptor(ins_dims[2], 0, false); + if (cost1 < cost2) { + framework::Tensor tmp_out; + tmp_out.mutable_data(place, Ma * Nb * sizeof(T)); + framework::DDim tmp_dim = ins_dims[0]; + tmp_dim[1] = Nb; + blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, &tmp_out, + T(0)); + auto mat_dim_tmp = math::CreateMatrixDescriptor(tmp_dim, 0, false); + blas.MatMul(tmp_out, mat_dim_tmp, *ins[2], mat_dim_c, scale, out, T(0)); + } else { + framework::Tensor tmp_out; + tmp_out.mutable_data(place, Ka * Nc * sizeof(T)); + framework::DDim tmp_dim = ins_dims[1]; + tmp_dim[1] = Nc; + blas.MatMul(*ins[1], mat_dim_b, *ins[2], mat_dim_c, scale, &tmp_out, + T(0)); + auto mat_dim_tmp = math::CreateMatrixDescriptor(tmp_dim, 0, false); + 
blas.MatMul(*ins[0], mat_dim_a, tmp_out, mat_dim_tmp, scale, out, T(0)); + } + } else { + std::vector results; + const auto tmp = MultiDotMatChainOrder( + ctx, ins, ins_dims, false, &results); + auto out_dim = out->dims(); + // TensorCopy(tmp, place, ctx.device_context(), out); + *out = tmp; + out->Resize(out_dim); + } + } +}; + +class MultiDotOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "multi_dot"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + "Out@GRAD", "multi_dot"); + + auto in_x = "X"; + auto out_x_g_n = framework::GradVarName(in_x); + auto ins_dims = ctx->GetInputsDim(in_x); + ctx->SetOutputsDim(out_x_g_n, ins_dims); + ctx->ShareAllLoD(in_x, out_x_g_n); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } +}; + +template +class MultiDotGradKernel : public framework::OpKernel { + public: + /** + * @brief calculate dA and dB + * dA = dout * transpose(B) + * dB = transpose(A) * dout + */ + void CalcGrad(const framework::ExecutionContext& ctx, + const framework::Tensor& dout, const framework::Tensor& A, + const framework::Tensor& B, const framework::DDim& dout_dim, + const framework::DDim& a_dim, const framework::DDim& b_dim, + framework::Tensor* dA, framework::Tensor* dB) const { + auto mat_dim_dout = math::CreateMatrixDescriptor(dout_dim, 0, false); + auto 
mat_dim_a = math::CreateMatrixDescriptor(a_dim, 0, true);
+    auto mat_dim_b = math::CreateMatrixDescriptor(b_dim, 0, true);
+    T alpha = static_cast(1.0);
+    auto blas = math::GetBlas(ctx);
+
+    blas.MatMul(A, mat_dim_a, dout, mat_dim_dout, alpha, dB, T(0));
+    blas.MatMul(dout, mat_dim_dout, B, mat_dim_b, alpha, dA, T(0));
+  }
+
+  /**
+   * @brief calculate multi matrix multiplication grad by a chain order
+   * @param
+   * dout: the grad of multi matrix multiplication out
+   * dx: the out grad of inputs
+   * ins: the input tensors
+   * ins_dims: the shape of ins after reshape
+   * order: the optimal order
+   * i: the left of sub chain
+   * j: the right of sub chain
+   * results: the intermediate result of forward
+   */
+  void MatChainMulGrad(const framework::ExecutionContext& ctx,
+                       const framework::Tensor& dout,
+                       std::vector* dx,
+                       const std::vector& ins,
+                       const framework::DDim& dout_dim,
+                       const std::vector& ins_dims,
+                       const std::vector& order, const uint64_t i,
+                       const uint64_t j,
+                       const std::vector& results) const {
+    if (i == j) {
+      *((*dx)[i]) = dout;
+      return;
+    }
+
+    const auto n = ins.size();
+    const auto right = order[i * n + j];
+    const auto left = order[i * n + j] + 1;
+    // get the multi result of left sub chain
+    const auto* A = &results[i * n + right];
+    framework::DDim a_dim = A->dims();
+    if (i == right) {
+      A = ins[i];
+      a_dim = ins_dims[i];
+    }
+    // get the multi result of right sub chain
+    const auto* B = &results[left * n + j];
+    framework::DDim b_dim = B->dims();
+    if (left == j) {
+      B = ins[j];
+      b_dim = ins_dims[j];
+    }
+    framework::Tensor dA, dB;
+    dA.Resize({dout_dim[0], b_dim[0]});
+    dB.Resize({a_dim[1], dout_dim[1]});
+    dA.mutable_data(ctx.GetPlace());
+    dB.mutable_data(ctx.GetPlace());
+
+    CalcGrad(ctx, dout, *A, *B, dout_dim, a_dim, b_dim, &dA, &dB);
+
+    MatChainMulGrad(ctx, dA, dx, ins, dA.dims(), ins_dims, order, i, right,
+                    results);
+    MatChainMulGrad(ctx, dB, dx, ins, dB.dims(), ins_dims, order, left, j,
+                    results);
+  }
+
+  void 
MultiDotGradMatChainOrder(
+      const framework::ExecutionContext& ctx, const framework::Tensor& dout,
+      const std::vector& ins,
+      const framework::DDim& dout_dim,
+      const std::vector& ins_dims,
+      std::vector* dx) const {
+    auto order = GetOrder(ins, ins_dims);
+    auto n = ins.size();
+    std::vector results(n * n);
+    // call the forward, get the intermediate result
+    MatChainMul(ctx, ins, ins_dims, order, 0, n - 1, true,
+                &results);
+    MatChainMulGrad(ctx, dout, dx, ins, dout_dim, ins_dims, order, 0, n - 1,
+                    results);
+  }
+
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto ins = ctx.MultiInput("X");
+    auto dout = *ctx.Input(framework::GradVarName("Out"));
+    auto dx = ctx.MultiOutput(framework::GradVarName("X"));
+
+    auto blas = math::GetBlas(ctx);
+    auto place = ctx.GetPlace();
+
+    const auto n = ins.size();
+    for (size_t i = 0; i < n; i++) {
+      dx[i]->mutable_data(place);
+    }
+
+    std::vector ins_dims(n);
+    GetDims(ins, &ins_dims);
+
+    framework::DDim dout_dim = dout.dims();
+    if (ins[0]->dims().size() == 1 && ins[n - 1]->dims().size() == 1) {
+      dout_dim = framework::make_ddim({1, 1});
+    } else if (ins[0]->dims().size() == 1) {
+      if (dout_dim.size() == 1) {
+        dout_dim = framework::make_ddim({1, dout_dim[0]});
+      }
+    } else if (ins[n - 1]->dims().size() == 1) {
+      if (dout_dim.size() == 1) {
+        dout_dim = framework::make_ddim({dout_dim[0], 1});
+      }
+    }
+
+    T alpha = static_cast(1);
+    auto mat_dim_dout = math::CreateMatrixDescriptor(dout_dim, 0, false);
+    if (n == 2) {
+      CalcGrad(ctx, dout, *ins[0], *ins[1], dout_dim, ins_dims[0], ins_dims[1],
+               dx[0], dx[1]);
+    } else if (n == 3) {
+      const auto Ma = ins_dims[0][0];
+      const auto Ka = ins_dims[0][1];
+      const auto Nb = ins_dims[1][1];
+      const auto Nc = ins_dims[2][1];
+      const uint64_t cost1 =
+          Ma * Nb * (Ka + Nc);  // Ma * Ka * Nb + Ma * Nb * Nc;
+      const uint64_t cost2 =
+          Ka * Nc * (Nb + Ma);  // Ka * Nb * Nc + Ma * Ka * Nc;
+      auto mat_dim_a = math::CreateMatrixDescriptor(ins_dims[0], 0, false);
+      auto 
mat_dim_b = math::CreateMatrixDescriptor(ins_dims[1], 0, false); + auto mat_dim_c = math::CreateMatrixDescriptor(ins_dims[2], 0, false); + if (cost1 < cost2) { + framework::Tensor tmp_out, tmp_dout; + tmp_out.Resize({Ma, Nb}); + tmp_out.mutable_data(place); + tmp_dout.Resize({mat_dim_dout.height_, Nb}); + tmp_dout.mutable_data(place); + // tmp_out = A * B + blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, alpha, &tmp_out, + T(0)); + + /* + * dC = dout * transpose(tmp_out) + * tmp_dout = dout * transpose(C) + */ + CalcGrad(ctx, dout, tmp_out, *ins[2], dout_dim, tmp_out.dims(), + ins_dims[2], &tmp_dout, dx[2]); + + /* + * dA = tmp_dout * transpose(B) + * dB = tmp_dout * transpose(A) + */ + CalcGrad(ctx, tmp_dout, *ins[0], *ins[1], tmp_dout.dims(), ins_dims[0], + ins_dims[1], dx[0], dx[1]); + } else { + framework::Tensor tmp_out, tmp_dout; + tmp_out.Resize({Ka, Nc}); + tmp_out.mutable_data(place); + tmp_dout.Resize({Ka, mat_dim_dout.width_}); + tmp_dout.mutable_data(place); + blas.MatMul(*ins[1], mat_dim_b, *ins[2], mat_dim_c, alpha, &tmp_out, + T(0)); + + /* + * dA = dout * transpose(tmp_out) + * tmp_out = dout * transpose(A) + */ + CalcGrad(ctx, dout, *ins[0], tmp_out, dout_dim, ins_dims[0], + tmp_dout.dims(), dx[0], &tmp_dout); + + /* + * dB = tmp_dout * transpose(C) + * dC = tmp_dout * transpose(B) + */ + CalcGrad(ctx, tmp_dout, *ins[1], *ins[2], tmp_dout.dims(), ins_dims[1], + ins_dims[2], dx[1], dx[2]); + } + } else { + MultiDotGradMatChainOrder(ctx, dout, ins, dout_dim, ins_dims, &dx); + if (ins[n - 1]->dims().size() == 1) { + dx[n - 1]->Resize({dx[n - 1]->dims()[0]}); + } + } + } +}; + +template +class MultiDotOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("multi_dot_grad"); + op->SetInput("X", this->Input("X")); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + 
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X", false)); + } +}; +template +class MultiDotOpDoubleGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("multi_dot"); + grad_op->SetInput("X", this->Input(("X"))); + grad_op->SetInput("DOut", this->Input(framework::GradVarName("Out"))); + grad_op->SetOutput("DDx", this->OutputGrad(framework::GradVarName("X"))); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(multi_dot, ops::MultiDotOp, ops::MultiDotOpMaker, + ops::MultiDotOpGradMaker, + ops::MultiDotOpGradMaker); +REGISTER_OPERATOR(multi_dot_grad, ops::MultiDotOpGrad, + ops::MultiDotOpDoubleGradMaker, + ops::MultiDotOpDoubleGradMaker); + +REGISTER_OP_CPU_KERNEL( + multi_dot, ops::MultiDotKernel, + ops::MultiDotKernel); +REGISTER_OP_CPU_KERNEL( + multi_dot_grad, + ops::MultiDotGradKernel, + ops::MultiDotGradKernel); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +REGISTER_OP_CUDA_KERNEL( + multi_dot, ops::MultiDotKernel, + ops::MultiDotKernel, + ops::MultiDotKernel); +REGISTER_OP_CUDA_KERNEL( + multi_dot_grad, + ops::MultiDotGradKernel, + ops::MultiDotGradKernel, + ops::MultiDotGradKernel); +#endif diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 907a667cb6ba78..48a7a5c143300a 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -99,6 +99,7 @@ from .tensor.linalg import bmm # noqa: F401 from .tensor.linalg import histogram # noqa: F401 from .tensor.linalg import mv # noqa: F401 +from .tensor.linalg import multi_dot # noqa: F401 from .tensor.logic import equal # noqa: F401 from .tensor.logic import greater_equal # noqa: F401 from .tensor.logic import greater_than # noqa: F401 diff --git a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py 
b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py new file mode 100644 index 00000000000000..71aa99a0f0f03e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py @@ -0,0 +1,298 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest, skip_check_grad_ci +from numpy.linalg import multi_dot +from op_test import OpTest +import paddle +from paddle.fluid import Program, program_guard +import paddle.fluid as fluid + +paddle.enable_static() + + +class TestMultiDotOp(OpTest): + def setUp(self): + self.op_type = "multi_dot" + self.dtype = self.get_dtype() + self.get_inputs_and_outputs() + + def get_dtype(self): + return "float32" + + def get_inputs_and_outputs(self): + self.A = np.random.random((2, 8)).astype(self.dtype) + self.B = np.random.random((8, 4)).astype(self.dtype) + self.inputs = {'X': [('x0', self.A), ('x1', self.B)]} + self.outputs = {'Out': multi_dot([self.A, self.B])} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['x0'], 'Out', max_relative_error=1e-3) + self.check_grad(['x1'], 'Out', max_relative_error=1e-3) + + +class TestMultiDotOpDouble(TestMultiDotOp): + def get_dtype(self): + return "float64" + + +#(A*B)*C +class TestMultiDotOp3Mat(TestMultiDotOp): + def get_inputs_and_outputs(self): + self.A = np.random.random((2, 10)).astype(self.dtype) + 
self.B = np.random.random((10, 4)).astype(self.dtype) + self.C = np.random.random((4, 3)).astype(self.dtype) + self.inputs = {'X': [('x0', self.A), ('x1', self.B), ('x2', self.C)]} + self.outputs = {'Out': multi_dot([self.A, self.B, self.C])} + + def test_check_grad(self): + self.check_grad(['x0'], 'Out', max_relative_error=1e-3) + self.check_grad(['x1'], 'Out', max_relative_error=1e-3) + self.check_grad(['x2'], 'Out', max_relative_error=1e-3) + + +#A*(B*C) +class TestMultiDotOp3Mat2(TestMultiDotOp): + def get_inputs_and_outputs(self): + self.A = np.random.random((3, 4)).astype(self.dtype) + self.B = np.random.random((4, 8)).astype(self.dtype) + self.C = np.random.random((8, 2)).astype(self.dtype) + self.inputs = {'X': [('x0', self.A), ('x1', self.B), ('x2', self.C)]} + self.outputs = {'Out': multi_dot([self.A, self.B, self.C])} + + def test_check_grad(self): + self.check_grad(['x0'], 'Out', max_relative_error=1e-3) + self.check_grad(['x1'], 'Out', max_relative_error=1e-3) + self.check_grad(['x2'], 'Out', max_relative_error=1e-3) + + +class TestMultiDotOp4Mat(TestMultiDotOp): + def get_inputs_and_outputs(self): + self.A = np.random.random((8, 6)).astype(self.dtype) + self.B = np.random.random((6, 3)).astype(self.dtype) + self.C = np.random.random((3, 4)).astype(self.dtype) + self.D = np.random.random((4, 5)).astype(self.dtype) + self.inputs = { + 'X': + [('x0', self.A), ('x1', self.B), ('x2', self.C), ('x3', self.D)] + } + self.outputs = {'Out': multi_dot([self.A, self.B, self.C, self.D])} + + def test_check_grad(self): + self.check_grad(['x0'], 'Out', max_relative_error=1e-3) + self.check_grad(['x1'], 'Out', max_relative_error=1e-3) + self.check_grad(['x2'], 'Out', max_relative_error=1e-3) + self.check_grad(['x3'], 'Out', max_relative_error=1e-3) + + +class TestMultiDotOpFirst1D(TestMultiDotOp): + def get_inputs_and_outputs(self): + self.A = np.random.random((4)).astype(self.dtype) + self.B = np.random.random((4, 3)).astype(self.dtype) + self.inputs = {'X': 
[('x0', self.A), ('x1', self.B)]} + self.outputs = {'Out': multi_dot([self.A, self.B])} + + +class TestMultiDotOp3MatFirst1D(TestMultiDotOp): + def get_inputs_and_outputs(self): + self.A = np.random.random((4)).astype(self.dtype) + self.B = np.random.random((4, 3)).astype(self.dtype) + self.C = np.random.random((3, 3)).astype(self.dtype) + self.inputs = {'X': [('x0', self.A), ('x1', self.B), ('x2', self.C)]} + self.outputs = {'Out': multi_dot([self.A, self.B, self.C])} + + def test_check_grad(self): + self.check_grad(['x0'], 'Out', max_relative_error=1e-3) + self.check_grad(['x1'], 'Out', max_relative_error=1e-3) + self.check_grad(['x2'], 'Out', max_relative_error=1e-3) + + +class TestMultiDotOp4MatFirst1D(TestMultiDotOp): + def get_inputs_and_outputs(self): + self.A = np.random.random((4)).astype(self.dtype) + self.B = np.random.random((4, 3)).astype(self.dtype) + self.C = np.random.random((3, 4)).astype(self.dtype) + self.D = np.random.random((4, 5)).astype(self.dtype) + self.inputs = { + 'X': + [('x0', self.A), ('x1', self.B), ('x2', self.C), ('x3', self.D)] + } + self.outputs = {'Out': multi_dot([self.A, self.B, self.C, self.D])} + + def test_check_grad(self): + self.check_grad(['x0'], 'Out', max_relative_error=1e-3) + self.check_grad(['x1'], 'Out', max_relative_error=1e-3) + self.check_grad(['x2'], 'Out', max_relative_error=1e-3) + self.check_grad(['x3'], 'Out', max_relative_error=1e-3) + + +class TestMultiDotOpLast1D(TestMultiDotOp): + def get_inputs_and_outputs(self): + self.A = np.random.random((3, 6)).astype(self.dtype) + self.B = np.random.random((6)).astype(self.dtype) + self.inputs = {'X': [('x0', self.A), ('x1', self.B)]} + self.outputs = {'Out': multi_dot([self.A, self.B])} + + +class TestMultiDotOp3MatLast1D(TestMultiDotOp): + def get_inputs_and_outputs(self): + self.A = np.random.random((2, 4)).astype(self.dtype) + self.B = np.random.random((4, 3)).astype(self.dtype) + self.C = np.random.random((3)).astype(self.dtype) + self.inputs = {'X': [('x0', 
self.A), ('x1', self.B), ('x2', self.C)]} + self.outputs = {'Out': multi_dot([self.A, self.B, self.C])} + + def test_check_grad(self): + self.check_grad(['x0'], 'Out', max_relative_error=1e-3) + self.check_grad(['x1'], 'Out', max_relative_error=1e-3) + self.check_grad(['x2'], 'Out', max_relative_error=1e-3) + + +class TestMultiDotOp4MatLast1D(TestMultiDotOp): + def get_inputs_and_outputs(self): + self.A = np.random.random((2, 3)).astype(self.dtype) + self.B = np.random.random((3, 2)).astype(self.dtype) + self.C = np.random.random((2, 3)).astype(self.dtype) + self.D = np.random.random((3)).astype(self.dtype) + self.inputs = { + 'X': + [('x0', self.A), ('x1', self.B), ('x2', self.C), ('x3', self.D)] + } + self.outputs = {'Out': multi_dot([self.A, self.B, self.C, self.D])} + + def test_check_grad(self): + self.check_grad(['x0'], 'Out', max_relative_error=1e-3) + self.check_grad(['x1'], 'Out', max_relative_error=1e-3) + self.check_grad(['x2'], 'Out', max_relative_error=1e-3) + self.check_grad(['x3'], 'Out', max_relative_error=1e-3) + + +class TestMultiDotOpFirstAndLast1D(TestMultiDotOp): + def get_inputs_and_outputs(self): + self.A = np.random.random((4, )).astype(self.dtype) + self.B = np.random.random((4)).astype(self.dtype) + self.inputs = {'X': [('x0', self.A), ('x1', self.B)]} + self.outputs = {'Out': multi_dot([self.A, self.B])} + + def test_check_grad(self): + self.check_grad(['x0'], 'Out', max_relative_error=1e-3) + self.check_grad(['x1'], 'Out', max_relative_error=1e-3) + + +class TestMultiDotOp3MatFirstAndLast1D(TestMultiDotOp): + def get_inputs_and_outputs(self): + self.A = np.random.random((6, )).astype(self.dtype) + self.B = np.random.random((6, 4)).astype(self.dtype) + self.C = np.random.random((4)).astype(self.dtype) + self.inputs = {'X': [('x0', self.A), ('x1', self.B), ('x2', self.C)]} + self.outputs = {'Out': multi_dot([self.A, self.B, self.C])} + + def test_check_grad(self): + self.check_grad(['x0'], 'Out', max_relative_error=1e-3) + 
self.check_grad(['x1'], 'Out', max_relative_error=1e-3) + self.check_grad(['x2'], 'Out', max_relative_error=1e-3) + + +class TestMultiDotOp4MatFirstAndLast1D(TestMultiDotOp): + def get_inputs_and_outputs(self): + self.A = np.random.random((3, )).astype(self.dtype) + self.B = np.random.random((3, 4)).astype(self.dtype) + self.C = np.random.random((4, 2)).astype(self.dtype) + self.D = np.random.random((2)).astype(self.dtype) + self.inputs = { + 'X': + [('x0', self.A), ('x1', self.B), ('x2', self.C), ('x3', self.D)] + } + self.outputs = {'Out': multi_dot([self.A, self.B, self.C, self.D])} + + def test_check_grad(self): + self.check_grad(['x0'], 'Out', max_relative_error=1e-3) + self.check_grad(['x1'], 'Out', max_relative_error=1e-3) + self.check_grad(['x2'], 'Out', max_relative_error=1e-3) + self.check_grad(['x3'], 'Out', max_relative_error=1e-3) + + +#####python API test####### +class TestMultiDotOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + # The inputs type of multi_dot must be list matrix. + input1 = 12 + self.assertRaises(TypeError, paddle.multi_dot, [input1, input1]) + + # The inputs dtype of multi_dot must be float32, float64 or float16. 
+ input2 = fluid.layers.data( + name='input2', shape=[10, 10], dtype="int32") + self.assertRaises(TypeError, paddle.multi_dot, [input2, input2]) + + # the number of tensor must be larger than 1 + x0 = fluid.data(name='x0', shape=[3, 2], dtype="float32") + self.assertRaises(ValueError, paddle.multi_dot, [x0]) + + #the first tensor must be 1D or 2D + x1 = fluid.data(name='x1', shape=[3, 2, 3], dtype="float32") + x2 = fluid.data(name='x2', shape=[3, 2], dtype="float32") + self.assertRaises(ValueError, paddle.multi_dot, [x1, x2]) + + #the last tensor must be 1D or 2D + x3 = fluid.data(name='x3', shape=[3, 2], dtype="float32") + x4 = fluid.data(name='x4', shape=[3, 2, 2], dtype="float32") + self.assertRaises(ValueError, paddle.multi_dot, [x3, x4]) + + #the tensor must be 2D, except first and last tensor + x5 = fluid.data(name='x5', shape=[3, 2], dtype="float32") + x6 = fluid.data(name='x6', shape=[2], dtype="float32") + x7 = fluid.data(name='x7', shape=[2, 2], dtype="float32") + self.assertRaises(ValueError, paddle.multi_dot, [x5, x6, x7]) + + +class API_TestMultiDot(unittest.TestCase): + def test_out(self): + with fluid.program_guard(fluid.Program()): + x0 = fluid.data(name='x0', shape=[3, 2], dtype="float32") + x1 = fluid.data(name='x1', shape=[2, 3], dtype='float32') + result = paddle.multi_dot([x0, x1]) + exe = fluid.Executor(fluid.CPUPlace()) + data1 = np.random.rand(3, 2).astype("float32") + data2 = np.random.rand(2, 3).astype("float32") + np_res = exe.run(feed={'x0': data1, + 'x1': data2}, + fetch_list=[result]) + expected_result = np.linalg.multi_dot([data1, data2]) + + self.assertTrue( + np.allclose( + np_res, expected_result, atol=1e-5), + "two value is\ + {}\n{}, check diff!".format(np_res, expected_result)) + + def test_dygraph_without_out(self): + device = fluid.CPUPlace() + with fluid.dygraph.guard(device): + input_array1 = np.random.rand(3, 4).astype("float64") + input_array2 = np.random.rand(4, 3).astype("float64") + data1 = 
fluid.dygraph.to_variable(input_array1) + data2 = fluid.dygraph.to_variable(input_array2) + out = paddle.multi_dot([data1, data2]) + expected_result = np.linalg.multi_dot([input_array1, input_array2]) + self.assertTrue(np.allclose(expected_result, out.numpy())) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py b/python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py index 15f28d94c70d12..626ea6c2ae0a30 100644 --- a/python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py @@ -28,4 +28,5 @@ 'cvm', 'cudnn_lstm', 'rnn', + 'multi_dot', ] diff --git a/python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py b/python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py index 15ba331e9de5a3..dd7e800053042a 100644 --- a/python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py @@ -65,6 +65,7 @@ 'rank_loss', 'sequence_conv', 'smooth_l1_loss', - 'spectral_norm' + 'spectral_norm', + 'multi_dot', ] # yapf: enable diff --git a/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py b/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py index 581656f6cd421b..9db2752ab23217 100644 --- a/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py @@ -76,7 +76,8 @@ 'trilinear_interp_v2', \ 'var_conv_2d', \ 'warpctc', \ - 'bilateral_slice' + 'bilateral_slice', + 'multi_dot' ] NO_FP16_CHECK_GRAD_OP_LIST = [ diff --git a/python/paddle/linalg.py b/python/paddle/linalg.py index 5cef01d18aca48..fb0dd92acc5fd5 100644 --- a/python/paddle/linalg.py +++ b/python/paddle/linalg.py @@ -15,9 +15,11 @@ from .tensor.linalg import cholesky # noqa: F401 from 
.tensor.linalg import norm  # noqa: F401
 from .tensor import inverse as inv  # noqa: F401
+from .tensor.linalg import multi_dot  # noqa: F401
 
 __all__ = [
     'cholesky',  #noqa
     'norm',
-    'inv'
+    'inv',
+    'multi_dot'
 ]
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index a1610581b67c03..7e36a4841c9559 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -941,3 +941,71 @@ def __check_input(x, vec):
         type='mv', inputs={'X': x, 'Vec': vec}, outputs={'Out': out})
     return out
+
+
+def multi_dot(x, name=None):
+    """
+    Compute the dot product of two or more matrices in a single function call, while automatically selecting the fastest evaluation order.
+
+    Supports inputs of float, double and float16 dtypes. This function does not support batched inputs.
+
+    Every tensor in x must be 2D, except for the first and last which may be 1D. If the first tensor is a 1D vector of shape(n, ) it is treated as a row vector of shape(1, n); similarly, if the last tensor is a 1D vector of shape(n, ), it is treated as a column vector of shape(n, 1).
+    If the first and last tensors are matrices, the output will be a matrix. However, if either is a 1D vector, then the output will be a 1D vector.
+
+    Notes:
+    The cost of multiplying two matrices with shapes (a, b) and (b, c) is a * b * c. Given matrices A, B, C with shapes (10, 100), (100, 5), (5, 50) respectively, we can calculate the cost of different multiplication orders as follows:
+    Cost((AB)C) = 10x100x5 + 10x5x50 = 7500
+    Cost(A(BC)) = 10x100x50 + 100x5x50 = 75000
+
+    In this case, multiplying A and B first followed by C is 10 times faster.
+
+    Args:
+        x ([Tensor]): The input tensors, given as a list of Tensor.
+
+    Returns:
+        Tensor: The output Tensor.
+
+
+    Examples:
+
+    .. 
code-block:: python + + import paddle + import numpy as np + + # A * B + A_data = np.random.random([3, 4]).astype(np.float32) + B_data = np.random.random([4, 5]).astype(np.float32) + A = paddle.to_tensor(A_data) + B = paddle.to_tensor(B_data) + out = paddle.multi_dot([A, B]) + print(out.numpy().shape) + # [3, 5] + + # A * B * C + A_data = np.random.random([10, 5]).astype(np.float32) + B_data = np.random.random([5, 8]).astype(np.float32) + C_data = np.random.random([8, 7]).astype(np.float32) + A = paddle.to_tensor(A_data) + B = paddle.to_tensor(B_data) + C = paddle.to_tensor(C_data) + out = paddle.multi_dot([A, B, C]) + print(out.numpy().shape) + # [10, 7] + """ + if in_dygraph_mode(): + return _C_ops.multi_dot(x) + + check_type(x, 'x', (list, tuple), 'multi_dot') + for id, item in enumerate(x): + check_variable_and_dtype(item, 'x[' + str(id) + ']', + ['float16', 'float32', 'float64'], 'multi_dot') + if item.dtype != x[0].dtype: + raise TypeError( + "All the Tensors in the input must have the same data type.") + + helper = LayerHelper('multi_dot', **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_variable_for_type_inference(dtype) + helper.append_op(type='multi_dot', inputs={"X": x}, outputs={"Out": out}) + return out From 1cc8aad1545ce5e23d6bece3c4116a728bbf26d5 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 30 Aug 2021 06:29:50 +0000 Subject: [PATCH 07/19] fix the ci problem --- paddle/fluid/operators/multi_dot_op.cc | 70 +++------ .../tests/unittests/test_multi_dot_op.py | 98 ++++++------ .../white_list/no_grad_set_white_list.py | 3 +- .../white_list/op_accuracy_white_list.py | 3 +- python/paddle/linalg.py | 2 +- python/paddle/tensor/__init__.py | 1 + python/paddle/tensor/linalg.py | 143 +++++++++--------- 7 files changed, 143 insertions(+), 177 deletions(-) diff --git a/paddle/fluid/operators/multi_dot_op.cc b/paddle/fluid/operators/multi_dot_op.cc index 59c43ea2f95b6e..6682248f8eff1c 100644 --- 
a/paddle/fluid/operators/multi_dot_op.cc +++ b/paddle/fluid/operators/multi_dot_op.cc @@ -63,17 +63,11 @@ inline framework::DDim ComputeAndCheckShape( // If the last tensor is 1D of size n view it as a column vector (n, 1) if (last_dim.size() == 1) { last_dim = framework::make_ddim({static_cast(last_dim[0]), 1}); - if (is_vector) { - out_dim = framework::make_ddim({1}); - } else { - out_dim = framework::make_ddim({first_dim[0]}); - } + out_dim = is_vector ? framework::make_ddim({1}) + : framework::make_ddim({first_dim[0]}); } else { - if (is_vector) { - out_dim = framework::make_ddim({last_dim[1]}); - } else { - out_dim = framework::make_ddim({first_dim[0], last_dim[1]}); - } + out_dim = is_vector ? framework::make_ddim({last_dim[1]}) + : framework::make_ddim({first_dim[0], last_dim[1]}); } auto width = first_dim[1]; @@ -83,21 +77,21 @@ inline framework::DDim ComputeAndCheckShape( "the input tensor of multi_dot op must be 2D.")); const auto& tmp_dim = inputs_dims[i]; - PADDLE_ENFORCE_EQ(tmp_dim[0], width, - platform::errors::InvalidArgument( - "the input tensor of multi_dot op must be 2D.")); + PADDLE_ENFORCE_EQ( + tmp_dim[0], width, + platform::errors::InvalidArgument( + "the input matrix does not meet the multiplication requirements.")); width = tmp_dim[1]; } - PADDLE_ENFORCE_EQ(last_dim[0], width, - platform::errors::InvalidArgument( - "the input tensor of multi_dot op must be 2D.")); + + PADDLE_ENFORCE_EQ( + last_dim[0], width, + platform::errors::InvalidArgument( + "the input matrix does not meet the multiplication requirements.")); return out_dim; } -/** - * @brief the matrix multiplication - */ template inline framework::Tensor MatMul(const framework::ExecutionContext& ctx, const framework::Tensor& matrix_a, @@ -109,8 +103,8 @@ inline framework::Tensor MatMul(const framework::ExecutionContext& ctx, framework::Tensor matrix_c; framework::DDim c_dim = framework::make_ddim({a_dim[0], b_dim[1]}); - matrix_c.mutable_data(place, c_dim[0] * c_dim[1] * sizeof(T)); 
matrix_c.Resize(c_dim); + matrix_c.mutable_data(place); auto mat_dim_a = math::CreateMatrixDescriptor(a_dim, 0, false); auto mat_dim_b = math::CreateMatrixDescriptor(b_dim, 0, false); @@ -330,18 +324,15 @@ class MultiDotKernel : public framework::OpKernel { const auto Ka = ins_dims[0][1]; const auto Nb = ins_dims[1][1]; const auto Nc = ins_dims[2][1]; - const uint64_t cost1 = - Ma * Nb * (Ka + Nc); // Ma * Ka * Nb + Ma * Nb * Nc; - const uint64_t cost2 = - Ka * Nc * (Nb + Ma); // Ka * Nb * Nc + Ma * Ka * Nc; + const uint64_t cost1 = Ma * Nb * (Ka + Nc); + const uint64_t cost2 = Ka * Nc * (Nb + Ma); auto mat_dim_a = math::CreateMatrixDescriptor(ins_dims[0], 0, false); auto mat_dim_b = math::CreateMatrixDescriptor(ins_dims[1], 0, false); auto mat_dim_c = math::CreateMatrixDescriptor(ins_dims[2], 0, false); if (cost1 < cost2) { framework::Tensor tmp_out; tmp_out.mutable_data(place, Ma * Nb * sizeof(T)); - framework::DDim tmp_dim = ins_dims[0]; - tmp_dim[1] = Nb; + framework::DDim tmp_dim = framework::make_ddim({Ma, Nb}); blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, &tmp_out, T(0)); auto mat_dim_tmp = math::CreateMatrixDescriptor(tmp_dim, 0, false); @@ -349,8 +340,7 @@ class MultiDotKernel : public framework::OpKernel { } else { framework::Tensor tmp_out; tmp_out.mutable_data(place, Ka * Nc * sizeof(T)); - framework::DDim tmp_dim = ins_dims[1]; - tmp_dim[1] = Nc; + framework::DDim tmp_dim = framework::make_ddim({Ka, Nc}); blas.MatMul(*ins[1], mat_dim_b, *ins[2], mat_dim_c, scale, &tmp_out, T(0)); auto mat_dim_tmp = math::CreateMatrixDescriptor(tmp_dim, 0, false); @@ -361,7 +351,6 @@ class MultiDotKernel : public framework::OpKernel { const auto tmp = MultiDotMatChainOrder( ctx, ins, ins_dims, false, &results); auto out_dim = out->dims(); - // TensorCopy(tmp, place, ctx.device_context(), out); *out = tmp; out->Resize(out_dim); } @@ -473,7 +462,6 @@ class MultiDotGradKernel : public framework::OpKernel { dB.mutable_data(ctx.GetPlace()); CalcGrad(ctx, dout, 
*A, *B, dout_dim, a_dim, b_dim, &dA, &dB); - MatChainMulGrad(ctx, dA, dx, ins, dA.dims(), ins_dims, order, i, right, results); MatChainMulGrad(ctx, dB, dx, ins, dB.dims(), ins_dims, order, left, j, @@ -489,7 +477,6 @@ class MultiDotGradKernel : public framework::OpKernel { auto order = GetOrder(ins, ins_dims); auto n = ins.size(); std::vector results(n * n); - // call the forward, get the itermediate result MatChainMul(ctx, ins, ins_dims, order, 0, n - 1, true, &results); MatChainMulGrad(ctx, dout, dx, ins, dout_dim, ins_dims, order, 0, n - 1, @@ -548,21 +535,10 @@ class MultiDotGradKernel : public framework::OpKernel { tmp_out.mutable_data(place); tmp_dout.Resize({mat_dim_dout.height_, Nb}); tmp_dout.mutable_data(place); - // tmp_out = A * B blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, alpha, &tmp_out, T(0)); - - /* - * dC = dout * transpose(tmp_out) - * tmp_dout = dout * transpose(C) - */ CalcGrad(ctx, dout, tmp_out, *ins[2], dout_dim, tmp_out.dims(), ins_dims[2], &tmp_dout, dx[2]); - - /* - * dA = tmp_dout * transpose(B) - * dB = tmp_dout * transpose(A) - */ CalcGrad(ctx, tmp_dout, *ins[0], *ins[1], tmp_dout.dims(), ins_dims[0], ins_dims[1], dx[0], dx[1]); } else { @@ -573,18 +549,8 @@ class MultiDotGradKernel : public framework::OpKernel { tmp_dout.mutable_data(place); blas.MatMul(*ins[1], mat_dim_b, *ins[2], mat_dim_c, alpha, &tmp_out, T(0)); - - /* - * dA = dout * transpose(tmp_out) - * tmp_out = dout * transpose(A) - */ CalcGrad(ctx, dout, *ins[0], tmp_out, dout_dim, ins_dims[0], tmp_dout.dims(), dx[0], &tmp_dout); - - /* - * dB = tmp_dout * transpose(C) - * dC = tmp_dout * transpose(B) - */ CalcGrad(ctx, tmp_dout, *ins[1], *ins[2], tmp_dout.dims(), ins_dims[1], ins_dims[2], dx[1], dx[2]); } diff --git a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py index 71aa99a0f0f03e..35482144c272dd 100644 --- a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py @@ -31,7 +31,7 @@ def setUp(self): self.get_inputs_and_outputs() def get_dtype(self): - return "float32" + return "float64" def get_inputs_and_outputs(self): self.A = np.random.random((2, 8)).astype(self.dtype) @@ -43,8 +43,8 @@ def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['x0'], 'Out', max_relative_error=1e-3) - self.check_grad(['x1'], 'Out', max_relative_error=1e-3) + self.check_grad(['x0'], 'Out') + self.check_grad(['x1'], 'Out') class TestMultiDotOpDouble(TestMultiDotOp): @@ -62,9 +62,9 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B, self.C])} def test_check_grad(self): - self.check_grad(['x0'], 'Out', max_relative_error=1e-3) - self.check_grad(['x1'], 'Out', max_relative_error=1e-3) - self.check_grad(['x2'], 'Out', max_relative_error=1e-3) + self.check_grad(['x0'], 'Out') + self.check_grad(['x1'], 'Out') + self.check_grad(['x2'], 'Out') #A*(B*C) @@ -77,9 +77,9 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B, self.C])} def test_check_grad(self): - self.check_grad(['x0'], 'Out', max_relative_error=1e-3) - self.check_grad(['x1'], 'Out', max_relative_error=1e-3) - self.check_grad(['x2'], 'Out', max_relative_error=1e-3) + self.check_grad(['x0'], 'Out') + self.check_grad(['x1'], 'Out') + self.check_grad(['x2'], 'Out') class TestMultiDotOp4Mat(TestMultiDotOp): @@ -95,10 +95,10 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B, self.C, self.D])} def test_check_grad(self): - self.check_grad(['x0'], 'Out', max_relative_error=1e-3) - self.check_grad(['x1'], 'Out', max_relative_error=1e-3) - self.check_grad(['x2'], 'Out', max_relative_error=1e-3) - self.check_grad(['x3'], 'Out', max_relative_error=1e-3) + self.check_grad(['x0'], 'Out') + self.check_grad(['x1'], 'Out') + self.check_grad(['x2'], 'Out') + self.check_grad(['x3'], 'Out') class 
TestMultiDotOpFirst1D(TestMultiDotOp): @@ -118,9 +118,9 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B, self.C])} def test_check_grad(self): - self.check_grad(['x0'], 'Out', max_relative_error=1e-3) - self.check_grad(['x1'], 'Out', max_relative_error=1e-3) - self.check_grad(['x2'], 'Out', max_relative_error=1e-3) + self.check_grad(['x0'], 'Out') + self.check_grad(['x1'], 'Out') + self.check_grad(['x2'], 'Out') class TestMultiDotOp4MatFirst1D(TestMultiDotOp): @@ -136,10 +136,10 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B, self.C, self.D])} def test_check_grad(self): - self.check_grad(['x0'], 'Out', max_relative_error=1e-3) - self.check_grad(['x1'], 'Out', max_relative_error=1e-3) - self.check_grad(['x2'], 'Out', max_relative_error=1e-3) - self.check_grad(['x3'], 'Out', max_relative_error=1e-3) + self.check_grad(['x0'], 'Out') + self.check_grad(['x1'], 'Out') + self.check_grad(['x2'], 'Out') + self.check_grad(['x3'], 'Out') class TestMultiDotOpLast1D(TestMultiDotOp): @@ -159,9 +159,9 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B, self.C])} def test_check_grad(self): - self.check_grad(['x0'], 'Out', max_relative_error=1e-3) - self.check_grad(['x1'], 'Out', max_relative_error=1e-3) - self.check_grad(['x2'], 'Out', max_relative_error=1e-3) + self.check_grad(['x0'], 'Out') + self.check_grad(['x1'], 'Out') + self.check_grad(['x2'], 'Out') class TestMultiDotOp4MatLast1D(TestMultiDotOp): @@ -177,10 +177,10 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B, self.C, self.D])} def test_check_grad(self): - self.check_grad(['x0'], 'Out', max_relative_error=1e-3) - self.check_grad(['x1'], 'Out', max_relative_error=1e-3) - self.check_grad(['x2'], 'Out', max_relative_error=1e-3) - self.check_grad(['x3'], 'Out', max_relative_error=1e-3) + self.check_grad(['x0'], 'Out') + self.check_grad(['x1'], 'Out') + self.check_grad(['x2'], 
'Out') + self.check_grad(['x3'], 'Out') class TestMultiDotOpFirstAndLast1D(TestMultiDotOp): @@ -191,8 +191,8 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B])} def test_check_grad(self): - self.check_grad(['x0'], 'Out', max_relative_error=1e-3) - self.check_grad(['x1'], 'Out', max_relative_error=1e-3) + self.check_grad(['x0'], 'Out') + self.check_grad(['x1'], 'Out') class TestMultiDotOp3MatFirstAndLast1D(TestMultiDotOp): @@ -204,9 +204,9 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B, self.C])} def test_check_grad(self): - self.check_grad(['x0'], 'Out', max_relative_error=1e-3) - self.check_grad(['x1'], 'Out', max_relative_error=1e-3) - self.check_grad(['x2'], 'Out', max_relative_error=1e-3) + self.check_grad(['x0'], 'Out') + self.check_grad(['x1'], 'Out') + self.check_grad(['x2'], 'Out') class TestMultiDotOp4MatFirstAndLast1D(TestMultiDotOp): @@ -222,10 +222,10 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B, self.C, self.D])} def test_check_grad(self): - self.check_grad(['x0'], 'Out', max_relative_error=1e-3) - self.check_grad(['x1'], 'Out', max_relative_error=1e-3) - self.check_grad(['x2'], 'Out', max_relative_error=1e-3) - self.check_grad(['x3'], 'Out', max_relative_error=1e-3) + self.check_grad(['x0'], 'Out') + self.check_grad(['x1'], 'Out') + self.check_grad(['x2'], 'Out') + self.check_grad(['x3'], 'Out') #####python API test####### @@ -236,41 +236,41 @@ def test_errors(self): input1 = 12 self.assertRaises(TypeError, paddle.multi_dot, [input1, input1]) - # The inputs dtype of multi_dot must be float32, float64 or float16. + # The inputs dtype of multi_dot must be float16, float32 or float64. 
input2 = fluid.layers.data( name='input2', shape=[10, 10], dtype="int32") self.assertRaises(TypeError, paddle.multi_dot, [input2, input2]) # the number of tensor must be larger than 1 - x0 = fluid.data(name='x0', shape=[3, 2], dtype="float32") + x0 = fluid.data(name='x0', shape=[3, 2], dtype="float64") self.assertRaises(ValueError, paddle.multi_dot, [x0]) #the first tensor must be 1D or 2D - x1 = fluid.data(name='x1', shape=[3, 2, 3], dtype="float32") - x2 = fluid.data(name='x2', shape=[3, 2], dtype="float32") + x1 = fluid.data(name='x1', shape=[3, 2, 3], dtype="float64") + x2 = fluid.data(name='x2', shape=[3, 2], dtype="float64") self.assertRaises(ValueError, paddle.multi_dot, [x1, x2]) #the last tensor must be 1D or 2D - x3 = fluid.data(name='x3', shape=[3, 2], dtype="float32") - x4 = fluid.data(name='x4', shape=[3, 2, 2], dtype="float32") + x3 = fluid.data(name='x3', shape=[3, 2], dtype="float64") + x4 = fluid.data(name='x4', shape=[3, 2, 2], dtype="float64") self.assertRaises(ValueError, paddle.multi_dot, [x3, x4]) #the tensor must be 2D, except first and last tensor - x5 = fluid.data(name='x5', shape=[3, 2], dtype="float32") - x6 = fluid.data(name='x6', shape=[2], dtype="float32") - x7 = fluid.data(name='x7', shape=[2, 2], dtype="float32") + x5 = fluid.data(name='x5', shape=[3, 2], dtype="float64") + x6 = fluid.data(name='x6', shape=[2], dtype="float64") + x7 = fluid.data(name='x7', shape=[2, 2], dtype="float64") self.assertRaises(ValueError, paddle.multi_dot, [x5, x6, x7]) class API_TestMultiDot(unittest.TestCase): def test_out(self): with fluid.program_guard(fluid.Program()): - x0 = fluid.data(name='x0', shape=[3, 2], dtype="float32") - x1 = fluid.data(name='x1', shape=[2, 3], dtype='float32') + x0 = fluid.data(name='x0', shape=[3, 2], dtype="float64") + x1 = fluid.data(name='x1', shape=[2, 3], dtype='float64') result = paddle.multi_dot([x0, x1]) exe = fluid.Executor(fluid.CPUPlace()) - data1 = np.random.rand(3, 2).astype("float32") - data2 = 
np.random.rand(2, 3).astype("float32") + data1 = np.random.rand(3, 2).astype("float64") + data2 = np.random.rand(2, 3).astype("float64") np_res = exe.run(feed={'x0': data1, 'x1': data2}, fetch_list=[result]) diff --git a/python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py b/python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py index dd7e800053042a..15ba331e9de5a3 100644 --- a/python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py @@ -65,7 +65,6 @@ 'rank_loss', 'sequence_conv', 'smooth_l1_loss', - 'spectral_norm', - 'multi_dot', + 'spectral_norm' ] # yapf: enable diff --git a/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py b/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py index 9db2752ab23217..581656f6cd421b 100644 --- a/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py @@ -76,8 +76,7 @@ 'trilinear_interp_v2', \ 'var_conv_2d', \ 'warpctc', \ - 'bilateral_slice', - 'multi_dot' + 'bilateral_slice' ] NO_FP16_CHECK_GRAD_OP_LIST = [ diff --git a/python/paddle/linalg.py b/python/paddle/linalg.py index fd06af3c3fbdfd..ba2c9b1d3ea8ee 100644 --- a/python/paddle/linalg.py +++ b/python/paddle/linalg.py @@ -22,6 +22,6 @@ 'cholesky', #noqa 'norm', 'inv', + 'matrix_power', 'multi_dot' - 'matrix_power' ] diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index cc20e98006fec4..5d92289b60e251 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -45,6 +45,7 @@ from .linalg import histogram # noqa: F401 from .linalg import mv # noqa: F401 from .linalg import matrix_power # noqa: F401 +from .linalg import multi_dot # noqa: F401 from .logic import equal # noqa: F401 from .logic import greater_equal # noqa: F401 from .logic import 
greater_than # noqa: F401 diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index fdb301cdb5a6c8..01be39d9011606 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -2,7 +2,6 @@ # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # @@ -943,73 +942,6 @@ def __check_input(x, vec): return out -def multi_dot(x, name=None): - """ - Compute the dot product of tow or more matrix in a single function call, while automatically selecting the fastest evaluation order. - - Supports inputs of float, double and float16 dtypes. This function does not support batched inputs. - - Every tensor in x must be 2D, except for the first and last which may be 1D. if the first tensor is a 1D vector of shape(n, ) it is treated as row vector of shape(1, n), similarly if the last tensor is a 1D vector of shape(n, ), it is treated as a column vector of shape(n, 1). - If the first and last tensors are matrices, the output will be a matrix. However, if either is a 1D vector, then the output will be a 1D vector. - - Notes: - The cost of multiplying two matrices with shapes (a, b) and (b, c) is a * b * c. Given matrices A, B, C with shapes (10, 100), (100, 5), (5, 50) respectively, we can calculate the cost of different multiplication orders as follows: - Cost((AB)C) = 10x100x5 + 10x5x50 = 7500 - Cost(A(BC)) = 10x100x50 + 100x5x50 = 75000 - - In this case, multiplying A and B first followed by C is 10 times faster. - - Args: - x ([Tensor]): The input tensors which is a list Tensor. - - Returns: - Tensor: The output Tensor. - - - Examples: - - .. 
code-block:: python - - import paddle - import numpy as np - - # A * B - A_data = np.random.random([3, 4]).astype(np.float32) - B_data = np.random.random([4, 5]).astype(np.float32) - A = paddle.to_tensor(A_data) - B = paddle.to_tensor(B_data) - out = paddle.multi_dot([A, B]) - print(out.numpy().shape) - # [3, 5] - - # A * B * C - A_data = np.random.random([10, 5]).astype(np.float32) - B_data = np.random.random([5, 8]).astype(np.float32) - C_data = np.random.random([8, 7]).astype(np.float32) - A = paddle.to_tensor(A_data) - B = paddle.to_tensor(B_data) - C = paddle.to_tensor(C_data) - out = paddle.multi_dot([A, B, C]) - print(out.numpy().shape) - # [10, 7] - """ - if in_dygraph_mode(): - return _C_ops.multi_dot(x) - - check_type(x, 'x', (list, tuple), 'multi_dot') - for id, item in enumerate(x): - check_variable_and_dtype(item, 'x[' + str(id) + ']', - ['float16', 'float32', 'float64'], 'multi_dot') - if item.dtype != x[0].dtype: - raise TypeError( - "All the Tensors in the input must have the same data type.") - - helper = LayerHelper('multi_dot', **locals()) - dtype = helper.input_dtype(input_param_name='x') - out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='multi_dot', inputs={"X": x}, outputs={"Out": out}) - - def matrix_power(x, n, name=None): r""" Computes the n-th power of a square matrix or a batch of square matrices. @@ -1019,12 +951,12 @@ def matrix_power(x, n, name=None): .. math:: Out = X ^ {n} - + Specifically, - If `n > 0`, it returns the matrix or a batch of matrices raised to the power of `n`. - + - If `n = 0`, it returns the identity matrix or a batch of identity matrices. - If `n < 0`, it returns the inverse of each matrix (if invertible) raised to @@ -1035,7 +967,7 @@ def matrix_power(x, n, name=None): to power `n`. Its shape should be `[*, M, M]`, where `*` is zero or more batch dimensions. Its data type should be float32 or float64. n (int): The exponent. It can be any positive, negative integer or zero. 
- name (str, optional): Name for the operation (optional, default is None). + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -1078,3 +1010,72 @@ def matrix_power(x, n, name=None): outputs={'Out': out}, attrs={'n': n}) return out + + +def multi_dot(x, name=None): + """ + Compute the dot product of two or more matrices in a single function call, while automatically selecting the fastest evaluation order. + + Supports inputs of float, double and float16 dtypes. This function does not support batched inputs. + + Every tensor in x must be 2D, except for the first and last which may be 1D. If the first tensor is a 1D vector of shape(n, ) it is treated as row vector of shape(1, n), similarly if the last tensor is a 1D vector of shape(n, ), it is treated as a column vector of shape(n, 1). + If the first and last tensors are matrices, the output will be a matrix. However, if either is a 1D vector, then the output will be a 1D vector. + + The cost of multiplying two matrices with shapes (a, b) and (b, c) is a * b * c. Given matrices A, B, C with shapes (10, 100), (100, 5), (5, 50) respectively, we can calculate the cost of different multiplication orders as follows: + - Cost((AB)C) = 10x100x5 + 10x5x50 = 7500 + - Cost(A(BC)) = 10x100x50 + 100x5x50 = 75000 + + In this case, multiplying A and B first followed by C is 10 times faster. + + Args: + x ([Tensor]): The input tensors which is a list of Tensor. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Tensor: The output Tensor. + + + Examples: + + .. 
code-block:: python + + import paddle + import numpy as np + + # A * B + A_data = np.random.random([3, 4]).astype(np.float32) + B_data = np.random.random([4, 5]).astype(np.float32) + A = paddle.to_tensor(A_data) + B = paddle.to_tensor(B_data) + out = paddle.multi_dot([A, B]) + print(out.numpy().shape) + # [3, 5] + + # A * B * C + A_data = np.random.random([10, 5]).astype(np.float32) + B_data = np.random.random([5, 8]).astype(np.float32) + C_data = np.random.random([8, 7]).astype(np.float32) + A = paddle.to_tensor(A_data) + B = paddle.to_tensor(B_data) + C = paddle.to_tensor(C_data) + out = paddle.multi_dot([A, B, C]) + print(out.numpy().shape) + + """ + if in_dygraph_mode(): + return _C_ops.multi_dot(x) + + check_type(x, 'x', (list, tuple), 'multi_dot') + for id, item in enumerate(x): + check_variable_and_dtype(item, 'x[' + str(id) + ']', + ['float16', 'float32', 'float64'], 'multi_dot') + if item.dtype != x[0].dtype: + raise TypeError( + "All the Tensors in the input must have the same data type.") + + helper = LayerHelper('multi_dot', **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_variable_for_type_inference(dtype) + helper.append_op(type='multi_dot', inputs={"X": x}, outputs={"Out": out}) + return out From 4d33b98f2fc4468f87e2462cfb1e1248e790a93d Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 30 Aug 2021 10:30:04 +0000 Subject: [PATCH 08/19] modify the code according to the review comments --- paddle/fluid/operators/fused/CMakeLists.txt | 2 +- ...fused_dropout.h => fused_dropout_common.h} | 5 +- .../operators/fused/fused_dropout_test.h | 22 +- .../fused/fused_residual_dropout_bias.h | 42 +-- .../fused/fused_residual_dropout_bias_test.cu | 262 +++++++++--------- 5 files changed, 171 insertions(+), 162 deletions(-) rename paddle/fluid/operators/fused/{fused_dropout.h => fused_dropout_common.h} (95%) diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 
f3035cddcba020..3df2144aa35944 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -74,6 +74,6 @@ if (WITH_GPU OR WITH_ROCM) # fused_dropout # only support CUDA if(NOT WITH_ROCM) - nv_test(test_fused_residual_dropout_bias SRCS fused_residual_dropout_bias_test.cu DEPS tensor op_registry dropout_op device_context generator) + nv_test(test_fused_residual_dropout_bias SRCS fused_residual_dropout_bias_test.cu DEPS tensor op_registry dropout_op device_context generator memory) endif() endif() diff --git a/paddle/fluid/operators/fused/fused_dropout.h b/paddle/fluid/operators/fused/fused_dropout_common.h similarity index 95% rename from paddle/fluid/operators/fused/fused_dropout.h rename to paddle/fluid/operators/fused/fused_dropout_common.h index bd6a4122f5830d..755153bb07eee9 100644 --- a/paddle/fluid/operators/fused/fused_dropout.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -18,9 +18,8 @@ limitations under the License. */ #include #include -#include -#include - +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h index 6cb8cd19b608d1..4a5e088d2013b8 100644 --- a/paddle/fluid/operators/fused/fused_dropout_test.h +++ b/paddle/fluid/operators/fused/fused_dropout_test.h @@ -22,11 +22,13 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" namespace framework = paddle::framework; namespace platform = paddle::platform; +namespace memory = paddle::memory; USE_OP(dropout); @@ -34,17 +36,15 @@ USE_OP(dropout); * @brief call paddle dropout op */ template -void Dropout(const T *x, const framework::DDim &x_dim, T *out, - std::vector *mask, const platform::CUDADeviceContext &ctx, - uint64_t seed, float dropout_prob, bool is_upscale_in_train, - bool is_test) { +void Dropout(const std::vector &x, const framework::DDim &x_dim, + std::vector *out, std::vector *mask, + const platform::CUDADeviceContext &ctx, uint64_t seed, + float dropout_prob, bool is_upscale_in_train, bool is_test) { framework::Scope scope; auto var_x = scope.Var("X"); auto tensor_x = var_x->GetMutable(); + framework::TensorFromVector(x, ctx, tensor_x); tensor_x->Resize(x_dim); - tensor_x->mutable_data(ctx.GetPlace()); - cudaMemcpy(tensor_x->data(), x, x_dim[0] * x_dim[1] * sizeof(T), - cudaMemcpyHostToDevice); auto var_out = scope.Var("Out"); auto tensor_out = var_out->GetMutable(); @@ -59,6 +59,7 @@ void Dropout(const T *x, const framework::DDim &x_dim, T *out, if (is_upscale_in_train) { attrs.insert({"dropout_implementation", std::string("upscale_in_train")}); } + if (is_test) { attrs.insert({"is_test", 1}); } @@ -66,11 +67,10 @@ void Dropout(const T *x, const framework::DDim &x_dim, T *out, auto op = framework::OpRegistry::CreateOp( "dropout", {{"X", {"X"}}}, {{"Out", {"Out"}}, {"Mask", {"Mask"}}}, attrs); op->Run(scope, ctx.GetPlace()); - cudaMemcpy(out, tensor_out->data(), x_dim[0] * x_dim[1] * sizeof(T), - cudaMemcpyDeviceToHost); + + framework::TensorToVector(*tensor_out, ctx, out); if (!is_test) { - cudaMemcpy((*mask).data(), tensor_mask->data(), - x_dim[0] * 
x_dim[1] * sizeof(uint8_t), cudaMemcpyDeviceToHost); + framework::TensorToVector(*tensor_mask, ctx, mask); } ctx.Wait(); } diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index 0a263635e46029..eda633380e07a2 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include "paddle/fluid/operators/fused/fused_dropout.h" +#include "paddle/fluid/operators/fused/fused_dropout_common.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" namespace paddle { @@ -22,6 +22,7 @@ namespace operators { namespace platform = paddle::platform; namespace cg = cooperative_groups; +namespace memory = paddle::memory; /** * @brief fused the add_bias, dropout, add residual into one operators @@ -32,15 +33,17 @@ namespace cg = cooperative_groups; /** * @brief the fused function called by every thread */ -template +template __forceinline__ __device__ void FusedResidualDropoutBiasVecOneThread( const int row_id, const int col_id, const int cols, curandStatePhilox4_32_10_t *state, const float dropout_prob, const T factor, const T *src, const T *residual, const T *bias, T *dst, MaskType *mask, - U *mean_val, U *var_val) { + typename details::MPTypeTrait::Type *mean_val, + typename details::MPTypeTrait::Type *var_val) { using LoadT = AlignedVector; using MaskLoadT = AlignedVector; + using U = typename details::MPTypeTrait::Type; + T src_vec[VecSize]; T residual_vec[VecSize]; T bias_vec[VecSize]; @@ -74,7 +77,7 @@ __forceinline__ __device__ void FusedResidualDropoutBiasVecOneThread( dest_vec[ii] = (src_vec[ii] + bias_vec[ii]) * static_cast(mask_vec[ii]) * factor + residual_vec[ii]; - if (layer_norm) { + if (ComputeLayerNorm) { U tmp = static_cast(dest_vec[ii]); *mean_val += tmp; *var_val += (tmp * tmp); @@ -114,7 +117,7 @@ __global__ void 
FusedResidualDropoutBiasVec(const size_t rows, for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { for (int i = col_id * VecSize; i < cols; i += blockDim.x * gridDim.x * VecSize) { - FusedResidualDropoutBiasVecOneThread( + FusedResidualDropoutBiasVecOneThread( r, i, cols, &state, dropout_prob, factor, src, residual, bias, dst, mask, NULL, NULL); } @@ -208,9 +211,10 @@ void LaunchResidualDropoutBias(const uint32_t rows, const uint32_t cols, const platform::CUDADeviceContext &ctx) { // dropout_prob == 1.0f if (std::abs(dropout_prob - 1.0f) < 1e-5) { - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMemcpyAsync(dst, residual, rows * cols * sizeof(T), - cudaMemcpyDeviceToDevice, ctx.stream())); + if (residual == dst) return; + auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + memory::Copy(cuda_place, dst, cuda_place, residual, rows * cols * sizeof(T), + ctx.stream()); PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync( mask_data, 0, rows * cols * sizeof(MaskType), ctx.stream())); return; @@ -282,7 +286,8 @@ __global__ void FusedResidualDropoutGradVec(const T *dout, const MaskType *mask, * 2. save 128*8 temporary sum in 8*128 shared memory * 3. 
reduce the sum of 128 rows data by 8*VecSize warps */ -template +template __global__ void FusedResidualDropoutBiasGradVec( const T *dout, const MaskType *mask, const T factor, const int64_t rows, const int64_t cols, T *dx, T *dbias) { @@ -316,9 +321,10 @@ __global__ void FusedResidualDropoutBiasGradVec( } // save temporary sum to cache and do transpose - __shared__ T cache[BSX * VecSize][BSY]; - for (int i = 0; i < VecSize; i++) + __shared__ T cache[BLOCK_SIZE_X * VecSize][BLOCK_SIZE_Y]; + for (int i = 0; i < VecSize; i++) { cache[threadIdx.x * VecSize + i][threadIdx.y] = tmp_sum[i]; + } __syncthreads(); // reduce sum @@ -327,11 +333,11 @@ __global__ void FusedResidualDropoutBiasGradVec( int x = tid >> 5; // warp id int y = tid & 31; // thread id on warp 0~31 - // need BSX * VecSize warps - if (x < BSX * VecSize) { + // need BLOCK_SIZE_X * VecSize warps + if (x < BLOCK_SIZE_X * VecSize) { // reduce 128 to 32 #pragma unroll - for (int i = 0; i < (BSY >> 5); i++) { + for (int i = 0; i < (BLOCK_SIZE_Y >> 5); i++) { sum += cache[x][y + i * 32]; } } @@ -341,7 +347,7 @@ __global__ void FusedResidualDropoutBiasGradVec( // save sum to dbias int bias_id = blockIdx.x * blockDim.x * VecSize + x; - if (y == 0 && x < VecSize * BSX && bias_id < cols) { + if (y == 0 && x < VecSize * BLOCK_SIZE_X && bias_id < cols) { dbias[bias_id] = sum; } } @@ -367,7 +373,9 @@ void LaunchResidualDropoutBiasGrad(const T *dout, const MaskType *mask, const int VecSize = 4; if (dbias != nullptr) { int real_vec_size = VecSize; - if (cols % VecSize != 0) real_vec_size = 1; + if (cols % VecSize != 0) { + real_vec_size = 1; + } auto threads = std::min(cols / real_vec_size, static_cast(8)); auto blocks = std::max( (uint32_t)1, std::min((cols / real_vec_size + threads - 1) / threads, diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu index d5377194934ff6..88438e6e0c36e5 100644 --- 
a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu @@ -24,7 +24,7 @@ namespace framework = paddle::framework; namespace platform = paddle::platform; /** - * @brief the unittest of fused_residual_dropout_bias + * @brief the unittest of fusedresidualdropoutbias * 1. random input data * 2. add bias, call paddle dropout op, add residual, and get the base result * 3. call FusedResidualDropoutBias function get fused result @@ -33,163 +33,169 @@ namespace platform = paddle::platform; template struct TestFusedResidualDropoutBias { - uint32_t _rows; - uint32_t _cols; - uint64_t _seed; - float _dropout_prob; - bool _is_upscale_in_train; - bool _is_test; // default false, Set to true for inference only - bool _has_bias = true; - framework::Tensor _src, _residual, _bias, _out, _mask; - framework::Tensor _dsrc, _dbias; - - std::vector _src_vec, _residual_vec, _bias_vec; - std::vector _correct_out, _correct_dsrc, _correct_dbias; - std::vector _correct_mask; - - platform::CUDAPlace _place; - platform::CUDADeviceContext *_ctx; + uint32_t rows; + uint32_t cols; + uint64_t seed; + float dropout_prob; + bool is_upscale_in_train; + bool is_test; // default false, Set to true for inference only + bool hasbias = true; + framework::Tensor src, residual, bias, out, mask; + framework::Tensor dsrc, dbias; + + std::vector src_vec, residual_vec, bias_vec; + std::vector correct_out, correct_dsrc, correct_dbias; + std::vector correct_mask; + + platform::CUDAPlace place; + platform::CUDADeviceContext *ctx; TestFusedResidualDropoutBias() { - _rows = 32; - _cols = 32; - _seed = 0; - _dropout_prob = 0.0; - _is_upscale_in_train = false; - _is_test = false; - _has_bias = true; - _ctx = new platform::CUDADeviceContext(_place); + rows = 32; + cols = 32; + seed = 0; + dropout_prob = 0.0; + is_upscale_in_train = false; + is_test = false; + hasbias = true; + // ctx = new platform::CUDADeviceContext(place); + 
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto devicectx = pool.Get(place); + ctx = reinterpret_cast(devicectx); } - TestFusedResidualDropoutBias(int rows, int cols, uint64_t seed = 0, - float dropout_prob = 0.0, - bool is_upscale_in_train = false, - bool is_test = false) { - _rows = rows; - _cols = cols; - _seed = seed; - _dropout_prob = dropout_prob; - _is_upscale_in_train = is_upscale_in_train; - _is_test = is_test; - _has_bias = true; - _ctx = new platform::CUDADeviceContext(_place); + TestFusedResidualDropoutBias(int rows_, int cols_, uint64_t seed_ = 0, + float dropout_prob_ = 0.0, + bool is_upscale_in_train_ = false, + bool is_test_ = false) { + rows = rows_; + cols = cols_; + seed = seed_; + dropout_prob = dropout_prob_; + is_upscale_in_train = is_upscale_in_train_; + is_test = is_test_; + hasbias = true; + // ctx = new platform::CUDADeviceContext(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto devicectx = pool.Get(place); + ctx = reinterpret_cast(devicectx); } - ~TestFusedResidualDropoutBias() { delete _ctx; } + ~TestFusedResidualDropoutBias() {} void SetUp() { - const int n = _rows * _cols; - _correct_out.resize(n); - _correct_mask.resize(n); - _correct_dsrc.resize(n); - _correct_dbias.resize(_cols); - - _src_vec.resize(n); - _residual_vec.resize(n); - _bias_vec.resize(_cols); + const int n = rows * cols; + correct_out.resize(n); + correct_mask.resize(n); + correct_dsrc.resize(n); + correct_dbias.resize(cols); + + src_vec.resize(n); + residual_vec.resize(n); + bias_vec.resize(cols); std::default_random_engine random(time(NULL)); std::uniform_real_distribution dis(0.0, 1.0); - for (int i = 0; i < _rows; i++) { - for (int j = 0; j < _cols; j++) { - _src_vec[i * _cols + j] = static_cast(dis(random)); - _residual_vec[i * _cols + j] = static_cast(dis(random)); - if (i == 0) _bias_vec[j] = dis(random); + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + 
src_vec[i * cols + j] = static_cast(dis(random)); + residual_vec[i * cols + j] = static_cast(dis(random)); + if (i == 0) bias_vec[j] = dis(random); } } - framework::TensorFromVector(_src_vec, *_ctx, &_src); - _src.Resize({_rows, _cols}); - framework::TensorFromVector(_residual_vec, *_ctx, &_residual); - _residual.Resize({_rows, _cols}); - if (_has_bias) { - framework::TensorFromVector(_bias_vec, *_ctx, &_bias); - _bias.Resize({_cols}); + framework::TensorFromVector(src_vec, *ctx, &src); + src.Resize({rows, cols}); + framework::TensorFromVector(residual_vec, *ctx, &residual); + residual.Resize({rows, cols}); + if (hasbias) { + framework::TensorFromVector(bias_vec, *ctx, &bias); + bias.Resize({cols}); } { - _out.Resize({_rows, _cols}); - _out.mutable_data(_place); - _mask.Resize({_rows, _cols}); - _mask.mutable_data(_place); - _dsrc.Resize({_rows, _cols}); - _dsrc.mutable_data(_place); - - if (_has_bias) { - _dbias.Resize({_cols}); - _dbias.mutable_data(_place); + out.Resize({rows, cols}); + out.mutable_data(place); + mask.Resize({rows, cols}); + mask.mutable_data(place); + dsrc.Resize({rows, cols}); + dsrc.mutable_data(place); + + if (hasbias) { + dbias.Resize({cols}); + dbias.mutable_data(place); } } } void BaseForward() { - std::vector out1(_rows * _cols), out2(_rows * _cols); - if (_has_bias) { + std::vector out1(rows * cols), out2(rows * cols); + if (hasbias) { // add bias - for (int i = 0; i < _rows; i++) { - for (int j = 0; j < _cols; j++) { - out1[i * _cols + j] = _src_vec[i * _cols + j] + _bias_vec[j]; + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + out1[i * cols + j] = src_vec[i * cols + j] + bias_vec[j]; } } // call dropout - Dropout(out1.data(), _src.dims(), out2.data(), &_correct_mask, *_ctx, - _seed, _dropout_prob, _is_upscale_in_train, _is_test); + Dropout(out1, src.dims(), &out2, &correct_mask, *ctx, seed, + dropout_prob, is_upscale_in_train, is_test); } else { - Dropout(_src_vec.data(), _src.dims(), out2.data(), 
&_correct_mask, - *_ctx, _seed, _dropout_prob, _is_upscale_in_train, _is_test); + Dropout(src_vec, src.dims(), &out2, &correct_mask, *ctx, seed, + dropout_prob, is_upscale_in_train, is_test); } + ctx->Wait(); // add residual - for (int i = 0; i < _rows; i++) { - for (int j = 0; j < _cols; j++) { - _correct_out[i * _cols + j] = - _residual_vec[i * _cols + j] + out2[i * _cols + j]; + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + correct_out[i * cols + j] = + residual_vec[i * cols + j] + out2[i * cols + j]; } } - _ctx->Wait(); } void BaseBackward() { - DropoutGrad(_correct_dsrc.data(), _src.dims(), _correct_out.data(), - _correct_mask.data(), *_ctx, _dropout_prob, - _is_upscale_in_train); + DropoutGrad(correct_dsrc.data(), src.dims(), correct_out.data(), + correct_mask.data(), *ctx, dropout_prob, + is_upscale_in_train); // calc dbias - memset(&_correct_dbias[0], 0, _cols * sizeof(T)); - for (int i = 0; i < _rows; i++) { - for (int j = 0; j < _cols; j++) { - _correct_dbias[j] += _correct_out[i * _cols + j]; + memset(&correct_dbias[0], 0, cols * sizeof(T)); + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + correct_dbias[j] += correct_out[i * cols + j]; } } } void FusedForward() { auto threads = paddle::operators::Get1DBlocksAnd2DGrids( - *_ctx, (uint64_t)_rows, (uint64_t)_cols); + *ctx, (uint64_t)rows, (uint64_t)cols); const int VecSize = 4; const int increment = - ((_cols - 1) / (threads.first.x * threads.second.x * VecSize) + 1) * + ((cols - 1) / (threads.first.x * threads.second.x * VecSize) + 1) * VecSize; T *bias_ptr = nullptr; - if (_has_bias) { - bias_ptr = _bias.data(); + if (hasbias) { + bias_ptr = bias.data(); } paddle::operators::LaunchResidualDropoutBias( - _rows, _cols, increment, _seed, _dropout_prob, _is_test, - _is_upscale_in_train, _src.data(), _residual.data(), bias_ptr, - _mask.data(), _out.data(), *_ctx); - _ctx->Wait(); + rows, cols, increment, seed, dropout_prob, is_test, is_upscale_in_train, + 
src.data(), residual.data(), bias_ptr, mask.data(), + out.data(), *ctx); + ctx->Wait(); } void FusedBackward() { - if (_is_test) return; + if (is_test) return; T *bias_ptr = nullptr; - if (_has_bias) { - bias_ptr = _dbias.data(); + if (hasbias) { + bias_ptr = dbias.data(); } paddle::operators::LaunchResidualDropoutBiasGrad( - _out.data(), _mask.data(), _dropout_prob, - _is_upscale_in_train, _rows, _cols, _dsrc.data(), bias_ptr, *_ctx); + out.data(), mask.data(), dropout_prob, is_upscale_in_train, + rows, cols, dsrc.data(), bias_ptr, *ctx); } void Run() { @@ -201,43 +207,39 @@ struct TestFusedResidualDropoutBias { } void CheckOut(const T diff) { - const int n = _rows * _cols; - std::vector out(n); - std::vector mask(n); - cudaMemcpy(out.data(), _out.data(), _rows * _cols * sizeof(T), - cudaMemcpyDeviceToHost); - if (!_is_test) { - cudaMemcpy(mask.data(), _mask.data(), - _rows * _cols * sizeof(uint8_t), cudaMemcpyDeviceToHost); + const int n = rows * cols; + std::vector _out(n); + std::vector _mask(n); + framework::TensorToVector(out, *ctx, &_out); + if (!is_test) { + framework::TensorToVector(mask, *ctx, &_mask); } - _ctx->Wait(); + ctx->Wait(); for (int i = 0; i < n; i++) { - EXPECT_LT(std::abs(out[i] - _correct_out[i]), diff); - if (!_is_test) EXPECT_EQ(mask[i], _correct_mask[i]); + EXPECT_LT(std::abs(_out[i] - correct_out[i]), diff); + if (!is_test) EXPECT_EQ(_mask[i], correct_mask[i]); } } void CheckGrad(const T diff) { - if (_is_test) return; + if (is_test) return; - const int n = _rows * _cols; + const int n = rows * cols; - std::vector dsrc(n); - cudaMemcpy(dsrc.data(), _dsrc.data(), _rows * _cols * sizeof(T), - cudaMemcpyDeviceToHost); + std::vector _dsrc(n); + framework::TensorToVector(dsrc, *ctx, &_dsrc); for (int i = 0; i < n; i++) { - EXPECT_LT(std::abs(dsrc[i] - _correct_dsrc[i]), diff); + EXPECT_LT(std::abs(_dsrc[i] - correct_dsrc[i]), diff); } - if (_has_bias) { - std::vector dbias(_cols); - cudaMemcpy(dbias.data(), _dbias.data(), _cols * sizeof(T), - 
cudaMemcpyDeviceToHost); - _ctx->Wait(); - for (int i = 0; i < _cols; i++) { - EXPECT_LT(std::abs(dbias[i] - _correct_dbias[i]), diff); + if (hasbias) { + std::vector _dbias(cols); + framework::TensorToVector(dbias, *ctx, &_dbias); + ctx->Wait(); + for (int i = 0; i < cols; i++) { + EXPECT_LT(std::abs(_dbias[i] - correct_dbias[i]), diff); } } } @@ -261,7 +263,7 @@ TEST(FusedDropout, GPUFusedResidualDropoutBiasDouble) { test.CheckGrad(static_cast(1e-5)); } -// test fp16, For inference, check_grad is not required. ref: test_dropout_op.py +// test fp16, For inference, check_grad is not required. ref: testdropout_op.py TEST(FusedDropout, GPUFusedResidualDropoutBiasFp16) { const int rows = 16; const int cols = 16; @@ -275,7 +277,7 @@ TEST(FusedDropout, GPUFusedResidualDropoutBiasNoBias) { const int rows = 16; const int cols = 16; TestFusedResidualDropoutBias test(rows, cols); - test._has_bias = false; + test.hasbias = false; test.Run(); test.CheckOut(static_cast(1e-5)); test.CheckGrad(static_cast(1e-5)); @@ -286,7 +288,7 @@ TEST(FusedDropout, GPUFusedResidualDropoutBiasNoBias2) { const int rows = 16; const int cols = 17; TestFusedResidualDropoutBias test(rows, cols); - test._has_bias = false; + test.hasbias = false; test.Run(); test.CheckOut(static_cast(1e-5)); test.CheckGrad(static_cast(1e-5)); From bd44d043d24b5f76c2f07da6bb0a7a52788c1ace Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 30 Aug 2021 11:27:40 +0000 Subject: [PATCH 09/19] replace cudaMemcpy with TensorFromVector and TensorToVector in DropoutGrad --- .../operators/fused/fused_dropout_test.h | 18 +++++++----------- .../fused/fused_residual_dropout_bias_test.cu | 19 +++++++++++-------- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h index 4a5e088d2013b8..e9fd0e6c097851 100644 --- a/paddle/fluid/operators/fused/fused_dropout_test.h +++ b/paddle/fluid/operators/fused/fused_dropout_test.h @@ 
-79,24 +79,21 @@ void Dropout(const std::vector &x, const framework::DDim &x_dim, * @brief call paddle dropout_grad op */ template -void DropoutGrad(T *dx, const framework::DDim &x_dim, const T *dout, - const uint8_t *mask, const platform::CUDADeviceContext &ctx, - float dropout_prob, bool is_upscale_in_train) { +void DropoutGrad(std::vector *dx, const framework::DDim &x_dim, + const std::vector &dout, const std::vector &mask, + const platform::CUDADeviceContext &ctx, float dropout_prob, + bool is_upscale_in_train) { framework::Scope scope; const size_t n = x_dim[0] * x_dim[1]; auto var_out = scope.Var("DOut"); auto tensor_out = var_out->GetMutable(); + framework::TensorFromVector(dout, ctx, tensor_out); tensor_out->Resize(x_dim); - tensor_out->mutable_data(ctx.GetPlace()); - cudaMemcpy(tensor_out->data(), dout, n * sizeof(T), - cudaMemcpyHostToDevice); auto var_mask = scope.Var("Mask"); auto tensor_mask = var_mask->GetMutable(); + framework::TensorFromVector(mask, ctx, tensor_mask); tensor_mask->Resize(x_dim); - tensor_mask->mutable_data(ctx.GetPlace()); - cudaMemcpy(tensor_mask->data(), mask, n * sizeof(uint8_t), - cudaMemcpyHostToDevice); auto var_dx = scope.Var("DX"); auto tensor_dx = var_dx->GetMutable(); @@ -115,7 +112,6 @@ void DropoutGrad(T *dx, const framework::DDim &x_dim, const T *dout, {{"X@GRAD", {"DX"}}}, attrs); op->Run(scope, ctx.GetPlace()); - cudaMemcpy(dx, tensor_dx->data(), x_dim[0] * x_dim[1] * sizeof(T), - cudaMemcpyDeviceToHost); + framework::TensorToVector(*tensor_dx, ctx, dx); ctx.Wait(); } diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu index 88438e6e0c36e5..14267974ff2657 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu @@ -58,7 +58,6 @@ struct TestFusedResidualDropoutBias { is_upscale_in_train = false; is_test = false; hasbias = true; - // ctx 
= new platform::CUDADeviceContext(place); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto devicectx = pool.Get(place); ctx = reinterpret_cast(devicectx); @@ -75,7 +74,6 @@ struct TestFusedResidualDropoutBias { is_upscale_in_train = is_upscale_in_train_; is_test = is_test_; hasbias = true; - // ctx = new platform::CUDADeviceContext(place); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto devicectx = pool.Get(place); ctx = reinterpret_cast(devicectx); @@ -100,7 +98,9 @@ struct TestFusedResidualDropoutBias { for (int j = 0; j < cols; j++) { src_vec[i * cols + j] = static_cast(dis(random)); residual_vec[i * cols + j] = static_cast(dis(random)); - if (i == 0) bias_vec[j] = dis(random); + if (i == 0) { + bias_vec[j] = dis(random); + } } } @@ -155,9 +155,8 @@ struct TestFusedResidualDropoutBias { } void BaseBackward() { - DropoutGrad(correct_dsrc.data(), src.dims(), correct_out.data(), - correct_mask.data(), *ctx, dropout_prob, - is_upscale_in_train); + DropoutGrad(&correct_dsrc, src.dims(), correct_out, correct_mask, *ctx, + dropout_prob, is_upscale_in_train); // calc dbias memset(&correct_dbias[0], 0, cols * sizeof(T)); for (int i = 0; i < rows; i++) { @@ -187,7 +186,9 @@ struct TestFusedResidualDropoutBias { } void FusedBackward() { - if (is_test) return; + if (is_test) { + return; + } T *bias_ptr = nullptr; if (hasbias) { @@ -223,7 +224,9 @@ struct TestFusedResidualDropoutBias { } void CheckGrad(const T diff) { - if (is_test) return; + if (is_test) { + return; + } const int n = rows * cols; From d2beab70c620a19a6e26b5e8c16a4f537882fc68 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 31 Aug 2021 02:19:33 +0000 Subject: [PATCH 10/19] set dropout attr 'is_test':false --- paddle/fluid/operators/fused/fused_dropout_test.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h index 
e9fd0e6c097851..288b415aef31f9 100644 --- a/paddle/fluid/operators/fused/fused_dropout_test.h +++ b/paddle/fluid/operators/fused/fused_dropout_test.h @@ -61,7 +61,7 @@ void Dropout(const std::vector &x, const framework::DDim &x_dim, } if (is_test) { - attrs.insert({"is_test", 1}); + attrs.insert({"is_test", true}); } auto op = framework::OpRegistry::CreateOp( @@ -100,7 +100,7 @@ void DropoutGrad(std::vector *dx, const framework::DDim &x_dim, framework::AttributeMap attrs; attrs.insert({"dropout_prob", dropout_prob}); - attrs.insert({"is_test", 0}); + attrs.insert({"is_test", false}); if (is_upscale_in_train) { attrs.insert({"dropout_implementation", std::string("upscale_in_train")}); } else { From 40cd7cad7071a07b6dd1fa18b8373a8ad00fa7d9 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 31 Aug 2021 06:07:58 +0000 Subject: [PATCH 11/19] reduce the code to less than 1000 lines --- paddle/fluid/operators/multi_dot_op.cc | 23 ++++++------------- .../tests/unittests/test_multi_dot_op.py | 4 ---- 2 files changed, 7 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/operators/multi_dot_op.cc b/paddle/fluid/operators/multi_dot_op.cc index 6682248f8eff1c..c27c01b92b6db3 100644 --- a/paddle/fluid/operators/multi_dot_op.cc +++ b/paddle/fluid/operators/multi_dot_op.cc @@ -197,15 +197,13 @@ static inline framework::Tensor MultiDotMatChainOrder( const std::vector& ins_dims, const bool save_result, std::vector* results) { auto order = GetOrder(ins, ins_dims); - auto n = ins.size(); - return MatChainMul(ctx, ins, ins_dims, order, 0, n - 1, - save_result, results); + return MatChainMul(ctx, ins, ins_dims, order, 0, + ins.size() - 1, save_result, results); } inline void GetDims(const std::vector& ins, std::vector* ins_dims) { const auto n = ins.size(); - std::vector real_ins; for (size_t i = 0; i < n; i++) { (*ins_dims)[i] = ins[i]->dims(); if (i == 0 && (*ins_dims)[i].size() == 1) { @@ -260,17 +258,13 @@ class MultiDotOp : public framework::OperatorWithKernel { const 
framework::ExecutionContext& ctx) const override { auto inputs = ctx.MultiInput("X"); auto input_data_type = framework::proto::VarType::Type(0); - bool flag = 1; for (auto* input : inputs) { - if (!input->IsInitialized() || input->numel() == 0) { - flag = 0; + if (!input->IsInitialized()) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The inputs of multi_dot OP are Empty!")); break; } } - if (flag == 0) { - PADDLE_THROW(platform::errors::InvalidArgument( - "All Inputs of multi_dot OP are Empty!")); - } input_data_type = inputs[0]->type(); #ifdef PADDLE_WITH_MKLDNN @@ -407,7 +401,6 @@ class MultiDotGradKernel : public framework::OpKernel { auto mat_dim_b = math::CreateMatrixDescriptor(b_dim, 0, true); T alpha = static_cast(1.0); auto blas = math::GetBlas(ctx); - blas.MatMul(A, mat_dim_a, dout, mat_dim_dout, alpha, dB, T(0)); blas.MatMul(dout, mat_dim_dout, B, mat_dim_b, alpha, dA, T(0)); } @@ -522,10 +515,8 @@ class MultiDotGradKernel : public framework::OpKernel { const auto Ka = ins_dims[0][1]; const auto Nb = ins_dims[1][1]; const auto Nc = ins_dims[2][1]; - const uint64_t cost1 = - Ma * Nb * (Ka + Nc); // Ma * Ka * Nb + Ma * Nb * Nc; - const uint64_t cost2 = - Ka * Nc * (Nb + Ma); // Ka * Nb * Nc + Ma * Ka * Nc; + const uint64_t cost1 = Ma * Nb * (Ka + Nc); + const uint64_t cost2 = Ka * Nc * (Nb + Ma); auto mat_dim_a = math::CreateMatrixDescriptor(ins_dims[0], 0, false); auto mat_dim_b = math::CreateMatrixDescriptor(ins_dims[1], 0, false); auto mat_dim_c = math::CreateMatrixDescriptor(ins_dims[2], 0, false); diff --git a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py index 35482144c272dd..c5335d11410897 100644 --- a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py +++ b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py @@ -190,10 +190,6 @@ def get_inputs_and_outputs(self): self.inputs = {'X': [('x0', self.A), ('x1', self.B)]} self.outputs = {'Out': multi_dot([self.A, 
self.B])} - def test_check_grad(self): - self.check_grad(['x0'], 'Out') - self.check_grad(['x1'], 'Out') - class TestMultiDotOp3MatFirstAndLast1D(TestMultiDotOp): def get_inputs_and_outputs(self): From f342f00661deff765ecf1fc4825453d56ea9e8d5 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Thu, 2 Sep 2021 02:42:14 +0000 Subject: [PATCH 12/19] add comment and modifying code according to the review comments --- paddle/fluid/operators/multi_dot_op.cc | 17 ++++++- .../tests/unittests/test_multi_dot_op.py | 49 ++++--------------- python/paddle/tensor/linalg.py | 16 +++--- 3 files changed, 33 insertions(+), 49 deletions(-) diff --git a/paddle/fluid/operators/multi_dot_op.cc b/paddle/fluid/operators/multi_dot_op.cc index c27c01b92b6db3..3929aebc4382e7 100644 --- a/paddle/fluid/operators/multi_dot_op.cc +++ b/paddle/fluid/operators/multi_dot_op.cc @@ -114,7 +114,10 @@ inline framework::Tensor MatMul(const framework::ExecutionContext& ctx, } /** - * @brief multi matrix dot by a chain order + * @brief Recursively calculate matrix multiplication according to the optimal + * order + * Let k = order[i,j], then ins[i...j] = ins[i...k] * ins[k+1 ...j] + * * @param * ins: the input tensors * ins_dims: the shape of ins after reshape @@ -164,15 +167,19 @@ inline framework::Tensor MatChainMul( std::vector GetOrder(const std::vector& ins, const std::vector& ins_dims) { auto n = ins.size(); + // p: save the ins shape, the ins[i] shape is (p[i], p[i+1]) std::vector p(n + 1); for (uint64_t i = 0; i < n; i++) { p[i] = ins_dims[i][0]; } p[n] = ins_dims[n - 1][1]; + // m[i, j]: save the lowest cost for multiplying ins[i...j] std::vector m(n * n, 0); + // define ins[i...j] means multiplying matrices from ins[i] to ins[j] + // order[i, j] = k, this means that ins[i...k] and ins[k...j] fist and then + // multiply the resulting matrices is the optimal order for ins[i...j] std::vector order(n * n); - for (uint64_t l = 1; l < n; l++) { for (uint64_t i = 0; i < n - l; i++) { auto j = i + l; @@ -292,6 
+299,12 @@ class MultiDotOp : public framework::OperatorWithKernel { } }; +/** + * 1. there are only 2 matrices: direct matrix multiplication A*B + * 2. there are only 3 matrices: calculate the cost of (A*B)*C and A*(B*C), + * choose the least cost order for calculation + * 3. more than 3 matrices: call MultiDotMatChainOrder + */ template class MultiDotKernel : public framework::OpKernel { public: diff --git a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py index c5335d11410897..e128939d9d5ecc 100644 --- a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py +++ b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py @@ -24,6 +24,8 @@ paddle.enable_static() +#the unittest of multi_dot +#compare the result of paddle multi_dot and numpy multi_dot class TestMultiDotOp(OpTest): def setUp(self): self.op_type = "multi_dot" @@ -47,11 +49,6 @@ def test_check_grad(self): self.check_grad(['x1'], 'Out') -class TestMultiDotOpDouble(TestMultiDotOp): - def get_dtype(self): - return "float64" - - #(A*B)*C class TestMultiDotOp3Mat(TestMultiDotOp): def get_inputs_and_outputs(self): @@ -109,7 +106,7 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B])} -class TestMultiDotOp3MatFirst1D(TestMultiDotOp): +class TestMultiDotOp3MatFirst1D(TestMultiDotOp3Mat): def get_inputs_and_outputs(self): self.A = np.random.random((4)).astype(self.dtype) self.B = np.random.random((4, 3)).astype(self.dtype) @@ -117,13 +114,8 @@ def get_inputs_and_outputs(self): self.inputs = {'X': [('x0', self.A), ('x1', self.B), ('x2', self.C)]} self.outputs = {'Out': multi_dot([self.A, self.B, self.C])} - def test_check_grad(self): - self.check_grad(['x0'], 'Out') - self.check_grad(['x1'], 'Out') - self.check_grad(['x2'], 'Out') - -class TestMultiDotOp4MatFirst1D(TestMultiDotOp): +class TestMultiDotOp4MatFirst1D(TestMultiDotOp4Mat): def get_inputs_and_outputs(self): self.A = 
np.random.random((4)).astype(self.dtype) self.B = np.random.random((4, 3)).astype(self.dtype) @@ -135,12 +127,6 @@ def get_inputs_and_outputs(self): } self.outputs = {'Out': multi_dot([self.A, self.B, self.C, self.D])} - def test_check_grad(self): - self.check_grad(['x0'], 'Out') - self.check_grad(['x1'], 'Out') - self.check_grad(['x2'], 'Out') - self.check_grad(['x3'], 'Out') - class TestMultiDotOpLast1D(TestMultiDotOp): def get_inputs_and_outputs(self): @@ -150,7 +136,7 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B])} -class TestMultiDotOp3MatLast1D(TestMultiDotOp): +class TestMultiDotOp3MatLast1D(TestMultiDotOp3Mat): def get_inputs_and_outputs(self): self.A = np.random.random((2, 4)).astype(self.dtype) self.B = np.random.random((4, 3)).astype(self.dtype) @@ -164,7 +150,7 @@ def test_check_grad(self): self.check_grad(['x2'], 'Out') -class TestMultiDotOp4MatLast1D(TestMultiDotOp): +class TestMultiDotOp4MatLast1D(TestMultiDotOp4Mat): def get_inputs_and_outputs(self): self.A = np.random.random((2, 3)).astype(self.dtype) self.B = np.random.random((3, 2)).astype(self.dtype) @@ -176,12 +162,6 @@ def get_inputs_and_outputs(self): } self.outputs = {'Out': multi_dot([self.A, self.B, self.C, self.D])} - def test_check_grad(self): - self.check_grad(['x0'], 'Out') - self.check_grad(['x1'], 'Out') - self.check_grad(['x2'], 'Out') - self.check_grad(['x3'], 'Out') - class TestMultiDotOpFirstAndLast1D(TestMultiDotOp): def get_inputs_and_outputs(self): @@ -191,7 +171,7 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B])} -class TestMultiDotOp3MatFirstAndLast1D(TestMultiDotOp): +class TestMultiDotOp3MatFirstAndLast1D(TestMultiDotOp3Mat): def get_inputs_and_outputs(self): self.A = np.random.random((6, )).astype(self.dtype) self.B = np.random.random((6, 4)).astype(self.dtype) @@ -199,13 +179,8 @@ def get_inputs_and_outputs(self): self.inputs = {'X': [('x0', self.A), ('x1', self.B), ('x2', self.C)]} 
self.outputs = {'Out': multi_dot([self.A, self.B, self.C])} - def test_check_grad(self): - self.check_grad(['x0'], 'Out') - self.check_grad(['x1'], 'Out') - self.check_grad(['x2'], 'Out') - -class TestMultiDotOp4MatFirstAndLast1D(TestMultiDotOp): +class TestMultiDotOp4MatFirstAndLast1D(TestMultiDotOp4Mat): def get_inputs_and_outputs(self): self.A = np.random.random((3, )).astype(self.dtype) self.B = np.random.random((3, 4)).astype(self.dtype) @@ -217,12 +192,6 @@ def get_inputs_and_outputs(self): } self.outputs = {'Out': multi_dot([self.A, self.B, self.C, self.D])} - def test_check_grad(self): - self.check_grad(['x0'], 'Out') - self.check_grad(['x1'], 'Out') - self.check_grad(['x2'], 'Out') - self.check_grad(['x3'], 'Out') - #####python API test####### class TestMultiDotOpError(unittest.TestCase): @@ -258,7 +227,7 @@ def test_errors(self): self.assertRaises(ValueError, paddle.multi_dot, [x5, x6, x7]) -class API_TestMultiDot(unittest.TestCase): +class APITestMultiDot(unittest.TestCase): def test_out(self): with fluid.program_guard(fluid.Program()): x0 = fluid.data(name='x0', shape=[3, 2], dtype="float64") diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 01be39d9011606..4d26cd0256c72d 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -1014,18 +1014,19 @@ def matrix_power(x, n, name=None): def multi_dot(x, name=None): """ - Compute the dot product of tow or more matrix in a single function call, while automatically selecting the fastest evaluation order. + Multi_dot is an operator that calculates multiple matrix multiplications. Supports inputs of float, double and float16 dtypes. This function does not support batched inputs. - Every tensor in x must be 2D, except for the first and last which may be 1D. if the first tensor is a 1D vector of shape(n, ) it is treated as row vector of shape(1, n), similarly if the last tensor is a 1D vector of shape(n, ), it is treated as a column vector of shape(n, 1). 
- If the first and last tensors are matrices, the output will be a matrix. However, if either is a 1D vector, then the output will be a 1D vector. + The input tensor in [x] must be 2D except for the first and last can be 1D. If the first tensor is a 1D vector of shape(n, ) it is treated as row vector of shape(1, n), similarly if the last tensor is a 1D vector of shape(n, ), it is treated as a column vector of shape(n, 1). - The cost of multiplying two matrices with shapes (a, b) and (b, c) is a * b * c. Given matrices A, B, C with shapes (10, 100), (100, 5), (5, 50) respectively, we can calculate the cost of different multiplication orders as follows: - - Cost((AB)C) = 10x100x5 + 10x5x50 = 7500 - - Cost(A(BC)) = 10x100x50 + 100x5x50 = 75000 + If the first and last tensor are 2D matrix, then the output is also 2D matrix, otherwise the output is a 1D vector. - In this case, multiplying A and B first followed by C is 10 times faster. + Multi_dot will select the lowest cost multiplication order for calculation. The cost of multiplying two matrices with shapes (a, b) and (b, c) is a * b * c. Given matrices A, B, C with shapes (20, 5), (5, 100), (100, 10) respectively, we can calculate the cost of different multiplication orders as follows: + - Cost((AB)C) = 20x5x100 + 20x100x10 = 30000 + - Cost(A(BC)) = 5x100x10 + 20x5x10 = 6000 + + In this case, multiplying B and C first, then multiply A, which is 5 times faster than sequential calculation. Args: x ([Tensor]): The input tensors which is a list Tensor. 
@@ -1061,6 +1062,7 @@ def multi_dot(x, name=None): C = paddle.to_tensor(C_data) out = paddle.multi_dot([A, B, C]) print(out.numpy().shape) + # [10, 7] """ if in_dygraph_mode(): From 5d2bbc889b8d3774fa01003f325fe012da2a11c8 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Thu, 2 Sep 2021 09:22:14 +0000 Subject: [PATCH 13/19] optimize the code according to the review comments --- .../operators/fused/fused_dropout_common.h | 73 ++++++--- .../fused/fused_residual_dropout_bias.h | 142 +++++++++--------- .../fused/fused_residual_dropout_bias_test.cu | 129 ++++++---------- 3 files changed, 167 insertions(+), 177 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index 755153bb07eee9..53ce76826a6793 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -22,48 +22,75 @@ limitations under the License. */ #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/fast_divmod.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/gpu_launch_config.h" namespace paddle { namespace operators { -/** - * get 1D threads and blocks - */ -template -inline std::pair Get1DThreadsAndBlocks( - const platform::CUDADeviceContext &ctx, const uint64_t n) { - const uint64_t tmp_n = n / VecSize; - int threads = std::max( - (uint64_t)32, std::min(tmp_n, (uint64_t)ctx.GetMaxThreadsPerBlock())); - int blocks = std::max((uint64_t)1, (tmp_n + threads - 1) / threads); - return std::pair{threads, blocks}; -} +#define MAX_CACHE_BYTES 16 /** * get the threads for fused_residual_dropout_bias: * 1D blocks: blockDim.x = cols * 2D grids: gridDim.y = rows */ -template -inline std::pair Get1DBlocksAnd2DGrids( +inline platform::GpuLaunchConfig Get1DBlocksAnd2DGrids( const platform::CUDADeviceContext &ctx, 
const uint32_t rows, - const uint32_t cols) { + const uint32_t cols, const int VecSize) { const uint32_t tmp_cols = cols / VecSize; int threads = std::max( (uint32_t)32, std::min(tmp_cols, (uint32_t)ctx.GetMaxThreadsPerBlock())); int blocks_x = std::max((uint32_t)1, (tmp_cols + threads - 1) / threads); int blocks_y = std::max((uint32_t)1, rows); - dim3 block_dim(threads, 1, 1); - dim3 grid_dim(blocks_x, blocks_y, 1); - return std::pair{block_dim, grid_dim}; + platform::GpuLaunchConfig config; + config.block_per_grid.x = blocks_x; + config.block_per_grid.y = blocks_y; + config.thread_per_block.x = threads; + return config; } -// aligned vector generates vectorized load/store on CUDA -template -struct alignas(sizeof(T) * VecSize) AlignedVector { - T val[VecSize]; -}; +__forceinline__ __device__ void Rand1(curandStatePhilox4_32_10_t *state, + float *data) { + data[0] = curand_uniform(state); +} + +__forceinline__ __device__ void Rand2(curandStatePhilox4_32_10_t *state, + float *data) { + data[0] = curand_uniform(state); + data[1] = curand_uniform(state); +} + +__forceinline__ __device__ void Rand4(curandStatePhilox4_32_10_t *state, + float *data) { + float4 rand4 = curand_uniform4(state); + data[0] = rand4.x; + data[1] = rand4.y; + data[2] = rand4.w; + data[3] = rand4.z; +} + +__forceinline__ __device__ void Rand8(curandStatePhilox4_32_10_t *state, + float *data) { + Rand4(state, data); + Rand4(state, data + 4); +} + +__forceinline__ __device__ void RandVec(curandStatePhilox4_32_10_t *state, + float *data, const int VecSize) { + if (VecSize == 1) { + Rand1(state, data); + } else if (VecSize == 2) { + Rand2(state, data); + } else if (VecSize == 4) { + Rand4(state, data); + } else if (VecSize == 8) { + Rand8(state, data); + } else { + return; + } +} } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index eda633380e07a2..bafc8c60040c1e 100644 
--- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -20,28 +20,19 @@ limitations under the License. */ namespace paddle { namespace operators { -namespace platform = paddle::platform; -namespace cg = cooperative_groups; -namespace memory = paddle::memory; - -/** - * @brief fused the add_bias, dropout, add residual into one operators - * - */ - -/********Forward**************/ /** - * @brief the fused function called by every thread + * @brief The fused function called by every thread + * VecSize can be 1, 2, 4 or 8 */ template -__forceinline__ __device__ void FusedResidualDropoutBiasVecOneThread( +__forceinline__ __device__ void FusedResidualDropoutBiasOneThread( const int row_id, const int col_id, const int cols, curandStatePhilox4_32_10_t *state, const float dropout_prob, const T factor, const T *src, const T *residual, const T *bias, T *dst, MaskType *mask, typename details::MPTypeTrait::Type *mean_val, typename details::MPTypeTrait::Type *var_val) { - using LoadT = AlignedVector; - using MaskLoadT = AlignedVector; + using LoadT = platform::CudaAlignedVector; + using MaskLoadT = platform::CudaAlignedVector; using U = typename details::MPTypeTrait::Type; T src_vec[VecSize]; @@ -60,16 +51,19 @@ __forceinline__ __device__ void FusedResidualDropoutBiasVecOneThread( LoadT *bias_value = bias != nullptr ? 
reinterpret_cast(&bias_vec) : nullptr; - if (bias != nullptr) + if (bias) { *bias_value = *reinterpret_cast(&bias[col_id]); + } + + float rand[VecSize]; + RandVec(state, rand, VecSize); - float4 rand = curand_uniform4(state); T dest_vec[VecSize]; MaskType mask_vec[VecSize]; #pragma unroll for (int ii = 0; ii < VecSize; ii++) { - mask_vec[ii] = (MaskType)((&rand.x)[ii] >= dropout_prob); + mask_vec[ii] = static_cast(rand[ii] >= dropout_prob); } #pragma unroll @@ -97,13 +91,13 @@ __forceinline__ __device__ void FusedResidualDropoutBiasVecOneThread( * the bias shape is (1, cols) */ template -__global__ void FusedResidualDropoutBiasVec(const size_t rows, - const size_t cols, uint64_t seed, - const float dropout_prob, - const bool is_upscale_in_train, - const T *src, const T *residual, - const T *bias, MaskType *mask, - T *dst, uint64_t increment) { +__global__ void FusedResidualDropoutBias(const size_t rows, const size_t cols, + uint64_t seed, + const float dropout_prob, + const bool is_upscale_in_train, + const T *src, const T *residual, + const T *bias, MaskType *mask, T *dst, + uint64_t increment) { int col_id = blockDim.x * blockIdx.x + threadIdx.x; int row_id = blockIdx.y; int idx = row_id * cols + col_id; @@ -117,9 +111,9 @@ __global__ void FusedResidualDropoutBiasVec(const size_t rows, for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { for (int i = col_id * VecSize; i < cols; i += blockDim.x * gridDim.x * VecSize) { - FusedResidualDropoutBiasVecOneThread( + FusedResidualDropoutBiasOneThread( r, i, cols, &state, dropout_prob, factor, src, residual, bias, dst, - mask, NULL, NULL); + mask, nullptr, nullptr); } } } @@ -127,12 +121,14 @@ __global__ void FusedResidualDropoutBiasVec(const size_t rows, /** * @brief the fused function called by every thread */ -template -__forceinline__ __device__ void FusedResidualDropoutBiasOnlyInferVecOneThread( +template +__forceinline__ __device__ void FusedResidualDropoutBiasOnlyInferOneThread( const int row_id, const int 
col_id, const int cols, const float dropout_prob, const T factor, const T *src, const T *residual, - const T *bias, T *dst, U *mean_val, U *var_val) { - using LoadT = AlignedVector; + const T *bias, T *dst, typename details::MPTypeTrait::Type *mean_val, + typename details::MPTypeTrait::Type *var_val) { + using LoadT = platform::CudaAlignedVector; + using U = typename details::MPTypeTrait::Type; T src_vec[VecSize]; T residual_vec[VecSize]; T bias_vec[VecSize]; @@ -149,15 +145,16 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOnlyInferVecOneThread( LoadT *bias_value = bias != nullptr ? reinterpret_cast(&bias_vec) : nullptr; - if (bias != nullptr) + if (bias) { *bias_value = *reinterpret_cast(&bias[col_id]); + } T dest_vec[VecSize]; #pragma unroll for (int ii = 0; ii < VecSize; ii++) { dest_vec[ii] = (src_vec[ii] + bias_vec[ii]) * factor + residual_vec[ii]; - if (layer_norm) { + if (ComputeLayerNorm) { U tmp = static_cast(dest_vec[ii]); *mean_val += tmp; *var_val += (tmp * tmp); @@ -175,7 +172,7 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOnlyInferVecOneThread( * the bias shape is (1, cols) */ template -__global__ void FusedResidualDropoutBiasOnlyInferVec( +__global__ void FusedResidualDropoutBiasOnlyInfer( const size_t rows, const size_t cols, const float dropout_prob, const bool is_upscale_in_train, const T *src, const T *residual, const T *bias, T *dst) { @@ -191,7 +188,7 @@ __global__ void FusedResidualDropoutBiasOnlyInferVec( for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { for (int i = col_id * VecSize; i < cols; i += blockDim.x * gridDim.x * VecSize) { - FusedResidualDropoutBiasOnlyInferVecOneThread( + FusedResidualDropoutBiasOnlyInferOneThread( r, i, cols, dropout_prob, factor, src, residual, bias, dst, nullptr, nullptr); } @@ -220,48 +217,46 @@ void LaunchResidualDropoutBias(const uint32_t rows, const uint32_t cols, return; } - const int VecSize = 4; - auto threads = Get1DBlocksAnd2DGrids(ctx, rows, cols); + const int 
VecSize = MAX_CACHE_BYTES / sizeof(T); + auto config = Get1DBlocksAnd2DGrids(ctx, rows, cols, VecSize); if (cols % VecSize != 0) { if (!is_test) { - FusedResidualDropoutBiasVec< - T, uint8_t, 1><<>>( + FusedResidualDropoutBias<<< + config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, bias, mask_data, dst, increment); } else { - FusedResidualDropoutBiasOnlyInferVec< - T, 1><<>>( + FusedResidualDropoutBiasOnlyInfer<<< + config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( rows, cols, dropout_prob, is_upscale_in_train, src, residual, bias, dst); } } else { if (!is_test) { - FusedResidualDropoutBiasVec< - T, uint8_t, - VecSize><<>>( + FusedResidualDropoutBias<<< + config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, bias, mask_data, dst, increment); } else { - FusedResidualDropoutBiasOnlyInferVec< - T, VecSize><<>>( + FusedResidualDropoutBiasOnlyInfer<<< + config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( rows, cols, dropout_prob, is_upscale_in_train, src, residual, bias, dst); } } } -/********Backward**************/ /* * @brief calculate the grad of no bias */ template -__global__ void FusedResidualDropoutGradVec(const T *dout, const MaskType *mask, - const T factor, const int64_t size, - T *dx) { +__global__ void FusedResidualDropoutGrad(const T *dout, const MaskType *mask, + const T factor, const int64_t size, + T *dx) { int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; - using LoadT = AlignedVector; - using MaskLoadT = AlignedVector; + using LoadT = platform::CudaAlignedVector; + using MaskLoadT = platform::CudaAlignedVector; for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) { T dout_vec[VecSize]; MaskType mask_vec[VecSize]; @@ -286,15 +281,17 @@ __global__ void FusedResidualDropoutGradVec(const T *dout, const MaskType *mask, * 2. 
save 128*8 temporary sum in 8*128 shared memory * 3. reduce the sum of 128 rows data by 8*VecSize warps */ -template -__global__ void FusedResidualDropoutBiasGradVec( - const T *dout, const MaskType *mask, const T factor, const int64_t rows, - const int64_t cols, T *dx, T *dbias) { +__global__ void FusedResidualDropoutBiasGrad(const T *dout, + const MaskType *mask, + const T factor, const int64_t rows, + const int64_t cols, T *dx, + T *dbias) { int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x; - using LoadT = AlignedVector; - using MaskLoadT = AlignedVector; + using LoadT = platform::CudaAlignedVector; + using MaskLoadT = platform::CudaAlignedVector; T tmp_sum[VecSize] = {static_cast(0)}; // calculate the dx and temporary sum @@ -321,7 +318,7 @@ __global__ void FusedResidualDropoutBiasGradVec( } // save temporary sum to cache and do transpose - __shared__ T cache[BLOCK_SIZE_X * VecSize][BLOCK_SIZE_Y]; + __shared__ T cache[BlockSizeX * VecSize][BlockSizeY]; for (int i = 0; i < VecSize; i++) { cache[threadIdx.x * VecSize + i][threadIdx.y] = tmp_sum[i]; } @@ -333,11 +330,11 @@ __global__ void FusedResidualDropoutBiasGradVec( int x = tid >> 5; // warp id int y = tid & 31; // thread id on warp 0~31 - // need BLOCK_SIZE_X * VecSize warps - if (x < BLOCK_SIZE_X * VecSize) { + // need BlockSizeX * VecSize warps + if (x < BlockSizeX * VecSize) { // reduce 128 to 32 #pragma unroll - for (int i = 0; i < (BLOCK_SIZE_Y >> 5); i++) { + for (int i = 0; i < (BlockSizeY >> 5); i++) { sum += cache[x][y + i * 32]; } } @@ -347,7 +344,7 @@ __global__ void FusedResidualDropoutBiasGradVec( // save sum to dbias int bias_id = blockIdx.x * blockDim.x * VecSize + x; - if (y == 0 && x < VecSize * BLOCK_SIZE_X && bias_id < cols) { + if (y == 0 && x < VecSize * BlockSizeX && bias_id < cols) { dbias[bias_id] = sum; } } @@ -370,7 +367,7 @@ void LaunchResidualDropoutBiasGrad(const T *dout, const MaskType *mask, factor = static_cast(1.0f); } - const int VecSize = 4; + const int VecSize = 
MAX_CACHE_BYTES / sizeof(T); if (dbias != nullptr) { int real_vec_size = VecSize; if (cols % VecSize != 0) { @@ -384,26 +381,27 @@ void LaunchResidualDropoutBiasGrad(const T *dout, const MaskType *mask, dim3 grid_dim(blocks, 1, 1); if (cols % VecSize == 0) { - FusedResidualDropoutBiasGradVec< + FusedResidualDropoutBiasGrad< T, MaskType, 8, 128, VecSize><<>>( dout, mask, factor, rows, cols, dx, dbias); } else { - FusedResidualDropoutBiasGradVec< - T, MaskType, 8, 128, 1><<>>( + FusedResidualDropoutBiasGrad<<>>( dout, mask, factor, rows, cols, dx, dbias); } } else { const uint64_t n = rows * cols; - auto threads = Get1DThreadsAndBlocks(ctx, n); if (n % VecSize == 0) { - FusedResidualDropoutGradVec< - T, MaskType, - VecSize><<>>( + platform::GpuLaunchConfig config = + platform::GetGpuLaunchConfig1D(ctx, n / VecSize); + FusedResidualDropoutGrad<<< + config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( dout, mask, factor, n, dx); } else { - FusedResidualDropoutGradVec< - T, MaskType, 1><<>>( + platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(ctx, n); + FusedResidualDropoutGrad<<< + config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( dout, mask, factor, n, dx); } } diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu index 14267974ff2657..b246d9bac9761b 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu @@ -39,7 +39,7 @@ struct TestFusedResidualDropoutBias { float dropout_prob; bool is_upscale_in_train; bool is_test; // default false, Set to true for inference only - bool hasbias = true; + bool has_bias = true; framework::Tensor src, residual, bias, out, mask; framework::Tensor dsrc, dbias; @@ -57,10 +57,10 @@ struct TestFusedResidualDropoutBias { dropout_prob = 0.0; is_upscale_in_train = false; is_test = false; - hasbias = true; + 
has_bias = true; platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto devicectx = pool.Get(place); - ctx = reinterpret_cast(devicectx); + auto device_ctx = pool.Get(place); + ctx = reinterpret_cast(device_ctx); } TestFusedResidualDropoutBias(int rows_, int cols_, uint64_t seed_ = 0, @@ -73,10 +73,10 @@ struct TestFusedResidualDropoutBias { dropout_prob = dropout_prob_; is_upscale_in_train = is_upscale_in_train_; is_test = is_test_; - hasbias = true; + has_bias = true; platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto devicectx = pool.Get(place); - ctx = reinterpret_cast(devicectx); + auto device_ctx = pool.Get(place); + ctx = reinterpret_cast(device_ctx); } ~TestFusedResidualDropoutBias() {} @@ -108,7 +108,7 @@ struct TestFusedResidualDropoutBias { src.Resize({rows, cols}); framework::TensorFromVector(residual_vec, *ctx, &residual); residual.Resize({rows, cols}); - if (hasbias) { + if (has_bias) { framework::TensorFromVector(bias_vec, *ctx, &bias); bias.Resize({cols}); } @@ -121,7 +121,7 @@ struct TestFusedResidualDropoutBias { dsrc.Resize({rows, cols}); dsrc.mutable_data(place); - if (hasbias) { + if (has_bias) { dbias.Resize({cols}); dbias.mutable_data(place); } @@ -130,7 +130,7 @@ struct TestFusedResidualDropoutBias { void BaseForward() { std::vector out1(rows * cols), out2(rows * cols); - if (hasbias) { + if (has_bias) { // add bias for (int i = 0; i < rows; i++) { for (int j = 0; j < cols; j++) { @@ -167,15 +167,16 @@ struct TestFusedResidualDropoutBias { } void FusedForward() { - auto threads = paddle::operators::Get1DBlocksAnd2DGrids( - *ctx, (uint64_t)rows, (uint64_t)cols); - const int VecSize = 4; - const int increment = - ((cols - 1) / (threads.first.x * threads.second.x * VecSize) + 1) * - VecSize; + const int VecSize = MAX_CACHE_BYTES / sizeof(T); + auto config = paddle::operators::Get1DBlocksAnd2DGrids( + *ctx, (uint64_t)rows, (uint64_t)cols, VecSize); + const int increment = ((cols - 1) 
/ (config.thread_per_block.x * + config.block_per_grid.x * VecSize) + + 1) * + VecSize; T *bias_ptr = nullptr; - if (hasbias) { + if (has_bias) { bias_ptr = bias.data(); } paddle::operators::LaunchResidualDropoutBias( @@ -191,7 +192,7 @@ struct TestFusedResidualDropoutBias { } T *bias_ptr = nullptr; - if (hasbias) { + if (has_bias) { bias_ptr = dbias.data(); } paddle::operators::LaunchResidualDropoutBiasGrad( @@ -237,7 +238,7 @@ struct TestFusedResidualDropoutBias { EXPECT_LT(std::abs(_dsrc[i] - correct_dsrc[i]), diff); } - if (hasbias) { + if (has_bias) { std::vector _dbias(cols); framework::TensorToVector(dbias, *ctx, &_dbias); ctx->Wait(); @@ -248,66 +249,39 @@ struct TestFusedResidualDropoutBias { } }; -TEST(FusedDropout, GPUFusedResidualDropoutBias) { +// test the shape and bias +template +static void BaseTest(const bool is_fp16 = false) { const int rows = 16; - const int cols = 16; - TestFusedResidualDropoutBias test(rows, cols); - test.Run(); - test.CheckOut(static_cast(1e-5)); - test.CheckGrad(static_cast(1e-5)); + std::vector cols_list = {16, 17}; + bool has_bias[2] = {true, false}; + T default_diff = static_cast(1e-5); + if (is_fp16) { + default_diff = static_cast(1e-2); + } + for (int i = 0; i < cols_list.size(); i++) { + for (int j = 0; j < 2; j++) { + TestFusedResidualDropoutBias test(rows, cols_list[i]); + test.has_bias = has_bias[j]; + test.Run(); + test.CheckOut(default_diff); + if (!is_fp16) { + test.CheckGrad(default_diff); + } + } + } } -TEST(FusedDropout, GPUFusedResidualDropoutBiasDouble) { - const int rows = 16; - const int cols = 16; - TestFusedResidualDropoutBias test(rows, cols); - test.Run(); - test.CheckOut(static_cast(1e-5)); - test.CheckGrad(static_cast(1e-5)); -} +TEST(FusedDropout, GPUFusedResidualDropoutBias) { BaseTest(); } + +TEST(FusedDropout, GPUFusedResidualDropoutBiasDouble) { BaseTest(); } // test fp16, For inference, check_grad is not required. 
ref: testdropout_op.py TEST(FusedDropout, GPUFusedResidualDropoutBiasFp16) { - const int rows = 16; - const int cols = 16; - TestFusedResidualDropoutBias test(rows, cols); - test.Run(); - test.CheckOut(static_cast(1e-2)); -} - -// test no bias and cols % 4 == 0 -TEST(FusedDropout, GPUFusedResidualDropoutBiasNoBias) { - const int rows = 16; - const int cols = 16; - TestFusedResidualDropoutBias test(rows, cols); - test.hasbias = false; - test.Run(); - test.CheckOut(static_cast(1e-5)); - test.CheckGrad(static_cast(1e-5)); -} - -// test no bias and cols % 4 != 0 -TEST(FusedDropout, GPUFusedResidualDropoutBiasNoBias2) { - const int rows = 16; - const int cols = 17; - TestFusedResidualDropoutBias test(rows, cols); - test.hasbias = false; - test.Run(); - test.CheckOut(static_cast(1e-5)); - test.CheckGrad(static_cast(1e-5)); + BaseTest(true); } -// test add bias and cols % 4 != 0 TEST(FusedDropout, GPUFusedResidualDropoutBias2) { - const int rows = 16; - const int cols = 17; - TestFusedResidualDropoutBias test(rows, cols); - test.Run(); - test.CheckOut(static_cast(1e-5)); - test.CheckGrad(static_cast(1e-5)); -} - -TEST(FusedDropout, GPUFusedResidualDropoutBias3) { const int rows = 16; const int cols = 16; TestFusedResidualDropoutBias test(rows, cols, 0, 1.0, false, false); @@ -316,16 +290,7 @@ TEST(FusedDropout, GPUFusedResidualDropoutBias3) { test.CheckGrad(static_cast(1e-5)); } -TEST(FusedDropout, GPUFusedResidualDropoutBias4) { - const int rows = 16; - const int cols = 16; - TestFusedResidualDropoutBias test(rows, cols, 0, 1.0, false, false); - test.Run(); - test.CheckOut(static_cast(1e-5)); - test.CheckGrad(static_cast(1e-5)); -} - -TEST(FusedDropout, GPUFusedResidualDropoutBias5) { +TEST(FusedDropout, GPUFusedResidualDropoutBias3) { const int rows = 16; const int cols = 16; TestFusedResidualDropoutBias test(rows, cols, 0, 1.0, true, false); @@ -334,7 +299,7 @@ TEST(FusedDropout, GPUFusedResidualDropoutBias5) { test.CheckGrad(static_cast(1e-5)); } -TEST(FusedDropout, 
GPUFusedResidualDropoutBias6) { +TEST(FusedDropout, GPUFusedResidualDropoutBias4) { const int rows = 16; const int cols = 16; TestFusedResidualDropoutBias test(rows, cols, 0, 0.35, true, true); @@ -343,7 +308,7 @@ TEST(FusedDropout, GPUFusedResidualDropoutBias6) { test.CheckGrad(static_cast(1e-5)); } -TEST(FusedDropout, GPUFusedResidualDropoutBias7) { +TEST(FusedDropout, GPUFusedResidualDropoutBias5) { const int rows = 16; const int cols = 16; TestFusedResidualDropoutBias test(rows, cols, 125, 0.0, false, false); From 934fcac6826d7559c71109df54ecdc7d3b89e81d Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Thu, 2 Sep 2021 10:26:33 +0000 Subject: [PATCH 14/19] use static_cast --- paddle/fluid/operators/fused/fused_dropout_common.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index 53ce76826a6793..f159d16d855a7e 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -41,9 +41,11 @@ inline platform::GpuLaunchConfig Get1DBlocksAnd2DGrids( const uint32_t cols, const int VecSize) { const uint32_t tmp_cols = cols / VecSize; int threads = std::max( - (uint32_t)32, std::min(tmp_cols, (uint32_t)ctx.GetMaxThreadsPerBlock())); - int blocks_x = std::max((uint32_t)1, (tmp_cols + threads - 1) / threads); - int blocks_y = std::max((uint32_t)1, rows); + static_cast(32), + std::min(tmp_cols, static_cast(ctx.GetMaxThreadsPerBlock()))); + const auto blocks_x = + std::max(static_cast(1), (tmp_cols + threads - 1) / threads); + const auto blocks_y = std::max(static_cast(1), rows); platform::GpuLaunchConfig config; config.block_per_grid.x = blocks_x; config.block_per_grid.y = blocks_y; From 44610ea2e651be31fbde3bddecd95ced540c6665 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 8 Sep 2021 02:00:15 +0000 Subject: [PATCH 15/19] fix the blocks for large shape --- 
.../operators/fused/fused_residual_dropout_bias.h | 5 ++--- .../fused/fused_residual_dropout_bias_test.cu | 10 ++++++++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index bafc8c60040c1e..952042d45f47c4 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -374,9 +374,8 @@ void LaunchResidualDropoutBiasGrad(const T *dout, const MaskType *mask, real_vec_size = 1; } auto threads = std::min(cols / real_vec_size, static_cast(8)); - auto blocks = std::max( - (uint32_t)1, std::min((cols / real_vec_size + threads - 1) / threads, - (uint32_t)ctx.GetSMCount())); + auto blocks = + std::max((uint32_t)1, cols / real_vec_size + threads - 1 / threads); dim3 block_dim(threads, 128, 1); dim3 grid_dim(blocks, 1, 1); diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu index b246d9bac9761b..e687823bc8158b 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu @@ -316,3 +316,13 @@ TEST(FusedDropout, GPUFusedResidualDropoutBias5) { test.CheckOut(static_cast(1e-5)); test.CheckGrad(static_cast(1e-5)); } + +// test large shape +TEST(FusedDropout, GPUFusedResidualDropoutBias6) { + const int rows = 256; + const int cols = 4096; + TestFusedResidualDropoutBias test(rows, cols); + test.Run(); + test.CheckOut(static_cast(1e-5)); + test.CheckGrad(static_cast(1e-3)); +} From 6c743f1004b46fb7b56334cd6e3d2af13fa9e477 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 8 Sep 2021 02:20:39 +0000 Subject: [PATCH 16/19] fix the merge error --- python/paddle/linalg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/linalg.py b/python/paddle/linalg.py index 
26bf4d6d3035b3..d5206e0417933b 100644 --- a/python/paddle/linalg.py +++ b/python/paddle/linalg.py @@ -23,7 +23,7 @@ 'cholesky', #noqa 'norm', 'inv', - 'multi_dot' + 'multi_dot', 'svd', 'matrix_power' ] From 1a83adb08e7cc1914b07b4e8ea3be6d521173a10 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 8 Sep 2021 03:43:53 +0000 Subject: [PATCH 17/19] merge upstream, and used new AlignedVector --- .../operators/fused/fused_dropout_common.h | 5 +- .../fused/fused_residual_dropout_bias.h | 252 ++++++------------ 2 files changed, 85 insertions(+), 172 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index f159d16d855a7e..24f6f53c63630e 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -20,16 +20,17 @@ limitations under the License. */ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/fast_divmod.h" #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/gpu_launch_config.h" namespace paddle { namespace operators { -#define MAX_CACHE_BYTES 16 +#define CACHE_LINE 128 +#define MAX_CACHE_BYTES (CACHE_LINE / CHAR_BIT) /** * get the threads for fused_residual_dropout_bias: diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index 952042d45f47c4..cd9dfd1c79ca8f 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -29,43 +29,45 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread( const int row_id, const int col_id, const int cols, curandStatePhilox4_32_10_t *state, const float dropout_prob, 
const T factor, const T *src, const T *residual, const T *bias, T *dst, MaskType *mask, - typename details::MPTypeTrait::Type *mean_val, + const bool is_test, typename details::MPTypeTrait::Type *mean_val, typename details::MPTypeTrait::Type *var_val) { - using LoadT = platform::CudaAlignedVector; - using MaskLoadT = platform::CudaAlignedVector; + using LoadT = platform::AlignedVector; + using StoreT = platform::AlignedVector; + using MaskStoreT = platform::AlignedVector; using U = typename details::MPTypeTrait::Type; - T src_vec[VecSize]; - T residual_vec[VecSize]; - T bias_vec[VecSize]; + LoadT src_vec; + LoadT residual_vec; + LoadT bias_vec; #pragma unroll for (int ii = 0; ii < VecSize; ii++) { bias_vec[ii] = static_cast(0); } // vectorize load data from global - LoadT *value = reinterpret_cast(&src_vec); - LoadT *residual_value = reinterpret_cast(&residual_vec); - *value = *reinterpret_cast(&src[row_id * cols + col_id]); - *residual_value = - *reinterpret_cast(&residual[row_id * cols + col_id]); - - LoadT *bias_value = - bias != nullptr ? 
reinterpret_cast(&bias_vec) : nullptr; + platform::Load(&src[row_id * cols + col_id], &src_vec); + platform::Load(&residual[row_id * cols + col_id], &residual_vec); + if (bias) { - *bias_value = *reinterpret_cast(&bias[col_id]); + platform::Load(&bias[col_id], &bias_vec); } - float rand[VecSize]; - RandVec(state, rand, VecSize); - - T dest_vec[VecSize]; - MaskType mask_vec[VecSize]; - + MaskStoreT mask_vec; + if (!is_test) { + float rand[VecSize]; + RandVec(state, rand, VecSize); #pragma unroll - for (int ii = 0; ii < VecSize; ii++) { - mask_vec[ii] = static_cast(rand[ii] >= dropout_prob); + for (int ii = 0; ii < VecSize; ii++) { + mask_vec[ii] = static_cast(rand[ii] >= dropout_prob); + } + } else { +#pragma unroll + for (int ii = 0; ii < VecSize; ii++) { + mask_vec[ii] = static_cast(1); + } } + StoreT dest_vec; + #pragma unroll for (int ii = 0; ii < VecSize; ii++) { dest_vec[ii] = @@ -79,25 +81,25 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread( } // store result to global - *(reinterpret_cast(&dst[row_id * cols + col_id])) = - *reinterpret_cast(&dest_vec[0]); - *(reinterpret_cast(&mask[row_id * cols + col_id])) = - *reinterpret_cast(&mask_vec[0]); + platform::Store(dest_vec, &dst[row_id * cols + col_id]); + if (!is_test) { + platform::Store(mask_vec, &mask[row_id * cols + col_id]); + } } /** * @brief dst = residual + dropout(src + bias); * the src, residual, mask and dst shape is (rows, cols) * the bias shape is (1, cols) + * is_test: only used in inference + * mask: can be null if is_test=true */ template -__global__ void FusedResidualDropoutBias(const size_t rows, const size_t cols, - uint64_t seed, - const float dropout_prob, - const bool is_upscale_in_train, - const T *src, const T *residual, - const T *bias, MaskType *mask, T *dst, - uint64_t increment) { +__global__ void FusedResidualDropoutBias( + const size_t rows, const size_t cols, uint64_t seed, + const float dropout_prob, const bool is_upscale_in_train, const T *src, + const T 
*residual, const T *bias, MaskType *mask, T *dst, + uint64_t increment, const bool is_test) { int col_id = blockDim.x * blockIdx.x + threadIdx.x; int row_id = blockIdx.y; int idx = row_id * cols + col_id; @@ -108,89 +110,18 @@ __global__ void FusedResidualDropoutBias(const size_t rows, const size_t cols, if (!is_upscale_in_train) { factor = static_cast(1.0f); } - for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { - for (int i = col_id * VecSize; i < cols; - i += blockDim.x * gridDim.x * VecSize) { - FusedResidualDropoutBiasOneThread( - r, i, cols, &state, dropout_prob, factor, src, residual, bias, dst, - mask, nullptr, nullptr); - } - } -} - -/** - * @brief the fused function called by every thread - */ -template -__forceinline__ __device__ void FusedResidualDropoutBiasOnlyInferOneThread( - const int row_id, const int col_id, const int cols, - const float dropout_prob, const T factor, const T *src, const T *residual, - const T *bias, T *dst, typename details::MPTypeTrait::Type *mean_val, - typename details::MPTypeTrait::Type *var_val) { - using LoadT = platform::CudaAlignedVector; - using U = typename details::MPTypeTrait::Type; - T src_vec[VecSize]; - T residual_vec[VecSize]; - T bias_vec[VecSize]; -#pragma unroll - for (int ii = 0; ii < VecSize; ii++) { - bias_vec[ii] = static_cast(0); - } - // vectorize load data from global - LoadT *value = reinterpret_cast(&src_vec); - LoadT *residual_value = reinterpret_cast(&residual_vec); - *value = *reinterpret_cast(&src[row_id * cols + col_id]); - *residual_value = - *reinterpret_cast(&residual[row_id * cols + col_id]); - - LoadT *bias_value = - bias != nullptr ? 
reinterpret_cast(&bias_vec) : nullptr; - if (bias) { - *bias_value = *reinterpret_cast(&bias[col_id]); - } - - T dest_vec[VecSize]; - -#pragma unroll - for (int ii = 0; ii < VecSize; ii++) { - dest_vec[ii] = (src_vec[ii] + bias_vec[ii]) * factor + residual_vec[ii]; - if (ComputeLayerNorm) { - U tmp = static_cast(dest_vec[ii]); - *mean_val += tmp; - *var_val += (tmp * tmp); + if (is_test) { + factor = static_cast(1.0f - dropout_prob); + if (is_upscale_in_train) { + factor = static_cast(1.0f); } } - - // store result to global - *(reinterpret_cast(&dst[row_id * cols + col_id])) = - *reinterpret_cast(&dest_vec[0]); -} - -/** - * @brief for dropout's param is_test = true, only used in inference - * the src, residual and dst shape is (rows, cols) - * the bias shape is (1, cols) - */ -template -__global__ void FusedResidualDropoutBiasOnlyInfer( - const size_t rows, const size_t cols, const float dropout_prob, - const bool is_upscale_in_train, const T *src, const T *residual, - const T *bias, T *dst) { - int col_id = blockDim.x * blockIdx.x + threadIdx.x; - int row_id = blockIdx.y; - int idx = row_id * cols + col_id; - - T factor = static_cast(1.0f - dropout_prob); - if (is_upscale_in_train) { - factor = static_cast(1.0f); - } - for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { for (int i = col_id * VecSize; i < cols; i += blockDim.x * gridDim.x * VecSize) { - FusedResidualDropoutBiasOnlyInferOneThread( - r, i, cols, dropout_prob, factor, src, residual, bias, dst, nullptr, - nullptr); + FusedResidualDropoutBiasOneThread( + r, i, cols, &state, dropout_prob, factor, src, residual, bias, dst, + mask, is_test, nullptr, nullptr); } } } @@ -212,37 +143,27 @@ void LaunchResidualDropoutBias(const uint32_t rows, const uint32_t cols, auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); memory::Copy(cuda_place, dst, cuda_place, residual, rows * cols * sizeof(T), ctx.stream()); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync( - mask_data, 0, rows * cols * 
sizeof(MaskType), ctx.stream())); + if (!is_test) { + PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync( + mask_data, 0, rows * cols * sizeof(MaskType), ctx.stream())); + } return; } const int VecSize = MAX_CACHE_BYTES / sizeof(T); - auto config = Get1DBlocksAnd2DGrids(ctx, rows, cols, VecSize); - if (cols % VecSize != 0) { - if (!is_test) { - FusedResidualDropoutBias<<< - config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( - rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, - bias, mask_data, dst, increment); - } else { - FusedResidualDropoutBiasOnlyInfer<<< - config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( - rows, cols, dropout_prob, is_upscale_in_train, src, residual, bias, - dst); - } + const int real_vec_size = cols % VecSize == 0 ? VecSize : 1; + auto config = Get1DBlocksAnd2DGrids(ctx, rows, cols, real_vec_size); + if (cols % VecSize == 0) { + FusedResidualDropoutBias<<< + config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( + rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, + bias, mask_data, dst, increment, is_test); } else { - if (!is_test) { - FusedResidualDropoutBias<<< - config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( - rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, - bias, mask_data, dst, increment); - } else { - FusedResidualDropoutBiasOnlyInfer<<< - config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( - rows, cols, dropout_prob, is_upscale_in_train, src, residual, bias, - dst); - } + FusedResidualDropoutBias< + T, uint8_t, + 1><<>>( + rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, + bias, mask_data, dst, increment, is_test); } } @@ -255,23 +176,21 @@ __global__ void FusedResidualDropoutGrad(const T *dout, const MaskType *mask, T *dx) { int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; - using LoadT = platform::CudaAlignedVector; - using MaskLoadT = platform::CudaAlignedVector; + using LoadT 
= platform::AlignedVector; + using StoreT = platform::AlignedVector; + using MaskLoadT = platform::AlignedVector; for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) { - T dout_vec[VecSize]; - MaskType mask_vec[VecSize]; - LoadT *dout_value = reinterpret_cast(&dout_vec); - MaskLoadT *mask_value = reinterpret_cast(&mask_vec); - *dout_value = *reinterpret_cast(&dout[i]); - *mask_value = *reinterpret_cast(&mask[i]); - - T dx_vec[VecSize]; + LoadT dout_vec; + MaskLoadT mask_vec; + platform::Load(&dout[i], &dout_vec); + platform::Load(&mask[i], &mask_vec); + + StoreT dx_vec; #pragma unroll for (int ii = 0; ii < VecSize; ii++) { dx_vec[ii] = dout_vec[ii] * static_cast(mask_vec[ii]) * factor; } - *(reinterpret_cast(&dx[i])) = - *reinterpret_cast(&dx_vec[0]); + platform::Store(dx_vec, &dx[i]); } } @@ -290,21 +209,20 @@ __global__ void FusedResidualDropoutBiasGrad(const T *dout, T *dbias) { int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x; - using LoadT = platform::CudaAlignedVector; - using MaskLoadT = platform::CudaAlignedVector; + using LoadT = platform::AlignedVector; + using StoreT = platform::AlignedVector; + using MaskLoadT = platform::AlignedVector; T tmp_sum[VecSize] = {static_cast(0)}; // calculate the dx and temporary sum if (col_id * VecSize < cols) { for (int row_id = threadIdx.y; row_id < rows; row_id += blockDim.y) { int index = row_id * cols + col_id * VecSize; - T out_vec[VecSize]; - MaskType mask_vec[VecSize]; - T dx_vec[VecSize]; - LoadT *out_value = reinterpret_cast(&out_vec); - MaskLoadT *mask_value = reinterpret_cast(&mask_vec); - *out_value = *reinterpret_cast(&dout[index]); - *mask_value = *reinterpret_cast(&mask[index]); + LoadT out_vec; + MaskLoadT mask_vec; + StoreT dx_vec; + platform::Load(&dout[index], &out_vec); + platform::Load(&mask[index], &mask_vec); #pragma unroll for (int i = 0; i < VecSize; i++) { @@ -312,8 +230,7 @@ __global__ void FusedResidualDropoutBiasGrad(const T *dout, tmp_sum[i] += out_vec[i]; } - 
*(reinterpret_cast(&dx[index])) = - *reinterpret_cast(&dx_vec[0]); + platform::Store(dx_vec, &dx[index]); } } @@ -368,17 +285,13 @@ void LaunchResidualDropoutBiasGrad(const T *dout, const MaskType *mask, } const int VecSize = MAX_CACHE_BYTES / sizeof(T); + int real_vec_size = cols % VecSize == 0 ? VecSize : 1; if (dbias != nullptr) { - int real_vec_size = VecSize; - if (cols % VecSize != 0) { - real_vec_size = 1; - } auto threads = std::min(cols / real_vec_size, static_cast(8)); auto blocks = - std::max((uint32_t)1, cols / real_vec_size + threads - 1 / threads); + std::max((uint32_t)1, (cols / real_vec_size + threads - 1) / threads); dim3 block_dim(threads, 128, 1); dim3 grid_dim(blocks, 1, 1); - if (cols % VecSize == 0) { FusedResidualDropoutBiasGrad< T, MaskType, 8, 128, @@ -391,14 +304,13 @@ void LaunchResidualDropoutBiasGrad(const T *dout, const MaskType *mask, } } else { const uint64_t n = rows * cols; + platform::GpuLaunchConfig config = + platform::GetGpuLaunchConfig1D(ctx, n / real_vec_size); if (n % VecSize == 0) { - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx, n / VecSize); FusedResidualDropoutGrad<<< config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( dout, mask, factor, n, dx); } else { - platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(ctx, n); FusedResidualDropoutGrad<<< config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( dout, mask, factor, n, dx); From eda910f1551c466099da3f731f183964d8fcc8a8 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Thu, 9 Sep 2021 10:30:10 +0000 Subject: [PATCH 18/19] replace fluid with paddle --- .../tests/unittests/test_multi_dot_op.py | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py index e128939d9d5ecc..97047b1ae0e5e0 100644 --- a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py @@ -18,8 +18,6 @@ from numpy.linalg import multi_dot from op_test import OpTest import paddle -from paddle.fluid import Program, program_guard -import paddle.fluid as fluid paddle.enable_static() @@ -196,44 +194,46 @@ def get_inputs_and_outputs(self): #####python API test####### class TestMultiDotOpError(unittest.TestCase): def test_errors(self): - with program_guard(Program(), Program()): + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): # The inputs type of multi_dot must be list matrix. input1 = 12 self.assertRaises(TypeError, paddle.multi_dot, [input1, input1]) # The inputs dtype of multi_dot must be float64, float64 or float16. - input2 = fluid.layers.data( + input2 = paddle.static.data( name='input2', shape=[10, 10], dtype="int32") self.assertRaises(TypeError, paddle.multi_dot, [input2, input2]) # the number of tensor must be larger than 1 - x0 = fluid.data(name='x0', shape=[3, 2], dtype="float64") + x0 = paddle.static.data(name='x0', shape=[3, 2], dtype="float64") self.assertRaises(ValueError, paddle.multi_dot, [x0]) #the first tensor must be 1D or 2D - x1 = fluid.data(name='x1', shape=[3, 2, 3], dtype="float64") - x2 = fluid.data(name='x2', shape=[3, 2], dtype="float64") + x1 = paddle.static.data(name='x1', shape=[3, 2, 3], dtype="float64") + x2 = paddle.static.data(name='x2', shape=[3, 2], dtype="float64") self.assertRaises(ValueError, paddle.multi_dot, [x1, x2]) #the last tensor must be 1D or 2D - x3 = fluid.data(name='x3', shape=[3, 2], dtype="float64") - x4 = fluid.data(name='x4', shape=[3, 2, 2], dtype="float64") + x3 = paddle.static.data(name='x3', shape=[3, 2], dtype="float64") + x4 = paddle.static.data(name='x4', shape=[3, 2, 2], dtype="float64") self.assertRaises(ValueError, paddle.multi_dot, [x3, x4]) #the tensor must be 2D, except first and last tensor - x5 = fluid.data(name='x5', shape=[3, 2], dtype="float64") - x6 = fluid.data(name='x6', shape=[2], 
dtype="float64") - x7 = fluid.data(name='x7', shape=[2, 2], dtype="float64") + x5 = paddle.static.data(name='x5', shape=[3, 2], dtype="float64") + x6 = paddle.static.data(name='x6', shape=[2], dtype="float64") + x7 = paddle.static.data(name='x7', shape=[2, 2], dtype="float64") self.assertRaises(ValueError, paddle.multi_dot, [x5, x6, x7]) class APITestMultiDot(unittest.TestCase): def test_out(self): - with fluid.program_guard(fluid.Program()): - x0 = fluid.data(name='x0', shape=[3, 2], dtype="float64") - x1 = fluid.data(name='x1', shape=[2, 3], dtype='float64') + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x0 = paddle.static.data(name='x0', shape=[3, 2], dtype="float64") + x1 = paddle.static.data(name='x1', shape=[2, 3], dtype='float64') result = paddle.multi_dot([x0, x1]) - exe = fluid.Executor(fluid.CPUPlace()) + exe = paddle.static.Executor(paddle.CPUPlace()) data1 = np.random.rand(3, 2).astype("float64") data2 = np.random.rand(2, 3).astype("float64") np_res = exe.run(feed={'x0': data1, @@ -248,14 +248,14 @@ def test_out(self): {}\n{}, check diff!".format(np_res, expected_result)) def test_dygraph_without_out(self): - device = fluid.CPUPlace() - with fluid.dygraph.guard(device): - input_array1 = np.random.rand(3, 4).astype("float64") - input_array2 = np.random.rand(4, 3).astype("float64") - data1 = fluid.dygraph.to_variable(input_array1) - data2 = fluid.dygraph.to_variable(input_array2) - out = paddle.multi_dot([data1, data2]) - expected_result = np.linalg.multi_dot([input_array1, input_array2]) + paddle.disable_static() + device = paddle.CPUPlace() + input_array1 = np.random.rand(3, 4).astype("float64") + input_array2 = np.random.rand(4, 3).astype("float64") + data1 = paddle.to_tensor(input_array1) + data2 = paddle.to_tensor(input_array2) + out = paddle.multi_dot([data1, data2]) + expected_result = np.linalg.multi_dot([input_array1, input_array2]) self.assertTrue(np.allclose(expected_result, out.numpy())) From 
b647c1f357ade554cc801d08c4b3c32415aeca5c Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 13 Sep 2021 11:09:18 +0000 Subject: [PATCH 19/19] modify code according to the review --- paddle/fluid/operators/multi_dot_op.cc | 81 ++++---------------------- 1 file changed, 10 insertions(+), 71 deletions(-) diff --git a/paddle/fluid/operators/multi_dot_op.cc b/paddle/fluid/operators/multi_dot_op.cc index 3929aebc4382e7..2d06170d34a91e 100644 --- a/paddle/fluid/operators/multi_dot_op.cc +++ b/paddle/fluid/operators/multi_dot_op.cc @@ -22,10 +22,6 @@ limitations under the License. */ #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/operators/utils.h" -#ifdef PADDLE_WITH_MKLDNN -#include "paddle/fluid/platform/mkldnn_helper.h" -#endif - namespace paddle { namespace operators { using Tensor = framework::Tensor; @@ -41,11 +37,11 @@ inline framework::DDim ComputeAndCheckShape( bool is_vector = false; framework::DDim out_dim; - if (first_dim.size() > 2) { - PADDLE_THROW(platform::errors::InvalidArgument( - "multi_dot: the first input tensor must be 1D or 2D but got[%d]!", - static_cast(first_dim.size()))); - } + PADDLE_ENFORCE_LT( + first_dim.size(), static_cast(3), + platform::errors::InvalidArgument( + "multi_dot: the first input tensor must be 1D or 2D but got[%d]!", + static_cast(first_dim.size()))); // If the first tensor is 1D of size n view it as a row vector (1, n) if (first_dim.size() == 1) { @@ -54,11 +50,11 @@ inline framework::DDim ComputeAndCheckShape( } auto last_dim = inputs_dims[n - 1]; - if (last_dim.size() > 2) { - PADDLE_THROW(platform::errors::InvalidArgument( - "the last input tensor of multi_dot op must be 1D or 2D but got[%d]!", - static_cast(last_dim.size()))); - } + PADDLE_ENFORCE_LT( + last_dim.size(), static_cast(3), + platform::errors::InvalidArgument( + "the last input tensor of multi_dot must be 1D or 2D but got[%d]!", + static_cast(last_dim.size()))); // If the last tensor is 1D of size n view it as a column vector (n, 1) 
if (last_dim.size() == 1) { @@ -226,10 +222,6 @@ class MultiDotOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "The input tensors of multi_dot operator.").AsDuplicable(); AddOutput("Out", "The output tensor of multi_dot operator"); - AddAttr( - "use_mkldnn", - "(bool, default false) Indicates if MKL-DNN kernel will be used") - .SetDefault(false); AddComment(R"DOC( Compute the dot product of two or more arrays in a single function call, while automatically selecting the fastest evaluation order. @@ -259,44 +251,6 @@ class MultiDotOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", out_dims); ctx->ShareLoD("X", "Out"); } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto inputs = ctx.MultiInput("X"); - auto input_data_type = framework::proto::VarType::Type(0); - for (auto* input : inputs) { - if (!input->IsInitialized()) { - PADDLE_THROW(platform::errors::InvalidArgument( - "The inputs of multi_dot OP are Empty!")); - break; - } - } - input_data_type = inputs[0]->type(); - -#ifdef PADDLE_WITH_MKLDNN - using mkldnn::memory; - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif - return framework::OpKernelType(input_data_type, ctx.GetPlace()); - } - - framework::OpKernelType GetKernelTypeForVar( - const std::string& var_name, const framework::Tensor& tensor, - const framework::OpKernelType& expected_kernel_type) const { - if (framework::IsComplexType(expected_kernel_type.data_type_)) { - // only promote inputs’s types when contains complex input - return framework::OpKernelType(tensor.type(), tensor.place(), - tensor.layout()); - } else { - return framework::OpKernelType(expected_kernel_type.data_type_, - tensor.place(), tensor.layout()); - } - } }; /** @@ -379,21 +333,6 @@ class 
MultiDotOpGrad : public framework::OperatorWithKernel { ctx->SetOutputsDim(out_x_g_n, ins_dims); ctx->ShareAllLoD(in_x, out_x_g_n); } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.GetPlace()); - } - - framework::OpKernelType GetKernelTypeForVar( - const std::string& var_name, const Tensor& tensor, - const framework::OpKernelType& expected_kernel_type) const override { - return framework::OpKernelType(expected_kernel_type.data_type_, - tensor.place(), tensor.layout()); - } }; template