From bf318b8b5481cd994e0d1c0dc29482fc8e12da8b Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 23 Aug 2021 08:16:17 +0000 Subject: [PATCH 01/19] add a fusion op: fused_residual_dropout_bias --- paddle/fluid/operators/fused/CMakeLists.txt | 5 + .../fused/fused_residual_dropout_bias.h | 558 ++++++++++++++++++ .../fused/test_fused_residual_dropout_bias.cu | 441 ++++++++++++++ 3 files changed, 1004 insertions(+) create mode 100644 paddle/fluid/operators/fused/fused_residual_dropout_bias.h create mode 100644 paddle/fluid/operators/fused/test_fused_residual_dropout_bias.cu diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 541e5afdf9b71e..525f6504f9fa61 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -71,4 +71,9 @@ if (WITH_GPU OR WITH_ROCM) op_library(fused_bn_add_activation_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_bn_add_activation);\n") endif() + # fused_dropout + # only support CUDA + if(NOT WITH_ROCM) + nv_test(test_fused_residual_dropout_bias SRCS test_fused_residual_dropout_bias.cu DEPS tensor op_registry elementwise_add_op dropout_op device_context generator) + endif() endif() diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h new file mode 100644 index 00000000000000..e0b51be9e909e3 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -0,0 +1,558 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include +#include + +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/float16.h" + +const int VecSize = 4; + +namespace paddle { +namespace operators { + +namespace platform = paddle::platform; + +inline std::pair GetResidualDropoutThreads( + const platform::CUDADeviceContext &ctx, const uint64_t n) { + const uint64_t tmp_n = n / VecSize; + int threads = std::max( + (uint64_t)32, std::min(tmp_n, (uint64_t)ctx.GetMaxThreadsPerBlock())); + int blocks = std::max((uint64_t)1, (tmp_n + threads - 1) / threads); + return std::pair{threads, blocks}; +} + +inline std::pair GetResidualDropoutBiasThreads( + const platform::CUDADeviceContext &ctx, const uint32_t rows, + const uint32_t cols) { + const uint32_t tmp_cols = cols / VecSize; + int threads = std::max( + (uint32_t)32, std::min(tmp_cols, (uint32_t)ctx.GetMaxThreadsPerBlock())); + int blocks_x = std::max((uint32_t)1, (tmp_cols + threads - 1) / threads); + int blocks_y = std::max((uint32_t)1, rows); + dim3 block_dim(threads, 1, 1); + dim3 grid_dim(blocks_x, blocks_y, 1); + return std::pair{block_dim, grid_dim}; +} + +/********Forward**************/ +// aligned vector generates vectorized load/store on CUDA +template +struct alignas(sizeof(T) * Size) AlignedVector { + T val[Size]; +}; + +template +inline int VectorizedSize(const T *pointer) { + uint64_t address = reinterpret_cast(pointer); + constexpr int vec4 = std::alignment_of>::value; // NOLINT + if (address % vec4 == 0) { + return 4; + } + return 1; +} + +/** + * dst = residual + 
dropout(src + bias); + */ +template +__global__ void FusedResidualDropoutBias(const size_t rows, const size_t cols, + uint64_t seed, + const float dropout_prob, + const bool is_upscale_in_train, + const T *src, const T *residual, + const T *bias, MaskType *mask_data, + T *dst, uint64_t increment) { + int col_id = blockDim.x * blockIdx.x + threadIdx.x; + int row_id = blockIdx.y; + int idx = row_id * cols + col_id; + curandStatePhilox4_32_10_t state; + curand_init(seed, idx, increment, &state); + + T factor = static_cast(1.0f / (1.0f - dropout_prob)); + const int tmp_cols = cols / VecSize * VecSize; + for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { + for (int i = col_id * VecSize; i < tmp_cols; + i += blockDim.x * gridDim.x * VecSize) { + float4 rand = curand_uniform4(&state); + float *rand_data = &(rand.x); + MaskType mask[VecSize]; + T bias_vec[VecSize]; +#pragma unroll + for (int j = 0; j < VecSize; j++) { + mask[j] = (MaskType)(rand_data[j] > dropout_prob); + bias_vec[j] = bias != nullptr ? bias[i + j] : static_cast(0); + } +#pragma unroll + for (int j = 0; j < VecSize; j++) { + mask_data[r * cols + i + j] = mask[j]; + } + + if (is_upscale_in_train) { +#pragma unroll + for (int j = 0; j < VecSize; j++) { + dst[r * cols + i + j] = (src[r * cols + i + j] + bias_vec[j]) * + static_cast(mask[j]) * factor + + residual[r * cols + i + j]; + } + } else { +#pragma unroll + for (int j = 0; j < VecSize; j++) { + dst[r * cols + i + j] = + (src[r * cols + i + j] + bias_vec[j]) * static_cast(mask[j]) + + residual[r * cols + i + j]; + } + } + } + + int high_index = tmp_cols + col_id; + if (high_index < cols) { + float4 rand = curand_uniform4(&state); + float *rand_data = &(rand.x); + int k = 0; + if (is_upscale_in_train) { + for (int i = high_index; i < cols; i++) { + MaskType m = (MaskType)(rand_data[k++] > dropout_prob); + mask_data[r * cols + i] = m; + dst[r * cols + i] = + (src[r * cols + i] + + (bias != nullptr ? 
bias[i] : static_cast(0.0))) * + static_cast(m) * factor + + residual[r * cols + i]; + } + } else { + for (int i = high_index; i < cols; i++) { + MaskType m = (MaskType)(rand_data[k++] > dropout_prob); + mask_data[r * cols + i] = m; + dst[r * cols + i] = + (src[r * cols + i] + + (bias != nullptr ? bias[i] : static_cast(0.0))) * + static_cast(m) + + residual[r * cols + i]; + } + } + } + } +} + +template +__global__ void FusedResidualDropoutBiasVec(const size_t rows, + const size_t cols, uint64_t seed, + const float dropout_prob, + const bool is_upscale_in_train, + const T *src, const T *residual, + const T *bias, MaskType *mask_data, + T *dst, uint64_t increment) { + int col_id = blockDim.x * blockIdx.x + threadIdx.x; + int row_id = blockIdx.y; + int idx = row_id * cols + col_id; + curandStatePhilox4_32_10_t state; + curand_init(seed, idx, increment, &state); + + T dest; + MaskType mask; + T factor = static_cast(1.0f / (1.0f - dropout_prob)); + using LoadT = AlignedVector; + using MaskLoadT = AlignedVector; + for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { + for (int i = col_id * VecSize; i < cols; + i += blockDim.x * gridDim.x * VecSize) { + T src_vec[VecSize]; + T residual_vec[VecSize]; + T bias_vec[VecSize]; +#pragma unroll + for (int ii = 0; ii < VecSize; ii++) { + bias_vec[ii] = static_cast(0); + } + LoadT *value = reinterpret_cast(&src_vec); + LoadT *residual_value = reinterpret_cast(&residual_vec); + *value = *reinterpret_cast(&src[r * cols + i]); + *residual_value = + *reinterpret_cast(&residual[r * cols + i]); + + LoadT *bias_value = + bias != nullptr ? 
reinterpret_cast(&bias_vec) : nullptr; + if (bias != nullptr) + *bias_value = *reinterpret_cast(&bias[i]); + + float4 rand = curand_uniform4(&state); + T dest_vec[VecSize]; + MaskType mask_vec[VecSize]; + +#pragma unroll + for (int ii = 0; ii < VecSize; ii++) { + mask_vec[ii] = (MaskType)((&rand.x)[ii] >= dropout_prob); + } + + if (is_upscale_in_train) { +#pragma unroll + for (int ii = 0; ii < VecSize; ii++) { + dest_vec[ii] = (src_vec[ii] + bias_vec[ii]) * + static_cast(mask_vec[ii]) * factor + + residual_vec[ii]; + } + } else { +#pragma unroll + for (int ii = 0; ii < VecSize; ii++) { + dest_vec[ii] = + (src_vec[ii] + bias_vec[ii]) * static_cast(mask_vec[ii]) + + residual_vec[ii]; + } + } + *(reinterpret_cast(&dst[r * cols + i])) = + *reinterpret_cast(&dest_vec[0]); + *(reinterpret_cast(&mask_data[r * cols + i])) = + *reinterpret_cast(&mask_vec[0]); + } + } +} + +template +__global__ void FusedResidualDropoutBiasTest(const size_t rows, + const size_t cols, + const float dropout_prob, + const bool is_upscale_in_train, + const T *src, const T *residual, + const T *bias, T *dst) { + int col_id = blockDim.x * blockIdx.x + threadIdx.x; + int row_id = blockIdx.y; + int idx = row_id * cols + col_id; + + T factor = static_cast(1.0f - dropout_prob); + const int tmp_cols = cols / VecSize * VecSize; + for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { + for (int i = col_id * VecSize; i < tmp_cols; + i += blockDim.x * gridDim.x * VecSize) { + if (is_upscale_in_train) { +#pragma unroll + for (int j = 0; j < VecSize; j++) { + dst[r * cols + i + j] = + (src[r * cols + i + j] + + (bias != nullptr ? bias[i + j] : static_cast(0.0))) + + residual[r * cols + i + j]; + } + } else { +#pragma unroll + for (int j = 0; j < VecSize; j++) { + dst[r * cols + i + j] = + (src[r * cols + i + j] + + (bias != nullptr ? 
bias[i + j] : static_cast(0.0))) * + factor + + residual[r * cols + i + j]; + } + } + } + + int high_index = tmp_cols + col_id; + if (high_index < cols) { + if (is_upscale_in_train) { + for (int i = high_index; i < cols; i++) { + dst[r * cols + i] = + (src[r * cols + i] + + (bias != nullptr ? bias[i] : static_cast(0.0))) + + residual[r * cols + i]; + } + } else { + for (int i = high_index; i < cols; i++) { + dst[r * cols + i] = + (src[r * cols + i] + + (bias != nullptr ? bias[i] : static_cast(0.0))) * + factor + + residual[r * cols + i]; + } + } + } + } +} + +/** + * dst = residual + dropout(src + bias); + */ +template +void LaunchResidualDropoutBias(const uint32_t rows, const uint32_t cols, + const int increment, uint64_t seed, + const float dropout_prob, + bool is_upscale_in_train, const T *src, + const T *residual, const T *bias, + MaskType *mask_data, T *dst, + const platform::CUDADeviceContext &ctx) { + if (std::abs(dropout_prob - 1.0) < 1e-5) { + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaMemcpyAsync(dst, residual, rows * cols * sizeof(T), + cudaMemcpyDeviceToDevice, ctx.stream())); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync( + mask_data, 0, rows * cols * sizeof(MaskType), ctx.stream())); + return; + } + + auto threads = GetResidualDropoutBiasThreads(ctx, rows, cols); + if (cols % VecSize != 0) + FusedResidualDropoutBias< + T, uint8_t, + VecSize><<>>( + rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, + bias, mask_data, dst, increment); + else + FusedResidualDropoutBiasVec< + T, uint8_t, + VecSize><<>>( + rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, + bias, mask_data, dst, increment); +} + +template +void LaunchResidualDropoutBiasTest(const uint32_t rows, const uint32_t cols, + const float dropout_prob, + bool is_upscale_in_train, const T *src, + const T *residual, const T *bias, T *dst, + const platform::CUDADeviceContext &ctx) { + if (std::abs(dropout_prob - 1.0) < 1e-5) { + PADDLE_ENFORCE_CUDA_SUCCESS( + 
cudaMemcpyAsync(dst, residual, rows * cols * sizeof(T), + cudaMemcpyDeviceToDevice, ctx.stream())); + return; + } + auto threads = GetResidualDropoutBiasThreads(ctx, rows, cols); + FusedResidualDropoutBiasTest< + T, VecSize><<>>( + rows, cols, dropout_prob, is_upscale_in_train, src, residual, bias, dst); +} + +/********Backward**************/ +template +__global__ void FusedResidualDropoutGrad(const T *dout, const MaskType *mask, + const T factor, const int64_t size, + T *dx, bool is_upscale_in_train) { + int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; + + int tmp_size = size / VecSize * VecSize; + for (int i = idx * VecSize; i < tmp_size; + i += blockDim.x * gridDim.x * VecSize) { +#pragma unroll + for (int ii = 0; ii < VecSize; ii++) { + dx[i + ii] = dout[i + ii] * static_cast(mask[i + ii]) * factor; + } + } + + int high_index = tmp_size + idx; + if (size > high_index) { + for (int i = high_index; i < size; i++) { + if (is_upscale_in_train) + dx[i] = dout[i] * static_cast(mask[i]) * factor; + else + dx[i] = dout[i] * static_cast(mask[i]); + } + } +} + +template +__global__ void FusedResidualDropoutGradVec(const T *dout, const MaskType *mask, + const T factor, const int64_t size, + T *dx, bool is_upscale_in_train) { + int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; + + using LoadT = AlignedVector; + using MaskLoadT = AlignedVector; + for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) { + T dout_vec[VecSize]; + MaskType mask_vec[VecSize]; + LoadT *dout_value = reinterpret_cast(&dout_vec); + MaskLoadT *mask_value = reinterpret_cast(&mask_vec); + *dout_value = *reinterpret_cast(&dout[i]); + *mask_value = *reinterpret_cast(&mask[i]); + + T dx_vec[VecSize]; +#pragma unroll + for (int ii = 0; ii < VecSize; ii++) { + dx_vec[ii] = dout_vec[ii] * static_cast(mask_vec[ii]) * factor; + } + *(reinterpret_cast(&dx[i])) = + *reinterpret_cast(&dx_vec[0]); + } +} + +template +__global__ void FusedResidualDropoutBiasGrad( + const T *dout, const 
MaskType *mask, const T factor, const int64_t rows, + const int64_t cols, T *dx, T *dbias, bool is_upscale_in_train) { + int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x; + + __shared__ T cache[BSX][BSY]; + T tmp_sum = static_cast(0); + if (col_id < cols) { + for (int row_id = threadIdx.y; row_id < rows; row_id += blockDim.y) { + int index = row_id * cols + col_id; + T out_value = dout[index]; + if (is_upscale_in_train) + dx[index] = out_value * static_cast(mask[index]) * factor; + else + dx[index] = out_value * static_cast(mask[index]); + tmp_sum += out_value; + } + } + cache[threadIdx.x][threadIdx.y] = tmp_sum; + __syncthreads(); + + // reduce sum + // TODO(zhangkaihuo) : Replace with ModuleAPI + T sum = static_cast(0); + int tid = threadIdx.y * blockDim.x + threadIdx.x; + int x = tid / BSY; + int y = tid & (BSY - 1); + + int s = BSY / 2; + while (s > 0) { + if (y < s) { + cache[x][y] += cache[x][y + s]; + } + s /= 2; + __syncthreads(); + } + + if (threadIdx.y == 0 && col_id < cols) { + dbias[col_id] = cache[threadIdx.x][0]; + } +} + +template +__global__ void FusedResidualDropoutBiasGradVec( + const T *dout, const MaskType *mask, const T factor, const int64_t rows, + const int64_t cols, T *dx, T *dbias, bool is_upscale_in_train) { + int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x; + + using LoadT = AlignedVector; + using MaskLoadT = AlignedVector; + + T tmp_sum[VecSize] = {static_cast(0)}; + if (col_id * 4 < cols) { + for (int row_id = threadIdx.y; row_id < rows; row_id += blockDim.y) { + int index = row_id * cols + col_id * 4; + T out_vec[VecSize]; + MaskType mask_vec[VecSize]; + T dx_vec[VecSize]; + LoadT *out_value = reinterpret_cast(&out_vec); + MaskLoadT *mask_value = reinterpret_cast(&mask_vec); + LoadT *dx_value = reinterpret_cast(&dx_vec); + *out_value = *reinterpret_cast(&dout[index]); + *mask_value = *reinterpret_cast(&mask[index]); + + if (is_upscale_in_train) { +#pragma unroll + for (int i = 0; i < VecSize; i++) { + dx_vec[i] = 
out_vec[i] * static_cast(mask_vec[i]) * factor; + tmp_sum[i] += out_vec[i]; + } + } else { +#pragma unroll + for (int i = 0; i < VecSize; i++) { + dx_vec[i] = out_vec[i] * static_cast(mask_vec[i]); + tmp_sum[i] += out_vec[i]; + } + } + + *(reinterpret_cast(&dx[index])) = + *reinterpret_cast(&dx_vec[0]); + } + } + + __shared__ T cache[BSX * VecSize][BSY]; + for (int i = 0; i < VecSize; i++) + cache[threadIdx.x * VecSize + i][threadIdx.y] = tmp_sum[i]; + __syncthreads(); + + // reduce sum + // TODO(zhangkaihuo) : Replace with ModuleAPI + T sum = static_cast(0); + int tid = threadIdx.y * blockDim.x + threadIdx.x; + int x = tid / BSY; + int y = tid & (BSY - 1); + + int s = BSY / 2; + while (s > 0) { + if (y < s) { + for (int i = 0; i < VecSize; i++) { + cache[x * VecSize + i][y] += cache[x * VecSize + i][y + s]; + } + } + s /= 2; + __syncthreads(); + } + + if (threadIdx.y == 0 && col_id * VecSize < cols) { + for (int i = 0; i < VecSize; i++) + dbias[col_id * VecSize + i] = cache[threadIdx.x * VecSize + i][0]; + } +} + +template +void LaunchResidualDropoutBiasGrad(const T *dout, const MaskType *mask, + const float dropout_prob, + const bool is_upscale_in_train, + const uint32_t rows, const uint32_t cols, + T *dx, T *dbias, + const platform::CUDADeviceContext &ctx) { + const T zero = static_cast(0.0); + auto factor = dropout_prob == static_cast(1.0) + ? 
zero + : static_cast(1.0 / (1.0 - dropout_prob)); + + if (dbias != nullptr) { + if (cols % 4 == 0) { + auto threads = std::min(cols / VecSize, static_cast(8)); + auto blocks = std::max((uint32_t)1, + std::min((cols / VecSize + threads - 1) / threads, + (uint32_t)ctx.GetSMCount())); + dim3 block_dim(threads, 128, 1); + dim3 grid_dim(blocks, 1, 1); + FusedResidualDropoutBiasGradVec< + T, MaskType, 8, 128, + VecSize><<>>( + dout, mask, factor, rows, cols, dx, dbias, is_upscale_in_train); + + } else { + auto threads = std::min(cols, static_cast(8)); + auto blocks = std::max( + (uint32_t)1, + std::min((cols + threads - 1) / threads, (uint32_t)ctx.GetSMCount())); + dim3 block_dim(threads, 128, 1); + dim3 grid_dim(blocks, 1, 1); + FusedResidualDropoutBiasGrad< + T, MaskType, 8, 128><<>>( + dout, mask, factor, rows, cols, dx, dbias, is_upscale_in_train); + } + } else { + const uint64_t n = rows * cols; + auto threads = GetResidualDropoutThreads(ctx, n); + if (n % 4 == 0) { + FusedResidualDropoutGradVec< + T, MaskType, + VecSize><<>>( + dout, mask, factor, n, dx, is_upscale_in_train); + } else { + FusedResidualDropoutGrad< + T, MaskType><<>>( + dout, mask, factor, n, dx, is_upscale_in_train); + } + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/test_fused_residual_dropout_bias.cu b/paddle/fluid/operators/fused/test_fused_residual_dropout_bias.cu new file mode 100644 index 00000000000000..c8a1485ab7e0fa --- /dev/null +++ b/paddle/fluid/operators/fused/test_fused_residual_dropout_bias.cu @@ -0,0 +1,441 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/fused/fused_residual_dropout_bias.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; + +USE_OP(elementwise_add); +USE_OP(dropout); + +template +void Dropout(const std::vector &x, const framework::DDim &x_dim, + std::vector *out, std::vector *mask, + const platform::CUDADeviceContext &ctx, uint64_t seed, + float dropout_prob, bool is_upscale_in_train, bool is_test) { + framework::Scope scope; + auto var_x = scope.Var("X"); + auto tensor_x = var_x->GetMutable(); + tensor_x->Resize(x_dim); + tensor_x->mutable_data(ctx.GetPlace()); + cudaMemcpy(tensor_x->data(), x.data(), x_dim[0] * x_dim[1] * sizeof(T), + cudaMemcpyHostToDevice); + + auto var_out = scope.Var("Out"); + auto tensor_out = var_out->GetMutable(); + + auto var_mask = scope.Var("Mask"); + auto tensor_mask = var_mask->GetMutable(); + + framework::AttributeMap attrs; + attrs.insert({"fix_seed", 1}); + attrs.insert({"seed", static_cast(seed)}); + attrs.insert({"dropout_prob", dropout_prob}); + if (is_upscale_in_train) { + attrs.insert({"dropout_implementation", std::string("upscale_in_train")}); + } + if (is_test) { + attrs.insert({"is_test", 1}); + } + + auto op = framework::OpRegistry::CreateOp( + 
"dropout", {{"X", {"X"}}}, {{"Out", {"Out"}}, {"Mask", {"Mask"}}}, attrs); + op->Run(scope, ctx.GetPlace()); + cudaMemcpy((*out).data(), tensor_out->data(), + x_dim[0] * x_dim[1] * sizeof(T), cudaMemcpyDeviceToHost); + if (!is_test) { + cudaMemcpy((*mask).data(), tensor_mask->data(), + x_dim[0] * x_dim[1] * sizeof(uint8_t), cudaMemcpyDeviceToHost); + } + ctx.Wait(); +} + +template +void DropoutGrad(std::vector *dx, const framework::DDim &x_dim, + const std::vector &dout, const std::vector &mask, + const platform::CUDADeviceContext &ctx, float dropout_prob, + bool is_upscale_in_train) { + framework::Scope scope; + const size_t n = x_dim[0] * x_dim[1]; + auto var_out = scope.Var("DOut"); + auto tensor_out = var_out->GetMutable(); + tensor_out->Resize(x_dim); + tensor_out->mutable_data(ctx.GetPlace()); + cudaMemcpy(tensor_out->data(), dout.data(), n * sizeof(T), + cudaMemcpyHostToDevice); + + auto var_mask = scope.Var("Mask"); + auto tensor_mask = var_mask->GetMutable(); + tensor_mask->Resize(x_dim); + tensor_mask->mutable_data(ctx.GetPlace()); + cudaMemcpy(tensor_mask->data(), mask.data(), n * sizeof(uint8_t), + cudaMemcpyHostToDevice); + + auto var_dx = scope.Var("DX"); + auto tensor_dx = var_dx->GetMutable(); + + framework::AttributeMap attrs; + attrs.insert({"dropout_prob", dropout_prob}); + attrs.insert({"is_test", 0}); + if (is_upscale_in_train) { + attrs.insert({"dropout_implementation", std::string("upscale_in_train")}); + } else { + attrs.insert({"dropout_implementation", std::string("downgrade_in_infer")}); + } + + auto op = framework::OpRegistry::CreateOp( + "dropout_grad", {{"Out@GRAD", {"DOut"}}, {"Mask", {"Mask"}}}, + {{"X@GRAD", {"DX"}}}, attrs); + op->Run(scope, ctx.GetPlace()); + + cudaMemcpy((*dx).data(), tensor_dx->data(), + x_dim[0] * x_dim[1] * sizeof(T), cudaMemcpyDeviceToHost); + ctx.Wait(); +} + +template +struct TestFusedResidualDropoutBias { + uint32_t _rows; + uint32_t _cols; + uint64_t _seed; + float _dropout_prob; + bool 
_is_upscale_in_train; + bool _is_test; // default false, Set to true for inference only + bool _has_bias = true; + framework::Tensor _src, _residual, _bias, _out, _mask; + framework::Tensor _dsrc, _dbias; + + std::vector _src_vec, _residual_vec, _bias_vec, _out_vec, _mask_vec; + std::vector _correct_out, _correct_dsrc, _correct_dbias; + std::vector _correct_mask; + + platform::CUDAPlace _place; + platform::CUDADeviceContext *_ctx; + + TestFusedResidualDropoutBias() { + _rows = 32; + _cols = 32; + _seed = 0; + _dropout_prob = 0.0; + _is_upscale_in_train = false; + _is_test = false; + _has_bias = true; + _ctx = new platform::CUDADeviceContext(_place); + } + + TestFusedResidualDropoutBias(int rows, int cols, uint64_t seed = 0, + float dropout_prob = 0.0, + bool is_upscale_in_train = false, + bool is_test = false) { + _rows = rows; + _cols = cols; + _seed = seed; + _dropout_prob = dropout_prob; + _is_upscale_in_train = is_upscale_in_train; + _is_test = is_test; + _has_bias = true; + _ctx = new platform::CUDADeviceContext(_place); + } + + ~TestFusedResidualDropoutBias() { delete _ctx; } + + void SetUp() { + const int n = _rows * _cols; + _correct_out.resize(n); + _correct_mask.resize(n); + _correct_dsrc.resize(n); + _correct_dbias.resize(_cols); + + _src_vec.resize(n); + _residual_vec.resize(n); + _bias_vec.resize(_cols); + std::default_random_engine random(time(NULL)); + std::uniform_real_distribution dis(0.0, 1.0); + + for (int i = 0; i < _rows; i++) { + for (int j = 0; j < _cols; j++) { + _src_vec[i * _cols + j] = static_cast(dis(random)); + _residual_vec[i * _cols + j] = static_cast(dis(random)); + if (i == 0) _bias_vec[j] = dis(random); + } + } + + framework::TensorFromVector(_src_vec, *_ctx, &_src); + _src.Resize({_rows, _cols}); + framework::TensorFromVector(_residual_vec, *_ctx, &_residual); + _residual.Resize({_rows, _cols}); + if (_has_bias) { + framework::TensorFromVector(_bias_vec, *_ctx, &_bias); + _bias.Resize({_cols}); + } + + { + _out.Resize({_rows, 
_cols}); + _out.mutable_data(_place); + _mask.Resize({_rows, _cols}); + _mask.mutable_data(_place); + _dsrc.Resize({_rows, _cols}); + _dsrc.mutable_data(_place); + + if (_has_bias) { + _dbias.Resize({_cols}); + _dbias.mutable_data(_place); + } + } + } + + void BaseForward() { + std::vector out1(_rows * _cols), out2(_rows * _cols); + if (_has_bias) { + for (int i = 0; i < _rows; i++) { + for (int j = 0; j < _cols; j++) { + out1[i * _cols + j] = _src_vec[i * _cols + j] + _bias_vec[j]; + } + } + Dropout(out1, _src.dims(), &out2, &_correct_mask, *_ctx, _seed, + _dropout_prob, _is_upscale_in_train, _is_test); + } else { + Dropout(_src_vec, _src.dims(), &out2, &_correct_mask, *_ctx, _seed, + _dropout_prob, _is_upscale_in_train, _is_test); + } + for (int i = 0; i < _rows; i++) { + for (int j = 0; j < _cols; j++) { + _correct_out[i * _cols + j] = + _residual_vec[i * _cols + j] + out2[i * _cols + j]; + } + } + _ctx->Wait(); + } + + void BaseBackward() { + if (!_is_upscale_in_train) { + for (int i = 0; i < _rows * _cols; i++) { + _correct_dsrc[i] = _correct_out[i] * static_cast(_correct_mask[i]); + } + } else { + DropoutGrad(&_correct_dsrc, _src.dims(), _correct_out, _correct_mask, + *_ctx, _dropout_prob, _is_upscale_in_train); + } + memset(&_correct_dbias[0], 0, _cols * sizeof(T)); + for (int i = 0; i < _rows; i++) { + for (int j = 0; j < _cols; j++) { + _correct_dbias[j] += _correct_out[i * _cols + j]; + } + } + } + + void FusedForward() { + auto threads = paddle::operators::GetResidualDropoutBiasThreads( + *_ctx, (uint64_t)_rows, (uint64_t)_cols); + const int increment = + ((_cols - 1) / (threads.first.x * threads.second.x * VecSize) + 1) * + VecSize; + + T *bias_ptr = nullptr; + if (_has_bias) { + bias_ptr = _bias.data(); + } + if (_is_test) { + paddle::operators::LaunchResidualDropoutBiasTest( + _rows, _cols, _dropout_prob, _is_upscale_in_train, _src.data(), + _residual.data(), bias_ptr, _out.data(), *_ctx); + } else { + paddle::operators::LaunchResidualDropoutBias( + 
_rows, _cols, increment, _seed, _dropout_prob, _is_upscale_in_train, + _src.data(), _residual.data(), bias_ptr, _mask.data(), + _out.data(), *_ctx); + } + _ctx->Wait(); + } + + void FusedBackward() { + if (_is_test) return; + + T *bias_ptr = nullptr; + if (_has_bias) { + bias_ptr = _dbias.data(); + } + paddle::operators::LaunchResidualDropoutBiasGrad( + _out.data(), _mask.data(), _dropout_prob, + _is_upscale_in_train, _rows, _cols, _dsrc.data(), bias_ptr, *_ctx); + } + + void Run() { + SetUp(); + BaseForward(); + FusedForward(); + BaseBackward(); + FusedBackward(); + } + + void CheckOut(const T diff) { + const int n = _rows * _cols; + std::vector out(n); + std::vector mask(n); + cudaMemcpy(out.data(), _out.data(), _rows * _cols * sizeof(T), + cudaMemcpyDeviceToHost); + if (!_is_test) { + cudaMemcpy(mask.data(), _mask.data(), + _rows * _cols * sizeof(uint8_t), cudaMemcpyDeviceToHost); + } + _ctx->Wait(); + + for (int i = 0; i < n; i++) { + EXPECT_LT(std::abs(out[i] - _correct_out[i]), diff); + if (!_is_test) EXPECT_EQ(mask[i], _correct_mask[i]); + } + } + + void CheckGrad(const T diff) { + if (_is_test) return; + + const int n = _rows * _cols; + + std::vector dsrc(n); + cudaMemcpy(dsrc.data(), _dsrc.data(), _rows * _cols * sizeof(T), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < n; i++) { + EXPECT_LT(std::abs(dsrc[i] - _correct_dsrc[i]), diff); + } + + if (_has_bias) { + std::vector dbias(_cols); + cudaMemcpy(dbias.data(), _dbias.data(), _cols * sizeof(T), + cudaMemcpyDeviceToHost); + _ctx->Wait(); + for (int i = 0; i < _cols; i++) { + EXPECT_LT(std::abs(dbias[i] - _correct_dbias[i]), diff); + } + } + } +}; + +TEST(FusedDropout, GPUFusedRedisualDorpoutBias) { + const int rows = 16; + const int cols = 16; + TestFusedResidualDropoutBias test(rows, cols); + test.Run(); + test.CheckOut(static_cast(1e-5)); + test.CheckGrad(static_cast(1e-5)); +} + +TEST(FusedDropout, GPUFusedRedisualDorpoutBiasDouble) { + const int rows = 16; + const int cols = 16; + 
TestFusedResidualDropoutBias test(rows, cols); + test.Run(); + test.CheckOut(static_cast(1e-5)); + test.CheckGrad(static_cast(1e-5)); +} + +TEST(FusedDropout, GPUFusedRedisualDorpoutBiasFp16) { + const int rows = 16; + const int cols = 16; + TestFusedResidualDropoutBias test(rows, cols); + test.Run(); + test.CheckOut(static_cast(1e-2)); + // For inference, check_grad is not required. ref: test_dropout_op.py + // test.CheckGrad((platform::float16)1e-2); +} + +// test no bias and cols % 4 == 0 +TEST(FusedDropout, GPUFusedRedisualDorpoutBiasNoBias) { + const int rows = 16; + const int cols = 16; + TestFusedResidualDropoutBias test(rows, cols); + test._has_bias = false; + test.Run(); + test.CheckOut(static_cast(1e-5)); + test.CheckGrad(static_cast(1e-5)); +} + +// test no bias and cols % 4 != 0 +TEST(FusedDropout, GPUFusedRedisualDorpoutBiasNoBias2) { + const int rows = 16; + const int cols = 17; + TestFusedResidualDropoutBias test(rows, cols); + test._has_bias = false; + test.Run(); + test.CheckOut(static_cast(1e-5)); + test.CheckGrad(static_cast(1e-5)); +} + +// test add bias and cols % 4 != 0 +TEST(FusedDropout, GPUFusedRedisualDorpoutBias2) { + const int rows = 16; + const int cols = 17; + TestFusedResidualDropoutBias test(rows, cols); + test.Run(); + test.CheckOut(static_cast(1e-5)); + test.CheckGrad(static_cast(1e-5)); +} + +TEST(FusedDropout, GPUFusedRedisualDorpoutBias3) { + const int rows = 16; + const int cols = 16; + TestFusedResidualDropoutBias test(rows, cols, 0, 1.0, false, false); + test.Run(); + test.CheckOut(static_cast(1e-5)); + test.CheckGrad(static_cast(1e-5)); +} + +TEST(FusedDropout, GPUFusedRedisualDorpoutBias4) { + const int rows = 16; + const int cols = 16; + TestFusedResidualDropoutBias test(rows, cols, 0, 1.0, false, false); + test.Run(); + test.CheckOut(static_cast(1e-5)); + test.CheckGrad(static_cast(1e-5)); +} + +TEST(FusedDropout, GPUFusedRedisualDorpoutBias5) { + const int rows = 16; + const int cols = 16; + TestFusedResidualDropoutBias 
test(rows, cols, 0, 1.0, true, false); + test.Run(); + test.CheckOut(static_cast(1e-5)); + test.CheckGrad(static_cast(1e-5)); +} + +TEST(FusedDropout, GPUFusedRedisualDorpoutBias6) { + const int rows = 16; + const int cols = 16; + TestFusedResidualDropoutBias test(rows, cols, 0, 0.35, true, true); + test.Run(); + test.CheckOut(static_cast(1e-5)); + test.CheckGrad(static_cast(1e-5)); +} + +TEST(FusedDropout, GPUFusedRedisualDorpoutBias7) { + const int rows = 16; + const int cols = 16; + TestFusedResidualDropoutBias test(rows, cols, 125, 0.0, false, false); + test.Run(); + test.CheckOut(static_cast(1e-5)); + test.CheckGrad(static_cast(1e-5)); +} From 507117a86e68dd4dc572ff167f286b39d7c84416 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 23 Aug 2021 13:22:26 +0000 Subject: [PATCH 02/19] simplify the code, andd opt reduce sum --- .../fused/fused_residual_dropout_bias.h | 320 +++++------------- .../fused/test_fused_residual_dropout_bias.cu | 52 ++- 2 files changed, 101 insertions(+), 271 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index e0b51be9e909e3..2d0de22952c88a 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -21,6 +21,7 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" @@ -30,6 +31,7 @@ namespace paddle { namespace operators { namespace platform = paddle::platform; +namespace cg = cooperative_groups; inline std::pair GetResidualDropoutThreads( const platform::CUDADeviceContext &ctx, const uint64_t n) { @@ -73,86 +75,6 @@ inline int VectorizedSize(const T *pointer) { /** * dst = residual + dropout(src + bias); */ -template -__global__ void FusedResidualDropoutBias(const size_t rows, const size_t cols, - uint64_t seed, - const float dropout_prob, - const bool is_upscale_in_train, - const T *src, const T *residual, - const T *bias, MaskType *mask_data, - T *dst, uint64_t increment) { - int col_id = blockDim.x * blockIdx.x + threadIdx.x; - int row_id = blockIdx.y; - int idx = row_id * cols + col_id; - curandStatePhilox4_32_10_t state; - curand_init(seed, idx, increment, &state); - - T factor = static_cast(1.0f / (1.0f - dropout_prob)); - const int tmp_cols = cols / VecSize * VecSize; - for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { - for (int i = col_id * VecSize; i < tmp_cols; - i += blockDim.x * gridDim.x * VecSize) { - float4 rand = curand_uniform4(&state); - float *rand_data = &(rand.x); - MaskType mask[VecSize]; - T bias_vec[VecSize]; -#pragma unroll - for (int j = 0; j < VecSize; j++) { - mask[j] = (MaskType)(rand_data[j] > dropout_prob); - bias_vec[j] = bias != nullptr ? 
bias[i + j] : static_cast(0); - } -#pragma unroll - for (int j = 0; j < VecSize; j++) { - mask_data[r * cols + i + j] = mask[j]; - } - - if (is_upscale_in_train) { -#pragma unroll - for (int j = 0; j < VecSize; j++) { - dst[r * cols + i + j] = (src[r * cols + i + j] + bias_vec[j]) * - static_cast(mask[j]) * factor + - residual[r * cols + i + j]; - } - } else { -#pragma unroll - for (int j = 0; j < VecSize; j++) { - dst[r * cols + i + j] = - (src[r * cols + i + j] + bias_vec[j]) * static_cast(mask[j]) + - residual[r * cols + i + j]; - } - } - } - - int high_index = tmp_cols + col_id; - if (high_index < cols) { - float4 rand = curand_uniform4(&state); - float *rand_data = &(rand.x); - int k = 0; - if (is_upscale_in_train) { - for (int i = high_index; i < cols; i++) { - MaskType m = (MaskType)(rand_data[k++] > dropout_prob); - mask_data[r * cols + i] = m; - dst[r * cols + i] = - (src[r * cols + i] + - (bias != nullptr ? bias[i] : static_cast(0.0))) * - static_cast(m) * factor + - residual[r * cols + i]; - } - } else { - for (int i = high_index; i < cols; i++) { - MaskType m = (MaskType)(rand_data[k++] > dropout_prob); - mask_data[r * cols + i] = m; - dst[r * cols + i] = - (src[r * cols + i] + - (bias != nullptr ? 
bias[i] : static_cast(0.0))) * - static_cast(m) + - residual[r * cols + i]; - } - } - } - } -} - template __global__ void FusedResidualDropoutBiasVec(const size_t rows, const size_t cols, uint64_t seed, @@ -170,6 +92,9 @@ __global__ void FusedResidualDropoutBiasVec(const size_t rows, T dest; MaskType mask; T factor = static_cast(1.0f / (1.0f - dropout_prob)); + if (!is_upscale_in_train) { + factor = static_cast(1.0); + } using LoadT = AlignedVector; using MaskLoadT = AlignedVector; for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { @@ -202,20 +127,11 @@ __global__ void FusedResidualDropoutBiasVec(const size_t rows, mask_vec[ii] = (MaskType)((&rand.x)[ii] >= dropout_prob); } - if (is_upscale_in_train) { #pragma unroll - for (int ii = 0; ii < VecSize; ii++) { - dest_vec[ii] = (src_vec[ii] + bias_vec[ii]) * - static_cast(mask_vec[ii]) * factor + - residual_vec[ii]; - } - } else { -#pragma unroll - for (int ii = 0; ii < VecSize; ii++) { - dest_vec[ii] = - (src_vec[ii] + bias_vec[ii]) * static_cast(mask_vec[ii]) + - residual_vec[ii]; - } + for (int ii = 0; ii < VecSize; ii++) { + dest_vec[ii] = (src_vec[ii] + bias_vec[ii]) * + static_cast(mask_vec[ii]) * factor + + residual_vec[ii]; } *(reinterpret_cast(&dst[r * cols + i])) = *reinterpret_cast(&dest_vec[0]); @@ -237,47 +153,31 @@ __global__ void FusedResidualDropoutBiasTest(const size_t rows, int idx = row_id * cols + col_id; T factor = static_cast(1.0f - dropout_prob); + if (is_upscale_in_train) { + factor = static_cast(1.0); + } const int tmp_cols = cols / VecSize * VecSize; for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { for (int i = col_id * VecSize; i < tmp_cols; i += blockDim.x * gridDim.x * VecSize) { - if (is_upscale_in_train) { #pragma unroll - for (int j = 0; j < VecSize; j++) { - dst[r * cols + i + j] = - (src[r * cols + i + j] + - (bias != nullptr ? 
bias[i + j] : static_cast(0.0))) + - residual[r * cols + i + j]; - } - } else { -#pragma unroll - for (int j = 0; j < VecSize; j++) { - dst[r * cols + i + j] = - (src[r * cols + i + j] + - (bias != nullptr ? bias[i + j] : static_cast(0.0))) * - factor + - residual[r * cols + i + j]; - } + for (int j = 0; j < VecSize; j++) { + dst[r * cols + i + j] = + (src[r * cols + i + j] + + (bias != nullptr ? bias[i + j] : static_cast(0.0))) * + factor + + residual[r * cols + i + j]; } } int high_index = tmp_cols + col_id; if (high_index < cols) { - if (is_upscale_in_train) { - for (int i = high_index; i < cols; i++) { - dst[r * cols + i] = - (src[r * cols + i] + - (bias != nullptr ? bias[i] : static_cast(0.0))) + - residual[r * cols + i]; - } - } else { - for (int i = high_index; i < cols; i++) { - dst[r * cols + i] = - (src[r * cols + i] + - (bias != nullptr ? bias[i] : static_cast(0.0))) * - factor + - residual[r * cols + i]; - } + for (int i = high_index; i < cols; i++) { + dst[r * cols + i] = + (src[r * cols + i] + + (bias != nullptr ? 
bias[i] : static_cast(0.0))) * + factor + + residual[r * cols + i]; } } } @@ -305,9 +205,8 @@ void LaunchResidualDropoutBias(const uint32_t rows, const uint32_t cols, auto threads = GetResidualDropoutBiasThreads(ctx, rows, cols); if (cols % VecSize != 0) - FusedResidualDropoutBias< - T, uint8_t, - VecSize><<>>( + FusedResidualDropoutBiasVec< + T, uint8_t, 1><<>>( rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, bias, mask_data, dst, increment); else @@ -337,36 +236,10 @@ void LaunchResidualDropoutBiasTest(const uint32_t rows, const uint32_t cols, } /********Backward**************/ -template -__global__ void FusedResidualDropoutGrad(const T *dout, const MaskType *mask, - const T factor, const int64_t size, - T *dx, bool is_upscale_in_train) { - int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; - - int tmp_size = size / VecSize * VecSize; - for (int i = idx * VecSize; i < tmp_size; - i += blockDim.x * gridDim.x * VecSize) { -#pragma unroll - for (int ii = 0; ii < VecSize; ii++) { - dx[i + ii] = dout[i + ii] * static_cast(mask[i + ii]) * factor; - } - } - - int high_index = tmp_size + idx; - if (size > high_index) { - for (int i = high_index; i < size; i++) { - if (is_upscale_in_train) - dx[i] = dout[i] * static_cast(mask[i]) * factor; - else - dx[i] = dout[i] * static_cast(mask[i]); - } - } -} - template __global__ void FusedResidualDropoutGradVec(const T *dout, const MaskType *mask, const T factor, const int64_t size, - T *dx, bool is_upscale_in_train) { + T *dx) { int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; using LoadT = AlignedVector; @@ -389,62 +262,33 @@ __global__ void FusedResidualDropoutGradVec(const T *dout, const MaskType *mask, } } -template -__global__ void FusedResidualDropoutBiasGrad( - const T *dout, const MaskType *mask, const T factor, const int64_t rows, - const int64_t cols, T *dx, T *dbias, bool is_upscale_in_train) { - int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x; +template +__device__ void reduce_sum(T 
cache[BSX * VecSize][BSY]) {} - __shared__ T cache[BSX][BSY]; - T tmp_sum = static_cast(0); - if (col_id < cols) { - for (int row_id = threadIdx.y; row_id < rows; row_id += blockDim.y) { - int index = row_id * cols + col_id; - T out_value = dout[index]; - if (is_upscale_in_train) - dx[index] = out_value * static_cast(mask[index]) * factor; - else - dx[index] = out_value * static_cast(mask[index]); - tmp_sum += out_value; - } - } - cache[threadIdx.x][threadIdx.y] = tmp_sum; - __syncthreads(); - - // reduce sum - // TODO(zhangkaihuo) : Replace with ModuleAPI - T sum = static_cast(0); - int tid = threadIdx.y * blockDim.x + threadIdx.x; - int x = tid / BSY; - int y = tid & (BSY - 1); - - int s = BSY / 2; - while (s > 0) { - if (y < s) { - cache[x][y] += cache[x][y + s]; - } - s /= 2; - __syncthreads(); - } - - if (threadIdx.y == 0 && col_id < cols) { - dbias[col_id] = cache[threadIdx.x][0]; +template +static __forceinline__ __device__ U WarpReduceSum(U val) { + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, true); + const int warpSize = 32; + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + val += paddle::platform::CudaShuffleDownSync(mask, val, offset); } + return val; } template __global__ void FusedResidualDropoutBiasGradVec( const T *dout, const MaskType *mask, const T factor, const int64_t rows, - const int64_t cols, T *dx, T *dbias, bool is_upscale_in_train) { + const int64_t cols, T *dx, T *dbias) { int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x; using LoadT = AlignedVector; using MaskLoadT = AlignedVector; T tmp_sum[VecSize] = {static_cast(0)}; - if (col_id * 4 < cols) { + if (col_id * VecSize < cols) { for (int row_id = threadIdx.y; row_id < rows; row_id += blockDim.y) { - int index = row_id * cols + col_id * 4; + int index = row_id * cols + col_id * VecSize; T out_vec[VecSize]; MaskType mask_vec[VecSize]; T dx_vec[VecSize]; @@ -454,18 +298,10 @@ __global__ void FusedResidualDropoutBiasGradVec( *out_value = *reinterpret_cast(&dout[index]); 
*mask_value = *reinterpret_cast(&mask[index]); - if (is_upscale_in_train) { -#pragma unroll - for (int i = 0; i < VecSize; i++) { - dx_vec[i] = out_vec[i] * static_cast(mask_vec[i]) * factor; - tmp_sum[i] += out_vec[i]; - } - } else { #pragma unroll - for (int i = 0; i < VecSize; i++) { - dx_vec[i] = out_vec[i] * static_cast(mask_vec[i]); - tmp_sum[i] += out_vec[i]; - } + for (int i = 0; i < VecSize; i++) { + dx_vec[i] = out_vec[i] * static_cast(mask_vec[i]) * factor; + tmp_sum[i] += out_vec[i]; } *(reinterpret_cast(&dx[index])) = @@ -479,26 +315,23 @@ __global__ void FusedResidualDropoutBiasGradVec( __syncthreads(); // reduce sum - // TODO(zhangkaihuo) : Replace with ModuleAPI T sum = static_cast(0); int tid = threadIdx.y * blockDim.x + threadIdx.x; - int x = tid / BSY; - int y = tid & (BSY - 1); + int x = tid >> 5; + int y = tid & 31; - int s = BSY / 2; - while (s > 0) { - if (y < s) { - for (int i = 0; i < VecSize; i++) { - cache[x * VecSize + i][y] += cache[x * VecSize + i][y + s]; - } + if (x < BSX * VecSize) { +#pragma unroll + for (int i = 0; i < (BSY >> 5); i++) { + sum += cache[x][y + i * 32]; } - s /= 2; - __syncthreads(); } - if (threadIdx.y == 0 && col_id * VecSize < cols) { - for (int i = 0; i < VecSize; i++) - dbias[col_id * VecSize + i] = cache[threadIdx.x * VecSize + i][0]; + sum = WarpReduceSum(sum); + + int bias_id = blockIdx.x * blockDim.x * VecSize + x; + if (y == 0 && x < VecSize * BSX && bias_id < cols) { + dbias[bias_id] = sum; } } @@ -513,43 +346,42 @@ void LaunchResidualDropoutBiasGrad(const T *dout, const MaskType *mask, auto factor = dropout_prob == static_cast(1.0) ? 
zero : static_cast(1.0 / (1.0 - dropout_prob)); + if (!is_upscale_in_train) { + factor = static_cast(1.0); + } if (dbias != nullptr) { - if (cols % 4 == 0) { - auto threads = std::min(cols / VecSize, static_cast(8)); - auto blocks = std::max((uint32_t)1, - std::min((cols / VecSize + threads - 1) / threads, - (uint32_t)ctx.GetSMCount())); - dim3 block_dim(threads, 128, 1); - dim3 grid_dim(blocks, 1, 1); + int real_vec_size = VecSize; + if (cols % VecSize != 0) real_vec_size = 1; + auto threads = std::min(cols / real_vec_size, static_cast(8)); + auto blocks = std::max( + (uint32_t)1, std::min((cols / real_vec_size + threads - 1) / threads, + (uint32_t)ctx.GetSMCount())); + dim3 block_dim(threads, 128, 1); + dim3 grid_dim(blocks, 1, 1); + + if (cols % VecSize == 0) { FusedResidualDropoutBiasGradVec< T, MaskType, 8, 128, VecSize><<>>( - dout, mask, factor, rows, cols, dx, dbias, is_upscale_in_train); - + dout, mask, factor, rows, cols, dx, dbias); } else { - auto threads = std::min(cols, static_cast(8)); - auto blocks = std::max( - (uint32_t)1, - std::min((cols + threads - 1) / threads, (uint32_t)ctx.GetSMCount())); - dim3 block_dim(threads, 128, 1); - dim3 grid_dim(blocks, 1, 1); - FusedResidualDropoutBiasGrad< - T, MaskType, 8, 128><<>>( - dout, mask, factor, rows, cols, dx, dbias, is_upscale_in_train); + FusedResidualDropoutBiasGradVec< + T, MaskType, 8, 128, 1><<>>( + dout, mask, factor, rows, cols, dx, dbias); } } else { const uint64_t n = rows * cols; auto threads = GetResidualDropoutThreads(ctx, n); - if (n % 4 == 0) { + if (n % VecSize == 0) { FusedResidualDropoutGradVec< T, MaskType, VecSize><<>>( - dout, mask, factor, n, dx, is_upscale_in_train); + dout, mask, factor, n, dx); } else { - FusedResidualDropoutGrad< - T, MaskType><<>>( - dout, mask, factor, n, dx, is_upscale_in_train); + FusedResidualDropoutGradVec< + T, MaskType, 1><<>>( + dout, mask, factor, n, dx); } } } diff --git a/paddle/fluid/operators/fused/test_fused_residual_dropout_bias.cu 
b/paddle/fluid/operators/fused/test_fused_residual_dropout_bias.cu index c8a1485ab7e0fa..12c2fd6be68360 100644 --- a/paddle/fluid/operators/fused/test_fused_residual_dropout_bias.cu +++ b/paddle/fluid/operators/fused/test_fused_residual_dropout_bias.cu @@ -33,16 +33,16 @@ USE_OP(elementwise_add); USE_OP(dropout); template -void Dropout(const std::vector &x, const framework::DDim &x_dim, - std::vector *out, std::vector *mask, - const platform::CUDADeviceContext &ctx, uint64_t seed, - float dropout_prob, bool is_upscale_in_train, bool is_test) { +void Dropout(const T *x, const framework::DDim &x_dim, T *out, + std::vector *mask, const platform::CUDADeviceContext &ctx, + uint64_t seed, float dropout_prob, bool is_upscale_in_train, + bool is_test) { framework::Scope scope; auto var_x = scope.Var("X"); auto tensor_x = var_x->GetMutable(); tensor_x->Resize(x_dim); tensor_x->mutable_data(ctx.GetPlace()); - cudaMemcpy(tensor_x->data(), x.data(), x_dim[0] * x_dim[1] * sizeof(T), + cudaMemcpy(tensor_x->data(), x, x_dim[0] * x_dim[1] * sizeof(T), cudaMemcpyHostToDevice); auto var_out = scope.Var("Out"); @@ -65,8 +65,8 @@ void Dropout(const std::vector &x, const framework::DDim &x_dim, auto op = framework::OpRegistry::CreateOp( "dropout", {{"X", {"X"}}}, {{"Out", {"Out"}}, {"Mask", {"Mask"}}}, attrs); op->Run(scope, ctx.GetPlace()); - cudaMemcpy((*out).data(), tensor_out->data(), - x_dim[0] * x_dim[1] * sizeof(T), cudaMemcpyDeviceToHost); + cudaMemcpy(out, tensor_out->data(), x_dim[0] * x_dim[1] * sizeof(T), + cudaMemcpyDeviceToHost); if (!is_test) { cudaMemcpy((*mask).data(), tensor_mask->data(), x_dim[0] * x_dim[1] * sizeof(uint8_t), cudaMemcpyDeviceToHost); @@ -75,24 +75,23 @@ void Dropout(const std::vector &x, const framework::DDim &x_dim, } template -void DropoutGrad(std::vector *dx, const framework::DDim &x_dim, - const std::vector &dout, const std::vector &mask, - const platform::CUDADeviceContext &ctx, float dropout_prob, - bool is_upscale_in_train) { +void 
DropoutGrad(T *dx, const framework::DDim &x_dim, const T *dout, + const uint8_t *mask, const platform::CUDADeviceContext &ctx, + float dropout_prob, bool is_upscale_in_train) { framework::Scope scope; const size_t n = x_dim[0] * x_dim[1]; auto var_out = scope.Var("DOut"); auto tensor_out = var_out->GetMutable(); tensor_out->Resize(x_dim); tensor_out->mutable_data(ctx.GetPlace()); - cudaMemcpy(tensor_out->data(), dout.data(), n * sizeof(T), + cudaMemcpy(tensor_out->data(), dout, n * sizeof(T), cudaMemcpyHostToDevice); auto var_mask = scope.Var("Mask"); auto tensor_mask = var_mask->GetMutable(); tensor_mask->Resize(x_dim); tensor_mask->mutable_data(ctx.GetPlace()); - cudaMemcpy(tensor_mask->data(), mask.data(), n * sizeof(uint8_t), + cudaMemcpy(tensor_mask->data(), mask, n * sizeof(uint8_t), cudaMemcpyHostToDevice); auto var_dx = scope.Var("DX"); @@ -112,8 +111,8 @@ void DropoutGrad(std::vector *dx, const framework::DDim &x_dim, {{"X@GRAD", {"DX"}}}, attrs); op->Run(scope, ctx.GetPlace()); - cudaMemcpy((*dx).data(), tensor_dx->data(), - x_dim[0] * x_dim[1] * sizeof(T), cudaMemcpyDeviceToHost); + cudaMemcpy(dx, tensor_dx->data(), x_dim[0] * x_dim[1] * sizeof(T), + cudaMemcpyDeviceToHost); ctx.Wait(); } @@ -211,17 +210,20 @@ struct TestFusedResidualDropoutBias { void BaseForward() { std::vector out1(_rows * _cols), out2(_rows * _cols); if (_has_bias) { + // add bias for (int i = 0; i < _rows; i++) { for (int j = 0; j < _cols; j++) { out1[i * _cols + j] = _src_vec[i * _cols + j] + _bias_vec[j]; } } - Dropout(out1, _src.dims(), &out2, &_correct_mask, *_ctx, _seed, - _dropout_prob, _is_upscale_in_train, _is_test); + // call dropout + Dropout(out1.data(), _src.dims(), out2.data(), &_correct_mask, *_ctx, + _seed, _dropout_prob, _is_upscale_in_train, _is_test); } else { - Dropout(_src_vec, _src.dims(), &out2, &_correct_mask, *_ctx, _seed, - _dropout_prob, _is_upscale_in_train, _is_test); + Dropout(_src_vec.data(), _src.dims(), out2.data(), &_correct_mask, + *_ctx, _seed, 
_dropout_prob, _is_upscale_in_train, _is_test); } + // add residual for (int i = 0; i < _rows; i++) { for (int j = 0; j < _cols; j++) { _correct_out[i * _cols + j] = @@ -232,14 +234,10 @@ struct TestFusedResidualDropoutBias { } void BaseBackward() { - if (!_is_upscale_in_train) { - for (int i = 0; i < _rows * _cols; i++) { - _correct_dsrc[i] = _correct_out[i] * static_cast(_correct_mask[i]); - } - } else { - DropoutGrad(&_correct_dsrc, _src.dims(), _correct_out, _correct_mask, - *_ctx, _dropout_prob, _is_upscale_in_train); - } + DropoutGrad(_correct_dsrc.data(), _src.dims(), _correct_out.data(), + _correct_mask.data(), *_ctx, _dropout_prob, + _is_upscale_in_train); + // calc dbias memset(&_correct_dbias[0], 0, _cols * sizeof(T)); for (int i = 0; i < _rows; i++) { for (int j = 0; j < _cols; j++) { From 462caa1f3226012289c5245b9c478294f9951a91 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 24 Aug 2021 04:46:40 +0000 Subject: [PATCH 03/19] resolve review comments and add comments to the code --- paddle/fluid/operators/fused/CMakeLists.txt | 2 +- paddle/fluid/operators/fused/fused_dropout.h | 70 ++++++ .../fused/fused_residual_dropout_bias.h | 204 +++++++++--------- ...cu => fused_residual_dropout_bias_test.cu} | 25 ++- 4 files changed, 192 insertions(+), 109 deletions(-) create mode 100644 paddle/fluid/operators/fused/fused_dropout.h rename paddle/fluid/operators/fused/{test_fused_residual_dropout_bias.cu => fused_residual_dropout_bias_test.cu} (95%) diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 525f6504f9fa61..78ff136c4d1038 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -74,6 +74,6 @@ if (WITH_GPU OR WITH_ROCM) # fused_dropout # only support CUDA if(NOT WITH_ROCM) - nv_test(test_fused_residual_dropout_bias SRCS test_fused_residual_dropout_bias.cu DEPS tensor op_registry elementwise_add_op dropout_op device_context generator) + 
nv_test(test_fused_residual_dropout_bias SRCS fused_residual_dropout_bias_test.cu DEPS tensor op_registry elementwise_add_op dropout_op device_context generator) endif() endif() diff --git a/paddle/fluid/operators/fused/fused_dropout.h b/paddle/fluid/operators/fused/fused_dropout.h new file mode 100644 index 00000000000000..bd6a4122f5830d --- /dev/null +++ b/paddle/fluid/operators/fused/fused_dropout.h @@ -0,0 +1,70 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include +#include + +#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +/** + * get 1D threads and blocks + */ +template +inline std::pair Get1DThreadsAndBlocks( + const platform::CUDADeviceContext &ctx, const uint64_t n) { + const uint64_t tmp_n = n / VecSize; + int threads = std::max( + (uint64_t)32, std::min(tmp_n, (uint64_t)ctx.GetMaxThreadsPerBlock())); + int blocks = std::max((uint64_t)1, (tmp_n + threads - 1) / threads); + return std::pair{threads, blocks}; +} + +/** + * get the threads for fused_residual_dropout_bias: + * 1D blocks: blockDim.x = cols + * 2D grids: gridDim.y = rows + */ +template +inline std::pair Get1DBlocksAnd2DGrids( + const platform::CUDADeviceContext &ctx, const uint32_t rows, + const uint32_t cols) { + const uint32_t tmp_cols = cols / VecSize; + int threads = std::max( + (uint32_t)32, std::min(tmp_cols, (uint32_t)ctx.GetMaxThreadsPerBlock())); + int blocks_x = std::max((uint32_t)1, (tmp_cols + threads - 1) / threads); + int blocks_y = std::max((uint32_t)1, rows); + dim3 block_dim(threads, 1, 1); + dim3 grid_dim(blocks_x, blocks_y, 1); + return std::pair{block_dim, grid_dim}; +} + +// aligned vector generates vectorized load/store on CUDA +template +struct alignas(sizeof(T) * VecSize) AlignedVector { + T val[VecSize]; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index 2d0de22952c88a..16747d7739be1e 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,18 +14,7 @@ limitations under the License. */ #pragma once -#include -#include -#include - -#include -#include - -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/float16.h" - -const int VecSize = 4; +#include "paddle/fluid/operators/fused/fused_dropout.h" namespace paddle { namespace operators { @@ -33,47 +22,11 @@ namespace operators { namespace platform = paddle::platform; namespace cg = cooperative_groups; -inline std::pair GetResidualDropoutThreads( - const platform::CUDADeviceContext &ctx, const uint64_t n) { - const uint64_t tmp_n = n / VecSize; - int threads = std::max( - (uint64_t)32, std::min(tmp_n, (uint64_t)ctx.GetMaxThreadsPerBlock())); - int blocks = std::max((uint64_t)1, (tmp_n + threads - 1) / threads); - return std::pair{threads, blocks}; -} - -inline std::pair GetResidualDropoutBiasThreads( - const platform::CUDADeviceContext &ctx, const uint32_t rows, - const uint32_t cols) { - const uint32_t tmp_cols = cols / VecSize; - int threads = std::max( - (uint32_t)32, std::min(tmp_cols, (uint32_t)ctx.GetMaxThreadsPerBlock())); - int blocks_x = std::max((uint32_t)1, (tmp_cols + threads - 1) / threads); - int blocks_y = std::max((uint32_t)1, rows); - dim3 block_dim(threads, 1, 1); - dim3 grid_dim(blocks_x, blocks_y, 1); - return std::pair{block_dim, grid_dim}; -} - /********Forward**************/ -// aligned vector generates vectorized load/store on CUDA -template -struct alignas(sizeof(T) * Size) AlignedVector { - T val[Size]; -}; - -template -inline int VectorizedSize(const T *pointer) { - uint64_t address = reinterpret_cast(pointer); - constexpr int vec4 = std::alignment_of>::value; // NOLINT - if (address % vec4 == 0) { - return 4; - } - return 1; -} - /** - * dst = residual + dropout(src + bias); + * @brief dst = residual + 
dropout(src + bias); + * the src, residual, mask and dst shape is (rows, cols) + * the bias shape is (1, cols) */ template __global__ void FusedResidualDropoutBiasVec(const size_t rows, @@ -81,7 +34,7 @@ __global__ void FusedResidualDropoutBiasVec(const size_t rows, const float dropout_prob, const bool is_upscale_in_train, const T *src, const T *residual, - const T *bias, MaskType *mask_data, + const T *bias, MaskType *mask, T *dst, uint64_t increment) { int col_id = blockDim.x * blockIdx.x + threadIdx.x; int row_id = blockIdx.y; @@ -89,11 +42,9 @@ __global__ void FusedResidualDropoutBiasVec(const size_t rows, curandStatePhilox4_32_10_t state; curand_init(seed, idx, increment, &state); - T dest; - MaskType mask; T factor = static_cast(1.0f / (1.0f - dropout_prob)); if (!is_upscale_in_train) { - factor = static_cast(1.0); + factor = static_cast(1.0f); } using LoadT = AlignedVector; using MaskLoadT = AlignedVector; @@ -107,6 +58,7 @@ __global__ void FusedResidualDropoutBiasVec(const size_t rows, for (int ii = 0; ii < VecSize; ii++) { bias_vec[ii] = static_cast(0); } + // vectorize load data from global LoadT *value = reinterpret_cast(&src_vec); LoadT *residual_value = reinterpret_cast(&residual_vec); *value = *reinterpret_cast(&src[r * cols + i]); @@ -133,58 +85,77 @@ __global__ void FusedResidualDropoutBiasVec(const size_t rows, static_cast(mask_vec[ii]) * factor + residual_vec[ii]; } + + // store result to global *(reinterpret_cast(&dst[r * cols + i])) = *reinterpret_cast(&dest_vec[0]); - *(reinterpret_cast(&mask_data[r * cols + i])) = + *(reinterpret_cast(&mask[r * cols + i])) = *reinterpret_cast(&mask_vec[0]); } } } +/** + * @brief for dropout's param is_test = true + * the src, residual and dst shape is (rows, cols) + * the bias shape is (1, cols) + */ template -__global__ void FusedResidualDropoutBiasTest(const size_t rows, - const size_t cols, - const float dropout_prob, - const bool is_upscale_in_train, - const T *src, const T *residual, - const T *bias, T 
*dst) { +__global__ void FusedResidualDropoutBiasIsTest(const size_t rows, + const size_t cols, + const float dropout_prob, + const bool is_upscale_in_train, + const T *src, const T *residual, + const T *bias, T *dst) { int col_id = blockDim.x * blockIdx.x + threadIdx.x; int row_id = blockIdx.y; int idx = row_id * cols + col_id; T factor = static_cast(1.0f - dropout_prob); if (is_upscale_in_train) { - factor = static_cast(1.0); + factor = static_cast(1.0f); } + + using LoadT = AlignedVector; + const int tmp_cols = cols / VecSize * VecSize; for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { for (int i = col_id * VecSize; i < tmp_cols; i += blockDim.x * gridDim.x * VecSize) { + T src_vec[VecSize]; + T residual_vec[VecSize]; + T bias_vec[VecSize]; #pragma unroll - for (int j = 0; j < VecSize; j++) { - dst[r * cols + i + j] = - (src[r * cols + i + j] + - (bias != nullptr ? bias[i + j] : static_cast(0.0))) * - factor + - residual[r * cols + i + j]; + for (int ii = 0; ii < VecSize; ii++) { + bias_vec[ii] = static_cast(0); } - } + // vectorize load data from global + LoadT *value = reinterpret_cast(&src_vec); + LoadT *residual_value = reinterpret_cast(&residual_vec); + *value = *reinterpret_cast(&src[r * cols + i]); + *residual_value = + *reinterpret_cast(&residual[r * cols + i]); - int high_index = tmp_cols + col_id; - if (high_index < cols) { - for (int i = high_index; i < cols; i++) { - dst[r * cols + i] = - (src[r * cols + i] + - (bias != nullptr ? bias[i] : static_cast(0.0))) * - factor + - residual[r * cols + i]; + LoadT *bias_value = + bias != nullptr ? 
reinterpret_cast(&bias_vec) : nullptr; + if (bias != nullptr) + *bias_value = *reinterpret_cast(&bias[i]); + + T dest_vec[VecSize]; +#pragma unroll + for (int ii = 0; ii < VecSize; ii++) { + dest_vec[ii] = (src_vec[ii] + bias_vec[ii]) * factor + residual_vec[ii]; } + + // store result to global + *(reinterpret_cast(&dst[r * cols + i])) = + *reinterpret_cast(&dest_vec[0]); } } } /** - * dst = residual + dropout(src + bias); + * @brief dst = residual + dropout(src + bias); */ template void LaunchResidualDropoutBias(const uint32_t rows, const uint32_t cols, @@ -194,7 +165,8 @@ void LaunchResidualDropoutBias(const uint32_t rows, const uint32_t cols, const T *residual, const T *bias, MaskType *mask_data, T *dst, const platform::CUDADeviceContext &ctx) { - if (std::abs(dropout_prob - 1.0) < 1e-5) { + // dropout_prob == 1.0f + if (std::abs(dropout_prob - 1.0f) < 1e-5) { PADDLE_ENFORCE_CUDA_SUCCESS( cudaMemcpyAsync(dst, residual, rows * cols * sizeof(T), cudaMemcpyDeviceToDevice, ctx.stream())); @@ -203,7 +175,8 @@ void LaunchResidualDropoutBias(const uint32_t rows, const uint32_t cols, return; } - auto threads = GetResidualDropoutBiasThreads(ctx, rows, cols); + const int VecSize = 4; + auto threads = Get1DBlocksAnd2DGrids(ctx, rows, cols); if (cols % VecSize != 0) FusedResidualDropoutBiasVec< T, uint8_t, 1><<>>( @@ -217,25 +190,39 @@ void LaunchResidualDropoutBias(const uint32_t rows, const uint32_t cols, bias, mask_data, dst, increment); } +/** + *@brief to launch kernel FusedResidualDropoutBiasIsTest + */ template -void LaunchResidualDropoutBiasTest(const uint32_t rows, const uint32_t cols, - const float dropout_prob, - bool is_upscale_in_train, const T *src, - const T *residual, const T *bias, T *dst, - const platform::CUDADeviceContext &ctx) { - if (std::abs(dropout_prob - 1.0) < 1e-5) { +void LaunchResidualDropoutBiasIsTest(const uint32_t rows, const uint32_t cols, + const float dropout_prob, + bool is_upscale_in_train, const T *src, + const T *residual, const T 
*bias, T *dst, + const platform::CUDADeviceContext &ctx) { + if (std::abs(dropout_prob - 1.0f) < 1e-5) { PADDLE_ENFORCE_CUDA_SUCCESS( cudaMemcpyAsync(dst, residual, rows * cols * sizeof(T), cudaMemcpyDeviceToDevice, ctx.stream())); return; } - auto threads = GetResidualDropoutBiasThreads(ctx, rows, cols); - FusedResidualDropoutBiasTest< - T, VecSize><<>>( - rows, cols, dropout_prob, is_upscale_in_train, src, residual, bias, dst); + const int VecSize = 4; + auto threads = Get1DBlocksAnd2DGrids(ctx, rows, cols); + if (cols % VecSize != 0) + FusedResidualDropoutBiasIsTest< + T, 1><<>>( + rows, cols, dropout_prob, is_upscale_in_train, src, residual, bias, + dst); + else + FusedResidualDropoutBiasIsTest< + T, VecSize><<>>( + rows, cols, dropout_prob, is_upscale_in_train, src, residual, bias, + dst); } /********Backward**************/ +/* + * @brief calculate the grad of no bias + */ template __global__ void FusedResidualDropoutGradVec(const T *dout, const MaskType *mask, const T factor, const int64_t size, @@ -262,9 +249,6 @@ __global__ void FusedResidualDropoutGradVec(const T *dout, const MaskType *mask, } } -template -__device__ void reduce_sum(T cache[BSX * VecSize][BSY]) {} - template static __forceinline__ __device__ U WarpReduceSum(U val) { unsigned mask = 0u; @@ -276,6 +260,12 @@ static __forceinline__ __device__ U WarpReduceSum(U val) { return val; } +/** + * blocks(128 * 8) + * 1. calculate the dx and reduce total rows to 128 rows + * 2. save 128*8 temporary sum in 8*128 shared memory + * 3. 
reduce the sum of 128 rows data by 8*VecSize warps + */ template __global__ void FusedResidualDropoutBiasGradVec( const T *dout, const MaskType *mask, const T factor, const int64_t rows, @@ -286,6 +276,7 @@ __global__ void FusedResidualDropoutBiasGradVec( using MaskLoadT = AlignedVector; T tmp_sum[VecSize] = {static_cast(0)}; + // calculate the dx and temporary sum if (col_id * VecSize < cols) { for (int row_id = threadIdx.y; row_id < rows; row_id += blockDim.y) { int index = row_id * cols + col_id * VecSize; @@ -309,6 +300,7 @@ __global__ void FusedResidualDropoutBiasGradVec( } } + // save temporary sum to cache and do transpose __shared__ T cache[BSX * VecSize][BSY]; for (int i = 0; i < VecSize; i++) cache[threadIdx.x * VecSize + i][threadIdx.y] = tmp_sum[i]; @@ -317,24 +309,31 @@ __global__ void FusedResidualDropoutBiasGradVec( // reduce sum T sum = static_cast(0); int tid = threadIdx.y * blockDim.x + threadIdx.x; - int x = tid >> 5; - int y = tid & 31; + int x = tid >> 5; // warp id + int y = tid & 31; // thread id on warp 0~31 + // need BSX * VecSize warps if (x < BSX * VecSize) { +// reduce 128 to 32 #pragma unroll for (int i = 0; i < (BSY >> 5); i++) { sum += cache[x][y + i * 32]; } } + // reduce 32 to 1 sum = WarpReduceSum(sum); + // save sum to dbias int bias_id = blockIdx.x * blockDim.x * VecSize + x; if (y == 0 && x < VecSize * BSX && bias_id < cols) { dbias[bias_id] = sum; } } +/** + * @brief to launch kernel FusedResidualDropoutBiasGradVec + */ template void LaunchResidualDropoutBiasGrad(const T *dout, const MaskType *mask, const float dropout_prob, @@ -342,14 +341,15 @@ void LaunchResidualDropoutBiasGrad(const T *dout, const MaskType *mask, const uint32_t rows, const uint32_t cols, T *dx, T *dbias, const platform::CUDADeviceContext &ctx) { - const T zero = static_cast(0.0); - auto factor = dropout_prob == static_cast(1.0) + const T zero = static_cast(0.0f); + auto factor = dropout_prob == static_cast(1.0f) ? 
zero - : static_cast(1.0 / (1.0 - dropout_prob)); + : static_cast(1.0f / (1.0f - dropout_prob)); if (!is_upscale_in_train) { - factor = static_cast(1.0); + factor = static_cast(1.0f); } + const int VecSize = 4; if (dbias != nullptr) { int real_vec_size = VecSize; if (cols % VecSize != 0) real_vec_size = 1; @@ -372,7 +372,7 @@ void LaunchResidualDropoutBiasGrad(const T *dout, const MaskType *mask, } } else { const uint64_t n = rows * cols; - auto threads = GetResidualDropoutThreads(ctx, n); + auto threads = Get1DThreadsAndBlocks(ctx, n); if (n % VecSize == 0) { FusedResidualDropoutGradVec< T, MaskType, diff --git a/paddle/fluid/operators/fused/test_fused_residual_dropout_bias.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu similarity index 95% rename from paddle/fluid/operators/fused/test_fused_residual_dropout_bias.cu rename to paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu index 12c2fd6be68360..5cd20dce57855b 100644 --- a/paddle/fluid/operators/fused/test_fused_residual_dropout_bias.cu +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -29,9 +29,19 @@ limitations under the License. */ namespace framework = paddle::framework; namespace platform = paddle::platform; -USE_OP(elementwise_add); USE_OP(dropout); +/** + * @brief the unittest of fused_residual_dropout_bias + * 1. random input data + * 2. add bias, call paddle dropout op, add residual, and get the base result + * 3. call FusedResidualDropoutBias function get fused result + * 4. 
compare ther base result and fused result + */ + +/** + * @brief call paddle dropout op + */ template void Dropout(const T *x, const framework::DDim &x_dim, T *out, std::vector *mask, const platform::CUDADeviceContext &ctx, @@ -74,6 +84,9 @@ void Dropout(const T *x, const framework::DDim &x_dim, T *out, ctx.Wait(); } +/** + * @brief call paddle dropout_grad op + */ template void DropoutGrad(T *dx, const framework::DDim &x_dim, const T *dout, const uint8_t *mask, const platform::CUDADeviceContext &ctx, @@ -247,8 +260,9 @@ struct TestFusedResidualDropoutBias { } void FusedForward() { - auto threads = paddle::operators::GetResidualDropoutBiasThreads( + auto threads = paddle::operators::Get1DBlocksAnd2DGrids( *_ctx, (uint64_t)_rows, (uint64_t)_cols); + const int VecSize = 4; const int increment = ((_cols - 1) / (threads.first.x * threads.second.x * VecSize) + 1) * VecSize; @@ -258,7 +272,7 @@ struct TestFusedResidualDropoutBias { bias_ptr = _bias.data(); } if (_is_test) { - paddle::operators::LaunchResidualDropoutBiasTest( + paddle::operators::LaunchResidualDropoutBiasIsTest( _rows, _cols, _dropout_prob, _is_upscale_in_train, _src.data(), _residual.data(), bias_ptr, _out.data(), *_ctx); } else { @@ -351,14 +365,13 @@ TEST(FusedDropout, GPUFusedRedisualDorpoutBiasDouble) { test.CheckGrad(static_cast(1e-5)); } +// test fp16, For inference, check_grad is not required. ref: test_dropout_op.py TEST(FusedDropout, GPUFusedRedisualDorpoutBiasFp16) { const int rows = 16; const int cols = 16; TestFusedResidualDropoutBias test(rows, cols); test.Run(); test.CheckOut(static_cast(1e-2)); - // For inference, check_grad is not required. 
ref: test_dropout_op.py - // test.CheckGrad((platform::float16)1e-2); } // test no bias and cols % 4 == 0 From 93e063864f56f15e2e88cae77e6a5b5b633ab3e5 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 24 Aug 2021 12:35:20 +0000 Subject: [PATCH 04/19] fused_dropout: optimize code structure to facilitate reuse --- paddle/fluid/operators/fused/CMakeLists.txt | 2 +- paddle/fluid/operators/fused/fused_dropout.h | 12 ++ .../operators/fused/fused_dropout_test.h | 121 ++++++++++++++++++ .../fused/fused_residual_dropout_bias.h | 15 +-- .../fused/fused_residual_dropout_bias_test.cu | 100 +-------------- 5 files changed, 136 insertions(+), 114 deletions(-) create mode 100644 paddle/fluid/operators/fused/fused_dropout_test.h diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 78ff136c4d1038..f3035cddcba020 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -74,6 +74,6 @@ if (WITH_GPU OR WITH_ROCM) # fused_dropout # only support CUDA if(NOT WITH_ROCM) - nv_test(test_fused_residual_dropout_bias SRCS fused_residual_dropout_bias_test.cu DEPS tensor op_registry elementwise_add_op dropout_op device_context generator) + nv_test(test_fused_residual_dropout_bias SRCS fused_residual_dropout_bias_test.cu DEPS tensor op_registry dropout_op device_context generator) endif() endif() diff --git a/paddle/fluid/operators/fused/fused_dropout.h b/paddle/fluid/operators/fused/fused_dropout.h index bd6a4122f5830d..4188d935b9e458 100644 --- a/paddle/fluid/operators/fused/fused_dropout.h +++ b/paddle/fluid/operators/fused/fused_dropout.h @@ -66,5 +66,17 @@ struct alignas(sizeof(T) * VecSize) AlignedVector { T val[VecSize]; }; +// reduce sum by a warp +template +static __forceinline__ __device__ U WarpReduceSum(U val) { + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, true); + const int warpSize = 32; + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + val += 
paddle::platform::CudaShuffleDownSync(mask, val, offset); + } + return val; +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h new file mode 100644 index 00000000000000..6cb8cd19b608d1 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_dropout_test.h @@ -0,0 +1,121 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; + +USE_OP(dropout); + +/** + * @brief call paddle dropout op + */ +template +void Dropout(const T *x, const framework::DDim &x_dim, T *out, + std::vector *mask, const platform::CUDADeviceContext &ctx, + uint64_t seed, float dropout_prob, bool is_upscale_in_train, + bool is_test) { + framework::Scope scope; + auto var_x = scope.Var("X"); + auto tensor_x = var_x->GetMutable(); + tensor_x->Resize(x_dim); + tensor_x->mutable_data(ctx.GetPlace()); + cudaMemcpy(tensor_x->data(), x, x_dim[0] * x_dim[1] * sizeof(T), + cudaMemcpyHostToDevice); + + auto var_out = 
scope.Var("Out"); + auto tensor_out = var_out->GetMutable(); + + auto var_mask = scope.Var("Mask"); + auto tensor_mask = var_mask->GetMutable(); + + framework::AttributeMap attrs; + attrs.insert({"fix_seed", 1}); + attrs.insert({"seed", static_cast(seed)}); + attrs.insert({"dropout_prob", dropout_prob}); + if (is_upscale_in_train) { + attrs.insert({"dropout_implementation", std::string("upscale_in_train")}); + } + if (is_test) { + attrs.insert({"is_test", 1}); + } + + auto op = framework::OpRegistry::CreateOp( + "dropout", {{"X", {"X"}}}, {{"Out", {"Out"}}, {"Mask", {"Mask"}}}, attrs); + op->Run(scope, ctx.GetPlace()); + cudaMemcpy(out, tensor_out->data(), x_dim[0] * x_dim[1] * sizeof(T), + cudaMemcpyDeviceToHost); + if (!is_test) { + cudaMemcpy((*mask).data(), tensor_mask->data(), + x_dim[0] * x_dim[1] * sizeof(uint8_t), cudaMemcpyDeviceToHost); + } + ctx.Wait(); +} + +/** + * @brief call paddle dropout_grad op + */ +template +void DropoutGrad(T *dx, const framework::DDim &x_dim, const T *dout, + const uint8_t *mask, const platform::CUDADeviceContext &ctx, + float dropout_prob, bool is_upscale_in_train) { + framework::Scope scope; + const size_t n = x_dim[0] * x_dim[1]; + auto var_out = scope.Var("DOut"); + auto tensor_out = var_out->GetMutable(); + tensor_out->Resize(x_dim); + tensor_out->mutable_data(ctx.GetPlace()); + cudaMemcpy(tensor_out->data(), dout, n * sizeof(T), + cudaMemcpyHostToDevice); + + auto var_mask = scope.Var("Mask"); + auto tensor_mask = var_mask->GetMutable(); + tensor_mask->Resize(x_dim); + tensor_mask->mutable_data(ctx.GetPlace()); + cudaMemcpy(tensor_mask->data(), mask, n * sizeof(uint8_t), + cudaMemcpyHostToDevice); + + auto var_dx = scope.Var("DX"); + auto tensor_dx = var_dx->GetMutable(); + + framework::AttributeMap attrs; + attrs.insert({"dropout_prob", dropout_prob}); + attrs.insert({"is_test", 0}); + if (is_upscale_in_train) { + attrs.insert({"dropout_implementation", std::string("upscale_in_train")}); + } else { + 
attrs.insert({"dropout_implementation", std::string("downgrade_in_infer")}); + } + + auto op = framework::OpRegistry::CreateOp( + "dropout_grad", {{"Out@GRAD", {"DOut"}}, {"Mask", {"Mask"}}}, + {{"X@GRAD", {"DX"}}}, attrs); + op->Run(scope, ctx.GetPlace()); + + cudaMemcpy(dx, tensor_dx->data(), x_dim[0] * x_dim[1] * sizeof(T), + cudaMemcpyDeviceToHost); + ctx.Wait(); +} diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index 16747d7739be1e..ce9273dff0a993 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -118,9 +118,8 @@ __global__ void FusedResidualDropoutBiasIsTest(const size_t rows, using LoadT = AlignedVector; - const int tmp_cols = cols / VecSize * VecSize; for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { - for (int i = col_id * VecSize; i < tmp_cols; + for (int i = col_id * VecSize; i < cols; i += blockDim.x * gridDim.x * VecSize) { T src_vec[VecSize]; T residual_vec[VecSize]; @@ -249,17 +248,6 @@ __global__ void FusedResidualDropoutGradVec(const T *dout, const MaskType *mask, } } -template -static __forceinline__ __device__ U WarpReduceSum(U val) { - unsigned mask = 0u; - CREATE_SHFL_MASK(mask, true); - const int warpSize = 32; - for (int offset = warpSize / 2; offset > 0; offset /= 2) { - val += paddle::platform::CudaShuffleDownSync(mask, val, offset); - } - return val; -} - /** * blocks(128 * 8) * 1. 
calculate the dx and reduce total rows to 128 rows @@ -285,7 +273,6 @@ __global__ void FusedResidualDropoutBiasGradVec( T dx_vec[VecSize]; LoadT *out_value = reinterpret_cast(&out_vec); MaskLoadT *mask_value = reinterpret_cast(&mask_vec); - LoadT *dx_value = reinterpret_cast(&dx_vec); *out_value = *reinterpret_cast(&dout[index]); *mask_value = *reinterpret_cast(&mask[index]); diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu index 5cd20dce57855b..fa119f1132e8f6 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu @@ -17,20 +17,12 @@ limitations under the License. */ #include #include -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/fused/fused_dropout_test.h" #include "paddle/fluid/operators/fused/fused_residual_dropout_bias.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/string/printf.h" namespace framework = paddle::framework; namespace platform = paddle::platform; -USE_OP(dropout); - /** * @brief the unittest of fused_residual_dropout_bias * 1. random input data @@ -39,96 +31,6 @@ USE_OP(dropout); * 4. 
compare ther base result and fused result */ -/** - * @brief call paddle dropout op - */ -template -void Dropout(const T *x, const framework::DDim &x_dim, T *out, - std::vector *mask, const platform::CUDADeviceContext &ctx, - uint64_t seed, float dropout_prob, bool is_upscale_in_train, - bool is_test) { - framework::Scope scope; - auto var_x = scope.Var("X"); - auto tensor_x = var_x->GetMutable(); - tensor_x->Resize(x_dim); - tensor_x->mutable_data(ctx.GetPlace()); - cudaMemcpy(tensor_x->data(), x, x_dim[0] * x_dim[1] * sizeof(T), - cudaMemcpyHostToDevice); - - auto var_out = scope.Var("Out"); - auto tensor_out = var_out->GetMutable(); - - auto var_mask = scope.Var("Mask"); - auto tensor_mask = var_mask->GetMutable(); - - framework::AttributeMap attrs; - attrs.insert({"fix_seed", 1}); - attrs.insert({"seed", static_cast(seed)}); - attrs.insert({"dropout_prob", dropout_prob}); - if (is_upscale_in_train) { - attrs.insert({"dropout_implementation", std::string("upscale_in_train")}); - } - if (is_test) { - attrs.insert({"is_test", 1}); - } - - auto op = framework::OpRegistry::CreateOp( - "dropout", {{"X", {"X"}}}, {{"Out", {"Out"}}, {"Mask", {"Mask"}}}, attrs); - op->Run(scope, ctx.GetPlace()); - cudaMemcpy(out, tensor_out->data(), x_dim[0] * x_dim[1] * sizeof(T), - cudaMemcpyDeviceToHost); - if (!is_test) { - cudaMemcpy((*mask).data(), tensor_mask->data(), - x_dim[0] * x_dim[1] * sizeof(uint8_t), cudaMemcpyDeviceToHost); - } - ctx.Wait(); -} - -/** - * @brief call paddle dropout_grad op - */ -template -void DropoutGrad(T *dx, const framework::DDim &x_dim, const T *dout, - const uint8_t *mask, const platform::CUDADeviceContext &ctx, - float dropout_prob, bool is_upscale_in_train) { - framework::Scope scope; - const size_t n = x_dim[0] * x_dim[1]; - auto var_out = scope.Var("DOut"); - auto tensor_out = var_out->GetMutable(); - tensor_out->Resize(x_dim); - tensor_out->mutable_data(ctx.GetPlace()); - cudaMemcpy(tensor_out->data(), dout, n * sizeof(T), - 
cudaMemcpyHostToDevice); - - auto var_mask = scope.Var("Mask"); - auto tensor_mask = var_mask->GetMutable(); - tensor_mask->Resize(x_dim); - tensor_mask->mutable_data(ctx.GetPlace()); - cudaMemcpy(tensor_mask->data(), mask, n * sizeof(uint8_t), - cudaMemcpyHostToDevice); - - auto var_dx = scope.Var("DX"); - auto tensor_dx = var_dx->GetMutable(); - - framework::AttributeMap attrs; - attrs.insert({"dropout_prob", dropout_prob}); - attrs.insert({"is_test", 0}); - if (is_upscale_in_train) { - attrs.insert({"dropout_implementation", std::string("upscale_in_train")}); - } else { - attrs.insert({"dropout_implementation", std::string("downgrade_in_infer")}); - } - - auto op = framework::OpRegistry::CreateOp( - "dropout_grad", {{"Out@GRAD", {"DOut"}}, {"Mask", {"Mask"}}}, - {{"X@GRAD", {"DX"}}}, attrs); - op->Run(scope, ctx.GetPlace()); - - cudaMemcpy(dx, tensor_dx->data(), x_dim[0] * x_dim[1] * sizeof(T), - cudaMemcpyDeviceToHost); - ctx.Wait(); -} - template struct TestFusedResidualDropoutBias { uint32_t _rows; From 036b4307f58ddfbdeb7b815b138cd59cb09cd600 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 25 Aug 2021 11:59:17 +0000 Subject: [PATCH 05/19] optimize code structure to facilitate reuse --- paddle/fluid/operators/fused/fused_dropout.h | 12 - .../fused/fused_residual_dropout_bias.h | 258 ++++++++++-------- .../fused/fused_residual_dropout_bias_test.cu | 38 ++- 3 files changed, 159 insertions(+), 149 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_dropout.h b/paddle/fluid/operators/fused/fused_dropout.h index 4188d935b9e458..bd6a4122f5830d 100644 --- a/paddle/fluid/operators/fused/fused_dropout.h +++ b/paddle/fluid/operators/fused/fused_dropout.h @@ -66,17 +66,5 @@ struct alignas(sizeof(T) * VecSize) AlignedVector { T val[VecSize]; }; -// reduce sum by a warp -template -static __forceinline__ __device__ U WarpReduceSum(U val) { - unsigned mask = 0u; - CREATE_SHFL_MASK(mask, true); - const int warpSize = 32; - for (int offset = warpSize / 2; offset > 
0; offset /= 2) { - val += paddle::platform::CudaShuffleDownSync(mask, val, offset); - } - return val; -} - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index ce9273dff0a993..0a263635e46029 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/operators/fused/fused_dropout.h" +#include "paddle/fluid/operators/layer_norm_kernel.cu.h" namespace paddle { namespace operators { @@ -22,7 +23,71 @@ namespace operators { namespace platform = paddle::platform; namespace cg = cooperative_groups; +/** + * @brief fused the add_bias, dropout, add residual into one operators + * + */ + /********Forward**************/ +/** + * @brief the fused function called by every thread + */ +template +__forceinline__ __device__ void FusedResidualDropoutBiasVecOneThread( + const int row_id, const int col_id, const int cols, + curandStatePhilox4_32_10_t *state, const float dropout_prob, const T factor, + const T *src, const T *residual, const T *bias, T *dst, MaskType *mask, + U *mean_val, U *var_val) { + using LoadT = AlignedVector; + using MaskLoadT = AlignedVector; + T src_vec[VecSize]; + T residual_vec[VecSize]; + T bias_vec[VecSize]; +#pragma unroll + for (int ii = 0; ii < VecSize; ii++) { + bias_vec[ii] = static_cast(0); + } + // vectorize load data from global + LoadT *value = reinterpret_cast(&src_vec); + LoadT *residual_value = reinterpret_cast(&residual_vec); + *value = *reinterpret_cast(&src[row_id * cols + col_id]); + *residual_value = + *reinterpret_cast(&residual[row_id * cols + col_id]); + + LoadT *bias_value = + bias != nullptr ? 
reinterpret_cast(&bias_vec) : nullptr; + if (bias != nullptr) + *bias_value = *reinterpret_cast(&bias[col_id]); + + float4 rand = curand_uniform4(state); + T dest_vec[VecSize]; + MaskType mask_vec[VecSize]; + +#pragma unroll + for (int ii = 0; ii < VecSize; ii++) { + mask_vec[ii] = (MaskType)((&rand.x)[ii] >= dropout_prob); + } + +#pragma unroll + for (int ii = 0; ii < VecSize; ii++) { + dest_vec[ii] = + (src_vec[ii] + bias_vec[ii]) * static_cast(mask_vec[ii]) * factor + + residual_vec[ii]; + if (layer_norm) { + U tmp = static_cast(dest_vec[ii]); + *mean_val += tmp; + *var_val += (tmp * tmp); + } + } + + // store result to global + *(reinterpret_cast(&dst[row_id * cols + col_id])) = + *reinterpret_cast(&dest_vec[0]); + *(reinterpret_cast(&mask[row_id * cols + col_id])) = + *reinterpret_cast(&mask_vec[0]); +} + /** * @brief dst = residual + dropout(src + bias); * the src, residual, mask and dst shape is (rows, cols) @@ -46,67 +111,71 @@ __global__ void FusedResidualDropoutBiasVec(const size_t rows, if (!is_upscale_in_train) { factor = static_cast(1.0f); } - using LoadT = AlignedVector; - using MaskLoadT = AlignedVector; for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { for (int i = col_id * VecSize; i < cols; i += blockDim.x * gridDim.x * VecSize) { - T src_vec[VecSize]; - T residual_vec[VecSize]; - T bias_vec[VecSize]; -#pragma unroll - for (int ii = 0; ii < VecSize; ii++) { - bias_vec[ii] = static_cast(0); - } - // vectorize load data from global - LoadT *value = reinterpret_cast(&src_vec); - LoadT *residual_value = reinterpret_cast(&residual_vec); - *value = *reinterpret_cast(&src[r * cols + i]); - *residual_value = - *reinterpret_cast(&residual[r * cols + i]); - - LoadT *bias_value = - bias != nullptr ? 
reinterpret_cast(&bias_vec) : nullptr; - if (bias != nullptr) - *bias_value = *reinterpret_cast(&bias[i]); - - float4 rand = curand_uniform4(&state); - T dest_vec[VecSize]; - MaskType mask_vec[VecSize]; + FusedResidualDropoutBiasVecOneThread( + r, i, cols, &state, dropout_prob, factor, src, residual, bias, dst, + mask, NULL, NULL); + } + } +} +/** + * @brief the fused function called by every thread + */ +template +__forceinline__ __device__ void FusedResidualDropoutBiasOnlyInferVecOneThread( + const int row_id, const int col_id, const int cols, + const float dropout_prob, const T factor, const T *src, const T *residual, + const T *bias, T *dst, U *mean_val, U *var_val) { + using LoadT = AlignedVector; + T src_vec[VecSize]; + T residual_vec[VecSize]; + T bias_vec[VecSize]; #pragma unroll - for (int ii = 0; ii < VecSize; ii++) { - mask_vec[ii] = (MaskType)((&rand.x)[ii] >= dropout_prob); - } + for (int ii = 0; ii < VecSize; ii++) { + bias_vec[ii] = static_cast(0); + } + // vectorize load data from global + LoadT *value = reinterpret_cast(&src_vec); + LoadT *residual_value = reinterpret_cast(&residual_vec); + *value = *reinterpret_cast(&src[row_id * cols + col_id]); + *residual_value = + *reinterpret_cast(&residual[row_id * cols + col_id]); -#pragma unroll - for (int ii = 0; ii < VecSize; ii++) { - dest_vec[ii] = (src_vec[ii] + bias_vec[ii]) * - static_cast(mask_vec[ii]) * factor + - residual_vec[ii]; - } + LoadT *bias_value = + bias != nullptr ? 
reinterpret_cast(&bias_vec) : nullptr; + if (bias != nullptr) + *bias_value = *reinterpret_cast(&bias[col_id]); + + T dest_vec[VecSize]; - // store result to global - *(reinterpret_cast(&dst[r * cols + i])) = - *reinterpret_cast(&dest_vec[0]); - *(reinterpret_cast(&mask[r * cols + i])) = - *reinterpret_cast(&mask_vec[0]); +#pragma unroll + for (int ii = 0; ii < VecSize; ii++) { + dest_vec[ii] = (src_vec[ii] + bias_vec[ii]) * factor + residual_vec[ii]; + if (layer_norm) { + U tmp = static_cast(dest_vec[ii]); + *mean_val += tmp; + *var_val += (tmp * tmp); } } + + // store result to global + *(reinterpret_cast(&dst[row_id * cols + col_id])) = + *reinterpret_cast(&dest_vec[0]); } /** - * @brief for dropout's param is_test = true + * @brief for dropout's param is_test = true, only used in inference * the src, residual and dst shape is (rows, cols) * the bias shape is (1, cols) */ template -__global__ void FusedResidualDropoutBiasIsTest(const size_t rows, - const size_t cols, - const float dropout_prob, - const bool is_upscale_in_train, - const T *src, const T *residual, - const T *bias, T *dst) { +__global__ void FusedResidualDropoutBiasOnlyInferVec( + const size_t rows, const size_t cols, const float dropout_prob, + const bool is_upscale_in_train, const T *src, const T *residual, + const T *bias, T *dst) { int col_id = blockDim.x * blockIdx.x + threadIdx.x; int row_id = blockIdx.y; int idx = row_id * cols + col_id; @@ -116,39 +185,12 @@ __global__ void FusedResidualDropoutBiasIsTest(const size_t rows, factor = static_cast(1.0f); } - using LoadT = AlignedVector; - for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { for (int i = col_id * VecSize; i < cols; i += blockDim.x * gridDim.x * VecSize) { - T src_vec[VecSize]; - T residual_vec[VecSize]; - T bias_vec[VecSize]; -#pragma unroll - for (int ii = 0; ii < VecSize; ii++) { - bias_vec[ii] = static_cast(0); - } - // vectorize load data from global - LoadT *value = reinterpret_cast(&src_vec); - LoadT 
*residual_value = reinterpret_cast(&residual_vec); - *value = *reinterpret_cast(&src[r * cols + i]); - *residual_value = - *reinterpret_cast(&residual[r * cols + i]); - - LoadT *bias_value = - bias != nullptr ? reinterpret_cast(&bias_vec) : nullptr; - if (bias != nullptr) - *bias_value = *reinterpret_cast(&bias[i]); - - T dest_vec[VecSize]; -#pragma unroll - for (int ii = 0; ii < VecSize; ii++) { - dest_vec[ii] = (src_vec[ii] + bias_vec[ii]) * factor + residual_vec[ii]; - } - - // store result to global - *(reinterpret_cast(&dst[r * cols + i])) = - *reinterpret_cast(&dest_vec[0]); + FusedResidualDropoutBiasOnlyInferVecOneThread( + r, i, cols, dropout_prob, factor, src, residual, bias, dst, nullptr, + nullptr); } } } @@ -159,7 +201,7 @@ __global__ void FusedResidualDropoutBiasIsTest(const size_t rows, template void LaunchResidualDropoutBias(const uint32_t rows, const uint32_t cols, const int increment, uint64_t seed, - const float dropout_prob, + const float dropout_prob, const bool is_test, bool is_upscale_in_train, const T *src, const T *residual, const T *bias, MaskType *mask_data, T *dst, @@ -176,46 +218,32 @@ void LaunchResidualDropoutBias(const uint32_t rows, const uint32_t cols, const int VecSize = 4; auto threads = Get1DBlocksAnd2DGrids(ctx, rows, cols); - if (cols % VecSize != 0) - FusedResidualDropoutBiasVec< - T, uint8_t, 1><<>>( - rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, - bias, mask_data, dst, increment); - else - FusedResidualDropoutBiasVec< - T, uint8_t, - VecSize><<>>( - rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, - bias, mask_data, dst, increment); -} - -/** - *@brief to launch kernel FusedResidualDropoutBiasIsTest - */ -template -void LaunchResidualDropoutBiasIsTest(const uint32_t rows, const uint32_t cols, - const float dropout_prob, - bool is_upscale_in_train, const T *src, - const T *residual, const T *bias, T *dst, - const platform::CUDADeviceContext &ctx) { - if (std::abs(dropout_prob - 1.0f) 
< 1e-5) { - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMemcpyAsync(dst, residual, rows * cols * sizeof(T), - cudaMemcpyDeviceToDevice, ctx.stream())); - return; + if (cols % VecSize != 0) { + if (!is_test) { + FusedResidualDropoutBiasVec< + T, uint8_t, 1><<>>( + rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, + bias, mask_data, dst, increment); + } else { + FusedResidualDropoutBiasOnlyInferVec< + T, 1><<>>( + rows, cols, dropout_prob, is_upscale_in_train, src, residual, bias, + dst); + } + } else { + if (!is_test) { + FusedResidualDropoutBiasVec< + T, uint8_t, + VecSize><<>>( + rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, + bias, mask_data, dst, increment); + } else { + FusedResidualDropoutBiasOnlyInferVec< + T, VecSize><<>>( + rows, cols, dropout_prob, is_upscale_in_train, src, residual, bias, + dst); + } } - const int VecSize = 4; - auto threads = Get1DBlocksAnd2DGrids(ctx, rows, cols); - if (cols % VecSize != 0) - FusedResidualDropoutBiasIsTest< - T, 1><<>>( - rows, cols, dropout_prob, is_upscale_in_train, src, residual, bias, - dst); - else - FusedResidualDropoutBiasIsTest< - T, VecSize><<>>( - rows, cols, dropout_prob, is_upscale_in_train, src, residual, bias, - dst); } /********Backward**************/ diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu index fa119f1132e8f6..d5377194934ff6 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu @@ -43,7 +43,7 @@ struct TestFusedResidualDropoutBias { framework::Tensor _src, _residual, _bias, _out, _mask; framework::Tensor _dsrc, _dbias; - std::vector _src_vec, _residual_vec, _bias_vec, _out_vec, _mask_vec; + std::vector _src_vec, _residual_vec, _bias_vec; std::vector _correct_out, _correct_dsrc, _correct_dbias; std::vector _correct_mask; @@ -173,16 +173,10 @@ struct TestFusedResidualDropoutBias 
{ if (_has_bias) { bias_ptr = _bias.data(); } - if (_is_test) { - paddle::operators::LaunchResidualDropoutBiasIsTest( - _rows, _cols, _dropout_prob, _is_upscale_in_train, _src.data(), - _residual.data(), bias_ptr, _out.data(), *_ctx); - } else { - paddle::operators::LaunchResidualDropoutBias( - _rows, _cols, increment, _seed, _dropout_prob, _is_upscale_in_train, - _src.data(), _residual.data(), bias_ptr, _mask.data(), - _out.data(), *_ctx); - } + paddle::operators::LaunchResidualDropoutBias( + _rows, _cols, increment, _seed, _dropout_prob, _is_test, + _is_upscale_in_train, _src.data(), _residual.data(), bias_ptr, + _mask.data(), _out.data(), *_ctx); _ctx->Wait(); } @@ -249,7 +243,7 @@ struct TestFusedResidualDropoutBias { } }; -TEST(FusedDropout, GPUFusedRedisualDorpoutBias) { +TEST(FusedDropout, GPUFusedResidualDropoutBias) { const int rows = 16; const int cols = 16; TestFusedResidualDropoutBias test(rows, cols); @@ -258,7 +252,7 @@ TEST(FusedDropout, GPUFusedRedisualDorpoutBias) { test.CheckGrad(static_cast(1e-5)); } -TEST(FusedDropout, GPUFusedRedisualDorpoutBiasDouble) { +TEST(FusedDropout, GPUFusedResidualDropoutBiasDouble) { const int rows = 16; const int cols = 16; TestFusedResidualDropoutBias test(rows, cols); @@ -268,7 +262,7 @@ TEST(FusedDropout, GPUFusedRedisualDorpoutBiasDouble) { } // test fp16, For inference, check_grad is not required. 
ref: test_dropout_op.py -TEST(FusedDropout, GPUFusedRedisualDorpoutBiasFp16) { +TEST(FusedDropout, GPUFusedResidualDropoutBiasFp16) { const int rows = 16; const int cols = 16; TestFusedResidualDropoutBias test(rows, cols); @@ -277,7 +271,7 @@ TEST(FusedDropout, GPUFusedRedisualDorpoutBiasFp16) { } // test no bias and cols % 4 == 0 -TEST(FusedDropout, GPUFusedRedisualDorpoutBiasNoBias) { +TEST(FusedDropout, GPUFusedResidualDropoutBiasNoBias) { const int rows = 16; const int cols = 16; TestFusedResidualDropoutBias test(rows, cols); @@ -288,7 +282,7 @@ TEST(FusedDropout, GPUFusedRedisualDorpoutBiasNoBias) { } // test no bias and cols % 4 != 0 -TEST(FusedDropout, GPUFusedRedisualDorpoutBiasNoBias2) { +TEST(FusedDropout, GPUFusedResidualDropoutBiasNoBias2) { const int rows = 16; const int cols = 17; TestFusedResidualDropoutBias test(rows, cols); @@ -299,7 +293,7 @@ TEST(FusedDropout, GPUFusedRedisualDorpoutBiasNoBias2) { } // test add bias and cols % 4 != 0 -TEST(FusedDropout, GPUFusedRedisualDorpoutBias2) { +TEST(FusedDropout, GPUFusedResidualDropoutBias2) { const int rows = 16; const int cols = 17; TestFusedResidualDropoutBias test(rows, cols); @@ -308,7 +302,7 @@ TEST(FusedDropout, GPUFusedRedisualDorpoutBias2) { test.CheckGrad(static_cast(1e-5)); } -TEST(FusedDropout, GPUFusedRedisualDorpoutBias3) { +TEST(FusedDropout, GPUFusedResidualDropoutBias3) { const int rows = 16; const int cols = 16; TestFusedResidualDropoutBias test(rows, cols, 0, 1.0, false, false); @@ -317,7 +311,7 @@ TEST(FusedDropout, GPUFusedRedisualDorpoutBias3) { test.CheckGrad(static_cast(1e-5)); } -TEST(FusedDropout, GPUFusedRedisualDorpoutBias4) { +TEST(FusedDropout, GPUFusedResidualDropoutBias4) { const int rows = 16; const int cols = 16; TestFusedResidualDropoutBias test(rows, cols, 0, 1.0, false, false); @@ -326,7 +320,7 @@ TEST(FusedDropout, GPUFusedRedisualDorpoutBias4) { test.CheckGrad(static_cast(1e-5)); } -TEST(FusedDropout, GPUFusedRedisualDorpoutBias5) { +TEST(FusedDropout, 
GPUFusedResidualDropoutBias5) { const int rows = 16; const int cols = 16; TestFusedResidualDropoutBias test(rows, cols, 0, 1.0, true, false); @@ -335,7 +329,7 @@ TEST(FusedDropout, GPUFusedRedisualDorpoutBias5) { test.CheckGrad(static_cast(1e-5)); } -TEST(FusedDropout, GPUFusedRedisualDorpoutBias6) { +TEST(FusedDropout, GPUFusedResidualDropoutBias6) { const int rows = 16; const int cols = 16; TestFusedResidualDropoutBias test(rows, cols, 0, 0.35, true, true); @@ -344,7 +338,7 @@ TEST(FusedDropout, GPUFusedRedisualDorpoutBias6) { test.CheckGrad(static_cast(1e-5)); } -TEST(FusedDropout, GPUFusedRedisualDorpoutBias7) { +TEST(FusedDropout, GPUFusedResidualDropoutBias7) { const int rows = 16; const int cols = 16; TestFusedResidualDropoutBias test(rows, cols, 125, 0.0, false, false); From 6755aeacb50ab656174159344c6a0d838cdba3a1 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Fri, 27 Aug 2021 11:37:43 +0000 Subject: [PATCH 06/19] Add a new op: paddle.linalg.multi_dot --- paddle/fluid/operators/multi_dot_op.cc | 658 ++++++++++++++++++ python/paddle/__init__.py | 1 + .../tests/unittests/test_multi_dot_op.py | 298 ++++++++ .../white_list/check_shape_white_list.py | 1 + .../white_list/no_grad_set_white_list.py | 3 +- .../white_list/op_accuracy_white_list.py | 3 +- python/paddle/linalg.py | 4 +- python/paddle/tensor/linalg.py | 68 ++ 8 files changed, 1033 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/operators/multi_dot_op.cc create mode 100644 python/paddle/fluid/tests/unittests/test_multi_dot_op.py diff --git a/paddle/fluid/operators/multi_dot_op.cc b/paddle/fluid/operators/multi_dot_op.cc new file mode 100644 index 00000000000000..59c43ea2f95b6e --- /dev/null +++ b/paddle/fluid/operators/multi_dot_op.cc @@ -0,0 +1,658 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/strided_memcpy.h" +#include "paddle/fluid/operators/utils.h" + +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +/** + * @brief compute the output shape and check the input shape valid or not + */ +inline framework::DDim ComputeAndCheckShape( + const bool is_runtime, const std::vector& inputs_dims) { + const size_t n = inputs_dims.size(); + auto first_dim = inputs_dims[0]; + + bool is_vector = false; + framework::DDim out_dim; + + if (first_dim.size() > 2) { + PADDLE_THROW(platform::errors::InvalidArgument( + "multi_dot: the first input tensor must be 1D or 2D but got[%d]!", + static_cast(first_dim.size()))); + } + + // If the first tensor is 1D of size n view it as a row vector (1, n) + if (first_dim.size() == 1) { + first_dim = framework::make_ddim({1, static_cast(first_dim[0])}); + is_vector = true; + } + + auto last_dim = inputs_dims[n - 1]; + if (last_dim.size() > 2) { + PADDLE_THROW(platform::errors::InvalidArgument( + "the last input tensor of multi_dot op must be 1D or 2D but got[%d]!", + static_cast(last_dim.size()))); + } + + // If the last tensor is 1D of size n view it as a column vector (n, 1) + if (last_dim.size() == 1) { + last_dim = framework::make_ddim({static_cast(last_dim[0]), 1}); + if (is_vector) { + out_dim = 
framework::make_ddim({1}); + } else { + out_dim = framework::make_ddim({first_dim[0]}); + } + } else { + if (is_vector) { + out_dim = framework::make_ddim({last_dim[1]}); + } else { + out_dim = framework::make_ddim({first_dim[0], last_dim[1]}); + } + } + + auto width = first_dim[1]; + for (size_t i = 1; i < n - 1; i++) { + PADDLE_ENFORCE_EQ(inputs_dims[i].size(), static_cast(2), + platform::errors::InvalidArgument( + "the input tensor of multi_dot op must be 2D.")); + + const auto& tmp_dim = inputs_dims[i]; + PADDLE_ENFORCE_EQ(tmp_dim[0], width, + platform::errors::InvalidArgument( + "the input tensor of multi_dot op must be 2D.")); + width = tmp_dim[1]; + } + PADDLE_ENFORCE_EQ(last_dim[0], width, + platform::errors::InvalidArgument( + "the input tensor of multi_dot op must be 2D.")); + + return out_dim; +} + +/** + * @brief the matrix multiplication + */ +template +inline framework::Tensor MatMul(const framework::ExecutionContext& ctx, + const framework::Tensor& matrix_a, + const framework::Tensor& matrix_b, + const framework::DDim& a_dim, + const framework::DDim& b_dim) { + auto place = ctx.GetPlace(); + auto blas = math::GetBlas(ctx); + + framework::Tensor matrix_c; + framework::DDim c_dim = framework::make_ddim({a_dim[0], b_dim[1]}); + matrix_c.mutable_data(place, c_dim[0] * c_dim[1] * sizeof(T)); + matrix_c.Resize(c_dim); + + auto mat_dim_a = math::CreateMatrixDescriptor(a_dim, 0, false); + auto mat_dim_b = math::CreateMatrixDescriptor(b_dim, 0, false); + const T alpha = static_cast(1.0); + blas.MatMul(matrix_a, mat_dim_a, matrix_b, mat_dim_b, alpha, &matrix_c, T(0)); + return matrix_c; +} + +/** + * @brief multi matrix dot by a chain order + * @param + * ins: the input tensors + * ins_dims: the shape of ins after reshape + * order: the optimal order + * i: the left of sub chain + * j: the righe of sub chain + * save_result: set true by backward + * results: save the intermediate result during backward + */ +template +inline framework::Tensor MatChainMul( + 
const framework::ExecutionContext& ctx, + const std::vector& ins, + const std::vector& ins_dims, + const std::vector& order, const uint64_t i, const uint64_t j, + const bool save_result, std::vector* results) { + if (i == j) { + return *ins[i]; + } + + const auto A = MatChainMul(ctx, ins, ins_dims, order, i, + order[i * ins.size() + j], + save_result, results); + framework::DDim a_dim = A.dims(); + if (i == order[i * ins.size() + j]) { + a_dim = ins_dims[i]; + } + + const auto B = MatChainMul(ctx, ins, ins_dims, order, + order[i * ins.size() + j] + 1, j, + save_result, results); + framework::DDim b_dim = B.dims(); + if (j == order[i * ins.size() + j] + 1) { + b_dim = ins_dims[j]; + } + + auto result = MatMul(ctx, A, B, a_dim, b_dim); + if (save_result) { + (*results)[i * ins.size() + j] = result; + } + return result; +} + +/** + * @brief get the optimal order + */ +std::vector GetOrder(const std::vector& ins, + const std::vector& ins_dims) { + auto n = ins.size(); + std::vector p(n + 1); + for (uint64_t i = 0; i < n; i++) { + p[i] = ins_dims[i][0]; + } + p[n] = ins_dims[n - 1][1]; + + std::vector m(n * n, 0); + std::vector order(n * n); + + for (uint64_t l = 1; l < n; l++) { + for (uint64_t i = 0; i < n - l; i++) { + auto j = i + l; + m[i * n + j] = 0xffffffff; + for (uint64_t k = i; k < j; k++) { + uint64_t q = + m[i * n + k] + m[(k + 1) * n + j] + p[i] * p[k + 1] * p[j + 1]; + if (q < m[i * n + j]) { + m[i * n + j] = q; + order[i * n + j] = k; + } + } + } + } + return order; +} + +template +static inline framework::Tensor MultiDotMatChainOrder( + const framework::ExecutionContext& ctx, + const std::vector& ins, + const std::vector& ins_dims, const bool save_result, + std::vector* results) { + auto order = GetOrder(ins, ins_dims); + auto n = ins.size(); + return MatChainMul(ctx, ins, ins_dims, order, 0, n - 1, + save_result, results); +} + +inline void GetDims(const std::vector& ins, + std::vector* ins_dims) { + const auto n = ins.size(); + std::vector real_ins; + 
for (size_t i = 0; i < n; i++) { + (*ins_dims)[i] = ins[i]->dims(); + if (i == 0 && (*ins_dims)[i].size() == 1) { + (*ins_dims)[i] = framework::make_ddim({1, (*ins_dims)[i][0]}); + } else if (i == n - 1 && (*ins_dims)[i].size() == 1) { + (*ins_dims)[i] = framework::make_ddim({(*ins_dims)[i][0], 1}); + } + } +} + +class MultiDotOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input tensors of multi_dot operator.").AsDuplicable(); + AddOutput("Out", "The output tensor of multi_dot operator"); + AddAttr( + "use_mkldnn", + "(bool, default false) Indicates if MKL-DNN kernel will be used") + .SetDefault(false); + AddComment(R"DOC( +Compute the dot product of two or more arrays in a single function call, while automatically selecting the fastest evaluation order. + +multi_dot chains MatMul and uses optimal parenthesization of the matrices [1] [2]. Depending on the shapes of the matrices, this can speed up the multiplication a lot. + +If the first argument is 1-D it is treated as a row vector. If the last argument is 1-D it is treated as a column vector. The other arguments must be 2-D. 
+ )DOC"); + } +}; + +class MultiDotOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "multi_dot"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "multi_dot"); + + auto inputs_dims = ctx->GetInputsDim("X"); + + const size_t inputs_num = inputs_dims.size(); + PADDLE_ENFORCE_GT( + inputs_num, static_cast(1), + platform::errors::InvalidArgument( + "The number of input tensors in multi_dot op should > 1.")); + auto out_dims = ComputeAndCheckShape(ctx->IsRuntime(), inputs_dims); + ctx->SetOutputDim("Out", out_dims); + ctx->ShareLoD("X", "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto inputs = ctx.MultiInput("X"); + auto input_data_type = framework::proto::VarType::Type(0); + bool flag = 1; + for (auto* input : inputs) { + if (!input->IsInitialized() || input->numel() == 0) { + flag = 0; + break; + } + } + if (flag == 0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "All Inputs of multi_dot OP are Empty!")); + } + input_data_type = inputs[0]->type(); + +#ifdef PADDLE_WITH_MKLDNN + using mkldnn::memory; + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const { + if (framework::IsComplexType(expected_kernel_type.data_type_)) { + // only promote inputs’s types when contains complex input + return framework::OpKernelType(tensor.type(), tensor.place(), + tensor.layout()); + } else { + return 
framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } + } +}; + +template +class MultiDotKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto ins = ctx.MultiInput("X"); + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + out->mutable_data(place); + + auto blas = math::GetBlas(ctx); + + auto n = ins.size(); + std::vector ins_dims(n); + GetDims(ins, &ins_dims); + + const T scale = static_cast(1.0); + if (n == 2) { + auto mat_dim_a = math::CreateMatrixDescriptor(ins_dims[0], 0, false); + auto mat_dim_b = math::CreateMatrixDescriptor(ins_dims[1], 0, false); + blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, out, T(0)); + } else if (n == 3) { + const auto Ma = ins_dims[0][0]; + const auto Ka = ins_dims[0][1]; + const auto Nb = ins_dims[1][1]; + const auto Nc = ins_dims[2][1]; + const uint64_t cost1 = + Ma * Nb * (Ka + Nc); // Ma * Ka * Nb + Ma * Nb * Nc; + const uint64_t cost2 = + Ka * Nc * (Nb + Ma); // Ka * Nb * Nc + Ma * Ka * Nc; + auto mat_dim_a = math::CreateMatrixDescriptor(ins_dims[0], 0, false); + auto mat_dim_b = math::CreateMatrixDescriptor(ins_dims[1], 0, false); + auto mat_dim_c = math::CreateMatrixDescriptor(ins_dims[2], 0, false); + if (cost1 < cost2) { + framework::Tensor tmp_out; + tmp_out.mutable_data(place, Ma * Nb * sizeof(T)); + framework::DDim tmp_dim = ins_dims[0]; + tmp_dim[1] = Nb; + blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, &tmp_out, + T(0)); + auto mat_dim_tmp = math::CreateMatrixDescriptor(tmp_dim, 0, false); + blas.MatMul(tmp_out, mat_dim_tmp, *ins[2], mat_dim_c, scale, out, T(0)); + } else { + framework::Tensor tmp_out; + tmp_out.mutable_data(place, Ka * Nc * sizeof(T)); + framework::DDim tmp_dim = ins_dims[1]; + tmp_dim[1] = Nc; + blas.MatMul(*ins[1], mat_dim_b, *ins[2], mat_dim_c, scale, &tmp_out, + T(0)); + auto mat_dim_tmp = math::CreateMatrixDescriptor(tmp_dim, 0, false); + 
blas.MatMul(*ins[0], mat_dim_a, tmp_out, mat_dim_tmp, scale, out, T(0)); + } + } else { + std::vector results; + const auto tmp = MultiDotMatChainOrder( + ctx, ins, ins_dims, false, &results); + auto out_dim = out->dims(); + // TensorCopy(tmp, place, ctx.device_context(), out); + *out = tmp; + out->Resize(out_dim); + } + } +}; + +class MultiDotOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "multi_dot"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + "Out@GRAD", "multi_dot"); + + auto in_x = "X"; + auto out_x_g_n = framework::GradVarName(in_x); + auto ins_dims = ctx->GetInputsDim(in_x); + ctx->SetOutputsDim(out_x_g_n, ins_dims); + ctx->ShareAllLoD(in_x, out_x_g_n); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } +}; + +template +class MultiDotGradKernel : public framework::OpKernel { + public: + /** + * @brief calculate dA and dB + * dA = dout * transpose(B) + * dB = transpose(A) * dout + */ + void CalcGrad(const framework::ExecutionContext& ctx, + const framework::Tensor& dout, const framework::Tensor& A, + const framework::Tensor& B, const framework::DDim& dout_dim, + const framework::DDim& a_dim, const framework::DDim& b_dim, + framework::Tensor* dA, framework::Tensor* dB) const { + auto mat_dim_dout = math::CreateMatrixDescriptor(dout_dim, 0, false); + auto 
mat_dim_a = math::CreateMatrixDescriptor(a_dim, 0, true);
+    auto mat_dim_b = math::CreateMatrixDescriptor(b_dim, 0, true);
+    T alpha = static_cast(1.0);
+    auto blas = math::GetBlas(ctx);
+
+    blas.MatMul(A, mat_dim_a, dout, mat_dim_dout, alpha, dB, T(0));
+    blas.MatMul(dout, mat_dim_dout, B, mat_dim_b, alpha, dA, T(0));
+  }
+
+  /**
+   * @brief calculate multi matrix multiplication grad by a chain order
+   * @param
+   * dout: the grad of multi matrix multiplication out
+   * dx: the out grad of inputs
+   * ins: the input tensors
+   * ins_dims: the shape of ins after reshape
+   * order: the optimal order
+   * i: the left of sub chain
+   * j: the right of sub chain
+   * results: the intermediate result of forward
+   */
+  void MatChainMulGrad(const framework::ExecutionContext& ctx,
+                       const framework::Tensor& dout,
+                       std::vector* dx,
+                       const std::vector& ins,
+                       const framework::DDim& dout_dim,
+                       const std::vector& ins_dims,
+                       const std::vector& order, const uint64_t i,
+                       const uint64_t j,
+                       const std::vector& results) const {
+    if (i == j) {
+      *((*dx)[i]) = dout;
+      return;
+    }
+
+    const auto n = ins.size();
+    const auto right = order[i * n + j];
+    const auto left = order[i * n + j] + 1;
+    // get the multi result of left sub chain
+    const auto* A = &results[i * n + right];
+    framework::DDim a_dim = A->dims();
+    if (i == right) {
+      A = ins[i];
+      a_dim = ins_dims[i];
+    }
+    // get the multi result of right sub chain
+    const auto* B = &results[left * n + j];
+    framework::DDim b_dim = B->dims();
+    if (left == j) {
+      B = ins[j];
+      b_dim = ins_dims[j];
+    }
+    framework::Tensor dA, dB;
+    dA.Resize({dout_dim[0], b_dim[0]});
+    dB.Resize({a_dim[1], dout_dim[1]});
+    dA.mutable_data(ctx.GetPlace());
+    dB.mutable_data(ctx.GetPlace());
+
+    CalcGrad(ctx, dout, *A, *B, dout_dim, a_dim, b_dim, &dA, &dB);
+
+    MatChainMulGrad(ctx, dA, dx, ins, dA.dims(), ins_dims, order, i, right,
+                    results);
+    MatChainMulGrad(ctx, dB, dx, ins, dB.dims(), ins_dims, order, left, j,
+                    results);
+  }
+
+  void 
MultiDotGradMatChainOrder(
+      const framework::ExecutionContext& ctx, const framework::Tensor& dout,
+      const std::vector& ins,
+      const framework::DDim& dout_dim,
+      const std::vector& ins_dims,
+      std::vector* dx) const {
+    auto order = GetOrder(ins, ins_dims);
+    auto n = ins.size();
+    std::vector results(n * n);
+    // call the forward, get the intermediate result
+    MatChainMul(ctx, ins, ins_dims, order, 0, n - 1, true,
+                &results);
+    MatChainMulGrad(ctx, dout, dx, ins, dout_dim, ins_dims, order, 0, n - 1,
+                    results);
+  }
+
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto ins = ctx.MultiInput("X");
+    auto dout = *ctx.Input(framework::GradVarName("Out"));
+    auto dx = ctx.MultiOutput(framework::GradVarName("X"));
+
+    auto blas = math::GetBlas(ctx);
+    auto place = ctx.GetPlace();
+
+    const auto n = ins.size();
+    for (size_t i = 0; i < n; i++) {
+      dx[i]->mutable_data(place);
+    }
+
+    std::vector ins_dims(n);
+    GetDims(ins, &ins_dims);
+
+    framework::DDim dout_dim = dout.dims();
+    if (ins[0]->dims().size() == 1 && ins[n - 1]->dims().size() == 1) {
+      dout_dim = framework::make_ddim({1, 1});
+    } else if (ins[0]->dims().size() == 1) {
+      if (dout_dim.size() == 1) {
+        dout_dim = framework::make_ddim({1, dout_dim[0]});
+      }
+    } else if (ins[n - 1]->dims().size() == 1) {
+      if (dout_dim.size() == 1) {
+        dout_dim = framework::make_ddim({dout_dim[0], 1});
+      }
+    }
+
+    T alpha = static_cast(1);
+    auto mat_dim_dout = math::CreateMatrixDescriptor(dout_dim, 0, false);
+    if (n == 2) {
+      CalcGrad(ctx, dout, *ins[0], *ins[1], dout_dim, ins_dims[0], ins_dims[1],
+               dx[0], dx[1]);
+    } else if (n == 3) {
+      const auto Ma = ins_dims[0][0];
+      const auto Ka = ins_dims[0][1];
+      const auto Nb = ins_dims[1][1];
+      const auto Nc = ins_dims[2][1];
+      const uint64_t cost1 =
+          Ma * Nb * (Ka + Nc);  // Ma * Ka * Nb + Ma * Nb * Nc;
+      const uint64_t cost2 =
+          Ka * Nc * (Nb + Ma);  // Ka * Nb * Nc + Ma * Ka * Nc;
+      auto mat_dim_a = math::CreateMatrixDescriptor(ins_dims[0], 0, false);
+      auto 
mat_dim_b = math::CreateMatrixDescriptor(ins_dims[1], 0, false); + auto mat_dim_c = math::CreateMatrixDescriptor(ins_dims[2], 0, false); + if (cost1 < cost2) { + framework::Tensor tmp_out, tmp_dout; + tmp_out.Resize({Ma, Nb}); + tmp_out.mutable_data(place); + tmp_dout.Resize({mat_dim_dout.height_, Nb}); + tmp_dout.mutable_data(place); + // tmp_out = A * B + blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, alpha, &tmp_out, + T(0)); + + /* + * dC = dout * transpose(tmp_out) + * tmp_dout = dout * transpose(C) + */ + CalcGrad(ctx, dout, tmp_out, *ins[2], dout_dim, tmp_out.dims(), + ins_dims[2], &tmp_dout, dx[2]); + + /* + * dA = tmp_dout * transpose(B) + * dB = tmp_dout * transpose(A) + */ + CalcGrad(ctx, tmp_dout, *ins[0], *ins[1], tmp_dout.dims(), ins_dims[0], + ins_dims[1], dx[0], dx[1]); + } else { + framework::Tensor tmp_out, tmp_dout; + tmp_out.Resize({Ka, Nc}); + tmp_out.mutable_data(place); + tmp_dout.Resize({Ka, mat_dim_dout.width_}); + tmp_dout.mutable_data(place); + blas.MatMul(*ins[1], mat_dim_b, *ins[2], mat_dim_c, alpha, &tmp_out, + T(0)); + + /* + * dA = dout * transpose(tmp_out) + * tmp_out = dout * transpose(A) + */ + CalcGrad(ctx, dout, *ins[0], tmp_out, dout_dim, ins_dims[0], + tmp_dout.dims(), dx[0], &tmp_dout); + + /* + * dB = tmp_dout * transpose(C) + * dC = tmp_dout * transpose(B) + */ + CalcGrad(ctx, tmp_dout, *ins[1], *ins[2], tmp_dout.dims(), ins_dims[1], + ins_dims[2], dx[1], dx[2]); + } + } else { + MultiDotGradMatChainOrder(ctx, dout, ins, dout_dim, ins_dims, &dx); + if (ins[n - 1]->dims().size() == 1) { + dx[n - 1]->Resize({dx[n - 1]->dims()[0]}); + } + } + } +}; + +template +class MultiDotOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("multi_dot_grad"); + op->SetInput("X", this->Input("X")); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + 
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X", false)); + } +}; +template +class MultiDotOpDoubleGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("multi_dot"); + grad_op->SetInput("X", this->Input(("X"))); + grad_op->SetInput("DOut", this->Input(framework::GradVarName("Out"))); + grad_op->SetOutput("DDx", this->OutputGrad(framework::GradVarName("X"))); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(multi_dot, ops::MultiDotOp, ops::MultiDotOpMaker, + ops::MultiDotOpGradMaker, + ops::MultiDotOpGradMaker); +REGISTER_OPERATOR(multi_dot_grad, ops::MultiDotOpGrad, + ops::MultiDotOpDoubleGradMaker, + ops::MultiDotOpDoubleGradMaker); + +REGISTER_OP_CPU_KERNEL( + multi_dot, ops::MultiDotKernel, + ops::MultiDotKernel); +REGISTER_OP_CPU_KERNEL( + multi_dot_grad, + ops::MultiDotGradKernel, + ops::MultiDotGradKernel); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +REGISTER_OP_CUDA_KERNEL( + multi_dot, ops::MultiDotKernel, + ops::MultiDotKernel, + ops::MultiDotKernel); +REGISTER_OP_CUDA_KERNEL( + multi_dot_grad, + ops::MultiDotGradKernel, + ops::MultiDotGradKernel, + ops::MultiDotGradKernel); +#endif diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 907a667cb6ba78..48a7a5c143300a 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -99,6 +99,7 @@ from .tensor.linalg import bmm # noqa: F401 from .tensor.linalg import histogram # noqa: F401 from .tensor.linalg import mv # noqa: F401 +from .tensor.linalg import multi_dot # noqa: F401 from .tensor.logic import equal # noqa: F401 from .tensor.logic import greater_equal # noqa: F401 from .tensor.logic import greater_than # noqa: F401 diff --git a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py 
b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py new file mode 100644 index 00000000000000..71aa99a0f0f03e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py @@ -0,0 +1,298 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest, skip_check_grad_ci +from numpy.linalg import multi_dot +from op_test import OpTest +import paddle +from paddle.fluid import Program, program_guard +import paddle.fluid as fluid + +paddle.enable_static() + + +class TestMultiDotOp(OpTest): + def setUp(self): + self.op_type = "multi_dot" + self.dtype = self.get_dtype() + self.get_inputs_and_outputs() + + def get_dtype(self): + return "float32" + + def get_inputs_and_outputs(self): + self.A = np.random.random((2, 8)).astype(self.dtype) + self.B = np.random.random((8, 4)).astype(self.dtype) + self.inputs = {'X': [('x0', self.A), ('x1', self.B)]} + self.outputs = {'Out': multi_dot([self.A, self.B])} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['x0'], 'Out', max_relative_error=1e-3) + self.check_grad(['x1'], 'Out', max_relative_error=1e-3) + + +class TestMultiDotOpDouble(TestMultiDotOp): + def get_dtype(self): + return "float64" + + +#(A*B)*C +class TestMultiDotOp3Mat(TestMultiDotOp): + def get_inputs_and_outputs(self): + self.A = np.random.random((2, 10)).astype(self.dtype) + 
self.B = np.random.random((10, 4)).astype(self.dtype) + self.C = np.random.random((4, 3)).astype(self.dtype) + self.inputs = {'X': [('x0', self.A), ('x1', self.B), ('x2', self.C)]} + self.outputs = {'Out': multi_dot([self.A, self.B, self.C])} + + def test_check_grad(self): + self.check_grad(['x0'], 'Out', max_relative_error=1e-3) + self.check_grad(['x1'], 'Out', max_relative_error=1e-3) + self.check_grad(['x2'], 'Out', max_relative_error=1e-3) + + +#A*(B*C) +class TestMultiDotOp3Mat2(TestMultiDotOp): + def get_inputs_and_outputs(self): + self.A = np.random.random((3, 4)).astype(self.dtype) + self.B = np.random.random((4, 8)).astype(self.dtype) + self.C = np.random.random((8, 2)).astype(self.dtype) + self.inputs = {'X': [('x0', self.A), ('x1', self.B), ('x2', self.C)]} + self.outputs = {'Out': multi_dot([self.A, self.B, self.C])} + + def test_check_grad(self): + self.check_grad(['x0'], 'Out', max_relative_error=1e-3) + self.check_grad(['x1'], 'Out', max_relative_error=1e-3) + self.check_grad(['x2'], 'Out', max_relative_error=1e-3) + + +class TestMultiDotOp4Mat(TestMultiDotOp): + def get_inputs_and_outputs(self): + self.A = np.random.random((8, 6)).astype(self.dtype) + self.B = np.random.random((6, 3)).astype(self.dtype) + self.C = np.random.random((3, 4)).astype(self.dtype) + self.D = np.random.random((4, 5)).astype(self.dtype) + self.inputs = { + 'X': + [('x0', self.A), ('x1', self.B), ('x2', self.C), ('x3', self.D)] + } + self.outputs = {'Out': multi_dot([self.A, self.B, self.C, self.D])} + + def test_check_grad(self): + self.check_grad(['x0'], 'Out', max_relative_error=1e-3) + self.check_grad(['x1'], 'Out', max_relative_error=1e-3) + self.check_grad(['x2'], 'Out', max_relative_error=1e-3) + self.check_grad(['x3'], 'Out', max_relative_error=1e-3) + + +class TestMultiDotOpFirst1D(TestMultiDotOp): + def get_inputs_and_outputs(self): + self.A = np.random.random((4)).astype(self.dtype) + self.B = np.random.random((4, 3)).astype(self.dtype) + self.inputs = {'X': 
[('x0', self.A), ('x1', self.B)]} + self.outputs = {'Out': multi_dot([self.A, self.B])} + + +class TestMultiDotOp3MatFirst1D(TestMultiDotOp): + def get_inputs_and_outputs(self): + self.A = np.random.random((4)).astype(self.dtype) + self.B = np.random.random((4, 3)).astype(self.dtype) + self.C = np.random.random((3, 3)).astype(self.dtype) + self.inputs = {'X': [('x0', self.A), ('x1', self.B), ('x2', self.C)]} + self.outputs = {'Out': multi_dot([self.A, self.B, self.C])} + + def test_check_grad(self): + self.check_grad(['x0'], 'Out', max_relative_error=1e-3) + self.check_grad(['x1'], 'Out', max_relative_error=1e-3) + self.check_grad(['x2'], 'Out', max_relative_error=1e-3) + + +class TestMultiDotOp4MatFirst1D(TestMultiDotOp): + def get_inputs_and_outputs(self): + self.A = np.random.random((4)).astype(self.dtype) + self.B = np.random.random((4, 3)).astype(self.dtype) + self.C = np.random.random((3, 4)).astype(self.dtype) + self.D = np.random.random((4, 5)).astype(self.dtype) + self.inputs = { + 'X': + [('x0', self.A), ('x1', self.B), ('x2', self.C), ('x3', self.D)] + } + self.outputs = {'Out': multi_dot([self.A, self.B, self.C, self.D])} + + def test_check_grad(self): + self.check_grad(['x0'], 'Out', max_relative_error=1e-3) + self.check_grad(['x1'], 'Out', max_relative_error=1e-3) + self.check_grad(['x2'], 'Out', max_relative_error=1e-3) + self.check_grad(['x3'], 'Out', max_relative_error=1e-3) + + +class TestMultiDotOpLast1D(TestMultiDotOp): + def get_inputs_and_outputs(self): + self.A = np.random.random((3, 6)).astype(self.dtype) + self.B = np.random.random((6)).astype(self.dtype) + self.inputs = {'X': [('x0', self.A), ('x1', self.B)]} + self.outputs = {'Out': multi_dot([self.A, self.B])} + + +class TestMultiDotOp3MatLast1D(TestMultiDotOp): + def get_inputs_and_outputs(self): + self.A = np.random.random((2, 4)).astype(self.dtype) + self.B = np.random.random((4, 3)).astype(self.dtype) + self.C = np.random.random((3)).astype(self.dtype) + self.inputs = {'X': [('x0', 
self.A), ('x1', self.B), ('x2', self.C)]} + self.outputs = {'Out': multi_dot([self.A, self.B, self.C])} + + def test_check_grad(self): + self.check_grad(['x0'], 'Out', max_relative_error=1e-3) + self.check_grad(['x1'], 'Out', max_relative_error=1e-3) + self.check_grad(['x2'], 'Out', max_relative_error=1e-3) + + +class TestMultiDotOp4MatLast1D(TestMultiDotOp): + def get_inputs_and_outputs(self): + self.A = np.random.random((2, 3)).astype(self.dtype) + self.B = np.random.random((3, 2)).astype(self.dtype) + self.C = np.random.random((2, 3)).astype(self.dtype) + self.D = np.random.random((3)).astype(self.dtype) + self.inputs = { + 'X': + [('x0', self.A), ('x1', self.B), ('x2', self.C), ('x3', self.D)] + } + self.outputs = {'Out': multi_dot([self.A, self.B, self.C, self.D])} + + def test_check_grad(self): + self.check_grad(['x0'], 'Out', max_relative_error=1e-3) + self.check_grad(['x1'], 'Out', max_relative_error=1e-3) + self.check_grad(['x2'], 'Out', max_relative_error=1e-3) + self.check_grad(['x3'], 'Out', max_relative_error=1e-3) + + +class TestMultiDotOpFirstAndLast1D(TestMultiDotOp): + def get_inputs_and_outputs(self): + self.A = np.random.random((4, )).astype(self.dtype) + self.B = np.random.random((4)).astype(self.dtype) + self.inputs = {'X': [('x0', self.A), ('x1', self.B)]} + self.outputs = {'Out': multi_dot([self.A, self.B])} + + def test_check_grad(self): + self.check_grad(['x0'], 'Out', max_relative_error=1e-3) + self.check_grad(['x1'], 'Out', max_relative_error=1e-3) + + +class TestMultiDotOp3MatFirstAndLast1D(TestMultiDotOp): + def get_inputs_and_outputs(self): + self.A = np.random.random((6, )).astype(self.dtype) + self.B = np.random.random((6, 4)).astype(self.dtype) + self.C = np.random.random((4)).astype(self.dtype) + self.inputs = {'X': [('x0', self.A), ('x1', self.B), ('x2', self.C)]} + self.outputs = {'Out': multi_dot([self.A, self.B, self.C])} + + def test_check_grad(self): + self.check_grad(['x0'], 'Out', max_relative_error=1e-3) + 
self.check_grad(['x1'], 'Out', max_relative_error=1e-3) + self.check_grad(['x2'], 'Out', max_relative_error=1e-3) + + +class TestMultiDotOp4MatFirstAndLast1D(TestMultiDotOp): + def get_inputs_and_outputs(self): + self.A = np.random.random((3, )).astype(self.dtype) + self.B = np.random.random((3, 4)).astype(self.dtype) + self.C = np.random.random((4, 2)).astype(self.dtype) + self.D = np.random.random((2)).astype(self.dtype) + self.inputs = { + 'X': + [('x0', self.A), ('x1', self.B), ('x2', self.C), ('x3', self.D)] + } + self.outputs = {'Out': multi_dot([self.A, self.B, self.C, self.D])} + + def test_check_grad(self): + self.check_grad(['x0'], 'Out', max_relative_error=1e-3) + self.check_grad(['x1'], 'Out', max_relative_error=1e-3) + self.check_grad(['x2'], 'Out', max_relative_error=1e-3) + self.check_grad(['x3'], 'Out', max_relative_error=1e-3) + + +#####python API test####### +class TestMultiDotOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + # The inputs type of multi_dot must be list matrix. + input1 = 12 + self.assertRaises(TypeError, paddle.multi_dot, [input1, input1]) + + # The inputs dtype of multi_dot must be float32, float64 or float16. 
+ input2 = fluid.layers.data( + name='input2', shape=[10, 10], dtype="int32") + self.assertRaises(TypeError, paddle.multi_dot, [input2, input2]) + + # the number of tensor must be larger than 1 + x0 = fluid.data(name='x0', shape=[3, 2], dtype="float32") + self.assertRaises(ValueError, paddle.multi_dot, [x0]) + + #the first tensor must be 1D or 2D + x1 = fluid.data(name='x1', shape=[3, 2, 3], dtype="float32") + x2 = fluid.data(name='x2', shape=[3, 2], dtype="float32") + self.assertRaises(ValueError, paddle.multi_dot, [x1, x2]) + + #the last tensor must be 1D or 2D + x3 = fluid.data(name='x3', shape=[3, 2], dtype="float32") + x4 = fluid.data(name='x4', shape=[3, 2, 2], dtype="float32") + self.assertRaises(ValueError, paddle.multi_dot, [x3, x4]) + + #the tensor must be 2D, except first and last tensor + x5 = fluid.data(name='x5', shape=[3, 2], dtype="float32") + x6 = fluid.data(name='x6', shape=[2], dtype="float32") + x7 = fluid.data(name='x7', shape=[2, 2], dtype="float32") + self.assertRaises(ValueError, paddle.multi_dot, [x5, x6, x7]) + + +class API_TestMultiDot(unittest.TestCase): + def test_out(self): + with fluid.program_guard(fluid.Program()): + x0 = fluid.data(name='x0', shape=[3, 2], dtype="float32") + x1 = fluid.data(name='x1', shape=[2, 3], dtype='float32') + result = paddle.multi_dot([x0, x1]) + exe = fluid.Executor(fluid.CPUPlace()) + data1 = np.random.rand(3, 2).astype("float32") + data2 = np.random.rand(2, 3).astype("float32") + np_res = exe.run(feed={'x0': data1, + 'x1': data2}, + fetch_list=[result]) + expected_result = np.linalg.multi_dot([data1, data2]) + + self.assertTrue( + np.allclose( + np_res, expected_result, atol=1e-5), + "two value is\ + {}\n{}, check diff!".format(np_res, expected_result)) + + def test_dygraph_without_out(self): + device = fluid.CPUPlace() + with fluid.dygraph.guard(device): + input_array1 = np.random.rand(3, 4).astype("float64") + input_array2 = np.random.rand(4, 3).astype("float64") + data1 = 
fluid.dygraph.to_variable(input_array1) + data2 = fluid.dygraph.to_variable(input_array2) + out = paddle.multi_dot([data1, data2]) + expected_result = np.linalg.multi_dot([input_array1, input_array2]) + self.assertTrue(np.allclose(expected_result, out.numpy())) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py b/python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py index 15f28d94c70d12..626ea6c2ae0a30 100644 --- a/python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py @@ -28,4 +28,5 @@ 'cvm', 'cudnn_lstm', 'rnn', + 'multi_dot', ] diff --git a/python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py b/python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py index 15ba331e9de5a3..dd7e800053042a 100644 --- a/python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py @@ -65,6 +65,7 @@ 'rank_loss', 'sequence_conv', 'smooth_l1_loss', - 'spectral_norm' + 'spectral_norm', + 'multi_dot', ] # yapf: enable diff --git a/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py b/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py index 581656f6cd421b..9db2752ab23217 100644 --- a/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py @@ -76,7 +76,8 @@ 'trilinear_interp_v2', \ 'var_conv_2d', \ 'warpctc', \ - 'bilateral_slice' + 'bilateral_slice', + 'multi_dot' ] NO_FP16_CHECK_GRAD_OP_LIST = [ diff --git a/python/paddle/linalg.py b/python/paddle/linalg.py index 5cef01d18aca48..fb0dd92acc5fd5 100644 --- a/python/paddle/linalg.py +++ b/python/paddle/linalg.py @@ -15,9 +15,11 @@ from .tensor.linalg import cholesky # noqa: F401 from 
.tensor.linalg import norm  # noqa: F401
 from .tensor import inverse as inv  # noqa: F401
+from .tensor.linalg import multi_dot  # noqa: F401
 
 __all__ = [
     'cholesky',  #noqa
     'norm',
-    'inv'
+    'inv',
+    'multi_dot'
 ]
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index a1610581b67c03..7e36a4841c9559 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -941,3 +941,71 @@ def __check_input(x, vec):
         type='mv', inputs={'X': x, 'Vec': vec}, outputs={'Out': out})
     return out
+
+
+def multi_dot(x, name=None):
+    """
+    Compute the dot product of two or more matrices in a single function call, while automatically selecting the fastest evaluation order.
+
+    Supports inputs of float, double and float16 dtypes. This function does not support batched inputs.
+
+    Every tensor in x must be 2D, except for the first and last which may be 1D. If the first tensor is a 1D vector of shape(n, ) it is treated as a row vector of shape(1, n); similarly, if the last tensor is a 1D vector of shape(n, ), it is treated as a column vector of shape(n, 1).
+    If the first and last tensors are matrices, the output will be a matrix. However, if either is a 1D vector, then the output will be a 1D vector.
+
+    Notes:
+    The cost of multiplying two matrices with shapes (a, b) and (b, c) is a * b * c. Given matrices A, B, C with shapes (10, 100), (100, 5), (5, 50) respectively, we can calculate the cost of different multiplication orders as follows:
+    Cost((AB)C) = 10x100x5 + 10x5x50 = 7500
+    Cost(A(BC)) = 10x100x50 + 100x5x50 = 75000
+
+    In this case, multiplying A and B first followed by C is 10 times faster.
+
+    Args:
+        x ([Tensor]): The input tensors, given as a list of Tensor.
+
+    Returns:
+        Tensor: The output Tensor.
+
+
+    Examples:
+
+    .. 
code-block:: python + + import paddle + import numpy as np + + # A * B + A_data = np.random.random([3, 4]).astype(np.float32) + B_data = np.random.random([4, 5]).astype(np.float32) + A = paddle.to_tensor(A_data) + B = paddle.to_tensor(B_data) + out = paddle.multi_dot([A, B]) + print(out.numpy().shape) + # [3, 5] + + # A * B * C + A_data = np.random.random([10, 5]).astype(np.float32) + B_data = np.random.random([5, 8]).astype(np.float32) + C_data = np.random.random([8, 7]).astype(np.float32) + A = paddle.to_tensor(A_data) + B = paddle.to_tensor(B_data) + C = paddle.to_tensor(C_data) + out = paddle.multi_dot([A, B, C]) + print(out.numpy().shape) + # [10, 7] + """ + if in_dygraph_mode(): + return _C_ops.multi_dot(x) + + check_type(x, 'x', (list, tuple), 'multi_dot') + for id, item in enumerate(x): + check_variable_and_dtype(item, 'x[' + str(id) + ']', + ['float16', 'float32', 'float64'], 'multi_dot') + if item.dtype != x[0].dtype: + raise TypeError( + "All the Tensors in the input must have the same data type.") + + helper = LayerHelper('multi_dot', **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_variable_for_type_inference(dtype) + helper.append_op(type='multi_dot', inputs={"X": x}, outputs={"Out": out}) + return out From 1cc8aad1545ce5e23d6bece3c4116a728bbf26d5 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 30 Aug 2021 06:29:50 +0000 Subject: [PATCH 07/19] fix the ci problem --- paddle/fluid/operators/multi_dot_op.cc | 70 +++------ .../tests/unittests/test_multi_dot_op.py | 98 ++++++------ .../white_list/no_grad_set_white_list.py | 3 +- .../white_list/op_accuracy_white_list.py | 3 +- python/paddle/linalg.py | 2 +- python/paddle/tensor/__init__.py | 1 + python/paddle/tensor/linalg.py | 143 +++++++++--------- 7 files changed, 143 insertions(+), 177 deletions(-) diff --git a/paddle/fluid/operators/multi_dot_op.cc b/paddle/fluid/operators/multi_dot_op.cc index 59c43ea2f95b6e..6682248f8eff1c 100644 --- 
a/paddle/fluid/operators/multi_dot_op.cc +++ b/paddle/fluid/operators/multi_dot_op.cc @@ -63,17 +63,11 @@ inline framework::DDim ComputeAndCheckShape( // If the last tensor is 1D of size n view it as a column vector (n, 1) if (last_dim.size() == 1) { last_dim = framework::make_ddim({static_cast(last_dim[0]), 1}); - if (is_vector) { - out_dim = framework::make_ddim({1}); - } else { - out_dim = framework::make_ddim({first_dim[0]}); - } + out_dim = is_vector ? framework::make_ddim({1}) + : framework::make_ddim({first_dim[0]}); } else { - if (is_vector) { - out_dim = framework::make_ddim({last_dim[1]}); - } else { - out_dim = framework::make_ddim({first_dim[0], last_dim[1]}); - } + out_dim = is_vector ? framework::make_ddim({last_dim[1]}) + : framework::make_ddim({first_dim[0], last_dim[1]}); } auto width = first_dim[1]; @@ -83,21 +77,21 @@ inline framework::DDim ComputeAndCheckShape( "the input tensor of multi_dot op must be 2D.")); const auto& tmp_dim = inputs_dims[i]; - PADDLE_ENFORCE_EQ(tmp_dim[0], width, - platform::errors::InvalidArgument( - "the input tensor of multi_dot op must be 2D.")); + PADDLE_ENFORCE_EQ( + tmp_dim[0], width, + platform::errors::InvalidArgument( + "the input matrix does not meet the multiplication requirements.")); width = tmp_dim[1]; } - PADDLE_ENFORCE_EQ(last_dim[0], width, - platform::errors::InvalidArgument( - "the input tensor of multi_dot op must be 2D.")); + + PADDLE_ENFORCE_EQ( + last_dim[0], width, + platform::errors::InvalidArgument( + "the input matrix does not meet the multiplication requirements.")); return out_dim; } -/** - * @brief the matrix multiplication - */ template inline framework::Tensor MatMul(const framework::ExecutionContext& ctx, const framework::Tensor& matrix_a, @@ -109,8 +103,8 @@ inline framework::Tensor MatMul(const framework::ExecutionContext& ctx, framework::Tensor matrix_c; framework::DDim c_dim = framework::make_ddim({a_dim[0], b_dim[1]}); - matrix_c.mutable_data(place, c_dim[0] * c_dim[1] * sizeof(T)); 
matrix_c.Resize(c_dim); + matrix_c.mutable_data(place); auto mat_dim_a = math::CreateMatrixDescriptor(a_dim, 0, false); auto mat_dim_b = math::CreateMatrixDescriptor(b_dim, 0, false); @@ -330,18 +324,15 @@ class MultiDotKernel : public framework::OpKernel { const auto Ka = ins_dims[0][1]; const auto Nb = ins_dims[1][1]; const auto Nc = ins_dims[2][1]; - const uint64_t cost1 = - Ma * Nb * (Ka + Nc); // Ma * Ka * Nb + Ma * Nb * Nc; - const uint64_t cost2 = - Ka * Nc * (Nb + Ma); // Ka * Nb * Nc + Ma * Ka * Nc; + const uint64_t cost1 = Ma * Nb * (Ka + Nc); + const uint64_t cost2 = Ka * Nc * (Nb + Ma); auto mat_dim_a = math::CreateMatrixDescriptor(ins_dims[0], 0, false); auto mat_dim_b = math::CreateMatrixDescriptor(ins_dims[1], 0, false); auto mat_dim_c = math::CreateMatrixDescriptor(ins_dims[2], 0, false); if (cost1 < cost2) { framework::Tensor tmp_out; tmp_out.mutable_data(place, Ma * Nb * sizeof(T)); - framework::DDim tmp_dim = ins_dims[0]; - tmp_dim[1] = Nb; + framework::DDim tmp_dim = framework::make_ddim({Ma, Nb}); blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, &tmp_out, T(0)); auto mat_dim_tmp = math::CreateMatrixDescriptor(tmp_dim, 0, false); @@ -349,8 +340,7 @@ class MultiDotKernel : public framework::OpKernel { } else { framework::Tensor tmp_out; tmp_out.mutable_data(place, Ka * Nc * sizeof(T)); - framework::DDim tmp_dim = ins_dims[1]; - tmp_dim[1] = Nc; + framework::DDim tmp_dim = framework::make_ddim({Ka, Nc}); blas.MatMul(*ins[1], mat_dim_b, *ins[2], mat_dim_c, scale, &tmp_out, T(0)); auto mat_dim_tmp = math::CreateMatrixDescriptor(tmp_dim, 0, false); @@ -361,7 +351,6 @@ class MultiDotKernel : public framework::OpKernel { const auto tmp = MultiDotMatChainOrder( ctx, ins, ins_dims, false, &results); auto out_dim = out->dims(); - // TensorCopy(tmp, place, ctx.device_context(), out); *out = tmp; out->Resize(out_dim); } @@ -473,7 +462,6 @@ class MultiDotGradKernel : public framework::OpKernel { dB.mutable_data(ctx.GetPlace()); CalcGrad(ctx, dout, 
*A, *B, dout_dim, a_dim, b_dim, &dA, &dB); - MatChainMulGrad(ctx, dA, dx, ins, dA.dims(), ins_dims, order, i, right, results); MatChainMulGrad(ctx, dB, dx, ins, dB.dims(), ins_dims, order, left, j, @@ -489,7 +477,6 @@ class MultiDotGradKernel : public framework::OpKernel { auto order = GetOrder(ins, ins_dims); auto n = ins.size(); std::vector results(n * n); - // call the forward, get the itermediate result MatChainMul(ctx, ins, ins_dims, order, 0, n - 1, true, &results); MatChainMulGrad(ctx, dout, dx, ins, dout_dim, ins_dims, order, 0, n - 1, @@ -548,21 +535,10 @@ class MultiDotGradKernel : public framework::OpKernel { tmp_out.mutable_data(place); tmp_dout.Resize({mat_dim_dout.height_, Nb}); tmp_dout.mutable_data(place); - // tmp_out = A * B blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, alpha, &tmp_out, T(0)); - - /* - * dC = dout * transpose(tmp_out) - * tmp_dout = dout * transpose(C) - */ CalcGrad(ctx, dout, tmp_out, *ins[2], dout_dim, tmp_out.dims(), ins_dims[2], &tmp_dout, dx[2]); - - /* - * dA = tmp_dout * transpose(B) - * dB = tmp_dout * transpose(A) - */ CalcGrad(ctx, tmp_dout, *ins[0], *ins[1], tmp_dout.dims(), ins_dims[0], ins_dims[1], dx[0], dx[1]); } else { @@ -573,18 +549,8 @@ class MultiDotGradKernel : public framework::OpKernel { tmp_dout.mutable_data(place); blas.MatMul(*ins[1], mat_dim_b, *ins[2], mat_dim_c, alpha, &tmp_out, T(0)); - - /* - * dA = dout * transpose(tmp_out) - * tmp_out = dout * transpose(A) - */ CalcGrad(ctx, dout, *ins[0], tmp_out, dout_dim, ins_dims[0], tmp_dout.dims(), dx[0], &tmp_dout); - - /* - * dB = tmp_dout * transpose(C) - * dC = tmp_dout * transpose(B) - */ CalcGrad(ctx, tmp_dout, *ins[1], *ins[2], tmp_dout.dims(), ins_dims[1], ins_dims[2], dx[1], dx[2]); } diff --git a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py index 71aa99a0f0f03e..35482144c272dd 100644 --- a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py @@ -31,7 +31,7 @@ def setUp(self): self.get_inputs_and_outputs() def get_dtype(self): - return "float32" + return "float64" def get_inputs_and_outputs(self): self.A = np.random.random((2, 8)).astype(self.dtype) @@ -43,8 +43,8 @@ def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['x0'], 'Out', max_relative_error=1e-3) - self.check_grad(['x1'], 'Out', max_relative_error=1e-3) + self.check_grad(['x0'], 'Out') + self.check_grad(['x1'], 'Out') class TestMultiDotOpDouble(TestMultiDotOp): @@ -62,9 +62,9 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B, self.C])} def test_check_grad(self): - self.check_grad(['x0'], 'Out', max_relative_error=1e-3) - self.check_grad(['x1'], 'Out', max_relative_error=1e-3) - self.check_grad(['x2'], 'Out', max_relative_error=1e-3) + self.check_grad(['x0'], 'Out') + self.check_grad(['x1'], 'Out') + self.check_grad(['x2'], 'Out') #A*(B*C) @@ -77,9 +77,9 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B, self.C])} def test_check_grad(self): - self.check_grad(['x0'], 'Out', max_relative_error=1e-3) - self.check_grad(['x1'], 'Out', max_relative_error=1e-3) - self.check_grad(['x2'], 'Out', max_relative_error=1e-3) + self.check_grad(['x0'], 'Out') + self.check_grad(['x1'], 'Out') + self.check_grad(['x2'], 'Out') class TestMultiDotOp4Mat(TestMultiDotOp): @@ -95,10 +95,10 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B, self.C, self.D])} def test_check_grad(self): - self.check_grad(['x0'], 'Out', max_relative_error=1e-3) - self.check_grad(['x1'], 'Out', max_relative_error=1e-3) - self.check_grad(['x2'], 'Out', max_relative_error=1e-3) - self.check_grad(['x3'], 'Out', max_relative_error=1e-3) + self.check_grad(['x0'], 'Out') + self.check_grad(['x1'], 'Out') + self.check_grad(['x2'], 'Out') + self.check_grad(['x3'], 'Out') class 
TestMultiDotOpFirst1D(TestMultiDotOp): @@ -118,9 +118,9 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B, self.C])} def test_check_grad(self): - self.check_grad(['x0'], 'Out', max_relative_error=1e-3) - self.check_grad(['x1'], 'Out', max_relative_error=1e-3) - self.check_grad(['x2'], 'Out', max_relative_error=1e-3) + self.check_grad(['x0'], 'Out') + self.check_grad(['x1'], 'Out') + self.check_grad(['x2'], 'Out') class TestMultiDotOp4MatFirst1D(TestMultiDotOp): @@ -136,10 +136,10 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B, self.C, self.D])} def test_check_grad(self): - self.check_grad(['x0'], 'Out', max_relative_error=1e-3) - self.check_grad(['x1'], 'Out', max_relative_error=1e-3) - self.check_grad(['x2'], 'Out', max_relative_error=1e-3) - self.check_grad(['x3'], 'Out', max_relative_error=1e-3) + self.check_grad(['x0'], 'Out') + self.check_grad(['x1'], 'Out') + self.check_grad(['x2'], 'Out') + self.check_grad(['x3'], 'Out') class TestMultiDotOpLast1D(TestMultiDotOp): @@ -159,9 +159,9 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B, self.C])} def test_check_grad(self): - self.check_grad(['x0'], 'Out', max_relative_error=1e-3) - self.check_grad(['x1'], 'Out', max_relative_error=1e-3) - self.check_grad(['x2'], 'Out', max_relative_error=1e-3) + self.check_grad(['x0'], 'Out') + self.check_grad(['x1'], 'Out') + self.check_grad(['x2'], 'Out') class TestMultiDotOp4MatLast1D(TestMultiDotOp): @@ -177,10 +177,10 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B, self.C, self.D])} def test_check_grad(self): - self.check_grad(['x0'], 'Out', max_relative_error=1e-3) - self.check_grad(['x1'], 'Out', max_relative_error=1e-3) - self.check_grad(['x2'], 'Out', max_relative_error=1e-3) - self.check_grad(['x3'], 'Out', max_relative_error=1e-3) + self.check_grad(['x0'], 'Out') + self.check_grad(['x1'], 'Out') + self.check_grad(['x2'], 
'Out') + self.check_grad(['x3'], 'Out') class TestMultiDotOpFirstAndLast1D(TestMultiDotOp): @@ -191,8 +191,8 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B])} def test_check_grad(self): - self.check_grad(['x0'], 'Out', max_relative_error=1e-3) - self.check_grad(['x1'], 'Out', max_relative_error=1e-3) + self.check_grad(['x0'], 'Out') + self.check_grad(['x1'], 'Out') class TestMultiDotOp3MatFirstAndLast1D(TestMultiDotOp): @@ -204,9 +204,9 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B, self.C])} def test_check_grad(self): - self.check_grad(['x0'], 'Out', max_relative_error=1e-3) - self.check_grad(['x1'], 'Out', max_relative_error=1e-3) - self.check_grad(['x2'], 'Out', max_relative_error=1e-3) + self.check_grad(['x0'], 'Out') + self.check_grad(['x1'], 'Out') + self.check_grad(['x2'], 'Out') class TestMultiDotOp4MatFirstAndLast1D(TestMultiDotOp): @@ -222,10 +222,10 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B, self.C, self.D])} def test_check_grad(self): - self.check_grad(['x0'], 'Out', max_relative_error=1e-3) - self.check_grad(['x1'], 'Out', max_relative_error=1e-3) - self.check_grad(['x2'], 'Out', max_relative_error=1e-3) - self.check_grad(['x3'], 'Out', max_relative_error=1e-3) + self.check_grad(['x0'], 'Out') + self.check_grad(['x1'], 'Out') + self.check_grad(['x2'], 'Out') + self.check_grad(['x3'], 'Out') #####python API test####### @@ -236,41 +236,41 @@ def test_errors(self): input1 = 12 self.assertRaises(TypeError, paddle.multi_dot, [input1, input1]) - # The inputs dtype of multi_dot must be float32, float64 or float16. + # The inputs dtype of multi_dot must be float16, float32 or float64. 
input2 = fluid.layers.data( name='input2', shape=[10, 10], dtype="int32") self.assertRaises(TypeError, paddle.multi_dot, [input2, input2]) # the number of tensor must be larger than 1 - x0 = fluid.data(name='x0', shape=[3, 2], dtype="float32") + x0 = fluid.data(name='x0', shape=[3, 2], dtype="float64") self.assertRaises(ValueError, paddle.multi_dot, [x0]) #the first tensor must be 1D or 2D - x1 = fluid.data(name='x1', shape=[3, 2, 3], dtype="float32") - x2 = fluid.data(name='x2', shape=[3, 2], dtype="float32") + x1 = fluid.data(name='x1', shape=[3, 2, 3], dtype="float64") + x2 = fluid.data(name='x2', shape=[3, 2], dtype="float64") self.assertRaises(ValueError, paddle.multi_dot, [x1, x2]) #the last tensor must be 1D or 2D - x3 = fluid.data(name='x3', shape=[3, 2], dtype="float32") - x4 = fluid.data(name='x4', shape=[3, 2, 2], dtype="float32") + x3 = fluid.data(name='x3', shape=[3, 2], dtype="float64") + x4 = fluid.data(name='x4', shape=[3, 2, 2], dtype="float64") self.assertRaises(ValueError, paddle.multi_dot, [x3, x4]) #the tensor must be 2D, except first and last tensor - x5 = fluid.data(name='x5', shape=[3, 2], dtype="float32") - x6 = fluid.data(name='x6', shape=[2], dtype="float32") - x7 = fluid.data(name='x7', shape=[2, 2], dtype="float32") + x5 = fluid.data(name='x5', shape=[3, 2], dtype="float64") + x6 = fluid.data(name='x6', shape=[2], dtype="float64") + x7 = fluid.data(name='x7', shape=[2, 2], dtype="float64") self.assertRaises(ValueError, paddle.multi_dot, [x5, x6, x7]) class API_TestMultiDot(unittest.TestCase): def test_out(self): with fluid.program_guard(fluid.Program()): - x0 = fluid.data(name='x0', shape=[3, 2], dtype="float32") - x1 = fluid.data(name='x1', shape=[2, 3], dtype='float32') + x0 = fluid.data(name='x0', shape=[3, 2], dtype="float64") + x1 = fluid.data(name='x1', shape=[2, 3], dtype='float64') result = paddle.multi_dot([x0, x1]) exe = fluid.Executor(fluid.CPUPlace()) - data1 = np.random.rand(3, 2).astype("float32") - data2 = 
np.random.rand(2, 3).astype("float32") + data1 = np.random.rand(3, 2).astype("float64") + data2 = np.random.rand(2, 3).astype("float64") np_res = exe.run(feed={'x0': data1, 'x1': data2}, fetch_list=[result]) diff --git a/python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py b/python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py index dd7e800053042a..15ba331e9de5a3 100644 --- a/python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py @@ -65,7 +65,6 @@ 'rank_loss', 'sequence_conv', 'smooth_l1_loss', - 'spectral_norm', - 'multi_dot', + 'spectral_norm' ] # yapf: enable diff --git a/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py b/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py index 9db2752ab23217..581656f6cd421b 100644 --- a/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py @@ -76,8 +76,7 @@ 'trilinear_interp_v2', \ 'var_conv_2d', \ 'warpctc', \ - 'bilateral_slice', - 'multi_dot' + 'bilateral_slice' ] NO_FP16_CHECK_GRAD_OP_LIST = [ diff --git a/python/paddle/linalg.py b/python/paddle/linalg.py index fd06af3c3fbdfd..ba2c9b1d3ea8ee 100644 --- a/python/paddle/linalg.py +++ b/python/paddle/linalg.py @@ -22,6 +22,6 @@ 'cholesky', #noqa 'norm', 'inv', + 'matrix_power', 'multi_dot' - 'matrix_power' ] diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index cc20e98006fec4..5d92289b60e251 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -45,6 +45,7 @@ from .linalg import histogram # noqa: F401 from .linalg import mv # noqa: F401 from .linalg import matrix_power # noqa: F401 +from .linalg import multi_dot # noqa: F401 from .logic import equal # noqa: F401 from .logic import greater_equal # noqa: F401 from .logic import 
greater_than # noqa: F401 diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index fdb301cdb5a6c8..01be39d9011606 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -2,7 +2,6 @@ # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # @@ -943,73 +942,6 @@ def __check_input(x, vec): return out -def multi_dot(x, name=None): - """ - Compute the dot product of tow or more matrix in a single function call, while automatically selecting the fastest evaluation order. - - Supports inputs of float, double and float16 dtypes. This function does not support batched inputs. - - Every tensor in x must be 2D, except for the first and last which may be 1D. if the first tensor is a 1D vector of shape(n, ) it is treated as row vector of shape(1, n), similarly if the last tensor is a 1D vector of shape(n, ), it is treated as a column vector of shape(n, 1). - If the first and last tensors are matrices, the output will be a matrix. However, if either is a 1D vector, then the output will be a 1D vector. - - Notes: - The cost of multiplying two matrices with shapes (a, b) and (b, c) is a * b * c. Given matrices A, B, C with shapes (10, 100), (100, 5), (5, 50) respectively, we can calculate the cost of different multiplication orders as follows: - Cost((AB)C) = 10x100x5 + 10x5x50 = 7500 - Cost(A(BC)) = 10x100x50 + 100x5x50 = 75000 - - In this case, multiplying A and B first followed by C is 10 times faster. - - Args: - x ([Tensor]): The input tensors which is a list Tensor. - - Returns: - Tensor: The output Tensor. - - - Examples: - - .. 
code-block:: python - - import paddle - import numpy as np - - # A * B - A_data = np.random.random([3, 4]).astype(np.float32) - B_data = np.random.random([4, 5]).astype(np.float32) - A = paddle.to_tensor(A_data) - B = paddle.to_tensor(B_data) - out = paddle.multi_dot([A, B]) - print(out.numpy().shape) - # [3, 5] - - # A * B * C - A_data = np.random.random([10, 5]).astype(np.float32) - B_data = np.random.random([5, 8]).astype(np.float32) - C_data = np.random.random([8, 7]).astype(np.float32) - A = paddle.to_tensor(A_data) - B = paddle.to_tensor(B_data) - C = paddle.to_tensor(C_data) - out = paddle.multi_dot([A, B, C]) - print(out.numpy().shape) - # [10, 7] - """ - if in_dygraph_mode(): - return _C_ops.multi_dot(x) - - check_type(x, 'x', (list, tuple), 'multi_dot') - for id, item in enumerate(x): - check_variable_and_dtype(item, 'x[' + str(id) + ']', - ['float16', 'float32', 'float64'], 'multi_dot') - if item.dtype != x[0].dtype: - raise TypeError( - "All the Tensors in the input must have the same data type.") - - helper = LayerHelper('multi_dot', **locals()) - dtype = helper.input_dtype(input_param_name='x') - out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='multi_dot', inputs={"X": x}, outputs={"Out": out}) - - def matrix_power(x, n, name=None): r""" Computes the n-th power of a square matrix or a batch of square matrices. @@ -1019,12 +951,12 @@ def matrix_power(x, n, name=None): .. math:: Out = X ^ {n} - + Specifically, - If `n > 0`, it returns the matrix or a batch of matrices raised to the power of `n`. - + - If `n = 0`, it returns the identity matrix or a batch of identity matrices. - If `n < 0`, it returns the inverse of each matrix (if invertible) raised to @@ -1035,7 +967,7 @@ def matrix_power(x, n, name=None): to power `n`. Its shape should be `[*, M, M]`, where `*` is zero or more batch dimensions. Its data type should be float32 or float64. n (int): The exponent. It can be any positive, negative integer or zero. 
- name (str, optional): Name for the operation (optional, default is None). + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -1078,3 +1010,72 @@ def matrix_power(x, n, name=None): outputs={'Out': out}, attrs={'n': n}) return out + + +def multi_dot(x, name=None): + """ + Compute the dot product of two or more matrices in a single function call, while automatically selecting the fastest evaluation order. + + Supports inputs of float, double and float16 dtypes. This function does not support batched inputs. + + Every tensor in x must be 2D, except for the first and last which may be 1D. If the first tensor is a 1D vector of shape(n, ) it is treated as row vector of shape(1, n), similarly if the last tensor is a 1D vector of shape(n, ), it is treated as a column vector of shape(n, 1). + If the first and last tensors are matrices, the output will be a matrix. However, if either is a 1D vector, then the output will be a 1D vector. + + The cost of multiplying two matrices with shapes (a, b) and (b, c) is a * b * c. Given matrices A, B, C with shapes (10, 100), (100, 5), (5, 50) respectively, we can calculate the cost of different multiplication orders as follows: + - Cost((AB)C) = 10x100x5 + 10x5x50 = 7500 + - Cost(A(BC)) = 10x100x50 + 100x5x50 = 75000 + + In this case, multiplying A and B first followed by C is 10 times faster. + + Args: + x ([Tensor]): The input tensors which is a list of Tensor. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Tensor: The output Tensor. + + + Examples: + + .. 
code-block:: python + + import paddle + import numpy as np + + # A * B + A_data = np.random.random([3, 4]).astype(np.float32) + B_data = np.random.random([4, 5]).astype(np.float32) + A = paddle.to_tensor(A_data) + B = paddle.to_tensor(B_data) + out = paddle.multi_dot([A, B]) + print(out.numpy().shape) + # [3, 5] + + # A * B * C + A_data = np.random.random([10, 5]).astype(np.float32) + B_data = np.random.random([5, 8]).astype(np.float32) + C_data = np.random.random([8, 7]).astype(np.float32) + A = paddle.to_tensor(A_data) + B = paddle.to_tensor(B_data) + C = paddle.to_tensor(C_data) + out = paddle.multi_dot([A, B, C]) + print(out.numpy().shape) + + """ + if in_dygraph_mode(): + return _C_ops.multi_dot(x) + + check_type(x, 'x', (list, tuple), 'multi_dot') + for id, item in enumerate(x): + check_variable_and_dtype(item, 'x[' + str(id) + ']', + ['float16', 'float32', 'float64'], 'multi_dot') + if item.dtype != x[0].dtype: + raise TypeError( + "All the Tensors in the input must have the same data type.") + + helper = LayerHelper('multi_dot', **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_variable_for_type_inference(dtype) + helper.append_op(type='multi_dot', inputs={"X": x}, outputs={"Out": out}) + return out From 4d33b98f2fc4468f87e2462cfb1e1248e790a93d Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 30 Aug 2021 10:30:04 +0000 Subject: [PATCH 08/19] modify the code according to the review comments --- paddle/fluid/operators/fused/CMakeLists.txt | 2 +- ...fused_dropout.h => fused_dropout_common.h} | 5 +- .../operators/fused/fused_dropout_test.h | 22 +- .../fused/fused_residual_dropout_bias.h | 42 +-- .../fused/fused_residual_dropout_bias_test.cu | 262 +++++++++--------- 5 files changed, 171 insertions(+), 162 deletions(-) rename paddle/fluid/operators/fused/{fused_dropout.h => fused_dropout_common.h} (95%) diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 
f3035cddcba020..3df2144aa35944 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -74,6 +74,6 @@ if (WITH_GPU OR WITH_ROCM) # fused_dropout # only support CUDA if(NOT WITH_ROCM) - nv_test(test_fused_residual_dropout_bias SRCS fused_residual_dropout_bias_test.cu DEPS tensor op_registry dropout_op device_context generator) + nv_test(test_fused_residual_dropout_bias SRCS fused_residual_dropout_bias_test.cu DEPS tensor op_registry dropout_op device_context generator memory) endif() endif() diff --git a/paddle/fluid/operators/fused/fused_dropout.h b/paddle/fluid/operators/fused/fused_dropout_common.h similarity index 95% rename from paddle/fluid/operators/fused/fused_dropout.h rename to paddle/fluid/operators/fused/fused_dropout_common.h index bd6a4122f5830d..755153bb07eee9 100644 --- a/paddle/fluid/operators/fused/fused_dropout.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -18,9 +18,8 @@ limitations under the License. */ #include #include -#include -#include - +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h index 6cb8cd19b608d1..4a5e088d2013b8 100644 --- a/paddle/fluid/operators/fused/fused_dropout_test.h +++ b/paddle/fluid/operators/fused/fused_dropout_test.h @@ -22,11 +22,13 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" namespace framework = paddle::framework; namespace platform = paddle::platform; +namespace memory = paddle::memory; USE_OP(dropout); @@ -34,17 +36,15 @@ USE_OP(dropout); * @brief call paddle dropout op */ template -void Dropout(const T *x, const framework::DDim &x_dim, T *out, - std::vector *mask, const platform::CUDADeviceContext &ctx, - uint64_t seed, float dropout_prob, bool is_upscale_in_train, - bool is_test) { +void Dropout(const std::vector &x, const framework::DDim &x_dim, + std::vector *out, std::vector *mask, + const platform::CUDADeviceContext &ctx, uint64_t seed, + float dropout_prob, bool is_upscale_in_train, bool is_test) { framework::Scope scope; auto var_x = scope.Var("X"); auto tensor_x = var_x->GetMutable(); + framework::TensorFromVector(x, ctx, tensor_x); tensor_x->Resize(x_dim); - tensor_x->mutable_data(ctx.GetPlace()); - cudaMemcpy(tensor_x->data(), x, x_dim[0] * x_dim[1] * sizeof(T), - cudaMemcpyHostToDevice); auto var_out = scope.Var("Out"); auto tensor_out = var_out->GetMutable(); @@ -59,6 +59,7 @@ void Dropout(const T *x, const framework::DDim &x_dim, T *out, if (is_upscale_in_train) { attrs.insert({"dropout_implementation", std::string("upscale_in_train")}); } + if (is_test) { attrs.insert({"is_test", 1}); } @@ -66,11 +67,10 @@ void Dropout(const T *x, const framework::DDim &x_dim, T *out, auto op = framework::OpRegistry::CreateOp( "dropout", {{"X", {"X"}}}, {{"Out", {"Out"}}, {"Mask", {"Mask"}}}, attrs); op->Run(scope, ctx.GetPlace()); - cudaMemcpy(out, tensor_out->data(), x_dim[0] * x_dim[1] * sizeof(T), - cudaMemcpyDeviceToHost); + + framework::TensorToVector(*tensor_out, ctx, out); if (!is_test) { - cudaMemcpy((*mask).data(), tensor_mask->data(), - x_dim[0] * 
x_dim[1] * sizeof(uint8_t), cudaMemcpyDeviceToHost); + framework::TensorToVector(*tensor_mask, ctx, mask); } ctx.Wait(); } diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index 0a263635e46029..eda633380e07a2 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include "paddle/fluid/operators/fused/fused_dropout.h" +#include "paddle/fluid/operators/fused/fused_dropout_common.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" namespace paddle { @@ -22,6 +22,7 @@ namespace operators { namespace platform = paddle::platform; namespace cg = cooperative_groups; +namespace memory = paddle::memory; /** * @brief fused the add_bias, dropout, add residual into one operators @@ -32,15 +33,17 @@ namespace cg = cooperative_groups; /** * @brief the fused function called by every thread */ -template +template __forceinline__ __device__ void FusedResidualDropoutBiasVecOneThread( const int row_id, const int col_id, const int cols, curandStatePhilox4_32_10_t *state, const float dropout_prob, const T factor, const T *src, const T *residual, const T *bias, T *dst, MaskType *mask, - U *mean_val, U *var_val) { + typename details::MPTypeTrait::Type *mean_val, + typename details::MPTypeTrait::Type *var_val) { using LoadT = AlignedVector; using MaskLoadT = AlignedVector; + using U = typename details::MPTypeTrait::Type; + T src_vec[VecSize]; T residual_vec[VecSize]; T bias_vec[VecSize]; @@ -74,7 +77,7 @@ __forceinline__ __device__ void FusedResidualDropoutBiasVecOneThread( dest_vec[ii] = (src_vec[ii] + bias_vec[ii]) * static_cast(mask_vec[ii]) * factor + residual_vec[ii]; - if (layer_norm) { + if (ComputeLayerNorm) { U tmp = static_cast(dest_vec[ii]); *mean_val += tmp; *var_val += (tmp * tmp); @@ -114,7 +117,7 @@ __global__ void 
FusedResidualDropoutBiasVec(const size_t rows, for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { for (int i = col_id * VecSize; i < cols; i += blockDim.x * gridDim.x * VecSize) { - FusedResidualDropoutBiasVecOneThread( + FusedResidualDropoutBiasVecOneThread( r, i, cols, &state, dropout_prob, factor, src, residual, bias, dst, mask, NULL, NULL); } @@ -208,9 +211,10 @@ void LaunchResidualDropoutBias(const uint32_t rows, const uint32_t cols, const platform::CUDADeviceContext &ctx) { // dropout_prob == 1.0f if (std::abs(dropout_prob - 1.0f) < 1e-5) { - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMemcpyAsync(dst, residual, rows * cols * sizeof(T), - cudaMemcpyDeviceToDevice, ctx.stream())); + if (residual == dst) return; + auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + memory::Copy(cuda_place, dst, cuda_place, residual, rows * cols * sizeof(T), + ctx.stream()); PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync( mask_data, 0, rows * cols * sizeof(MaskType), ctx.stream())); return; @@ -282,7 +286,8 @@ __global__ void FusedResidualDropoutGradVec(const T *dout, const MaskType *mask, * 2. save 128*8 temporary sum in 8*128 shared memory * 3. 
reduce the sum of 128 rows data by 8*VecSize warps */ -template +template __global__ void FusedResidualDropoutBiasGradVec( const T *dout, const MaskType *mask, const T factor, const int64_t rows, const int64_t cols, T *dx, T *dbias) { @@ -316,9 +321,10 @@ __global__ void FusedResidualDropoutBiasGradVec( } // save temporary sum to cache and do transpose - __shared__ T cache[BSX * VecSize][BSY]; - for (int i = 0; i < VecSize; i++) + __shared__ T cache[BLOCK_SIZE_X * VecSize][BLOCK_SIZE_Y]; + for (int i = 0; i < VecSize; i++) { cache[threadIdx.x * VecSize + i][threadIdx.y] = tmp_sum[i]; + } __syncthreads(); // reduce sum @@ -327,11 +333,11 @@ __global__ void FusedResidualDropoutBiasGradVec( int x = tid >> 5; // warp id int y = tid & 31; // thread id on warp 0~31 - // need BSX * VecSize warps - if (x < BSX * VecSize) { + // need BLOCK_SIZE_X * VecSize warps + if (x < BLOCK_SIZE_X * VecSize) { // reduce 128 to 32 #pragma unroll - for (int i = 0; i < (BSY >> 5); i++) { + for (int i = 0; i < (BLOCK_SIZE_Y >> 5); i++) { sum += cache[x][y + i * 32]; } } @@ -341,7 +347,7 @@ __global__ void FusedResidualDropoutBiasGradVec( // save sum to dbias int bias_id = blockIdx.x * blockDim.x * VecSize + x; - if (y == 0 && x < VecSize * BSX && bias_id < cols) { + if (y == 0 && x < VecSize * BLOCK_SIZE_X && bias_id < cols) { dbias[bias_id] = sum; } } @@ -367,7 +373,9 @@ void LaunchResidualDropoutBiasGrad(const T *dout, const MaskType *mask, const int VecSize = 4; if (dbias != nullptr) { int real_vec_size = VecSize; - if (cols % VecSize != 0) real_vec_size = 1; + if (cols % VecSize != 0) { + real_vec_size = 1; + } auto threads = std::min(cols / real_vec_size, static_cast(8)); auto blocks = std::max( (uint32_t)1, std::min((cols / real_vec_size + threads - 1) / threads, diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu index d5377194934ff6..88438e6e0c36e5 100644 --- 
a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu @@ -24,7 +24,7 @@ namespace framework = paddle::framework; namespace platform = paddle::platform; /** - * @brief the unittest of fused_residual_dropout_bias + * @brief the unittest of fusedresidualdropoutbias * 1. random input data * 2. add bias, call paddle dropout op, add residual, and get the base result * 3. call FusedResidualDropoutBias function get fused result @@ -33,163 +33,169 @@ namespace platform = paddle::platform; template struct TestFusedResidualDropoutBias { - uint32_t _rows; - uint32_t _cols; - uint64_t _seed; - float _dropout_prob; - bool _is_upscale_in_train; - bool _is_test; // default false, Set to true for inference only - bool _has_bias = true; - framework::Tensor _src, _residual, _bias, _out, _mask; - framework::Tensor _dsrc, _dbias; - - std::vector _src_vec, _residual_vec, _bias_vec; - std::vector _correct_out, _correct_dsrc, _correct_dbias; - std::vector _correct_mask; - - platform::CUDAPlace _place; - platform::CUDADeviceContext *_ctx; + uint32_t rows; + uint32_t cols; + uint64_t seed; + float dropout_prob; + bool is_upscale_in_train; + bool is_test; // default false, Set to true for inference only + bool hasbias = true; + framework::Tensor src, residual, bias, out, mask; + framework::Tensor dsrc, dbias; + + std::vector src_vec, residual_vec, bias_vec; + std::vector correct_out, correct_dsrc, correct_dbias; + std::vector correct_mask; + + platform::CUDAPlace place; + platform::CUDADeviceContext *ctx; TestFusedResidualDropoutBias() { - _rows = 32; - _cols = 32; - _seed = 0; - _dropout_prob = 0.0; - _is_upscale_in_train = false; - _is_test = false; - _has_bias = true; - _ctx = new platform::CUDADeviceContext(_place); + rows = 32; + cols = 32; + seed = 0; + dropout_prob = 0.0; + is_upscale_in_train = false; + is_test = false; + hasbias = true; + // ctx = new platform::CUDADeviceContext(place); + 
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto devicectx = pool.Get(place); + ctx = reinterpret_cast(devicectx); } - TestFusedResidualDropoutBias(int rows, int cols, uint64_t seed = 0, - float dropout_prob = 0.0, - bool is_upscale_in_train = false, - bool is_test = false) { - _rows = rows; - _cols = cols; - _seed = seed; - _dropout_prob = dropout_prob; - _is_upscale_in_train = is_upscale_in_train; - _is_test = is_test; - _has_bias = true; - _ctx = new platform::CUDADeviceContext(_place); + TestFusedResidualDropoutBias(int rows_, int cols_, uint64_t seed_ = 0, + float dropout_prob_ = 0.0, + bool is_upscale_in_train_ = false, + bool is_test_ = false) { + rows = rows_; + cols = cols_; + seed = seed_; + dropout_prob = dropout_prob_; + is_upscale_in_train = is_upscale_in_train_; + is_test = is_test_; + hasbias = true; + // ctx = new platform::CUDADeviceContext(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto devicectx = pool.Get(place); + ctx = reinterpret_cast(devicectx); } - ~TestFusedResidualDropoutBias() { delete _ctx; } + ~TestFusedResidualDropoutBias() {} void SetUp() { - const int n = _rows * _cols; - _correct_out.resize(n); - _correct_mask.resize(n); - _correct_dsrc.resize(n); - _correct_dbias.resize(_cols); - - _src_vec.resize(n); - _residual_vec.resize(n); - _bias_vec.resize(_cols); + const int n = rows * cols; + correct_out.resize(n); + correct_mask.resize(n); + correct_dsrc.resize(n); + correct_dbias.resize(cols); + + src_vec.resize(n); + residual_vec.resize(n); + bias_vec.resize(cols); std::default_random_engine random(time(NULL)); std::uniform_real_distribution dis(0.0, 1.0); - for (int i = 0; i < _rows; i++) { - for (int j = 0; j < _cols; j++) { - _src_vec[i * _cols + j] = static_cast(dis(random)); - _residual_vec[i * _cols + j] = static_cast(dis(random)); - if (i == 0) _bias_vec[j] = dis(random); + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + 
src_vec[i * cols + j] = static_cast(dis(random)); + residual_vec[i * cols + j] = static_cast(dis(random)); + if (i == 0) bias_vec[j] = dis(random); } } - framework::TensorFromVector(_src_vec, *_ctx, &_src); - _src.Resize({_rows, _cols}); - framework::TensorFromVector(_residual_vec, *_ctx, &_residual); - _residual.Resize({_rows, _cols}); - if (_has_bias) { - framework::TensorFromVector(_bias_vec, *_ctx, &_bias); - _bias.Resize({_cols}); + framework::TensorFromVector(src_vec, *ctx, &src); + src.Resize({rows, cols}); + framework::TensorFromVector(residual_vec, *ctx, &residual); + residual.Resize({rows, cols}); + if (hasbias) { + framework::TensorFromVector(bias_vec, *ctx, &bias); + bias.Resize({cols}); } { - _out.Resize({_rows, _cols}); - _out.mutable_data(_place); - _mask.Resize({_rows, _cols}); - _mask.mutable_data(_place); - _dsrc.Resize({_rows, _cols}); - _dsrc.mutable_data(_place); - - if (_has_bias) { - _dbias.Resize({_cols}); - _dbias.mutable_data(_place); + out.Resize({rows, cols}); + out.mutable_data(place); + mask.Resize({rows, cols}); + mask.mutable_data(place); + dsrc.Resize({rows, cols}); + dsrc.mutable_data(place); + + if (hasbias) { + dbias.Resize({cols}); + dbias.mutable_data(place); } } } void BaseForward() { - std::vector out1(_rows * _cols), out2(_rows * _cols); - if (_has_bias) { + std::vector out1(rows * cols), out2(rows * cols); + if (hasbias) { // add bias - for (int i = 0; i < _rows; i++) { - for (int j = 0; j < _cols; j++) { - out1[i * _cols + j] = _src_vec[i * _cols + j] + _bias_vec[j]; + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + out1[i * cols + j] = src_vec[i * cols + j] + bias_vec[j]; } } // call dropout - Dropout(out1.data(), _src.dims(), out2.data(), &_correct_mask, *_ctx, - _seed, _dropout_prob, _is_upscale_in_train, _is_test); + Dropout(out1, src.dims(), &out2, &correct_mask, *ctx, seed, + dropout_prob, is_upscale_in_train, is_test); } else { - Dropout(_src_vec.data(), _src.dims(), out2.data(), 
&_correct_mask, - *_ctx, _seed, _dropout_prob, _is_upscale_in_train, _is_test); + Dropout(src_vec, src.dims(), &out2, &correct_mask, *ctx, seed, + dropout_prob, is_upscale_in_train, is_test); } + ctx->Wait(); // add residual - for (int i = 0; i < _rows; i++) { - for (int j = 0; j < _cols; j++) { - _correct_out[i * _cols + j] = - _residual_vec[i * _cols + j] + out2[i * _cols + j]; + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + correct_out[i * cols + j] = + residual_vec[i * cols + j] + out2[i * cols + j]; } } - _ctx->Wait(); } void BaseBackward() { - DropoutGrad(_correct_dsrc.data(), _src.dims(), _correct_out.data(), - _correct_mask.data(), *_ctx, _dropout_prob, - _is_upscale_in_train); + DropoutGrad(correct_dsrc.data(), src.dims(), correct_out.data(), + correct_mask.data(), *ctx, dropout_prob, + is_upscale_in_train); // calc dbias - memset(&_correct_dbias[0], 0, _cols * sizeof(T)); - for (int i = 0; i < _rows; i++) { - for (int j = 0; j < _cols; j++) { - _correct_dbias[j] += _correct_out[i * _cols + j]; + memset(&correct_dbias[0], 0, cols * sizeof(T)); + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + correct_dbias[j] += correct_out[i * cols + j]; } } } void FusedForward() { auto threads = paddle::operators::Get1DBlocksAnd2DGrids( - *_ctx, (uint64_t)_rows, (uint64_t)_cols); + *ctx, (uint64_t)rows, (uint64_t)cols); const int VecSize = 4; const int increment = - ((_cols - 1) / (threads.first.x * threads.second.x * VecSize) + 1) * + ((cols - 1) / (threads.first.x * threads.second.x * VecSize) + 1) * VecSize; T *bias_ptr = nullptr; - if (_has_bias) { - bias_ptr = _bias.data(); + if (hasbias) { + bias_ptr = bias.data(); } paddle::operators::LaunchResidualDropoutBias( - _rows, _cols, increment, _seed, _dropout_prob, _is_test, - _is_upscale_in_train, _src.data(), _residual.data(), bias_ptr, - _mask.data(), _out.data(), *_ctx); - _ctx->Wait(); + rows, cols, increment, seed, dropout_prob, is_test, is_upscale_in_train, + 
src.data(), residual.data(), bias_ptr, mask.data(), + out.data(), *ctx); + ctx->Wait(); } void FusedBackward() { - if (_is_test) return; + if (is_test) return; T *bias_ptr = nullptr; - if (_has_bias) { - bias_ptr = _dbias.data(); + if (hasbias) { + bias_ptr = dbias.data(); } paddle::operators::LaunchResidualDropoutBiasGrad( - _out.data(), _mask.data(), _dropout_prob, - _is_upscale_in_train, _rows, _cols, _dsrc.data(), bias_ptr, *_ctx); + out.data(), mask.data(), dropout_prob, is_upscale_in_train, + rows, cols, dsrc.data(), bias_ptr, *ctx); } void Run() { @@ -201,43 +207,39 @@ struct TestFusedResidualDropoutBias { } void CheckOut(const T diff) { - const int n = _rows * _cols; - std::vector out(n); - std::vector mask(n); - cudaMemcpy(out.data(), _out.data(), _rows * _cols * sizeof(T), - cudaMemcpyDeviceToHost); - if (!_is_test) { - cudaMemcpy(mask.data(), _mask.data(), - _rows * _cols * sizeof(uint8_t), cudaMemcpyDeviceToHost); + const int n = rows * cols; + std::vector _out(n); + std::vector _mask(n); + framework::TensorToVector(out, *ctx, &_out); + if (!is_test) { + framework::TensorToVector(mask, *ctx, &_mask); } - _ctx->Wait(); + ctx->Wait(); for (int i = 0; i < n; i++) { - EXPECT_LT(std::abs(out[i] - _correct_out[i]), diff); - if (!_is_test) EXPECT_EQ(mask[i], _correct_mask[i]); + EXPECT_LT(std::abs(_out[i] - correct_out[i]), diff); + if (!is_test) EXPECT_EQ(_mask[i], correct_mask[i]); } } void CheckGrad(const T diff) { - if (_is_test) return; + if (is_test) return; - const int n = _rows * _cols; + const int n = rows * cols; - std::vector dsrc(n); - cudaMemcpy(dsrc.data(), _dsrc.data(), _rows * _cols * sizeof(T), - cudaMemcpyDeviceToHost); + std::vector _dsrc(n); + framework::TensorToVector(dsrc, *ctx, &_dsrc); for (int i = 0; i < n; i++) { - EXPECT_LT(std::abs(dsrc[i] - _correct_dsrc[i]), diff); + EXPECT_LT(std::abs(_dsrc[i] - correct_dsrc[i]), diff); } - if (_has_bias) { - std::vector dbias(_cols); - cudaMemcpy(dbias.data(), _dbias.data(), _cols * sizeof(T), - 
cudaMemcpyDeviceToHost); - _ctx->Wait(); - for (int i = 0; i < _cols; i++) { - EXPECT_LT(std::abs(dbias[i] - _correct_dbias[i]), diff); + if (hasbias) { + std::vector _dbias(cols); + framework::TensorToVector(dbias, *ctx, &_dbias); + ctx->Wait(); + for (int i = 0; i < cols; i++) { + EXPECT_LT(std::abs(_dbias[i] - correct_dbias[i]), diff); } } } @@ -261,7 +263,7 @@ TEST(FusedDropout, GPUFusedResidualDropoutBiasDouble) { test.CheckGrad(static_cast(1e-5)); } -// test fp16, For inference, check_grad is not required. ref: test_dropout_op.py +// test fp16, For inference, check_grad is not required. ref: testdropout_op.py TEST(FusedDropout, GPUFusedResidualDropoutBiasFp16) { const int rows = 16; const int cols = 16; @@ -275,7 +277,7 @@ TEST(FusedDropout, GPUFusedResidualDropoutBiasNoBias) { const int rows = 16; const int cols = 16; TestFusedResidualDropoutBias test(rows, cols); - test._has_bias = false; + test.hasbias = false; test.Run(); test.CheckOut(static_cast(1e-5)); test.CheckGrad(static_cast(1e-5)); @@ -286,7 +288,7 @@ TEST(FusedDropout, GPUFusedResidualDropoutBiasNoBias2) { const int rows = 16; const int cols = 17; TestFusedResidualDropoutBias test(rows, cols); - test._has_bias = false; + test.hasbias = false; test.Run(); test.CheckOut(static_cast(1e-5)); test.CheckGrad(static_cast(1e-5)); From bd44d043d24b5f76c2f07da6bb0a7a52788c1ace Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 30 Aug 2021 11:27:40 +0000 Subject: [PATCH 09/19] replace cudaMemcpy with TensorFromVector and TensorToVector in DropoutGrad --- .../operators/fused/fused_dropout_test.h | 18 +++++++----------- .../fused/fused_residual_dropout_bias_test.cu | 19 +++++++++++-------- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h index 4a5e088d2013b8..e9fd0e6c097851 100644 --- a/paddle/fluid/operators/fused/fused_dropout_test.h +++ b/paddle/fluid/operators/fused/fused_dropout_test.h @@ 
-79,24 +79,21 @@ void Dropout(const std::vector &x, const framework::DDim &x_dim, * @brief call paddle dropout_grad op */ template -void DropoutGrad(T *dx, const framework::DDim &x_dim, const T *dout, - const uint8_t *mask, const platform::CUDADeviceContext &ctx, - float dropout_prob, bool is_upscale_in_train) { +void DropoutGrad(std::vector *dx, const framework::DDim &x_dim, + const std::vector &dout, const std::vector &mask, + const platform::CUDADeviceContext &ctx, float dropout_prob, + bool is_upscale_in_train) { framework::Scope scope; const size_t n = x_dim[0] * x_dim[1]; auto var_out = scope.Var("DOut"); auto tensor_out = var_out->GetMutable(); + framework::TensorFromVector(dout, ctx, tensor_out); tensor_out->Resize(x_dim); - tensor_out->mutable_data(ctx.GetPlace()); - cudaMemcpy(tensor_out->data(), dout, n * sizeof(T), - cudaMemcpyHostToDevice); auto var_mask = scope.Var("Mask"); auto tensor_mask = var_mask->GetMutable(); + framework::TensorFromVector(mask, ctx, tensor_mask); tensor_mask->Resize(x_dim); - tensor_mask->mutable_data(ctx.GetPlace()); - cudaMemcpy(tensor_mask->data(), mask, n * sizeof(uint8_t), - cudaMemcpyHostToDevice); auto var_dx = scope.Var("DX"); auto tensor_dx = var_dx->GetMutable(); @@ -115,7 +112,6 @@ void DropoutGrad(T *dx, const framework::DDim &x_dim, const T *dout, {{"X@GRAD", {"DX"}}}, attrs); op->Run(scope, ctx.GetPlace()); - cudaMemcpy(dx, tensor_dx->data(), x_dim[0] * x_dim[1] * sizeof(T), - cudaMemcpyDeviceToHost); + framework::TensorToVector(*tensor_dx, ctx, dx); ctx.Wait(); } diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu index 88438e6e0c36e5..14267974ff2657 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu @@ -58,7 +58,6 @@ struct TestFusedResidualDropoutBias { is_upscale_in_train = false; is_test = false; hasbias = true; - // ctx 
= new platform::CUDADeviceContext(place); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto devicectx = pool.Get(place); ctx = reinterpret_cast(devicectx); @@ -75,7 +74,6 @@ struct TestFusedResidualDropoutBias { is_upscale_in_train = is_upscale_in_train_; is_test = is_test_; hasbias = true; - // ctx = new platform::CUDADeviceContext(place); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto devicectx = pool.Get(place); ctx = reinterpret_cast(devicectx); @@ -100,7 +98,9 @@ struct TestFusedResidualDropoutBias { for (int j = 0; j < cols; j++) { src_vec[i * cols + j] = static_cast(dis(random)); residual_vec[i * cols + j] = static_cast(dis(random)); - if (i == 0) bias_vec[j] = dis(random); + if (i == 0) { + bias_vec[j] = dis(random); + } } } @@ -155,9 +155,8 @@ struct TestFusedResidualDropoutBias { } void BaseBackward() { - DropoutGrad(correct_dsrc.data(), src.dims(), correct_out.data(), - correct_mask.data(), *ctx, dropout_prob, - is_upscale_in_train); + DropoutGrad(&correct_dsrc, src.dims(), correct_out, correct_mask, *ctx, + dropout_prob, is_upscale_in_train); // calc dbias memset(&correct_dbias[0], 0, cols * sizeof(T)); for (int i = 0; i < rows; i++) { @@ -187,7 +186,9 @@ struct TestFusedResidualDropoutBias { } void FusedBackward() { - if (is_test) return; + if (is_test) { + return; + } T *bias_ptr = nullptr; if (hasbias) { @@ -223,7 +224,9 @@ struct TestFusedResidualDropoutBias { } void CheckGrad(const T diff) { - if (is_test) return; + if (is_test) { + return; + } const int n = rows * cols; From d2beab70c620a19a6e26b5e8c16a4f537882fc68 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 31 Aug 2021 02:19:33 +0000 Subject: [PATCH 10/19] set dropout attr 'is_test':false --- paddle/fluid/operators/fused/fused_dropout_test.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h index 
e9fd0e6c097851..288b415aef31f9 100644 --- a/paddle/fluid/operators/fused/fused_dropout_test.h +++ b/paddle/fluid/operators/fused/fused_dropout_test.h @@ -61,7 +61,7 @@ void Dropout(const std::vector &x, const framework::DDim &x_dim, } if (is_test) { - attrs.insert({"is_test", 1}); + attrs.insert({"is_test", true}); } auto op = framework::OpRegistry::CreateOp( @@ -100,7 +100,7 @@ void DropoutGrad(std::vector *dx, const framework::DDim &x_dim, framework::AttributeMap attrs; attrs.insert({"dropout_prob", dropout_prob}); - attrs.insert({"is_test", 0}); + attrs.insert({"is_test", false}); if (is_upscale_in_train) { attrs.insert({"dropout_implementation", std::string("upscale_in_train")}); } else { From 40cd7cad7071a07b6dd1fa18b8373a8ad00fa7d9 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 31 Aug 2021 06:07:58 +0000 Subject: [PATCH 11/19] reduce the code to less than 1000 lines --- paddle/fluid/operators/multi_dot_op.cc | 23 ++++++------------- .../tests/unittests/test_multi_dot_op.py | 4 ---- 2 files changed, 7 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/operators/multi_dot_op.cc b/paddle/fluid/operators/multi_dot_op.cc index 6682248f8eff1c..c27c01b92b6db3 100644 --- a/paddle/fluid/operators/multi_dot_op.cc +++ b/paddle/fluid/operators/multi_dot_op.cc @@ -197,15 +197,13 @@ static inline framework::Tensor MultiDotMatChainOrder( const std::vector& ins_dims, const bool save_result, std::vector* results) { auto order = GetOrder(ins, ins_dims); - auto n = ins.size(); - return MatChainMul(ctx, ins, ins_dims, order, 0, n - 1, - save_result, results); + return MatChainMul(ctx, ins, ins_dims, order, 0, + ins.size() - 1, save_result, results); } inline void GetDims(const std::vector& ins, std::vector* ins_dims) { const auto n = ins.size(); - std::vector real_ins; for (size_t i = 0; i < n; i++) { (*ins_dims)[i] = ins[i]->dims(); if (i == 0 && (*ins_dims)[i].size() == 1) { @@ -260,17 +258,13 @@ class MultiDotOp : public framework::OperatorWithKernel { const 
framework::ExecutionContext& ctx) const override { auto inputs = ctx.MultiInput("X"); auto input_data_type = framework::proto::VarType::Type(0); - bool flag = 1; for (auto* input : inputs) { - if (!input->IsInitialized() || input->numel() == 0) { - flag = 0; + if (!input->IsInitialized()) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The inputs of multi_dot OP are Empty!")); break; } } - if (flag == 0) { - PADDLE_THROW(platform::errors::InvalidArgument( - "All Inputs of multi_dot OP are Empty!")); - } input_data_type = inputs[0]->type(); #ifdef PADDLE_WITH_MKLDNN @@ -407,7 +401,6 @@ class MultiDotGradKernel : public framework::OpKernel { auto mat_dim_b = math::CreateMatrixDescriptor(b_dim, 0, true); T alpha = static_cast(1.0); auto blas = math::GetBlas(ctx); - blas.MatMul(A, mat_dim_a, dout, mat_dim_dout, alpha, dB, T(0)); blas.MatMul(dout, mat_dim_dout, B, mat_dim_b, alpha, dA, T(0)); } @@ -522,10 +515,8 @@ class MultiDotGradKernel : public framework::OpKernel { const auto Ka = ins_dims[0][1]; const auto Nb = ins_dims[1][1]; const auto Nc = ins_dims[2][1]; - const uint64_t cost1 = - Ma * Nb * (Ka + Nc); // Ma * Ka * Nb + Ma * Nb * Nc; - const uint64_t cost2 = - Ka * Nc * (Nb + Ma); // Ka * Nb * Nc + Ma * Ka * Nc; + const uint64_t cost1 = Ma * Nb * (Ka + Nc); + const uint64_t cost2 = Ka * Nc * (Nb + Ma); auto mat_dim_a = math::CreateMatrixDescriptor(ins_dims[0], 0, false); auto mat_dim_b = math::CreateMatrixDescriptor(ins_dims[1], 0, false); auto mat_dim_c = math::CreateMatrixDescriptor(ins_dims[2], 0, false); diff --git a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py index 35482144c272dd..c5335d11410897 100644 --- a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py +++ b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py @@ -190,10 +190,6 @@ def get_inputs_and_outputs(self): self.inputs = {'X': [('x0', self.A), ('x1', self.B)]} self.outputs = {'Out': multi_dot([self.A, 
self.B])} - def test_check_grad(self): - self.check_grad(['x0'], 'Out') - self.check_grad(['x1'], 'Out') - class TestMultiDotOp3MatFirstAndLast1D(TestMultiDotOp): def get_inputs_and_outputs(self): From f342f00661deff765ecf1fc4825453d56ea9e8d5 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Thu, 2 Sep 2021 02:42:14 +0000 Subject: [PATCH 12/19] add comment and modifying code according to the review comments --- paddle/fluid/operators/multi_dot_op.cc | 17 ++++++- .../tests/unittests/test_multi_dot_op.py | 49 ++++--------------- python/paddle/tensor/linalg.py | 16 +++--- 3 files changed, 33 insertions(+), 49 deletions(-) diff --git a/paddle/fluid/operators/multi_dot_op.cc b/paddle/fluid/operators/multi_dot_op.cc index c27c01b92b6db3..3929aebc4382e7 100644 --- a/paddle/fluid/operators/multi_dot_op.cc +++ b/paddle/fluid/operators/multi_dot_op.cc @@ -114,7 +114,10 @@ inline framework::Tensor MatMul(const framework::ExecutionContext& ctx, } /** - * @brief multi matrix dot by a chain order + * @brief Recursively calculate matrix multiplication according to the optimal + * order + * Let k = order[i,j], then ins[i...j] = ins[i...k] * ins[k+1 ...j] + * * @param * ins: the input tensors * ins_dims: the shape of ins after reshape @@ -164,15 +167,19 @@ inline framework::Tensor MatChainMul( std::vector GetOrder(const std::vector& ins, const std::vector& ins_dims) { auto n = ins.size(); + // p: save the ins shape, the ins[i] shape is (p[i], p[i+1]) std::vector p(n + 1); for (uint64_t i = 0; i < n; i++) { p[i] = ins_dims[i][0]; } p[n] = ins_dims[n - 1][1]; + // m[i, j]: save the lowest cost for multiplying ins[i...j] std::vector m(n * n, 0); + // define ins[i...j] means multiplying matrices from ins[i] to ins[j] + // order[i, j] = k, this means that ins[i...k] and ins[k...j] fist and then + // multiply the resulting matrices is the optimal order for ins[i...j] std::vector order(n * n); - for (uint64_t l = 1; l < n; l++) { for (uint64_t i = 0; i < n - l; i++) { auto j = i + l; @@ -292,6 
+299,12 @@ class MultiDotOp : public framework::OperatorWithKernel { } }; +/** + * 1. there are only 2 matrices: direct matrix multiplication A*B + * 2. there are only 3 matrices: calculate the cost of (A*B)*C and A*(B*C), + * choose the least cost order for calculation + * 3. more than 3 matrices: call MultiDotMatChainOrder + */ template class MultiDotKernel : public framework::OpKernel { public: diff --git a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py index c5335d11410897..e128939d9d5ecc 100644 --- a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py +++ b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py @@ -24,6 +24,8 @@ paddle.enable_static() +#the unittest of multi_dot +#compare the result of paddle multi_dot and numpy multi_dot class TestMultiDotOp(OpTest): def setUp(self): self.op_type = "multi_dot" @@ -47,11 +49,6 @@ def test_check_grad(self): self.check_grad(['x1'], 'Out') -class TestMultiDotOpDouble(TestMultiDotOp): - def get_dtype(self): - return "float64" - - #(A*B)*C class TestMultiDotOp3Mat(TestMultiDotOp): def get_inputs_and_outputs(self): @@ -109,7 +106,7 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B])} -class TestMultiDotOp3MatFirst1D(TestMultiDotOp): +class TestMultiDotOp3MatFirst1D(TestMultiDotOp3Mat): def get_inputs_and_outputs(self): self.A = np.random.random((4)).astype(self.dtype) self.B = np.random.random((4, 3)).astype(self.dtype) @@ -117,13 +114,8 @@ def get_inputs_and_outputs(self): self.inputs = {'X': [('x0', self.A), ('x1', self.B), ('x2', self.C)]} self.outputs = {'Out': multi_dot([self.A, self.B, self.C])} - def test_check_grad(self): - self.check_grad(['x0'], 'Out') - self.check_grad(['x1'], 'Out') - self.check_grad(['x2'], 'Out') - -class TestMultiDotOp4MatFirst1D(TestMultiDotOp): +class TestMultiDotOp4MatFirst1D(TestMultiDotOp4Mat): def get_inputs_and_outputs(self): self.A = 
np.random.random((4)).astype(self.dtype) self.B = np.random.random((4, 3)).astype(self.dtype) @@ -135,12 +127,6 @@ def get_inputs_and_outputs(self): } self.outputs = {'Out': multi_dot([self.A, self.B, self.C, self.D])} - def test_check_grad(self): - self.check_grad(['x0'], 'Out') - self.check_grad(['x1'], 'Out') - self.check_grad(['x2'], 'Out') - self.check_grad(['x3'], 'Out') - class TestMultiDotOpLast1D(TestMultiDotOp): def get_inputs_and_outputs(self): @@ -150,7 +136,7 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B])} -class TestMultiDotOp3MatLast1D(TestMultiDotOp): +class TestMultiDotOp3MatLast1D(TestMultiDotOp3Mat): def get_inputs_and_outputs(self): self.A = np.random.random((2, 4)).astype(self.dtype) self.B = np.random.random((4, 3)).astype(self.dtype) @@ -164,7 +150,7 @@ def test_check_grad(self): self.check_grad(['x2'], 'Out') -class TestMultiDotOp4MatLast1D(TestMultiDotOp): +class TestMultiDotOp4MatLast1D(TestMultiDotOp4Mat): def get_inputs_and_outputs(self): self.A = np.random.random((2, 3)).astype(self.dtype) self.B = np.random.random((3, 2)).astype(self.dtype) @@ -176,12 +162,6 @@ def get_inputs_and_outputs(self): } self.outputs = {'Out': multi_dot([self.A, self.B, self.C, self.D])} - def test_check_grad(self): - self.check_grad(['x0'], 'Out') - self.check_grad(['x1'], 'Out') - self.check_grad(['x2'], 'Out') - self.check_grad(['x3'], 'Out') - class TestMultiDotOpFirstAndLast1D(TestMultiDotOp): def get_inputs_and_outputs(self): @@ -191,7 +171,7 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B])} -class TestMultiDotOp3MatFirstAndLast1D(TestMultiDotOp): +class TestMultiDotOp3MatFirstAndLast1D(TestMultiDotOp3Mat): def get_inputs_and_outputs(self): self.A = np.random.random((6, )).astype(self.dtype) self.B = np.random.random((6, 4)).astype(self.dtype) @@ -199,13 +179,8 @@ def get_inputs_and_outputs(self): self.inputs = {'X': [('x0', self.A), ('x1', self.B), ('x2', self.C)]} 
self.outputs = {'Out': multi_dot([self.A, self.B, self.C])} - def test_check_grad(self): - self.check_grad(['x0'], 'Out') - self.check_grad(['x1'], 'Out') - self.check_grad(['x2'], 'Out') - -class TestMultiDotOp4MatFirstAndLast1D(TestMultiDotOp): +class TestMultiDotOp4MatFirstAndLast1D(TestMultiDotOp4Mat): def get_inputs_and_outputs(self): self.A = np.random.random((3, )).astype(self.dtype) self.B = np.random.random((3, 4)).astype(self.dtype) @@ -217,12 +192,6 @@ def get_inputs_and_outputs(self): } self.outputs = {'Out': multi_dot([self.A, self.B, self.C, self.D])} - def test_check_grad(self): - self.check_grad(['x0'], 'Out') - self.check_grad(['x1'], 'Out') - self.check_grad(['x2'], 'Out') - self.check_grad(['x3'], 'Out') - #####python API test####### class TestMultiDotOpError(unittest.TestCase): @@ -258,7 +227,7 @@ def test_errors(self): self.assertRaises(ValueError, paddle.multi_dot, [x5, x6, x7]) -class API_TestMultiDot(unittest.TestCase): +class APITestMultiDot(unittest.TestCase): def test_out(self): with fluid.program_guard(fluid.Program()): x0 = fluid.data(name='x0', shape=[3, 2], dtype="float64") diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 01be39d9011606..4d26cd0256c72d 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -1014,18 +1014,19 @@ def matrix_power(x, n, name=None): def multi_dot(x, name=None): """ - Compute the dot product of tow or more matrix in a single function call, while automatically selecting the fastest evaluation order. + Multi_dot is an operator that calculates multiple matrix multiplications. Supports inputs of float, double and float16 dtypes. This function does not support batched inputs. - Every tensor in x must be 2D, except for the first and last which may be 1D. if the first tensor is a 1D vector of shape(n, ) it is treated as row vector of shape(1, n), similarly if the last tensor is a 1D vector of shape(n, ), it is treated as a column vector of shape(n, 1). 
- If the first and last tensors are matrices, the output will be a matrix. However, if either is a 1D vector, then the output will be a 1D vector. + The input tensor in [x] must be 2D except for the first and last can be 1D. If the first tensor is a 1D vector of shape(n, ) it is treated as row vector of shape(1, n), similarly if the last tensor is a 1D vector of shape(n, ), it is treated as a column vector of shape(n, 1). - The cost of multiplying two matrices with shapes (a, b) and (b, c) is a * b * c. Given matrices A, B, C with shapes (10, 100), (100, 5), (5, 50) respectively, we can calculate the cost of different multiplication orders as follows: - - Cost((AB)C) = 10x100x5 + 10x5x50 = 7500 - - Cost(A(BC)) = 10x100x50 + 100x5x50 = 75000 + If the first and last tensor are 2D matrix, then the output is also 2D matrix, otherwise the output is a 1D vector. - In this case, multiplying A and B first followed by C is 10 times faster. + Multi_dot will select the lowest cost multiplication order for calculation. The cost of multiplying two matrices with shapes (a, b) and (b, c) is a * b * c. Given matrices A, B, C with shapes (20, 5), (5, 100), (100, 10) respectively, we can calculate the cost of different multiplication orders as follows: + - Cost((AB)C) = 20x5x100 + 20x100x10 = 30000 + - Cost(A(BC)) = 5x100x10 + 20x5x10 = 6000 + + In this case, multiplying B and C first, then multiply A, which is 5 times faster than sequential calculation. Args: x ([Tensor]): The input tensors which is a list Tensor. 
@@ -1061,6 +1062,7 @@ def multi_dot(x, name=None): C = paddle.to_tensor(C_data) out = paddle.multi_dot([A, B, C]) print(out.numpy().shape) + # [10, 7] """ if in_dygraph_mode(): From 5d2bbc889b8d3774fa01003f325fe012da2a11c8 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Thu, 2 Sep 2021 09:22:14 +0000 Subject: [PATCH 13/19] optimize the code according to the review comments --- .../operators/fused/fused_dropout_common.h | 73 ++++++--- .../fused/fused_residual_dropout_bias.h | 142 +++++++++--------- .../fused/fused_residual_dropout_bias_test.cu | 129 ++++++---------- 3 files changed, 167 insertions(+), 177 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index 755153bb07eee9..53ce76826a6793 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -22,48 +22,75 @@ limitations under the License. */ #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/fast_divmod.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/gpu_launch_config.h" namespace paddle { namespace operators { -/** - * get 1D threads and blocks - */ -template -inline std::pair Get1DThreadsAndBlocks( - const platform::CUDADeviceContext &ctx, const uint64_t n) { - const uint64_t tmp_n = n / VecSize; - int threads = std::max( - (uint64_t)32, std::min(tmp_n, (uint64_t)ctx.GetMaxThreadsPerBlock())); - int blocks = std::max((uint64_t)1, (tmp_n + threads - 1) / threads); - return std::pair{threads, blocks}; -} +#define MAX_CACHE_BYTES 16 /** * get the threads for fused_residual_dropout_bias: * 1D blocks: blockDim.x = cols * 2D grids: gridDim.y = rows */ -template -inline std::pair Get1DBlocksAnd2DGrids( +inline platform::GpuLaunchConfig Get1DBlocksAnd2DGrids( const platform::CUDADeviceContext &ctx, 
const uint32_t rows, - const uint32_t cols) { + const uint32_t cols, const int VecSize) { const uint32_t tmp_cols = cols / VecSize; int threads = std::max( (uint32_t)32, std::min(tmp_cols, (uint32_t)ctx.GetMaxThreadsPerBlock())); int blocks_x = std::max((uint32_t)1, (tmp_cols + threads - 1) / threads); int blocks_y = std::max((uint32_t)1, rows); - dim3 block_dim(threads, 1, 1); - dim3 grid_dim(blocks_x, blocks_y, 1); - return std::pair{block_dim, grid_dim}; + platform::GpuLaunchConfig config; + config.block_per_grid.x = blocks_x; + config.block_per_grid.y = blocks_y; + config.thread_per_block.x = threads; + return config; } -// aligned vector generates vectorized load/store on CUDA -template -struct alignas(sizeof(T) * VecSize) AlignedVector { - T val[VecSize]; -}; +__forceinline__ __device__ void Rand1(curandStatePhilox4_32_10_t *state, + float *data) { + data[0] = curand_uniform(state); +} + +__forceinline__ __device__ void Rand2(curandStatePhilox4_32_10_t *state, + float *data) { + data[0] = curand_uniform(state); + data[1] = curand_uniform(state); +} + +__forceinline__ __device__ void Rand4(curandStatePhilox4_32_10_t *state, + float *data) { + float4 rand4 = curand_uniform4(state); + data[0] = rand4.x; + data[1] = rand4.y; + data[2] = rand4.w; + data[3] = rand4.z; +} + +__forceinline__ __device__ void Rand8(curandStatePhilox4_32_10_t *state, + float *data) { + Rand4(state, data); + Rand4(state, data + 4); +} + +__forceinline__ __device__ void RandVec(curandStatePhilox4_32_10_t *state, + float *data, const int VecSize) { + if (VecSize == 1) { + Rand1(state, data); + } else if (VecSize == 2) { + Rand2(state, data); + } else if (VecSize == 4) { + Rand4(state, data); + } else if (VecSize == 8) { + Rand8(state, data); + } else { + return; + } +} } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index eda633380e07a2..bafc8c60040c1e 100644 
--- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -20,28 +20,19 @@ limitations under the License. */ namespace paddle { namespace operators { -namespace platform = paddle::platform; -namespace cg = cooperative_groups; -namespace memory = paddle::memory; - -/** - * @brief fused the add_bias, dropout, add residual into one operators - * - */ - -/********Forward**************/ /** - * @brief the fused function called by every thread + * @brief The fused function called by every thread + * VecSize can be 1, 2, 4 or 8 */ template -__forceinline__ __device__ void FusedResidualDropoutBiasVecOneThread( +__forceinline__ __device__ void FusedResidualDropoutBiasOneThread( const int row_id, const int col_id, const int cols, curandStatePhilox4_32_10_t *state, const float dropout_prob, const T factor, const T *src, const T *residual, const T *bias, T *dst, MaskType *mask, typename details::MPTypeTrait::Type *mean_val, typename details::MPTypeTrait::Type *var_val) { - using LoadT = AlignedVector; - using MaskLoadT = AlignedVector; + using LoadT = platform::CudaAlignedVector; + using MaskLoadT = platform::CudaAlignedVector; using U = typename details::MPTypeTrait::Type; T src_vec[VecSize]; @@ -60,16 +51,19 @@ __forceinline__ __device__ void FusedResidualDropoutBiasVecOneThread( LoadT *bias_value = bias != nullptr ? 
reinterpret_cast(&bias_vec) : nullptr; - if (bias != nullptr) + if (bias) { *bias_value = *reinterpret_cast(&bias[col_id]); + } + + float rand[VecSize]; + RandVec(state, rand, VecSize); - float4 rand = curand_uniform4(state); T dest_vec[VecSize]; MaskType mask_vec[VecSize]; #pragma unroll for (int ii = 0; ii < VecSize; ii++) { - mask_vec[ii] = (MaskType)((&rand.x)[ii] >= dropout_prob); + mask_vec[ii] = static_cast(rand[ii] >= dropout_prob); } #pragma unroll @@ -97,13 +91,13 @@ __forceinline__ __device__ void FusedResidualDropoutBiasVecOneThread( * the bias shape is (1, cols) */ template -__global__ void FusedResidualDropoutBiasVec(const size_t rows, - const size_t cols, uint64_t seed, - const float dropout_prob, - const bool is_upscale_in_train, - const T *src, const T *residual, - const T *bias, MaskType *mask, - T *dst, uint64_t increment) { +__global__ void FusedResidualDropoutBias(const size_t rows, const size_t cols, + uint64_t seed, + const float dropout_prob, + const bool is_upscale_in_train, + const T *src, const T *residual, + const T *bias, MaskType *mask, T *dst, + uint64_t increment) { int col_id = blockDim.x * blockIdx.x + threadIdx.x; int row_id = blockIdx.y; int idx = row_id * cols + col_id; @@ -117,9 +111,9 @@ __global__ void FusedResidualDropoutBiasVec(const size_t rows, for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { for (int i = col_id * VecSize; i < cols; i += blockDim.x * gridDim.x * VecSize) { - FusedResidualDropoutBiasVecOneThread( + FusedResidualDropoutBiasOneThread( r, i, cols, &state, dropout_prob, factor, src, residual, bias, dst, - mask, NULL, NULL); + mask, nullptr, nullptr); } } } @@ -127,12 +121,14 @@ __global__ void FusedResidualDropoutBiasVec(const size_t rows, /** * @brief the fused function called by every thread */ -template -__forceinline__ __device__ void FusedResidualDropoutBiasOnlyInferVecOneThread( +template +__forceinline__ __device__ void FusedResidualDropoutBiasOnlyInferOneThread( const int row_id, const int 
col_id, const int cols, const float dropout_prob, const T factor, const T *src, const T *residual, - const T *bias, T *dst, U *mean_val, U *var_val) { - using LoadT = AlignedVector; + const T *bias, T *dst, typename details::MPTypeTrait::Type *mean_val, + typename details::MPTypeTrait::Type *var_val) { + using LoadT = platform::CudaAlignedVector; + using U = typename details::MPTypeTrait::Type; T src_vec[VecSize]; T residual_vec[VecSize]; T bias_vec[VecSize]; @@ -149,15 +145,16 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOnlyInferVecOneThread( LoadT *bias_value = bias != nullptr ? reinterpret_cast(&bias_vec) : nullptr; - if (bias != nullptr) + if (bias) { *bias_value = *reinterpret_cast(&bias[col_id]); + } T dest_vec[VecSize]; #pragma unroll for (int ii = 0; ii < VecSize; ii++) { dest_vec[ii] = (src_vec[ii] + bias_vec[ii]) * factor + residual_vec[ii]; - if (layer_norm) { + if (ComputeLayerNorm) { U tmp = static_cast(dest_vec[ii]); *mean_val += tmp; *var_val += (tmp * tmp); @@ -175,7 +172,7 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOnlyInferVecOneThread( * the bias shape is (1, cols) */ template -__global__ void FusedResidualDropoutBiasOnlyInferVec( +__global__ void FusedResidualDropoutBiasOnlyInfer( const size_t rows, const size_t cols, const float dropout_prob, const bool is_upscale_in_train, const T *src, const T *residual, const T *bias, T *dst) { @@ -191,7 +188,7 @@ __global__ void FusedResidualDropoutBiasOnlyInferVec( for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { for (int i = col_id * VecSize; i < cols; i += blockDim.x * gridDim.x * VecSize) { - FusedResidualDropoutBiasOnlyInferVecOneThread( + FusedResidualDropoutBiasOnlyInferOneThread( r, i, cols, dropout_prob, factor, src, residual, bias, dst, nullptr, nullptr); } @@ -220,48 +217,46 @@ void LaunchResidualDropoutBias(const uint32_t rows, const uint32_t cols, return; } - const int VecSize = 4; - auto threads = Get1DBlocksAnd2DGrids(ctx, rows, cols); + const int 
VecSize = MAX_CACHE_BYTES / sizeof(T); + auto config = Get1DBlocksAnd2DGrids(ctx, rows, cols, VecSize); if (cols % VecSize != 0) { if (!is_test) { - FusedResidualDropoutBiasVec< - T, uint8_t, 1><<>>( + FusedResidualDropoutBias<<< + config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, bias, mask_data, dst, increment); } else { - FusedResidualDropoutBiasOnlyInferVec< - T, 1><<>>( + FusedResidualDropoutBiasOnlyInfer<<< + config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( rows, cols, dropout_prob, is_upscale_in_train, src, residual, bias, dst); } } else { if (!is_test) { - FusedResidualDropoutBiasVec< - T, uint8_t, - VecSize><<>>( + FusedResidualDropoutBias<<< + config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, bias, mask_data, dst, increment); } else { - FusedResidualDropoutBiasOnlyInferVec< - T, VecSize><<>>( + FusedResidualDropoutBiasOnlyInfer<<< + config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( rows, cols, dropout_prob, is_upscale_in_train, src, residual, bias, dst); } } } -/********Backward**************/ /* * @brief calculate the grad of no bias */ template -__global__ void FusedResidualDropoutGradVec(const T *dout, const MaskType *mask, - const T factor, const int64_t size, - T *dx) { +__global__ void FusedResidualDropoutGrad(const T *dout, const MaskType *mask, + const T factor, const int64_t size, + T *dx) { int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; - using LoadT = AlignedVector; - using MaskLoadT = AlignedVector; + using LoadT = platform::CudaAlignedVector; + using MaskLoadT = platform::CudaAlignedVector; for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) { T dout_vec[VecSize]; MaskType mask_vec[VecSize]; @@ -286,15 +281,17 @@ __global__ void FusedResidualDropoutGradVec(const T *dout, const MaskType *mask, * 2. 
save 128*8 temporary sum in 8*128 shared memory * 3. reduce the sum of 128 rows data by 8*VecSize warps */ -template -__global__ void FusedResidualDropoutBiasGradVec( - const T *dout, const MaskType *mask, const T factor, const int64_t rows, - const int64_t cols, T *dx, T *dbias) { +__global__ void FusedResidualDropoutBiasGrad(const T *dout, + const MaskType *mask, + const T factor, const int64_t rows, + const int64_t cols, T *dx, + T *dbias) { int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x; - using LoadT = AlignedVector; - using MaskLoadT = AlignedVector; + using LoadT = platform::CudaAlignedVector; + using MaskLoadT = platform::CudaAlignedVector; T tmp_sum[VecSize] = {static_cast(0)}; // calculate the dx and temporary sum @@ -321,7 +318,7 @@ __global__ void FusedResidualDropoutBiasGradVec( } // save temporary sum to cache and do transpose - __shared__ T cache[BLOCK_SIZE_X * VecSize][BLOCK_SIZE_Y]; + __shared__ T cache[BlockSizeX * VecSize][BlockSizeY]; for (int i = 0; i < VecSize; i++) { cache[threadIdx.x * VecSize + i][threadIdx.y] = tmp_sum[i]; } @@ -333,11 +330,11 @@ __global__ void FusedResidualDropoutBiasGradVec( int x = tid >> 5; // warp id int y = tid & 31; // thread id on warp 0~31 - // need BLOCK_SIZE_X * VecSize warps - if (x < BLOCK_SIZE_X * VecSize) { + // need BlockSizeX * VecSize warps + if (x < BlockSizeX * VecSize) { // reduce 128 to 32 #pragma unroll - for (int i = 0; i < (BLOCK_SIZE_Y >> 5); i++) { + for (int i = 0; i < (BlockSizeY >> 5); i++) { sum += cache[x][y + i * 32]; } } @@ -347,7 +344,7 @@ __global__ void FusedResidualDropoutBiasGradVec( // save sum to dbias int bias_id = blockIdx.x * blockDim.x * VecSize + x; - if (y == 0 && x < VecSize * BLOCK_SIZE_X && bias_id < cols) { + if (y == 0 && x < VecSize * BlockSizeX && bias_id < cols) { dbias[bias_id] = sum; } } @@ -370,7 +367,7 @@ void LaunchResidualDropoutBiasGrad(const T *dout, const MaskType *mask, factor = static_cast(1.0f); } - const int VecSize = 4; + const int VecSize = 
MAX_CACHE_BYTES / sizeof(T); if (dbias != nullptr) { int real_vec_size = VecSize; if (cols % VecSize != 0) { @@ -384,26 +381,27 @@ void LaunchResidualDropoutBiasGrad(const T *dout, const MaskType *mask, dim3 grid_dim(blocks, 1, 1); if (cols % VecSize == 0) { - FusedResidualDropoutBiasGradVec< + FusedResidualDropoutBiasGrad< T, MaskType, 8, 128, VecSize><<>>( dout, mask, factor, rows, cols, dx, dbias); } else { - FusedResidualDropoutBiasGradVec< - T, MaskType, 8, 128, 1><<>>( + FusedResidualDropoutBiasGrad<<>>( dout, mask, factor, rows, cols, dx, dbias); } } else { const uint64_t n = rows * cols; - auto threads = Get1DThreadsAndBlocks(ctx, n); if (n % VecSize == 0) { - FusedResidualDropoutGradVec< - T, MaskType, - VecSize><<>>( + platform::GpuLaunchConfig config = + platform::GetGpuLaunchConfig1D(ctx, n / VecSize); + FusedResidualDropoutGrad<<< + config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( dout, mask, factor, n, dx); } else { - FusedResidualDropoutGradVec< - T, MaskType, 1><<>>( + platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(ctx, n); + FusedResidualDropoutGrad<<< + config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( dout, mask, factor, n, dx); } } diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu index 14267974ff2657..b246d9bac9761b 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu @@ -39,7 +39,7 @@ struct TestFusedResidualDropoutBias { float dropout_prob; bool is_upscale_in_train; bool is_test; // default false, Set to true for inference only - bool hasbias = true; + bool has_bias = true; framework::Tensor src, residual, bias, out, mask; framework::Tensor dsrc, dbias; @@ -57,10 +57,10 @@ struct TestFusedResidualDropoutBias { dropout_prob = 0.0; is_upscale_in_train = false; is_test = false; - hasbias = true; + 
has_bias = true; platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto devicectx = pool.Get(place); - ctx = reinterpret_cast(devicectx); + auto device_ctx = pool.Get(place); + ctx = reinterpret_cast(device_ctx); } TestFusedResidualDropoutBias(int rows_, int cols_, uint64_t seed_ = 0, @@ -73,10 +73,10 @@ struct TestFusedResidualDropoutBias { dropout_prob = dropout_prob_; is_upscale_in_train = is_upscale_in_train_; is_test = is_test_; - hasbias = true; + has_bias = true; platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto devicectx = pool.Get(place); - ctx = reinterpret_cast(devicectx); + auto device_ctx = pool.Get(place); + ctx = reinterpret_cast(device_ctx); } ~TestFusedResidualDropoutBias() {} @@ -108,7 +108,7 @@ struct TestFusedResidualDropoutBias { src.Resize({rows, cols}); framework::TensorFromVector(residual_vec, *ctx, &residual); residual.Resize({rows, cols}); - if (hasbias) { + if (has_bias) { framework::TensorFromVector(bias_vec, *ctx, &bias); bias.Resize({cols}); } @@ -121,7 +121,7 @@ struct TestFusedResidualDropoutBias { dsrc.Resize({rows, cols}); dsrc.mutable_data(place); - if (hasbias) { + if (has_bias) { dbias.Resize({cols}); dbias.mutable_data(place); } @@ -130,7 +130,7 @@ struct TestFusedResidualDropoutBias { void BaseForward() { std::vector out1(rows * cols), out2(rows * cols); - if (hasbias) { + if (has_bias) { // add bias for (int i = 0; i < rows; i++) { for (int j = 0; j < cols; j++) { @@ -167,15 +167,16 @@ struct TestFusedResidualDropoutBias { } void FusedForward() { - auto threads = paddle::operators::Get1DBlocksAnd2DGrids( - *ctx, (uint64_t)rows, (uint64_t)cols); - const int VecSize = 4; - const int increment = - ((cols - 1) / (threads.first.x * threads.second.x * VecSize) + 1) * - VecSize; + const int VecSize = MAX_CACHE_BYTES / sizeof(T); + auto config = paddle::operators::Get1DBlocksAnd2DGrids( + *ctx, (uint64_t)rows, (uint64_t)cols, VecSize); + const int increment = ((cols - 1) 
/ (config.thread_per_block.x * + config.block_per_grid.x * VecSize) + + 1) * + VecSize; T *bias_ptr = nullptr; - if (hasbias) { + if (has_bias) { bias_ptr = bias.data(); } paddle::operators::LaunchResidualDropoutBias( @@ -191,7 +192,7 @@ struct TestFusedResidualDropoutBias { } T *bias_ptr = nullptr; - if (hasbias) { + if (has_bias) { bias_ptr = dbias.data(); } paddle::operators::LaunchResidualDropoutBiasGrad( @@ -237,7 +238,7 @@ struct TestFusedResidualDropoutBias { EXPECT_LT(std::abs(_dsrc[i] - correct_dsrc[i]), diff); } - if (hasbias) { + if (has_bias) { std::vector _dbias(cols); framework::TensorToVector(dbias, *ctx, &_dbias); ctx->Wait(); @@ -248,66 +249,39 @@ struct TestFusedResidualDropoutBias { } }; -TEST(FusedDropout, GPUFusedResidualDropoutBias) { +// test the shape and bias +template +static void BaseTest(const bool is_fp16 = false) { const int rows = 16; - const int cols = 16; - TestFusedResidualDropoutBias test(rows, cols); - test.Run(); - test.CheckOut(static_cast(1e-5)); - test.CheckGrad(static_cast(1e-5)); + std::vector cols_list = {16, 17}; + bool has_bias[2] = {true, false}; + T default_diff = static_cast(1e-5); + if (is_fp16) { + default_diff = static_cast(1e-2); + } + for (int i = 0; i < cols_list.size(); i++) { + for (int j = 0; j < 2; j++) { + TestFusedResidualDropoutBias test(rows, cols_list[i]); + test.has_bias = has_bias[j]; + test.Run(); + test.CheckOut(default_diff); + if (!is_fp16) { + test.CheckGrad(default_diff); + } + } + } } -TEST(FusedDropout, GPUFusedResidualDropoutBiasDouble) { - const int rows = 16; - const int cols = 16; - TestFusedResidualDropoutBias test(rows, cols); - test.Run(); - test.CheckOut(static_cast(1e-5)); - test.CheckGrad(static_cast(1e-5)); -} +TEST(FusedDropout, GPUFusedResidualDropoutBias) { BaseTest(); } + +TEST(FusedDropout, GPUFusedResidualDropoutBiasDouble) { BaseTest(); } // test fp16, For inference, check_grad is not required. 
ref: testdropout_op.py TEST(FusedDropout, GPUFusedResidualDropoutBiasFp16) { - const int rows = 16; - const int cols = 16; - TestFusedResidualDropoutBias test(rows, cols); - test.Run(); - test.CheckOut(static_cast(1e-2)); -} - -// test no bias and cols % 4 == 0 -TEST(FusedDropout, GPUFusedResidualDropoutBiasNoBias) { - const int rows = 16; - const int cols = 16; - TestFusedResidualDropoutBias test(rows, cols); - test.hasbias = false; - test.Run(); - test.CheckOut(static_cast(1e-5)); - test.CheckGrad(static_cast(1e-5)); -} - -// test no bias and cols % 4 != 0 -TEST(FusedDropout, GPUFusedResidualDropoutBiasNoBias2) { - const int rows = 16; - const int cols = 17; - TestFusedResidualDropoutBias test(rows, cols); - test.hasbias = false; - test.Run(); - test.CheckOut(static_cast(1e-5)); - test.CheckGrad(static_cast(1e-5)); + BaseTest(true); } -// test add bias and cols % 4 != 0 TEST(FusedDropout, GPUFusedResidualDropoutBias2) { - const int rows = 16; - const int cols = 17; - TestFusedResidualDropoutBias test(rows, cols); - test.Run(); - test.CheckOut(static_cast(1e-5)); - test.CheckGrad(static_cast(1e-5)); -} - -TEST(FusedDropout, GPUFusedResidualDropoutBias3) { const int rows = 16; const int cols = 16; TestFusedResidualDropoutBias test(rows, cols, 0, 1.0, false, false); @@ -316,16 +290,7 @@ TEST(FusedDropout, GPUFusedResidualDropoutBias3) { test.CheckGrad(static_cast(1e-5)); } -TEST(FusedDropout, GPUFusedResidualDropoutBias4) { - const int rows = 16; - const int cols = 16; - TestFusedResidualDropoutBias test(rows, cols, 0, 1.0, false, false); - test.Run(); - test.CheckOut(static_cast(1e-5)); - test.CheckGrad(static_cast(1e-5)); -} - -TEST(FusedDropout, GPUFusedResidualDropoutBias5) { +TEST(FusedDropout, GPUFusedResidualDropoutBias3) { const int rows = 16; const int cols = 16; TestFusedResidualDropoutBias test(rows, cols, 0, 1.0, true, false); @@ -334,7 +299,7 @@ TEST(FusedDropout, GPUFusedResidualDropoutBias5) { test.CheckGrad(static_cast(1e-5)); } -TEST(FusedDropout, 
GPUFusedResidualDropoutBias6) { +TEST(FusedDropout, GPUFusedResidualDropoutBias4) { const int rows = 16; const int cols = 16; TestFusedResidualDropoutBias test(rows, cols, 0, 0.35, true, true); @@ -343,7 +308,7 @@ TEST(FusedDropout, GPUFusedResidualDropoutBias6) { test.CheckGrad(static_cast(1e-5)); } -TEST(FusedDropout, GPUFusedResidualDropoutBias7) { +TEST(FusedDropout, GPUFusedResidualDropoutBias5) { const int rows = 16; const int cols = 16; TestFusedResidualDropoutBias test(rows, cols, 125, 0.0, false, false); From 934fcac6826d7559c71109df54ecdc7d3b89e81d Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Thu, 2 Sep 2021 10:26:33 +0000 Subject: [PATCH 14/19] use static_cast --- paddle/fluid/operators/fused/fused_dropout_common.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index 53ce76826a6793..f159d16d855a7e 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -41,9 +41,11 @@ inline platform::GpuLaunchConfig Get1DBlocksAnd2DGrids( const uint32_t cols, const int VecSize) { const uint32_t tmp_cols = cols / VecSize; int threads = std::max( - (uint32_t)32, std::min(tmp_cols, (uint32_t)ctx.GetMaxThreadsPerBlock())); - int blocks_x = std::max((uint32_t)1, (tmp_cols + threads - 1) / threads); - int blocks_y = std::max((uint32_t)1, rows); + static_cast(32), + std::min(tmp_cols, static_cast(ctx.GetMaxThreadsPerBlock()))); + const auto blocks_x = + std::max(static_cast(1), (tmp_cols + threads - 1) / threads); + const auto blocks_y = std::max(static_cast(1), rows); platform::GpuLaunchConfig config; config.block_per_grid.x = blocks_x; config.block_per_grid.y = blocks_y; From 44610ea2e651be31fbde3bddecd95ced540c6665 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 8 Sep 2021 02:00:15 +0000 Subject: [PATCH 15/19] fix the blocks for large shape --- 
.../operators/fused/fused_residual_dropout_bias.h | 5 ++--- .../fused/fused_residual_dropout_bias_test.cu | 10 ++++++++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index bafc8c60040c1e..952042d45f47c4 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -374,9 +374,8 @@ void LaunchResidualDropoutBiasGrad(const T *dout, const MaskType *mask, real_vec_size = 1; } auto threads = std::min(cols / real_vec_size, static_cast(8)); - auto blocks = std::max( - (uint32_t)1, std::min((cols / real_vec_size + threads - 1) / threads, - (uint32_t)ctx.GetSMCount())); + auto blocks = + std::max((uint32_t)1, cols / real_vec_size + threads - 1 / threads); dim3 block_dim(threads, 128, 1); dim3 grid_dim(blocks, 1, 1); diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu index b246d9bac9761b..e687823bc8158b 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu @@ -316,3 +316,13 @@ TEST(FusedDropout, GPUFusedResidualDropoutBias5) { test.CheckOut(static_cast(1e-5)); test.CheckGrad(static_cast(1e-5)); } + +// test large shape +TEST(FusedDropout, GPUFusedResidualDropoutBias6) { + const int rows = 256; + const int cols = 4096; + TestFusedResidualDropoutBias test(rows, cols); + test.Run(); + test.CheckOut(static_cast(1e-5)); + test.CheckGrad(static_cast(1e-3)); +} From 6c743f1004b46fb7b56334cd6e3d2af13fa9e477 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 8 Sep 2021 02:20:39 +0000 Subject: [PATCH 16/19] fix the merge error --- python/paddle/linalg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/linalg.py b/python/paddle/linalg.py index 
26bf4d6d3035b3..d5206e0417933b 100644 --- a/python/paddle/linalg.py +++ b/python/paddle/linalg.py @@ -23,7 +23,7 @@ 'cholesky', #noqa 'norm', 'inv', - 'multi_dot' + 'multi_dot', 'svd', 'matrix_power' ] From 1a83adb08e7cc1914b07b4e8ea3be6d521173a10 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 8 Sep 2021 03:43:53 +0000 Subject: [PATCH 17/19] merge upstream, and used new AlignedVector --- .../operators/fused/fused_dropout_common.h | 5 +- .../fused/fused_residual_dropout_bias.h | 252 ++++++------------ 2 files changed, 85 insertions(+), 172 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index f159d16d855a7e..24f6f53c63630e 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -20,16 +20,17 @@ limitations under the License. */ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/fast_divmod.h" #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/gpu_launch_config.h" namespace paddle { namespace operators { -#define MAX_CACHE_BYTES 16 +#define CACHE_LINE 128 +#define MAX_CACHE_BYTES (CACHE_LINE / CHAR_BIT) /** * get the threads for fused_residual_dropout_bias: diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index 952042d45f47c4..cd9dfd1c79ca8f 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -29,43 +29,45 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread( const int row_id, const int col_id, const int cols, curandStatePhilox4_32_10_t *state, const float dropout_prob, 
const T factor, const T *src, const T *residual, const T *bias, T *dst, MaskType *mask, - typename details::MPTypeTrait::Type *mean_val, + const bool is_test, typename details::MPTypeTrait::Type *mean_val, typename details::MPTypeTrait::Type *var_val) { - using LoadT = platform::CudaAlignedVector; - using MaskLoadT = platform::CudaAlignedVector; + using LoadT = platform::AlignedVector; + using StoreT = platform::AlignedVector; + using MaskStoreT = platform::AlignedVector; using U = typename details::MPTypeTrait::Type; - T src_vec[VecSize]; - T residual_vec[VecSize]; - T bias_vec[VecSize]; + LoadT src_vec; + LoadT residual_vec; + LoadT bias_vec; #pragma unroll for (int ii = 0; ii < VecSize; ii++) { bias_vec[ii] = static_cast(0); } // vectorize load data from global - LoadT *value = reinterpret_cast(&src_vec); - LoadT *residual_value = reinterpret_cast(&residual_vec); - *value = *reinterpret_cast(&src[row_id * cols + col_id]); - *residual_value = - *reinterpret_cast(&residual[row_id * cols + col_id]); - - LoadT *bias_value = - bias != nullptr ? 
reinterpret_cast(&bias_vec) : nullptr; + platform::Load(&src[row_id * cols + col_id], &src_vec); + platform::Load(&residual[row_id * cols + col_id], &residual_vec); + if (bias) { - *bias_value = *reinterpret_cast(&bias[col_id]); + platform::Load(&bias[col_id], &bias_vec); } - float rand[VecSize]; - RandVec(state, rand, VecSize); - - T dest_vec[VecSize]; - MaskType mask_vec[VecSize]; - + MaskStoreT mask_vec; + if (!is_test) { + float rand[VecSize]; + RandVec(state, rand, VecSize); #pragma unroll - for (int ii = 0; ii < VecSize; ii++) { - mask_vec[ii] = static_cast(rand[ii] >= dropout_prob); + for (int ii = 0; ii < VecSize; ii++) { + mask_vec[ii] = static_cast(rand[ii] >= dropout_prob); + } + } else { +#pragma unroll + for (int ii = 0; ii < VecSize; ii++) { + mask_vec[ii] = static_cast(1); + } } + StoreT dest_vec; + #pragma unroll for (int ii = 0; ii < VecSize; ii++) { dest_vec[ii] = @@ -79,25 +81,25 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread( } // store result to global - *(reinterpret_cast(&dst[row_id * cols + col_id])) = - *reinterpret_cast(&dest_vec[0]); - *(reinterpret_cast(&mask[row_id * cols + col_id])) = - *reinterpret_cast(&mask_vec[0]); + platform::Store(dest_vec, &dst[row_id * cols + col_id]); + if (!is_test) { + platform::Store(mask_vec, &mask[row_id * cols + col_id]); + } } /** * @brief dst = residual + dropout(src + bias); * the src, residual, mask and dst shape is (rows, cols) * the bias shape is (1, cols) + * is_test: only used in inference + * mask: can be null if is_test=true */ template -__global__ void FusedResidualDropoutBias(const size_t rows, const size_t cols, - uint64_t seed, - const float dropout_prob, - const bool is_upscale_in_train, - const T *src, const T *residual, - const T *bias, MaskType *mask, T *dst, - uint64_t increment) { +__global__ void FusedResidualDropoutBias( + const size_t rows, const size_t cols, uint64_t seed, + const float dropout_prob, const bool is_upscale_in_train, const T *src, + const T 
*residual, const T *bias, MaskType *mask, T *dst, + uint64_t increment, const bool is_test) { int col_id = blockDim.x * blockIdx.x + threadIdx.x; int row_id = blockIdx.y; int idx = row_id * cols + col_id; @@ -108,89 +110,18 @@ __global__ void FusedResidualDropoutBias(const size_t rows, const size_t cols, if (!is_upscale_in_train) { factor = static_cast(1.0f); } - for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { - for (int i = col_id * VecSize; i < cols; - i += blockDim.x * gridDim.x * VecSize) { - FusedResidualDropoutBiasOneThread( - r, i, cols, &state, dropout_prob, factor, src, residual, bias, dst, - mask, nullptr, nullptr); - } - } -} - -/** - * @brief the fused function called by every thread - */ -template -__forceinline__ __device__ void FusedResidualDropoutBiasOnlyInferOneThread( - const int row_id, const int col_id, const int cols, - const float dropout_prob, const T factor, const T *src, const T *residual, - const T *bias, T *dst, typename details::MPTypeTrait::Type *mean_val, - typename details::MPTypeTrait::Type *var_val) { - using LoadT = platform::CudaAlignedVector; - using U = typename details::MPTypeTrait::Type; - T src_vec[VecSize]; - T residual_vec[VecSize]; - T bias_vec[VecSize]; -#pragma unroll - for (int ii = 0; ii < VecSize; ii++) { - bias_vec[ii] = static_cast(0); - } - // vectorize load data from global - LoadT *value = reinterpret_cast(&src_vec); - LoadT *residual_value = reinterpret_cast(&residual_vec); - *value = *reinterpret_cast(&src[row_id * cols + col_id]); - *residual_value = - *reinterpret_cast(&residual[row_id * cols + col_id]); - - LoadT *bias_value = - bias != nullptr ? 
reinterpret_cast(&bias_vec) : nullptr; - if (bias) { - *bias_value = *reinterpret_cast(&bias[col_id]); - } - - T dest_vec[VecSize]; - -#pragma unroll - for (int ii = 0; ii < VecSize; ii++) { - dest_vec[ii] = (src_vec[ii] + bias_vec[ii]) * factor + residual_vec[ii]; - if (ComputeLayerNorm) { - U tmp = static_cast(dest_vec[ii]); - *mean_val += tmp; - *var_val += (tmp * tmp); + if (is_test) { + factor = static_cast(1.0f - dropout_prob); + if (is_upscale_in_train) { + factor = static_cast(1.0f); } } - - // store result to global - *(reinterpret_cast(&dst[row_id * cols + col_id])) = - *reinterpret_cast(&dest_vec[0]); -} - -/** - * @brief for dropout's param is_test = true, only used in inference - * the src, residual and dst shape is (rows, cols) - * the bias shape is (1, cols) - */ -template -__global__ void FusedResidualDropoutBiasOnlyInfer( - const size_t rows, const size_t cols, const float dropout_prob, - const bool is_upscale_in_train, const T *src, const T *residual, - const T *bias, T *dst) { - int col_id = blockDim.x * blockIdx.x + threadIdx.x; - int row_id = blockIdx.y; - int idx = row_id * cols + col_id; - - T factor = static_cast(1.0f - dropout_prob); - if (is_upscale_in_train) { - factor = static_cast(1.0f); - } - for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) { for (int i = col_id * VecSize; i < cols; i += blockDim.x * gridDim.x * VecSize) { - FusedResidualDropoutBiasOnlyInferOneThread( - r, i, cols, dropout_prob, factor, src, residual, bias, dst, nullptr, - nullptr); + FusedResidualDropoutBiasOneThread( + r, i, cols, &state, dropout_prob, factor, src, residual, bias, dst, + mask, is_test, nullptr, nullptr); } } } @@ -212,37 +143,27 @@ void LaunchResidualDropoutBias(const uint32_t rows, const uint32_t cols, auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); memory::Copy(cuda_place, dst, cuda_place, residual, rows * cols * sizeof(T), ctx.stream()); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync( - mask_data, 0, rows * cols * 
sizeof(MaskType), ctx.stream())); + if (!is_test) { + PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync( + mask_data, 0, rows * cols * sizeof(MaskType), ctx.stream())); + } return; } const int VecSize = MAX_CACHE_BYTES / sizeof(T); - auto config = Get1DBlocksAnd2DGrids(ctx, rows, cols, VecSize); - if (cols % VecSize != 0) { - if (!is_test) { - FusedResidualDropoutBias<<< - config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( - rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, - bias, mask_data, dst, increment); - } else { - FusedResidualDropoutBiasOnlyInfer<<< - config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( - rows, cols, dropout_prob, is_upscale_in_train, src, residual, bias, - dst); - } + const int real_vec_size = cols % VecSize == 0 ? VecSize : 1; + auto config = Get1DBlocksAnd2DGrids(ctx, rows, cols, real_vec_size); + if (cols % VecSize == 0) { + FusedResidualDropoutBias<<< + config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( + rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, + bias, mask_data, dst, increment, is_test); } else { - if (!is_test) { - FusedResidualDropoutBias<<< - config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( - rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, - bias, mask_data, dst, increment); - } else { - FusedResidualDropoutBiasOnlyInfer<<< - config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( - rows, cols, dropout_prob, is_upscale_in_train, src, residual, bias, - dst); - } + FusedResidualDropoutBias< + T, uint8_t, + 1><<>>( + rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, + bias, mask_data, dst, increment, is_test); } } @@ -255,23 +176,21 @@ __global__ void FusedResidualDropoutGrad(const T *dout, const MaskType *mask, T *dx) { int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; - using LoadT = platform::CudaAlignedVector; - using MaskLoadT = platform::CudaAlignedVector; + using LoadT 
= platform::AlignedVector; + using StoreT = platform::AlignedVector; + using MaskLoadT = platform::AlignedVector; for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) { - T dout_vec[VecSize]; - MaskType mask_vec[VecSize]; - LoadT *dout_value = reinterpret_cast(&dout_vec); - MaskLoadT *mask_value = reinterpret_cast(&mask_vec); - *dout_value = *reinterpret_cast(&dout[i]); - *mask_value = *reinterpret_cast(&mask[i]); - - T dx_vec[VecSize]; + LoadT dout_vec; + MaskLoadT mask_vec; + platform::Load(&dout[i], &dout_vec); + platform::Load(&mask[i], &mask_vec); + + StoreT dx_vec; #pragma unroll for (int ii = 0; ii < VecSize; ii++) { dx_vec[ii] = dout_vec[ii] * static_cast(mask_vec[ii]) * factor; } - *(reinterpret_cast(&dx[i])) = - *reinterpret_cast(&dx_vec[0]); + platform::Store(dx_vec, &dx[i]); } } @@ -290,21 +209,20 @@ __global__ void FusedResidualDropoutBiasGrad(const T *dout, T *dbias) { int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x; - using LoadT = platform::CudaAlignedVector; - using MaskLoadT = platform::CudaAlignedVector; + using LoadT = platform::AlignedVector; + using StoreT = platform::AlignedVector; + using MaskLoadT = platform::AlignedVector; T tmp_sum[VecSize] = {static_cast(0)}; // calculate the dx and temporary sum if (col_id * VecSize < cols) { for (int row_id = threadIdx.y; row_id < rows; row_id += blockDim.y) { int index = row_id * cols + col_id * VecSize; - T out_vec[VecSize]; - MaskType mask_vec[VecSize]; - T dx_vec[VecSize]; - LoadT *out_value = reinterpret_cast(&out_vec); - MaskLoadT *mask_value = reinterpret_cast(&mask_vec); - *out_value = *reinterpret_cast(&dout[index]); - *mask_value = *reinterpret_cast(&mask[index]); + LoadT out_vec; + MaskLoadT mask_vec; + StoreT dx_vec; + platform::Load(&dout[index], &out_vec); + platform::Load(&mask[index], &mask_vec); #pragma unroll for (int i = 0; i < VecSize; i++) { @@ -312,8 +230,7 @@ __global__ void FusedResidualDropoutBiasGrad(const T *dout, tmp_sum[i] += out_vec[i]; } - 
*(reinterpret_cast(&dx[index])) = - *reinterpret_cast(&dx_vec[0]); + platform::Store(dx_vec, &dx[index]); } } @@ -368,17 +285,13 @@ void LaunchResidualDropoutBiasGrad(const T *dout, const MaskType *mask, } const int VecSize = MAX_CACHE_BYTES / sizeof(T); + int real_vec_size = cols % VecSize == 0 ? VecSize : 1; if (dbias != nullptr) { - int real_vec_size = VecSize; - if (cols % VecSize != 0) { - real_vec_size = 1; - } auto threads = std::min(cols / real_vec_size, static_cast(8)); auto blocks = - std::max((uint32_t)1, cols / real_vec_size + threads - 1 / threads); + std::max((uint32_t)1, (cols / real_vec_size + threads - 1) / threads); dim3 block_dim(threads, 128, 1); dim3 grid_dim(blocks, 1, 1); - if (cols % VecSize == 0) { FusedResidualDropoutBiasGrad< T, MaskType, 8, 128, @@ -391,14 +304,13 @@ void LaunchResidualDropoutBiasGrad(const T *dout, const MaskType *mask, } } else { const uint64_t n = rows * cols; + platform::GpuLaunchConfig config = + platform::GetGpuLaunchConfig1D(ctx, n / real_vec_size); if (n % VecSize == 0) { - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx, n / VecSize); FusedResidualDropoutGrad<<< config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( dout, mask, factor, n, dx); } else { - platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(ctx, n); FusedResidualDropoutGrad<<< config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( dout, mask, factor, n, dx); From eda910f1551c466099da3f731f183964d8fcc8a8 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Thu, 9 Sep 2021 10:30:10 +0000 Subject: [PATCH 18/19] replace fluid with paddle --- .../tests/unittests/test_multi_dot_op.py | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py index e128939d9d5ecc..97047b1ae0e5e0 100644 --- a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py @@ -18,8 +18,6 @@ from numpy.linalg import multi_dot from op_test import OpTest import paddle -from paddle.fluid import Program, program_guard -import paddle.fluid as fluid paddle.enable_static() @@ -196,44 +194,46 @@ def get_inputs_and_outputs(self): #####python API test####### class TestMultiDotOpError(unittest.TestCase): def test_errors(self): - with program_guard(Program(), Program()): + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): # The inputs type of multi_dot must be list matrix. input1 = 12 self.assertRaises(TypeError, paddle.multi_dot, [input1, input1]) # The inputs dtype of multi_dot must be float64, float64 or float16. - input2 = fluid.layers.data( + input2 = paddle.static.data( name='input2', shape=[10, 10], dtype="int32") self.assertRaises(TypeError, paddle.multi_dot, [input2, input2]) # the number of tensor must be larger than 1 - x0 = fluid.data(name='x0', shape=[3, 2], dtype="float64") + x0 = paddle.static.data(name='x0', shape=[3, 2], dtype="float64") self.assertRaises(ValueError, paddle.multi_dot, [x0]) #the first tensor must be 1D or 2D - x1 = fluid.data(name='x1', shape=[3, 2, 3], dtype="float64") - x2 = fluid.data(name='x2', shape=[3, 2], dtype="float64") + x1 = paddle.static.data(name='x1', shape=[3, 2, 3], dtype="float64") + x2 = paddle.static.data(name='x2', shape=[3, 2], dtype="float64") self.assertRaises(ValueError, paddle.multi_dot, [x1, x2]) #the last tensor must be 1D or 2D - x3 = fluid.data(name='x3', shape=[3, 2], dtype="float64") - x4 = fluid.data(name='x4', shape=[3, 2, 2], dtype="float64") + x3 = paddle.static.data(name='x3', shape=[3, 2], dtype="float64") + x4 = paddle.static.data(name='x4', shape=[3, 2, 2], dtype="float64") self.assertRaises(ValueError, paddle.multi_dot, [x3, x4]) #the tensor must be 2D, except first and last tensor - x5 = fluid.data(name='x5', shape=[3, 2], dtype="float64") - x6 = fluid.data(name='x6', shape=[2], 
dtype="float64") - x7 = fluid.data(name='x7', shape=[2, 2], dtype="float64") + x5 = paddle.static.data(name='x5', shape=[3, 2], dtype="float64") + x6 = paddle.static.data(name='x6', shape=[2], dtype="float64") + x7 = paddle.static.data(name='x7', shape=[2, 2], dtype="float64") self.assertRaises(ValueError, paddle.multi_dot, [x5, x6, x7]) class APITestMultiDot(unittest.TestCase): def test_out(self): - with fluid.program_guard(fluid.Program()): - x0 = fluid.data(name='x0', shape=[3, 2], dtype="float64") - x1 = fluid.data(name='x1', shape=[2, 3], dtype='float64') + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x0 = paddle.static.data(name='x0', shape=[3, 2], dtype="float64") + x1 = paddle.static.data(name='x1', shape=[2, 3], dtype='float64') result = paddle.multi_dot([x0, x1]) - exe = fluid.Executor(fluid.CPUPlace()) + exe = paddle.static.Executor(paddle.CPUPlace()) data1 = np.random.rand(3, 2).astype("float64") data2 = np.random.rand(2, 3).astype("float64") np_res = exe.run(feed={'x0': data1, @@ -248,14 +248,14 @@ def test_out(self): {}\n{}, check diff!".format(np_res, expected_result)) def test_dygraph_without_out(self): - device = fluid.CPUPlace() - with fluid.dygraph.guard(device): - input_array1 = np.random.rand(3, 4).astype("float64") - input_array2 = np.random.rand(4, 3).astype("float64") - data1 = fluid.dygraph.to_variable(input_array1) - data2 = fluid.dygraph.to_variable(input_array2) - out = paddle.multi_dot([data1, data2]) - expected_result = np.linalg.multi_dot([input_array1, input_array2]) + paddle.disable_static() + device = paddle.CPUPlace() + input_array1 = np.random.rand(3, 4).astype("float64") + input_array2 = np.random.rand(4, 3).astype("float64") + data1 = paddle.to_tensor(input_array1) + data2 = paddle.to_tensor(input_array2) + out = paddle.multi_dot([data1, data2]) + expected_result = np.linalg.multi_dot([input_array1, input_array2]) self.assertTrue(np.allclose(expected_result, out.numpy())) From 
b647c1f357ade554cc801d08c4b3c32415aeca5c Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 13 Sep 2021 11:09:18 +0000 Subject: [PATCH 19/19] modify code according to the review --- paddle/fluid/operators/multi_dot_op.cc | 81 ++++---------------------- 1 file changed, 10 insertions(+), 71 deletions(-) diff --git a/paddle/fluid/operators/multi_dot_op.cc b/paddle/fluid/operators/multi_dot_op.cc index 3929aebc4382e7..2d06170d34a91e 100644 --- a/paddle/fluid/operators/multi_dot_op.cc +++ b/paddle/fluid/operators/multi_dot_op.cc @@ -22,10 +22,6 @@ limitations under the License. */ #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/operators/utils.h" -#ifdef PADDLE_WITH_MKLDNN -#include "paddle/fluid/platform/mkldnn_helper.h" -#endif - namespace paddle { namespace operators { using Tensor = framework::Tensor; @@ -41,11 +37,11 @@ inline framework::DDim ComputeAndCheckShape( bool is_vector = false; framework::DDim out_dim; - if (first_dim.size() > 2) { - PADDLE_THROW(platform::errors::InvalidArgument( - "multi_dot: the first input tensor must be 1D or 2D but got[%d]!", - static_cast(first_dim.size()))); - } + PADDLE_ENFORCE_LT( + first_dim.size(), static_cast(3), + platform::errors::InvalidArgument( + "multi_dot: the first input tensor must be 1D or 2D but got[%d]!", + static_cast(first_dim.size()))); // If the first tensor is 1D of size n view it as a row vector (1, n) if (first_dim.size() == 1) { @@ -54,11 +50,11 @@ inline framework::DDim ComputeAndCheckShape( } auto last_dim = inputs_dims[n - 1]; - if (last_dim.size() > 2) { - PADDLE_THROW(platform::errors::InvalidArgument( - "the last input tensor of multi_dot op must be 1D or 2D but got[%d]!", - static_cast(last_dim.size()))); - } + PADDLE_ENFORCE_LT( + last_dim.size(), static_cast(3), + platform::errors::InvalidArgument( + "the last input tensor of multi_dot must be 1D or 2D but got[%d]!", + static_cast(last_dim.size()))); // If the last tensor is 1D of size n view it as a column vector (n, 1) 
if (last_dim.size() == 1) { @@ -226,10 +222,6 @@ class MultiDotOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "The input tensors of multi_dot operator.").AsDuplicable(); AddOutput("Out", "The output tensor of multi_dot operator"); - AddAttr( - "use_mkldnn", - "(bool, default false) Indicates if MKL-DNN kernel will be used") - .SetDefault(false); AddComment(R"DOC( Compute the dot product of two or more arrays in a single function call, while automatically selecting the fastest evaluation order. @@ -259,44 +251,6 @@ class MultiDotOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", out_dims); ctx->ShareLoD("X", "Out"); } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto inputs = ctx.MultiInput("X"); - auto input_data_type = framework::proto::VarType::Type(0); - for (auto* input : inputs) { - if (!input->IsInitialized()) { - PADDLE_THROW(platform::errors::InvalidArgument( - "The inputs of multi_dot OP are Empty!")); - break; - } - } - input_data_type = inputs[0]->type(); - -#ifdef PADDLE_WITH_MKLDNN - using mkldnn::memory; - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif - return framework::OpKernelType(input_data_type, ctx.GetPlace()); - } - - framework::OpKernelType GetKernelTypeForVar( - const std::string& var_name, const framework::Tensor& tensor, - const framework::OpKernelType& expected_kernel_type) const { - if (framework::IsComplexType(expected_kernel_type.data_type_)) { - // only promote inputs’s types when contains complex input - return framework::OpKernelType(tensor.type(), tensor.place(), - tensor.layout()); - } else { - return framework::OpKernelType(expected_kernel_type.data_type_, - tensor.place(), tensor.layout()); - } - } }; /** @@ -379,21 +333,6 @@ class 
MultiDotOpGrad : public framework::OperatorWithKernel { ctx->SetOutputsDim(out_x_g_n, ins_dims); ctx->ShareAllLoD(in_x, out_x_g_n); } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.GetPlace()); - } - - framework::OpKernelType GetKernelTypeForVar( - const std::string& var_name, const Tensor& tensor, - const framework::OpKernelType& expected_kernel_type) const override { - return framework::OpKernelType(expected_kernel_type.data_type_, - tensor.place(), tensor.layout()); - } }; template