Commit 4c2d3cb

add some optimization for general_permute transpose
1 parent 225e1bb commit 4c2d3cb

File tree: 4 files changed, +36 -41 lines changed

paddle/fluid/operators/transpose_op.cu.h

Lines changed: 18 additions & 27 deletions
@@ -720,15 +720,6 @@ class IdxAndOffsetHelper {
     index_helper = IdxHelper<N, T>(dims);
   }
 
-  template <typename U>
-  explicit IdxAndOffsetHelper(const U* dims) {
-    T temp_dims[N];
-    for (int i = 0; i < N; ++i) {
-      temp_dims[i] = static_cast<T>(dims[i]);
-    }
-    index_helper = IdxHelper<N, T>(temp_dims);
-  }
-
   __device__ inline T IndexToOffset(const T* index) const {
     T offset = 0;
 #pragma unroll
@@ -756,13 +747,15 @@ struct PermuteParams {
 
   explicit PermuteParams(const std::vector<int>& dims,
                          const std::vector<int>& perm_) {
-    size_t dst_dims[Rank];
-    for (size_t i = 0; i < Rank; ++i) {
+    IndexT dst_dims[Rank];
+    IndexT src_dims[Rank];
+    for (auto i = 0; i < Rank; ++i) {
+      src_dims[i] = dims[i];
       dst_dims[i] = dims[perm_[i]];
       perm[i] = perm_[i];
     }
     dst_index_helper = IdxAndOffsetHelper<IndexT, Rank>(dst_dims);
-    src_index_helper = IdxAndOffsetHelper<IndexT, Rank>(dims.data());
+    src_index_helper = IdxAndOffsetHelper<IndexT, Rank>(src_dims);
   }
 };
 
@@ -915,10 +908,9 @@ template <typename T,
           typename IndexT,
           int ReadSize,
           bool IsVecWrite,
-          int WritSize = IsVecWrite ? (sizeof(T) < sizeof(float)
-                                           ? sizeof(float) / sizeof(T)
-                                           : 1)
-                                    : 1>
+          int WritSize = (IsVecWrite && (sizeof(T) < sizeof(float)))
+                             ? sizeof(float) / sizeof(T)
+                             : 1>
 __global__ void BatchTransposeKernel(const T* __restrict__ src_data,
                                      T* dst_data,
                                      IndexT rows,
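
The default for WritSize previously nested two ternaries; the new form folds the IsVecWrite flag into a single condition, so the intent reads directly: pack sizeof(float)/sizeof(T) elements per write only when vectorized writes are requested and the element type is narrower than float. A minimal standalone sketch of the same selection logic (not Paddle code), using uint16_t as a stand-in for a 2-byte type such as half:

#include <cstdint>
#include <cstdio>

// Illustration of how the simplified WritSize default resolves.
template <typename T, bool IsVecWrite>
constexpr int ResolveWritSize() {
  return (IsVecWrite && (sizeof(T) < sizeof(float)))
             ? static_cast<int>(sizeof(float) / sizeof(T))
             : 1;
}

int main() {
  printf("uint16_t, vec write: %d\n", ResolveWritSize<uint16_t, true>());   // 2
  printf("uint16_t, no vec   : %d\n", ResolveWritSize<uint16_t, false>());  // 1
  printf("float,    vec write: %d\n", ResolveWritSize<float, true>());      // 1
  printf("double,   vec write: %d\n", ResolveWritSize<double, true>());     // 1
  return 0;
}
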
@@ -1000,22 +992,20 @@ inline void LaunchTransposeKernel(const phi::GPUContext& ctx,
   const int rank = dims.size();
   IndexT num_batch = (rank == 2) ? 1 : dims[0];
   IndexT rows = dims[rank - 2];
+  IndexT cols = dims[rank - 1] / VecSize;
+  IndexT num_tile_cols = GETTILESIZE(cols, kTileSize);
 
   int write_size = 1;
   bool is_write_size = sizeof(T) < sizeof(float)
                            ? (rows % (sizeof(float) / sizeof(T)) ? false : true)
                            : false;
   if (is_write_size) {
-    is_write_size = (num_batch * ((rows + kTileSize - 1) & ~(kTileSize - 1)) /
-                     kTileSize) >= ctx.GetSMCount();
+    is_write_size = (num_batch * num_tile_cols * GETTILESIZE(rows, kTileSize)) >
+                    ctx.GetSMCount();
     write_size = is_write_size ? sizeof(float) / sizeof(T) : 1;
   }
 
-  IndexT cols = dims[rank - 1] / VecSize;
-  IndexT num_tile_cols = (cols + kTileSize - 1) / kTileSize;
-  IndexT num_tile_rows =
-      (rows + kTileSize * write_size - 1) / (kTileSize * write_size);
-
+  IndexT num_tile_rows = GETTILESIZE(rows, (kTileSize * write_size));
   dim3 blocks(num_tile_cols, num_tile_rows, num_batch);
   dim3 threads(kTileSize, kBlockRows, 1);
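
The rewritten gate counts every tile in the grid (batches x column tiles x row tiles) against the SM count before enabling vectorized writes, instead of only counting row tiles, and the column-tile math is hoisted so it is computed once. A minimal standalone sketch of that launch math (not Paddle code), assuming kTileSize = 32, a 2-byte element type (so the write_size candidate is 2) and a hypothetical sm_count of 80:

#include <cstdio>

// GETTILESIZE is the round-up division macro added in transpose_op.h.
#define GETTILESIZE(LEN, ALIGN) ((LEN + (ALIGN - 1)) & ~(ALIGN - 1)) / ALIGN
constexpr int kTileSize = 32;

int main() {
  const int num_batch = 16, rows = 64, cols = 128, sm_count = 80;
  const int num_tile_cols = GETTILESIZE(cols, kTileSize);
  // Vectorized writes only pay off when there are more tiles than SMs.
  const bool is_write_size =
      (num_batch * num_tile_cols * GETTILESIZE(rows, kTileSize)) > sm_count;
  const int write_size = is_write_size ? 2 : 1;
  const int num_tile_rows = GETTILESIZE(rows, (kTileSize * write_size));
  printf("grid = (%d, %d, %d), write_size = %d\n",
         num_tile_cols, num_tile_rows, num_batch, write_size);  // (4, 1, 16), 2
  return 0;
}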

@@ -1174,14 +1164,15 @@ void TransposeGPUKernelDriver(const phi::GPUContext& ctx,
         phi::vectorize<int>(in.dims()),
         in.data<T>(),
         out->data<T>());
-    auto* tuner = phi::autotune::MakeTransposeTuner<T>(TransposeWithSimple<T>);
+    auto* tuner = phi::autotune::MakeTransposeTuner<T>(PermuteAndTranspose<T>);
     tuner->AddCallBack(PermuteWithEigen<T>);
-    tuner->AddCallBack(PermuteAndTranspose<T>);
+    tuner->AddCallBack(TransposeWithSimple<T>);
 
     size_t key = phi::autotune::TransposeKey(
-        phi::vectorize(in.dims()),
-        perm,
+        simplifier.GetSrcDims(),
+        simplifier.GetPerm(),
         paddle::experimental::CppTypeToDataType<T>::Type());
+
     tuner->Run(ctx,
                phi::autotune::AlgorithmType::kTranspose,
                key,
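
PermuteAndTranspose becomes the tuner's baseline callback, and the autotune cache key is now built from the simplified dims and perm returned by the DimsSimplifier rather than the raw input shape, presumably so that inputs which simplify to the same shape and permutation share one autotuned choice. A generic sketch of combining dims, perm and a dtype id into such a key; this is not the actual phi::autotune::TransposeKey implementation, and MakeKey is a hypothetical name used only for illustration:

#include <cstddef>
#include <cstdint>
#include <functional>
#include <vector>

// Hash-combine the simplified dims, the perm and a dtype identifier into a
// single cache key (illustrative only).
size_t MakeKey(const std::vector<int32_t>& dims,
               const std::vector<int32_t>& perm,
               int dtype_id) {
  size_t seed = std::hash<int>()(dtype_id);
  auto combine = [&seed](int32_t v) {
    seed ^= std::hash<int32_t>()(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
  };
  for (int32_t d : dims) combine(d);
  for (int32_t p : perm) combine(p);
  return seed;
}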

paddle/fluid/operators/transpose_op.h

Lines changed: 16 additions & 12 deletions
@@ -73,6 +73,8 @@ enum PermuteType {
 constexpr int kBlockRows = 16;
 constexpr int kTileSize = 32;
 
+#define GETTILESIZE(LEN, ALIGN) ((LEN + (ALIGN - 1)) & ~(ALIGN - 1)) / ALIGN
+
 // Simplify the input dims and permute dims if possible.
 template <typename T>
 class DimsSimplifier {
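
GETTILESIZE rounds LEN up to the next multiple of ALIGN with a bit mask and then divides, i.e. a ceiling division that requires ALIGN to be a power of two (kTileSize = 32 qualifies). A quick standalone check of that equivalence:

#include <cassert>
#include <cstdio>

// For power-of-two ALIGN, the bit trick equals (LEN + ALIGN - 1) / ALIGN.
#define GETTILESIZE(LEN, ALIGN) ((LEN + (ALIGN - 1)) & ~(ALIGN - 1)) / ALIGN

int main() {
  for (int len = 1; len <= 256; ++len) {
    assert(GETTILESIZE(len, 32) == (len + 31) / 32);
  }
  printf("GETTILESIZE(100, 32) = %d tiles\n", GETTILESIZE(100, 32));  // 4
  return 0;
}

Because the trailing division by ALIGN is not parenthesized inside the macro body, compound arguments must be parenthesized by the caller, which is why transpose_op.cu.h passes (kTileSize * write_size) rather than kTileSize * write_size.
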
@@ -182,38 +184,40 @@ class DimsSimplifier {
   }
 
   int GetPermVecSize(const int sm_count, const T* src, T* dst) {
-    // For gerneal_permute kernel, there is good chance for
-    // vectorized write.
+    // For gerneal_permute kernel, there is chance for vectorized write.
     type_ = PermuteType::kNormalPermute;
     int vec_size = phi::GetVectorizedSize<T>(dst);
 
-    // While the last dim is fixed, there is good chance for
-    // both vectorized read and write.
+    // While the last dim is fixed, there is chance for vectorized IO.
     if (perm_[rank_ - 1] == rank_ - 1) {
       int tmp_size = std::min(vec_size, phi::GetVectorizedSize<T>(src));
-      tmp_size = GetDimVesSize(tmp_size, src_dims[rank_ - 1]);
+      tmp_size = GetDimVecSize(tmp_size, src_dims[rank_ - 1]);
       if (tmp_size > 1) {
         type_ = kVecPermute;
         vec_size = tmp_size;
       }
     }
 
-    // Once only transpose at the last 2 dims, there is good
-    // chance for vectorized read.
+    // Once only transpose at the last 2 dims.
    if ((rank_ == 2 && perm_[1] == 0 && perm_[0] == 1) ||
         (rank_ == 3 && perm_[2] == 1 && perm_[1] == 2)) {
       type_ = PermuteType::kTranspose;
-      // With bytes limitation of shared_memory, the VecSize shall be
-      // restricted for the type whose byte-size is less than 8 (double).
+      // With bytes limitation of shared_memory, the VecSize
+      // shall be restricted to sizeof(float).
       int tmp_vec = std::min(vec_size, phi::GetVectorizedSize<T>(src));
-      vec_size =
-          sizeof(T) > 4 ? 1 : GetDimVesSize(tmp_vec, src_dims[rank_ - 1]);
+      vec_size = sizeof(T) > sizeof(float)
+                     ? 1
+                     : GetDimVecSize(tmp_vec, src_dims[rank_ - 1]);
+      const int tile_size = (rank_ == 2 ? 1 : src_dims[0]) *
+                            GETTILESIZE(src_dims[rank_ - 1], kTileSize) *
+                            GETTILESIZE(src_dims[rank_ - 2], kTileSize);
+      vec_size = tile_size < sm_count ? 1 : vec_size;
     }
     return vec_size;
   }
 
   // To find if highest common divisor and make it as vec_size.
-  int GetDimVesSize(const int vec_size, const size_t target_dim) {
+  int GetDimVecSize(const int vec_size, const size_t target_dim) {
     int dim_vec_size = 1;
     for (auto size = vec_size; size > 0; size /= 2) {
       if (target_dim % size == 0) {
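
GetDimVesSize is renamed to GetDimVecSize, and the transpose branch now also compares the total tile count against sm_count before keeping a vectorized width; small grids fall back to vec_size = 1, presumably to keep enough thread blocks to occupy the SMs. The helper's body is truncated in the diff, so the sketch below (not Paddle code) completes it from the visible loop and the comment above it; the statements inside the if-branch are an assumption:

#include <cstdio>

// Walk down from the candidate width in powers of two until one divides the
// target dimension, returning the largest such divisor.
int GetDimVecSize(const int vec_size, const size_t target_dim) {
  int dim_vec_size = 1;
  for (auto size = vec_size; size > 0; size /= 2) {
    if (target_dim % size == 0) {
      dim_vec_size = size;  // assumed: record the first (largest) divisor
      break;                // assumed: and stop searching
    }
  }
  return dim_vec_size;
}

int main() {
  printf("%d\n", GetDimVecSize(4, 128));  // 4: last dim divisible by 4
  printf("%d\n", GetDimVecSize(4, 6));    // 2: falls back to a narrower width
  printf("%d\n", GetDimVecSize(4, 7));    // 1: odd dim disables vectorization
  return 0;
}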

paddle/phi/kernels/autotune/cache.cc

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@
 namespace phi {
 namespace autotune {
 
-size_t TransposeKey(const std::vector<int64_t>& x_dims,
+size_t TransposeKey(const std::vector<int32_t>& x_dims,
                     const std::vector<int32_t>& perm,
                     phi::DataType dtype) {
   const auto rank = perm.size();

paddle/phi/kernels/autotune/cache.h

Lines changed: 1 addition & 1 deletion
@@ -196,7 +196,7 @@ class CudnnAlgorithmsCacheMap {
   int64_t cache_misses_{0};
 };
 
-size_t TransposeKey(const std::vector<int64_t>& x_dims,
+size_t TransposeKey(const std::vector<int32_t>& x_dims,
                     const std::vector<int32_t>& perm,
                     phi::DataType dtype);
