Merged
3 changes: 2 additions & 1 deletion paddle/fluid/operators/fused/CMakeLists.txt
@@ -15,7 +15,8 @@ register_operators(EXCLUDES
fusion_group_op
fusion_gru_op
fusion_lstm_op
fused_bn_add_activation_op)
fused_bn_add_activation_op
fused_transformer_op)

# fusion_gru_op does not have CUDA kernel
op_library(fusion_gru_op)
161 changes: 161 additions & 0 deletions paddle/fluid/operators/fused/fused_transformer_op.cc
@@ -0,0 +1,161 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/fused/fused_transformer_op.h"
#include <string>

namespace paddle {
namespace operators {

// constructor and init
template <typename T>
FusedTransformerEncoderLayer<T>::FusedTransformerEncoderLayer(
int batch_size_, int max_seq_len_, int dim_embed_, int dim_feedforward_,
int num_head_, float dropout_, float act_dropout_, float attn_dropout_,
std::string act_method_, bool normalize_pre_or_post_) {
// configurations
batch_size = batch_size_;
max_seq_len = max_seq_len_;
dim_embed = dim_embed_;
dim_feedforward = dim_feedforward_;
num_head = num_head_;
head_size = dim_embed_ / num_head;

dropout = dropout_;
act_dropout = act_dropout_;
attn_dropout = attn_dropout_;

act_method = act_method_;
normalize_pre_or_post = normalize_pre_or_post_;

// init attn
fused_attn =
new FusedAttention<T>(batch_size, max_seq_len, dim_embed, num_head,
dropout, attn_dropout, normalize_pre_or_post);

// init ffn
fused_ffn =
new FusedFFN<T>(batch_size, max_seq_len, dim_embed, dim_feedforward_,
act_dropout, act_method, normalize_pre_or_post);
}

// destructor
template <typename T>
FusedTransformerEncoderLayer<T>::~FusedTransformerEncoderLayer() {
delete fused_attn;
delete fused_ffn;
}

// compute forward
template <typename T>
void FusedTransformerEncoderLayer<T>::ComputeForward(T* src, T* output) {
T* output_attn = nullptr;  // TODO: allocate the intermediate attention output buffer once the kernels are wired up

fused_attn->ComputeForward(src, output_attn);
fused_ffn->ComputeForward(output_attn, output);
}

// compute backward
template <typename T>
void FusedTransformerEncoderLayer<T>::ComputeBackward() {}

// constructor and init
template <typename T>
FusedAttention<T>::FusedAttention(int batch_size_, int max_seq_len_,
int dim_embed_, int num_head_, float dropout_,
float attn_dropout_,
bool normalize_pre_or_post_) {
// configurations
batch_size = batch_size_;
max_seq_len = max_seq_len_;
dim_embed = dim_embed_;
num_head = num_head_;
head_size = dim_embed_ / num_head;

dropout = dropout_;
attn_dropout = attn_dropout_;

normalize_pre_or_post = normalize_pre_or_post_;

// init fmha
// is_test/seed/seqlen/cu_seqlen are placeholders until they are plumbed through
fmha = new FusedMHA<T>(batch_size, max_seq_len, dim_embed, num_head, dropout,
                       /*is_test_=*/false, /*seed_=*/0, /*seqlen_=*/nullptr,
                       /*cu_seqlen_=*/nullptr);
}

// compute forward
template <typename T>
void FusedAttention<T>::ComputeForward(T* src, T* output) {}

template <typename T>
FusedAttention<T>::~FusedAttention() {
delete fmha;
}

// compute backward
template <typename T>
void FusedAttention<T>::ComputeBackward() {}

// constructor and init
template <typename T>
FusedFFN<T>::FusedFFN(int batch_size_, int max_seq_len_, int dim_embed_,
int dim_feedforward_, float act_dropout_,
std::string act_method_, bool normalize_pre_or_post_) {
batch_size = batch_size_;
max_seq_len = max_seq_len_;
dim_embed = dim_embed_;
dim_feedforward = dim_feedforward_;
act_dropout = act_dropout_;

act_method = act_method_;
normalize_pre_or_post = normalize_pre_or_post_;
}

template <typename T>
FusedFFN<T>::~FusedFFN() {}

// compute forward
template <typename T>
void FusedFFN<T>::ComputeForward(T* src, T* output) {}

// compute backward
template <typename T>
void FusedFFN<T>::ComputeBackward() {}

// init
template <typename T>
FusedMHA<T>::FusedMHA(int batch_size_, int max_seq_len_, int dim_embed_,
int num_head_, float dropout_, bool is_test_,
uint64_t seed_, uint64_t* seqlen_, uint64_t* cu_seqlen_) {
batch_size = batch_size_;
max_seq_len = max_seq_len_;
dim_embed = dim_embed_;
num_head = num_head_;
head_size = dim_embed_ / num_head;

dropout = dropout_;
is_test = is_test_;
seed = seed_;
seqlen = seqlen_;
cu_seqlen = cu_seqlen_;
}

// compute forward
template <typename T>
void FusedMHA<T>::ComputeForward(T* output, T* softmax_mask) {}

// compute backward
template <typename T>
void FusedMHA<T>::ComputeBackward(const T* grad_output, T* softmax_mask,
T* grad_x) {}
}  // namespace operators
}  // namespace paddle
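
For context, here is a minimal driver sketch for the new encoder layer. It assumes an explicit `template class FusedTransformerEncoderLayer<float>;` instantiation is added to the .cc above; the shapes, hyperparameters, and "gelu" choice are illustrative only, and the ComputeForward bodies are still stubs.

#include <vector>
#include "paddle/fluid/operators/fused/fused_transformer_op.h"

int main() {
  const int batch_size = 8, max_seq_len = 128;
  const int dim_embed = 512, dim_feedforward = 2048, num_head = 8;
  // Input and output are laid out as [batch_size, max_seq_len, dim_embed].
  std::vector<float> src(batch_size * max_seq_len * dim_embed, 0.f);
  std::vector<float> out(src.size(), 0.f);

  paddle::operators::FusedTransformerEncoderLayer<float> layer(
      batch_size, max_seq_len, dim_embed, dim_feedforward, num_head,
      /*dropout_=*/0.1f, /*act_dropout_=*/0.1f, /*attn_dropout_=*/0.1f,
      /*act_method_=*/"gelu", /*normalize_pre_or_post_=*/true);
  layer.ComputeForward(src.data(), out.data());  // attention, then FFN
  return 0;
}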
13 changes: 13 additions & 0 deletions paddle/fluid/operators/fused/fused_transformer_op.cu
@@ -0,0 +1,13 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
155 changes: 155 additions & 0 deletions paddle/fluid/operators/fused/fused_transformer_op.h
@@ -0,0 +1,155 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <cstdint>
#include <string>

namespace paddle {
namespace operators {

template <typename T>
class FusedMHA {
public:
FusedMHA(int, int, int, int, float, bool, uint64_t, uint64_t*, uint64_t*);
~FusedMHA();

void ComputeForward(T*, T*);
void ComputeBackward(const T*, T*, T*);

private:
int batch_size;
int max_seq_len;
int dim_embed;

int num_head;
int head_size;

float dropout;

bool is_test;
uint64_t seed;

uint64_t* seqlen;
uint64_t* cu_seqlen;
};
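
// Reviewer note (assumption, not PR code): for variable-length batches,
// cu_seqlen is conventionally the exclusive prefix sum of the per-sample
// lengths, e.g. lengths {3, 5, 2} give cu_seqlen {0, 3, 8, 10}, so sample i
// occupies rows [cu_seqlen[i], cu_seqlen[i+1]) of the packed buffer.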

template <typename T>
class FusedAttention {
public:
FusedAttention(int, int, int, int, float, float, bool);
~FusedAttention();

void ComputeForward(T*, T*);
void ComputeBackward();

private:
FusedMHA<T>* fmha; // fused multihead attention

int batch_size;
int max_seq_len;
int dim_embed;

int num_head;
int head_size;

float dropout;
float attn_dropout;

bool normalize_pre_or_post;

// weights and bias used in attention
T* fattn_qkv_w;
T* fattn_qkv_b;
T* fattn_o_w;
T* fattn_o_b;
T* fattn_n_w;
T* fattn_n_b;
T* fattn_norm_w;
T* fattn_norm_b;

T* fattn_grad_qkv_w;
T* fattn_grad_qkv_b;
T* fattn_grad_o_w;
T* fattn_grad_o_b;
T* fattn_grad_n_w;
T* fattn_grad_n_b;
T* fattn_grad_norm_w;
T* fattn_grad_norm_b;
};

template <typename T>
class FusedFFN {
public:
FusedFFN(int, int, int, int, float, std::string, bool);
~FusedFFN();

void ComputeForward(T*, T*);
void ComputeBackward();

private:
int batch_size;
int max_seq_len;
int dim_embed;
int dim_feedforward;

float attn_dropout;
float act_dropout;

bool normalize_pre_or_post;

std::string act_method;

// weights and bias used in ffn
T* fffn_inter_w;
T* fffn_inter_b;
T* fffn_output_w;
T* fffn_output_b;

T* fffn_grad_inter_w;
T* fffn_grad_inter_b;
T* fffn_grad_output_w;
T* fffn_grad_output_b;
};

template <typename T>
class FusedTransformerEncoderLayer {
public:
FusedTransformerEncoderLayer(int, int, int, int, int, float, float, float,
std::string, bool);
~FusedTransformerEncoderLayer();

void ComputeForward(T* src, T* output);
void ComputeBackward();

private:
FusedAttention<T>* fused_attn;
FusedFFN<T>* fused_ffn;

int batch_size;
int max_seq_len;
int dim_embed;
int dim_feedforward;

int num_head;
int head_size;

float dropout;
float attn_dropout;
float act_dropout;

bool normalize_pre_or_post;

std::string act_method;
};
}  // namespace operators
}  // namespace paddle
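
On the normalize_pre_or_post flag: it selects where layer normalization sits relative to each residual branch, and the same choice applies to both the attention and FFN sublayers. A standalone sketch of the two orderings, with the norm and sublayer passed as hypothetical callables rather than anything defined in this PR:

#include <functional>
#include <vector>

using Tensor = std::vector<float>;
using Fn = std::function<Tensor(const Tensor&)>;

// Elementwise residual add.
static Tensor Residual(const Tensor& a, const Tensor& b) {
  Tensor out(a.size());
  for (size_t i = 0; i < a.size(); ++i) out[i] = a[i] + b[i];
  return out;
}

// pre-norm:  y = x + f(norm(x))    post-norm: y = norm(x + f(x))
static Tensor ApplySubLayer(bool pre_norm, const Tensor& x, const Fn& norm,
                            const Fn& f) {
  return pre_norm ? Residual(x, f(norm(x))) : norm(Residual(x, f(x)));
}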