From f58cb01864151e27ff45d9fc99b61b72cce3295e Mon Sep 17 00:00:00 2001 From: Chengmo Date: Thu, 25 Mar 2021 17:37:09 +0800 Subject: [PATCH 01/15] =?UTF-8?q?=E3=80=90Paddle.Fleet=E3=80=91fix=20datas?= =?UTF-8?q?et=20zip=20py3=20bug=20(#31441)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix zip py3 bug --- .../fleet/data_generator/data_generator.py | 26 +++++++----- .../tests/unittests/test_data_generator.py | 40 +++++++++++++++++++ 2 files changed, 56 insertions(+), 10 deletions(-) diff --git a/python/paddle/distributed/fleet/data_generator/data_generator.py b/python/paddle/distributed/fleet/data_generator/data_generator.py index 669d2ea24a0c78..9d743fc38bf398 100644 --- a/python/paddle/distributed/fleet/data_generator/data_generator.py +++ b/python/paddle/distributed/fleet/data_generator/data_generator.py @@ -32,11 +32,11 @@ def set_batch(self, batch_size): ''' Set batch size of current DataGenerator This is necessary only if a user wants to define generator_batch - + Example: .. code-block:: python - + import paddle.distributed.fleet.data_generator as dg class MyData(dg.DataGenerator): @@ -52,7 +52,7 @@ def local_iter(): yield ("words", s[1].extend([s[1][0]])) mydata = MyData() mydata.set_batch(128) - + ''' self.batch_size_ = batch_size @@ -63,7 +63,7 @@ def run_from_memory(self): Example: .. code-block:: python - + import paddle.distributed.fleet.data_generator as dg class MyData(dg.DataGenerator): @@ -100,9 +100,9 @@ def run_from_stdin(self): generated. Example: - + .. code-block:: python - + import paddle.distributed.fleet.data_generator as dg class MyData(dg.DataGenerator): @@ -161,7 +161,7 @@ def generate_sample(self, line): The data format is list or tuple: [(name, [feasign, ...]), ...] or ((name, [feasign, ...]), ...) - + For example: [("words", [1926, 08, 17]), ("label", [1])] or (("words", [1926, 08, 17]), ("label", [1])) @@ -174,7 +174,7 @@ def generate_sample(self, line): Example: .. 
code-block:: python - + import paddle.distributed.fleet.data_generator as dg class MyData(dg.DataGenerator): @@ -206,7 +206,7 @@ def generate_batch(self, samples): Example: .. code-block:: python - + import paddle.distributed.fleet.data_generator as dg class MyData(dg.DataGenerator): @@ -259,6 +259,9 @@ def _gen_str(self, line): Returns: Return a string data that can be read directly by the MultiSlotDataFeed. ''' + if sys.version > '3' and isinstance(line, zip): + line = list(line) + if not isinstance(line, list) and not isinstance(line, tuple): raise ValueError( "the output of process() must be in list or tuple type" @@ -289,7 +292,7 @@ def _gen_str(self, line): >>> [ids_num id1 id2 ...] ... The proto_info will be in this format: >>> [(name, type), ...] - + For example, if the input is like this: >>> [("words", [1926, 08, 17]), ("label", [1])] >>> or (("words", [1926, 08, 17]), ("label", [1])) @@ -304,6 +307,9 @@ def _gen_str(self, line): Returns: Return a string data that can be read directly by the MultiSlotDataFeed. 
''' + if sys.version > '3' and isinstance(line, zip): + line = list(line) + if not isinstance(line, list) and not isinstance(line, tuple): raise ValueError( "the output of process() must be in list or tuple type" diff --git a/python/paddle/fluid/tests/unittests/test_data_generator.py b/python/paddle/fluid/tests/unittests/test_data_generator.py index 6381cb36402636..69d8e01fd464af 100644 --- a/python/paddle/fluid/tests/unittests/test_data_generator.py +++ b/python/paddle/fluid/tests/unittests/test_data_generator.py @@ -95,6 +95,32 @@ def data_iter(): return data_iter +class MyMultiSlotStringDataGenerator_zip(fleet.MultiSlotStringDataGenerator): + def generate_sample(self, line): + def data_iter(): + for i in range(40): + if i == 1: + yield None + feature_name = ["words", "label"] + data = [["1", "2", "3", "4"], ["0"]] + yield zip(feature_name, data) + + return data_iter + + +class MyMultiSlotDataGenerator_zip(fleet.MultiSlotDataGenerator): + def generate_sample(self, line): + def data_iter(): + for i in range(40): + if i == 1: + yield None + feature_name = ["words", "label"] + data = [[1, 2, 3, 4], [0]] + yield zip(feature_name, data) + + return data_iter + + class TestMultiSlotDataGenerator(unittest.TestCase): def test_MultiSlotDataGenerator_basic(self): my_ms_dg = MyMultiSlotDataGenerator() @@ -149,5 +175,19 @@ def test_MultiSlotDataGenerator_error(self): my_ms_dg.run_from_memory() +class TestMultiSlotStringDataGeneratorZip(unittest.TestCase): + def test_MultiSlotStringDataGenerator_zip(self): + my_ms_dg = MyMultiSlotStringDataGenerator_zip() + my_ms_dg.set_batch(1) + my_ms_dg.run_from_memory() + + +class TestMultiSlotDataGeneratorZip(unittest.TestCase): + def test_MultiSlotDataGenerator_zip(self): + my_ms_dg = MyMultiSlotDataGenerator_zip() + my_ms_dg.set_batch(1) + my_ms_dg.run_from_memory() + + if __name__ == '__main__': unittest.main() From e804f08559d96a87b8c7eb50120eef68402e4313 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Fri, 26 
Mar 2021 13:43:48 +0800 Subject: [PATCH 02/15] delete include framework.pb.h (#31859) * delete include framework.pb.h * fix error --- paddle/fluid/framework/custom_operator.cc | 1 - paddle/fluid/framework/executor_gc_helper.cc | 1 - paddle/fluid/framework/ir/graph_pattern_detector.h | 1 - paddle/fluid/framework/ir/layer_norm_fuse_pass.cc | 1 - paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc | 1 - paddle/fluid/framework/op_info.h | 1 - paddle/fluid/framework/op_proto_maker.h | 1 - paddle/fluid/framework/op_version_registry.h | 1 - paddle/fluid/framework/operator.h | 1 - paddle/fluid/framework/program_desc.h | 1 - paddle/fluid/framework/reader.h | 1 - paddle/fluid/framework/tensor_util.h | 1 - paddle/fluid/framework/var_type.h | 1 - paddle/fluid/framework/var_type_traits.h | 1 - paddle/fluid/framework/variable_helper.h | 1 - paddle/fluid/imperative/gradient_accumulator.cc | 1 - paddle/fluid/inference/analysis/analysis_pass.h | 1 - paddle/fluid/inference/analysis/helper.cc | 1 - paddle/fluid/inference/analysis/helper.h | 1 - paddle/fluid/inference/engine.h | 1 - paddle/fluid/operators/cast_op.h | 1 - paddle/fluid/operators/distributed_ops/recv_save_op.cc | 1 - .../operators/fused/fused_embedding_eltwise_layernorm_op.cu | 1 - paddle/fluid/operators/inplace_abn_op.cc | 1 - paddle/fluid/operators/one_hot_op.cc | 1 - paddle/fluid/operators/one_hot_op_xpu.cc | 1 - paddle/fluid/operators/one_hot_v2_op.cc | 1 - paddle/fluid/operators/one_hot_v2_op_xpu.cc | 1 - paddle/fluid/operators/reader/create_py_reader_op.cc | 1 - paddle/fluid/operators/reader/read_op.cc | 1 - paddle/fluid/operators/save_combine_op.h | 1 - paddle/fluid/operators/save_op.h | 1 - paddle/fluid/pybind/pybind.cc | 1 - 33 files changed, 33 deletions(-) diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 1ebb8998c854eb..97d58df6dc5738 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -28,7 +28,6 
@@ limitations under the License. */ #include "paddle/fluid/extension/include/ext_tensor.h" #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/custom_tensor_utils.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/op_meta_info_helper.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc index c8bc735790400b..c06a3d4a183799 100644 --- a/paddle/fluid/framework/executor_gc_helper.cc +++ b/paddle/fluid/framework/executor_gc_helper.cc @@ -18,7 +18,6 @@ #include "glog/logging.h" #include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 2e518c1d4df72a..b6c1074d90dd2a 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -28,7 +28,6 @@ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/inference/analysis/dot.h" diff --git a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc index 69edc3d87f97d6..18d2e9817ebec8 100644 --- a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc @@ -14,7 +14,6 @@ #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/layer_norm_fuse_pass.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git 
a/paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc index 5fd47b21733b54..5fe71fbc21451f 100644 --- a/paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc @@ -17,7 +17,6 @@ #include #include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/layer_norm_fuse_pass.h" #include "paddle/fluid/framework/ir/pass_test_util.h" #include "paddle/fluid/framework/naive_executor.h" diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h index af657232e91a68..ddd84bfd81abf5 100644 --- a/paddle/fluid/framework/op_info.h +++ b/paddle/fluid/framework/op_info.h @@ -20,7 +20,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/attribute.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 912e82f60ef5df..506c3eb1e0ad09 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -16,7 +16,6 @@ limitations under the License. */ #include #include "glog/logging.h" #include "paddle/fluid/framework/attribute.h" -#include "paddle/fluid/framework/framework.pb.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/op_version_registry.h b/paddle/fluid/framework/op_version_registry.h index b9ec5507612099..5ae8f255d63be5 100644 --- a/paddle/fluid/framework/op_version_registry.h +++ b/paddle/fluid/framework/op_version_registry.h @@ -20,7 +20,6 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/op_version_proto.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index e9ecf9b5a83978..bf27a8e37e0b32 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -27,7 +27,6 @@ limitations under the License. */ #include "glog/logging.h" // For VLOG #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_kernel_type.h" diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h index cfef80b8d37778..4ceb0c5c824814 100644 --- a/paddle/fluid/framework/program_desc.h +++ b/paddle/fluid/framework/program_desc.h @@ -20,7 +20,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/proto_desc.h" #include "paddle/fluid/platform/macros.h" diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h index a4207deb7e8113..e7c23eab1fa5fc 100644 --- a/paddle/fluid/framework/reader.h +++ b/paddle/fluid/framework/reader.h @@ -20,7 +20,6 @@ #include #include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 8a127e0ed59295..fd0f98784ceb0a 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -18,7 +18,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/dlpack_tensor.h" #include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h index 8affeda67b3d07..2e35f9b845ac73 100644 --- a/paddle/fluid/framework/var_type.h +++ b/paddle/fluid/framework/var_type.h @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index b0d8f43a90f35f..fc754cbaf177c9 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -21,7 +21,6 @@ #include #include "paddle/fluid/framework/feed_fetch_type.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/variable_helper.h b/paddle/fluid/framework/variable_helper.h index 6e65bc2c932877..4cdfba29249ccf 100644 --- a/paddle/fluid/framework/variable_helper.h +++ b/paddle/fluid/framework/variable_helper.h @@ -15,7 +15,6 @@ limitations under the License. 
*/ #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/variable.h" namespace paddle { diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index deb504a1b657e4..b9df88b1f1eeaa 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -18,7 +18,6 @@ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/imperative/layer.h" diff --git a/paddle/fluid/inference/analysis/analysis_pass.h b/paddle/fluid/inference/analysis/analysis_pass.h index d5a972fab3beae..14a1c3eea34174 100644 --- a/paddle/fluid/inference/analysis/analysis_pass.h +++ b/paddle/fluid/inference/analysis/analysis_pass.h @@ -18,7 +18,6 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/inference/analysis/argument.h" #include "paddle/fluid/inference/analysis/helper.h" diff --git a/paddle/fluid/inference/analysis/helper.cc b/paddle/fluid/inference/analysis/helper.cc index 368ef2e5583fe2..ede0402f816765 100644 --- a/paddle/fluid/inference/analysis/helper.cc +++ b/paddle/fluid/inference/analysis/helper.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/helper.h" -#include "paddle/fluid/framework/framework.pb.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index ab4949935140c6..cace420d87c9df 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -25,7 +25,6 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/inference/engine.h b/paddle/fluid/inference/engine.h index 1a13ba510384c0..e29162cf5b23ba 100644 --- a/paddle/fluid/inference/engine.h +++ b/paddle/fluid/inference/engine.h @@ -15,7 +15,6 @@ limitations under the License. */ #pragma once #include -#include "paddle/fluid/framework/framework.pb.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/operators/cast_op.h b/paddle/fluid/operators/cast_op.h index 8fa0416049f8fa..cd60c7707cb0aa 100644 --- a/paddle/fluid/operators/cast_op.h +++ b/paddle/fluid/operators/cast_op.h @@ -15,7 +15,6 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/transform.h" diff --git a/paddle/fluid/operators/distributed_ops/recv_save_op.cc b/paddle/fluid/operators/distributed_ops/recv_save_op.cc index d194fcda36a474..d6da818e1df51d 100644 --- a/paddle/fluid/operators/distributed_ops/recv_save_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_save_op.cc @@ -20,7 +20,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/selected_rows.h" diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu index 9711cc8d811d5d..14a6608836a8a7 100644 --- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu @@ -14,7 +14,6 @@ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math/bert_encoder_functor.h" diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index 652c071be6b33f..8234d63d681ff0 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -16,7 +16,6 @@ #include #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/operators/batch_norm_op.h" namespace paddle { diff --git a/paddle/fluid/operators/one_hot_op.cc b/paddle/fluid/operators/one_hot_op.cc index 9c321832f8489a..64323e588c6285 100644 --- a/paddle/fluid/operators/one_hot_op.cc +++ b/paddle/fluid/operators/one_hot_op.cc @@ -15,7 +15,6 @@ #include "paddle/fluid/operators/one_hot_op.h" #include #include -#include "paddle/fluid/framework/framework.pb.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/one_hot_op_xpu.cc b/paddle/fluid/operators/one_hot_op_xpu.cc index 14ecd11d114d0b..3e214aa8bf8221 100644 --- a/paddle/fluid/operators/one_hot_op_xpu.cc +++ b/paddle/fluid/operators/one_hot_op_xpu.cc @@ -16,7 +16,6 @@ #include #include -#include "paddle/fluid/framework/framework.pb.h" 
#include "paddle/fluid/operators/one_hot_op.h" namespace paddle { diff --git a/paddle/fluid/operators/one_hot_v2_op.cc b/paddle/fluid/operators/one_hot_v2_op.cc index 29fe6f10c72f43..c42db1e6f449c0 100644 --- a/paddle/fluid/operators/one_hot_v2_op.cc +++ b/paddle/fluid/operators/one_hot_v2_op.cc @@ -15,7 +15,6 @@ #include "paddle/fluid/operators/one_hot_v2_op.h" #include #include -#include "paddle/fluid/framework/framework.pb.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/one_hot_v2_op_xpu.cc b/paddle/fluid/operators/one_hot_v2_op_xpu.cc index 6fec597db17290..e24be3bead6889 100644 --- a/paddle/fluid/operators/one_hot_v2_op_xpu.cc +++ b/paddle/fluid/operators/one_hot_v2_op_xpu.cc @@ -16,7 +16,6 @@ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/operators/one_hot_op.h" namespace paddle { diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc index c04bdb2f10930e..a7d177f326e511 100644 --- a/paddle/fluid/operators/reader/create_py_reader_op.cc +++ b/paddle/fluid/operators/reader/create_py_reader_op.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/operators/reader/py_reader.h" #include "paddle/fluid/operators/reader/reader_op_registry.h" diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc index 9086291e17db89..38894495b4ca05 100644 --- a/paddle/fluid/operators/reader/read_op.cc +++ b/paddle/fluid/operators/reader/read_op.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/platform/profiler.h" diff --git a/paddle/fluid/operators/save_combine_op.h b/paddle/fluid/operators/save_combine_op.h index 0246c42d433255..939768693a2431 100644 --- a/paddle/fluid/operators/save_combine_op.h +++ b/paddle/fluid/operators/save_combine_op.h @@ -22,7 +22,6 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/operators/save_op.h b/paddle/fluid/operators/save_op.h index fbde722a425bc3..e44a5c77bd8410 100644 --- a/paddle/fluid/operators/save_op.h +++ b/paddle/fluid/operators/save_op.h @@ -19,7 +19,6 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/selected_rows.h" diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index c8ca3bf2c8fa27..e1ff69e7485eb8 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -29,7 +29,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_type.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/io/fs.h" #include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h" From 70b67f1029a8ddfa68cf2a6f0d5631b95ff591bd Mon Sep 17 00:00:00 2001 From: Wilber Date: Fri, 26 Mar 2021 13:45:31 +0800 Subject: [PATCH 03/15] fix go api bug. (#31857) --- go/README_cn.md | 1 + go/demo/mobilenet.go | 2 +- go/paddle/common.go | 2 +- go/paddle/config.go | 2 +- go/paddle/predictor.go | 4 ++-- go/paddle/tensor.go | 4 ++-- paddle/fluid/inference/capi/pd_predictor.cc | 9 ++++++--- 7 files changed, 14 insertions(+), 10 deletions(-) diff --git a/go/README_cn.md b/go/README_cn.md index a184ecbb8dea1a..040540e939bc3a 100644 --- a/go/README_cn.md +++ b/go/README_cn.md @@ -50,6 +50,7 @@ output_data := value.Interface().([][]float32) 运行 ```bash +go mod init github.com/paddlepaddle export LD_LIBRARY_PATH=`pwd`/paddle_c/paddle/lib:$LD_LIBRARY_PATH go run ./demo/mobilenet.go ``` diff --git a/go/demo/mobilenet.go b/go/demo/mobilenet.go index 1b42fe8049a584..c1ca2e967f72dc 100644 --- a/go/demo/mobilenet.go +++ b/go/demo/mobilenet.go @@ -13,7 +13,7 @@ // limitations under the License. 
package main -import "../paddle" +import "github.com/paddlepaddle/paddle" import "strings" import "io/ioutil" import "strconv" diff --git a/go/paddle/common.go b/go/paddle/common.go index 4bf94765931282..cbbde6a45f59b8 100644 --- a/go/paddle/common.go +++ b/go/paddle/common.go @@ -15,7 +15,7 @@ package paddle // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include -// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c +// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c // #include // #include import "C" diff --git a/go/paddle/config.go b/go/paddle/config.go index 89f7d7e63ff2a8..68a31230997bed 100644 --- a/go/paddle/config.go +++ b/go/paddle/config.go @@ -15,7 +15,7 @@ package paddle // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include -// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c +// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c // #include // #include // #include diff --git a/go/paddle/predictor.go b/go/paddle/predictor.go index 59bad908e6a508..5f2b2c81a60549 100644 --- a/go/paddle/predictor.go +++ b/go/paddle/predictor.go @@ -15,7 +15,7 @@ package paddle // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include -// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c +// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c // #include // #include "paddle_c_api.h" import "C" @@ -88,7 +88,7 @@ func (predictor *Predictor) GetInputNames() []string { } func (predictor *Predictor) GetOutputNames() []string { - names := make([]string, predictor.GetInputNum()) + names := make([]string, predictor.GetOutputNum()) for i := 0; i < len(names); i++ { names[i] = predictor.GetOutputName(i) } diff --git a/go/paddle/tensor.go b/go/paddle/tensor.go index e6e2c53fef1af5..6fbcf039f88a7c 100644 --- a/go/paddle/tensor.go +++ b/go/paddle/tensor.go @@ -15,7 +15,7 @@ package paddle // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include -// #cgo LDFLAGS: 
-L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c +// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c // #include // #include // #include @@ -209,7 +209,7 @@ func DecodeTensor(r *bytes.Reader, shape []int32, t reflect.Type, ptr reflect.Va value := reflect.Indirect(ptr) value.Set(reflect.MakeSlice(t, int(shape[0]), int(shape[0]))) if len(shape) == 1 && value.Len() > 0 { - switch value.Index(1).Kind() { + switch value.Index(0).Kind() { case reflect.Uint8, reflect.Int32, reflect.Int64, reflect.Float32: binary.Read(r, Endian(), value.Interface()) return diff --git a/paddle/fluid/inference/capi/pd_predictor.cc b/paddle/fluid/inference/capi/pd_predictor.cc index c1bf4c974fac8c..c4e195b6ec8fab 100644 --- a/paddle/fluid/inference/capi/pd_predictor.cc +++ b/paddle/fluid/inference/capi/pd_predictor.cc @@ -207,13 +207,16 @@ int PD_GetOutputNum(const PD_Predictor* predictor) { } const char* PD_GetInputName(const PD_Predictor* predictor, int n) { - static std::vector names = predictor->predictor->GetInputNames(); + static std::vector names; + names.resize(predictor->predictor->GetInputNames().size()); + names[n] = predictor->predictor->GetInputNames()[n]; return names[n].c_str(); } const char* PD_GetOutputName(const PD_Predictor* predictor, int n) { - static std::vector names = - predictor->predictor->GetOutputNames(); + static std::vector names; + names.resize(predictor->predictor->GetOutputNames().size()); + names[n] = predictor->predictor->GetOutputNames()[n]; return names[n].c_str(); } From 01aa252624a639552116a0c46188ca7f5c43a1ee Mon Sep 17 00:00:00 2001 From: zlsh80826 Date: Fri, 26 Mar 2021 15:58:50 +0800 Subject: [PATCH 04/15] [Paddle-TRT] multiclass nms (#31742) * add multiclass_nms * add multiclass_nms unittest * add default enable_tensorrt_oss option * refine multiclas nms unittest and add serialization/dynamic test * change super to InferencePassTest for python2 compatibility * refine multiclass nms unittest * move out dynamic shape test due 
to ci timelimit --- .../fluid/inference/api/analysis_predictor.cc | 2 +- .../inference/tensorrt/convert/CMakeLists.txt | 2 +- .../tensorrt/convert/multiclass_nms_op.cc | 133 ++++++++++++++++ paddle/fluid/inference/tensorrt/op_teller.cc | 34 ++++- .../ir/inference/inference_pass_test.py | 3 + .../inference/test_trt_multiclass_nms_op.py | 144 ++++++++++++++++++ 6 files changed, 315 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 8f2b217a2fde0a..0007582e2c73d2 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1192,7 +1192,7 @@ USE_TRT_CONVERTER(scale); USE_TRT_CONVERTER(stack); USE_TRT_CONVERTER(clip); USE_TRT_CONVERTER(gather); - +USE_TRT_CONVERTER(multiclass_nms); USE_TRT_CONVERTER(nearest_interp); #endif diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index b0d0229ec0531f..be7fa0548d9f34 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -6,7 +6,7 @@ nv_library(tensorrt_converter shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc clip_op.cc gather_op.cc - + multiclass_nms_op.cc nearest_interp_op.cc DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) diff --git a/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc b/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc new file mode 100644 index 00000000000000..b0d67a5bf90ca9 --- /dev/null +++ 
b/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc @@ -0,0 +1,133 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class MultiClassNMSOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid multiclassNMS op to tensorrt plugin"; + + // for now, only work for static shape and regular tensor + framework::OpDesc op_desc(op, nullptr); + + std::string bboxes = op_desc.Input("BBoxes").front(); + std::string scores = op_desc.Input("Scores").front(); + std::string output_name = op_desc.Output("Out").front(); + + auto* bboxes_tensor = engine_->GetITensor(bboxes); + auto* scores_tensor = engine_->GetITensor(scores); + + int background_label = + BOOST_GET_CONST(int, op_desc.GetAttr("background_label")); + float score_threshold = + BOOST_GET_CONST(float, op_desc.GetAttr("score_threshold")); + int nms_top_k = BOOST_GET_CONST(int, op_desc.GetAttr("nms_top_k")); + float nms_threshold = + BOOST_GET_CONST(float, op_desc.GetAttr("nms_threshold")); + int keep_top_k = BOOST_GET_CONST(int, 
op_desc.GetAttr("keep_top_k")); + bool normalized = BOOST_GET_CONST(bool, op_desc.GetAttr("normalized")); + int num_classes = scores_tensor->getDimensions().d[0]; + + auto bboxes_dims = bboxes_tensor->getDimensions(); + nvinfer1::Dims3 bboxes_expand_dims(bboxes_dims.d[0], 1, bboxes_dims.d[1]); + auto* bboxes_expand_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *bboxes_tensor); + bboxes_expand_layer->setReshapeDimensions(bboxes_expand_dims); + + nvinfer1::Permutation permutation{1, 0}; + auto* scores_transpose_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *scores_tensor); + scores_transpose_layer->setFirstTranspose(permutation); + + std::vector batch_nms_inputs; + batch_nms_inputs.push_back(bboxes_expand_layer->getOutput(0)); + batch_nms_inputs.push_back(scores_transpose_layer->getOutput(0)); + + constexpr bool shareLocation = true; + constexpr bool clip_boxes = false; + + const std::vector fields{ + {"shareLocation", &shareLocation, nvinfer1::PluginFieldType::kINT32, 1}, + {"backgroundLabelId", &background_label, + nvinfer1::PluginFieldType::kINT32, 1}, + {"numClasses", &num_classes, nvinfer1::PluginFieldType::kINT32, 1}, + {"topK", &nms_top_k, nvinfer1::PluginFieldType::kINT32, 1}, + {"keepTopK", &keep_top_k, nvinfer1::PluginFieldType::kINT32, 1}, + {"scoreThreshold", &score_threshold, + nvinfer1::PluginFieldType::kFLOAT32, 1}, + {"iouThreshold", &nms_threshold, nvinfer1::PluginFieldType::kFLOAT32, + 1}, + {"isNormalized", &normalized, nvinfer1::PluginFieldType::kINT32, 1}, + {"clipBoxes", &clip_boxes, nvinfer1::PluginFieldType::kINT32, 1}, + }; + + nvinfer1::PluginFieldCollection* plugin_collections = + static_cast( + malloc(sizeof(*plugin_collections) + + fields.size() * sizeof(nvinfer1::PluginField))); + plugin_collections->nbFields = static_cast(fields.size()); + plugin_collections->fields = fields.data(); + + auto creator = GetPluginRegistry()->getPluginCreator("BatchedNMS_TRT", "1"); + auto batch_nms_plugin = + creator->createPlugin("BatchNMSPlugin", 
plugin_collections); + free(plugin_collections); + + auto batch_nms_layer = engine_->network()->addPluginV2( + batch_nms_inputs.data(), batch_nms_inputs.size(), *batch_nms_plugin); + auto nmsed_boxes = batch_nms_layer->getOutput(1); + auto nmsed_scores = batch_nms_layer->getOutput(2); + auto nmsed_classes = batch_nms_layer->getOutput(3); + + auto nmsed_scores_transpose_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *nmsed_scores); + nmsed_scores_transpose_layer->setReshapeDimensions( + nvinfer1::Dims2(keep_top_k, 1)); + auto nmsed_classes_reshape_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *nmsed_classes); + nmsed_classes_reshape_layer->setReshapeDimensions( + nvinfer1::Dims2(keep_top_k, 1)); + + std::vector concat_inputs; + concat_inputs.push_back(nmsed_classes_reshape_layer->getOutput(0)); + concat_inputs.push_back(nmsed_scores_transpose_layer->getOutput(0)); + concat_inputs.push_back(nmsed_boxes); + + auto nms_concat_layer = TRT_ENGINE_ADD_LAYER( + engine_, Concatenation, concat_inputs.data(), concat_inputs.size()); + nms_concat_layer->setAxis(1); + + RreplenishLayerAndOutput(nms_concat_layer, "multiclass_nms", {output_name}, + test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(multiclass_nms, MultiClassNMSOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 11752d71a45e1b..82f58254fe8e0d 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -111,7 +111,7 @@ struct SimpleOpTypeSetTeller : public Teller { "flatten2", "flatten", "gather", - + "multiclass_nms", "nearest_interp", }; }; @@ -195,6 +195,38 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, // current not support axis from input, use default 0 if (!with_dynamic_shape || desc.Input("Axis").size() > 0) return false; } + + if (op_type == "multiclass_nms") { + if 
(with_dynamic_shape) return false; + auto* block = desc.Block(); + for (auto& param_name : desc.Inputs()) { + for (auto& var_name : param_name.second) { + auto* var_desc = block->FindVar(var_name); + const auto shape = var_desc->GetShape(); + if (shape.size() != 3) { + VLOG(1) << "multiclass_nms op dims != 3 not supported in tensorrt, " + "but got dims " + << shape.size() << ", so jump it."; + return false; + } + } + } + bool has_attrs = + (desc.HasAttr("background_label") && + desc.HasAttr("score_threshold") && desc.HasAttr("nms_top_k") && + desc.HasAttr("keep_top_k") && desc.HasAttr("normalized")); + if (has_attrs == false) return false; + + auto nms_top_k = BOOST_GET_CONST(int, desc.GetAttr("nms_top_k")); + if (nms_top_k < 0) return false; + + auto keep_top_k = BOOST_GET_CONST(int, desc.GetAttr("keep_top_k")); + if (keep_top_k < 0) return false; + + auto registry = GetPluginRegistry(); + if (registry == nullptr) return false; + } + if (op_type == "fc" || op_type == "mul") { const int x_num_col_dims = desc.HasAttr("x_num_col_dims") diff --git a/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py b/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py index 993493a3ccf2b6..010086bfbbc47f 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py @@ -46,6 +46,7 @@ def __init__(self, methodName='runTest'): self.enable_mkldnn = False self.enable_mkldnn_bfloat16 = False self.enable_trt = False + self.enable_tensorrt_oss = True self.trt_parameters = None self.dynamic_shape_params = None self.enable_lite = False @@ -133,6 +134,8 @@ def _get_analysis_config(self, self.dynamic_shape_params.max_input_shape, self.dynamic_shape_params.optim_input_shape, self.dynamic_shape_params.disable_trt_plugin_fp16) + if self.enable_tensorrt_oss: + config.enable_tensorrt_oss() elif use_mkldnn: config.enable_mkldnn() diff --git 
a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py new file mode 100644 index 00000000000000..3ca6985985985e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py @@ -0,0 +1,144 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import itertools +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker +from paddle.fluid.core import AnalysisConfig + + +class TensorRTMultiClassNMSTest(InferencePassTest): + def setUp(self): + self.enable_trt = True + self.enable_tensorrt_oss = True + self.precision = AnalysisConfig.Precision.Float32 + self.serialize = False + self.bs = 1 + self.background_label = -1 + self.score_threshold = .5 + self.nms_top_k = 8 + self.nms_threshold = .3 + self.keep_top_k = 8 + self.normalized = False + self.num_classes = 8 + self.num_boxes = 8 + self.trt_parameters = InferencePassTest.TensorRTParam( + 1 << 30, self.bs, 2, self.precision, self.serialize, False) + + def build(self): + with fluid.program_guard(self.main_program, self.startup_program): + boxes = fluid.data( + name='bboxes', shape=[-1, self.num_boxes, 4], dtype='float32') + scores = fluid.data( + name='scores', + shape=[-1, 
self.num_classes, self.num_boxes], + dtype='float32') + multiclass_nms_out = fluid.layers.multiclass_nms( + bboxes=boxes, + scores=scores, + background_label=self.background_label, + score_threshold=self.score_threshold, + nms_top_k=self.nms_top_k, + nms_threshold=self.nms_threshold, + keep_top_k=self.keep_top_k, + normalized=self.normalized) + mutliclass_nms_out = multiclass_nms_out + 1. + multiclass_nms_out = fluid.layers.reshape( + multiclass_nms_out, [self.bs, 1, self.keep_top_k, 6], + name='reshape') + out = fluid.layers.batch_norm(multiclass_nms_out, is_test=True) + + boxes_data = np.arange(self.num_boxes * 4).reshape( + [self.bs, self.num_boxes, 4]).astype('float32') + scores_data = np.arange(1 * self.num_classes * self.num_boxes).reshape( + [self.bs, self.num_classes, self.num_boxes]).astype('float32') + self.feeds = { + 'bboxes': boxes_data, + 'scores': scores_data, + } + self.fetch_list = [out] + + def run_test(self): + self.build() + self.check_output() + + def run_test_all(self): + precision_opt = [ + AnalysisConfig.Precision.Float32, AnalysisConfig.Precision.Half + ] + serialize_opt = [False, True] + max_shape = { + 'bboxes': [self.bs, self.num_boxes, 4], + 'scores': [self.bs, self.num_classes, self.num_boxes], + } + opt_shape = max_shape + dynamic_shape_opt = [ + None, InferencePassTest.DynamicShapeParam({ + 'bboxes': [1, 1, 4], + 'scores': [1, 1, 1] + }, max_shape, opt_shape, False) + ] + for precision, serialize, dynamic_shape in itertools.product( + precision_opt, serialize_opt, dynamic_shape_opt): + self.precision = precision + self.serialize = serialize + self.dynamic_shape_params = dynamic_shape + self.build() + self.check_output() + + def check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + def test_base(self): + self.run_test() + + def test_fp16(self): + self.precision = AnalysisConfig.Precision.Half + 
self.run_test() + + def test_serialize(self): + self.serialize = True + self.run_test() + + def test_dynamic(self): + max_shape = { + 'bboxes': [self.bs, self.num_boxes, 4], + 'scores': [self.bs, self.num_classes, self.num_boxes], + } + opt_shape = max_shape + self.dynamic_shape_params = InferencePassTest.DynamicShapeParam({ + 'bboxes': [1, 1, 4], + 'scores': [1, 1, 1] + }, max_shape, opt_shape, False) + self.run_test() + + def test_background(self): + self.background = 7 + self.run_test() + + def test_disable_oss(self): + self.diable_tensorrt_oss = False + self.run_test() + + +if __name__ == "__main__": + unittest.main() From c3974d0e2a6353f3a134e8925aeb15cac7f0e48b Mon Sep 17 00:00:00 2001 From: lilong12 Date: Fri, 26 Mar 2021 18:11:51 +0800 Subject: [PATCH 05/15] [3D-parallel] Reformat pipeline parallel (#31786) * update, test=develop --- paddle/fluid/framework/section_worker.cc | 20 +- .../fleet/meta_optimizers/common.py | 41 +- .../meta_optimizers/pipeline_optimizer.py | 308 +++--- .../contrib/mixed_precision/fp16_utils.py | 10 +- python/paddle/fluid/device_worker.py | 2 +- python/paddle/fluid/executor.py | 23 +- python/paddle/fluid/optimizer.py | 954 +++++++++++------- .../fluid/tests/unittests/pipeline_mnist.py | 27 +- 8 files changed, 816 insertions(+), 569 deletions(-) diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index 90a371e474756e..e740771e5ca9fc 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -39,13 +39,13 @@ void SectionWorker::RunForward( int op_role = op->Attr(std::string("op_role")); // We run op with op_role = kLRSched only for the first microbatch // to avoid increasing the @LR_DECAY_STEP@ multiple times. 
- bool run_first_mbatch = op_role == static_cast(OpRole::kForward) || - op_role == (static_cast(OpRole::kForward) | - static_cast(OpRole::kLoss)) || - op_role == static_cast(OpRole::kLRSched); - bool run_others = op_role == static_cast(OpRole::kForward) || - op_role == (static_cast(OpRole::kForward) | - static_cast(OpRole::kLoss)); + bool run_first_mbatch = (op_role == static_cast(OpRole::kForward)) || + (op_role == (static_cast(OpRole::kForward) | + static_cast(OpRole::kLoss))) || + (op_role == static_cast(OpRole::kLRSched)); + bool run_others = (op_role == static_cast(OpRole::kForward)) || + (op_role == (static_cast(OpRole::kForward) | + static_cast(OpRole::kLoss))); if ((micro_id == 0 && run_first_mbatch) || (micro_id != 0 && run_others)) { VLOG(3) << "Forward: running op " << op->Type() << " for micro-batch " << micro_id; @@ -64,9 +64,9 @@ void SectionWorker::RunBackward( &unused_vars_) { for (auto &op : ops_) { int op_role = op->Attr(std::string("op_role")); - if (op_role == static_cast(OpRole::kBackward) || - op_role == (static_cast(OpRole::kBackward) | - static_cast(OpRole::kLoss))) { + if ((op_role == static_cast(OpRole::kBackward)) || + (op_role == (static_cast(OpRole::kBackward) | + static_cast(OpRole::kLoss)))) { VLOG(3) << "Backward: running op " << op->Type() << " for micro-batch " << micro_id; op->Run(*microbatch_scopes_[micro_id], place_); diff --git a/python/paddle/distributed/fleet/meta_optimizers/common.py b/python/paddle/distributed/fleet/meta_optimizers/common.py index 00d58cbd997fb4..c3d27bcc4ea551 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/common.py +++ b/python/paddle/distributed/fleet/meta_optimizers/common.py @@ -47,7 +47,7 @@ def is_optimizer_op(op): class CollectiveHelper(object): - def __init__(self, role_maker, nrings=1, wait_port='6174'): + def __init__(self, role_maker, nrings=1, wait_port=True): self.nrings = nrings self.wait_port = wait_port self.role_maker = role_maker @@ -65,14 +65,48 @@ def 
update_startup_program(self, startup_program=None): self.role_maker._worker_index(), ring_id, self.wait_port) self._broadcast_params() - def _init_communicator(self, program, current_endpoint, endpoints, rank, - ring_id, wait_port): + def _init_communicator(self, + program, + current_endpoint, + endpoints, + rank, + ring_id, + wait_port, + global_ring_id=None, + sync=True): nranks = len(endpoints) other_endpoints = endpoints[:] other_endpoints.remove(current_endpoint) if rank == 0 and wait_port: wait_server_ready(other_endpoints) + def _add_sync_by_allreduce(block): + sync_var = block.create_var( + name=unique_name.generate('sync_var'), + dtype=core.VarDesc.VarType.INT32, + persistable=False, + stop_gradient=True) + block.append_op( + type='fill_constant', + inputs={}, + outputs={'Out': [sync_var]}, + attrs={ + 'shape': [1], + 'dtype': sync_var.dtype, + 'value': 1, + 'force_cpu': False, + OP_ROLE_KEY: OpRole.Forward + }) + block.append_op( + type='c_allreduce_sum', + inputs={'X': [sync_var]}, + outputs={'Out': [sync_var]}, + attrs={ + 'ring_id': global_ring_id, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Forward + }) + block = program.global_block() if core.is_compiled_with_cuda(): comm_id_var = block.create_var( @@ -128,6 +162,7 @@ def _init_communicator(self, program, current_endpoint, endpoints, rank, raise ValueError( "comm_id must be generated in paddlepaddle-xpu or paddlepaddle-xpu." 
) + if sync: _add_sync_by_allreduce(block) def _wait(self, current_endpoint, endpoints): assert (self.wait_port) diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py index 9535c9ef53c2e6..6f435bb86ba5ac 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py @@ -19,130 +19,21 @@ from ..base.private_helper_function import wait_server_ready from paddle.fluid.optimizer import PipelineOptimizer as PO from .meta_optimizer_base import MetaOptimizerBase -from .common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY, CollectiveHelper, is_update_op, is_loss_grad_op, is_backward_op, is_optimizer_op - - -def _get_node_num(endpoints): - ss = set() - for ep in endpoints: - ip = ep.split(":")[0].strip() - if ip not in ss: - ss.add(ip) - return len(ss) - - -class PipelineHelper(object): - def __init__(self, role_maker, wait_port='6174'): - self.wait_port = wait_port - self.role_maker = role_maker - - def update_startup_program(self, - startup_program=None, - inner_parallelism=None): - self.startup_program = startup_program - - nranks = self.role_maker._worker_num() - rank = self.role_maker._worker_index() - endpoints = self.role_maker._get_trainer_endpoints() - current_endpoint = endpoints[rank] - node_num = _get_node_num(endpoints) - assert nranks % node_num == 0 - - # Create ring 0 for all gpus in the same pipeline - if inner_parallelism > 1: - pipeline_rank = rank % inner_parallelism - pipeline_id = rank // inner_parallelism - start_index = pipeline_id * inner_parallelism - pipeline_endpoints = endpoints[start_index:start_index + - inner_parallelism] - self._init_communicator(self.startup_program, current_endpoint, - pipeline_endpoints, pipeline_rank, 0, - self.wait_port) - - pipeline_num = len(endpoints) // inner_parallelism - if pipeline_num == 1: return - # Create rings for 
gpus with the same pipeline id for data parallel - eps = [] - pipeline_rank = rank % inner_parallelism - ring_id = pipeline_rank + 1 - for i in range(pipeline_num): - eps.append(endpoints[i * inner_parallelism + pipeline_rank]) - # rank in a ring of gpus with the same pipeline id for data parallel - dp_rank = rank // inner_parallelism - self._init_communicator(self.startup_program, current_endpoint, eps, - dp_rank, ring_id, self.wait_port) - self._broadcast_params(ring_id) - - def _init_communicator(self, program, current_endpoint, endpoints, rank, - ring_id, wait_port): - nranks = len(endpoints) - other_endpoints = endpoints[:] - other_endpoints.remove(current_endpoint) - if rank == 0 and wait_port: - wait_server_ready(other_endpoints) - - block = program.global_block() - nccl_id_var = block.create_var( - name=unique_name.generate('nccl_id'), - persistable=True, - type=core.VarDesc.VarType.RAW) - block.append_op( - type='c_gen_nccl_id', - inputs={}, - outputs={'Out': nccl_id_var}, - attrs={ - 'rank': rank, - 'endpoint': current_endpoint, - 'other_endpoints': other_endpoints, - OP_ROLE_KEY: OpRole.Forward, - }) - block.append_op( - type='c_comm_init', - inputs={'X': nccl_id_var}, - outputs={}, - attrs={ - 'nranks': nranks, - 'rank': rank, - 'ring_id': ring_id, - OP_ROLE_KEY: OpRole.Forward, - }) - - def _broadcast_params(self, ring_id): - block = self.startup_program.global_block() - for var_name in block.vars: - if "nccl_id" in var_name: continue - param = block.var(var_name) - if not param.persistable: - continue - - block.append_op( - type='c_broadcast', - inputs={'X': param}, - outputs={'Out': param}, - attrs={ - 'ring_id': ring_id, - 'root': 0, - OP_ROLE_KEY: OpRole.Forward - }) - - block.append_op( - type='c_sync_comm_stream', - inputs={'X': param}, - outputs={'Out': param}, - attrs={'ring_id': ring_id, - OP_ROLE_KEY: OpRole.Forward}) +from .common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY, CollectiveHelper, is_loss_grad_op, is_backward_op, is_optimizer_op 
class PipelineOptimizer(MetaOptimizerBase): def __init__(self, optimizer): super(PipelineOptimizer, self).__init__(optimizer) self.inner_opt = optimizer - # we do not allow meta optimizer to be inner optimizer currently self.meta_optimizers_white_list = [ "RecomputeOptimizer", "AMPOptimizer", ] self.meta_optimizers_black_list = ["GraphExecutionOptimizer", ] + self.global_ring_id = 1 + self.dp_ring_id = 2 + self.start_pipeline_ring_id = 20 # Just a magic number def _set_basic_info(self, loss, role_maker, user_defined_optimizer, user_defined_strategy): @@ -165,7 +56,11 @@ def _can_apply(self): def _disable_strategy(self, dist_strategy): dist_strategy.pipeline = False - dist_strategy.pipeline_configs = {} + dist_strategy.pipeline_configs = { + "micro_batch_size": 1, + "accumulate_steps": 1, + "schedule_mode": "1F1B", + } def _enable_strategy(self, dist_strategy, context): dist_strategy.pipeline = True @@ -175,61 +70,134 @@ def _enable_strategy(self, dist_strategy, context): "schedule_mode": "1F1B", } + def _broadcast_params(self, ring_id): + block = self.startup_program.global_block() + param = None + for param in block.iter_parameters(): + if param.is_distributed: + continue + + block.append_op( + type='c_broadcast', + inputs={'X': param}, + outputs={'Out': param}, + attrs={ + 'ring_id': ring_id, + 'root': 0, + OP_ROLE_KEY: OpRole.Forward + }) + + if not param: return # no parameter on this device + block.append_op( + type='c_sync_comm_stream', + inputs={'X': param}, + outputs={'Out': param}, + attrs={'ring_id': ring_id, + OP_ROLE_KEY: OpRole.Forward}) + + def _get_process_group_info(self): + # global ring info + self.global_endpoints = self.endpoints + self.global_rank = self.rank + self.global_nranks = self.nranks + + # data parallel ring info + if self.pipeline_num > 1: + self.dp_rank = self.rank // self.inner_parallelism + self.dp_nranks = self.nranks // self.inner_parallelism + start_index = self.rank % self.inner_parallelism + self.dp_endpoints = [ + 
self.endpoints[start_index + i * self.inner_parallelism] + for i in range(self.pipeline_num) + ] + + def _init_process_group(self, pipeline_pair, pipeline_ring_map): + self._get_process_group_info() + collective_helper = CollectiveHelper(self.role_maker, wait_port=False) + # Create global ring for all gpus (ring_id = 0) + collective_helper._init_communicator( + self.startup_program, self.current_endpoint, self.global_endpoints, + self.global_rank, self.global_ring_id, True, self.global_ring_id, + True) + # Create pipeline rings + if self.inner_parallelism > 1: + pipeline_id = self.rank // self.inner_parallelism + start_index = pipeline_id * self.inner_parallelism + for pair in pipeline_pair: + pair_key = pair[0] * 1000 + pair[1] + ring_id = pipeline_ring_map[pair_key] + assert ring_id >= self.start_pipeline_ring_id + first_node = pair[0] + start_index + second_node = pair[1] + start_index + if self.rank != first_node and self.rank != second_node: + continue + pipeline_endpoints = [ + self.endpoints[first_node], self.endpoints[second_node] + ] + pipeline_rank = 0 if self.rank == first_node else 1 + pipeline_nranks = 2 + collective_helper._init_communicator( + self.startup_program, self.current_endpoint, + pipeline_endpoints, pipeline_rank, ring_id, False, + self.global_ring_id, True) + + # Create dp rings + if self.pipeline_num > 1: + collective_helper._init_communicator( + self.startup_program, self.current_endpoint, self.dp_endpoints, + self.dp_rank, self.dp_ring_id, True, self.global_ring_id, True) + self._broadcast_params(self.dp_ring_id) + def minimize_impl(self, loss, startup_program=None, parameter_list=None, no_grad_set=None): - endpoints = self.role_maker._get_trainer_endpoints() - current_endpoint = endpoints[self.role_maker._worker_index()] - self.wrapped_opt = PO(self.inner_opt, - num_microbatches=self.num_microbatches) - node_num = _get_node_num(endpoints) - gpus_per_node = len(endpoints) // node_num - self.startup_program = startup_program - if 
startup_program is None: - self.startup_program = fluid.default_startup_program() - + self.endpoints = self.role_maker._get_trainer_endpoints() + self.current_endpoint = self.endpoints[self.role_maker._worker_index()] self.rank = self.role_maker._worker_index() self.nranks = self.role_maker._worker_num() - assert self.nranks % node_num == 0 - loss.block.program._pipeline_opt = dict() - loss.block.program._pipeline_opt['local_rank'] = self.rank - loss.block.program._pipeline_opt[ - 'micro_batch_size'] = self.micro_batch_size - loss.block.program._pipeline_opt['schedule_mode'] = self.schedule_mode - optimize_ops, params_grads, prog_list = self.wrapped_opt.minimize( + self.wrapped_opt = PO(self.inner_opt, + num_microbatches=self.num_microbatches) + orig_startup_program = startup_program if startup_program else fluid.default_startup_program( + ) + block = loss.block + program = block.program + + program._pipeline_opt = dict() + program._pipeline_opt['local_rank'] = self.rank + program._pipeline_opt['global_ring_id'] = self.global_ring_id + program._pipeline_opt['ring_id'] = self.start_pipeline_ring_id + program._pipeline_opt['micro_batch_size'] = self.micro_batch_size + program._pipeline_opt['schedule_mode'] = self.schedule_mode + optimize_ops, params_grads, prog_list, pp_pair, ring_map = self.wrapped_opt.minimize( loss, startup_program, parameter_list, no_grad_set) - assert prog_list - - self.main_program_list = prog_list - self.main_program = loss.block.program - self.inner_parallelism = loss.block.program._pipeline_opt[ - 'inner_parallelism'] + self.startup_program = orig_startup_program._pipeline_opt[ + 'startup_program'] + self.inner_parallelism = program._pipeline_opt['inner_parallelism'] assert self.nranks % self.inner_parallelism == 0 + assert prog_list + self.pipeline_num = len(self.endpoints) // self.inner_parallelism - pipeline_helper = PipelineHelper(self.role_maker) - pipeline_helper.update_startup_program( - 
self.startup_program._pipeline_opt["startup_program"], - self.inner_parallelism) + self._init_process_group(pp_pair, ring_map) - pipeline_num = self.nranks // self.inner_parallelism - self._transpile_main_program(loss, pipeline_num, self.inner_parallelism) + self.main_program_list = prog_list + self.main_program = program + if self.pipeline_num > 1: + self._transpile_main_program(loss) return optimize_ops, params_grads - def _transpile_main_program(self, loss, pipeline_num, inner_parallelism): - if pipeline_num <= 1: return - self._insert_loss_grad_ops(loss, pipeline_num) - for ring_id in range(1, inner_parallelism + 1): - self._insert_allreduce_ops(ring_id) + def _transpile_main_program(self, loss): + self._insert_loss_grad_ops(loss, self.pipeline_num) + self._insert_allreduce_ops(self.dp_ring_id) def _insert_loss_grad_ops(self, loss, pipeline_num): """ In order to keep the learning rate consistent in different numbers of training workers, we scale the loss grad by the number of workers """ - block = self.main_program_list[-1]['program'].global_block() + block = self.main_program_list[-1].global_block() for idx, op in reversed(list(enumerate(block.ops))): if is_loss_grad_op(op): loss_grad_var = block.vars[op.output_arg_names[0]] @@ -244,57 +212,53 @@ def _insert_loss_grad_ops(self, loss, pipeline_num): }) def _insert_allreduce_ops(self, ring_id): - block = self.main_program_list[ring_id - 1]['program'].global_block() + block = self.main_program._pipeline_opt['section_program'].global_block( + ) origin_block = self.main_program.global_block() grad = None processed_param_name = set() + first_optimize_op_idx = None + add_sync_calc_stream = False for idx, op in reversed(list(enumerate(block.ops))): + if is_backward_op(op) and not first_optimize_op_idx: + first_optimize_op_idx = idx + 1 + # no optimize phase + if first_optimize_op_idx == len(block.ops): return if is_backward_op(op) and \ OP_ROLE_VAR_KEY in op.attr_names: op_role_var = op.all_attrs()[OP_ROLE_VAR_KEY] if 
len(op_role_var) == 0: continue assert len(op_role_var) % 2 == 0 - offset = idx + offset = 0 for i in range(0, len(op_role_var), 2): param_name = op_role_var[i] param = block.vars[op_role_var[i]] if param_name in processed_param_name: continue processed_param_name.add(param_name) - grad = block.vars[op_role_var[i + 1]] + grad_name = op_role_var[i + 1] + if not 'MERGED' in grad_name: grad_name += '@MERGED' + grad = block.vars[grad_name] origin_param = origin_block.vars[op_role_var[i]] if origin_param.is_distributed: continue - if offset == idx: - offset += 1 + if not add_sync_calc_stream: + add_sync_calc_stream = True block._insert_op( - offset, + first_optimize_op_idx + offset, type='c_sync_calc_stream', inputs={'X': grad}, outputs={'Out': grad}, - attrs={OP_ROLE_KEY: OpRole.Backward}) + attrs={OP_ROLE_KEY: OpRole.Optimize}) offset += 1 block._insert_op( - offset, + first_optimize_op_idx + offset, type='c_allreduce_sum', inputs={'X': grad}, outputs={'Out': grad}, attrs={ 'ring_id': ring_id, - OP_ROLE_KEY: OpRole.Backward + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Optimize }) - - if grad is None: - return - - for idx, op in enumerate(block.ops): - if is_optimizer_op(op): - block._insert_op( - idx, - type='c_sync_comm_stream', - inputs={'X': grad}, - outputs={'Out': grad}, - attrs={'ring_id': ring_id, - OP_ROLE_KEY: OpRole.Backward}) - break diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py index f9c3a613c4053a..67e83a2ec4617c 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py @@ -123,7 +123,8 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): outputs={"Out": out_var}, attrs={ "in_dtype": in_var.dtype, - "out_dtype": out_var.dtype + "out_dtype": out_var.dtype, + "op_device": op.attr("op_device") }) num_cast_ops += 1 _rename_arg(op, in_var.name, out_var.name) @@ -171,8 +172,11 @@ def 
_insert_cast_post_op(block, op, idx, src_dtype, dest_dtype, target_name, type="cast", inputs={"X": target_var}, outputs={"Out": cast_var}, - attrs={"in_dtype": target_var.dtype, - "out_dtype": cast_var.dtype}) + attrs={ + "in_dtype": target_var.dtype, + "out_dtype": cast_var.dtype, + "op_device": op.attr("op_device") + }) num_cast_ops += 1 op_var_rename_map[block.idx][target_var.name] = cast_var.name diff --git a/python/paddle/fluid/device_worker.py b/python/paddle/fluid/device_worker.py index b923f36af8d027..0f98af5772313d 100644 --- a/python/paddle/fluid/device_worker.py +++ b/python/paddle/fluid/device_worker.py @@ -427,7 +427,7 @@ def _gen_worker_desc(self, trainer_desc): section_param.schedule_mode = schedule_mode cfg = section_param.section_config program = pipeline_opt["section_program"] - cfg.program_desc.ParseFromString(program["program"]._get_desc() + cfg.program_desc.ParseFromString(program._get_desc() .serialize_to_string()) # TODO: why does not work # cfg.program_desc.CopyFrom(program.program._get_desc()) diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 9b0b04a6ea7164..da326ec074c1d9 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1458,7 +1458,7 @@ def _run_from_dataset(self, dataset._prepare_to_run() real_fetch_list = [] if program._pipeline_opt: - real_program = program._pipeline_opt["section_program"]['program'] + real_program = program._pipeline_opt["section_program"] for fetch_var in fetch_list: if isinstance(fetch_var, Variable): fetch_var_name = fetch_var.name @@ -1467,13 +1467,20 @@ def _run_from_dataset(self, if fetch_var_name in real_program.global_block().vars: real_fetch_list.append(fetch_var) - program._pipeline_opt["section_program"][ - 'program'] = self._add_feed_fetch_ops( - program=program._pipeline_opt["section_program"]['program'], - feed=[], - fetch_list=real_fetch_list, - feed_var_name='feed', - fetch_var_name='fetch') + 
program._pipeline_opt["section_program"] = self._add_feed_fetch_ops( + program=program._pipeline_opt["section_program"], + feed=[], + fetch_list=real_fetch_list, + feed_var_name='feed', + fetch_var_name='fetch') + main_block = program._pipeline_opt["section_program"].block(0) + for op in main_block.ops: + # set the op_role of fetch op to Optimize to avoid + # erase the fetched vars by gc for pipeline + if op.type == 'fetch': + op._set_attr( + 'op_role', + core.op_proto_and_checker_maker.OpRole.Optimize) fetch_list = None scope, trainer = self._prepare_trainer( diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 9c724cbfdd4a73..2aa918bf806616 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -3784,6 +3784,12 @@ def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0): "Optimizer, but the given type is {}.".format( type(optimizer))) self._optimizer = optimizer + + # Get the original optimizer defined by users, such as SGD + self._origin_optimizer = self._optimizer + while hasattr(self._origin_optimizer, "inner_opt"): + self._origin_optimizer = self._origin_optimizer.inner_opt + assert num_microbatches >= 1, ( "num_microbatches must be a positive value.") self._num_microbatches = num_microbatches @@ -3797,13 +3803,98 @@ def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0): self._op_role_var_key = op_maker.kOpRoleVarAttrName() self._op_device_key = op_maker.kOpDeviceAttrName() self._param_device_map = None + self._pipeline_pair = [] + self._pp_ring_map = dict() + self._global_ring_id = None + + # insert allreduce op to sync global information for global + # gradient clip and amp + def _insert_allreduce_op(self, op_idx, block): + """ + Insert allreduce op to sync global information for global + gradient clip and amp. 
+ """ + op = block.ops[op_idx] + out_name = op.desc.output_arg_names()[0] + out_var = block.var(out_name) + offset = 0 + if op.type == "reduce_any": + # cast the bool var to int32 to use allreduce_max op + temp_var_name = unique_name.generate(out_name + "_cast_int32") + temp_var = block.create_var( + name=temp_var_name, shape=[1], dtype="int32") + block._insert_op( + op_idx + 1 + offset, + type='cast', + inputs={'X': out_var}, + outputs={'Out': temp_var}, + attrs={ + 'in_dtype': out_var.dtype, + 'out_dtype': temp_var.dtype, + self._op_role_key: self._op_role.Optimize + }) + offset += 1 + block._insert_op( + op_idx + 1 + offset, + type='c_allreduce_max' + if op.type == "reduce_any" else 'c_allreduce_sum', + inputs={'X': temp_var if op.type == "reduce_any" else out_var}, + outputs={'Out': temp_var if op.type == "reduce_any" else out_var}, + attrs={ + 'ring_id': self._global_ring_id, + self._op_role_key: self._op_role.Optimize, + 'use_calc_stream': True + }) + offset += 1 + if op.type == "reduce_any": + block._insert_op( + op_idx + 1 + offset, + type='cast', + inputs={'X': temp_var}, + outputs={'Out': out_var}, + attrs={ + 'in_dtype': temp_var.dtype, + 'out_dtype': out_var.dtype, + self._op_role_key: self._op_role.Optimize + }) + return offset def _create_vars(self, block, ori_block): - # Create vars for block, copied from main_program's global block + # Create vars for block, copied from ori_block used_var_set = set() - for op_idx in range(block.desc.op_size()): - op_desc = block.desc.op(op_idx) - vars = op_desc.input_arg_names() + op_desc.output_arg_names() + added_op_num = 0 + op_idx = 0 + op_size = block.desc.op_size() + while op_idx < op_size + added_op_num: + # Whether to insert allreduce_sum or allreduce_max op. + # For amp and global gradient clip strategies, we should + # get the global information, so allreduce op is needed. 
+ should_insert = False + op = block.ops[op_idx] + # For op process vars on all devices, remove its input + # vars not in this block + reserved_x = [] + if op.type == 'reduce_any' and self._is_optimize_op(op): + should_insert = True + elif op.type == 'concat' and self._is_optimize_op(op): + for input_name in op.desc.input("X"): + if block._find_var_recursive(input_name): + reserved_x.append(input_name) + op.desc.set_input('X', reserved_x) + elif op.type == 'update_loss_scaling': + for input_name in op.desc.input("X"): + if block._find_var_recursive(input_name): + reserved_x.append(input_name) + op.desc.set_input('X', reserved_x) + op.desc.set_output('Out', reserved_x) + elif op.type == 'sum' and self._is_gradient_clip_op(op): + for input_name in op.desc.input("X"): + if block._find_var_recursive(input_name): + reserved_x.append(input_name) + op.desc.set_input('X', reserved_x) + should_insert = True + + vars = op.desc.input_arg_names() + op.desc.output_arg_names() for var in vars: # a var whose name contains "blocking_queue" # only exists in startup program @@ -3813,27 +3904,39 @@ def _create_vars(self, block, ori_block): if block._find_var_recursive(str(var)): continue source_var = ori_block._var_recursive(str(var)) if source_var.type == core.VarDesc.VarType.READER: - block.create_var( + dest_var = block.create_var( name=var, type=core.VarDesc.VarType.READER, persistable=source_var.persistable) else: - block._clone_variable(source_var, False) + dest_var = block._clone_variable(source_var, False) + dest_var.stop_gradient = source_var.stop_gradient + # When use with sharding, allreduce_sum and allreduce_max + # used for global gradient clip and amp will be added by sharding. 
+ op_idx += 1 + if self.use_sharding or not should_insert: continue + inserted_ops = self._insert_allreduce_op(op_idx - 1, block) + added_op_num += inserted_ops + op_idx += inserted_ops + block._sync_with_cpp() def _is_loss_grad_op(self, op): - if self._op_role_key not in op.attr_names: - return False - op_role = int(op.all_attrs()[self._op_role_key]) + assert self._op_role_key in op.attr_names + op_role = int(op.attr(self._op_role_key)) return op_role & int(self._op_role.Backward) and op_role & int( self._op_role.Loss) def _is_backward_op(self, op): - return self._op_role_key in op.attr_names and int(op.all_attrs()[ - self._op_role_key]) & int(self._op_role.Backward) + return self._op_role_key in op.attr_names and ( + int(op.attr(self._op_role_key)) & int(self._op_role.Backward)) + + def _is_loss_op(self, op): + assert self._op_role_key in op.attr_names + return int(op.attr(self._op_role_key)) == int(self._op_role.Loss) def _is_optimize_op(self, op): - return self._op_role_key in op.attr_names and int(op.all_attrs()[ - self._op_role_key]) & int(self._op_role.Optimize) + return self._op_role_key in op.attr_names and ( + int(op.attr(self._op_role_key)) & int(self._op_role.Optimize)) def _is_update_op(self, op): return 'Param' in op.input_names and 'Grad' in op.input_names and ( @@ -3842,50 +3945,40 @@ def _is_update_op(self, op): def _split_program(self, main_program, devices): """ Split a program into sections according to devices that ops run on. - The ops of the role LRSched are copied to all sections. + The op whose op_device attr is "gpu:all" is copied to all sections. 
Args: main_program (Program): the main program devices: all used devices """ - programs = [] # Map from device to its corresponding section program info - device_program_map = dict() - for device in devices: - p = {'program': Program()} - device_program_map[device] = p + device_program_map = defaultdict(Program) block = main_program.block(0) for op in block.ops: device = op.attr(self._op_device_key) - op_role = op.attr(self._op_role_key) - if int(op_role) & int(self._op_role.LRSched): - # Copy ops of the role LRSched to all sections. - for device in device_program_map.keys(): - program = device_program_map[device] - op_desc = op.desc - ap_op = program["program"].block(0).desc.append_op() - ap_op.copy_from(op_desc) - # ap_op._set_attr(self._op_device_key, "") - elif op.type == "create_py_reader" or op.type == "read" or op.type == "create_double_buffer_reader": - # Copy read related ops to all section to make them exit after each epoch. - for device in device_program_map.keys(): + # Copy ops whose op_device set to "gpu:all" to all sections. 
+ if device == "gpu:all": + for device in devices: program = device_program_map[device] op_desc = op.desc - ap_op = program["program"].block(0).desc.append_op() + ap_op = program.global_block().desc.append_op() ap_op.copy_from(op_desc) + ap_op._set_attr(self._op_device_key, "") else: program = device_program_map[device] op_desc = op.desc - ap_op = program["program"].block(0).desc.append_op() + ap_op = program.global_block().desc.append_op() ap_op.copy_from(op_desc) + ap_op._set_attr(self._op_device_key, "") + program_list = [] for key in devices: program = device_program_map[key] - program['program']._sync_with_cpp() - programs.append(program) + program._sync_with_cpp() + program_list.append(program) - return programs + return program_list def _get_op_device_for_startup_program(self, var_name): """ @@ -3894,21 +3987,22 @@ def _get_op_device_for_startup_program(self, var_name): get the real op_device attribute of the fill_constant as the device where the corresponding parameters on. """ - assert "beta1_pow_acc" in var_name or "beta2_pow_acc" in var_name + assert "beta1_pow_acc" in var_name or "beta2_pow_acc" in var_name, \ + 'For accumulators for Adam, the name must contain beta1_pow_acc ' \ + 'or beta2_pow_acc.' 
param_name = var_name[0:var_name.index('_beta')] device = self._param_device_map[param_name] return device - def _split_startup_program(self, startup_program, local_rank): - block = startup_program.block(0) + def _split_startup_program(self, startup_program, device_id): + block = startup_program.global_block() new_startup_program = Program() for op in block.ops: device = op.attr(self._op_device_key) if device == "cpu": assert op.type == "fill_constant", ( - "For ops in startup " - "program that with the op_device attribute of cpu, " - "they must be fill_constant.") + "For ops in startup program with the op_device attribute " + "of cpu, they must be of type fill_constant.") output_var = op.output_arg_names[0] device = self._get_op_device_for_startup_program(output_var) @@ -3917,14 +4011,13 @@ def _split_startup_program(self, startup_program, local_rank): else: # LR related ops device = None - if device and device_index != local_rank: continue + if device and device_index != device_id: continue op_desc = op.desc - ap_op = new_startup_program.block(0).desc.append_op() + ap_op = new_startup_program.global_block().desc.append_op() ap_op.copy_from(op_desc) ap_op._set_attr(self._op_device_key, "") new_startup_program._sync_with_cpp() - self._create_vars( - new_startup_program.block(0), startup_program.global_block()) + self._create_vars(new_startup_program.global_block(), block) return new_startup_program def _find_post_op(self, ops, cur_op, var_name): @@ -3937,6 +4030,11 @@ def _find_post_op(self, ops, cur_op, var_name): var_name as output. var_name (string): Variable name. 
""" + # To skip the cast op added by amp which has no op_device set + if '.cast_fp32' in var_name: + var_name = var_name.replace('.cast_fp32', '') + elif '.cast_fp16' in var_name: + var_name = var_name.replace('.cast_fp16', '') post_op = [] before = True for op in ops: @@ -3965,7 +4063,8 @@ def _find_real_prev_op(self, ops, cur_op, var_name): """ prev_op = [] for op in ops: - if op.type == 'send_v2' or op.type == 'recv_v2': + if op.type == 'send_v2' or op.type == 'recv_v2' \ + or op.type == 'c_broadcast': continue if op == cur_op: break @@ -3980,11 +4079,8 @@ def _find_real_prev_op(self, ops, cur_op, var_name): return None def _rename_arg(self, op, old_name, new_name): - op_desc = op.desc - if isinstance(op_desc, tuple): - op_desc = op_desc[0] - op_desc._rename_input(old_name, new_name) - op_desc._rename_output(old_name, new_name) + op._rename_input(old_name, new_name) + op._rename_output(old_name, new_name) def _create_var(self, block, ref_var, name): """ @@ -3998,99 +4094,12 @@ def _create_var(self, block, ref_var, name): dtype=ref_var.dtype, type=ref_var.type, lod_level=ref_var.lod_level, - persistable=False, - is_data=False, + persistable=ref_var.persistable, + is_data=ref_var.is_data, need_check_feed=ref_var.desc.need_check_feed()) + new_var.stop_gradient = ref_var.stop_gradient return new_var - def _get_data_var_info(self, block): - """ - Get info of all vars whose is_data attribute are true. 
- """ - # map of data vars to devices that that data on - data_devices_map = dict() - for op in block.ops: - dev_spec = op.attr(self._op_device_key) - for var_name in op.input_arg_names: - if "blocking_queue" in var_name: continue - var = block.var(var_name) - if not var.is_data: - continue - if not var_name in data_devices_map: - data_devices_map[var_name] = [] - if not dev_spec in data_devices_map[var_name]: - data_devices_map[var_name].append(dev_spec) - return data_devices_map - - def _insert_sendrecv_for_data_var(self, main_block, programs, startup, - devices): - """ - Insert send and recv ops for data var that on other devices. - - Args: - main_block (Block): Global block for main program - programs (dict): Dictionary for section params - startup (Program): Startup program - devices (list): List of devices in the format (dev:dev_index) - """ - main_program = main_block.program - data_devices_map = self._get_data_var_info(main_block) - - first_prog = programs[0]['program'] - first_block = first_prog.block(0) - insert_index = 0 - for op in first_block.ops: - insert_index += 1 - if op.type == "read": - break - first_dev_spec = devices[0] - first_dev_index = int(first_dev_spec.split(':')[1]) - for var_name in data_devices_map.keys(): - for device in data_devices_map[var_name]: - if device == first_dev_spec: continue - main_var = main_block.var(var_name) - assert main_var.is_data - if not var_name in first_block.vars: - self._create_var(first_block, main_var, var_name) - dev_index = int(device.split(':')[1]) - first_block._insert_op( - index=insert_index, - type='send_v2', - inputs={'X': first_block.var(var_name)}, - attrs={ - self._op_device_key: first_dev_spec, - self._op_role_key: self._op_role.Forward, - 'use_calc_stream': True, - 'peer': dev_index, - }) - # Get the device that that data on - assert device in devices - prog_index = devices.index(device) - prog = programs[prog_index]['program'] - block = prog.block(0) - index = 0 - for op in block.ops: - index 
+= 1 - if op.type == "read": - break - source_var = main_program.block(0).var(var_name) - new_var = self._create_var(block, source_var, var_name) - new_var_shape = list(new_var.shape) - new_var_shape[0] = self.micro_batch_size if new_var_shape[ - 0] < 0 else new_var_shape[0] - block._insert_op( - index=index, - type='recv_v2', - outputs={'Out': [new_var]}, - attrs={ - 'out_shape': new_var_shape, - 'dtype': new_var.dtype, - self._op_device_key: device, - self._op_role_key: self._op_role.Forward, - 'peer': first_dev_index, - 'use_calc_stream': True, - }) - def _strip_grad_suffix(self, name): """ Strip the grad suffix from the given variable name @@ -4104,95 +4113,161 @@ def _append_grad_suffix(self, name): """ return name + core.grad_var_suffix() - def _add_opdevice_attr_for_regularization_clip(self, block): + def _get_op_device_attr(self, op): """ - Add op_device attribute for regulization and clip ops. + Get the op_device attribute of a op. """ - for op in block.ops: - # role for regularization and clip ops is optimize - if int(op.attr(self._op_role_key)) != int(self._op_role.Optimize): - continue - if op.has_attr(self._op_device_key) and ( - op.attr(self._op_device_key) != ""): - continue - assert self._op_role_var_key in op.attr_names - op_role_var = op.all_attrs()[self._op_role_var_key] - assert len(op_role_var) == 2 + device = op.attr(self._op_device_key) \ + if op.has_attr(self._op_device_key) else None + if device: + assert device[0:3] == 'gpu', "Now, only gpu devices are " \ + "supported in pipeline parallemism." + return device + + def _add_op_device_attr_for_op(self, op, idx, block): + """ + Add op_device attrribute for ops that have not that attribute set. + We use "gpu:all" to represent the op should be put on all + sub-programs, such as lr-related ops. Note that: "gpu:all" + is only used by pipeline as an indicator. 
+ """ + lrsched_role = int(self._op_role.LRSched) + if op.attr(self._op_role_key) == lrsched_role: + # For LRSched ops, we should put them on all sub-programs to + # make sure each sub-program update the lr correctly + op._set_attr(self._op_device_key, "gpu:all") + elif (op.type == "cast" or + op.type == "scale") and self._is_backward_op(op): + prev_op = self._find_real_prev_op(block.ops, op, + op.desc.input("X")[0]) + op._set_attr(self._op_device_key, prev_op.attr(self._op_device_key)) + elif op.type == "memcpy" and not self._is_optimize_op(op): + assert len(op.input_arg_names) == 1 and len( + op.output_arg_names) == 1 + input_name = op.input_arg_names[0] + output_name = op.output_arg_names[0] + if '@Fetch' in output_name: + post_op = self._find_post_op(block.ops, op, output_name) + op._set_attr(self._op_device_key, + post_op.attr(self._op_device_key)) + else: + prev_op = self._find_real_prev_op(block.ops, op, + op.desc.input("X")[0]) + op._set_attr(self._op_device_key, + prev_op.attr(self._op_device_key)) + elif self._is_loss_op(op): + # For loss * loss_scaling op added by AMP + offset = 1 + while (not block.ops[idx + offset].has_attr(self._op_device_key) or + not block.ops[idx + offset].attr(self._op_device_key)): + offset += 1 + device = block.ops[idx + offset].attr(self._op_device_key) + assert device, "Please put you program within device_guard scope." 
+ for i in range(offset): + block.ops[idx + i]._set_attr(self._op_device_key, device) + elif self._is_optimize_op(op) and op.type == "check_finite_and_unscale": + op_role_var = op.attr(self._op_role_var_key) param_name = op_role_var[0] device = self._param_device_map[param_name] op._set_attr(self._op_device_key, device) - - def _add_default_opdevice_attr(self, block): + elif self._is_optimize_op(op) and op.type == "cast": + # For fp16-->fp32 cast added by AMP + grad_name = op.output('Out') + assert len(grad_name) == 1 + param_name = grad_name[0].strip(core.grad_var_suffix()) + device = self._param_device_map[param_name] + op._set_attr(self._op_device_key, device) + elif self._is_gradient_clip_op(op) or self._is_regularization_op(op): + # For gradient clip and regularization ops, we set their op_device + # attribute to the device where their corresponding parameters on. + assert self._op_role_var_key in op.attr_names, "gradient_clip " \ + "and regularization ops must have op_role_var attribute." + op_role_var = op.attr(self._op_role_var_key) + assert len(op_role_var) == 2, "op_role_var for gradient_clip " \ + "regularization ops must have two elements." + param_name = op_role_var[0] + device = self._param_device_map[param_name] + # For sum op added by global gradient clip, it must be + # put on all devices + if (op.type == 'sum' or op.type == 'sqrt' or + op.type == 'fill_constant' or + op.type == 'elementwise_max' or + op.type == 'elementwise_div'): + device = "gpu:all" + op._set_attr(self._op_device_key, device) + else: + other_known_ops = [ + 'update_loss_scaling', 'reduce_any', 'concat', 'sum' + ] + assert op.type in other_known_ops, "For other ops without " \ + "op_device set, they must be one of {}, but it " \ + "is {}".format(other_known_ops, op.type) + assert self._is_optimize_op(op) + op._set_attr(self._op_device_key, "gpu:all") + + def _add_op_device_attr(self, block): """ - 1. Add default op_device attribute for lr-related ops. 
- The default value is the one that of the first place. - 2. Add default op_device attribute for sum ops added during - backward. For these ops, we set the op_device attribute - as the one of its post op, i.e, which op has the output of the - sum op as an input. + Add op_device attrribute for ops in block that have + not that attribute set. """ - first_devcie = "" - - # Get the device spec of the first place. - # device_spec: 'cpu' for cpu device and 'gpu:id' for gpu device, - # e.g. 'gpu:0', 'gpu:1', etc. - for op in block.ops: - if op.has_attr(self._op_device_key) and ( - op.attr(self._op_device_key) != ""): - first_device = op.attr(self._op_device_key) - break - assert first_device - first_device_type = first_device.split(":")[0] - assert first_device_type == "gpu" - - # set op_device attr for lr-related ops - lrsched_role = int(self._op_role.LRSched) - for op in block.ops: - if not op.has_attr(self._op_device_key) or ( - op.attr(self._op_device_key) == ""): - if op.type == "sum": - # For sum ops that compute the sum of @RENAMED@ vars - for name in op.desc.input_arg_names(): - assert '@RENAME@' in name - assert len(op.desc.output_arg_names()) == 1 - out_name = op.desc.output_arg_names()[0] - post_op = self._find_post_op(block.ops, op, out_name) - device = post_op.attr(self._op_device_key) - assert device - op._set_attr(self._op_device_key, device) - continue - - assert op.attr(self._op_role_key) == lrsched_role, ( - "Op whose op_device attr has not been set for pipeline" - " must be of the role LRSched.") - op._set_attr(self._op_device_key, first_device) + for idx, op in enumerate(list(block.ops)): + if (op.type == "create_py_reader" or op.type == "read" or + op.type == "create_double_buffer_reader"): + # Copy read related ops to all section to make them exit + # after each epoch. + # We use "gpu:all" to represent the op should be put on all + # sub-programs, such as lr-related ops. Note that: "gpu:all" + # is only used by pipeline as an indicator. 
+ op._set_attr(self._op_device_key, "gpu:all") + continue + # op_device attribute has been set + if self._get_op_device_attr(op): continue + self._add_op_device_attr_for_op(op, idx, block) def _check_validation(self, block): """ - Check whether ops in a block are all validate (i.e., the - op_device attribute has been set). - Then, return all device specifications in order. + Check whether ops in a block have both the op_device and the + op_role attributes set. + Then, return all devices in order. """ - device_specs = [] + device_list = [] + # Section worker only supports the following op_role + valid_op_role_value = [ + int(self._op_role.LRSched), + int(self._op_role.Forward), + int(self._op_role.Backward), + int(self._op_role.Loss), + int(self._op_role.Optimize), + int(self._op_role.Backward) | int(self._op_role.Loss), + ] for op in block.ops: - type = op.type - if not op._has_kernel(type): + if not op._has_kernel(op.type): assert op.type == "conditional_block" and ( op.attr(self._op_role_key) == int(self._op_role.LRSched)), ( "Now, the only supported op without kernel is " "conditional_block, and its op role must be LRSched.") + assert op.has_attr(self._op_role_key), ( + "op ({}) has no {} attribute.".format(op.type, + self._op_role_key)) + assert int(op.attr(self._op_role_key)) in valid_op_role_value, \ + "op_role {} for op {} must be one of {}".format( + op.attr(self._op_role_key), + op.type, + valid_op_role_value) assert op.has_attr(self._op_device_key), ( "op ({}) has no {} attribute.".format(op.type, self._op_device_key)) - dev_spec = op.attr(self._op_device_key) - assert dev_spec, ("op_device attribute for op " - "{} has not been set.".format(op.type)) - dev_type = dev_spec.split(':')[0] + + device = op.attr(self._op_device_key) + assert device, ("op_device attribute for op " + "{} has not been set.".format(op.type)) + if device == "gpu:all": continue + dev_type = device.split(':')[0] assert dev_type == "gpu", ("Now only gpu devices are supported " "for 
pipeline parallelism.") - if not dev_spec in device_specs: - device_specs.append(dev_spec) - return device_specs + if not device in device_list: + device_list.append(device) + return device_list def _insert_sendrecv_ops_for_boundaries(self, block): """ @@ -4201,148 +4276,267 @@ def _insert_sendrecv_ops_for_boundaries(self, block): """ extra_index = 0 - # A map from var to device spec where op takes it as input, + # A map from var to device where op takes it as input, # avoiding multiple send and recv ops. - var_devspec = dict() + var_dev_map = dict() for index, op in enumerate(list(block.ops)): - # skips lr-related ops and vars, as we will process them later. - if int(op.attr(self._op_role_key)) & int(self._op_role.LRSched): - continue - # skips update ops and vars, as we will process them later. - if self._is_update_op(op): continue - - cur_device_spec = op.attr(self._op_device_key) + cur_device = op.attr(self._op_device_key) + if cur_device == "gpu:all": continue for var_name in op.input_arg_names: # i.e., lod_tensor_blocking_queue created by DataLoader, # which only exists in startup program. 
- if not var_name in block.vars: continue var = block.var(var_name) # skip data, because we will process it later if var.is_data: continue + prev_device = None + if var_name in self._param_device_map: + prev_device = self._param_device_map[var_name] prev_op = self._find_real_prev_op(block.ops, op, var_name) - if prev_op is None: - continue - prev_device_spec = prev_op.attr(self._op_device_key) + if not prev_device: + prev_device = prev_op.attr(self._op_device_key) \ + if prev_op else None + if not prev_device or prev_device == 'gpu:all': continue - if prev_device_spec != cur_device_spec: - if var_name not in var_devspec: - var_devspec[var_name] = [] - if cur_device_spec in var_devspec[var_name]: continue - var_devspec[var_name].append(cur_device_spec) + if prev_device != cur_device: + if var_name not in var_dev_map: var_dev_map[var_name] = [] + if cur_device in var_dev_map[var_name]: continue + var_dev_map[var_name].append(cur_device) op_role = op.all_attrs()[self._op_role_key] var = block.vars[var_name] - prev_device_index = int(prev_device_spec.split(':')[1]) - cur_device_index = int(cur_device_spec.split(':')[1]) - block._insert_op( - index=index + extra_index, - type='send_v2', - inputs={'X': var}, - attrs={ - self._op_device_key: prev_device_spec, - self._op_role_key: op_role, - 'use_calc_stream': True, - 'peer': cur_device_index, - }) - extra_index += 1 - var_shape = list(var.shape) - var_shape[0] = self.micro_batch_size if var_shape[ - 0] < 0 else var_shape[0] - block._insert_op( - index=index + extra_index, - type='recv_v2', - outputs={'Out': [var]}, - attrs={ - 'out_shape': var_shape, - 'dtype': var.dtype, - self._op_device_key: cur_device_spec, - self._op_role_key: op_role, - 'use_calc_stream': True, - 'peer': prev_device_index, - }) - extra_index += 1 - - def _clear_gradients(self, main_block, dev_spec): - """ - Clear gradients at the begining of each run of a minibatch. 
- """ - for param_name in self._param_device_map: - device = self._param_device_map[param_name] - if device != dev_spec: continue - grad_name = self._append_grad_suffix(param_name) - if not main_block.has_var(grad_name): continue - grad_var = main_block.vars[grad_name] - grad_var.persistable = True - main_block._insert_op( - index=0, - type='fill_constant', - inputs={}, - outputs={'Out': [grad_var]}, - attrs={ - 'shape': grad_var.shape, - 'dtype': grad_var.dtype, - 'value': float(0), - self._op_device_key: device, - # a trick to run this op once per mini-batch - self._op_role_key: self._op_role.Optimize.LRSched, - }) + prev_device_index = int(prev_device.split(':')[1]) + cur_device_index = int(cur_device.split(':')[1]) + pair = (prev_device_index, cur_device_index) + pair_key = prev_device_index * 1000 + cur_device_index + if pair not in self._pipeline_pair: + self._pipeline_pair.append(pair) + self._pp_ring_map[pair_key] = self.ring_id + ring_id = self.ring_id + self.ring_id += 1 + else: + ring_id = self._pp_ring_map[pair_key] + if self.schedule_mode == 'F-then-B': # F-then-B + block._insert_op( + index=index + extra_index, + type='send_v2', + inputs={'X': var}, + attrs={ + self._op_device_key: prev_device, + self._op_role_key: op_role, + 'use_calc_stream': True, + 'peer': 1, + 'ring_id': ring_id + }) + extra_index += 1 + block._insert_op( + index=index + extra_index, + type='recv_v2', + outputs={'Out': [var]}, + attrs={ + 'out_shape': var.shape, + 'dtype': var.dtype, + self._op_device_key: cur_device, + self._op_role_key: op_role, + 'use_calc_stream': True, + 'peer': 0, + 'ring_id': ring_id + }) + extra_index += 1 + elif self.schedule_mode == '1F1B': # 1F1B + block._insert_op( + index=index + extra_index, + type='c_sync_calc_stream', + inputs={'X': [var]}, + outputs={'Out': [var]}, + attrs={ + self._op_device_key: prev_device, + self._op_role_key: op_role, + }) + extra_index += 1 + block._insert_op( + index=index + extra_index, + type='send_v2', + inputs={'X': 
var}, + attrs={ + self._op_device_key: prev_device, + self._op_role_key: op_role, + 'use_calc_stream': False, + 'ring_id': ring_id, + 'peer': 1, + }) + extra_index += 1 + block._insert_op( + index=index + extra_index, + type='c_sync_comm_stream', + inputs={'X': [var]}, + outputs={'Out': [var]}, + attrs={ + self._op_device_key: prev_device, + self._op_role_key: self._op_role.Backward, + 'ring_id': ring_id, + }) + extra_index += 1 + var_shape = list(var.shape) + var_shape[0] = self.micro_batch_size if var_shape[ + 0] < 0 else var_shape[0] + block._insert_op( + index=index + extra_index, + type='recv_v2', + outputs={'Out': [var]}, + attrs={ + 'out_shape': var_shape, + 'dtype': var.dtype, + self._op_device_key: cur_device, + self._op_role_key: op_role, + 'use_calc_stream': True, + 'peer': 0, + 'ring_id': ring_id + }) + extra_index += 1 + else: + raise ValueError( + "Now only 'F-then-B' and '1F1B' are supported." + "The given value is {}.".format(self.schedule_mode)) - def _accumulate_gradients(self, block): + def _insert_loss_scale(self, block): """ - Accumulate the gradients generated in microbatch to the one in mini-batch. - We also scale the loss corresponding to number of micro-batches as well. + Scale the loss corresponding to number of micro-batches. 
""" + if self._num_microbatches == 1: return for index, op in reversed(tuple(enumerate(list(block.ops)))): - offset = index - device = op.attr(self._op_device_key) - - # Backward pass if self._is_loss_grad_op(op): loss_grad_var = block.vars[op.output_arg_names[0]] - scale_factor = self._num_microbatches block._insert_op( index=index + 1, type='scale', inputs={'X': loss_grad_var}, outputs={'Out': loss_grad_var}, attrs={ - 'scale': 1.0 / scale_factor, - self._op_device_key: device, + 'scale': 1.0 / self._num_microbatches, self._op_role_key: self._op_role.Backward }) break - if self._is_backward_op(op) and ( - self._op_role_var_key in op.attr_names): - op_role_var = op.all_attrs()[self._op_role_var_key] - if len(op_role_var) == 0: + def _rename_gradient_var_name(self, block): + for index, op in enumerate(block.ops): + if not self._is_optimize_op(op): continue + input_names = op.input_arg_names + output_names = op.output_arg_names + in_out_names = input_names + output_names + if op.type == 'cast': continue + # append "MERGED" to the names of parameter gradients, + # and mofify the op_role_var attribute (by rename_arg func). + for name in in_out_names: + if not core.grad_var_suffix() in name: continue + param_name = name.strip(core.grad_var_suffix()) + new_grad_name = name + "@MERGED" + self._rename_arg(op, name, new_grad_name) + + def _accumulate_gradients(self, block, pp_allreduce_in_optimize=False): + """ + Create a new merged gradient for each parameter and accumulate the + corresponding gradient to it. 
+ """ + merged_gradient_names = [] + first_opt_op_idx = None + + for index, op in reversed(tuple(enumerate(list(block.ops)))): + # remove the cast op of fp16 grad to fp32 grad + if self._is_optimize_op(op) and op.type == 'cast': + in_name = op.input_arg_names[0] + out_name = op.output_arg_names[0] + if out_name.strip('@GRAD') in self._param_device_map: + assert in_name.replace('.cast_fp16', '') == out_name + block._remove_op(index) continue + + if self._is_backward_op(op) and not first_opt_op_idx: + first_opt_op_idx = index + 1 + # no optimize phase + if first_opt_op_idx == len(block.ops): return + if block.ops[first_opt_op_idx].type == "c_sync_comm_stream": + first_opt_op_idx += 1 + + if self._is_backward_op(op) and ( + self._op_role_var_key in op.attr_names): + op_role_var = op.attr(self._op_role_var_key) + if len(op_role_var) == 0: continue assert len(op_role_var) % 2 == 0 - offset = index for i in range(0, len(op_role_var), 2): - grad_name = op_role_var[i + 1] - grad_var = block.vars[grad_name] - new_grad_var_name = unique_name.generate(grad_name) - new_var = self._create_var(block, grad_var, - new_grad_var_name) - self._rename_arg(op, grad_name, new_grad_var_name) + offset = 0 + param_name = op_role_var[i] + if not block.has_var(param_name): continue + if '@BroadCast' in param_name: continue + param_grad_name = param_name + core.grad_var_suffix() + merged_param_grad_name = param_grad_name + '@MERGED' + if not block.has_var(merged_param_grad_name): + self._create_var(block, block.vars[param_name], + merged_param_grad_name) + assert block.has_var(merged_param_grad_name) + param_grad_var = block.var(param_grad_name) + merged_param_grad_var = block.var(merged_param_grad_name) + merged_param_grad_var.persistable = True block._insert_op( - index=offset + 1, - type='sum', - inputs={'X': [grad_var, new_var]}, - outputs={'Out': grad_var}, + index=first_opt_op_idx + offset, + type='fill_constant', + inputs={}, + outputs={'Out': [merged_param_grad_var]}, attrs={ - 
self._op_device_key: device, - self._op_role_key: self._op_role.Backward, - self._op_role_var_key: op_role_var + 'shape': merged_param_grad_var.shape, + 'dtype': merged_param_grad_var.dtype, + 'value': float(0), + # a trick to run this op once per mini-batch + self._op_role_key: self._op_role.Optimize.LRSched, }) offset += 1 + grad_name = op_role_var[i + 1] + grad_var = block.vars[grad_name] + if not 'cast_fp16' in grad_name: + block._insert_op( + index=first_opt_op_idx + offset, + type='sum', + inputs={'X': [grad_var, merged_param_grad_var]}, + outputs={'Out': merged_param_grad_var}, + attrs={ + self._op_role_key: self._op_role.Backward, + }) + offset += 1 + merged_gradient_names.append(merged_param_grad_name) + else: + # cast gradient to fp32 to accumulate to merged gradient + cast_grad_var_name = param_grad_name + '@TMP' + cast_grad_var = self._create_var(block, param_grad_var, + cast_grad_var_name) + cast_grad_var.persistable = False + block._insert_op( + index=first_opt_op_idx + offset, + type='cast', + inputs={'X': grad_var}, + outputs={'Out': cast_grad_var}, + attrs={ + 'in_dtype': grad_var.dtype, + 'out_dtype': cast_grad_var.dtype, + self._op_role_key: self._op_role.Backward, + }) + offset += 1 + block._insert_op( + index=first_opt_op_idx + offset, + type='sum', + inputs={ + 'X': [merged_param_grad_var, cast_grad_var] + }, + outputs={'Out': merged_param_grad_var}, + attrs={ + self._op_role_key: self._op_role.Backward, + }) + offset += 1 + merged_gradient_names.append(merged_param_grad_name) + return merged_gradient_names def _add_sub_blocks(self, main_block, program_list): main_program = main_block.program - for prog_info in program_list: - prog = prog_info['program'] + for prog in program_list: for op in prog.block(0).ops: if not op.has_attr('sub_block'): continue @@ -4372,8 +4566,7 @@ def _process_persistable_vars_in_multi_sections(self, main_program, # var_info = {var_name: [program1, program2...]}, # persistable var only var_info = dict() - for 
prog_info in program_list: - prog = prog_info['program'] + for prog in program_list: block = prog.block(0) for var_name in block.vars: if var_name == "double_buffer_0": continue @@ -4395,7 +4588,7 @@ def _process_persistable_vars_in_multi_sections(self, main_program, block = prog.block(0) for op in block.ops: if op.type == "recv_v2" or op.type == "create_py_reader" or \ - op.type == "read": + op.type == "read" or op.type == "update_loss_scaling": continue # We have processed lr related vars if op.attr(self._op_role_key) == int( @@ -4423,6 +4616,15 @@ def _process_persistable_vars_in_multi_sections(self, main_program, read_block = prog.block(0) read_device = self._get_device_info(read_block) read_dev_index = int(read_device.split(':')[1]) + pair = (write_dev_index, read_dev_index) + pair_key = write_dev_index * 1000 + read_dev_index + if pair not in self._pipeline_pair: + self._pipeline_pair.append(pair) + self._pp_ring_map[pair_key] = self.ring_id + ring_id = self.ring_id + self.ring_id += 1 + else: + ring_id = self._pp_ring_map[pair_key] write_block._insert_op( index=0, @@ -4430,11 +4632,12 @@ def _process_persistable_vars_in_multi_sections(self, main_program, inputs={'X': write_block.var(var_name), }, attrs={ self._op_device_key: write_device, - 'use_calc_stream': True, + 'use_calc_stream': False, # A trick to make the role LRSched to avoid copy every # microbatch self._op_role_key: self._op_role.LRSched, 'peer': read_dev_index, + 'ring_id': ring_id }) read_block._insert_op( index=0, @@ -4444,12 +4647,33 @@ def _process_persistable_vars_in_multi_sections(self, main_program, 'out_shape': read_block.var(var_name).shape, 'dtype': read_block.var(var_name).dtype, self._op_device_key: read_device, - 'use_calc_stream': True, + 'use_calc_stream': False, # A trick to make the role LRSched to avoid copy every # microbatch self._op_role_key: self._op_role.LRSched, - 'peer': write_dev_index + 'peer': write_dev_index, + 'ring_id': ring_id }) + read_block._insert_op( + 
index=1, + type='c_sync_comm_stream', + inputs={'X': [read_block.var(var_name)]}, + outputs={'Out': [read_block.var(var_name)]}, + attrs={ + self._op_device_key: read_device, + # A trick to make the role LRSched to avoid copy every + # microbatch + self._op_role_key: self._op_role.LRSched, + 'ring_id': ring_id + }) + + def _is_gradient_clip_op(self, op): + return op.desc.has_attr("op_namescope") \ + and op.desc.attr("op_namescope").startswith("/gradient_clip") + + def _is_regularization_op(self, op): + return op.desc.has_attr("op_namescope") \ + and op.desc.attr("op_namescope").startswith("/regularization") def minimize(self, loss, @@ -4457,23 +4681,34 @@ def minimize(self, parameter_list=None, no_grad_set=None): main_block = loss.block + self.origin_main_block = main_block if startup_program is None: startup_program = default_startup_program() optimize_ops, params_grads = self._optimizer.minimize( loss, startup_program, parameter_list, no_grad_set) - self._param_device_map = self._optimizer._param_device_map + self._param_device_map = self._origin_optimizer._param_device_map + assert main_block.program._pipeline_opt \ + and 'local_rank' in main_block.program._pipeline_opt, \ + 'Please use pipeline with fleet.' + local_rank = main_block.program._pipeline_opt['local_rank'] + self._global_ring_id = main_block.program._pipeline_opt[ + 'global_ring_id'] + schedule_mode = 0 + if 'schedule_mode' in main_block.program._pipeline_opt: + schedule_mode = main_block.program._pipeline_opt['schedule_mode'] + self.schedule_mode = schedule_mode + # micro batch size self.micro_batch_size = main_block.program._pipeline_opt[ 'micro_batch_size'] - # Step1: add default op_device attribute for regulization and clip ops - self._add_opdevice_attr_for_regularization_clip(main_block) - - # Step2: add default op_device attribute for ops whose op_device - # attribute have not been set yet. Then check all ops have the - # op_device attribute. 
- self._add_default_opdevice_attr(main_block) + self.use_sharding = False + if 'use_sharding' in main_block.program._pipeline_opt: + self.use_sharding = main_block.program._pipeline_opt['use_sharding'] + self.ring_id = main_block.program._pipeline_opt['ring_id'] - device_specs = self._check_validation(main_block) + # Step1: add default op_device attribute for ops. + self._add_op_device_attr(main_block) + device_list = self._check_validation(main_block) def device_cmp(device1, device2): dev1_id = int(device1.split(':')[1]) @@ -4485,70 +4720,59 @@ def device_cmp(device1, device2): else: return 0 - sorted_device_spec = sorted(device_specs, key=cmp_to_key(device_cmp)) - assert sorted_device_spec == device_specs, ( - "With pipeline " - "parallelism, you must use gpu devices one after another " - "in the order of their ids.") - - # Step3: add send and recv ops between section boundaries + sorted_device_list = sorted(device_list, key=cmp_to_key(device_cmp)) + assert sorted_device_list == device_list, ( + "With pipeline parallelism, you must use gpu devices one after " + "another in the order of their ids.") + # Step2: add send and recv ops between section boundaries self._insert_sendrecv_ops_for_boundaries(main_block) - # Step4: split program into sections and add pairs of + # Step3: split program into sections and add pairs of # send and recv ops for data var. 
main_program = main_block.program - program_list = self._split_program(main_program, device_specs) + program_list = self._split_program(main_program, device_list) for p in program_list: - self._create_vars(p["program"].block(0), - main_program.global_block()) - self._insert_sendrecv_for_data_var(main_block, program_list, - startup_program, device_specs) + self._create_vars(p.global_block(), main_block) - # Step5: Special Case: process persistable vars that exist in + # Step4: Special Case: process persistable vars that exist in # multiple sections self._process_persistable_vars_in_multi_sections( main_program, startup_program, program_list) - # Step6: Add sub blocks for section programs + # Step5: Add sub blocks for section programs self._add_sub_blocks(main_block, program_list) - assert (main_program._pipeline_opt and - isinstance(main_program._pipeline_opt, dict) and - 'local_rank' in main_program._pipeline_opt), \ - "You must use pipeline with fleet" - local_rank = main_program._pipeline_opt['local_rank'] % len( - device_specs) - self.schedule_mode = main_program._pipeline_opt['schedule_mode'] - + local_rank = main_program._pipeline_opt['local_rank'] % len(device_list) place_list = [] - for dev_spec in device_specs: - dev_index = dev_spec.split(":")[1] - place_list.append(core.CUDAPlace(local_rank)) + for dev in device_list: + dev_index = int(dev.split(":")[1]) + place_list.append(core.CUDAPlace(dev_index % 8)) - # Step7: Split startup program + # Step6: Split startup program new_startup_program = self._split_startup_program(startup_program, local_rank) - # Step8: clear gradients before each mini-batch and - # accumulate gradients during backward - self._clear_gradients( - program_list[local_rank]['program'].global_block(), - dev_spec=device_specs[local_rank]) - self._accumulate_gradients(program_list[local_rank]['program'] - .global_block()) - startup_program._pipeline_opt = { "startup_program": new_startup_program, } + real_block = 
program_list[local_rank].global_block() + self._insert_loss_scale(real_block) + if not self.use_sharding: + # Step7: clear gradients before each mini-batch and + # accumulate gradients during backward + self._rename_gradient_var_name(real_block) + real_block._sync_with_cpp() + self._accumulate_gradients(real_block) + real_block._sync_with_cpp() place_id = int(os.getenv("FLAGS_selected_gpus", "0")) main_program._pipeline_opt = { "trainer": "PipelineTrainer", "device_worker": "Section", "pipeline_stage": local_rank, - "num_pipeline_stages": len(device_specs), + "num_pipeline_stages": len(device_list), "schedule_mode": self.schedule_mode, - "inner_parallelism": len(device_specs), + "inner_parallelism": len(device_list), "section_program": program_list[local_rank], "place": place_list[local_rank], "place_id": place_id, @@ -4556,7 +4780,7 @@ def device_cmp(device1, device2): "num_microbatches": self._num_microbatches, "start_cpu_core_id": self._start_cpu_core_id, } - return optimize_ops, params_grads, program_list + return optimize_ops, params_grads, program_list, self._pipeline_pair, self._pp_ring_map class RecomputeOptimizer(Optimizer): diff --git a/python/paddle/fluid/tests/unittests/pipeline_mnist.py b/python/paddle/fluid/tests/unittests/pipeline_mnist.py index f433af24813d52..8c3a66f933f59d 100644 --- a/python/paddle/fluid/tests/unittests/pipeline_mnist.py +++ b/python/paddle/fluid/tests/unittests/pipeline_mnist.py @@ -66,12 +66,21 @@ def cnn_model(data): param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE] scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5 - predict = fluid.layers.fc( - input=conv_pool_2, - size=SIZE, - act="softmax", - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01))) + with fluid.device_guard("gpu:1"): + predict = fluid.layers.fc( + input=conv_pool_2, + size=SIZE, + act="softmax", + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01))) + # To 
cover @RENAMED@GRADIENT + predict2 = fluid.layers.fc( + input=conv_pool_1, + size=SIZE, + act="softmax", + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01))) + predict += predict2 return predict @@ -108,7 +117,10 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None): bd = [steps_per_pass * p for p in passes] lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr) - opt = fluid.optimizer.Momentum(learning_rate=lr_val, momentum=0.9) + opt = fluid.optimizer.Momentum( + learning_rate=lr_val, + momentum=0.9, + grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)) acc_steps = 2 # accumulated steps for pipeline if dist_strategy: @@ -120,6 +132,7 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None): fleet.init(is_collective=True) strategy = fleet.DistributedStrategy() strategy.pipeline = True + strategy.amp = True strategy.pipeline_configs = { 'micro_batch_size': batch_size, 'schedule_mode': '1F1B', From b47478efc2caf9247fd73245a3b87154ec3e81a1 Mon Sep 17 00:00:00 2001 From: cc <52520497+juncaipeng@users.noreply.github.com> Date: Fri, 26 Mar 2021 19:15:56 +0800 Subject: [PATCH 06/15] [dygraph qat] Use layer to calculate output scale (#31861) * Use layer to calculate output scale * add backward for moving_average_abs_max_scale and save output scales to op's attr --- paddle/fluid/operators/fake_quantize_op.cc | 69 +++-- paddle/fluid/operators/fake_quantize_op.cu | 4 +- paddle/fluid/operators/fake_quantize_op.h | 16 +- paddle/fluid/pybind/op_function_generator.cc | 6 +- .../slim/quantization/imperative/qat.py | 268 +++++------------- .../slim/quantization/imperative/quant_nn.py | 111 ++++---- .../slim/quantization/imperative/utils.py | 47 +-- .../slim/tests/test_imperative_out_scale.py | 25 -- .../tests/unittests/test_fake_quantize_op.py | 5 +- 9 files changed, 222 insertions(+), 329 deletions(-) diff --git 
a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc index abfc88e5155e5a..4544386718813c 100644 --- a/paddle/fluid/operators/fake_quantize_op.cc +++ b/paddle/fluid/operators/fake_quantize_op.cc @@ -649,13 +649,18 @@ class MovingAverageAbsMaxScaleOp : public framework::OperatorWithKernel { "MovingAverageAbsMaxScale"); OP_INOUT_CHECK(ctx->HasOutput("OutScale"), "Output", "OutScale", "MovingAverageAbsMaxScale"); + if (ctx->HasOutput("OutState")) { ctx->SetOutputDim("OutState", {1}); } if (ctx->HasOutput("OutAccum")) { ctx->SetOutputDim("OutAccum", {1}); } - ctx->SetOutputDim("OutScale", {1}); + if (ctx->HasOutput("Out")) { + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->SetOutputDim("OutScale", {1}); + ctx->ShareLoD("X", /*->*/ "Out"); + } } protected: @@ -673,6 +678,9 @@ class MovingAverageAbsMaxScaleOpMaker AddInput("X", "(Tensor) Input is float data type."); AddInput("InAccum", "Last accum.").AsDispensable(); AddInput("InState", "Last state.").AsDispensable(); + AddOutput("Out", + "(Tensor) Output tensor is just equivalent to the input tensor.") + .AsDispensable(); AddOutput("OutScale", " Current scale"); AddOutput("OutState", "(Tensor) state buffer.").AsDispensable(); AddOutput("OutAccum", "(Tensor) accum buffer.").AsDispensable(); @@ -693,7 +701,7 @@ And it will not quantize the input tensor. 
} }; -class FakeQuantDequantGradOp : public framework::OperatorWithKernel { +class StrightThroughEstimatorGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -701,9 +709,9 @@ class FakeQuantDequantGradOp : public framework::OperatorWithKernel { auto out_grad_name = framework::GradVarName("Out"); auto x_grad_name = framework::GradVarName("X"); OP_INOUT_CHECK(ctx->HasInput(out_grad_name), "Input", out_grad_name, - "FakeQuantDequantGradOp"); + "StrightThroughEstimatorGradOp"); OP_INOUT_CHECK(ctx->HasOutput(x_grad_name), "Output", x_grad_name, - "FakeQuantDequantGradOp"); + "StrightThroughEstimatorGradOp"); ctx->SetOutputDim(x_grad_name, ctx->GetInputDim(out_grad_name)); } @@ -717,13 +725,13 @@ class FakeQuantDequantGradOp : public framework::OperatorWithKernel { }; template -class FakeQuantDequantGradMaker : public framework::SingleGradOpMaker { +class StrightThroughEstimatorMaker : public framework::SingleGradOpMaker { public: using framework::SingleGradOpMaker::SingleGradOpMaker; protected: void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("fake_quantize_dequantize_grad"); + grad_op->SetType("stright_throuth_estimator_grad"); grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); grad_op->SetAttrMap(this->Attrs()); @@ -744,11 +752,11 @@ REGISTER_OPERATOR( REGISTER_OP_CPU_KERNEL(fake_quantize_abs_max, ops::FakeQuantizeAbsMaxKernel); -REGISTER_OPERATOR(fake_quantize_dequantize_abs_max, - ops::FakeQuantOrWithDequantAbsMaxOp, - ops::FakeQuantOrWithDequantAbsMaxOpMaker, - ops::FakeQuantDequantGradMaker, - ops::FakeQuantDequantGradMaker); +REGISTER_OPERATOR( + fake_quantize_dequantize_abs_max, ops::FakeQuantOrWithDequantAbsMaxOp, + ops::FakeQuantOrWithDequantAbsMaxOpMaker, + ops::StrightThroughEstimatorMaker, + ops::StrightThroughEstimatorMaker); REGISTER_OP_CPU_KERNEL(fake_quantize_dequantize_abs_max, 
ops::FakeQuantizeDequantizeAbsMaxKernel); @@ -769,11 +777,12 @@ REGISTER_OPERATOR( REGISTER_OP_CPU_KERNEL(fake_quantize_moving_average_abs_max, ops::FakeQuantizeMovingAverageAbsMaxKernel); -REGISTER_OPERATOR(fake_quantize_dequantize_moving_average_abs_max, - ops::FakeQuantOrWithDequantMovingAverageAbsMaxOp, - ops::FakeQuantOrWithDequantMovingAverageAbsMaxOpMaker, - ops::FakeQuantDequantGradMaker, - ops::FakeQuantDequantGradMaker); +REGISTER_OPERATOR( + fake_quantize_dequantize_moving_average_abs_max, + ops::FakeQuantOrWithDequantMovingAverageAbsMaxOp, + ops::FakeQuantOrWithDequantMovingAverageAbsMaxOpMaker, + ops::StrightThroughEstimatorMaker, + ops::StrightThroughEstimatorMaker); REGISTER_OP_CPU_KERNEL( fake_quantize_dequantize_moving_average_abs_max, ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel); @@ -789,20 +798,22 @@ REGISTER_OP_CPU_KERNEL(fake_channel_wise_quantize_abs_max, REGISTER_OPERATOR( moving_average_abs_max_scale, ops::MovingAverageAbsMaxScaleOp, ops::MovingAverageAbsMaxScaleOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + ops::StrightThroughEstimatorMaker, + ops::StrightThroughEstimatorMaker); REGISTER_OP_CPU_KERNEL(moving_average_abs_max_scale, ops::MovingAverageAbsMaxScaleKernel); -REGISTER_OPERATOR(fake_quantize_dequantize_grad, ops::FakeQuantDequantGradOp); -REGISTER_OP_CPU_KERNEL(fake_quantize_dequantize_grad, - ops::FakeQuantDequantGradKernel); +REGISTER_OPERATOR(stright_throuth_estimator_grad, + ops::StrightThroughEstimatorGradOp); +REGISTER_OP_CPU_KERNEL(stright_throuth_estimator_grad, + ops::StrightThroughEstimatorGradKernel); -REGISTER_OPERATOR(fake_channel_wise_quantize_dequantize_abs_max, - ops::FakeChannelWiseQuantizeDequantizeAbsMaxOp, - ops::FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker, - ops::FakeQuantDequantGradMaker, - ops::FakeQuantDequantGradMaker); +REGISTER_OPERATOR( + fake_channel_wise_quantize_dequantize_abs_max, + ops::FakeChannelWiseQuantizeDequantizeAbsMaxOp, + 
ops::FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker, + ops::StrightThroughEstimatorMaker, + ops::StrightThroughEstimatorMaker); REGISTER_OP_CPU_KERNEL( fake_channel_wise_quantize_dequantize_abs_max, ops::FakeChannelWiseQuantizeDequantizeAbsMaxKernel); @@ -820,4 +831,8 @@ REGISTER_OP_VERSION(moving_average_abs_max_scale) "Out", "Delete output in order to make the inference model not " "save moving_average_abs_max_scale operator. This will " - "make the quantitative model be correctly applied in inference.")); + "make the quantitative model be correctly applied in inference.")) + .AddCheckpoint( + R"ROC(Incompatible upgrade of output [Out])ROC", + paddle::framework::compatible::OpVersionDesc().NewOutput( + "Out", "In order to support dygraph qat, add output again.")); diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index 92127f9aebd0d5..78052179f6be72 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -543,8 +543,8 @@ REGISTER_OP_CUDA_KERNEL(moving_average_abs_max_scale, REGISTER_OP_CUDA_KERNEL( fake_quantize_dequantize_moving_average_abs_max, ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel); -REGISTER_OP_CUDA_KERNEL(fake_quantize_dequantize_grad, - ops::FakeQuantDequantGradKernel); +REGISTER_OP_CUDA_KERNEL(stright_throuth_estimator_grad, + ops::StrightThroughEstimatorGradKernel); REGISTER_OP_CUDA_KERNEL( fake_channel_wise_quantize_dequantize_abs_max, ops::FakeChannelWiseQuantizeDequantizeAbsMaxKernel); diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h index 94a75f930bebae..11a2d2de8bcf73 100644 --- a/paddle/fluid/operators/fake_quantize_op.h +++ b/paddle/fluid/operators/fake_quantize_op.h @@ -314,6 +314,12 @@ class MovingAverageAbsMaxScaleKernel : public framework::OpKernel { auto* in = context.Input("X"); auto& dev_ctx = context.template device_context(); + if (context.HasOutput("Out")) { + auto* out = 
context.Output("Out"); + out->mutable_data(context.GetPlace()); + framework::TensorCopy(*in, context.GetPlace(), dev_ctx, out); + } + bool is_test = context.Attr("is_test"); // testing if (is_test) { @@ -344,17 +350,17 @@ class MovingAverageAbsMaxScaleKernel : public framework::OpKernel { }; template -class FakeQuantDequantGradKernel : public framework::OpKernel { +class StrightThroughEstimatorGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto* d_out = context.Input(framework::GradVarName("Out")); auto x_grad_name = framework::GradVarName("X"); auto* d_x = context.Output(x_grad_name); - PADDLE_ENFORCE_NOT_NULL( - d_x, platform::errors::PreconditionNotMet( - "FakeQuantDequantGradOp doesn't have the output named %s.", - x_grad_name)); + PADDLE_ENFORCE_NOT_NULL(d_x, platform::errors::PreconditionNotMet( + "StrightThroughEstimatorGradKernel " + "doesn't have the output named %s.", + x_grad_name)); // Initialize dx as same as d_out d_x->mutable_data(context.GetPlace()); diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index b1c42d91df5048..69856fa4fa142e 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -84,7 +84,8 @@ std::map> op_outs_map = { {"matrix_nms", {"Out", "Index", "RoisNum"}}, {"distribute_fpn_proposals", {"MultiFpnRois", "RestoreIndex", "MultiLevelRoIsNum"}}, - {"moving_average_abs_max_scale", {"OutScale", "OutAccum", "OutState"}}, + {"moving_average_abs_max_scale", + {"Out", "OutScale", "OutAccum", "OutState"}}, {"multiclass_nms3", {"Out", "NmsRoisNum"}}, {"generate_proposals_v2", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}}, {"momentum", {"ParamOut", "VelocityOut"}}, @@ -137,7 +138,8 @@ std::map> op_passing_outs_map = { {"check_finite_and_unscale", {"Out", "FoundInfinite"}}, {"update_loss_scaling", {"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}}, - 
{"moving_average_abs_max_scale", {"OutScale", "OutAccum", "OutState"}}, + {"moving_average_abs_max_scale", + {"Out", "OutScale", "OutAccum", "OutState"}}, {"lamb", {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, {"rnn", {"DropoutState"}}, diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index ea2e8e073b5084..f4620ff00013c8 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -21,14 +21,14 @@ import paddle from paddle.fluid import dygraph, core, framework, unique_name -from paddle.fluid.executor import Executor +from paddle.fluid.executor import Executor, global_scope from paddle.fluid.param_attr import ParamAttr from paddle.fluid.initializer import Constant from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.fluid.io import load_inference_model, save_inference_model from paddle.fluid.log_helper import get_logger -from . import quant_nn from .. import quantization_pass +from . import quant_nn from . import utils __all__ = ['ImperativeQuantAware'] @@ -201,7 +201,7 @@ def forward(self, inputs): self._quantize_inputs = ImperativeQuantizeInputs(**kwargs) - self._calc_output_scale = ImperativeCalcOutputScale() + self._quantize_outputs = ImperativeQuantizeOutputs() def quantize(self, model): """ @@ -219,11 +219,11 @@ def quantize(self, model): assert isinstance(model, dygraph.Layer), \ "The model must be the instance of dygraph.Layer." 
self._quantize_inputs.apply(model) - self._calc_output_scale.apply(model) + self._quantize_outputs.apply(model) def save_quantized_model(self, layer, path, input_spec=None, **config): - self._calc_output_scale.save_quantized_model(layer, path, input_spec, - **config) + self._quantize_outputs.save_quantized_model(layer, path, input_spec, + **config) class ImperativeQuantizeInputs(object): @@ -323,10 +323,10 @@ def apply(self, model): idx += 1 target = name[last_idx:idx] - quant_layer = self._get_quantized_layer(layer) + quant_layer = self._get_input_quantized_layer(layer) setattr(obj, target, quant_layer) - def _get_quantized_layer(self, layer): + def _get_input_quantized_layer(self, layer): quant_layer_name = None for key, value in utils.quant_input_layers_map.items(): if isinstance(layer, value): @@ -343,24 +343,26 @@ def _get_quantized_layer(self, layer): return quant_nn.__dict__[quant_layer_name](layer, **self._kwargs) -class ImperativeCalcOutputScale(object): +class ImperativeQuantizeOutputs(object): + """ + Calculate the output scales for some layers. + """ + def __init__(self, moving_rate=0.9): """ - Add the logic of calculating and setting output scales of some layers. + The constructor for ImperativeQuantizeOutputs. Args: moving_rate(float): The decay coefficient of moving average. The default value is 0.9. """ - super(ImperativeCalcOutputScale, self).__init__() + super(ImperativeQuantizeOutputs, self).__init__() self._moving_rate = moving_rate - self._register_hook_handle_list = [] - self._out_scale_dict = collections.OrderedDict() def apply(self, model): """ - Insert the `moving_average_abs_max_scale` op to calculate output - scale of specific layers in model. + Insert the `moving_average_abs_max_scale` layers to calculate the + output scales for specific layers in the dygraph model. 
Args: model(fluid.dygraph.Layer): The target model which would be @@ -372,14 +374,25 @@ def apply(self, model): assert isinstance(model, dygraph.Layer), \ "The model must be the instance of dygraph.Layer." - # Calculate the target ops's output scale, and don't consider - # the skip_quant attr - for _, layer in model.named_sublayers(): - if self._is_target_layer(layer): - self._init_scale_params(layer) - hook_handle = layer.register_forward_post_hook( - self._calc_output_scale_hook) - self._register_hook_handle_list.append(hook_handle) + for name, layer in model.named_sublayers(): + if not self._is_target_layer(layer): + continue + + # TODO(jc): optimize this module + last_idx = 0 + idx = 0 + obj = model + while idx < len(name): + if (name[idx] == '.'): + if hasattr(obj, name[last_idx:idx]): + obj = getattr(obj, name[last_idx:idx]) + last_idx = idx + 1 + idx += 1 + target = name[last_idx:idx] + + quant_layer = quant_nn.__dict__["QuantizedOutputLayer"]( + layer, self._moving_rate) + setattr(obj, target, quant_layer) def save_quantized_model(self, layer, path, input_spec=None, **config): """ @@ -409,33 +422,18 @@ def save_quantized_model(self, layer, path, input_spec=None, **config): Returns: None """ - assert isinstance(layer, dygraph.Layer), \ "The model must be the instance of dygraph.Layer." 
- self._gather_output_scale(layer) - - with dygraph.guard(): - layer.eval() - for handle in self._register_hook_handle_list: - handle.remove() paddle.jit.save(layer=layer, path=path, input_spec=input_spec, **config) - if len(self._out_scale_dict) == 0: - warnings.warn("Warning: No Layer of the model while to be " \ - "saved contains the out_threshold attribute, so the " \ - "generated inference model would not contain the " \ - "out_threshold.") - return - - # load static model is_dynamic_mode = False if paddle.in_dynamic_mode(): is_dynamic_mode = True paddle.enable_static() - place = core.CUDAPlace(0) if core.is_compiled_with_cuda() \ - else core.CPUPlace() + place = core.CPUPlace() + scope = global_scope() exe = Executor(place) dirname = os.path.dirname(path) @@ -450,20 +448,10 @@ def save_quantized_model(self, layer, path, input_spec=None, **config): model_filename=model_filename, params_filename=params_filename)) - # TODO(jc): analyse whether the dygraph model has - # several blocks before applying qat - assert infer_program.num_blocks == 1, \ - "Quantization aware training (QAT) requires the program " \ - "only has a block for now. When the model has if-else or " \ - "while, the program will have several blocks." 
+ self._save_output_scale(infer_program, scope) - # set output scales to the static model - self._save_output_scale(infer_program) - - # process skip quant self._set_skip_quant_attr(infer_program) - # save the final quantized model that has output scales save_inference_model( dirname=dirname, feeded_var_names=feed_target_names, @@ -476,144 +464,42 @@ def save_quantized_model(self, layer, path, input_spec=None, **config): if is_dynamic_mode: paddle.disable_static() - def _gather_output_scale(self, layer): - """ - Gather all output scales to self._out_scale_dict - """ - with dygraph.guard(): - layer.eval() - for _, sub_layer in layer.named_sublayers(): - if self._is_target_layer(sub_layer): - layer_name = sub_layer.full_name() - if hasattr(sub_layer, "_quant_out_scale"): - self._out_scale_dict[layer_name] = float( - sub_layer._quant_out_scale) - - def _save_output_scale(self, infer_program): + def _is_target_layer(self, layer): """ - Save all output scales to the corresponding ops in static - inference program. - - Because the Layer in dygraph may correspond to multiple ops - in static program after being saved. To ensure correctness, - the outscale collected for output of dygraph Layer can only - be set to the last op in the corresponding ops in static program. + Whether the layer needs to calculate output scales. """ - assert infer_program.num_blocks == 1, \ - "The inference program should only have a block." 
- - global_block = infer_program.global_block() - target_ops = global_block.ops - - scale_idx = 0 - op_idx = 0 - attr_name = "out_threshold" - - for scale_name, scale_value in self._out_scale_dict.items(): - while True: - if op_idx >= len(target_ops): - break - - op = target_ops[op_idx] - if not self._is_scale_op_matched(scale_name, op, global_block): - op_idx += 1 - else: - if op.type in utils.weight_op_types \ - and op_idx + 1 < len(target_ops) \ - and target_ops[op_idx+1].type == "elementwise_add": - target_ops[op_idx + 1]._set_attr(attr_name, scale_value) - op_idx += 2 - else: - op._set_attr(attr_name, scale_value) - op_idx += 1 - scale_idx += 1 - break - - if scale_idx != len(self._out_scale_dict): - _logger.warning("Warning: the model have %s output scales, "\ - "but it only saves %s output scales." \ - % (len(self._out_scale_dict), scale_idx)) - - def _is_target_layer(self, layer): return isinstance(layer, tuple(utils.quant_output_layers_map.values())) \ - or ('quantized_' in layer.full_name() and \ + or ('quantized' in layer.full_name() and \ 'quantized_noweight' not in layer.full_name()) - def _init_scale_params(self, layer, name=None): + def _save_output_scale(self, program, scope): """ - Init the scale params for calculating output scales and save them in the - target layer. - After the users define the dygraph model, the hooks for calculating output - scales will not execute immediately. If the users load parameters form - checkpoint and save the quantized inference model immediately, the inference - model would not be saved successfully. Beacuse the dygraph_to_static requires - that the parameters created in __init__, but the uniqueness of hook make it - impossible to create parameters in __init__. To avoid this mistake, we define - the scale parameters in the beginning instead of hook. + Save all output scales to the corresponding ops in static + inference program and delete 'moving_average_abs_max_scale' ops. 
""" + for block in program.blocks: + for op in block.ops: + if op.type == "moving_average_abs_max_scale": + in_var_name = op.input('X')[0] + out_var_name = op.output('Out')[0] + out_scale_name = op.output('OutScale')[0] - def _create_param(in_layer, first_name, last_name, dtype): - prefix = '{}.{}'.format(first_name, last_name) \ - if first_name else 'outscale.{}'.format(last_name) - attr = ParamAttr( - name=unique_name.generate(prefix), - initializer=Constant(1), - trainable=False) - param = in_layer.create_parameter(shape=[1], attr=attr, dtype=dtype) - return param - - dtype = layer._dtype if layer._dtype is not None else "float32" - if dtype not in ["float32", "float64"]: - return - - layer._quant_out_scale = _create_param(layer, name, "scale", dtype) - layer._quant_out_scale.stop_gradient = True - - layer._quant_out_state = _create_param(layer, name, "state", dtype) - layer._quant_out_state.stop_gradient = True + out_scale = utils.load_variable_data(scope, out_scale_name) + previous_op = utils.find_previous_op(block, in_var_name) + previous_op._set_attr("out_threshold", float(out_scale)) - layer._quant_out_accum = _create_param(layer, name, "accum", dtype) - layer._quant_out_accum.stop_gradient = True + next_ops = utils.find_next_ops(block, out_var_name) + for next_op in next_ops: + next_op._rename_input(out_var_name, in_var_name) - def _is_scale_op_matched(self, scale_name, op, block): + def _set_skip_quant_attr(self, program): """ - Based on the op name and attrs to judge whether the op in - program matches the scale_name. We must know the corresponding - name between dgraph and static model. + Label the skip quantized ops. 
""" - fp_type = [core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32] - if op.type in quantization_pass._op_real_in_out_name.keys(): - output_var_names = quantization_pass._get_op_output_var_names(op) - for output_var_name in output_var_names: - output_var_tensor = block.var(output_var_name) - if output_var_tensor.dtype not in fp_type: - return False - - # corresponding_map: [name, op_types, function] - # Note that, the items have priority in corresponding_map - corresponding_map = [ - ['conv2d_tranpose', ['conv2d_transpose', \ - 'depthwise_conv2d_transpose'], None], - ['conv2d', ['conv2d', 'depthwise_conv2d'], None], - ['linear', ['matmul'], None], - ['re_lu6', ['relu6'], None], - ['p_re_lu', ['prelu'], None], - ['leaky_re_lu', ['leaky_relu'], None], - ['re_lu', ['relu'], None], - ] - - for item in corresponding_map: - if item[0] in scale_name: - return (op.type in item[1]) and \ - (len(item) == 2 or item[2] is None or item[2](op)) - - return op.type in scale_name - - def _set_skip_quant_attr(self, program): - block = program.global_block() - for op in block.ops: - if self._is_skip_quant_op(block, op): - op._set_attr("skip_quant", True) + for block in program.blocks: + for op in block.ops: + if self._is_skip_quant_op(block, op): + op._set_attr("skip_quant", True) def _is_skip_quant_op(self, block, in_op): """ @@ -621,33 +507,11 @@ def _is_skip_quant_op(self, block, in_op): 1. the type of input op should be conv2d, depthwise_conv2d or matmul 2. 
the previous ops of the input op are not fake_quantize_dequantize ops """ - - def _find_previous_op(block, var_name): - for op in block.ops: - if var_name in op.output_arg_names: - return op - target_op_types = ["conv2d", "depthwise_conv2d", "matmul"] if in_op.type not in target_op_types: return False - previous_ops = [_find_previous_op(block, arg_name) \ + previous_ops = [utils.find_previous_op(block, arg_name) \ for arg_name in in_op.input_arg_names] - return any(op is not None and op.type not in utils.fake_quantize_dequantize_types \ - for op in previous_ops ) - - def _calc_output_scale_hook(self, layer, input, output): - """ - Create the MovingAverageAbsMaxScale layer for the target layer if needed. - Execute MovingAverageAbsMaxScale layer to calculate the output scale. - """ - assert isinstance(output, (core.VarBase, framework.Variable)), \ - "Multiple outputs are not currently supported in ImperativeOutScale." - - fp_types = [core.VarDesc.VarType.FP32, core.VarDesc.VarType.FP64] - if output.dtype in fp_types: - if not hasattr(layer, "_out_scale"): - self._out_scale = quant_nn.MovingAverageAbsMaxScale( - layer, output.name, self._moving_rate, output.dtype) - # TODO (jc): consider the ops that have several outputs - self._out_scale(output) + return any(op is not None and op.type not in \ + utils.fake_quantize_dequantize_types for op in previous_ops) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py index 3c4fb323bc505f..f6fef0689d43af 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py @@ -507,59 +507,42 @@ def forward(self, input): class MovingAverageAbsMaxScale(layers.Layer): - def __init__(self, layer=None, name=None, moving_rate=0.9, dtype='float32'): + def __init__(self, name=None, moving_rate=0.9, dtype='float32'): r""" - MovingAverageMaxScale layer 
is used to calculating the output quantization scale of Layer. - Its computational formula is described as below: + MovingAverageMaxScale layer is used to calculating the output quantization + scale of Layer. Its computational formula is described as below: :math:`scale = (moving\_rate*accum+max(abs(x)))/(moving\_rate*state+1)` :math:`Out = X` """ super(MovingAverageAbsMaxScale, self).__init__() self._moving_rate = moving_rate - self._dtype = dtype - self._layer = layer - if self._layer is None or not hasattr(self._layer, "_quant_out_scale"): - scale_prefix = '{}.scale'.format(name) if name else 'outscale.scale' - scale_name = unique_name.generate(scale_prefix) - scale_attr = ParamAttr( - name=scale_name, initializer=Constant(1), trainable=False) - self._scale = self.create_parameter( - shape=[1], attr=scale_attr, dtype=self._dtype) - self._scale.stop_gradient = True - if self._layer is not None: - setattr(self._layer, "_quant_out_scale", self._scale) - else: - self._scale = self._layer._quant_out_scale + scale_prefix = '{}.scale'.format(name) if name else 'outscale.scale' + scale_name = unique_name.generate(scale_prefix) + scale_attr = ParamAttr( + name=scale_name, initializer=Constant(1), trainable=False) + self._scale = self.create_parameter( + shape=[1], attr=scale_attr, dtype=dtype) + self._scale.stop_gradient = True - if self._layer is None or not hasattr(self._layer, "_quant_out_state"): - state_prefix = "{}.state".format(name) if name else 'outscale.state' - state_attr = ParamAttr( - name=unique_name.generate(state_prefix), - initializer=Constant(1), - trainable=False) - self._state = self.create_parameter( - shape=[1], attr=state_attr, dtype=self._dtype) - self._state.stop_gradient = True - if self._layer is not None: - setattr(self._layer, "_quant_out_state", self._state) - else: - self._state = self._layer._quant_out_state + state_prefix = "{}.state".format(name) if name else 'outscale.state' + state_attr = ParamAttr( + 
name=unique_name.generate(state_prefix), + initializer=Constant(1), + trainable=False) + self._state = self.create_parameter( + shape=[1], attr=state_attr, dtype=dtype) + self._state.stop_gradient = True - if self._layer is None or not hasattr(self._layer, "_quant_out_accum"): - accum_prefix = "{}.accum".format(name) if name else 'outscale.accum' - accum_attr = ParamAttr( - name=unique_name.generate(accum_prefix), - initializer=Constant(1), - trainable=False) - self._accum = self.create_parameter( - shape=[1], attr=accum_attr, dtype=self._dtype) - self._accum.stop_gradient = True - if self._layer is not None: - setattr(self._layer, "_quant_out_accum", self._accum) - else: - self._accum = self._layer._quant_out_accum + accum_prefix = "{}.accum".format(name) if name else 'outscale.accum' + accum_attr = ParamAttr( + name=unique_name.generate(accum_prefix), + initializer=Constant(1), + trainable=False) + self._accum = self.create_parameter( + shape=[1], attr=accum_attr, dtype=dtype) + self._accum.stop_gradient = True def forward(self, input): if in_dygraph_mode(): @@ -567,18 +550,30 @@ def forward(self, input): not self.training) state = self._state if self.training else None accum = self._accum if self.training else None + quant_out = _varbase_creator( + type=input.type, + name="{}.tmp".format(input.name), + shape=input.shape, + dtype=input.dtype, + persistable=False) - self._scale, _, _ = core.ops.moving_average_abs_max_scale( - input, accum, state, self._scale, state, accum, *attrs) - return self._scale + out, _, _, _ = core.ops.moving_average_abs_max_scale( + input, accum, state, quant_out, self._scale, state, accum, + *attrs) + return out check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'MovingAverageAbsMaxScale') attrs = {'moving_rate': self._moving_rate, 'is_test': not self.training} - inputs = {"X": [input]} - outputs = {"OutScale": [self._scale]} + quant_out = self._helper.create_variable( + name="{}.tmp".format(input.name), + 
dtype=input.dtype, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=False) + outputs = {"Out": [quant_out], "OutScale": [self._scale]} if self.training: inputs['InState'] = [self._state] @@ -592,4 +587,22 @@ def forward(self, input): outputs=outputs, attrs=attrs) - return self._scale + return quant_out + + +class QuantizedOutputLayer(layers.Layer): + def __init__(self, layer=None, moving_rate=0.9, dtype='float32'): + r""" + Add MovingAverageMaxScale layer to the behind of the input layer. + """ + super(QuantizedOutputLayer, self).__init__() + self._layer = layer + self._moving_average_abs_max_scale = \ + MovingAverageAbsMaxScale(layer.full_name(), moving_rate, dtype) + + def forward(self, input): + if isinstance(input, list): + assert len(input) == 1, \ + "The QuantizedOutputLayer should only have one input." + out = self._layer(input) + return self._moving_average_abs_max_scale(out) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py index 090f6cda389af2..f45eb8c97f419e 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py @@ -13,22 +13,7 @@ # limitations under the License. 
import paddle - -op_real_in_out_name = { - "conv2d": [["Input", "Filter"], ["Output"]], - "depthwise_conv2d": [["Input", "Filter"], ["Output"]], - "pool2d": [["X"], ["Out"]], - "elementwise_add": [["X", "Y"], ["Out"]], - "softmax": [["X"], ["Out"]], - "relu": [["X"], ["Out"]], - "relu6": [["X"], ["Out"]], - "leaky_relu": [["X"], ["Out"]], - "prelu": [["X"], ["Out"]], - "tanh": [["X"], ["Out"]], - "batch_norm": [["X"], ["Y"]], - "sigmoid": [["X"], ["Out"]], - "swish": [["X"], ["Out"]], -} +import numpy as np quant_input_layers_map = { 'Conv2D': paddle.nn.Conv2D, @@ -85,3 +70,33 @@ "conv2d", "depthwise_conv2d", "matmul", "conv2d_transpose", "depthwise_conv2d_transpose" ] + + +def load_variable_data(scope, var_name): + ''' + Load variable value from scope + ''' + var_node = scope.find_var(var_name) + assert var_node is not None, \ + "Can not find " + var_name + " in the scope." + return np.array(var_node.get_tensor()) + + +def find_previous_op(block, var_name): + """ + Find the previous op for the input variable. + """ + for op in block.ops: + if var_name in op.output_arg_names: + return op + + +def find_next_ops(block, var_name): + """ + Find all followed ops for the input variable. 
+ """ + res_ops = [] + for op in block.ops: + if var_name in op.input_arg_names: + res_ops.append(op) + return res_ops diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py index 600174e503feb2..8d6ce76ef0fa5f 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py @@ -478,30 +478,5 @@ def test_save_quantized_model(self): self.assertTrue(op_count == 14) -class TestSaveQuantizedModel_Warning(unittest.TestCase): - def test_warning(self): - path = "./dynamic_outscale_infer_model_with_warnings/lenet" - imperative_out_scale = ImperativeQuantAware() - with fluid.dygraph.guard(): - lenet = ImperativeLenet() - - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - imperative_out_scale.save_quantized_model( - layer=lenet, - path=path, - input_spec=[ - paddle.static.InputSpec( - shape=[None, 1, 28, 28], dtype='float32') - ]) - - warning_message = "Warning: No Layer of the model while to be " \ - "saved contains the out_threshold attribute, so the " \ - "generated inference model would not contain the " \ - "out_threshold." 
- num = get_vaild_warning_num(warning_message, w) - assert num == 1 - - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py index 01f0abe0f217c3..1d7bfc9f6963c6 100644 --- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py +++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py @@ -166,12 +166,14 @@ def setUp(self): accum[0] = 1 state = np.zeros(1).astype("float32") state[0] = 1 + x = np.random.random((8, 16, 7, 7)).astype("float32") self.inputs = { - 'X': np.random.random((8, 16, 7, 7)).astype("float32"), + 'X': x, 'InAccum': accum, 'InState': state, } + out = x out_accum = np.zeros(1).astype("float32") out_state = np.zeros(1).astype("float32") out_scale = np.zeros(1).astype("float32") @@ -180,6 +182,7 @@ def setUp(self): out_state[0] = self.attrs['moving_rate'] * state[0] + 1 out_scale = out_accum / out_state self.outputs = { + 'Out': out, 'OutAccum': out_accum, 'OutState': out_state, 'OutScale': out_scale, From bfb5cf5567a604fded177d90d639f7337015e3fa Mon Sep 17 00:00:00 2001 From: zlsh80826 Date: Mon, 29 Mar 2021 10:16:54 +0800 Subject: [PATCH 07/15] [Paddle-TRT] trt affine channel converter (#31628) * trt affine channel converter * add trt affine channel base test * add trt affine channel NHWC * remove asterisk for python2 compatibility * trt affine channel converter * add trt affine channel base test * add trt affine channel NHWC * remove asterisk for python2 compatibility * fix rebase * move LodTensor to Tensor * add dbg info * affine channel converter only support NCHW * scale,bias are parameters, use create_parameters api * reduce test input size to not exceed the timelimit of ci * refine affine channel unittest and add serialization/dynamic test * change super to InferencePassTest for python2 compatibility * change super to InferencePassTest for python2 compatibility * fix affine channel fp16 serialize 
setting --- .../fluid/inference/api/analysis_predictor.cc | 1 + .../inference/tensorrt/convert/CMakeLists.txt | 1 + .../tensorrt/convert/affine_channel_op.cc | 94 ++++++++++++ paddle/fluid/inference/tensorrt/op_teller.cc | 10 +- .../inference/test_trt_affine_channel_op.py | 141 ++++++++++++++++++ 5 files changed, 246 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_affine_channel_op.py diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 0007582e2c73d2..76bf5948a2b98c 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1192,6 +1192,7 @@ USE_TRT_CONVERTER(scale); USE_TRT_CONVERTER(stack); USE_TRT_CONVERTER(clip); USE_TRT_CONVERTER(gather); +USE_TRT_CONVERTER(affine_channel); USE_TRT_CONVERTER(multiclass_nms); USE_TRT_CONVERTER(nearest_interp); #endif diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index be7fa0548d9f34..6af76bd11cd59a 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -6,6 +6,7 @@ nv_library(tensorrt_converter shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc clip_op.cc gather_op.cc + affine_channel_op.cc multiclass_nms_op.cc nearest_interp_op.cc DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) diff --git a/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc new file mode 100644 index 00000000000000..813342c08483b7 --- /dev/null +++ 
b/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc @@ -0,0 +1,94 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Affine Channel Op + */ +class AffineChannelOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid affine_channel op to tensorrt scale nd layer"; + + framework::OpDesc op_desc(op, nullptr); + std::string input_name = op_desc.Input("X").front(); + std::string scale_name = op_desc.Input("Scale").front(); + std::string bias_name = op_desc.Input("Bias").front(); + std::string output_name = op_desc.Output("Out").front(); + + auto input_tensor = engine_->GetITensor(input_name); + auto idim = input_tensor->getDimensions(); + + auto* scale_v = scope.FindVar(scale_name); + auto* scale_t = scale_v->GetMutable(); + float* scale_ptr = engine_->GetWeightCPUData(scale_name, scale_t, false); + + auto* bias_v = scope.FindVar(bias_name); + auto* bias_t = bias_v->GetMutable(); + float* bias_ptr = 
engine_->GetWeightCPUData(bias_name, bias_t, false); + + auto data_layout = framework::StringToDataLayout( + BOOST_GET_CONST(std::string, op_desc.GetAttr("data_layout"))); + + PADDLE_ENFORCE_EQ( + data_layout, framework::DataLayout::kNCHW, + platform::errors::InvalidArgument( + "TensorRT affine channel converter can only convert NCHW format. " + "Other format should be run in fluid mode. Report a bug on github " + "issue if you see this line.")); + + // tensorrt scalend layer only support spatial dims >= 2, + // so nhwc is not availabe (spatial dims == 0) + const int channel_axis = engine_->with_dynamic_shape(); + + TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, + static_cast(scale_ptr), + (size_t)idim.d[channel_axis]}; + TensorRTEngine::Weight bias_weights{nvinfer1::DataType::kFLOAT, + static_cast(bias_ptr), + (size_t)idim.d[channel_axis]}; + TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, + 0}; + + auto layer = TRT_ENGINE_ADD_LAYER(engine_, ScaleNd, *input_tensor, + nvinfer1::ScaleMode::kCHANNEL, + bias_weights.get(), scale_weights.get(), + power_weights.get(), channel_axis); + + RreplenishLayerAndOutput(layer, "affine_channel", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(affine_channel, AffineChannelOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 82f58254fe8e0d..eb429405d18aeb 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -111,6 +111,7 @@ struct SimpleOpTypeSetTeller : public Teller { "flatten2", "flatten", "gather", + "affine_channel", "multiclass_nms", "nearest_interp", }; @@ -196,6 +197,13 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (!with_dynamic_shape || desc.Input("Axis").size() > 0) return false; } + if (op_type == "affine_channel") { + if 
(!desc.HasAttr("data_layout")) return false; + auto data_layout = framework::StringToDataLayout( + BOOST_GET_CONST(std::string, desc.GetAttr("data_layout"))); + if (data_layout != framework::DataLayout::kNCHW) return false; + } + if (op_type == "multiclass_nms") { if (with_dynamic_shape) return false; auto* block = desc.Block(); @@ -238,6 +246,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; } } + if (op_type == "nearest_interp") { std::vector attrs{"data_layout", "interp_method", "align_corners", "scale", @@ -254,7 +263,6 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, BOOST_GET_CONST(std::string, desc.GetAttr("interp_method")); if (interp_method != "nearest") return false; } - if ((*teller)(op_type, desc, use_no_calib_int8)) return true; } return false; diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_affine_channel_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_affine_channel_op.py new file mode 100644 index 00000000000000..8bbba7c8b55fef --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_affine_channel_op.py @@ -0,0 +1,141 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import itertools +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker +from paddle.fluid.core import AnalysisConfig + + +class TRTAffineChannelTest(InferencePassTest): + def setUp(self): + self.bs = 2 + self.channel = 8 + self.height = 16 + self.width = 16 + self.data_layout = 'NCHW' + self.precision = AnalysisConfig.Precision.Float32 + self.serialize = False + self.enable_trt = True + + def build(self): + # set min_graph_size to 2, + # because affine channel doesn't support nhwc format + self.trt_parameters = InferencePassTest.TensorRTParam( + 1 << 30, self.bs, 2, self.precision, self.serialize, False) + + with fluid.program_guard(self.main_program, self.startup_program): + if self.data_layout == 'NCHW': + shape = [-1, self.channel, self.height, self.width] + else: + shape = [-1, self.height, self.width, self.channel] + + data = fluid.data(name='in', shape=shape, dtype='float32') + # set scale, bias by constant + scale = fluid.layers.create_parameter( + shape=[self.channel], + dtype='float32', + default_initializer=fluid.initializer.Constant(2.)) + bias = fluid.layers.create_parameter( + shape=[self.channel], + dtype='float32', + default_initializer=fluid.initializer.Constant(.5)) + affine_channel_out = fluid.layers.affine_channel( + data, scale=scale, bias=bias, data_layout=self.data_layout) + out = fluid.layers.batch_norm(affine_channel_out, is_test=True) + + shape[0] = self.bs + self.feeds = {'in': np.random.random(shape).astype('float32'), } + self.fetch_list = [out] + + def check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + atol = 1e-5 + if self.trt_parameters.precision == AnalysisConfig.Precision.Half: + atol = 1e-3 + self.check_output_with_option(use_gpu, atol, flatten=True) + self.assertTrue( + 
PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + def run_test(self): + self.build() + self.check_output() + + def run_test_all(self): + precision_opt = [ + AnalysisConfig.Precision.Float32, AnalysisConfig.Precision.Half + ] + serialize_opt = [False, True] + + if self.data_layout == 'NCHW': + min_shape = [ + self.bs, self.channel, self.height // 2, self.width // 2 + ] + max_shape = [self.bs, self.channel, self.height * 2, self.width * 2] + opt_shape = [self.bs, self.channel, self.height, self.width] + + if self.data_layout == 'NHWC': + min_shape = [ + self.bs, self.height // 2, self.width // 2, self.channel + ] + max_shape = [self.bs, self.height * 2, self.width * 2, self.channel] + opt_shape = [self.bs, self.height, self.width, self.channel] + + dynamic_shape_profile = InferencePassTest.DynamicShapeParam({ + 'in': min_shape + }, {'in': max_shape}, {'in': opt_shape}, False) + dynamic_shape_opt = [None, dynamic_shape_profile] + + for precision, serialize, dynamic_shape in itertools.product( + precision_opt, serialize_opt, dynamic_shape_opt): + self.precision = precision + self.serialize = serialize + self.dynamic_shape_params = dynamic_shape + self.run_test() + + def test_base(self): + self.run_test() + + def test_fp16(self): + self.precision = AnalysisConfig.Precision.Half + self.run_test() + + def test_serialize(self): + self.serialize = True + self.run_test() + + def test_dynamic(self): + self.dynamic_shape_params = InferencePassTest.DynamicShapeParam({ + 'in': [self.bs, self.channel, self.height // 2, self.width // 2] + }, {'in': [self.bs, self.channel, self.height * 2, self.width * 2] + }, {'in': [self.bs, self.channel, self.height, self.width]}, False) + self.run_test() + + def test_nchw_all(self): + self.run_test_all() + + def test_nhwc(self): + self.data_layout = 'NHWC' + self.run_test_all() + + +if __name__ == "__main__": + unittest.main() From e3a38d790a0f275fd9332b5ce0ad152f74257b61 Mon Sep 17 00:00:00 2001 From: zlsh80826 Date: Mon, 29 Mar 
2021 14:16:56 +0800 Subject: [PATCH 08/15] [Paddle-TRT] roi_align_plugin (#31732) * add roi_align_plugin * add roi align unit_test * add roi align serialization * remove roi align static plugin because of batch dim issue * refine roi align unittest and add fp16/serialization * add trt roi align condition to op_teller * refine error message * remove unnecessary reshape layer --- .../fluid/inference/api/analysis_predictor.cc | 1 + .../inference/tensorrt/convert/CMakeLists.txt | 1 + .../tensorrt/convert/roi_align_op.cc | 86 ++++ paddle/fluid/inference/tensorrt/op_teller.cc | 24 ++ .../inference/tensorrt/plugin/CMakeLists.txt | 1 + .../tensorrt/plugin/roi_align_op_plugin.cu | 380 ++++++++++++++++++ .../tensorrt/plugin/roi_align_op_plugin.h | 112 ++++++ .../ir/inference/test_trt_roi_align_op.py | 119 ++++++ 8 files changed, 724 insertions(+) create mode 100644 paddle/fluid/inference/tensorrt/convert/roi_align_op.cc create mode 100644 paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu create mode 100644 paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_roi_align_op.py diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 76bf5948a2b98c..7bb092d0e3c1c0 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1192,6 +1192,7 @@ USE_TRT_CONVERTER(scale); USE_TRT_CONVERTER(stack); USE_TRT_CONVERTER(clip); USE_TRT_CONVERTER(gather); +USE_TRT_CONVERTER(roi_align); USE_TRT_CONVERTER(affine_channel); USE_TRT_CONVERTER(multiclass_nms); USE_TRT_CONVERTER(nearest_interp); diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 6af76bd11cd59a..bc7b7355ea1922 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt 
@@ -6,6 +6,7 @@ nv_library(tensorrt_converter shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc clip_op.cc gather_op.cc + roi_align_op.cc affine_channel_op.cc multiclass_nms_op.cc nearest_interp_op.cc diff --git a/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc b/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc new file mode 100644 index 00000000000000..1329608aecd205 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc @@ -0,0 +1,86 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Roi Align Op + */ +class RoiAlignOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid roi align op to tensorrt plugin"; + + framework::OpDesc op_desc(op, nullptr); + std::string input_name = op_desc.Input("X").front(); + std::string rois_name = op_desc.Input("ROIs").front(); + std::string output_name = op_desc.Output("Out").front(); + + const auto pooled_height = + BOOST_GET_CONST(int, op_desc.GetAttr("pooled_height")); + const auto pooled_width = + BOOST_GET_CONST(int, op_desc.GetAttr("pooled_width")); + const auto spatial_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("spatial_scale")); + const auto sampling_ratio = + BOOST_GET_CONST(int, op_desc.GetAttr("sampling_ratio")); + + const auto input_tensor = engine_->GetITensor(input_name); + const auto rois_tensor = engine_->GetITensor(rois_name); + + const nvinfer1::DataType data_type_ = engine_->WithFp16() + ? 
nvinfer1::DataType::kHALF + : nvinfer1::DataType::kFLOAT; + + std::vector inputs{input_tensor, rois_tensor}; + nvinfer1::ILayer* layer = nullptr; + + PADDLE_ENFORCE_EQ( + engine_->with_dynamic_shape(), true, + platform::errors::InvalidArgument( + "TRT roi align plugin only accept the dynamic shape, because that " + "the roi_align will change the batch size.")); + + auto* roi_align_plugin = new plugin::RoiAlignPluginDynamic( + data_type_, pooled_height, pooled_width, spatial_scale, sampling_ratio); + auto roi_align_layer = engine_->network()->addPluginV2( + inputs.data(), inputs.size(), *roi_align_plugin); + layer = roi_align_layer; + + std::vector output_names{output_name}; + RreplenishLayerAndOutput(layer, "roi_align", output_names, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(roi_align, RoiAlignOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index eb429405d18aeb..7c1b2e8001edbd 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -111,6 +111,7 @@ struct SimpleOpTypeSetTeller : public Teller { "flatten2", "flatten", "gather", + "roi_align", "affine_channel", "multiclass_nms", "nearest_interp", @@ -263,6 +264,29 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, BOOST_GET_CONST(std::string, desc.GetAttr("interp_method")); if (interp_method != "nearest") return false; } + + if (op_type == "roi_align") { + if (!with_dynamic_shape) return false; + + std::vector attrs{"pooled_height", "pooled_width", + "spatial_scale", "sampling_ratio"}; + for (auto const attr : attrs) { + if (!desc.HasAttr(attr)) return false; + } + + const auto pooled_height = + BOOST_GET_CONST(int, desc.GetAttr("pooled_height")); + if (pooled_height <= 0) return false; + + const auto pooled_width = + BOOST_GET_CONST(int, desc.GetAttr("pooled_width")); + if 
(pooled_width <= 0) return false; + + const auto spatial_scale = + BOOST_GET_CONST(float, desc.GetAttr("spatial_scale")); + if (spatial_scale <= 0.f) return false; + } + if ((*teller)(op_type, desc, use_no_calib_int8)) return true; } return false; diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index 7ee16a598d2d01..4107f9ef674339 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -5,6 +5,7 @@ nv_library(tensorrt_plugin instance_norm_op_plugin.cu emb_eltwise_layernorm_plugin.cu qkv_to_context_plugin.cu skip_layernorm_op_plugin.cu slice_op_plugin.cu hard_swish_op_plugin.cu stack_op_plugin.cu special_slice_plugin.cu + roi_align_op_plugin.cu DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) nv_test(test_split_plugin SRCS test_split_plugin.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu new file mode 100644 index 00000000000000..42c0df41a1b5ef --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu @@ -0,0 +1,380 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include + +#include "paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +template +__inline__ __device__ T BilinearInterpolate(const T* input_data, + const int height, const int width, + T y, T x) { + if (y < -1.f || y > height || x < -1.f || x > width) return 0; + y = y <= 0.f ? 0.f : y; + x = x <= 0.f ? 0.f : x; + int y_low = static_cast(y); + int x_low = static_cast(x); + int y_high; + int x_high; + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = static_cast(y_low); + } else { + y_high = y_low + 1; + } + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = static_cast(x_low); + } else { + x_high = x_low + 1; + } + T ly = y - y_low, lx = x - x_low; + T hy = 1.f - ly, hx = 1.f - lx; + T v1 = input_data[y_low * width + x_low]; + T v2 = input_data[y_low * width + x_high]; + T v3 = input_data[y_high * width + x_low]; + T v4 = input_data[y_high * width + x_high]; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__global__ void GPUROIAlignOpt(const int nthreads, + const T* __restrict__ input_data, + const T* __restrict__ input_rois, + const float spatial_scale, const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int sampling_ratio, const int num_rois, + OutT* __restrict__ output_data) { + const int batch = blockIdx.x; + const int channel = blockIdx.y; + const T* offset_input_data = + input_data + (batch * channels + channel) * height * width; + extern __shared__ T s_input_data[]; + if (USE_SMEM) { + for (int idx = threadIdx.x; idx < height * width; idx += blockDim.x) { + s_input_data[idx] = offset_input_data[idx]; + } + __syncthreads(); + } + for (int idx = threadIdx.x; idx < num_rois * 
pooled_height * pooled_width; + idx += blockDim.x) { + const int pw = idx % pooled_width; + const int ph = (idx / pooled_width) % pooled_height; + const int roi_idx = (idx / pooled_width / pooled_height) % num_rois; + const int n = batch * num_rois + roi_idx; + const float4 rois_offset = reinterpret_cast(input_rois)[n]; + const T roi_xmin = rois_offset.x * spatial_scale; + const T roi_ymin = rois_offset.y * spatial_scale; + const T roi_xmax = rois_offset.z * spatial_scale; + const T roi_ymax = rois_offset.w * spatial_scale; + const T roi_width = max(roi_xmax - roi_xmin, static_cast(1.f)); + const T roi_height = max(roi_ymax - roi_ymin, static_cast(1.f)); + const T bin_size_h = roi_height / static_cast(pooled_height); + const T bin_size_w = roi_width / static_cast(pooled_width); + const int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); + const int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + const T count = roi_bin_grid_h * roi_bin_grid_w; + + T output_val = 0.f; + for (int iy = 0; iy < roi_bin_grid_h; ++iy) { + const T y = roi_ymin + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ++ix) { + const T x = roi_xmin + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + if (USE_SMEM) { + T val = BilinearInterpolate(s_input_data, height, width, y, x); + output_val += val; + } else { + T val = + BilinearInterpolate(offset_input_data, height, width, y, x); + output_val += val; + } + } + } + output_val /= count; + const int out_offset = + batch * num_rois * channels * pooled_height * pooled_width + + roi_idx * channels * pooled_height * pooled_width + + channel * pooled_height * pooled_width + ph * pooled_width + pw; + output_data[out_offset] = static_cast(output_val); + } +} + +#if IS_TRT_VERSION_GE(6000) +RoiAlignPluginDynamic::RoiAlignPluginDynamic(const 
nvinfer1::DataType data_type, + const int pooled_height, + const int pooled_width, + float spatial_scale, + int sampling_ratio) + : data_type_(data_type), + pooled_height_(pooled_height), + pooled_width_(pooled_width), + spatial_scale_(spatial_scale), + sampling_ratio_(sampling_ratio) { + bool data_type_is_valid = data_type_ == nvinfer1::DataType::kFLOAT || + data_type_ == nvinfer1::DataType::kHALF; + PADDLE_ENFORCE_EQ(data_type_is_valid, true, + platform::errors::InvalidArgument( + "TRT RoiAlign plugin only accepts kFLOAT(%d) or " + "kHALF(%d) data type, but the received data type = %d", + static_cast(nvinfer1::DataType::kFLOAT), + static_cast(nvinfer1::DataType::kHALF), + static_cast(data_type_))); + + PADDLE_ENFORCE_GT(pooled_height_, 0, + platform::errors::InvalidArgument( + "TRT RoiAlign plugin only accepts pooled_height " + "greater than %d, but the received pooled_height = %d", + 0, pooled_height_)); + + PADDLE_ENFORCE_GT(pooled_width_, 0, + platform::errors::InvalidArgument( + "TRT RoiAlign plugin only accepts pooled_width greater " + "than %d, but the received pooled_width = %d", + 0, pooled_height_)); + + PADDLE_ENFORCE_GT(spatial_scale_, 0.f, + platform::errors::InvalidArgument( + "TRT RoiAlign plugin only accepts spatial_scale " + "greater than %f, but the received spatial_scale = %f", + 0, spatial_scale_)); + + int smem_per_block = -1; + int device = -1; + cudaGetDevice(&device); + + PADDLE_ENFORCE_GE( + device, 0, + platform::errors::InvalidArgument( + "The cuda device ID should be greater than %d, but device ID is %d", + 0, device)); + + cudaDeviceGetAttribute(&smem_per_block, cudaDevAttrMaxSharedMemoryPerBlock, + device); + smem_per_block_ = smem_per_block; +} + +RoiAlignPluginDynamic::RoiAlignPluginDynamic(void const* data, size_t length) { + DeserializeValue(&data, &length, &data_type_); + DeserializeValue(&data, &length, &pooled_height_); + DeserializeValue(&data, &length, &pooled_width_); + DeserializeValue(&data, &length, &spatial_scale_); + 
DeserializeValue(&data, &length, &sampling_ratio_); + int smem_per_block = -1; + int device = -1; + cudaGetDevice(&device); + PADDLE_ENFORCE_GE( + device, 0, + platform::errors::InvalidArgument( + "The cuda device ID should be greater than %d, but device ID is %d", + 0, device)); + cudaDeviceGetAttribute(&smem_per_block, cudaDevAttrMaxSharedMemoryPerBlock, + device); + smem_per_block_ = smem_per_block; +} + +nvinfer1::IPluginV2DynamicExt* RoiAlignPluginDynamic::clone() const { + auto* plugin = + new RoiAlignPluginDynamic(data_type_, pooled_height_, pooled_width_, + spatial_scale_, sampling_ratio_); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} + +nvinfer1::DimsExprs RoiAlignPluginDynamic::getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) { + nvinfer1::DimsExprs ret{}; + ret.nbDims = 4; + ret.d[0] = inputs[1].d[0]; // roi + ret.d[1] = inputs[0].d[1]; // X + ret.d[2] = exprBuilder.constant(pooled_height_); + ret.d[3] = exprBuilder.constant(pooled_width_); + return ret; +} + +bool RoiAlignPluginDynamic::supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, + int nbOutputs) { + if (inOut[pos].format != nvinfer1::TensorFormat::kLINEAR) { + return false; + } + if (pos < 2) { // input + return inOut[pos].type == nvinfer1::DataType::kFLOAT; + } + return inOut[pos].type == data_type_; +} + +void RoiAlignPluginDynamic::configurePlugin( + const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) {} + +size_t RoiAlignPluginDynamic::getWorkspaceSize( + const nvinfer1::PluginTensorDesc* inputs, int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const { + return 0; +} + +template +int RoiAlignPluginDynamic::enqueue_impl( + const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, + void* const* 
outputs, void* workspace, cudaStream_t stream) { + auto in_dims = inputDesc[0].dims; + auto rois_dims = inputDesc[1].dims; + auto out_dims = outputDesc[0].dims; + + int rois_num = rois_dims.d[0]; + if (rois_num == 0) return cudaGetLastError() != cudaSuccess; + + int batch = in_dims.d[0]; + int channels = in_dims.d[1]; + int height = in_dims.d[2]; + int width = in_dims.d[3]; + + int output_size = + out_dims.d[0] * out_dims.d[1] * out_dims.d[2] * out_dims.d[3]; + + const dim3 blocks(batch, channels); + const int threads = 512; + + if (smem_per_block_ < width * height * sizeof(T)) { + GPUROIAlignOpt<<>>( + output_size, static_cast(inputs[0]), + static_cast(inputs[1]), spatial_scale_, channels, height, + width, pooled_height_, pooled_width_, sampling_ratio_, rois_num / batch, + static_cast(outputs[0])); + } else { + GPUROIAlignOpt< + T, OutT, true><<>>( + output_size, static_cast(inputs[0]), + static_cast(inputs[1]), spatial_scale_, channels, height, + width, pooled_height_, pooled_width_, sampling_ratio_, rois_num / batch, + static_cast(outputs[0])); + } + + return cudaGetLastError() != cudaSuccess; +} + +int RoiAlignPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, void* workspace, + cudaStream_t stream) { + PADDLE_ENFORCE_EQ(outputDesc[0].type, data_type_, + platform::errors::InvalidArgument( + "TRT RoiAlignPluginDynamic expects outputDesc[0].type " + "equal to data_type_")); + + if (data_type_ == nvinfer1::DataType::kHALF) { + return enqueue_impl(inputDesc, outputDesc, inputs, outputs, + workspace, stream); + } + return enqueue_impl(inputDesc, outputDesc, inputs, outputs, + workspace, stream); +} + +nvinfer1::DataType RoiAlignPluginDynamic::getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, int nbInputs) const { + return data_type_; +} + +const char* RoiAlignPluginDynamic::getPluginType() const { + return 
"roi_align_plugin_dynamic"; +} + +int RoiAlignPluginDynamic::getNbOutputs() const { return 1; } + +int RoiAlignPluginDynamic::initialize() { return 0; } + +void RoiAlignPluginDynamic::terminate() {} + +size_t RoiAlignPluginDynamic::getSerializationSize() const { + size_t serialize_size = 0; + serialize_size += SerializedSize(data_type_); + serialize_size += SerializedSize(pooled_height_); + serialize_size += SerializedSize(pooled_width_); + serialize_size += SerializedSize(spatial_scale_); + serialize_size += SerializedSize(sampling_ratio_); + return serialize_size; +} + +void RoiAlignPluginDynamic::serialize(void* buffer) const { + SerializeValue(&buffer, data_type_); + SerializeValue(&buffer, pooled_height_); + SerializeValue(&buffer, pooled_width_); + SerializeValue(&buffer, spatial_scale_); + SerializeValue(&buffer, sampling_ratio_); +} + +void RoiAlignPluginDynamic::destroy() {} + +RoiAlignPluginDynamicCreator::RoiAlignPluginDynamicCreator() {} + +void RoiAlignPluginDynamicCreator::setPluginNamespace( + const char* lib_namespace) { + namespace_ = std::string(lib_namespace); +} + +const char* RoiAlignPluginDynamicCreator::getPluginNamespace() const { + return namespace_.c_str(); +} + +const char* RoiAlignPluginDynamicCreator::getPluginName() const { + return "roi_align_plugin_dynamic"; +} + +const char* RoiAlignPluginDynamicCreator::getPluginVersion() const { + return "1"; +} + +const nvinfer1::PluginFieldCollection* +RoiAlignPluginDynamicCreator::getFieldNames() { + return &field_collection_; +} + +nvinfer1::IPluginV2Ext* RoiAlignPluginDynamicCreator::createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) { + const nvinfer1::PluginField* fields = fc->fields; +} + +nvinfer1::IPluginV2Ext* RoiAlignPluginDynamicCreator::deserializePlugin( + const char* name, const void* serial_data, size_t serial_length) { + auto plugin = new RoiAlignPluginDynamic(serial_data, serial_length); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; 
+} +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h new file mode 100644 index 00000000000000..bba7d0d5a99664 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h @@ -0,0 +1,112 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +#if IS_TRT_VERSION_GE(6000) +class RoiAlignPluginDynamic : public DynamicPluginTensorRT { + public: + explicit RoiAlignPluginDynamic(const nvinfer1::DataType data_type, + const int pooled_height, + const int pooled_width, float spatial_scale, + int sampling_ratio); + RoiAlignPluginDynamic(void const* data, size_t length); + ~RoiAlignPluginDynamic() = default; + nvinfer1::IPluginV2DynamicExt* clone() const override; + nvinfer1::DimsExprs getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) override; + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, int nbOutputs) override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) override; + + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const override; + + const char* getPluginType() const override; + int getNbOutputs() const override; + int initialize() override; + void terminate() override; + size_t getSerializationSize() const override; + void serialize(void* buffer) const override; + void destroy() override; + + private: + template + int enqueue_impl(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* 
outputDesc, + const void* const* inputs, void* const* outputs, + void* workspace, cudaStream_t stream); + + nvinfer1::DataType data_type_; + int pooled_height_; + int pooled_width_; + float spatial_scale_; + int sampling_ratio_; + int smem_per_block_; + std::string namespace_; +}; + +class RoiAlignPluginDynamicCreator : public nvinfer1::IPluginCreator { + public: + RoiAlignPluginDynamicCreator(); + ~RoiAlignPluginDynamicCreator() override = default; + + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + const char* getPluginName() const override; + const char* getPluginVersion() const override; + const nvinfer1::PluginFieldCollection* getFieldNames() override; + + nvinfer1::IPluginV2Ext* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override; + nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override; + + private: + std::string namespace_; + nvinfer1::PluginFieldCollection field_collection_; +}; +REGISTER_TRT_PLUGIN_V2(RoiAlignPluginDynamicCreator); +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_roi_align_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_roi_align_op.py new file mode 100644 index 00000000000000..fa276dd342bc6b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_roi_align_op.py @@ -0,0 +1,119 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker +from paddle.fluid.core import AnalysisConfig + + +class TRTRoiAlignTest(InferencePassTest): + def setUp(self): + self.bs = 2 + self.num_rois = 4 + self.channel = 16 + self.height = 32 + self.width = 32 + self.precision = AnalysisConfig.Precision.Float32 + self.serialize = False + self.enable_trt = True + + def build(self): + self.trt_parameters = TRTRoiAlignTest.TensorRTParam( + 1 << 30, self.bs * self.num_rois, 1, self.precision, self.serialize, + False) + with fluid.program_guard(self.main_program, self.startup_program): + data_shape = [-1, self.channel, self.height, self.width] + data = fluid.data(name='data', shape=data_shape, dtype='float32') + rois = fluid.data( + name='rois', shape=[-1, 4], dtype='float32', lod_level=1) + roi_align_out = fluid.layers.roi_align(data, rois) + out = fluid.layers.batch_norm(roi_align_out, is_test=True) + + rois_lod = fluid.create_lod_tensor( + np.random.random([self.bs * self.num_rois, 4]).astype('float32'), + [[self.num_rois, self.num_rois]], fluid.CPUPlace()) + + data_shape[0] = self.bs + self.feeds = { + 'data': np.random.random(data_shape).astype('float32'), + 'rois': rois_lod, + } + self.fetch_list = [out] + + def check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + atol = 1e-5 + if self.trt_parameters.precision == AnalysisConfig.Precision.Half: + atol = 
1e-3 + self.check_output_with_option(use_gpu, atol, flatten=True) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + def set_dynamic(self): + min_shape_spec = dict() + max_shape_spec = dict() + opt_shape_spec = dict() + min_shape_spec['data'] = [ + self.bs, self.channel, self.height // 2, self.width // 2 + ] + min_shape_spec['rois'] = [1, 4] + max_shape_spec[ + 'data'] = [self.bs, self.channel, self.height * 2, self.width * 2] + max_shape_spec['rois'] = [self.bs * self.num_rois, 4] + opt_shape_spec[ + 'data'] = [self.bs, self.channel, self.height, self.width] + opt_shape_spec['rois'] = [self.bs * self.num_rois, 4] + + self.dynamic_shape_params = InferencePassTest.DynamicShapeParam( + min_shape_spec, max_shape_spec, opt_shape_spec, False) + + def run_test(self): + self.build() + self.check_output() + + def test_base(self): + self.run_test() + + def test_fp16(self): + self.precision = AnalysisConfig.Precision.Half + self.run_test() + + def test_serialize(self): + self.serialize = True + self.run_test() + + def test_dynamic(self): + self.set_dynamic() + self.run_test() + + def test_dynamic_fp16(self): + self.set_dynamic() + self.precision = AnalysisConfig.Precision.Half + self.run_test() + + def test_dynamic_serialize(self): + self.set_dynamic() + self.serialize = True + self.run_test() + + +if __name__ == "__main__": + unittest.main() From 51eb29de18adcf8c20272218f105eb1c2135cc09 Mon Sep 17 00:00:00 2001 From: Jiabin Yang Date: Mon, 29 Mar 2021 14:17:54 +0800 Subject: [PATCH 09/15] [CustomOP] Add shape related constructor for Tensor (#31681) * give shape related contructor and reshape warning * change line num to fit ut * change ut to fit * remove useless code * call resize directly in constructor --- paddle/fluid/extension/include/ext_tensor.h | 3 +++ paddle/fluid/extension/src/ext_tensor.cc | 21 ++++++++++++++++++- paddle/fluid/framework/custom_tensor_utils.h | 2 +- .../fluid/tests/custom_op/custom_relu_op.cc | 3 +-- 
.../custom_op/test_custom_relu_op_jit.py | 4 ++-- 5 files changed, 27 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/extension/include/ext_tensor.h b/paddle/fluid/extension/include/ext_tensor.h index be492a6d5535d1..52606b2a7f59e0 100644 --- a/paddle/fluid/extension/include/ext_tensor.h +++ b/paddle/fluid/extension/include/ext_tensor.h @@ -52,6 +52,9 @@ class PD_DLL_DECL Tensor { /// \brief Construct a Tensor on target Place for CustomOp. /// Generally it's only used for user to create Tensor. explicit Tensor(const PlaceType& place); + /// \brief Construct a Tensor on target Place with shape for CustomOp. + /// Generally it's only used for user to create Tensor. + Tensor(const PlaceType& place, const std::vector& shape); /// \brief Reset the shape of the tensor. /// Generally it's only used for the input tensor. /// Reshape must be called before calling diff --git a/paddle/fluid/extension/src/ext_tensor.cc b/paddle/fluid/extension/src/ext_tensor.cc index 0cae8f4af7b97d..e9705e2101cc3c 100644 --- a/paddle/fluid/extension/src/ext_tensor.cc +++ b/paddle/fluid/extension/src/ext_tensor.cc @@ -102,13 +102,32 @@ void GpuCopy(T *src, T *dst, PlaceType src_plc, PlaceType dst_plc, void Tensor::reshape(const std::vector &shape) { GET_CASTED_TENSOR - tensor->Resize(framework::make_ddim(shape)); + auto new_dim = framework::make_ddim(shape); + if (tensor->numel() != framework::product(new_dim)) { + LOG(WARNING) << "Custom Op: Calling reshape to a new shape which is bigger " + "or smaller" + << "than original shape will not change your tensor's memory " + "Please call" + << "paddle::Tensor::mutable_data() after to reallocate " + "your tensor's size." 
+ << std::endl; + } + tensor->Resize(new_dim); } Tensor::Tensor(const PlaceType &place) : tensor_(std::make_shared()), place_(place), stream_(StreamWrapper()) {} + +Tensor::Tensor(const PlaceType &place, const std::vector &shape) + : tensor_(std::make_shared()), + place_(place), + stream_(StreamWrapper()) { + GET_CASTED_TENSOR + tensor->Resize(framework::make_ddim(shape)); +} + template T *Tensor::mutable_data(const PlaceType &place) { place_ = place; diff --git a/paddle/fluid/framework/custom_tensor_utils.h b/paddle/fluid/framework/custom_tensor_utils.h index fad1e3ee3496cd..809a6b965aad9b 100644 --- a/paddle/fluid/framework/custom_tensor_utils.h +++ b/paddle/fluid/framework/custom_tensor_utils.h @@ -37,7 +37,7 @@ class CustomTensorUtils { /// \brief Share data FROM another tensor. /// Use this to pass tensor from op to op /// \return void. - static void ShareDataFrom(const void* src, const Tensor& dst); + static void ShareDataFrom(const void* src, const paddle::Tensor& dst); static framework::proto::VarType::Type ConvertEnumDTypeToInnerDType( const paddle::DataType& dtype) { diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc index c0b30a1cb5579c..c075d27f7b1763 100644 --- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc @@ -38,9 +38,8 @@ void relu_cpu_backward_kernel(const data_t* grad_out_data, } std::vector relu_cpu_forward(const paddle::Tensor& x) { - auto out = paddle::Tensor(paddle::PlaceType::kCPU); + auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); - out.reshape(x.shape()); PD_DISPATCH_FLOATING_TYPES( x.type(), "relu_cpu_forward", ([&] { relu_cpu_forward_kernel( diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py index 23733d20841b3a..641630b0f4476a 100644 --- 
a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py @@ -103,11 +103,11 @@ def test_exception(self): in str(e)) if IS_WINDOWS: self.assertTrue( - r"python\paddle\fluid\tests\custom_op\custom_relu_op.cc:48" + r"python\paddle\fluid\tests\custom_op\custom_relu_op.cc:47" in str(e)) else: self.assertTrue( - "python/paddle/fluid/tests/custom_op/custom_relu_op.cc:48" + "python/paddle/fluid/tests/custom_op/custom_relu_op.cc:47" in str(e)) self.assertTrue(caught_exception) From 61805d8f0aa304f4226e5793b97da97552a43282 Mon Sep 17 00:00:00 2001 From: Shang Zhizhou Date: Mon, 29 Mar 2021 17:11:26 +0800 Subject: [PATCH 10/15] fix cmake model path (#31866) * fix cmake model path * update cmake * fix unittest * fix unittest --- paddle/fluid/inference/tests/api/CMakeLists.txt | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 92f9c20a369d7a..75628adbe8a859 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -530,7 +530,7 @@ if(WITH_GPU AND TENSORRT_FOUND) ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model") - if (NOT EXISTS ${TRT_MODEL_QUANT_RESNET_DIR}/small_quant_model.tgz) + if (NOT EXISTS ${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model.tgz) inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "small_quant_model.tgz") endif() inference_analysis_test(trt_quant_int8_test SRCS trt_quant_int8_test.cc @@ -538,7 +538,7 @@ if(WITH_GPU AND TENSORRT_FOUND) ARGS --infer_model=${TRT_MODEL_QUANT_RESNET_DIR}) set(TRT_MODEL_QUANT_YOLOV3_DIR "${INFERENCE_DEMO_INSTALL_DIR}/yolov3_r50_quant_aware") - if (NOT EXISTS ${TRT_MODEL_QUANT_YOLOV3_DIR}/yolov3_r50_quant_aware.tgz) + if (NOT 
EXISTS ${INFERENCE_DEMO_INSTALL_DIR}/yolov3_r50_quant_aware.tgz) inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "yolov3_r50_quant_aware.tgz") endif() inference_analysis_test(trt_quant_int8_yolov3_r50_test SRCS trt_quant_int8_yolov3_r50_test.cc @@ -576,8 +576,7 @@ if(WITH_GPU AND TENSORRT_FOUND) EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune) - set(TEST_TRT_ERNIE_UNSER_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_unserialized/") - if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_MODEL}/ernie_model_4_unserialized.tgz) + if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized.tgz) inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz") endif() @@ -585,8 +584,7 @@ if(WITH_GPU AND TENSORRT_FOUND) EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized) - set(TEST_TRT_ERNIE_UNSER_FP16_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_fp16_unserialized/") - if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_FP16_MODEL}/ernie_model_4_unserialized.tgz) + if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4_fp16_unserialized.tgz) inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_fp16_unserialized.tgz") endif() From 123949eb48378262c888bf2e5aa3f2127e6bf32f Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Mon, 29 Mar 2021 17:41:31 +0800 Subject: [PATCH 11/15] [ROCM] added a cudnn switch of conv2d for rocm platform (#31836) --- paddle/fluid/platform/flags.cc | 12 +++++++ .../pybind/global_value_getter_setter.cc | 4 ++- python/paddle/fluid/__init__.py | 1 + python/paddle/fluid/layers/nn.py | 4 +++ .../fluid/tests/unittests/test_conv2d_op.py | 36 +++++++++++++++++++ python/paddle/nn/layer/conv.py | 5 +++ 6 files changed, 61 insertions(+), 1 deletion(-) diff --git 
a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 1a55562f2b8244..fa77c0be037df3 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -564,3 +564,15 @@ DEFINE_string(tracer_mkldnn_ops_on, "", */ DEFINE_string(tracer_mkldnn_ops_off, "", "List of OneDNN operation types to be turned off"); + +/** + * CUDNN related FLAG + * Name: conv2d_disable_cudnn + * Since Version: + * Value Range: bool, default=false + * Example: + * Note: Disable cudnn in conv2d. + */ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +DEFINE_bool(conv2d_disable_cudnn, false, "Disable cudnn in conv2d"); +#endif diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc index 6074d191ad2be0..e8ba16398d2b00 100644 --- a/paddle/fluid/pybind/global_value_getter_setter.cc +++ b/paddle/fluid/pybind/global_value_getter_setter.cc @@ -72,6 +72,7 @@ DECLARE_uint64(conv_workspace_size_limit); DECLARE_bool(cudnn_batchnorm_spatial_persistent); DECLARE_bool(cudnn_deterministic); DECLARE_bool(cudnn_exhaustive_search); +DECLARE_bool(conv2d_disable_cudnn); // data processing DECLARE_bool(enable_cublas_tensor_op_math); // device management @@ -367,7 +368,8 @@ static void RegisterGlobalVarGetterSetter() { FLAGS_fraction_of_cuda_pinned_memory_to_use, FLAGS_fraction_of_gpu_memory_to_use, FLAGS_initial_gpu_memory_in_mb, FLAGS_reallocate_gpu_memory_in_mb, FLAGS_enable_cublas_tensor_op_math, - FLAGS_selected_gpus, FLAGS_sync_nccl_allreduce); + FLAGS_selected_gpus, FLAGS_sync_nccl_allreduce, + FLAGS_conv2d_disable_cudnn); #endif #ifdef PADDLE_WITH_XPU REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_selected_xpus); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index b24da29d0f5fdd..ae3418687853b9 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -230,6 +230,7 @@ def __bootstrap__(): 'gpu_allocator_retry_time', 'local_exe_sub_scope_limit', 
'gpu_memory_limit_mb', + 'conv2d_disable_cudnn', ] core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)]) core.init_glog(sys.argv[0]) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 00d1db19fc2f51..6bc69ffd5cd324 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1603,6 +1603,10 @@ def _get_default_param_initializer(): pre_bias = helper.create_variable_for_type_inference(dtype) + if (core.is_compiled_with_cuda() and paddle.fluid.get_flags( + "FLAGS_conv2d_disable_cudnn")["FLAGS_conv2d_disable_cudnn"]): + use_cudnn = False + helper.append_op( type=l_type, inputs={ diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index 29c35d28d4d2e3..83bba0b0ca1c3a 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -1465,5 +1465,41 @@ def run_7(): self.assertRaises(ValueError, run_7) +# --------- test environment variable ------ +@unittest.skipIf( + not (core.is_compiled_with_cuda() or core.is_compiled_with_rocm()), + "core is not compiled with CUDA or ROCM") +class TestConv2DEnviron(unittest.TestCase): + def run_conv2d_api(self): + inputs = fluid.layers.data( + shape=[2, 3, 5, 5], + append_batch_size=False, + name="inputs", + dtype="float32") + fluid.layers.conv2d( + input=inputs, + num_filters=4, + filter_size=[3, 3], + stride=[1, 1], + padding=0, + dilation=[1, 1], + groups=1, + data_format="NCHW") + + x_var = paddle.uniform((2, 3, 5, 5), dtype="float32", min=-1., max=1.) 
+ conv = paddle.nn.Conv2D( + in_channels=3, + out_channels=4, + kernel_size=(3, 3), + data_format="NCHW") + y_var = conv(x_var) + + def test_environ(self): + fluid.set_flags({'FLAGS_conv2d_disable_cudnn': False}) + self.run_conv2d_api() + fluid.set_flags({'FLAGS_conv2d_disable_cudnn': True}) + self.run_conv2d_api() + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py index 389920b923876d..d65b874c8badc6 100644 --- a/python/paddle/nn/layer/conv.py +++ b/python/paddle/nn/layer/conv.py @@ -25,6 +25,7 @@ import numpy as np +from ...fluid import get_flags from ...fluid import core from ...device import get_cudnn_version from ...fluid.dygraph import layers @@ -644,6 +645,10 @@ def __init__(self, bias_attr=bias_attr, data_format=data_format) + if (core.is_compiled_with_cuda() and get_flags( + "FLAGS_conv2d_disable_cudnn")["FLAGS_conv2d_disable_cudnn"]): + self._use_cudnn = False + def forward(self, x): if self._padding_mode != 'zeros': x = F.pad(x, From 525c32e33c8023472cb8178990bbc9c2ec3f1e3c Mon Sep 17 00:00:00 2001 From: liym27 <33742067+liym27@users.noreply.github.com> Date: Mon, 29 Mar 2021 19:47:55 +0800 Subject: [PATCH 12/15] =?UTF-8?q?Fix=20bug=20of=20set=5Fvalue=20op?= =?UTF-8?q?=EF=BC=9ADecerease=20axes=20to=20do=20right=20broadcast=20(#318?= =?UTF-8?q?75)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- paddle/fluid/operators/set_value_op.cc | 11 ++- paddle/fluid/operators/set_value_op.h | 72 ++++++++++++++++--- python/paddle/fluid/framework.py | 11 ++- .../tests/unittests/test_set_value_op.py | 14 ++++ 4 files changed, 95 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index 94d34c648d1748..105d61015fcb9d 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -124,6 +124,9 @@ class SetValueMaker : public 
framework::OpProtoAndCheckerMaker { AddAttr>( "steps", "(list) Stride step from the start to the end.") .SetDefault({}); + AddAttr>("decrease_axes", + "(list) The axes to decrease.") + .SetDefault({}); AddAttr>("bool_values", "Store the bool values.") .SetDefault({}); @@ -185,4 +188,10 @@ Upgrade set_value, add 3 inputs [StartsTensorList, EndsTensorList, StepsTensorLi "Ending indices of corresponding axis in `axes`.", std::vector{}) .NewAttr("steps", "Stride step from the start to the end.", - std::vector{})); + std::vector{})) + .AddCheckpoint( + R"ROC( +Upgrade set_value, add 1 attribute [decrease_axes]. + )ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "decrease_axes", "The axes to decrease.", std::vector{})); diff --git a/paddle/fluid/operators/set_value_op.h b/paddle/fluid/operators/set_value_op.h index 325a2b0b865e9d..eca51147f8159e 100644 --- a/paddle/fluid/operators/set_value_op.h +++ b/paddle/fluid/operators/set_value_op.h @@ -106,10 +106,10 @@ inline void CheckAndUpdateSlice(const framework::DDim in_dims, } inline framework::DDim GetSliceDims(const framework::DDim in_dims, - const std::vector axes, - const std::vector starts, - const std::vector ends, - const std::vector steps) { + const std::vector& axes, + const std::vector& starts, + const std::vector& ends, + const std::vector& steps) { framework::DDim slice_dims(in_dims); for (size_t i = 0; i < axes.size(); ++i) { @@ -127,6 +127,38 @@ inline framework::DDim GetSliceDims(const framework::DDim in_dims, return slice_dims; } +inline framework::DDim GetDecreasedDims( + const framework::DDim slice_dims, + const std::vector& decrease_axes) { + // Get dims after decreasing axes. 
+ framework::DDim decreased_dims(slice_dims); + if (decrease_axes.size() > 0) { + for (size_t i = 0; i < decrease_axes.size(); ++i) { + int64_t axis = decrease_axes[i]; + PADDLE_ENFORCE_EQ( + decreased_dims[axis], 1, + platform::errors::InvalidArgument("decrease dim should be 1")); + decreased_dims[axis] = 0; + } + + std::vector new_shape; + for (int i = 0; i < decreased_dims.size(); ++i) { + if (decreased_dims[i] != 0) { + new_shape.push_back(decreased_dims[i]); + } + } + + // NOTE(liym27): Paddle does not support that the rank of Tensor is 0, and + // uses [1] instead. + if (new_shape.size() == 0) { + new_shape.push_back(1); + } + + decreased_dims = framework::make_ddim(new_shape); + } + return decreased_dims; +} + template class SetValueKernel : public framework::OpKernel { public: @@ -179,6 +211,7 @@ class SetValueKernel : public framework::OpKernel { auto ends = ctx.Attr>("ends"); auto steps = ctx.Attr>("steps"); auto shape = ctx.Attr>("shape"); + auto decrease_axes = ctx.Attr>("decrease_axes"); auto dtype = in->type(); if (!starts_tensor_list.empty()) { @@ -194,6 +227,7 @@ class SetValueKernel : public framework::OpKernel { auto in_dims = in->dims(); CheckAndUpdateSlice(in_dims, axes, &starts, &ends, &steps); auto slice_dims = GetSliceDims(in_dims, axes, starts, ends, steps); + auto decrease_slice_dims = GetDecreasedDims(slice_dims, decrease_axes); auto place = ctx.GetPlace(); auto& eigen_place = @@ -212,13 +246,13 @@ class SetValueKernel : public framework::OpKernel { // set_value is what we want. 
TensorCopy(*in, place, out); - Tensor slice_t(dtype), pad_t(dtype); - slice_t.mutable_data(slice_dims, place); - pad_t.mutable_data(in_dims, place); + Tensor slice_tensor(dtype), pad_tensor(dtype); + slice_tensor.mutable_data(slice_dims, place); + pad_tensor.mutable_data(in_dims, place); - auto pad_e = framework::EigenTensor::From(pad_t, in_dims); + auto pad_e = framework::EigenTensor::From(pad_tensor, in_dims); auto out_e = framework::EigenTensor::From(*out); - auto slice_e = framework::EigenTensor::From(slice_t, slice_dims); + auto slice_e = framework::EigenTensor::From(slice_tensor, slice_dims); // Step 1: Set the value of out at `_index` to zero slice_e.device(eigen_place) = slice_e.constant(T(0)); @@ -244,11 +278,26 @@ class SetValueKernel : public framework::OpKernel { // Step 2: Set a tensor with the same shape as out tensor. And its data at // '_index' is the same as value_tensor, and data out of '_index' to zero + // - Step 2.1 Set slice tensor with value + + // NOTE(liym27): [ Why resize slice_tensor here? ] + // A: When we do broadcasting on slice_tensor and value_tensor, the shape of + // slice_tensor should be the decreased dims. + // e.g. + // x[:,0] = value_tensor + // x's shape = [3, 4], value_tensor's shape = [3] + // We get slice_dims = [3, 1], decrease_slice_dims = [3] + // If we do broadcasting on Tensor with shape [3, 1] and [3], the result's + // shape is [3, 3], which crosses the border; + // If we do broadcasting on Tensor with shape [3] and [3], the result's shape + // is [3], which is right. 
+ + slice_tensor.Resize(decrease_slice_dims); if (value_tensor != nullptr) { // ElementwiseComputeEx can do broadcasting ElementwiseComputeEx, DeviceContext, T>( - ctx, &slice_t, value_tensor, -1, SubFunctor(), &slice_t); + ctx, &slice_tensor, value_tensor, -1, SubFunctor(), &slice_tensor); } else { Tensor value_t(dtype); auto value_dims = framework::make_ddim(shape); @@ -257,8 +306,9 @@ class SetValueKernel : public framework::OpKernel { CopyVecotorToTensor(value_name.c_str(), &value_t, ctx); value_t.Resize(value_dims); ElementwiseComputeEx, DeviceContext, T>( - ctx, &slice_t, &value_t, -1, SubFunctor(), &slice_t); + ctx, &slice_tensor, &value_t, -1, SubFunctor(), &slice_tensor); } + slice_tensor.Resize(slice_dims); // - Step 2.2 Pad slice tensor with 0 pad_e.device(eigen_place) = pad_e.constant(T(0)); diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index db487128bbe755..18162059e998eb 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1863,6 +1863,7 @@ def __setitem__(self, item, value): if not isinstance(item, tuple): item = [item] + decrease_axes = [] axes = [] starts = [] ends = [] @@ -1933,15 +1934,23 @@ def replace_ellipsis(item): if end is None: end = max_integer if step > 0 else (0 - max_integer) else: + decrease_axes.append(dim) start = slice_item end = slice_item + 1 if slice_item != -1 else max_integer step = 1 + axes.append(dim) starts.append(start) ends.append(end) steps.append(step) - attrs = {'axes': axes, 'starts': starts, 'ends': ends, 'steps': steps} + attrs = { + 'axes': axes, + 'starts': starts, + 'ends': ends, + 'steps': steps, + 'decrease_axes': decrease_axes + } from .layers import utils if utils._contain_var(starts): diff --git a/python/paddle/fluid/tests/unittests/test_set_value_op.py b/python/paddle/fluid/tests/unittests/test_set_value_op.py index 23dac41f64abfe..1239a2249cc43e 100644 --- a/python/paddle/fluid/tests/unittests/test_set_value_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_set_value_op.py @@ -671,6 +671,20 @@ def _get_answer(self): self.data[0] = self.value +class TestSetValueValueShape5(TestSetValueApi): + def set_value(self): + self.value = np.array([3, 3, 3]).astype(self.dtype) + + def set_shape(self): + self.shape = [3, 4] + + def _call_setitem(self, x): + x[:, 0] = paddle.assign(self.value) # x is Paddle.Tensor + + def _get_answer(self): + self.data[:, 0] = self.value + + # 4. Test error class TestError(TestSetValueBase): def _value_type_error(self): From b48841ba2e7335eaa435a54436ed580d4aef001c Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Mon, 29 Mar 2021 19:53:12 +0800 Subject: [PATCH 13/15] modify API nn.Bilinear's doc (#31889) * modify API nn.Bilinear's doc, test=develop * modify API nn.Bilinear's doc, test=develop --- python/paddle/nn/layer/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index d0f97625bcba75..60c846f9f76ec0 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -578,7 +578,7 @@ class Bilinear(layers.Layer): .. math:: - out_{i} = x1 * W_{i} * {x2^\mathrm{T}}, i=0,1,...,size-1 + out_{i} = x1 * W_{i} * {x2^\mathrm{T}}, i=0,1,...,outfeatures-1 out = out + b @@ -586,7 +586,7 @@ class Bilinear(layers.Layer): - :math:`x1`: the first input contains in1_features elements, shape is [batch_size, in1_features]. - :math:`x2`: the second input contains in2_features elements, shape is [batch_size, in2_features]. - :math:`W_{i}`: the i-th learned weight, shape is [in1_features, in2_features], and learned weight's shape is [out_features, in1_features, in2_features]. - - :math:`out_{i}`: the i-th element of out, shape is [batch_size, out_features]. + - :math:`out_{i}`: the i-th element of out, shape is [batch_size], and out's shape is [batch_size, out_features]. - :math:`b`: the learned bias, shape is [1, out_features]. 
- :math:`x2^\mathrm{T}`: the transpose of :math:`x2`. From 8829a309fe056dfecd472f19050c390fd049fead Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Mon, 29 Mar 2021 20:11:26 +0800 Subject: [PATCH 14/15] Delete cudnn6 code (#31835) --- paddle/fluid/operators/conv_cudnn_op_cache.h | 5 --- paddle/fluid/operators/cudnn_lstm_cache.h | 10 +----- paddle/fluid/operators/cudnn_rnn_cache.h | 7 ---- paddle/fluid/platform/cudnn_helper.h | 38 -------------------- 4 files changed, 1 insertion(+), 59 deletions(-) diff --git a/paddle/fluid/operators/conv_cudnn_op_cache.h b/paddle/fluid/operators/conv_cudnn_op_cache.h index ddddb7f8641ba1..23a471cfa00674 100644 --- a/paddle/fluid/operators/conv_cudnn_op_cache.h +++ b/paddle/fluid/operators/conv_cudnn_op_cache.h @@ -40,11 +40,6 @@ static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT; static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT; -#else -// cuDNN v5 has no CUDNN_CONVOLUTION_FWD_ALGO_COUNT etc. -static constexpr size_t kNUM_CUDNN_FWD_ALGS = 7; -static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4; -static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5; #endif } // namespace operators diff --git a/paddle/fluid/operators/cudnn_lstm_cache.h b/paddle/fluid/operators/cudnn_lstm_cache.h index 3181e4b1d990b7..b7859237e737a1 100644 --- a/paddle/fluid/operators/cudnn_lstm_cache.h +++ b/paddle/fluid/operators/cudnn_lstm_cache.h @@ -85,20 +85,12 @@ class ScopedRNNBase { dropout_desc_.descriptor(handle, place, initialized_, dropout_prob_, dropout_state, seed_, state_size); -// ------------------- cudnn rnn descriptors --------------------- -#if CUDNN_VERSION >= 6000 + // ------------------- cudnn rnn descriptors --------------------- PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( handle, rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(), CUDNN_LINEAR_INPUT, is_bidirec_ ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, CUDNN_RNN_ALGO_STANDARD, cudnn_type)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor( - rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(), - CUDNN_LINEAR_INPUT, - is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, - cudnn_type)); -#endif #if CUDNN_VERSION >= 7201 if (!sequence_length.empty()) { diff --git a/paddle/fluid/operators/cudnn_rnn_cache.h b/paddle/fluid/operators/cudnn_rnn_cache.h index 13a3e7d09b9f62..a6a23a91c76c02 100644 --- a/paddle/fluid/operators/cudnn_rnn_cache.h +++ b/paddle/fluid/operators/cudnn_rnn_cache.h @@ -168,18 +168,11 @@ struct CudnnRNNCache { PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); -#if CUDNN_VERSION >= 6000 PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, CUDNN_RNN_ALGO_STANDARD, cudnn_type)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor( - rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, - is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, - cudnn_type)); -#endif PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index af0df2efc5e6d5..6c3c96b68c48a1 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -91,30 +91,6 @@ enum class ActivationMode { kBandPass, }; -#if CUDNN_VERSION < 6000 -#pragma message "CUDNN version under 6.0 is supported at best effort." -#pragma message "We strongly encourage you to move to 6.0 and above." -#pragma message "This message is intended to annoy you enough to update." 
-#pragma message \ - "please see https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/" - -inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) { - switch (mode) { - case PoolingMode::kMaximumDeterministic: - return CUDNN_POOLING_MAX; - case PoolingMode::kAverageExclusive: - return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; - case PoolingMode::kAverageInclusive: - return CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; - case PoolingMode::kMaximum: - return CUDNN_POOLING_MAX; - default: - PADDLE_THROW( - platform::errors::Unimplemented("Unexpected CUDNN pooling mode.")); - } -} -#else - inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) { switch (mode) { case PoolingMode::kMaximumDeterministic: @@ -130,7 +106,6 @@ inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) { platform::errors::Unimplemented("Unexpected CUDNN pooling mode.")); } } -#endif // CUDNN_VERSION < 6000 inline ActivationMode StringToActivationMode(const std::string& str) { if (str == "identity") { @@ -471,19 +446,6 @@ class ScopedConvolutionDescriptor { "of pads is %d, size of dilations is %d.", pads.size(), dilations.size())); -#if !CUDNN_VERSION_MIN(6, 0, 0) - // cudnn v5 does not support dilation conv, the argument is called upscale - // instead of dilations and it is must be one. - for (size_t i = 0; i < dilations.size(); ++i) { - PADDLE_ENFORCE_EQ(dilations[i], 1, - platform::errors::InvalidArgument( - "Dilations conv is not supported in this cuDNN " - "version(%d.%d.%d).", - CUDNN_VERSION / 1000, CUDNN_VERSION % 1000 / 100, - CUDNN_VERSION % 100)); - } -#endif - cudnnDataType_t compute_type = (type == CUDNN_DATA_DOUBLE) ? 
CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT; PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetConvolutionNdDescriptor( From a71d72d921fc861051553c6d44b32bc9037706bc Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Mon, 29 Mar 2021 20:30:37 +0800 Subject: [PATCH 15/15] relu forward and backward with vectortype (#31869) --- paddle/fluid/operators/activation_op.cu | 286 +++++++++++++++++++++++- 1 file changed, 285 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 2033081af224a4..c6d2fbccd8e84b 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -10,8 +10,278 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/math/math_cuda_utils.h" +#include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/float16.h" +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using float16 = paddle::platform::float16; + +template +struct CudaVecType { + using type = T; + static constexpr int vecsize = 1; +}; + +template <> +struct CudaVecType { + using type = __half2; + static constexpr int vecsize = 2; +}; + +template <> +struct CudaVecType { + using type = float4; + static constexpr int vecsize = 4; +}; + +template +class BaseGPUFunctor { + public: + using ELEMENT_TYPE = T; +}; + +/* ========================================================================== */ + +/* =========================== relu forward ============================ */ +template +class ReluGPUFunctor : public BaseGPUFunctor { + private: + T zero_; + + public: + ReluGPUFunctor() { zero_ = static_cast(0.0f); } + + // for relu forward when T is double + __device__ __forceinline__ typename CudaVecType::type Compute( + const typename CudaVecType::type* x); + + // when 
num % vecsize != 0 this func will be used + __device__ __forceinline__ T ComputeRemainder(const T x) { + return x > zero_ ? x : zero_; + } +}; + +template <> +__device__ __forceinline__ CudaVecType::type +ReluGPUFunctor::Compute(const CudaVecType::type* x) { +// relu forward : out = max(x, 0) +#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 + return __ldg(x) > zero_ ? __ldg(x) : zero_; +#else + return (*x) > zero_ ? (*x) : zero_; +#endif +} + +template <> +__device__ __forceinline__ CudaVecType::type +ReluGPUFunctor::Compute(const CudaVecType::type* xx) { + // relu forward : out = max(xx, 0) + return make_float4((xx->x > zero_) * (xx->x), (xx->y > zero_) * (xx->y), + (xx->z > zero_) * (xx->z), (xx->w > zero_) * (xx->w)); +} + +template <> +__device__ __forceinline__ CudaVecType::type +ReluGPUFunctor::Compute(const CudaVecType::type* in) { +// relu forward : out = max(in, 0) +#ifdef __HIPCC__ || CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + const half2 kzero = __float2half2_rn(0.0f); + return __hmul2(__hgt2(__ldg(in), kzero), __ldg(in)); +#else + const float2 xx = __half22float2(*in); + return __floats2half2_rn((xx.x > 0.0f) * static_cast(xx.x), + (xx.y > 0.0f) * static_cast(xx.y)); +#endif +} +/* ========================================================================== */ + +/* =========================== relu backward ============================ + */ + +template +class ReluGradGPUFunctor : public BaseGPUFunctor { + private: + T zero_; + + public: + ReluGradGPUFunctor() { zero_ = static_cast(0.0f); } + + // for relu backward when T is double + __device__ __forceinline__ typename CudaVecType::type Compute( + const typename CudaVecType::type* out, + const typename CudaVecType::type* dout); + + // when num % vecsize != 0 this func will be used + __device__ __forceinline__ T ComputeRemainder(const T out, const T dout) { + // relu backward : dx = out > 0 ? dout : 0 + return out > zero_ ? 
dout : zero_; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template <> +__device__ __forceinline__ CudaVecType::type +ReluGradGPUFunctor::Compute(const CudaVecType::type* out, + const CudaVecType::type* dout) { +// relu backward : dx = out > 0 ? dout : 0; +#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 + return __ldg(out) > zero_ ? __ldg(dout) : zero_; +#else + return (*out) > zero_ ? (*dout) : zero_; +#endif +} + +template <> +__device__ __forceinline__ CudaVecType::type +ReluGradGPUFunctor::Compute(const CudaVecType::type* out, + const CudaVecType::type* dout) { + // relu backward : dx = out > 0 ? dout : 0; + return make_float4((out->x > zero_) * (dout->x), (out->y > zero_) * (dout->y), + (out->z > zero_) * (dout->z), + (out->w > zero_) * (dout->w)); +} + +template <> +__device__ __forceinline__ CudaVecType::type +ReluGradGPUFunctor::Compute(const CudaVecType::type* out, + const CudaVecType::type* dout) { +// relu backward : dx = out > 0 ? dout : 0; +#ifdef __HIPCC__ || CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + const half2 kzero = __float2half2_rn(0.0f); + return __hmul2(__hgt2(__ldg(out), kzero), __ldg(dout)); +#else + const float2 xx = __half22float2(*out); + const float2 yy = __half22float2(*dout); + return __floats2half2_rn((xx.x > 0.0f) * static_cast(yy.x), + (xx.y > 0.0f) * static_cast(yy.y)); +#endif +} + +/* ========================================================================== */ + +template +__global__ void ActivationGradKernelVec(const T* forward_data, const T* dout, + T* dx, int num, Functor functor) { + using VecType = typename CudaVecType::type; + constexpr int vecsize = CudaVecType::vecsize; + int idx = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + int loop = num / vecsize; + int tail = num % vecsize; + const VecType* in_forward = reinterpret_cast(forward_data); + const VecType* in_dout = reinterpret_cast(dout); + VecType* out = reinterpret_cast(dx); + + for (int i = idx; i < loop; i 
+= stride) { + out[i] = functor.Compute((in_forward + i), (in_dout + i)); + } + + while (idx == loop && tail) { + dx[num - tail] = + functor.ComputeRemainder(forward_data[num - tail], dout[num - tail]); + --tail; + } +} + +template +__global__ void ActivationkernelVec(const T* src, T* dst, int num, + Functor functor) { + constexpr int vecsize = CudaVecType::vecsize; + using VecType = typename CudaVecType::type; + int idx = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + int loop = num / vecsize; + int tail = num % vecsize; + const VecType* in = reinterpret_cast(src); + VecType* out = reinterpret_cast(dst); + + for (int i = idx; i < loop; i += stride) { + out[i] = functor.Compute((in + i)); + } + + while (idx == loop && tail) { + dst[num - tail] = functor.ComputeRemainder(src[num - tail]); + --tail; + } +} + +template +class ActivationGPUKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = nullptr; + framework::Tensor* out = nullptr; + ExtractActivationTensor(context, &in_x, &out); + auto& dev_ctx = context.template device_context(); + + int num = in_x->numel(); + const T* input_data = in_x->data(); + T* output_data = out->mutable_data(dev_ctx.GetPlace(), + static_cast(num * sizeof(T))); + + int block = 512; +#ifdef __HIPCC__ + block = 256; +#endif + Functor functor; + constexpr int vecsize = CudaVecType::vecsize; + int grid = max((num / vecsize + block - 1) / block, 1); + auto stream = context.cuda_device_context().stream(); + ActivationkernelVec<<>>( + input_data, output_data, num, functor); + } +}; + +template +class ActivationGradGPUKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor *x, *out, *d_out; + framework::Tensor* d_x = nullptr; + x 
= out = d_out = nullptr; + ExtractActivationGradTensor(context, &x, &out, &d_out, + &d_x); + int numel = d_out->numel(); + auto& dev_ctx = context.template device_context(); + auto* dx_data = d_x->mutable_data( + dev_ctx.GetPlace(), static_cast(numel * sizeof(T))); + auto* dout_data = d_out->data(); + + auto* forward_data = dout_data; + if (static_cast(Functor::FwdDeps()) == static_cast(kDepOut)) { + // Only need forward output Out + forward_data = out->data(); + } else if (static_cast(Functor::FwdDeps()) == + static_cast(kDepX)) { + // Only need forward input X + forward_data = x->data(); + } + + int block = 512; +#ifdef __HIPCC__ + block = 256; +#endif + Functor functor; + constexpr int vecsize = CudaVecType::vecsize; + int grid = max((numel / vecsize + block - 1) / block, 1); + auto stream = context.cuda_device_context().stream(); + ActivationGradKernelVec<<>>( + forward_data, dout_data, dx_data, numel, functor); + } +}; + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; namespace plat = paddle::platform; @@ -60,7 +330,21 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ /* =========================== relu register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(relu, Relu, ReluCUDAFunctor, ReluGradFunctor); +REGISTER_OP_CUDA_KERNEL( + relu, ops::ActivationGPUKernel>, + ops::ActivationGPUKernel>, + ops::ActivationGPUKernel>); + +REGISTER_OP_CUDA_KERNEL( + relu_grad, ops::ActivationGradGPUKernel>, + ops::ActivationGradGPUKernel>, + ops::ActivationGradGPUKernel>); REGISTER_OP_CUDA_KERNEL( relu_grad_grad,