From 270699e6478d1314b4f723bc603856d54f0bf59a Mon Sep 17 00:00:00 2001
From: ronnywang <524019753@qq.com>
Date: Wed, 24 Mar 2021 10:46:12 +0800
Subject: [PATCH 01/12] [ROCM] fix test_matmul_v2_op (#31802)

---
 paddle/fluid/operators/dot_op.h                          | 2 +-
 python/paddle/fluid/tests/unittests/test_matmul_v2_op.py | 7 +++++--
 2 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h
index 0b0b7f69b9d849..1b607922eda1d8 100644
--- a/paddle/fluid/operators/dot_op.h
+++ b/paddle/fluid/operators/dot_op.h
@@ -160,7 +160,7 @@ struct DotGradFunction<DeviceContext, T, math::DisableComplex<T>> {
                   const Tensor* tensor_dout, Tensor* tensor_dx,
                   Tensor* tensor_dy,
                   const paddle::framework::ExecutionContext& ctx) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
     if (1 == tensor_dout->dims().size()) {
       auto dout = framework::EigenVector<T>::Flatten(*tensor_dout);
 
diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
index 761d318d7b8a3d..efcc0e4cfe3232 100644
--- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
@@ -67,7 +67,7 @@ def config(self):
         self.trans_y = False
 
     def init_kernel_type(self):
-        self.dtype = "float64"
+        self.dtype = "float32" if core.is_compiled_with_rocm() else "float64"
 
     def setUp(self):
         self.init_kernel_type()
@@ -91,7 +91,10 @@ def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X', 'Y'], 'Out')
+        if core.is_compiled_with_rocm():
+            self.check_grad(['X', 'Y'], 'Out', max_relative_error=1e-2)
+        else:
+            self.check_grad(['X', 'Y'], 'Out')
 
 
 class TestMatMuklOp2(TestMatMulV2Op):

From 68497e7b39a13939f1a466f56874fc5aa984878a Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Wed, 24 Mar 2021 14:26:51 +0800
Subject: [PATCH 02/12] change trainable to stop_gradient in optimizer (#31823)

---
 python/paddle/optimizer/adam.py      |  2 +-
 python/paddle/optimizer/adamax.py    |  2 +-
 python/paddle/optimizer/optimizer.py | 19 ++++++++++---------
 3 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py
index b0c05cf8de76c2..0cafbda893dd2f 100644
--- a/python/paddle/optimizer/adam.py
+++ b/python/paddle/optimizer/adam.py
@@ -351,7 +351,7 @@ def step(self):
         """
         params_grads = []
         for param in self._parameter_list:
-            if not param.trainable:
+            if param.stop_gradient:
                 continue
             if param._grad_ivar() is not None:
                 grad_var = param._grad_ivar()
diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py
index bd65fc19c32aaf..4a6c2278a46f40 100644
--- a/python/paddle/optimizer/adamax.py
+++ b/python/paddle/optimizer/adamax.py
@@ -184,7 +184,7 @@ def _finish_update(self, block, parameters_and_grads):
         """
         assert isinstance(block, framework.Block)
         for param, grad in parameters_and_grads:
-            if grad is None or param.trainable is False:
+            if grad is None or param.stop_gradient is True:
                 continue
             with param.block.program._optimized_guard(
                 [param, grad]), name_scope('adamax'):
diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py
index 212dad7c77cb4f..b37d1726064113 100644
--- a/python/paddle/optimizer/optimizer.py
+++ b/python/paddle/optimizer/optimizer.py
@@ -542,7 +542,7 @@ def _get_accumulator(self, name, param):
 
     def _update_param_device_map(self, parameters_and_grads, target_block):
         for param_and_grad in parameters_and_grads:
-            if param_and_grad[0].trainable is True:
+            if param_and_grad[0].stop_gradient is False:
                 param_name = param_and_grad[0].name
                 ops = target_block.ops
                 device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName(
@@ -598,14 +598,14 @@ def _create_optimization_pass(self, parameters_and_grads):
         self._update_param_device_map(parameters_and_grads, target_block)
         self._create_accumulators(
             target_block,
-            [p[0] for p in parameters_and_grads if p[0].trainable])
+            [p[0] for p in parameters_and_grads if not p[0].stop_gradient])
         self._create_global_learning_rate()
 
         if framework.in_dygraph_mode():
             for param_and_grad in parameters_and_grads:
                 if param_and_grad[1] is None:
                     continue
-                if param_and_grad[0].trainable is True:
+                if param_and_grad[0].stop_gradient is False:
                     self._append_optimize_op(target_block, param_and_grad)
         else:
             for param_and_grad in parameters_and_grads:
@@ -613,7 +613,7 @@ def _create_optimization_pass(self, parameters_and_grads):
                     continue
                 with param_and_grad[0].block.program._optimized_guard(
                         param_and_grad), name_scope("optimizer"):
-                    if param_and_grad[0].trainable is True:
+                    if param_and_grad[0].stop_gradient is False:
                         device = self._get_device_for_param(param_and_grad[0]
                                                             .name)
                         with device_guard(device):
@@ -689,7 +689,7 @@ def backward(self,
 
             params_grads = []
             for param in parameter_list:
-                if not param.trainable:
+                if param.stop_gradient:
                     continue
                 if param._grad_ivar() is not None:
                     # create gradient tensor
@@ -789,8 +789,9 @@ def _apply_optimize(self, loss, startup_program, params_grads):
     def _get_no_grad_set(self, loss, no_grad_set=None):
         no_grad_set = _get_no_grad_set_name(no_grad_set)
         parameters = loss.block.program.global_block().all_parameters()
-        param_no_trainable = set(
-            [param.name for param in parameters if param.trainable is False])
+        param_no_trainable = set([
+            param.name for param in parameters if param.stop_gradient is True
+        ])
         # If the parameter is no trainable, it should not have a gradient.
         no_grad_set.update(param_no_trainable)
 
@@ -825,7 +826,7 @@ def clear_grad(self):
 
         """
         for p in self._parameter_list:
-            if p.trainable:
+            if not p.stop_gradient:
                 p.clear_gradient()
 
     @imperative_base.no_grad
@@ -920,7 +921,7 @@ def step(self):
         """
         params_grads = []
         for param in self._parameter_list:
-            if not param.trainable:
+            if param.stop_gradient:
                 continue
             if param._grad_ivar() is not None:
                 grad_var = param._grad_ivar()

From 84a551380efa7feffc496112a1b746ab7d0617d1 Mon Sep 17 00:00:00 2001
From: cc <52520497+juncaipeng@users.noreply.github.com>
Date: Wed, 24 Mar 2021 14:40:14 +0800
Subject: [PATCH 03/12] [dygraph qat] Refine saving output scale to infer
 program (#31784)

* Refine saving output scale to infer program
---
 .../slim/quantization/imperative/qat.py       | 229 ++++++++++--------
 .../slim/quantization/imperative/utils.py     |  34 ++-
 .../slim/tests/test_imperative_out_scale.py   |  23 +-
 3 files changed, 166 insertions(+), 120 deletions(-)

diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
index 68b4cfdc661b4a..ea2e8e073b5084 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
@@ -251,8 +251,8 @@ def __init__(self,
         super(ImperativeQuantizeInputs, self).__init__()
 
         self._quantizable_layer_type = tuple(
-            utils.supported_quant_layers_map[layer]
-            if layer in utils.supported_quant_layers_map else layer
+            utils.quant_input_layers_map[layer]
+            if layer in utils.quant_input_layers_map else layer
             for layer in quantizable_layer_type)
         for layer in self._quantizable_layer_type:
             assert not isinstance(layer, str), \
@@ -324,12 +324,11 @@ def apply(self, model):
             target = name[last_idx:idx]
 
             quant_layer = self._get_quantized_layer(layer)
-            setattr(quant_layer, "layer_name", layer.full_name())
             setattr(obj, target, quant_layer)
 
     def _get_quantized_layer(self, layer):
         quant_layer_name = None
-        for key, value in utils.supported_quant_layers_map.items():
+        for key, value in utils.quant_input_layers_map.items():
             if isinstance(layer, value):
                 quant_layer_name = 'Quantized' + key
                 break
@@ -372,6 +371,9 @@ def apply(self, model):
         """
         assert isinstance(model, dygraph.Layer), \
             "The model must be the instance of dygraph.Layer."
+
+        # Calculate the target ops's output scale, and don't consider
+        # the skip_quant attr
         for _, layer in model.named_sublayers():
             if self._is_target_layer(layer):
                 self._init_scale_params(layer)
@@ -411,24 +413,21 @@ def save_quantized_model(self, layer, path, input_spec=None, **config):
         assert isinstance(layer, dygraph.Layer), \
             "The model must be the instance of dygraph.Layer."
 
-        # remove handles and collect output scales
+        self._gather_output_scale(layer)
+
         with dygraph.guard():
             layer.eval()
             for handle in self._register_hook_handle_list:
                 handle.remove()
-            for _, sub_layer in layer.named_sublayers():
-                if self._is_target_layer(sub_layer):
-                    if hasattr(sub_layer, "layer_name"):
-                        layer_name = sub_layer.layer_name
-                    else:
-                        layer_name = sub_layer.full_name()
-                    if hasattr(sub_layer, "_quant_out_scale"):
-                        self._out_scale_dict[layer_name] = float(
-                            sub_layer._quant_out_scale)
-
-        # save the quantized model that doesn't have output scales
         paddle.jit.save(layer=layer, path=path, input_spec=input_spec, **config)
 
+        if len(self._out_scale_dict) == 0:
+            warnings.warn("Warning: No Layer of the model while to be " \
+                          "saved contains the out_threshold attribute, so the " \
+                          "generated inference model would not contain the " \
+                          "out_threshold.")
+            return
+
         # load static model
         is_dynamic_mode = False
         if paddle.in_dynamic_mode():
@@ -443,79 +442,26 @@ def save_quantized_model(self, layer, path, input_spec=None, **config):
         basename = os.path.basename(path)
         model_filename = basename + INFER_MODEL_SUFFIX
         params_filename = basename + INFER_PARAMS_SUFFIX
-        [inference_program, feed_target_names, fetch_targets] = (
+
+        [infer_program, feed_target_names, fetch_targets] = (
             load_inference_model(
                 dirname=dirname,
                 executor=exe,
                 model_filename=model_filename,
                 params_filename=params_filename))
 
+        # TODO(jc): analyse whether the dygraph model has
+        # several blocks before applying qat
+        assert infer_program.num_blocks == 1, \
+            "Quantization aware training (QAT) requires the program " \
+            "only has a block for now. When the model has if-else or " \
+            "while, the program will have several blocks."
+
         # set output scales to the static model
-        check_behind_op = False
-        op_count = 0
-        ops_list = [key for key, _ in self._out_scale_dict.items()]
-        if len(ops_list) == 0:
-            warnings.warn(
-                "Warning: No Layer of the model while to be saved contains "
-                "the out_threshold attribute, so the generated inference "
-                "model would not contain the out_threshold.")
-        else:
-            # Because the Layer in dygraph may correspond to multiple ops
-            # in static program after being saved. To ensure correctness,
-            # the outscale collected for output of dygraph Layer can only
-            # be set to the last op in the corresponding ops in static program.
-            #
-            # We can judge the execution order of the ops which corresponding
-            # to dygraph Layer by check_behind_op
-            forward_op = None
-            for block in inference_program.blocks:
-                for op in block.ops:
-                    if op.type in utils.op_real_in_out_name:
-                        if op_count > len(ops_list):
-                            warnings.warn(
-                                "The number of Layer which has "
-                                "out_threshold attribute should be bigger than "
-                                "the op in inference model")
-                            break
-                        if check_behind_op:
-                            check_behind_op = False
-                            if op.type == "elementwise_add":
-                                if self._is_op_matched(ops_list[op_count], op,
-                                                       block):
-                                    op._set_attr("out_threshold",
-                                                 self._out_scale_dict[ops_list[
-                                                     op_count]])
-                                    op_count += 1
-                                    forward_op = None
-                                continue
-                            else:
-                                if forward_op is None:
-                                    raise ValueError(
-                                        "forward_op should not be None")
-                                if self._is_op_matched(ops_list[op_count],
-                                                       forward_op, block):
-                                    forward_op._set_attr(
-                                        "out_threshold", self._out_scale_dict[
-                                            ops_list[op_count]])
-                                    op_count += 1
-                                    forward_op = None
-
-                        if op.type in ["conv2d", "depthwise_conv2d", "matmul"]:
-                            check_behind_op = True
-                            forward_op = op
-                            continue
-                        if op_count >= len(ops_list):
-                            warnings.warn(
-                                "The number of Layer which has out_threshold attribute should be bigger than the op in inference model"
-                            )
-                            break
-                        if self._is_op_matched(ops_list[op_count], op, block):
-                            op._set_attr(
-                                "out_threshold",
-                                self._out_scale_dict[ops_list[op_count]])
-                            op_count += 1
-
-            self._set_skip_quant_attr(inference_program)
+        self._save_output_scale(infer_program)
+
+        # process skip quant
+        self._set_skip_quant_attr(infer_program)
 
         # save the final quantized model that has output scales
         save_inference_model(
@@ -523,16 +469,75 @@ def save_quantized_model(self, layer, path, input_spec=None, **config):
             feeded_var_names=feed_target_names,
             target_vars=fetch_targets,
             executor=exe,
-            main_program=inference_program.clone(),
+            main_program=infer_program.clone(),
             model_filename=model_filename,
             params_filename=params_filename)
 
         if is_dynamic_mode:
             paddle.disable_static()
 
+    def _gather_output_scale(self, layer):
+        """
+        Gather all output scales to self._out_scale_dict
+        """
+        with dygraph.guard():
+            layer.eval()
+            for _, sub_layer in layer.named_sublayers():
+                if self._is_target_layer(sub_layer):
+                    layer_name = sub_layer.full_name()
+                    if hasattr(sub_layer, "_quant_out_scale"):
+                        self._out_scale_dict[layer_name] = float(
+                            sub_layer._quant_out_scale)
+
+    def _save_output_scale(self, infer_program):
+        """
+        Save all output scales to the corresponding ops in static
+        inference program.
+
+        Because the Layer in dygraph may correspond to multiple ops
+        in static program after being saved. To ensure correctness,
+        the outscale collected for output of dygraph Layer can only
+        be set to the last op in the corresponding ops in static program.
+        """
+        assert infer_program.num_blocks == 1, \
+            "The inference program should only have a block."
+
+        global_block = infer_program.global_block()
+        target_ops = global_block.ops
+
+        scale_idx = 0
+        op_idx = 0
+        attr_name = "out_threshold"
+
+        for scale_name, scale_value in self._out_scale_dict.items():
+            while True:
+                if op_idx >= len(target_ops):
+                    break
+
+                op = target_ops[op_idx]
+                if not self._is_scale_op_matched(scale_name, op, global_block):
+                    op_idx += 1
+                else:
+                    if op.type in utils.weight_op_types \
+                        and op_idx + 1 < len(target_ops) \
+                        and target_ops[op_idx+1].type == "elementwise_add":
+                        target_ops[op_idx + 1]._set_attr(attr_name, scale_value)
+                        op_idx += 2
+                    else:
+                        op._set_attr(attr_name, scale_value)
+                        op_idx += 1
+                    scale_idx += 1
+                    break
+
+        if scale_idx != len(self._out_scale_dict):
+            _logger.warning("Warning: the model have %s output scales, "\
+                "but it only saves %s output scales." \
+                % (len(self._out_scale_dict), scale_idx))
+
     def _is_target_layer(self, layer):
-        return isinstance(layer, utils.out_scale_layers_list) \
-            or 'quantized_' in layer.full_name()
+        return isinstance(layer, tuple(utils.quant_output_layers_map.values())) \
+            or ('quantized_' in layer.full_name() and \
+                'quantized_noweight' not in layer.full_name())
 
     def _init_scale_params(self, layer, name=None):
         """
@@ -570,27 +575,39 @@ def _create_param(in_layer, first_name, last_name, dtype):
         layer._quant_out_accum = _create_param(layer, name, "accum", dtype)
         layer._quant_out_accum.stop_gradient = True
 
-    # Judge whether the op in program matches the Layer in dynamic model
-    def _is_op_matched(self, layer_name, op, block):
-        output_var_names = quantization_pass._get_op_output_var_names(op)
-        for output_var_name in output_var_names:
-            output_var_tensor = block.var(output_var_name)
-            if output_var_tensor.dtype not in [
-                    core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32
-            ]:
-                return False
-
-        # Because the naming styles of static and dynamic graph are different,
-        # in order to avoid mistakes, we unify the name here.
-        op_type = output_var_names[0].split(".")[0]
-        op_type = op_type.rsplit("_", 1)[0]
-        if op_type == 'depthwise_conv2d':
-            op_type = 'conv2d'
-        if 'prelu' in op_type:
-            op_type = op_type.replace('prelu', 'p_re_lu')
-        if 'relu' in op_type:
-            op_type = op_type.replace('relu', 're_lu')
-        return op_type in layer_name
+    def _is_scale_op_matched(self, scale_name, op, block):
+        """
+        Based on the op name and attrs to judge whether the op in
+        program matches the scale_name. We must know the corresponding
+        name between dgraph and static model.
+        """
+        fp_type = [core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32]
+        if op.type in quantization_pass._op_real_in_out_name.keys():
+            output_var_names = quantization_pass._get_op_output_var_names(op)
+            for output_var_name in output_var_names:
+                output_var_tensor = block.var(output_var_name)
+                if output_var_tensor.dtype not in fp_type:
+                    return False
+
+        # corresponding_map: [name, op_types, function]
+        # Note that, the items have priority in corresponding_map
+        corresponding_map = [
+            ['conv2d_tranpose', ['conv2d_transpose', \
+                                'depthwise_conv2d_transpose'], None],
+            ['conv2d', ['conv2d', 'depthwise_conv2d'], None],
+            ['linear', ['matmul'], None],
+            ['re_lu6', ['relu6'], None],
+            ['p_re_lu', ['prelu'], None],
+            ['leaky_re_lu', ['leaky_relu'], None],
+            ['re_lu', ['relu'], None],
+        ]
+
+        for item in corresponding_map:
+            if item[0] in scale_name:
+                return (op.type in item[1]) and \
+                    (len(item) == 2 or item[2] is None or item[2](op))
+
+        return op.type in scale_name
 
     def _set_skip_quant_attr(self, program):
         block = program.global_block()
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
index 3bf655265c6f22..090f6cda389af2 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
@@ -30,7 +30,7 @@
     "swish": [["X"], ["Out"]],
 }
 
-supported_quant_layers_map = {
+quant_input_layers_map = {
     'Conv2D': paddle.nn.Conv2D,
     'Linear': paddle.nn.Linear,
     'AdaptiveAvgPool2D': paddle.nn.AdaptiveAvgPool2D,
@@ -58,8 +58,30 @@
     "fake_quantize_dequantize_moving_average_abs_max"
 ]
 
-out_scale_layers_list = (
-    paddle.nn.Conv2D, paddle.nn.Linear, paddle.nn.MaxPool2D,
-    paddle.nn.BatchNorm, paddle.nn.BatchNorm2D, paddle.nn.SyncBatchNorm,
-    paddle.nn.LeakyReLU, paddle.nn.PReLU, paddle.nn.ReLU, paddle.nn.ReLU6,
-    paddle.nn.Sigmoid, paddle.nn.Softmax, paddle.nn.Tanh, paddle.nn.Swish)
+quant_output_layers_map = {
+    'Conv2D': paddle.nn.Conv2D,
+    'Conv2DTranspose': paddle.nn.Conv2DTranspose,
+    'Linear': paddle.nn.Linear,
+    'AdaptiveAvgPool2D': paddle.nn.AdaptiveAvgPool2D,
+    'AdaptiveMaxPool2D': paddle.nn.AdaptiveMaxPool2D,
+    'AvgPool2D': paddle.nn.AvgPool2D,
+    'MaxPool2D': paddle.nn.MaxPool2D,
+    'BatchNorm': paddle.nn.BatchNorm,
+    'BatchNorm2D': paddle.nn.BatchNorm2D,
+    'SyncBatchNorm': paddle.nn.SyncBatchNorm,
+    'ELU': paddle.nn.ELU,
+    'GELU': paddle.nn.GELU,
+    'LeakyReLU': paddle.nn.LeakyReLU,
+    'PReLU': paddle.nn.PReLU,
+    'ReLU': paddle.nn.ReLU,
+    'ReLU6': paddle.nn.ReLU6,
+    'Sigmoid': paddle.nn.Sigmoid,
+    'Softmax': paddle.nn.Softmax,
+    'Tanh': paddle.nn.Tanh,
+    'Swish': paddle.nn.Swish,
+}
+
+weight_op_types = [
+    "conv2d", "depthwise_conv2d", "matmul", "conv2d_transpose",
+    "depthwise_conv2d_transpose"
+]
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py
index ed29375d22bb9f..600174e503feb2 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py
@@ -33,7 +33,6 @@
 from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
 from paddle.nn.layer import ReLU, LeakyReLU, Sigmoid, Softmax, PReLU
 from paddle.nn import Linear, Conv2D, Softmax, BatchNorm2D, MaxPool2D
-from paddle.fluid.dygraph.nn import Pool2D
 from paddle.fluid.log_helper import get_logger
 from paddle.fluid.dygraph import nn
 
@@ -131,8 +130,8 @@ def __init__(self, num_classes=10):
                 bias_attr=False),
             BatchNorm2D(6),
             ReLU(),
-            Pool2D(
-                pool_size=2, pool_type='max', pool_stride=2),
+            MaxPool2D(
+                kernel_size=2, stride=2),
             Conv2D(
                 in_channels=6,
                 out_channels=16,
@@ -357,7 +356,6 @@ def _build_static_lenet(main, startup, is_test=False, seed=1000):
                     "diff({}) at {}, dynamic loss = {}, static loss = {}".
                     format(diff, i, loss_d, loss_s))
                 break
-
         self.assertTrue(
             np.allclose(
                 np.array(dynamic_loss_rec),
@@ -398,10 +396,15 @@ def _build_static_lenet(main, startup, is_test=False, seed=1000):
             if dynamic_ops[i].has_attr("out_threshold"):
                 op_count += 1
                 self.assertTrue(dynamic_ops[i].type == static_ops[i].type)
+                if dynamic_ops[i].attr("out_threshold") != static_ops[i].attr(
+                        "out_threshold"):
+                    _logger.info(dynamic_ops[i].attr("out_threshold"))
+                    _logger.info(static_ops[i].attr("out_threshold"))
                 self.assertTrue(dynamic_ops[i].attr("out_threshold") ==
                                 static_ops[i].attr("out_threshold"))
 
-        self.assertTrue(op_count == 13)
+        _logger.info("op_cout: {}".format(op_count))
+        self.assertTrue(op_count == 14)
 
 
 class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase):
@@ -470,7 +473,9 @@ def test_save_quantized_model(self):
                 self.assertTrue(dynamic_ops[i].type == static_ops[i].type)
                 self.assertTrue(dynamic_ops[i].attr("out_threshold") ==
                                 static_ops[i].attr("out_threshold"))
-        self.assertTrue(op_count == 13)
+
+        _logger.info("op_cout: {}".format(op_count))
+        self.assertTrue(op_count == 14)
 
 
 class TestSaveQuantizedModel_Warning(unittest.TestCase):
@@ -490,8 +495,10 @@ def test_warning(self):
                         shape=[None, 1, 28, 28], dtype='float32')
                 ])
 
-        warning_message = "Warning: No Layer of the model while to be saved contains the out_threshold attribute, " \
-                "so the generated inference model would not contain the out_threshold."
+        warning_message = "Warning: No Layer of the model while to be " \
+                          "saved contains the out_threshold attribute, so the " \
+                          "generated inference model would not contain the " \
+                          "out_threshold."
         num = get_vaild_warning_num(warning_message, w)
         assert num == 1
 

From f2cfc0f46d8b47f743320b8037d6f309a097d294 Mon Sep 17 00:00:00 2001
From: Aurelius84 <zhangliujie@baidu.com>
Date: Wed, 24 Mar 2021 15:24:46 +0800
Subject: [PATCH 04/12] [CustomOp]Avoid raising warning while import paddle
 (#31804)

---
 python/paddle/utils/cpp_extension/cpp_extension.py   | 6 +++---
 python/paddle/utils/cpp_extension/extension_utils.py | 6 ------
 2 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/python/paddle/utils/cpp_extension/cpp_extension.py b/python/paddle/utils/cpp_extension/cpp_extension.py
index d84ae67fff8d67..ea4c85e20db764 100644
--- a/python/paddle/utils/cpp_extension/cpp_extension.py
+++ b/python/paddle/utils/cpp_extension/cpp_extension.py
@@ -400,14 +400,14 @@ def unix_custom_single_compiler(obj, src, ext, cc_args, extra_postargs,
                 # ncvv compile CUDA source
                 if is_cuda_file(src):
                     if core.is_compiled_with_rocm():
-                        assert ROCM_HOME is not None
+                        assert ROCM_HOME is not None, "Not found ROCM runtime, please use `export ROCM_PATH= XXX` to specific it."
                         hipcc_cmd = os.path.join(ROCM_HOME, 'bin', 'hipcc')
                         self.compiler.set_executable('compiler_so', hipcc_cmd)
                         # {'nvcc': {}, 'cxx: {}}
                         if isinstance(cflags, dict):
                             cflags = cflags['hipcc']
                     else:
-                        assert CUDA_HOME is not None
+                        assert CUDA_HOME is not None, "Not found CUDA runtime, please use `export CUDA_HOME= XXX` to specific it."
                         nvcc_cmd = os.path.join(CUDA_HOME, 'bin', 'nvcc')
                         self.compiler.set_executable('compiler_so', nvcc_cmd)
                         # {'nvcc': {}, 'cxx: {}}
@@ -470,7 +470,7 @@ def win_custom_spawn(cmd):
                 src = src_list[0]
                 obj = obj_list[0]
                 if is_cuda_file(src):
-                    assert CUDA_HOME is not None
+                    assert CUDA_HOME is not None, "Not found CUDA runtime, please use `export CUDA_HOME= XXX` to specific it."
                     nvcc_cmd = os.path.join(CUDA_HOME, 'bin', 'nvcc')
                     if isinstance(self.cflags, dict):
                         cflags = self.cflags['nvcc']
diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py
index 1ff42a7bcbc0d1..7d6bcc4d564c93 100644
--- a/python/paddle/utils/cpp_extension/extension_utils.py
+++ b/python/paddle/utils/cpp_extension/extension_utils.py
@@ -461,9 +461,6 @@ def find_cuda_home():
     if cuda_home and not os.path.exists(
             cuda_home) and core.is_compiled_with_cuda():
         cuda_home = None
-        warnings.warn(
-            "Not found CUDA runtime, please use `export CUDA_HOME= XXX` to specific it."
-        )
 
     return cuda_home
 
@@ -494,9 +491,6 @@ def find_rocm_home():
     if rocm_home and not os.path.exists(
             rocm_home) and core.is_compiled_with_rocm():
         rocm_home = None
-        warnings.warn(
-            "Not found ROCM runtime, please use `export ROCM_PATH= XXX` to specific it."
-        )
 
     return rocm_home
 

From e5f7a834d4200ad9d7e8b748d2d96fc7faeb0e63 Mon Sep 17 00:00:00 2001
From: Wojciech Uss <wojciech.uss@intel.com>
Date: Wed, 24 Mar 2021 08:41:47 +0100
Subject: [PATCH 05/12] fix cache key in concat oneDNN kernel (#31820)

* fix cache key in concat oneDNN kernel

* key simplified
---
 .../operators/mkldnn/concat_mkldnn_op.cc      | 20 +++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
index 4beb7ad017851b..df1b5af121da93 100644
--- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
@@ -71,6 +71,15 @@ static const std::vector<const Tensor*> ReduceMultiInput(
   return reduced;
 }
 
+static const std::vector<int> GetDimsForKey(
+    const std::vector<const Tensor*>& inputs) {
+  auto dims_key = paddle::framework::vectorize<int>(inputs[0]->dims());
+  for (auto it = std::next(inputs.begin()); it != inputs.end(); ++it) {
+    dims_key.push_back((*it)->dims()[0]);
+  }
+  return dims_key;
+}
+
 template <typename T>
 class ConcatPrimitiveFactory {
  public:
@@ -134,6 +143,8 @@ template <typename T>
 class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
  public:
   void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    // If any of the multiple inputs of concat has an input size of 0, the
+    // actual size of the multi_input will change
     auto multi_input = ReduceMultiInput(ctx.MultiInput<Tensor>("X"));
     EnforceLayouts(multi_input);
     Tensor* output = ctx.Output<Tensor>("Out");
@@ -156,12 +167,9 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         paddle::framework::ToMKLDNNDataType(multi_input[0]->type());
 
     ConcatPrimitiveFactory<T> prim_creator;
-    // If one of the multiple inputs of concat has an input size of 0, the
-    // actual size of the multi_input will change
-    std::string key = platform::CreateKey(
-        dev_ctx, paddle::framework::vectorize<int>(multi_input[0]->dims()),
-        multi_input.size(), ctx.OutputName("Out"), dt,
-        platform::ThreadIDasStr());
+    std::string key =
+        platform::CreateKey(dev_ctx, GetDimsForKey(multi_input),
+                            multi_input.size(), ctx.OutputName("Out"), dt);
     key = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key);
 
     const std::string key_prim = key + "@concat_p";

From 649868ffb262bdba89741eca93e7c7cb8632b9e2 Mon Sep 17 00:00:00 2001
From: Huihuang Zheng <zhhsplendid@gmail.com>
Date: Wed, 24 Mar 2021 16:37:16 +0800
Subject: [PATCH 06/12] [Dy2stat] Fix the bug that loop_body_func may return
 single element (#31806)

Our old `loop_body` function may return single element when `loop_vars` just contains only 1 element, which can cause bug. The key point of this PR is forcing `loop_body` functions always return tuple.
---
 .../dygraph_to_static/loop_transformer.py     |  2 +-
 .../fluid/dygraph/dygraph_to_static/utils.py  | 12 ++++++--
 .../dygraph_to_static/test_for_enumerate.py   | 29 +++++++++++++++++--
 3 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py
index b7ef000938a151..bd89a79c805c98 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py
@@ -594,7 +594,7 @@ def get_for_stmt_nodes(self, node):
         # append return values for loop body
         body_stmts.append(
             gast.Return(value=generate_name_node(
-                loop_var_names, ctx=gast.Load())))
+                loop_var_names, ctx=gast.Load(), gen_tuple_if_single=True)))
         body_func_node = gast.FunctionDef(
             name=unique_name.generate(FOR_BODY_PREFIX),
             args=gast.arguments(
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
index 1071fc1350bfeb..624ca085ac6c2d 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
@@ -381,9 +381,15 @@ def get_attribute_full_name(node):
     return astor.to_source(gast.gast_to_ast(node)).strip()
 
 
-def generate_name_node(name_ids, ctx=gast.Load()):
+def generate_name_node(name_ids, ctx=gast.Load(), gen_tuple_if_single=False):
     """
-    Generate list or gast.Tuple of ast.Name for Return statement.
+    If name_ids is list or tuple or set with multiple strings, this function
+    generates gast.Tuple of gast.Name.
+    If the name_ids is single string or contains only 1 string, this function
+    returns gast.Name if gen_tuple_if_single==False else returns gast.Tuple
+    with only one gast.Name
+
+    This function is used at several gast.Return statements.
     """
     if isinstance(name_ids, six.string_types):
         name_ids = [name_ids]
@@ -395,7 +401,7 @@ def generate_name_node(name_ids, ctx=gast.Load()):
             id=name_id, ctx=ctx, annotation=None, type_comment=None)
         for name_id in name_ids
     ]
-    if len(gast_names) == 1:
+    if len(gast_names) == 1 and not gen_tuple_if_single:
         name_node = gast_names[0]
     else:
         name_node = gast.Tuple(elts=gast_names, ctx=ctx)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py
index c28997c5c1c673..517cff39a276f4 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py
@@ -233,6 +233,7 @@ def for_iter_var_idx(x_array):
     return z
 
 
+# 17. for a,b,c in z: (a, b, c) is a tuple
 @paddle.jit.to_static
 def for_tuple_as_iter_var(x_array):
     x = paddle.to_tensor(x_array)
@@ -250,6 +251,7 @@ def for_tuple_as_iter_var(x_array):
     return a_result, b_result, c_result
 
 
+# 18. for t in enumerate(collection): t is tuple of (idx, element)
 @paddle.jit.to_static
 def for_tuple_as_enumerate_iter(x_array):
     x = paddle.to_tensor(x_array)
@@ -263,6 +265,7 @@ def for_tuple_as_enumerate_iter(x_array):
     return a_result
 
 
+# 19. for i, (a, b, c, d, e) in enumerate(collection): (a, b, c, d, e) is a tuple
 @paddle.jit.to_static
 def for_tuple_as_enumerate_value(x_array):
     x = paddle.to_tensor(x_array)
@@ -284,6 +287,23 @@ def for_tuple_as_enumerate_value(x_array):
     return a_result
 
 
+# 20. test for function in a class
+class ForwardContainsForLayer(paddle.nn.Layer):
+    def __init__(self):
+        super(ForwardContainsForLayer, self).__init__()
+        self.high = 5
+        self.low = 3
+
+    @paddle.jit.to_static
+    def forward(self, x):
+        # just for test case, x is useless in this method
+        y = paddle.zeros([10, 2, 3])
+        z = []
+        for i in range(self.high - self.low):
+            z.append(y[i].clone())
+        return z
+
+
 class TestTransformBase(unittest.TestCase):
     def setUp(self):
         self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
@@ -313,11 +333,11 @@ def get_static_output(self):
 class TestTransform(TestTransformBase):
     def transformed_result_compare(self):
         dy_outs = self.get_dygraph_output()
-        if not isinstance(dy_outs, tuple):
+        if not isinstance(dy_outs, (tuple, list)):
             dy_outs = (dy_outs, )
 
         st_outs = self.get_static_output()
-        if not isinstance(st_outs, tuple):
+        if not isinstance(st_outs, (tuple, list)):
             st_outs = (st_outs, )
 
         for x, y in zip(dy_outs, st_outs):
@@ -446,5 +466,10 @@ def set_test_func(self):
         self.dygraph_func = for_tuple_as_enumerate_value
 
 
+class TestForwardContainsForLayer(TestForIterVarNumpy):
+    def set_test_func(self):
+        self.dygraph_func = ForwardContainsForLayer()
+
+
 if __name__ == '__main__':
     unittest.main()

From 5d89ec36dc36c3b09a3972db326a2d41c4a330a5 Mon Sep 17 00:00:00 2001
From: parap1uie-s <parap1uie-s@users.noreply.github.com>
Date: Wed, 24 Mar 2021 17:25:00 +0800
Subject: [PATCH 07/12] Update pooling.py (#31829)

Fix default argument of nn.MaxPool3D()
---
 python/paddle/nn/layer/pooling.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py
index 0f3c4449a3f20d..5830af3a182d4f 100755
--- a/python/paddle/nn/layer/pooling.py
+++ b/python/paddle/nn/layer/pooling.py
@@ -589,8 +589,8 @@ class MaxPool3D(layers.Layer):
 
     def __init__(self,
                  kernel_size,
-                 stride,
-                 padding,
+                 stride=None,
+                 padding=0,
                  return_mask=False,
                  ceil_mode=False,
                  data_format="NCDHW",

From e7f28d6c0db54eb9c9a810612300b526687e56a6 Mon Sep 17 00:00:00 2001
From: winter-wang <78149749+winter-wang@users.noreply.github.com>
Date: Wed, 24 Mar 2021 18:19:51 +0800
Subject: [PATCH 08/12] fix runtime crash when rnn model inference,
 test=develop (#31833)

---
 .../analysis/passes/memory_optimize_pass.cc   |  1 +
 paddle/fluid/operators/recurrent_op.cc        | 25 +++++++++----------
 python/paddle/nn/functional/norm.py           |  9 ++++---
 3 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
index 5e6960c4c7e8c0..fdfd2c60af0c16 100644
--- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
@@ -103,6 +103,7 @@ void MemoryOptimizePass::CollectVarMemorySize(
                                         "merge_lod_tensor",
                                         "equal",
                                         "sequence_pool",
+                                        "recurrent",
                                         "lod_reset"};
     for (auto* tmp : node->inputs) {
       CHECK(tmp->IsOp());
diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc
index 9766008963be00..92e5e4a0cd120f 100644
--- a/paddle/fluid/operators/recurrent_op.cc
+++ b/paddle/fluid/operators/recurrent_op.cc
@@ -210,9 +210,10 @@ void RecurrentOp::RunImpl(const framework::Scope &scope,
   auto *block = Attr<framework::BlockDesc *>(kStepBlock);
 
   auto *program = block->Program();
-  auto ctx = executor.Prepare(
-      *program, block->ID(), Attr<std::vector<std::string>>(
-                                 kSkipEagerDeletionVars) /*skip_ref_cnt_vars*/);
+  auto ctx = executor.Prepare(*program, block->ID(),
+                              Attr<std::vector<std::string>>(
+                                  kSkipEagerDeletionVars), /*skip_ref_cnt_vars*/
+                              true);
 
   static std::mutex mutex;
   std::lock_guard<std::mutex> lock(mutex);
@@ -255,16 +256,6 @@ void RecurrentOp::RunImpl(const framework::Scope &scope,
     // Link inside::output -> outside::output
     //   outside::output[seq_offset: seq_offset + 1] = inside::output
     executor.CreateVariables(ctx->prog_, &cur_scope, ctx->block_id_);
-    if (i > 0) {
-      LinkTensorWithCallback(scope, Outputs(kOutputs), cur_scope,
-                             Outputs(kOutputs),
-                             [&](const framework::LoDTensor &src_tensor,
-                                 framework::LoDTensor *dst_tensor) {
-                               framework::Tensor src_slice =
-                                   src_tensor.Slice(seq_offset, seq_offset + 1);
-                               dst_tensor->ShareDataWith(src_slice);
-                             });
-    }
 
     // Linked now, execute!
     executor.RunPreparedContext(ctx.get(), &cur_scope,
@@ -284,6 +275,14 @@ void RecurrentOp::RunImpl(const framework::Scope &scope,
             // early.
             framework::TensorCopy(src_tensor, place, dev_ctx, &dst_out);
           });
+    } else {
+      LinkTensorWithCallback(
+          cur_scope, Outputs(kOutputs), scope, Outputs(kOutputs),
+          [&](const framework::LoDTensor &src_tensor,
+              framework::LoDTensor *dst_tensor) {
+            auto dst_out = dst_tensor->Slice(seq_offset, seq_offset + 1);
+            framework::TensorCopy(src_tensor, place, dev_ctx, &dst_out);
+          });
     }
 
     scopes.ForwardNext();
diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py
index 03ba78e12f6376..54824233f70762 100644
--- a/python/paddle/nn/functional/norm.py
+++ b/python/paddle/nn/functional/norm.py
@@ -188,10 +188,10 @@ def batch_norm(x,
 
     if in_dygraph_mode():
         # for dygraph need tuple
-        attrs = ("momentum", momentum, "epsilon", epsilon, "data_layout",
-                 data_format, "use_mkldnn", False, "fuse_with_relu", False,
-                 "use_global_stats", use_global_stats, "trainable_statistics",
-                 trainable_statistics)
+        attrs = ("momentum", momentum, "epsilon", epsilon, "is_test",
+                 not training, "data_layout", data_format, "use_mkldnn", False,
+                 "fuse_with_relu", False, "use_global_stats", use_global_stats,
+                 "trainable_statistics", trainable_statistics)
         batch_norm_out, _, _, _, _, _ = core.ops.batch_norm(
             x, weight, bias, running_mean, running_var, mean_out, variance_out,
             *attrs)
@@ -205,6 +205,7 @@ def batch_norm(x,
     attrs = {
         "momentum": momentum,
         "epsilon": epsilon,
+        "is_test": not training,
         "data_layout": data_format,
         "use_mkldnn": False,
         "fuse_with_relu": False,

From 6472d62093c49e76cfcc5fc93224a4be4b1f063b Mon Sep 17 00:00:00 2001
From: niuliling123 <51102941+niuliling123@users.noreply.github.com>
Date: Thu, 25 Mar 2021 08:57:24 +0800
Subject: [PATCH 09/12] Revert "add relu forward kernel and backward kernel
 (#31613)" (#31853)

---
 paddle/fluid/operators/activation_op.cu | 284 +-----------------------
 1 file changed, 1 insertion(+), 283 deletions(-)

diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index 29498da0f026f5..2033081af224a4 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -10,276 +10,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/activation_op.h"
-#include "paddle/fluid/operators/math/math_cuda_utils.h"
-#include "paddle/fluid/platform/cuda_device_function.h"
 #include "paddle/fluid/platform/float16.h"
 
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using float16 = paddle::platform::float16;
-
-template <typename T>
-struct CudaVecType {
-  using type = T;
-  static constexpr int vecsize = 1;
-};
-
-template <>
-struct CudaVecType<platform::float16> {
-  using type = __half2;
-  static constexpr int vecsize = 2;
-};
-
-template <>
-struct CudaVecType<float> {
-  using type = float4;
-  static constexpr int vecsize = 4;
-};
-
-template <typename T>
-class BaseGPUFunctor {
- public:
-  using ELEMENT_TYPE = T;
-};
-
-/* ========================================================================== */
-
-/* ===========================    relu forward   ============================ */
-template <typename T>
-class ReluGPUFuctor : public BaseGPUFunctor<T> {
- private:
-  T zero_;
-
- public:
-  ReluGPUFuctor() { zero_ = static_cast<T>(0.0f); }
-
-  // for relu forward when T is double
-  __device__ __forceinline__ typename CudaVecType<T>::type Compute(
-      const typename CudaVecType<T>::type* x);
-
-  // when num % vecsize != 0 this func will be used
-  __device__ __forceinline__ T ComputeRemainder(const T x) {
-    return x > zero_ ? x : zero_;
-  }
-};
-
-template <>
-__device__ __forceinline__ CudaVecType<double>::type
-ReluGPUFuctor<double>::Compute(const CudaVecType<double>::type* x) {
-// relu forward : out = max(x, 0)
-#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300
-  return __ldg(x) > zero_ ? __ldg(x) : zero_;
-#else
-  return (*x) > zero_ ? (*x) : zero_;
-#endif
-}
-
-template <>
-__device__ __forceinline__ CudaVecType<float>::type
-ReluGPUFuctor<float>::Compute(const CudaVecType<float>::type* xx) {
-  // relu forward : out = max(xx, 0)
-  return make_float4((xx->x > zero_) * (xx->x), (xx->y > zero_) * (xx->y),
-                     (xx->z > zero_) * (xx->z), (xx->w > zero_) * (xx->w));
-}
-
-template <>
-__device__ __forceinline__ CudaVecType<float16>::type
-ReluGPUFuctor<float16>::Compute(const CudaVecType<float16>::type* in) {
-// relu forward : out = max(in, 0)
-#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300
-  const half2 kzero = __float2half2_rn(0.0f);
-  return __hmul2(__hgt2(__ldg(in), kzero), __ldg(in));
-#else
-  const float2 xx = __half22float2(*in);
-  return __floats2half2_rn((xx.x > 0.0f) * static_cast<float>(xx.x),
-                           (xx.y > 0.0f) * static_cast<float>(xx.y));
-#endif
-}
-/* ========================================================================== */
-
-/* ===========================    relu backward   ============================
- */
-
-template <typename T>
-class ReluGradGPUFunctor : public BaseGPUFunctor<T> {
- private:
-  T zero_;
-
- public:
-  ReluGradGPUFunctor() { zero_ = static_cast<T>(0.0f); }
-
-  // for relu backward when T is double
-  __device__ __forceinline__ typename CudaVecType<T>::type Compute(
-      const typename CudaVecType<T>::type* out,
-      const typename CudaVecType<T>::type* dout);
-
-  // when num % vecsize != 0 this func will be used
-  __device__ __forceinline__ T ComputeRemainder(const T out, const T dout) {
-    // relu backward : dx = out > 0 ? dout : 0;
-    return out > zero_ ? dout : zero_;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
-};
-
-template <>
-__device__ __forceinline__ CudaVecType<double>::type
-ReluGradGPUFunctor<double>::Compute(const CudaVecType<double>::type* out,
-                                    const CudaVecType<double>::type* dout) {
-// relu backward : dx = out > 0 ? dout : 0;
-#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300
-  return __ldg(out) > zero_ ? __ldg(dout) : zero_;
-#else
-  return (*out) > zero_ ? (*dout) : zero_;
-#endif
-}
-
-template <>
-__device__ __forceinline__ CudaVecType<float>::type
-ReluGradGPUFunctor<float>::Compute(const CudaVecType<float>::type* out,
-                                   const CudaVecType<float>::type* dout) {
-  // relu backward : dx = out > 0 ? dout : 0;
-  return make_float4((out->x > zero_) * (dout->x), (out->y > zero_) * (dout->y),
-                     (out->z > zero_) * (dout->z),
-                     (out->w > zero_) * (dout->w));
-}
-
-template <>
-__device__ __forceinline__ CudaVecType<float16>::type
-ReluGradGPUFunctor<float16>::Compute(const CudaVecType<float16>::type* out,
-                                     const CudaVecType<float16>::type* dout) {
-// relu backward : dx = out > 0 ? dout : 0;
-#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300
-  const half2 kzero = __float2half2_rn(0.0f);
-  return __hmul2(__hgt2(__ldg(out), kzero), __ldg(dout));
-#else
-  const float2 xx = __half22float2(*out);
-  const float2 yy = __half22float2(*dout);
-  return __floats2half2_rn((xx.x > 0.0f) * static_cast<float>(yy.x),
-                           (xx.y > 0.0f) * static_cast<float>(yy.y));
-#endif
-}
-
-/* ========================================================================== */
-
-template <typename T, typename Functor>
-__global__ void ActivationGradKernelVec(const T* forward_data, const T* dout,
-                                        T* dx, int num, Functor functor) {
-  using VecType = typename CudaVecType<T>::type;
-  constexpr int vecsize = CudaVecType<T>::vecsize;
-  int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  int stride = blockDim.x * gridDim.x;
-  int loop = num / vecsize;
-  int tail = num % vecsize;
-  const VecType* in_forward = reinterpret_cast<const VecType*>(forward_data);
-  const VecType* in_dout = reinterpret_cast<const VecType*>(dout);
-  VecType* out = reinterpret_cast<VecType*>(dx);
-
-  for (int i = idx; i < loop; i += stride) {
-    out[i] = functor.Compute((in_forward + i), (in_dout + i));
-  }
-
-  while (idx == loop && tail) {
-    dx[num - tail] =
-        functor.ComputeRemainder(forward_data[num - tail], dout[num - tail]);
-    --tail;
-  }
-}
-
-template <typename T, typename Functor>
-__global__ void ActivationkernelVec(const T* src, T* dst, int num,
-                                    Functor functor) {
-  constexpr int vecsize = CudaVecType<T>::vecsize;
-  using VecType = typename CudaVecType<T>::type;
-  int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  int stride = blockDim.x * gridDim.x;
-  int loop = num / vecsize;
-  int tail = num % vecsize;
-  const VecType* in = reinterpret_cast<const VecType*>(src);
-  VecType* out = reinterpret_cast<VecType*>(dst);
-
-  for (int i = idx; i < loop; i += stride) {
-    out[i] = functor.Compute((in + i));
-  }
-
-  while (idx == loop && tail) {
-    dst[num - tail] = functor.ComputeRemainder(src[num - tail]);
-    --tail;
-  }
-}
-
-template <typename DeviceContext, typename Functor>
-class ActivationGPUKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& context) const override {
-    const framework::Tensor* in_x = nullptr;
-    framework::Tensor* out = nullptr;
-    ExtractActivationTensor(context, &in_x, &out);
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-
-    int num = in_x->numel();
-    const T* input_data = in_x->data<T>();
-    T* output_data = out->mutable_data<T>(dev_ctx.GetPlace(),
-                                          static_cast<size_t>(num * sizeof(T)));
-
-    int block = 512;
-#ifdef __HIPCC__
-    block = 256;
-#endif
-    Functor functor;
-    constexpr int vecsize = CudaVecType<T>::vecsize;
-    int grid = max((num / vecsize + block - 1) / block, 1);
-    ActivationkernelVec<T, Functor><<<grid, block>>>(input_data, output_data,
-                                                     num, functor);
-  }
-};
-
-template <typename DeviceContext, typename Functor>
-class ActivationGradGPUKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& context) const override {
-    const framework::Tensor *x, *out, *d_out;
-    framework::Tensor* d_x = nullptr;
-    x = out = d_out = nullptr;
-    ExtractActivationGradTensor<Functor::FwdDeps()>(context, &x, &out, &d_out,
-                                                    &d_x);
-    int numel = d_out->numel();
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    auto* dx_data = d_x->mutable_data<T>(
-        dev_ctx.GetPlace(), static_cast<size_t>(numel * sizeof(T)));
-    auto* dout_data = d_out->data<T>();
-
-    auto* forward_data = dout_data;
-    if (static_cast<int>(Functor::FwdDeps()) == static_cast<int>(kDepOut)) {
-      // Only need forward output Out
-      forward_data = out->data<T>();
-    } else if (static_cast<int>(Functor::FwdDeps()) ==
-               static_cast<int>(kDepX)) {
-      // Only need forward input X
-      forward_data = x->data<T>();
-    }
-
-    int block = 512;
-#ifdef __HIPCC__
-    block = 256;
-#endif
-    Functor functor;
-    constexpr int vecsize = CudaVecType<T>::vecsize;
-    int grid = max((numel / vecsize + block - 1) / block, 1);
-    ActivationGradKernelVec<T, Functor><<<grid, block>>>(
-        forward_data, dout_data, dx_data, numel, functor);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
@@ -328,21 +60,7 @@ REGISTER_OP_CUDA_KERNEL(
 /* ========================================================================== */
 
 /* ===========================    relu register  ============================ */
-REGISTER_OP_CUDA_KERNEL(
-    relu, ops::ActivationGPUKernel<paddle::platform::CUDADeviceContext,
-                                   ops::ReluGPUFuctor<float>>,
-    ops::ActivationGPUKernel<paddle::platform::CUDADeviceContext,
-                             ops::ReluGPUFuctor<double>>,
-    ops::ActivationGPUKernel<plat::CUDADeviceContext,
-                             ops::ReluGPUFuctor<plat::float16>>);
-
-REGISTER_OP_CUDA_KERNEL(
-    relu_grad, ops::ActivationGradGPUKernel<paddle::platform::CUDADeviceContext,
-                                            ops::ReluGradGPUFunctor<float>>,
-    ops::ActivationGradGPUKernel<paddle::platform::CUDADeviceContext,
-                                 ops::ReluGradGPUFunctor<double>>,
-    ops::ActivationGradGPUKernel<plat::CUDADeviceContext,
-                                 ops::ReluGradGPUFunctor<plat::float16>>);
+REGISTER_ACTIVATION_CUDA_KERNEL(relu, Relu, ReluCUDAFunctor, ReluGradFunctor);
 
 REGISTER_OP_CUDA_KERNEL(
     relu_grad_grad,

From 511e204e620f3c6e3df2018746c52c5bf2386a59 Mon Sep 17 00:00:00 2001
From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com>
Date: Thu, 25 Mar 2021 11:24:01 +0800
Subject: [PATCH 10/12] LRScheduler.get_lr should not update lr in LinearWarmup
 (#31843)

---
 .../fluid/tests/unittests/test_lr_scheduler.py       | 12 ++++++++++++
 python/paddle/optimizer/lr.py                        |  5 ++---
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py
index 8c6383cd6ef523..04a0d47e47c86b 100644
--- a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py
+++ b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py
@@ -537,6 +537,18 @@ def test_scheduler(self):
                 self._test_dygraph(python_func, paddle_api, kwarg, place)
                 paddle.enable_static()
 
+    def test_linear_warmp(self):
+        natural_lr = paddle.optimizer.lr.NaturalExpDecay(
+            learning_rate=0.5, gamma=0.1)
+        natural_lr_warmup = paddle.optimizer.lr.LinearWarmup(
+            learning_rate=natural_lr, warmup_steps=10, start_lr=0.0, end_lr=0.1)
+        for idx in range(30):
+            if idx >= 10:
+                self.assertEqual(natural_lr_warmup.get_lr(),
+                                 natural_lr.get_lr())
+                natural_lr.step()
+            natural_lr_warmup.step()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py
index 5085911ce927a3..484b4fb7246a76 100644
--- a/python/paddle/optimizer/lr.py
+++ b/python/paddle/optimizer/lr.py
@@ -786,9 +786,8 @@ def get_lr(self):
                 self.last_epoch) / float(self.warmup_steps) + self.start_lr
         else:
             if isinstance(self.learning_rate, LRScheduler):
-                lr_value = self.learning_rate()
-                self.learning_rate.step()
-                return lr_value
+                self.learning_rate.step(self.last_epoch - self.warmup_steps)
+                return self.learning_rate()
 
             return self.learning_rate
 

From 27f2d8df8e48847f62e31e627ee25ac2102f27fc Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Thu, 25 Mar 2021 11:36:16 +0800
Subject: [PATCH 11/12] Polish two error messages (#31852)

* polish two error messages

* polish details
---
 paddle/fluid/operators/detection/polygon_box_transform_op.cu | 3 ++-
 paddle/fluid/operators/matmul_op.cc                          | 2 +-
 paddle/fluid/operators/nll_loss_op.h                         | 5 ++++-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cu b/paddle/fluid/operators/detection/polygon_box_transform_op.cu
index 337a76f9f976f8..5977a434a6023f 100644
--- a/paddle/fluid/operators/detection/polygon_box_transform_op.cu
+++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cu
@@ -45,7 +45,8 @@ class PolygonBoxTransformOpCUDAKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE_EQ(
         platform::is_gpu_place(ctx.GetPlace()), true,
-        platform::errors::InvalidArgument("It must use CUDAPlace."));
+        platform::errors::InvalidArgument(
+            "The polygon_box_transform operator needs to be executed on GPU."));
     auto* in = ctx.Input<Tensor>("Input");
     auto in_dims = in->dims();
     const T* in_data = in->data<T>();
diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc
index 9b64e99c944725..c12aecc9ba5160 100644
--- a/paddle/fluid/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
@@ -587,7 +587,7 @@ class MatMulOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(mat_dim_x.width_, mat_dim_y.height_,
                       platform::errors::InvalidArgument(
                           "Input X's width should be equal to the Y's height, "
-                          "but received X's shape: [%s],"
+                          "but received X's shape: [%s], "
                           "Y's shape: [%s].",
                           dim_x, dim_y));
 #endif
diff --git a/paddle/fluid/operators/nll_loss_op.h b/paddle/fluid/operators/nll_loss_op.h
index e93d5792205900..be6f4422d4ac6a 100644
--- a/paddle/fluid/operators/nll_loss_op.h
+++ b/paddle/fluid/operators/nll_loss_op.h
@@ -36,7 +36,10 @@ static void nll_loss_1D(T* out_data, T* total_weight_data, const T* x_data,
       }
       PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true,
                         platform::errors::InvalidArgument(
-                            "label should not be out of bounds."));
+                            "Label value is out of range. "
+                            "Expected label value in range of [0, %d), but "
+                            "received value is %d.",
+                            n_classes, cur_label));
 
       const auto cur_weight =
           weight_data ? weight_data[cur_label] : static_cast<T>(1);

From bf09dcb346c9aa4c20fbfaf520ab781d4f640346 Mon Sep 17 00:00:00 2001
From: Kaipeng Deng <dengkaipeng@baidu.com>
Date: Thu, 25 Mar 2021 14:08:22 +0800
Subject: [PATCH 12/12] add GPU tensor notice & update
 default_collate_fn/default_convert_fn. test=develop (#31763)

---
 python/paddle/fluid/dataloader/collate.py | 47 +++++++++++++++++------
 python/paddle/fluid/reader.py             |  6 +++
 2 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/python/paddle/fluid/dataloader/collate.py b/python/paddle/fluid/dataloader/collate.py
index ddc010d04280c8..8e90b308b393ed 100644
--- a/python/paddle/fluid/dataloader/collate.py
+++ b/python/paddle/fluid/dataloader/collate.py
@@ -27,24 +27,31 @@
 def default_collate_fn(batch):
     """
     Default batch collating function for :code:`paddle.io.DataLoader`,
-    batch should be a list of samples, and each sample should be a list
-    of fields as follows:
+    get input data as a list of sample datas, each element in list
+    if the data of a sample, and sample data should composed of list,
+    dictionary, string, number, numpy array and paddle.Tensor, this
+    function will parse input data recursively and stack number,
+    numpy array and paddle.Tensor datas as batch datas. e.g. for
+    following input data:
+
+    [{'image': np.array(shape=[3, 224, 224]), 'label': 1},
+     {'image': np.array(shape=[3, 224, 224]), 'label': 3},
+     {'image': np.array(shape=[3, 224, 224]), 'label': 4},
+     {'image': np.array(shape=[3, 224, 224]), 'label': 5},]
     
-    [[filed1, filed2, ...], [filed1, filed2, ...], ...]
     
-    This default collate function zipped each filed together and stack
-    each filed as the batch field as follows:
+    This default collate function zipped each number and numpy array
+    field together and stack each field as the batch field as follows:
+
+    {'image': np.array(shape=[4, 3, 224, 224]), 'label': np.array([1, 3, 4, 5])}
 
-    [batch_filed1, batch_filed2, ...]
 
     Args:  
-        batch(list of list of numpy array|paddle.Tensor): the batch data, each fields
-              should be a numpy array, each sample should be a list of
-              fileds, and batch should be a list of sample.
+        batch(list of sample data): batch should be a list of sample data.
     
     Returns:
-        a list of numpy array|Paddle.Tensor: collated batch of input batch data,
-            fields data type as same as fields in each sample.
+        Batched data: batched each number, numpy array and paddle.Tensor
+                      in input data.
     """
     sample = batch[0]
     if isinstance(sample, np.ndarray):
@@ -75,6 +82,24 @@ def default_collate_fn(batch):
 
 
 def default_convert_fn(batch):
+    """
+    Default batch converting function for :code:`paddle.io.DataLoader`.
+    get input data as a list of sample datas, each element in list
+    if the data of a sample, and sample data should composed of list,
+    dictionary, string, number, numpy array and paddle.Tensor.
+
+    .. note::
+        This function is default :attr:`collate_fn` in **Distable
+        automatic batching** mode, for **Distable automatic batching**
+        mode, please ses :attr:`paddle.io.DataLoader`
+
+    Args:  
+        batch(list of sample data): batch should be a list of sample data.
+    
+    Returns:
+        Batched data: batched each number, numpy array and paddle.Tensor
+                      in input data.
+    """
     if isinstance(batch, (paddle.Tensor, np.ndarray)):
         return batch
     elif isinstance(batch, (str, bytes)):
diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py
index be196b73edd698..9f2b2127aa7043 100644
--- a/python/paddle/fluid/reader.py
+++ b/python/paddle/fluid/reader.py
@@ -165,6 +165,12 @@ class DataLoader(object):
 
     For :code:`batch_sampler` please see :code:`paddle.io.BatchSampler`
 
+    .. note::
+        GPU tensor operation is not supported in subprocess currently,
+        please don't use GPU tensor operations in pipeline which will
+        be performed in subprocess, such as dataset transforms, collte_fn,
+        etc. Numpy array and CPU tensor operation is supported.
+
     **Disable automatic batching**
 
     In certain cases such as some NLP tasks, instead of automatic batching,