From f4f31f2396a7e6b168ece6527187da2be8911442 Mon Sep 17 00:00:00 2001
From: XGZhang <46363693+XGZhang11@users.noreply.github.com>
Date: Mon, 2 Aug 2021 08:35:52 +0000
Subject: [PATCH 01/16] support quantization of conv2d_transpose

---
 .../slim/quantization/imperative/qat.py       | 62 ++++++++-----
 .../slim/quantization/imperative/utils.py     | 18 +++-
 python/paddle/nn/quant/quant_layers.py        | 87 +++++++++++++++++++
 3 files changed, 140 insertions(+), 27 deletions(-)

diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
index b8c0e47e9bbc26..32a3ebfe047030 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
@@ -42,17 +42,18 @@ class ImperativeQuantAware(object):
     Applying quantization aware training (QAT) to the dgraph model.
     """
 
-    def __init__(self,
-                 quantizable_layer_type=['Conv2D', 'Linear'],
-                 weight_quantize_type='abs_max',
-                 activation_quantize_type='moving_average_abs_max',
-                 weight_bits=8,
-                 activation_bits=8,
-                 moving_rate=0.9,
-                 weight_preprocess_layer=None,
-                 act_preprocess_layer=None,
-                 weight_quantize_layer=None,
-                 act_quantize_layer=None):
+    def __init__(
+            self,
+            quantizable_layer_type=['Conv2D', 'Linear', 'Conv2DTranspose'],
+            weight_quantize_type='abs_max',
+            activation_quantize_type='moving_average_abs_max',
+            weight_bits=8,
+            activation_bits=8,
+            moving_rate=0.9,
+            weight_preprocess_layer=None,
+            act_preprocess_layer=None,
+            weight_quantize_layer=None,
+            act_quantize_layer=None):
         """
         The constructor for ImperativeQuantAware.
 
@@ -232,17 +233,18 @@ class ImperativeQuantizeInputs(object):
     logic both for activation inputs and weight inputs.
     """
 
-    def __init__(self,
-                 quantizable_layer_type=['Conv2D', 'Linear'],
-                 weight_quantize_type='abs_max',
-                 activation_quantize_type='moving_average_abs_max',
-                 weight_bits=8,
-                 activation_bits=8,
-                 moving_rate=0.9,
-                 weight_preprocess_layer=None,
-                 act_preprocess_layer=None,
-                 weight_quantize_layer=None,
-                 act_quantize_layer=None):
+    def __init__(
+            self,
+            quantizable_layer_type=['Conv2D', 'Linear', 'Conv2DTranspose'],
+            weight_quantize_type='abs_max',
+            activation_quantize_type='moving_average_abs_max',
+            weight_bits=8,
+            activation_bits=8,
+            moving_rate=0.9,
+            weight_preprocess_layer=None,
+            act_preprocess_layer=None,
+            weight_quantize_layer=None,
+            act_quantize_layer=None):
         """
         The constructor for ImperativeQuantizeInputs. 
 
@@ -303,6 +305,18 @@ def __init__(self,
         }
 
     def apply(self, model):
+        """
+        Quantize the weights and activations to calculate for specific 
+        layers in the dygraph model.
+
+        Args:
+            model(fluid.dygraph.Layer): The target model which would
+                calculate the input quantization scale.
+
+        Returns:
+            None
+        """
+
         assert isinstance(model, dygraph.Layer), \
             "The model must be the instance of dygraph.Layer."
 
@@ -544,7 +558,9 @@ def _is_skip_quant_op(self, block, in_op):
         1. the type of input op should be conv2d, depthwise_conv2d or matmul
         2. the previous ops of the input op are not fake_quantize_dequantize ops
         """
-        target_op_types = ["conv2d", "depthwise_conv2d", "matmul"]
+        target_op_types = [
+            "conv2d", "depthwise_conv2d", "matmul", "conv2d_transpose"
+        ]
         if in_op.type not in target_op_types:
             return False
 
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
index a9d52c5a87ad36..5a98ac80549f18 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
@@ -24,6 +24,7 @@
 from ..quantization_pass import _get_input_name_index
 
 layer_name_map = {
+    'Conv2DTranspose': paddle.nn.Conv2DTranspose,
     'Conv2D': paddle.nn.Conv2D,
     'Linear': paddle.nn.Linear,
     'AdaptiveAvgPool2D': paddle.nn.AdaptiveAvgPool2D,
@@ -47,7 +48,9 @@
 
 # Apply fake quant for the inputs of these layers
 # TODO (jc): support paddle.nn.Conv2DTranspose
-fake_quant_input_layers = [paddle.nn.Conv2D, paddle.nn.Linear]
+fake_quant_input_layers = [
+    paddle.nn.Conv2D, paddle.nn.Linear, paddle.nn.Conv2DTranspose
+]
 
 # Apply fake quant for the output of these layers
 # TODO(jc): fix the problem of adding duplicate fake_quant ops
@@ -65,7 +68,8 @@
 ]
 
 fake_quant_wrap_layers = [
-    quant_layers.QuantizedConv2D, quant_layers.QuantizedLinear
+    quant_layers.QuantizedConv2D, quant_layers.QuantizedLinear,
+    quant_layers.QuantizedConv2DTranspose
 ]
 
 # The weight format of these layers is Cin * Cout * H * W 
@@ -84,9 +88,9 @@
 
 
 def load_variable_data(scope, var_name):
-    '''
+    """
     Load variable value from scope
-    '''
+    """
     var_node = scope.find_var(var_name)
     assert var_node is not None, \
         "Can not find " + var_name + " in the scope."
@@ -120,6 +124,12 @@ def find_parent_layer_and_sub_name(model, name):
     the sub_name of the layer.
     For example, if name is 'block_1/convbn_1/conv_1', the parent layer is
     'block_1/convbn_1' and the sub_name is `conv_1`.
+    Args:
+        model(fluid.dygraph.Layer): the model to be quantized.
+        name(string): the name of a layer
+
+    Returns:
+        parent_layer, subname
     """
     assert isinstance(model, paddle.nn.Layer), \
             "The model must be the instance of paddle.nn.Layer."
diff --git a/python/paddle/nn/quant/quant_layers.py b/python/paddle/nn/quant/quant_layers.py
index 5573683ebd0458..cc98f73174b4b3 100644
--- a/python/paddle/nn/quant/quant_layers.py
+++ b/python/paddle/nn/quant/quant_layers.py
@@ -31,6 +31,7 @@
     'FakeQuantMovingAverageAbsMax',
     'FakeQuantChannelWiseAbsMax',
     'QuantizedConv2D',
+    'QuantizedConv2DTranspose',
     'QuantizedLinear',
     'MovingAverageAbsMaxScale',
     'MAOutputScaleLayer',
@@ -481,6 +482,92 @@ def forward(self, input):
             data_format=self._data_format)
 
 
+class QuantizedConv2DTranspose(layers.Layer):
+    """
+    The computational logic of QuantizedConv2DTranspose is the same with Conv2DTranspose.
+    The only difference is that its inputs are all fake quantized.
+    """
+
+    def __init__(self,
+                 layer,
+                 weight_bits=8,
+                 activation_bits=8,
+                 moving_rate=0.9,
+                 weight_quantize_type='abs_max',
+                 activation_quantize_type='abs_max',
+                 weight_pre_layer=None,
+                 act_pre_layer=None,
+                 weight_quant_layer=None,
+                 act_quant_layer=None):
+        super(QuantizedConv2DTranspose, self).__init__()
+        # For Conv2DTranspose
+        self._groups = getattr(layer, '_groups')
+        self._stride = getattr(layer, '_stride')
+        self._padding = getattr(layer, '_padding')
+        self._output_padding = getattr(layer, 'output_padding')
+        self._dilation = getattr(layer, '_dilation')
+        self._data_format = getattr(layer, '_data_format')
+        self.weight = getattr(layer, 'weight')
+        self.bias = getattr(layer, 'bias')
+        # For FakeQuant
+        self._conv2d_transpose_quant_axis = 1
+        if weight_quant_layer is not None:
+            self._fake_quant_weight = weight_quant_layer()
+        else:
+            self._fake_quant_weight = _get_fake_quant_type(
+                weight_quantize_type,
+                name=self.weight.name,
+                moving_rate=moving_rate,
+                quant_bits=weight_bits,
+                dtype=self._dtype,
+                quant_on_weight=True,
+                channel_num=self.weight.shape[
+                    self._conv2d_transpose_quant_axis],
+                quant_axis=self._conv2d_transpose_quant_axis)
+        if act_quant_layer is not None:
+            self._fake_quant_input = act_quant_layer()
+        else:
+            self._fake_quant_input = _get_fake_quant_type(
+                activation_quantize_type,
+                name=layer.full_name(),
+                moving_rate=moving_rate,
+                quant_bits=activation_bits,
+                dtype=self._dtype,
+                quant_on_weight=False)
+
+        self._act_preprocess = act_pre_layer(
+        ) if act_pre_layer is not None else None
+        self._weight_preprocess = weight_pre_layer(
+        ) if weight_pre_layer is not None else None
+
+    def forward(self, input, output_size=None):
+        if self._act_preprocess is not None:
+            input = self._act_preprocess(input)
+        quant_input = self._fake_quant_input(input)
+
+        weight = self.weight
+        if self._weight_preprocess is not None:
+            weight = self._weight_preprocess(self.weight)
+        quant_weight = self._fake_quant_weight(weight)
+
+        if output_size is None:
+            output_padding = self._output_padding
+        else:
+            output_padding = 0
+
+        return F.conv2d_transpose(
+            quant_input,
+            quant_weight,
+            bias=self.bias,
+            padding=self._padding,
+            output_padding=output_padding,
+            stride=self._stride,
+            dilation=self._dilation,
+            groups=self._groups,
+            output_size=output_size,
+            data_format=self._data_format)
+
+
 class QuantizedLinear(layers.Layer):
     """
     The computational logic of QuantizedLinear is the same with Linear.

From ac21a6041534a459c3dce4a0a75f375cb50e6e17 Mon Sep 17 00:00:00 2001
From: XGZhang <46363693+XGZhang11@users.noreply.github.com>
Date: Thu, 5 Aug 2021 11:17:43 +0000
Subject: [PATCH 02/16] fix quantization bugs

---
 .../contrib/slim/quantization/post_training_quantization.py   | 2 ++
 .../fluid/contrib/slim/quantization/quantization_pass.py      | 4 ++++
 2 files changed, 6 insertions(+)

diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
index 5996e752c8c22d..5272d9f59903d7 100644
--- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
+++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
@@ -578,6 +578,8 @@ def _sample_mse(self):
             var_tensor = _load_variable_data(self._scope, var_name)
             var_tensor = var_tensor.flatten()
             abs_max_value = float(np.max(np.abs(var_tensor)))
+            if abs_max_value == 0.0:
+                abs_max_value = 1e-8
             s = 0.3
             if var_name not in self._best_mse_loss:
                 self._best_mse_loss[var_name] = float('inf')
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index b3b12a477e2a0a..857486b3fc46cc 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -1312,6 +1312,8 @@ def _insert_post_dequant_op(self, graph, op_node):
                 assert self._is_float(
                     scale_v), 'The scale of parameter %s is not a float.' % (
                         original_var_name)
+                if scale_v == 0.0:
+                    scale_v = 1e-8
                 max_range *= param_range / scale_v
             else:
                 max_range *= act_range
@@ -1413,6 +1415,8 @@ def _clip(x, scale):
                     x[:, i] = _clip(x[:, i], s)
                     x[:, i] = np.round(x[:, i] / s * bnt)
         else:
+            if scale == 0.0:
+                scale = 1e-8
             x = _clip(x, scale)
             x = np.round(x / scale * bnt)
         return x

From 350048ef509d720d31782e607f571e8e0719e4f6 Mon Sep 17 00:00:00 2001
From: XGZhang <46363693+XGZhang11@users.noreply.github.com>
Date: Sun, 8 Aug 2021 16:27:40 +0800
Subject: [PATCH 03/16] Update post_training_quantization.py

---
 .../contrib/slim/quantization/post_training_quantization.py     | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
index 5272d9f59903d7..5996e752c8c22d 100644
--- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
+++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
@@ -578,8 +578,6 @@ def _sample_mse(self):
             var_tensor = _load_variable_data(self._scope, var_name)
             var_tensor = var_tensor.flatten()
             abs_max_value = float(np.max(np.abs(var_tensor)))
-            if abs_max_value == 0.0:
-                abs_max_value = 1e-8
             s = 0.3
             if var_name not in self._best_mse_loss:
                 self._best_mse_loss[var_name] = float('inf')

From cdfa3fe87f6649a5083ae9c4a8475ae5d87ac98d Mon Sep 17 00:00:00 2001
From: XGZhang <46363693+XGZhang11@users.noreply.github.com>
Date: Sun, 8 Aug 2021 16:28:51 +0800
Subject: [PATCH 04/16] Update quantization_pass.py

---
 .../fluid/contrib/slim/quantization/quantization_pass.py      | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index 857486b3fc46cc..b3b12a477e2a0a 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -1312,8 +1312,6 @@ def _insert_post_dequant_op(self, graph, op_node):
                 assert self._is_float(
                     scale_v), 'The scale of parameter %s is not a float.' % (
                         original_var_name)
-                if scale_v == 0.0:
-                    scale_v = 1e-8
                 max_range *= param_range / scale_v
             else:
                 max_range *= act_range
@@ -1415,8 +1413,6 @@ def _clip(x, scale):
                     x[:, i] = _clip(x[:, i], s)
                     x[:, i] = np.round(x[:, i] / s * bnt)
         else:
-            if scale == 0.0:
-                scale = 1e-8
             x = _clip(x, scale)
             x = np.round(x / scale * bnt)
         return x

From 4b047da84ab21cbd0748aeeff1c8bc0a1dd4d376 Mon Sep 17 00:00:00 2001
From: XGZhang <46363693+XGZhang11@users.noreply.github.com>
Date: Mon, 9 Aug 2021 08:28:11 +0000
Subject: [PATCH 05/16] update docs

---
 .../paddle/fluid/contrib/slim/quantization/imperative/qat.py | 4 ++--
 .../fluid/contrib/slim/quantization/imperative/utils.py      | 3 +--
 python/paddle/nn/quant/quant_layers.py                       | 5 +++++
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
index 32a3ebfe047030..aa3269b0f2541d 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
@@ -307,10 +307,10 @@ def __init__(
     def apply(self, model):
         """
         Quantize the weights and activations to calculate for specific 
-        layers in the dygraph model.
+        layers.
 
         Args:
-            model(fluid.dygraph.Layer): The target model which would
+            model(paddle.nn.Layer): The target model which would
                 calculate the input quantization scale.
 
         Returns:
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
index 5a98ac80549f18..009ce372b4f29c 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
@@ -47,7 +47,6 @@
 }
 
 # Apply fake quant for the inputs of these layers
-# TODO (jc): support paddle.nn.Conv2DTranspose
 fake_quant_input_layers = [
     paddle.nn.Conv2D, paddle.nn.Linear, paddle.nn.Conv2DTranspose
 ]
@@ -125,7 +124,7 @@ def find_parent_layer_and_sub_name(model, name):
     For example, if name is 'block_1/convbn_1/conv_1', the parent layer is
     'block_1/convbn_1' and the sub_name is `conv_1`.
     Args:
-        model(fluid.dygraph.Layer): the model to be quantized.
+        model(paddle.nn.Layer): the model to be quantized.
         name(string): the name of a layer
 
     Returns:
diff --git a/python/paddle/nn/quant/quant_layers.py b/python/paddle/nn/quant/quant_layers.py
index cc98f73174b4b3..9d07febd955481 100644
--- a/python/paddle/nn/quant/quant_layers.py
+++ b/python/paddle/nn/quant/quant_layers.py
@@ -499,6 +499,11 @@ def __init__(self,
                  act_pre_layer=None,
                  weight_quant_layer=None,
                  act_quant_layer=None):
+        r"""
+        Constructor.
+
+        The arguments are the same as ImperativeQuantAware.
+        """
         super(QuantizedConv2DTranspose, self).__init__()
         # For Conv2DTranspose
         self._groups = getattr(layer, '_groups')

From e5ea4eb4c2430653b47ebaa53c7c77f5bb3fe976 Mon Sep 17 00:00:00 2001
From: XGZhang <46363693+XGZhang11@users.noreply.github.com>
Date: Mon, 9 Aug 2021 09:12:13 +0000
Subject: [PATCH 06/16] add tests for quantized_conv2d_transpose

---
 .../fluid/contrib/slim/tests/test_imperative_qat.py      | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
index 14fa291ee077c6..2dc118aa7e1a9f 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
@@ -28,10 +28,10 @@
 from paddle.fluid.optimizer import AdamOptimizer
 from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware
 from paddle.fluid.dygraph.container import Sequential
-from paddle.nn import Linear, Conv2D, Softmax
+from paddle.nn import Linear, Conv2D, Softmax, Conv2DTranspose
 from paddle.fluid.log_helper import get_logger
 from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
-from paddle.nn.quant.quant_layers import QuantizedConv2D
+from paddle.nn.quant.quant_layers import QuantizedConv2D, QuantizedConv2DTranspose
 
 from imperative_test_utils import fix_model_dict, ImperativeLenet
 
@@ -74,6 +74,11 @@ def test_qat(self):
             quant_conv1 = QuantizedConv2D(conv1)
             data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
             quant_conv1(fluid.dygraph.to_variable(data))
+            
+            conv_transpose = Conv2DTranspose(4, 6, (3, 3))
+            quant_conv_transpose = QuantizedConv2DTranspose(conv_transpose)
+            x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.)
+            quant_conv_transpose(x_var)
 
             seed = 1
             np.random.seed(seed)

From 3231853193666ca376e56745d4d98a0912b9f739 Mon Sep 17 00:00:00 2001
From: XGZhang <46363693+XGZhang11@users.noreply.github.com>
Date: Mon, 9 Aug 2021 10:17:21 +0000
Subject: [PATCH 07/16] update codestyle

---
 .../paddle/fluid/contrib/slim/tests/test_imperative_qat.py   | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
index 2dc118aa7e1a9f..bec486185ba171 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
@@ -74,10 +74,11 @@ def test_qat(self):
             quant_conv1 = QuantizedConv2D(conv1)
             data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
             quant_conv1(fluid.dygraph.to_variable(data))
-            
+
             conv_transpose = Conv2DTranspose(4, 6, (3, 3))
             quant_conv_transpose = QuantizedConv2DTranspose(conv_transpose)
-            x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.)
+            x_var = paddle.uniform(
+                (2, 4, 8, 8), dtype='float32', min=-1., max=1.)
             quant_conv_transpose(x_var)
 
             seed = 1

From da48df7ea923cbf86c03561dc57a312a578c9ba8 Mon Sep 17 00:00:00 2001
From: XGZhang <46363693+XGZhang11@users.noreply.github.com>
Date: Mon, 9 Aug 2021 10:20:50 +0000
Subject: [PATCH 08/16] update docs

---
 .../paddle/fluid/contrib/slim/quantization/imperative/qat.py  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
index aa3269b0f2541d..8ebad9974ace4f 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
@@ -213,7 +213,7 @@ def quantize(self, model):
         the out_scale value of outputs would be calculated.
 
         Args:
-            model(fluid.dygraph.Layer): the model to be quantized.
+            model(paddle.nn.Layer): the model to be quantized.
         Returns:
             None
         """
@@ -368,7 +368,7 @@ def apply(self, model):
         output scales for specific layers in the dygraph model.
 
         Args:
-            model(fluid.dygraph.Layer): The target model which would be
+            model(paddle.nn.Layer): The target model which would be
                 calculate the output quantization scale.
 
         Returns:

From 43976beaba3c1381e03e955cb1a50f5f442efdb0 Mon Sep 17 00:00:00 2001
From: XGZhang <46363693+XGZhang11@users.noreply.github.com>
Date: Sat, 14 Aug 2021 12:11:47 +0000
Subject: [PATCH 09/16] update tests and conv2dtranspose layer

---
 .../contrib/slim/tests/test_imperative_qat.py |  2 +-
 .../tests/test_imperative_qat_user_defined.py | 20 +++++++++++++++++++
 python/paddle/nn/quant/quant_layers.py        | 15 ++++++++++++++
 3 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
index bec486185ba171..677ccb52e242cf 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
@@ -78,7 +78,7 @@ def test_qat(self):
             conv_transpose = Conv2DTranspose(4, 6, (3, 3))
             quant_conv_transpose = QuantizedConv2DTranspose(conv_transpose)
             x_var = paddle.uniform(
-                (2, 4, 8, 8), dtype='float32', min=-1., max=1.)
+                (2, 4, 8, 8), dtype='float32', min=-1.0, max=1.0)
             quant_conv_transpose(x_var)
 
             seed = 1
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py
index 621213beb31cd7..186789ff03bad0 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py
@@ -28,6 +28,7 @@
 from paddle.fluid.dygraph import Conv2D
 from paddle.fluid.dygraph import Pool2D
 from paddle.fluid.dygraph import Linear
+from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose
 from paddle.fluid.log_helper import get_logger
 
 os.environ["CPU_NUM"] = "1"
@@ -100,6 +101,19 @@ def dequantize(x, lower_bound, delta, interval):
         return x
 
 
+class ModelForConv2dT(nn.Layer):
+    def __init__(self, num_classes=10):
+        super(ModelForConv2dT, self).__init__()
+        self.features = nn.Conv2DTranspose(4, 6, (3, 3))
+        self.fc = Linear(input_dim=600, output_dim=num_classes)
+    
+    def forward(self, inputs):
+        x = self.features(inputs)
+        x = paddle.flatten(x, 1)
+        x = self.fc(x)
+        return x
+
+
 class ImperativeLenet(paddle.nn.Layer):
     def __init__(self, num_classes=10, classifier_activation='softmax'):
         super(ImperativeLenet, self).__init__()
@@ -168,6 +182,12 @@ def test_quant_aware_training(self):
         imperative_qat.quantize(lenet)
         adam = Adam(learning_rate=0.001, parameters=lenet.parameters())
         dynamic_loss_rec = []
+        #for CI coverage
+        conv_transpose = ModelForConv2dT() 
+        imperative_qat.quantize(conv_transpose)
+        x_var = paddle.uniform(
+            (2, 4, 8, 8), dtype='float32', min=-1.0, max=1.0)
+        conv_transpose(x_var)
 
         def train(model):
             adam = Adam(learning_rate=0.001, parameters=model.parameters())
diff --git a/python/paddle/nn/quant/quant_layers.py b/python/paddle/nn/quant/quant_layers.py
index 9d07febd955481..040b04f5e7bf1e 100644
--- a/python/paddle/nn/quant/quant_layers.py
+++ b/python/paddle/nn/quant/quant_layers.py
@@ -486,6 +486,21 @@ class QuantizedConv2DTranspose(layers.Layer):
     """
     The computational logic of QuantizedConv2DTranspose is the same with Conv2DTranspose.
     The only difference is that its inputs are all fake quantized.
+    
+    Examples:
+       .. code-block:: python
+          import paddle
+          import paddle.nn as nn
+          from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose
+          x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.)
+          conv = nn.Conv2DTranspose(4, 6, (3, 3))
+          conv_quantized = QuantizedConv2DTranspose(conv)
+          y_quantized = conv_quantized(x_var)
+          y_var = conv(x_var)
+          y_quantized_np = y_quantized.numpy()
+          y_np = y_var.numpy()
+          print(y_np.shape, y_quantized_np.shape)
+          # (2, 6, 10, 10), (2, 6, 10, 10)
     """
 
     def __init__(self,

From 8ec36b6fe5ffbf6a3c6d253652cf2e1031804b59 Mon Sep 17 00:00:00 2001
From: XGZhang <46363693+XGZhang11@users.noreply.github.com>
Date: Sat, 14 Aug 2021 18:12:44 +0000
Subject: [PATCH 10/16] update quant tests

---
 .../contrib/slim/tests/test_imperative_qat_user_defined.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py
index 186789ff03bad0..270e8ee566ab57 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py
@@ -106,7 +106,7 @@ def __init__(self, num_classes=10):
         super(ModelForConv2dT, self).__init__()
         self.features = nn.Conv2DTranspose(4, 6, (3, 3))
         self.fc = Linear(input_dim=600, output_dim=num_classes)
-    
+
     def forward(self, inputs):
         x = self.features(inputs)
         x = paddle.flatten(x, 1)
@@ -183,10 +183,9 @@ def test_quant_aware_training(self):
         adam = Adam(learning_rate=0.001, parameters=lenet.parameters())
         dynamic_loss_rec = []
         #for CI coverage
-        conv_transpose = ModelForConv2dT() 
+        conv_transpose = ModelForConv2dT()
         imperative_qat.quantize(conv_transpose)
-        x_var = paddle.uniform(
-            (2, 4, 8, 8), dtype='float32', min=-1.0, max=1.0)
+        x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.)
         conv_transpose(x_var)
 
         def train(model):

From fc74ab00f6333ef6b8cc5c946efa26c2c449d731 Mon Sep 17 00:00:00 2001
From: XGZhang <46363693+XGZhang11@users.noreply.github.com>
Date: Mon, 16 Aug 2021 04:35:08 +0000
Subject: [PATCH 11/16] update sampcd_processor for tests

---
 tools/sampcd_processor.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py
index 3ec12c11a7045a..d8cb70c9dd107b 100644
--- a/tools/sampcd_processor.py
+++ b/tools/sampcd_processor.py
@@ -440,6 +440,7 @@ def get_filenames(full_test=False):
     '''
     global whl_error
     import paddle
+    import paddle.fluid.contrib.slim.quantization
     whl_error = []
     if full_test:
         get_full_api_from_pr_spec()

From ccd16757c122d20b4a28a6622bf2ef86bb1333a5 Mon Sep 17 00:00:00 2001
From: XGZhang <46363693+XGZhang11@users.noreply.github.com>
Date: Mon, 16 Aug 2021 05:58:51 +0000
Subject: [PATCH 12/16] update code examples

---
 .../slim/quantization/imperative/qat.py       | 35 +++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
index 8ebad9974ace4f..6208b43c9e9e48 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
@@ -216,6 +216,41 @@ def quantize(self, model):
             model(paddle.nn.Layer): the model to be quantized.
         Returns:
             None
+
+        Examples:
+        .. code-block:: python
+
+            import paddle
+            from paddle.fluid.contrib.slim.quantization \
+                import ImperativeQuantAware
+
+            class ImperativeModel(paddle.nn.Layer):
+                def __init__(self):
+                    super(ImperativeModel, self).__init__()
+                    # self.linear_0 would skip the quantization.
+                    self.linear_0 = paddle.nn.Linear(784, 400)
+                    self.linear_0.skip_quant = True
+
+                    # self.linear_1 would not skip the quantization.
+                    self.linear_1 = paddle.nn.Linear(400, 10)
+                    self.linear_1.skip_quant = False
+
+                def forward(self, inputs):
+                    x = self.linear_0(inputs)
+                    x = self.linear_1(inputs)
+                    return x
+
+            model = ImperativeModel()
+            imperative_qat = ImperativeQuantAware(
+                weight_quantize_type='abs_max',
+                activation_quantize_type='moving_average_abs_max')
+
+            # Add the fake quant logical.
+            # The original model will be rewrite.
+            #
+            # There is only one Layer(self.linear1) would be added the
+            # fake quant logical.
+            imperative_qat.quantize(model)
         """
         assert isinstance(model, dygraph.Layer), \
             "The model must be the instance of dygraph.Layer."

From a5b7c710b9848dd36412dae7be067baf091f329e Mon Sep 17 00:00:00 2001
From: XGZhang <46363693+XGZhang11@users.noreply.github.com>
Date: Mon, 16 Aug 2021 12:47:12 +0000
Subject: [PATCH 13/16] fix channel_wise quantization for ernie

---
 cmake/cupti.cmake                             |   1 +
 paddle/fluid/framework/device_worker.h        |   1 +
 paddle/fluid/framework/downpour_worker.cc     |   8 +-
 paddle/fluid/framework/downpour_worker_opt.cc |   4 +-
 paddle/fluid/framework/fleet/fleet_wrapper.cc |   5 +-
 paddle/fluid/framework/fleet/fleet_wrapper.h  |   3 +-
 .../framework/ir/graph_pattern_detector.cc    |   6 +-
 .../framework/ir/mkldnn/cpu_quantize_pass.cc  |  19 +-
 .../ir/mkldnn/cpu_quantize_pass_tester.cc     |   7 +-
 .../cpu_quantize_placement_pass_tester.cc     |   8 +-
 paddle/fluid/framework/trainer_desc.proto     |   1 +
 .../collective/c_comm_init_hccl_op.cc         |   2 +
 .../mkldnn/elementwise_mkldnn_op.h            |  19 +-
 .../mkldnn/elementwise_mul_mkldnn_op.cc       |  10 +-
 paddle/fluid/operators/fake_dequantize_op.cc  |  81 ++-
 paddle/fluid/operators/fake_dequantize_op.cu  |  17 +-
 paddle/fluid/operators/fake_dequantize_op.h   |  12 +-
 paddle/fluid/operators/interpolate_v2_op.cu   |   9 +-
 .../operators/mkldnn/activation_mkldnn_op.cc  |  11 +-
 .../operators/mkldnn/caching_tests.cmake      |   7 +-
 .../fluid/operators/mkldnn/scale_mkldnn_op.cc |   8 +-
 .../operators/mkldnn/softmax_mkldnn_op.cc     | 105 ++--
 .../operators/mkldnn/test_mkldnn_caching.cc   |  84 ++--
 paddle/fluid/operators/size_op_npu.cc         |  51 ++
 paddle/fluid/platform/mkldnn_reuse.h          | 476 ++++++++++++------
 paddle/fluid/pybind/imperative.cc             |   6 +-
 .../slim/quantization/imperative/qat.py       | 101 +---
 .../slim/quantization/imperative/utils.py     |  19 +-
 .../slim/quantization/quantization_pass.py    |   7 +-
 .../contrib/slim/tests/test_imperative_qat.py |  10 +-
 .../tests/test_imperative_qat_user_defined.py |  19 -
 .../pslib/optimizer_factory.py                |   2 +
 .../tests/unittests/npu/test_size_op_npu.py   | 141 ++++++
 python/paddle/fluid/trainer_desc.py           |   4 +
 python/paddle/fluid/trainer_factory.py        |   4 +
 python/paddle/nn/quant/quant_layers.py        | 107 ----
 tools/sampcd_processor.py                     |   1 -
 37 files changed, 806 insertions(+), 570 deletions(-)
 create mode 100644 paddle/fluid/operators/size_op_npu.cc
 create mode 100755 python/paddle/fluid/tests/unittests/npu/test_size_op_npu.py

diff --git a/cmake/cupti.cmake b/cmake/cupti.cmake
index 17626688531e61..2d7b1917b68731 100644
--- a/cmake/cupti.cmake
+++ b/cmake/cupti.cmake
@@ -9,6 +9,7 @@ find_path(CUPTI_INCLUDE_DIR cupti.h
         $ENV{CUPTI_ROOT} $ENV{CUPTI_ROOT}/include
         ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/include
         ${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/include
+        ${CUDA_TOOLKIT_ROOT_DIR}/targets/aarch64-linux/include
         NO_DEFAULT_PATH
         )
 
diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h
index 45efa43ccb74bc..6dd6fed0151585 100644
--- a/paddle/fluid/framework/device_worker.h
+++ b/paddle/fluid/framework/device_worker.h
@@ -212,6 +212,7 @@ class DeviceWorker {
   FetchConfig fetch_config_;
   bool use_cvm_;
   bool no_cvm_;
+  bool scale_sparse_gradient_with_batch_size_;
   TrainerDesc trainer_desc_;
 
   // dump params or grads for debug
diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc
index ad3f27f03fa143..11f70acb73aa7f 100644
--- a/paddle/fluid/framework/downpour_worker.cc
+++ b/paddle/fluid/framework/downpour_worker.cc
@@ -89,6 +89,8 @@ void DownpourWorker::Initialize(const TrainerDesc& desc) {
   use_cvm_ = desc.use_cvm();
   // for sparse value accessor, embedding only
   no_cvm_ = desc.no_cvm();
+  scale_sparse_gradient_with_batch_size_ =
+      desc.scale_sparse_gradient_with_batch_size();
   scale_datanorm_ = desc.scale_datanorm();
   dump_slot_ = desc.dump_slot();
   adjust_ins_weight_config_ = desc.adjust_ins_weight_config();
@@ -591,7 +593,8 @@ void DownpourWorker::TrainFilesWithProfiler() {
             *thread_scope_, tid, features_[tid], feature_labels_[tid],
             sparse_key_names_[tid], sparse_grad_names_[tid], table.emb_dim(),
             &feature_grads_[tid], &push_sparse_status_, cur_batch, use_cvm_,
-            dump_slot_, &sparse_push_keys_[tid], no_cvm_);
+            dump_slot_, &sparse_push_keys_[tid], no_cvm_,
+            scale_sparse_gradient_with_batch_size_);
         timeline.Pause();
         push_sparse_time += timeline.ElapsedSec();
         total_time += timeline.ElapsedSec();
@@ -866,7 +869,8 @@ void DownpourWorker::TrainFiles() {
             *thread_scope_, tid, features_[tid], feature_labels_[tid],
             sparse_key_names_[tid], sparse_grad_names_[tid], table.emb_dim(),
             &feature_grads_[tid], &push_sparse_status_, cur_batch, use_cvm_,
-            dump_slot_, &sparse_push_keys_[tid], no_cvm_);
+            dump_slot_, &sparse_push_keys_[tid], no_cvm_,
+            scale_sparse_gradient_with_batch_size_);
       }
     }
 
diff --git a/paddle/fluid/framework/downpour_worker_opt.cc b/paddle/fluid/framework/downpour_worker_opt.cc
index afe6ddfa3d9a63..ed0a9d9107e79d 100644
--- a/paddle/fluid/framework/downpour_worker_opt.cc
+++ b/paddle/fluid/framework/downpour_worker_opt.cc
@@ -450,11 +450,13 @@ void DownpourWorkerOpt::TrainFiles() {
             break;
           }
         }
+        bool scale_sparse_gradient_with_batch_size_ = true;
         fleet_ptr_->PushSparseVarsWithLabelAsync(
             *thread_scope_, tid, features_[tid], feature_labels_[tid],
             sparse_key_names_[tid], sparse_grad_names_[tid], table.emb_dim(),
             &feature_grads_[tid], &push_sparse_status_, cur_batch, use_cvm_,
-            dump_slot_, &sparse_push_keys_[tid], no_cvm_);
+            dump_slot_, &sparse_push_keys_[tid], no_cvm_,
+            scale_sparse_gradient_with_batch_size_);
       }
     }
 
diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc
index bb318e59e46e41..dc5e24ef5de42f 100644
--- a/paddle/fluid/framework/fleet/fleet_wrapper.cc
+++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc
@@ -870,7 +870,8 @@ void FleetWrapper::PushSparseVarsWithLabelAsync(
     std::vector<std::vector<float>>* push_values,
     std::vector<::std::future<int32_t>>* push_sparse_status,
     const int batch_size, const bool use_cvm, const bool dump_slot,
-    std::vector<uint64_t>* sparse_push_keys, const bool no_cvm) {
+    std::vector<uint64_t>* sparse_push_keys, const bool no_cvm,
+    const bool scale_sparse_gradient_with_batch_size) {
 #ifdef PADDLE_WITH_PSLIB
   int offset = 2;
   int slot_offset = 0;
@@ -939,7 +940,7 @@ void FleetWrapper::PushSparseVarsWithLabelAsync(
     }
     float* g = g_tensor->data<float>();
 
-    if (scale_sparse_gradient_with_batch_size_ && grad_dim > 0) {
+    if (scale_sparse_gradient_with_batch_size && grad_dim > 0) {
       int dim = emb_dim;
       Eigen::Map<
           Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h
index 09f7801b19f988..c1db06a298c861 100644
--- a/paddle/fluid/framework/fleet/fleet_wrapper.h
+++ b/paddle/fluid/framework/fleet/fleet_wrapper.h
@@ -209,7 +209,8 @@ class FleetWrapper {
       std::vector<std::vector<float>>* push_values,
       std::vector<::std::future<int32_t>>* push_sparse_status,
       const int batch_size, const bool use_cvm, const bool dump_slot,
-      std::vector<uint64_t>* sparse_push_keys, const bool no_cvm);
+      std::vector<uint64_t>* sparse_push_keys, const bool no_cvm,
+      const bool scale_sparse_gradient_with_batch_size);
 
   // Push sparse variables to server in async mode
   void PushSparseFromTensorWithLabelAsync(
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 70e48755dcd1e3..b4c94010e480a7 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -2249,9 +2249,9 @@ PDNode *patterns::MultipleQuantize::operator()() {
 PDNode *patterns::QuantizePlacement::operator()(
     const std::unordered_set<std::string> &quantize_enabled_op_types) {
   std::unordered_set<std::string> supported_op_types =
-      std::unordered_set<std::string>(
-          {"concat", "conv2d", "elementwise_add", "fc", "matmul", "pool2d",
-           "prior_box", "relu", "reshape2", "transpose2", "fusion_gru"});
+      std::unordered_set<std::string>({"concat", "conv2d", "elementwise_add",
+                                       "fc", "matmul", "pool2d", "prior_box",
+                                       "reshape2", "transpose2", "fusion_gru"});
   if (!quantize_enabled_op_types.empty()) {
     supported_op_types = quantize_enabled_op_types;
   }
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
index 3c06c9ee41d2a2..f50cd0a01d204d 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
@@ -770,7 +770,8 @@ void CPUQuantizePass::QuantizeElementwiseAdd(Graph* graph) const {
     GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
                               elementwise_add_pattern);
 
-    if (!AreScalesPresentForNodes({elementwise_add_x, elementwise_add_y})) {
+    if (!AreScalesPresentForNodes(
+            {elementwise_add_x, elementwise_add_y, elementwise_add_out})) {
       LogCannotQuantizeOp(elementwise_add_op);
       return;
     }
@@ -793,16 +794,12 @@ void CPUQuantizePass::QuantizeElementwiseAdd(Graph* graph) const {
     QuantizeInput(g, elementwise_add_op, elementwise_add_y, "Y", input_y_scale,
                   is_y_unsigned, "Scale_y");
 
-    // if quantization scale is missing for output tensor, return fp32 data
-    if (AreScalesPresentForNodes({elementwise_add_out})) {
-      bool is_output_unsigned{false};
-      auto output_scale =
-          GetScaleValueForNode(elementwise_add_out, &is_output_unsigned);
-      DequantizeOutput(g, elementwise_add_op, elementwise_add_out, "Out",
-                       output_scale, is_output_unsigned, "Scale_out");
-    } else {
-      elementwise_add_op->Op()->SetAttr("force_fp32_output", true);
-    }
+    bool is_output_unsigned{false};
+    auto output_scale =
+        GetScaleValueForNode(elementwise_add_out, &is_output_unsigned);
+
+    DequantizeOutput(g, elementwise_add_op, elementwise_add_out, "Out",
+                     output_scale, is_output_unsigned, "Scale_out");
 
     ++quantize_elementwise_add_count;
   };
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
index adb431fdb097f5..6fcea6a66cc5d1 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
@@ -854,13 +854,12 @@ TEST(CpuQuantizePass, elementwise_add) {
 
 TEST(CpuQuantizePass, elementwise_add_output_scale_missing) {
   int elementwise_add_count = 1;
-  int quant_count = 2;
+  int quant_count = 0;
   int dequant_count = 2;
-  // 2 Quant + 2 IN
-  int added_nodes_count = 4;
+  int added_nodes_count = 0;
   MainTestElementwiseAdd(BuildProgramDescElementwiseAdd(),
                          elementwise_add_count, quant_count, dequant_count,
-                         added_nodes_count, 2.0f * 127, true);
+                         added_nodes_count, 1.f, true);
 }
 
 TEST(CpuQuantizePass, elementwise_add_unsigned_and_signed_input) {
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc
index 761defc25ff5c8..daf913bf7d80d1 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc
@@ -131,13 +131,13 @@ TEST(QuantizerPlacementPass, enabled_conv_excluded_one) {
 }
 
 TEST(QuantizerPlacementPass, empty_list) {
-  // all operators quantized
-  MainTest({}, {}, 6);
+  // all operators except relu should be quantized
+  MainTest({}, {}, 5);
 }
 
 TEST(QuantizerPlacementPass, default_attr_value) {
-  //  all operators quantized
-  DefaultAttrTest(6);
+  // all operators except relu should be quantized
+  DefaultAttrTest(5);
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto
index 504885ff5ccbce..6f487d6984cc43 100644
--- a/paddle/fluid/framework/trainer_desc.proto
+++ b/paddle/fluid/framework/trainer_desc.proto
@@ -61,6 +61,7 @@ message TrainerDesc {
 
   optional bool use_ps_gpu = 32 [ default = false ];
   optional string user_define_dump_filename = 33;
+  optional bool scale_sparse_gradient_with_batch_size = 34 [ default = true ];
 
   // device worker parameters
   optional HogwildWorkerParameter hogwild_param = 101;
diff --git a/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc
index 3df0595525941a..7dec645b5b3ad8 100644
--- a/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc
+++ b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc
@@ -87,6 +87,8 @@ class CCommInitOpAscend : public framework::OperatorBase {
     }
     PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
         buff, size, HCCL_DATA_TYPE_FP32, 0, comm->comm(), stream));
+    // Synchronize stream to find hccl error in time.
+    PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream));
     VLOG(3) << "Build connection successful.";
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
index ddad70a6a5f31c..ffcdc079985fa6 100644
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
@@ -47,13 +47,24 @@ class EltwiseMKLDNNKernel : public framework::OpKernel<T> {
     float scale_o = ctx.Attr<float>("Scale_out");
     int axis = ctx.Attr<int>("axis");
 
-    platform::BinaryMKLDNNHandler<T> handler(
-        BINARY_OP, axis, dev_ctx, mkldnn_engine, ctx.GetPlace(), x, y, z,
-        scale_x, scale_y, scale_o, ctx.OutputName("Out"));
+    platform::BinaryMKLDNNHandler<T> handler(BINARY_OP, axis, mkldnn_engine,
+                                             ctx.GetPlace(), x, y, z, scale_x,
+                                             scale_y, scale_o);
 
     const auto src_x_memory = handler.AcquireSrcMemory(x);
     const auto src_y_memory = handler.AcquireSecondSrcMemory(y);
-    const auto dst_memory = handler.AcquireDstMemory(z);
+    // (jczaja) For Inplace src and dst should be the same memory object.
+    // So x should share buffer with z. But UT mechanics is testing inplace
+    // execution for this op not checking that x can be broadcast to match in
+    // shape y tensor.
+    // This is wrong as when x is to be broadcasted then z(out) will match the
+    // shape of y which is bigger than x. Hence if x is smaller in shape than z
+    // and they share a buffer (of
+    // shape x) then this buffer is not big enough to hold result of elementwise
+    // operation.
+    auto dst_memory = (x->numel() == z->numel() && x->IsSharedBufferWith(*z))
+                          ? src_x_memory
+                          : handler.AcquireDstMemory(z);
 
     const auto binary_prim = handler.AcquireForwardPrimitive();
 
diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
index 1c246e8d189370..af4aab8047888a 100644
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
@@ -48,9 +48,8 @@ class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel<T> {
     if (dx) {
       // dx = dout*y
       platform::BinaryMKLDNNHandler<T> handler(
-          dnnl::algorithm::binary_mul, axis, dev_ctx, mkldnn_engine,
-          ctx.GetPlace(), dout, y, dx, 1.0f, 1.0f, 1.0f,
-          ctx.InputName(framework::GradVarName("Out")));
+          dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(),
+          dout, y, dx, 1.0f, 1.0f, 1.0f);
 
       const auto src_dout_memory = handler.AcquireSrcMemory(dout);
       const auto src_y_memory = handler.AcquireSecondSrcMemory(y);
@@ -75,9 +74,8 @@ class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel<T> {
       // Handler is having nullptr passed instead of output tensor as
       // we want Dst buffer to be allocated by oneDNN not to use Tensor
       platform::BinaryMKLDNNHandler<T> handler(
-          dnnl::algorithm::binary_mul, axis, dev_ctx, mkldnn_engine,
-          ctx.GetPlace(), dout, x, nullptr, 1.0f, 1.0f, 1.0f,
-          ctx.InputName(framework::GradVarName("Out")));
+          dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(),
+          dout, x, nullptr, 1.0f, 1.0f, 1.0f);
 
       const auto src_dout_memory = handler.AcquireSrcMemory(dout);
       const auto src_x_memory = handler.AcquireSecondSrcMemory(x);
diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc
index b70fe78e1a5282..c71710b79d1fbe 100644
--- a/paddle/fluid/operators/fake_dequantize_op.cc
+++ b/paddle/fluid/operators/fake_dequantize_op.cc
@@ -39,7 +39,7 @@ struct ChannelDequantizeFunctor<platform::CPUDeviceContext, T> {
   void operator()(const platform::CPUDeviceContext& dev_ctx,
                   const framework::Tensor* in, const framework::Tensor** scales,
                   const int scale_num, T max_range, const int quant_axis,
-                  framework::Tensor* out) {
+                  const int x_num_col_dims, framework::Tensor* out) {
     if (scale_num == 1) {
       // Dequant op is before quantized op
       // Dequantize the weight of quantized op
@@ -81,23 +81,51 @@ struct ChannelDequantizeFunctor<platform::CPUDeviceContext, T> {
     } else if (scale_num == 2) {
       // Dequant op is after quantized op
       // Dequantize the output tensor of quantized op
-      int batch_size = in->dims()[0];
-      int channel = in->dims()[1];
-      const T* scale_one = scales[0]->data<T>();
-      const T* scale_two = scales[1]->data<T>();
-      for (int i = 0; i < batch_size; i++) {
-        framework::Tensor one_batch_in = in->Slice(i, i + 1).Resize(
-            framework::slice_ddim(in->dims(), 1, in->dims().size()));
-        framework::Tensor one_batch_out = out->Slice(i, i + 1).Resize(
-            framework::slice_ddim(out->dims(), 1, out->dims().size()));
-        for (int j = 0; j < channel; j++) {
-          T s = scale_one[j];
-          framework::Tensor one_channel_in = one_batch_in.Slice(j, j + 1);
-          framework::Tensor one_channel_out = one_batch_out.Slice(j, j + 1);
-          auto in_e = framework::EigenVector<T>::Flatten(one_channel_in);
-          auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
-          auto& dev = *dev_ctx.eigen_device();
-          out_e.device(dev) = in_e * s * scale_two[0] / max_range;
+      if (x_num_col_dims > 1) {
+        auto in_dims = in->dims();
+        const int64_t channel = in_dims[x_num_col_dims];
+        const T* scale_one = scales[0]->data<T>();
+        const T* scale_two = scales[1]->data<T>();
+        int64_t out_iter = 1;
+        for (int i = 0; i < x_num_col_dims; i++) {
+          out_iter *= in_dims[i];
+        }
+        int64_t step_i = in->numel() / out_iter;
+        int64_t step_j = in->numel() / (out_iter * channel);
+        auto* in_data = in->data<T>();
+        auto* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
+        for (int64_t i = 0; i < out_iter; i++) {
+          for (int64_t j = 0; j < channel; j++) {
+            auto* cur_in = in_data + i * step_i + j * step_j;
+            auto* cur_out = out_data + i * step_i + j * step_j;
+            T s = scale_one[j];
+            for (int64_t k = 0; k < step_j; k++) {
+              *cur_out = (*cur_in) * s * scale_two[0] / max_range;
+              ++cur_in;
+              ++cur_out;
+            }
+          }
+        }
+      }
+      else {
+        int batch_size = in->dims()[0];
+        int channel = in->dims()[1];
+        const T* scale_one = scales[0]->data<T>();
+        const T* scale_two = scales[1]->data<T>();
+        for (int i = 0; i < batch_size; i++) {
+          framework::Tensor one_batch_in = in->Slice(i, i + 1).Resize(
+              framework::slice_ddim(in->dims(), 1, in->dims().size()));
+          framework::Tensor one_batch_out = out->Slice(i, i + 1).Resize(
+              framework::slice_ddim(out->dims(), 1, out->dims().size()));
+          for (int j = 0; j < channel; j++) {
+            T s = scale_one[j];
+            framework::Tensor one_channel_in = one_batch_in.Slice(j, j + 1);
+            framework::Tensor one_channel_out = one_batch_out.Slice(j, j + 1);
+            auto in_e = framework::EigenVector<T>::Flatten(one_channel_in);
+            auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
+            auto& dev = *dev_ctx.eigen_device();
+            out_e.device(dev) = in_e * s * scale_two[0] / max_range;
+          }
         }
       }
     }
@@ -199,7 +227,16 @@ class FakeChannelWiseDequantizeMaxAbsOpMaker
                                 "the received is %d",
                                 quant_axis));
         });
-
+    AddAttr<int>("x_num_col_dims",
+                 "The x_num_col_dims of mul. Only used for mul or matmul.")
+        .SetDefault(1)
+        .AddCustomChecker([](const int& x_num_col_dims) {
+          PADDLE_ENFORCE_EQ(x_num_col_dims == 0, false,
+                            platform::errors::InvalidArgument(
+                                "'x_num_col_dims' should be larger than 0, but "
+                                "the received is %d",
+                                x_num_col_dims));
+        });
     AddComment(R"DOC(
 FakeChannelWiseDequantizeMaxAbsOp operator.
 
@@ -242,7 +279,7 @@ REGISTER_OP_CPU_KERNEL(fake_channel_wise_dequantize_max_abs,
 
 REGISTER_OP_VERSION(fake_channel_wise_dequantize_max_abs)
     .AddCheckpoint(
-        R"ROC(add new attributes [quant_axis] for applying per-channel "
-        "dequantization to conv2d_tranpose and mul ops.)ROC",
+        R"ROC(add new attributes [x_num_col_dims] for applying per-channel "
+        "dequantization to mul ops.)ROC",
         paddle::framework::compatible::OpVersionDesc().NewAttr(
-            "quant_axis", "The axis for dequantization.", 0));
+            "x_num_col_dims", "The x_num_col_dims for dequantization.", 1));
diff --git a/paddle/fluid/operators/fake_dequantize_op.cu b/paddle/fluid/operators/fake_dequantize_op.cu
index a89c430c7ab24e..b1d2e220426f73 100644
--- a/paddle/fluid/operators/fake_dequantize_op.cu
+++ b/paddle/fluid/operators/fake_dequantize_op.cu
@@ -77,9 +77,9 @@ __global__ void DequantizeOneScaleQuantAxis1(const T* in, const T* scale,
 template <typename T>
 __global__ void DequantizeTwoScale(const T* in, const T* scale_one,
                                    const T* scale_two, T max_range, int num,
-                                   int batch_size, int channel, T* out) {
+                                   int iter_size, int channel, T* out) {
   int tid = threadIdx.x;
-  int channel_size = num / (batch_size * channel);
+  int channel_size = num / (iter_size * channel);
   int scale_index = blockIdx.x % channel;
   const T* in_c = in + blockIdx.x * channel_size;
   T* out_c = out + blockIdx.x * channel_size;
@@ -93,7 +93,7 @@ struct ChannelDequantizeFunctor<platform::CUDADeviceContext, T> {
   void operator()(const platform::CUDADeviceContext& dev_ctx,
                   const framework::Tensor* in, const framework::Tensor** scales,
                   const int scale_num, T max_range, const int quant_axis,
-                  framework::Tensor* out) {
+                  const int x_num_col_dims, framework::Tensor* out) {
     auto in_dims = in->dims();
     const T* in_data = in->data<T>();
     T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
@@ -116,14 +116,17 @@ struct ChannelDequantizeFunctor<platform::CUDADeviceContext, T> {
     } else if (scale_num == 2) {
       // Not need to consider quant_axis
       int num = in->numel();
-      int batch_size = in->dims()[0];
-      int channel = in->dims()[1];
+      int iter_size = 1;
+      for (int i = 0; i < x_num_col_dims; i++) {
+        iter_size *= in->dims()[i];
+        }
+      int channel = in->dims()[x_num_col_dims];
       const T* scale_one = scales[0]->data<T>();
       const T* scale_two = scales[1]->data<T>();
       int block = 1024;
-      int grid = batch_size * channel;
+      int grid = iter_size * channel;
       DequantizeTwoScale<T><<<grid, block, 0, dev_ctx.stream()>>>(
-          in_data, scale_one, scale_two, max_range, num, batch_size, channel,
+          in_data, scale_one, scale_two, max_range, num, iter_size, channel,
           out_data);
     }
   }
diff --git a/paddle/fluid/operators/fake_dequantize_op.h b/paddle/fluid/operators/fake_dequantize_op.h
index 6ddb12771fd517..4485edcafba0db 100644
--- a/paddle/fluid/operators/fake_dequantize_op.h
+++ b/paddle/fluid/operators/fake_dequantize_op.h
@@ -33,7 +33,8 @@ template <typename DeviceContext, typename T>
 struct ChannelDequantizeFunctor {
   void operator()(const DeviceContext& dev_ctx, const framework::Tensor* in,
                   const framework::Tensor** scales, const int scale_num,
-                  T max_range, const int quant_axis, framework::Tensor* out);
+                  T max_range, const int quant_axis, const int x_num_col_dims,
+                  framework::Tensor* out);
 };
 
 template <typename DeviceContext, typename T>
@@ -64,6 +65,7 @@ class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel<T> {
 
     auto quant_bits = ctx.Attr<std::vector<int>>("quant_bits");
     auto quant_axis = ctx.Attr<int>("quant_axis");
+    auto x_num_col_dims = ctx.Attr<int>("x_num_col_dims");
     int max_range = 1;
 
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
@@ -80,11 +82,11 @@ class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel<T> {
       max_range *= (std::pow(2, quant_bits[0] - 1) - 1);
     } else if (scale_num == 2) {
       PADDLE_ENFORCE_EQ(
-          scales[0]->numel(), in->dims()[1],
+          scales[0]->numel(), in->dims()[x_num_col_dims],
           platform::errors::PreconditionNotMet(
               "The number of first scale values must be the same with "
-              "second dimension value of Input(X) when the `Scales` has two "
-              "elements, but %ld != %ld here.",
+              "corresponding dimension value of Input(X) when the `Scales` "
+              "has two elements, but %ld != %ld here.",
               scales[0]->numel(), in->dims()[1]));
       PADDLE_ENFORCE_EQ(scales[1]->numel(), 1,
                         platform::errors::PreconditionNotMet(
@@ -96,7 +98,7 @@ class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel<T> {
     }
     ChannelDequantizeFunctor<DeviceContext, T>()(
         dev_ctx, in, scales.data(), scale_num, static_cast<T>(max_range),
-        quant_axis, out);
+        quant_axis, x_num_col_dims, out);
   }
 };
 
diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu
index 6745592c5c1a8b..d335e1a2f9d58b 100644
--- a/paddle/fluid/operators/interpolate_v2_op.cu
+++ b/paddle/fluid/operators/interpolate_v2_op.cu
@@ -1186,7 +1186,14 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx,
         input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
         out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
   } else if ("bilinear" == interp_method) {
-    KeBilinearInterpFw<T><<<config.block_per_grid, config.thread_per_block, 0,
+    dim3 thread_num = config.thread_per_block;
+#ifdef WITH_NV_JETSON
+    if (config.compute_capability == 53 || config.compute_capability == 62) {
+      thread_num = 512;
+    }
+#endif
+
+    KeBilinearInterpFw<T><<<config.block_per_grid, thread_num, 0,
                             ctx.cuda_device_context().stream()>>>(
         input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
         out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_layout);
diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
index 3b92d2e2d88913..d992890adeec3e 100644
--- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
@@ -79,15 +79,15 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
                     paddle::platform::errors::PreconditionNotMet(
                         "Operator DNNL eletwise_forward must use CPUPlace"));
   auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+  const auto &mkldnn_engine = dev_ctx.GetEngine();
 
   const auto *x = ctx.Input<Tensor>("X");
   auto *y = ctx.Output<Tensor>("Out");
 
   bool is_inplaced = x->IsSharedBufferWith(*y);
 
-  platform::ActivationMKLDNNHandler<T> handler(algorithm, ctx, dev_ctx,
-                                               ctx.GetPlace(), x,
-                                               ctx.InputName("X"), is_inplaced);
+  platform::ActivationMKLDNNHandler<T> handler(algorithm, ctx, mkldnn_engine,
+                                               ctx.GetPlace(), x);
 
   auto src_memory_p = handler.AcquireSrcMemory(x);
   auto dst_memory_p = is_inplaced ? src_memory_p : handler.AcquireDstMemory(y);
@@ -106,13 +106,14 @@ template <typename T>
 void eltwise_grad(const framework::ExecutionContext &ctx,
                   mkldnn::algorithm algorithm) {
   auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+  const auto &mkldnn_engine = dev_ctx.GetEngine();
 
   const auto *x = ctx.Input<Tensor>("X");
   const auto *diff_y = ctx.Input<Tensor>(framework::GradVarName("Out"));
   auto *diff_x = ctx.Output<Tensor>(framework::GradVarName("X"));
 
-  platform::ActivationMKLDNNHandler<T> handler(
-      algorithm, ctx, dev_ctx, ctx.GetPlace(), x, diff_y, ctx.InputName("X"));
+  platform::ActivationMKLDNNHandler<T> handler(algorithm, ctx, mkldnn_engine,
+                                               ctx.GetPlace(), x, diff_y);
 
   auto src_memory_p = handler.AcquireBackwardSrcMemory(x);
   auto diff_dst_memory_p = handler.AcquireDiffDstMemory(diff_y);
diff --git a/paddle/fluid/operators/mkldnn/caching_tests.cmake b/paddle/fluid/operators/mkldnn/caching_tests.cmake
index 4130c295b203eb..f48a5d822f8dc8 100644
--- a/paddle/fluid/operators/mkldnn/caching_tests.cmake
+++ b/paddle/fluid/operators/mkldnn/caching_tests.cmake
@@ -1 +1,6 @@
-cc_test(test_mkldnn_caching SRCS mkldnn/test_mkldnn_caching.cc DEPS op_registry elementwise_mul_op elementwise_add_op activation_op softmax_op softmax scope device_context enforce)
+set(TEST_MKLDNN_CACHING_DEPS op_registry elementwise_mul_op elementwise_add_op activation_op softmax_op conv_op im2col vol2col softmax scope device_context enforce)
+if (WITH_GPU OR WITH_ROCM)
+  set(TEST_MKLDNN_CACHING_DEPS ${TEST_MKLDNN_CACHING_DEPS} depthwise_conv)
+endif()
+cc_test(test_mkldnn_caching SRCS mkldnn/test_mkldnn_caching.cc DEPS ${TEST_MKLDNN_CACHING_DEPS})
+
diff --git a/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc
index ae17048b5d568b..84ac14d04b85b3 100644
--- a/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc
@@ -29,6 +29,7 @@ class ScaleMKLDNNKernel : public framework::OpKernel<T> {
   void RunKernel(const framework::ExecutionContext& ctx) const {
     const auto& dev_ctx =
         ctx.template device_context<platform::MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
 
     auto* x = ctx.Input<Tensor>("X");
     auto* out = ctx.Output<Tensor>("Out");
@@ -36,11 +37,12 @@ class ScaleMKLDNNKernel : public framework::OpKernel<T> {
     bool is_inplaced = x->IsSharedBufferWith(*out);
 
     platform::ActivationMKLDNNHandler<T> handler(
-        mkldnn::algorithm::eltwise_linear, ctx, dev_ctx, ctx.GetPlace(), x,
-        ctx.InputName("X"), is_inplaced);
+        mkldnn::algorithm::eltwise_linear, ctx, mkldnn_engine, ctx.GetPlace(),
+        x);
 
     auto src_memory_p = handler.AcquireSrcMemory(x);
-    auto dst_memory_p = handler.AcquireDstMemory(out);
+    auto dst_memory_p =
+        is_inplaced ? src_memory_p : handler.AcquireDstMemory(out);
     auto activation_p = handler.AcquireForwardPrimitive();
 
     auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
index e065800e4d1c71..b0f27719bf9adc 100644
--- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
@@ -32,69 +32,56 @@ using platform::to_void_cast;
 
 template <typename T>
 class SoftmaxMKLDNNHandler
-    : public platform::MKLDNNHandlerT<T, mkldnn::softmax_forward,
-                                      mkldnn::softmax_backward> {
+    : public platform::MKLDNNHandlerNoCachingT<T, mkldnn::softmax_forward,
+                                               mkldnn::softmax_backward> {
  public:
-  SoftmaxMKLDNNHandler(const MKLDNNDeviceContext& dev_ctx,
-                       const mkldnn::engine mkldnn_engine,
+  SoftmaxMKLDNNHandler(const mkldnn::engine mkldnn_engine,
                        platform::Place cpu_place, const Tensor* input,
-                       Tensor* output, const int axis,
-                       const std::string uniq_name, bool is_inplaced)
-      : platform::MKLDNNHandlerT<T, mkldnn::softmax_forward,
-                                 mkldnn::softmax_backward>(
-            dev_ctx, mkldnn_engine, cpu_place,
-            // Softmax may be inplace then uniq_name is no longer unique
-            is_inplaced ? platform::CreateKey(
-                              dev_ctx, framework::vectorize(input->dims()),
-                              axis, uniq_name)
-                        : platform::CreateKey(
-                              dev_ctx, framework::vectorize(input->dims()),
-                              uniq_name)) {
-    if (!this->isCached()) {
-      PADDLE_ENFORCE_EQ(
-          input->dims(), output->dims(),
-          platform::errors::InvalidArgument(
-              "The shape of input and output tensor must be identical."));
-
-      auto softmax_tz = framework::vectorize(input->dims());
-      auto md = memory::desc(softmax_tz, platform::MKLDNNGetDataType<T>(),
-                             input->format());
-
-      this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, md,
-                                              axis);
-    }
+                       Tensor* output, const int axis)
+      : platform::MKLDNNHandlerNoCachingT<T, mkldnn::softmax_forward,
+                                          mkldnn::softmax_backward>(
+            mkldnn_engine, cpu_place) {
+    PADDLE_ENFORCE_EQ(
+        input->dims(), output->dims(),
+        platform::errors::InvalidArgument(
+            "The shape of input and output tensor must be identical."));
+
+    auto softmax_tz = framework::vectorize(input->dims());
+    auto md = memory::desc(softmax_tz, platform::MKLDNNGetDataType<T>(),
+                           input->format());
+
+    this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, md,
+                                            axis);
   }
 
   SoftmaxMKLDNNHandler(const framework::ExecutionContext& ctx,
-                       const MKLDNNDeviceContext& dev_ctx,
+                       const mkldnn::engine mkldnn_engine,
                        platform::Place cpu_place, const Tensor* out,
                        const Tensor* out_grad, Tensor* in_x_grad,
                        const std::string& unique_name)
-      : platform::MKLDNNHandlerT<T, mkldnn::softmax_forward,
-                                 mkldnn::softmax_backward>(
-            dev_ctx, dev_ctx.GetEngine(), cpu_place,
-            platform::CreateKey(dev_ctx, framework::vectorize(out->dims()),
-                                unique_name)) {
-    if (!this->isBwdCached()) {
-      PADDLE_ENFORCE_EQ(
-          out_grad->dims(), in_x_grad->dims(),
-          platform::errors::InvalidArgument("The shape of softmax_grad's input "
-                                            "and output must be identical."));
-
-      auto dims = out_grad->dims();  // input and output share the same shape
-      const int axis = CanonicalAxis(ctx.Attr<int>("axis"), dims.size());
-      auto softmax_tz = framework::vectorize<int64_t>(dims);
-
-      auto data_softmax_md = MKLDNNMemDesc(
-          softmax_tz, platform::MKLDNNGetDataType<T>(), out->format());
-      auto diff_softmax_md = MKLDNNMemDesc(
-          softmax_tz, platform::MKLDNNGetDataType<T>(), out_grad->format());
-
-      this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring,
-                                              data_softmax_md, axis);
-      this->AcquireBackwardPrimitiveDescriptor(diff_softmax_md, data_softmax_md,
-                                               axis);
-    }
+      : platform::MKLDNNHandlerNoCachingT<T, mkldnn::softmax_forward,
+                                          mkldnn::softmax_backward>(
+            mkldnn_engine, cpu_place) {
+    PADDLE_ENFORCE_EQ(out_grad->dims(), in_x_grad->dims(),
+                      platform::errors::InvalidArgument(
+                          "The shape of softmax_grad's input "
+                          "and output must be identical, but shapes differ, "
+                          "out_grad: %s in_grad: %s",
+                          out_grad->dims(), in_x_grad->dims()));
+
+    auto dims = out_grad->dims();  // input and output share the same shape
+    const int axis = CanonicalAxis(ctx.Attr<int>("axis"), dims.size());
+    auto softmax_tz = framework::vectorize<int64_t>(dims);
+
+    auto data_softmax_md = MKLDNNMemDesc(
+        softmax_tz, platform::MKLDNNGetDataType<T>(), out->format());
+    auto diff_softmax_md = MKLDNNMemDesc(
+        softmax_tz, platform::MKLDNNGetDataType<T>(), out_grad->format());
+
+    this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring,
+                                            data_softmax_md, axis);
+    this->AcquireBackwardPrimitiveDescriptor(diff_softmax_md, data_softmax_md,
+                                             axis);
   }
 };
 
@@ -111,9 +98,8 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
 
     const int axis = CanonicalAxis(ctx.Attr<int>("axis"), input->dims().size());
 
-    SoftmaxMKLDNNHandler<T> handler(dev_ctx, mkldnn_engine, ctx.GetPlace(),
-                                    input, output, axis, ctx.OutputName("Out"),
-                                    is_inplaced);
+    SoftmaxMKLDNNHandler<T> handler(mkldnn_engine, ctx.GetPlace(), input,
+                                    output, axis);
 
     auto softmax_src_memory_p = handler.AcquireSrcMemory(input);
     // For Inplace src and and dst are the same memory object
@@ -149,11 +135,12 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel<T> {
                       paddle::platform::errors::PreconditionNotMet(
                           "Operator DNNL SoftmaxGrad must use CPUPlace"));
     auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
     const Tensor* output = ctx.Input<Tensor>("Out");
     auto* out_grad = ctx.template Input<Tensor>(framework::GradVarName("Out"));
     auto* in_x_grad = ctx.template Output<Tensor>(framework::GradVarName("X"));
 
-    SoftmaxMKLDNNHandler<T> handler(ctx, dev_ctx, ctx.GetPlace(), output,
+    SoftmaxMKLDNNHandler<T> handler(ctx, mkldnn_engine, ctx.GetPlace(), output,
                                     out_grad, in_x_grad, ctx.InputName("Out"));
 
     auto dst_memory_p = handler.AcquireDstMemory(output);
diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc
index cad4f47ec14022..7251653793f899 100644
--- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc
+++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc
@@ -33,6 +33,8 @@ USE_OP(relu);
 USE_OP_DEVICE_KERNEL(relu, MKLDNN);
 USE_OP(softmax);
 USE_OP_DEVICE_KERNEL(softmax, MKLDNN);
+USE_OP(conv2d);
+USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);
 
 namespace paddle {
 namespace operators {
@@ -64,16 +66,19 @@ class CacheTester {
 
 template <typename T>
 void RunOperator(const platform::Place &place, const std::string &op_type,
-                 const framework::DDim &dims, const std::string &output_name,
-                 bool inplace = false) {
+                 const framework::DDim &dims, const std::string &first_input) {
   framework::Scope scope;
 
   std::map<const std::string, int> num_inputs = {{"softmax", 1},
                                                  {"relu", 1},
+                                                 {"conv2d", 2},
                                                  {"elementwise_add", 2},
                                                  {"elementwise_mul", 2}};
 
-  std::string first_input = inplace == true ? output_name : "x";
+  std::string first_input_var_name = (op_type == "conv2d") ? "Input" : "X";
+  std::string second_input_var_name = (op_type == "conv2d") ? "Filter" : "Y";
+  std::string output_var_name = (op_type == "conv2d") ? "Output" : "Out";
+  std::string output_name = "output";
 
   std::vector<InputVars> input_names = {
       {first_input, scope.Var(first_input)->GetMutable<framework::LoDTensor>()},
@@ -113,71 +118,40 @@ void RunOperator(const platform::Place &place, const std::string &op_type,
 
   auto &pool = platform::DeviceContextPool::Instance();
 
-  auto op = num_inputs[op_type] > 1
-                ? framework::OpRegistry::CreateOp(
-                      op_type, {{"X", {first_input}}, {"Y", {"x1"}}},
-                      {{"Out", {output_name}}}, {{"use_mkldnn", {true}}})
-                : framework::OpRegistry::CreateOp(
-                      op_type, {{"X", {first_input}}}, {{"Out", {output_name}}},
-                      {{"use_mkldnn", {true}}});
+  auto op =
+      num_inputs[op_type] > 1
+          ? framework::OpRegistry::CreateOp(
+                op_type, {{first_input_var_name, {first_input}},
+                          {second_input_var_name, {"x1"}}},
+                {{output_var_name, {output_name}}}, {{"use_mkldnn", {true}}})
+          : framework::OpRegistry::CreateOp(
+                op_type, {{first_input_var_name, {first_input}}},
+                {{output_var_name, {output_name}}}, {{"use_mkldnn", {true}}});
 
   op->Run(scope, place);
   pool.Get(place)->Wait();
 }
 
-TEST(test_softmax_reuse_cache, cpu_place) {
-  framework::DDim dims({32, 64});
+TEST(test_conv2d_reuse_cache, cpu_place) {
+  framework::DDim dims({1, 16, 32, 64});
   platform::CPUPlace p;
   CacheTester ct;
-  RunOperator<float>(p, "softmax", dims, "softmax_out");
-  RunOperator<float>(p, "softmax", dims, "softmax_out");
-  PADDLE_ENFORCE_EQ(ct.Analyze(4), true,
+  RunOperator<float>(p, "conv2d", dims, "input_signal");
+  RunOperator<float>(p, "conv2d", dims, "input_signal");
+  PADDLE_ENFORCE_EQ(ct.Analyze(9), true,
                     platform::errors::InvalidArgument(
-                        "Wrong number of cached oneDNN objects"));
+                        "Invalid number of cached oneDNN objects"));
 }
 
-TEST(test_softmax_noreuse_cache, cpu_place) {
-  framework::DDim dims({32, 64});
+TEST(test_conv2d_noreuse_cache, cpu_place) {
+  framework::DDim dims({1, 16, 32, 64});
   platform::CPUPlace p;
   CacheTester ct;
-  RunOperator<float>(p, "softmax", dims, "softmax_out");
-  RunOperator<float>(p, "softmax", dims, "softmax_out2");
-  PADDLE_ENFORCE_EQ(ct.Analyze(8), true,
+  RunOperator<float>(p, "conv2d", dims, "input_signal");
+  RunOperator<float>(p, "conv2d", dims, "input_signal2");
+  PADDLE_ENFORCE_EQ(ct.Analyze(18), true,
                     platform::errors::InvalidArgument(
-                        "Wrong number of cached oneDNN objects"));
-}
-
-TEST(test_softmax_inplace_cache, cpu_place) {
-  framework::DDim dims({32, 64});
-  platform::CPUPlace p;
-  CacheTester ct;
-  RunOperator<float>(p, "softmax", dims, "softmax_out");
-  RunOperator<float>(p, "softmax", dims, "softmax_out", true);
-  PADDLE_ENFORCE_EQ(ct.Analyze(7), true,
-                    platform::errors::InvalidArgument(
-                        "Wrong number of cached oneDNN objects"));
-}
-
-TEST(test_relu_inplace_cache, cpu_place) {
-  framework::DDim dims({32, 64});
-  platform::CPUPlace p;
-  CacheTester ct;
-  RunOperator<float>(p, "relu", dims, "relu_out");
-  RunOperator<float>(p, "relu", dims, "relu_out", true);
-  PADDLE_ENFORCE_EQ(ct.Analyze(7), true,
-                    platform::errors::InvalidArgument(
-                        "Wrong number of cached oneDNN objects"));
-}
-
-TEST(test_elementwise_add_reuse_cache, cpu_place) {
-  framework::DDim dims({32, 64});
-  platform::CPUPlace p;
-  CacheTester ct;
-  RunOperator<float>(p, "elementwise_add", dims, "elementwise_add_out");
-  RunOperator<float>(p, "relu", dims, "elementwise_add_out", true);
-  PADDLE_ENFORCE_EQ(ct.Analyze(8), true,
-                    platform::errors::InvalidArgument(
-                        "Wrong number of cached oneDNN objects"));
+                        "Invalid number of cached oneDNN objects"));
 }
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/size_op_npu.cc b/paddle/fluid/operators/size_op_npu.cc
new file mode 100644
index 00000000000000..4e9c2ec482e927
--- /dev/null
+++ b/paddle/fluid/operators/size_op_npu.cc
@@ -0,0 +1,51 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/mul_op.h"
+#include "paddle/fluid/operators/npu_op_runner.h"
+
+namespace paddle {
+namespace operators {
+
+// NPU kernel for `size`: stages Input's element count on CPU, copies to Out.
+template <typename DeviceContext, typename T>
+class SizeNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    // The count is written as int64_t regardless of the kernel's parameter T.
+    out->mutable_data<int64_t>(ctx.GetPlace());
+    Tensor cpu_tensor;
+    auto cpu_data =
+        cpu_tensor.mutable_data<int64_t>(out->dims(), platform::CPUPlace());
+    cpu_data[0] = ctx.Input<framework::Tensor>("Input")->numel();
+    TensorCopy(cpu_tensor, ctx.GetPlace(),
+               ctx.template device_context<platform::DeviceContext>(), out);
+    ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_NPU_KERNEL(
+    size, ops::SizeNPUKernel<paddle::platform::NPUDeviceContext, int>,
+    ops::SizeNPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
+    ops::SizeNPUKernel<paddle::platform::NPUDeviceContext,
+                       paddle::platform::float16>,
+    ops::SizeNPUKernel<paddle::platform::NPUDeviceContext, float>,
+    ops::SizeNPUKernel<paddle::platform::NPUDeviceContext, double>,
+    ops::SizeNPUKernel<paddle::platform::NPUDeviceContext, bool>);
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index f63d45d7ff6ae6..95b8e0c610b1d4 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -34,6 +34,211 @@ using framework::Tensor;
 using user_function = std::function<std::shared_ptr<float>(const float*)>;
 using memory = mkldnn::memory;
 
+template <typename T, typename TForward,
+          typename TBackward = mkldnn_dummy_primitive,
+          typename TBackward_params = mkldnn_dummy_primitive>
+class MKLDNNHandlerNoCachingT {
+ public:
+  MKLDNNHandlerNoCachingT(mkldnn::engine engine, platform::Place cpu_place)
+      : engine_(engine), place_(cpu_place), fwd_pd_(nullptr), bwd_pd_(nullptr) {
+    platform::MKLDNNDeviceContext::tls().log_lib_version();
+  }
+
+  std::shared_ptr<TForward> AcquireForwardPrimitive() {
+    return std::make_shared<TForward>(*fwd_pd_);
+  }
+
+  std::shared_ptr<TBackward> AcquireBackwardPrimitive() {
+    return std::make_shared<TBackward>(*bwd_pd_);
+  }
+
+  // Returns a new TBackward_params primitive constructed from bwd_w_pd_.
+  std::shared_ptr<TBackward_params> AcquireBackwardWeightsPrimitive() {
+    PADDLE_ENFORCE_NOT_NULL(bwd_w_pd_, platform::errors::Unavailable(
+        "BWD_W_PD should be set when getting BWD weights primitive."));
+    return std::make_shared<TBackward_params>(*bwd_w_pd_);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
+      const framework::Tensor* input) {
+    const T* input_data = input->data<T>();
+    return this->AcquireMemoryFromPrimitive(fwd_pd_->src_desc(),
+                                            to_void_cast<T>(input_data));
+  }
+
+  template <typename T_out = T>
+  std::shared_ptr<mkldnn::memory> AcquireDstMemory(framework::Tensor* output) {
+    T_out* ptr =
+        output->mutable_data<T_out>(place_, fwd_pd_->dst_desc().get_size());
+    return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), ptr);
+  }
+
+  template <typename T_out = T>
+  std::shared_ptr<mkldnn::memory> AcquireDstMemory(void) {
+    return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc());
+  }
+
+  template <typename T_out = T>
+  std::shared_ptr<mkldnn::memory> AcquireDstMemory(
+      const framework::Tensor* output) {
+    const T_out* output_data = output->data<T_out>();
+    return this->AcquireMemoryFromPrimitive(bwd_pd_->dst_desc(),
+                                            to_void_cast<T_out>(output_data));
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDiffDstMemory(
+      const framework::Tensor* diffdst) {
+    const T* ptr = diffdst->data<T>();
+    return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_dst_desc(),
+                                            to_void_cast<T>(ptr));
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDiffSrcMemory(
+      framework::Tensor* diffsrc) {
+    T* ptr =
+        diffsrc->mutable_data<T>(place_, bwd_pd_->diff_src_desc().get_size());
+    return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_src_desc(), ptr);
+  }
+
+  // Buffer of given Tensor is used for oneDNN computation
+  std::shared_ptr<mkldnn::memory> AcquireDiffWeightsMemory(
+      framework::Tensor* diff_weights) {
+    PADDLE_ENFORCE_NOT_NULL(
+        bwd_w_pd_,
+        platform::errors::Unavailable(
+            "BWD_W_PD should be set when getting BWD grad of weights."));
+    T* ptr = diff_weights->mutable_data<T>(
+        place_, bwd_w_pd_->diff_weights_desc().get_size());
+    return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(),
+                                            ptr);
+  }
+
+  // Buffer is allocated by oneDNN to store computation results
+  std::shared_ptr<mkldnn::memory> AcquireDiffWeightsMemory(void) {
+    PADDLE_ENFORCE_NOT_NULL(
+        bwd_w_pd_,
+        platform::errors::Unavailable(
+            "BWD_W_PD should be set when getting BWD grad of weights."));
+    return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc());
+  }
+
+ protected:
+  // If the primitive descriptor requires attributes, pass them as the first
+  // argument, followed by the descriptor constructor parameters. Otherwise all
+  // arguments, including the first one, go to the descriptor constructor.
+  template <typename Arg, typename... Args>
+  void AcquireForwardPrimitiveDescriptor(Arg&& first_arg, Args&&... args) {
+    CreateForwardPrimitiveDescriptor(std::forward<Arg>(first_arg),
+                                     std::forward<Args>(args)...);
+  }
+
+  // Using SFINAE to specialise the variadic function. Workaround for not
+  // having `if constexpr` in C++11.
+  template <class First, class... Args>
+  typename std::enable_if<std::is_same<typename std::decay<First>::type,
+                                       dnnl::primitive_attr>::value>::type
+  CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) {
+    auto fwd_desc = typename TForward::desc(std::forward<Args>(args)...);
+    fwd_pd_ = std::make_shared<typename TForward::primitive_desc>(
+        fwd_desc, first, engine_);
+  }
+
+  template <class First, class... Args>
+  typename std::enable_if<!std::is_same<typename std::decay<First>::type,
+                                        dnnl::primitive_attr>::value>::type
+  CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) {
+    auto fwd_desc = typename TForward::desc(std::forward<First>(first),
+                                            std::forward<Args>(args)...);
+    fwd_pd_ =
+        std::make_shared<typename TForward::primitive_desc>(fwd_desc, engine_);
+  }
+
+  // Creates bwd_pd_ from `args`. fwd_pd_ is passed to the primitive_desc
+  // constructor, so AcquireForwardPrimitiveDescriptor must be called first.
+  template <typename... Args>
+  void AcquireBackwardPrimitiveDescriptor(Args&&... args) {
+    PADDLE_ENFORCE_NOT_NULL(
+        fwd_pd_, platform::errors::Unavailable(
+                     "Get MKLDNN Forward primitive descriptor failed."));
+    auto bwd_desc = typename TBackward::desc(std::forward<Args>(args)...);
+    bwd_pd_ = std::make_shared<typename TBackward::primitive_desc>(
+        bwd_desc, engine_, *fwd_pd_);
+  }
+
+  // Creates bwd_w_pd_ from `args`. fwd_pd_ is passed to the primitive_desc
+  // constructor, so AcquireForwardPrimitiveDescriptor must be called first.
+  template <typename... Args>
+  void AcquireBackwardWeightsPrimitiveDescriptor(Args&&... args) {
+    PADDLE_ENFORCE_NOT_NULL(
+        fwd_pd_, platform::errors::Unavailable(
+                     "Get MKLDNN Forward primitive descriptor failed."));
+    auto bwd_desc =
+        typename TBackward_params::desc(std::forward<Args>(args)...);
+    bwd_w_pd_ = std::make_shared<typename TBackward_params::primitive_desc>(
+        bwd_desc, engine_, *fwd_pd_);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireMemoryFromPrimitive(
+      mkldnn::memory::desc md, void* ptr) {
+    return std::make_shared<mkldnn::memory>(md, engine_, ptr);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireMemoryFromPrimitive(
+      mkldnn::memory::desc md) {
+    return std::make_shared<mkldnn::memory>(md, engine_);
+  }
+
+  void AcquireReorder(const std::shared_ptr<mkldnn::memory>& user_memory_p,
+                      const std::shared_ptr<mkldnn::memory>& target_memory_p) {
+    auto reorder_p =
+        std::make_shared<mkldnn::reorder>(*user_memory_p, *target_memory_p);
+
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
+
+    platform::RecordEvent record_reorder("int_reorder",
+                                         platform::EventRole::kUniqueOp);
+    reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p},
+                                 {MKLDNN_ARG_TO, *target_memory_p}});
+    astream.wait();
+  }
+
+  // Wraps `ptr` in a `user_md` memory, optionally pre-processed by
+  // `custom_reorder_func`, and reorders it into a new `target_md` memory when
+  // the descriptors differ. `suffix` and `is_persistent` are unused here.
+  template <typename F = T>
+  std::shared_ptr<mkldnn::memory> AcquireMemoryWithReorder(
+      const mkldnn::memory::desc& user_md,
+      const mkldnn::memory::desc& target_md, void* ptr,
+      const std::string& suffix, bool is_persistent = false,
+      std::function<std::shared_ptr<F>(const F*)> custom_reorder_func = {}) {
+    // Declared at function scope: `ptr` points into this buffer below.
+    std::shared_ptr<F> reordered_data;
+    if (custom_reorder_func) {
+      reordered_data = custom_reorder_func(reinterpret_cast<const F*>(ptr));
+      ptr = reinterpret_cast<void*>(reordered_data.get());
+    }
+    auto user_memory_p = std::make_shared<dnnl::memory>(user_md, engine_, ptr);
+    if (user_md != target_md) {
+      auto target_memory_p =
+          std::make_shared<mkldnn::memory>(target_md, engine_);
+      AcquireReorder(user_memory_p, target_memory_p);
+      return target_memory_p;
+    }
+    if (!reordered_data) return user_memory_p;
+    // The returned memory aliases the custom-reordered buffer, so the deleter
+    // holds both owners until the last reference to the result is dropped.
+    return std::shared_ptr<mkldnn::memory>(
+        user_memory_p.get(),
+        [user_memory_p, reordered_data](mkldnn::memory*) {});
+  }
+
+  mkldnn::engine engine_;
+  platform::Place place_;
+  std::shared_ptr<typename TForward::primitive_desc> fwd_pd_;
+  std::shared_ptr<typename TBackward::primitive_desc> bwd_pd_;
+  std::shared_ptr<typename TBackward_params::primitive_desc> bwd_w_pd_;
+};
+
 template <typename T, typename TForward,
           typename TBackward = mkldnn_dummy_primitive,
           typename TBackward_params = mkldnn_dummy_primitive>
@@ -79,7 +284,7 @@ class MKLDNNHandlerT {
         std::static_pointer_cast<TBackward_params>(dev_ctx_.GetBlob(key_p));
     if (backward_p == nullptr) {
       PADDLE_ENFORCE_NOT_NULL(bwd_w_pd_, platform::errors::Unavailable(
-                                             "Error: BWD_PD should be set when "
+                                             "BWD_PD should be set when "
                                              "getting BWD prim witk key: %s .",
                                              key_p));
       backward_p = std::make_shared<TBackward_params>(*bwd_w_pd_);
@@ -138,7 +343,7 @@ class MKLDNNHandlerT {
     PADDLE_ENFORCE_NOT_NULL(
         bwd_w_pd_,
         platform::errors::Unavailable(
-            "Error: BWD_W_PD should be set when getting BWD grad of weights."));
+            "BWD_W_PD should be set when getting BWD grad of weights."));
     T* ptr = diff_weights->mutable_data<T>(
         place_, bwd_w_pd_->diff_weights_desc().get_size());
     return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(), ptr,
@@ -150,7 +355,7 @@ class MKLDNNHandlerT {
     PADDLE_ENFORCE_NOT_NULL(
         bwd_w_pd_,
         platform::errors::Unavailable(
-            "Error: BWD_W_PD should be set when getting BWD grad of weights."));
+            "BWD_W_PD should be set when getting BWD grad of weights."));
     return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(),
                                             "@diff_wei_mem_p");
   }
@@ -589,70 +794,70 @@ class MKLDNNHandler {
 };
 
 template <typename T>
-class BinaryMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::binary> {
+class BinaryMKLDNNHandler
+    : public platform::MKLDNNHandlerNoCachingT<T, dnnl::binary> {
  public:
   BinaryMKLDNNHandler(const dnnl::algorithm algo, const int axis,
-                      const MKLDNNDeviceContext& dev_ctx,
                       const mkldnn::engine engine, platform::Place cpu_place,
                       const Tensor* x, const Tensor* y, Tensor* z,
-                      float scale_x, float scale_y, float scale_z,
-                      const std::string& uniq_name)
-      : platform::MKLDNNHandlerT<T, dnnl::binary>(
-            dev_ctx, engine, cpu_place,
-            platform::CreateKey(dev_ctx, framework::vectorize(x->dims()),
-                                uniq_name)) {
-    if (!this->isCached()) {
-      PADDLE_ENFORCE_EQ(
-          x->layout(), DataLayout::kMKLDNN,
-          platform::errors::InvalidArgument("Wrong layout set for X tensor."));
-      PADDLE_ENFORCE_NE(
-          x->format(), MKLDNNMemoryFormat::undef,
-          platform::errors::InvalidArgument("Wrong format set for X tensor."));
-
-      PADDLE_ENFORCE_EQ(
-          y->layout(), DataLayout::kMKLDNN,
-          platform::errors::InvalidArgument("Wrong layout set for Y tensor."));
-      PADDLE_ENFORCE_NE(
-          y->format(), MKLDNNMemoryFormat::undef,
-          platform::errors::InvalidArgument("Wrong format set for Y tensor."));
-
-      const auto src_x_tz = framework::vectorize(x->dims());
-      const auto src_y_tz = framework::vectorize(y->dims());
-      // if output tensor(z) is nullptr then we are computing into oneDNN
-      // managed buffer
-      auto rankdiff = x->dims().size() - y->dims().size();
-      const auto dst_tz = (z == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz)
-                                         : framework::vectorize(z->dims());
-
-      auto src0_md = dnnl::memory::desc(
-          src_x_tz, platform::MKLDNNGetDataType<T>(), x->format());
-      auto src1_md = dnnl::memory::desc(
-          src_y_tz, platform::MKLDNNGetDataType<T>(), y->format());
-      if (rankdiff > 0) {  // Second input is of smaller rank than first
-        std::vector<int64_t> dims1_ex(rankdiff, 1);
-        dims1_ex.insert(next(dims1_ex.begin(), (axis == -1 ? rankdiff : axis)),
-                        src_y_tz.begin(), src_y_tz.end());
-        src1_md = src1_md.reshape(dims1_ex);
-      } else if (rankdiff < 0) {  // First input is of smaller than second
-        std::vector<int64_t> dims0_ex(-rankdiff, 1);
-        dims0_ex.insert(next(dims0_ex.begin(), (axis == -1 ? -rankdiff : axis)),
-                        src_x_tz.begin(), src_x_tz.end());
-        src0_md = src0_md.reshape(dims0_ex);
-      }
-      const auto dst_md = memory::desc(dst_tz, platform::MKLDNNGetDataType<T>(),
-                                       MKLDNNMemoryFormat::any);
-
-      auto attributes = CreateAttributes(algo, scale_x, scale_y, scale_z);
-      this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md,
-                                              src1_md, dst_md);
+                      float scale_x, float scale_y, float scale_z)
+      : platform::MKLDNNHandlerNoCachingT<T, dnnl::binary>(engine, cpu_place) {
+    PADDLE_ENFORCE_EQ(
+        x->layout(), DataLayout::kMKLDNN,
+        platform::errors::InvalidArgument(
+            "Wrong layout set for X tensor. Expected: %d (kMKLDNN), Actual: %d",
+            DataLayout::kMKLDNN, x->layout()));
+    PADDLE_ENFORCE_NE(x->format(), MKLDNNMemoryFormat::undef,
+                      platform::errors::InvalidArgument(
+                          "Wrong format set for X tensor : %d (undef)",
+                          static_cast<unsigned int>(x->format())));
+
+    PADDLE_ENFORCE_EQ(
+        y->layout(), DataLayout::kMKLDNN,
+        platform::errors::InvalidArgument(
+            "Wrong layout set for Y tensor. Expected: %d (kMKLDNN), Actual: %d",
+            DataLayout::kMKLDNN, y->layout()));
+    PADDLE_ENFORCE_NE(y->format(), MKLDNNMemoryFormat::undef,
+                      platform::errors::InvalidArgument(
+                          "Wrong format set for Y tensor : %d (undef)",
+                          static_cast<unsigned int>(y->format())));
+
+    const auto src_x_tz = framework::vectorize(x->dims());
+    const auto src_y_tz = framework::vectorize(y->dims());
+    // if output tensor(z) is nullptr then we are computing into oneDNN
+    // managed buffer
+    auto rankdiff = x->dims().size() - y->dims().size();
+    const auto dst_tz = (z == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz)
+                                       : framework::vectorize(z->dims());
+
+    auto src0_md = dnnl::memory::desc(
+        src_x_tz, platform::MKLDNNGetDataType<T>(), x->format());
+    auto src1_md = dnnl::memory::desc(
+        src_y_tz, platform::MKLDNNGetDataType<T>(), y->format());
+    if (rankdiff > 0) {  // Second input is of smaller rank than first
+      std::vector<int64_t> dims1_ex(rankdiff, 1);
+      dims1_ex.insert(next(dims1_ex.begin(), (axis == -1 ? rankdiff : axis)),
+                      src_y_tz.begin(), src_y_tz.end());
+      src1_md = src1_md.reshape(dims1_ex);
+    } else if (rankdiff < 0) {  // First input is of smaller than second
+      std::vector<int64_t> dims0_ex(-rankdiff, 1);
+      dims0_ex.insert(next(dims0_ex.begin(), (axis == -1 ? -rankdiff : axis)),
+                      src_x_tz.begin(), src_x_tz.end());
+      src0_md = src0_md.reshape(dims0_ex);
     }
+    const auto dst_md = memory::desc(dst_tz, platform::MKLDNNGetDataType<T>(),
+                                     MKLDNNMemoryFormat::any);
+
+    auto attributes = CreateAttributes(algo, scale_x, scale_y, scale_z);
+    this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, src1_md,
+                                            dst_md);
   }
 
   std::shared_ptr<mkldnn::memory> AcquireSecondSrcMemory(
       const framework::Tensor* input) {
     const T* input_data = input->data<T>();
-    return this->AcquireMemoryFromPrimitive(
-        this->fwd_pd_->src1_desc(), to_void_cast<T>(input_data), "@src1_mem_p");
+    return this->AcquireMemoryFromPrimitive(this->fwd_pd_->src1_desc(),
+                                            to_void_cast<T>(input_data));
   }
 
  private:
@@ -775,111 +980,95 @@ class ReductionMKLDNNHandler
 
 template <typename T>
 class ActivationMKLDNNHandler
-    : public MKLDNNHandlerT<T, mkldnn::eltwise_forward,
-                            mkldnn::eltwise_backward> {
+    : public MKLDNNHandlerNoCachingT<T, mkldnn::eltwise_forward,
+                                     mkldnn::eltwise_backward> {
  public:
   ActivationMKLDNNHandler(mkldnn::algorithm algorithm,
                           const framework::ExecutionContext& ctx,
-                          const MKLDNNDeviceContext& dev_ctx, Place cpu_place,
-                          const framework::Tensor* in_x,
-                          const std::string& unique_name, bool is_inplaced)
-      : platform::MKLDNNHandlerT<T, mkldnn::eltwise_forward,
-                                 mkldnn::eltwise_backward>(
-            dev_ctx, dev_ctx.GetEngine(), cpu_place,
-            is_inplaced ? platform::CreateKey(
-                              dev_ctx, framework::vectorize(in_x->dims()), "a",
-                              algorithm, unique_name)
-                        : platform::CreateKey(
-                              dev_ctx, framework::vectorize(in_x->dims()), "a",
-                              unique_name)) {
-    if (!this->isCached()) {
-      float alpha = ctx.HasAttr("alpha") ? ctx.Attr<float>("alpha") : 0;
-      float beta = ctx.HasAttr("beta") ? ctx.Attr<float>("beta") : 0;
-      // eltwise_linear means we are in scale op
-      if (algorithm == mkldnn::algorithm::eltwise_linear) {
-        bool bias_after_scale = ctx.Attr<bool>("bias_after_scale");
-        auto* scale_tensor = ctx.Input<Tensor>("ScaleTensor");
-        alpha = (scale_tensor == nullptr) ? ctx.Attr<float>("scale")
-                                          : (float)*(scale_tensor->data<T>());
-        beta = ctx.Attr<float>("bias");
-        // if bias_after_scale == true
-        //   out = scale*X + bias
-        // else
-        //   out = scale*(X + bias) = scale*X + scale*bias
-        if (!bias_after_scale) beta *= alpha;
-      } else {
-        // paddle uses beta but mkldnn uses alpha for swish
-        if (algorithm == mkldnn::algorithm::eltwise_swish) {
-          std::swap(alpha, beta);
-        } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) {
-          alpha = ctx.Attr<float>("threshold");
-        }
+                          const mkldnn::engine engine, Place cpu_place,
+                          const framework::Tensor* in_x)
+      : platform::MKLDNNHandlerNoCachingT<T, mkldnn::eltwise_forward,
+                                          mkldnn::eltwise_backward>(engine,
+                                                                    cpu_place) {
+    float alpha = ctx.HasAttr("alpha") ? ctx.Attr<float>("alpha") : 0;
+    float beta = ctx.HasAttr("beta") ? ctx.Attr<float>("beta") : 0;
+    // eltwise_linear means we are in scale op
+    if (algorithm == mkldnn::algorithm::eltwise_linear) {
+      bool bias_after_scale = ctx.Attr<bool>("bias_after_scale");
+      auto* scale_tensor = ctx.Input<Tensor>("ScaleTensor");
+      alpha = (scale_tensor == nullptr) ? ctx.Attr<float>("scale")
+                                        : (float)*(scale_tensor->data<T>());
+      beta = ctx.Attr<float>("bias");
+      // if bias_after_scale == true
+      //   out = scale*X + bias
+      // else
+      //   out = scale*(X + bias) = scale*X + scale*bias
+      if (!bias_after_scale) beta *= alpha;
+    } else {
+      // paddle uses beta but mkldnn uses alpha for swish
+      if (algorithm == mkldnn::algorithm::eltwise_swish) {
+        std::swap(alpha, beta);
+      } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) {
+        alpha = ctx.Attr<float>("threshold");
       }
+    }
 
-      PADDLE_ENFORCE(in_x->dims().size() >= 1 || in_x->dims().size() <= 6,
-                     platform::errors::Unimplemented(
-                         "Input dimension size can be 1, 2, 3, 4, "
-                         "5, or 6, but now the dimension size is",
-                         in_x->dims().size()));
+    PADDLE_ENFORCE(in_x->dims().size() >= 1 && in_x->dims().size() <= 6,
+                   platform::errors::Unimplemented(
+                       "Input dimension size can be 1, 2, 3, 4, "
+                       "5, or 6, but now the dimension size is %d.",
+                       in_x->dims().size()));  // rank must be in [1, 6]
 
-      auto src_tz = framework::vectorize<int64_t>(in_x->dims());
-      auto src_fmt =
-          src_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format();
-      auto md = mkldnn::memory::desc(src_tz, platform::MKLDNNGetDataType<T>(),
-                                     src_fmt);
+    auto src_tz = framework::vectorize<int64_t>(in_x->dims());
+    auto src_fmt = src_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format();
+    auto md =
+        mkldnn::memory::desc(src_tz, platform::MKLDNNGetDataType<T>(), src_fmt);
 
-      this->AcquireForwardPrimitiveDescriptor(
-          mkldnn::prop_kind::forward_training, algorithm, md, alpha, beta);
-    }
+    this->AcquireForwardPrimitiveDescriptor(mkldnn::prop_kind::forward_training,
+                                            algorithm, md, alpha, beta);
   }
 
   ActivationMKLDNNHandler(mkldnn::algorithm algorithm,
                           const framework::ExecutionContext& ctx,
-                          const MKLDNNDeviceContext& dev_ctx, Place cpu_place,
-                          const framework::Tensor* in_x, const Tensor* out_grad,
-                          const std::string& unique_name)
-      : platform::MKLDNNHandlerT<T, mkldnn::eltwise_forward,
-                                 mkldnn::eltwise_backward>(
-            dev_ctx, dev_ctx.GetEngine(), cpu_place,
-            platform::CreateKey(dev_ctx, framework::vectorize(in_x->dims()),
-                                "a", unique_name)) {
-    if (!this->isBwdCached()) {
-      float alpha = ctx.HasAttr("alpha") ? ctx.Attr<float>("alpha") : 0;
-      float beta = ctx.HasAttr("beta") ? ctx.Attr<float>("beta") : 0;
-
-      // paddle uses beta but mkldnn uses alpha for swish
-      if (algorithm == mkldnn::algorithm::eltwise_swish) {
-        std::swap(alpha, beta);
-      } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) {
-        alpha = ctx.Attr<float>("threshold");
-      }
+                          const mkldnn::engine engine, Place cpu_place,
+                          const framework::Tensor* in_x, const Tensor* out_grad)
+      : platform::MKLDNNHandlerNoCachingT<T, mkldnn::eltwise_forward,
+                                          mkldnn::eltwise_backward>(engine,
+                                                                    cpu_place) {
+    float alpha = ctx.HasAttr("alpha") ? ctx.Attr<float>("alpha") : 0;
+    float beta = ctx.HasAttr("beta") ? ctx.Attr<float>("beta") : 0;
+
+    // paddle uses beta but mkldnn uses alpha for swish
+    if (algorithm == mkldnn::algorithm::eltwise_swish) {
+      std::swap(alpha, beta);
+    } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) {
+      alpha = ctx.Attr<float>("threshold");
+    }
 
-      auto diff_dst_tz = framework::vectorize<int64_t>(out_grad->dims());
+    auto diff_dst_tz = framework::vectorize<int64_t>(out_grad->dims());
 
-      auto src_fmt =
-          diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format();
-      auto diff_fmt =
-          diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : out_grad->format();
+    auto src_fmt =
+        diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format();
+    auto diff_fmt =
+        diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : out_grad->format();
 
-      auto dims = framework::vectorize(in_x->dims());
-      auto diff_dst_md = platform::MKLDNNMemDesc(
-          dims, platform::MKLDNNGetDataType<T>(), diff_fmt);
-      auto src_md = platform::MKLDNNMemDesc(
-          dims, platform::MKLDNNGetDataType<T>(), src_fmt);
+    auto dims = framework::vectorize(in_x->dims());
+    auto diff_dst_md = platform::MKLDNNMemDesc(
+        dims, platform::MKLDNNGetDataType<T>(), diff_fmt);
+    auto src_md = platform::MKLDNNMemDesc(
+        dims, platform::MKLDNNGetDataType<T>(), src_fmt);
 
-      this->AcquireForwardPrimitiveDescriptor(
-          mkldnn::prop_kind::forward_training, algorithm, src_md, alpha, beta);
-      this->AcquireBackwardPrimitiveDescriptor(algorithm, diff_dst_md, src_md,
-                                               alpha, beta);
-    }
+    this->AcquireForwardPrimitiveDescriptor(mkldnn::prop_kind::forward_training,
+                                            algorithm, src_md, alpha, beta);
+    this->AcquireBackwardPrimitiveDescriptor(algorithm, diff_dst_md, src_md,
+                                             alpha, beta);
   }
 
   std::shared_ptr<mkldnn::memory> AcquireBackwardSrcMemory(
       const framework::Tensor* input) {
     const T* input_data = input->data<T>();
     return this->AcquireMemoryFromPrimitive(this->bwd_pd_->src_desc(),
-                                            to_void_cast<T>(input_data),
-                                            "@bwd-src_mem_p");
+                                            to_void_cast<T>(input_data));
   }
 };
 
@@ -1430,11 +1619,6 @@ using ConvMKLDNNHandler =
                               mkldnn::convolution_backward_data,
                               mkldnn::convolution_backward_weights>;
 
-using ConvTransposeMKLDNNHandler =
-    ConvMKLDNNTemplateHandler<mkldnn::deconvolution_forward,
-                              mkldnn::deconvolution_backward_data,
-                              mkldnn::deconvolution_backward_weights>;
-
 template <typename T>
 static std::shared_ptr<mkldnn::memory> SetDstMemory(
     const framework::ExecutionContext& ctx, framework::Tensor* output,
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 0b6af3b542395d..6c4213979a46be 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -785,7 +785,8 @@ void BindImperative(py::module *m_ptr) {
              const int size = PyTuple_GET_SIZE(index_ptr);
              for (int dim = 0; dim < size; ++dim) {
                PyObject *slice_item = PyTuple_GetItem(index_ptr, dim);
-               if (!(PyCheckInteger(slice_item) || PySlice_Check(slice_item))) {
+               if (!(PyCheckInteger(slice_item) || PySlice_Check(slice_item) ||
+                     slice_item == Py_Ellipsis || slice_item == Py_None)) {
                  parse_index = false;
                  break;
                }
@@ -807,7 +808,8 @@ void BindImperative(py::module *m_ptr) {
                    {"starts", starts},
                    {"ends", ends},
                    {"steps", steps},
-                   {"decrease_axes", decrease_axes}};
+                   {"decrease_axes", decrease_axes},
+                   {"none_axes", none_axes}};
 
                imperative::NameVarBaseMap ins = {{"Input", {self}}};
                imperative::NameVarBaseMap outs = {{"Out", {self}}};
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
index 6208b43c9e9e48..b8c0e47e9bbc26 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
@@ -42,18 +42,17 @@ class ImperativeQuantAware(object):
     Applying quantization aware training (QAT) to the dgraph model.
     """
 
-    def __init__(
-            self,
-            quantizable_layer_type=['Conv2D', 'Linear', 'Conv2DTranspose'],
-            weight_quantize_type='abs_max',
-            activation_quantize_type='moving_average_abs_max',
-            weight_bits=8,
-            activation_bits=8,
-            moving_rate=0.9,
-            weight_preprocess_layer=None,
-            act_preprocess_layer=None,
-            weight_quantize_layer=None,
-            act_quantize_layer=None):
+    def __init__(self,
+                 quantizable_layer_type=['Conv2D', 'Linear'],
+                 weight_quantize_type='abs_max',
+                 activation_quantize_type='moving_average_abs_max',
+                 weight_bits=8,
+                 activation_bits=8,
+                 moving_rate=0.9,
+                 weight_preprocess_layer=None,
+                 act_preprocess_layer=None,
+                 weight_quantize_layer=None,
+                 act_quantize_layer=None):
         """
         The constructor for ImperativeQuantAware.
 
@@ -213,44 +212,9 @@ def quantize(self, model):
         the out_scale value of outputs would be calculated.
 
         Args:
-            model(paddle.nn.Layer): the model to be quantized.
+            model(fluid.dygraph.Layer): the model to be quantized.
         Returns:
             None
-
-        Examples:
-        .. code-block:: python
-
-            import paddle
-            from paddle.fluid.contrib.slim.quantization \
-                import ImperativeQuantAware
-
-            class ImperativeModel(paddle.nn.Layer):
-                def __init__(self):
-                    super(ImperativeModel, self).__init__()
-                    # self.linear_0 would skip the quantization.
-                    self.linear_0 = paddle.nn.Linear(784, 400)
-                    self.linear_0.skip_quant = True
-
-                    # self.linear_1 would not skip the quantization.
-                    self.linear_1 = paddle.nn.Linear(400, 10)
-                    self.linear_1.skip_quant = False
-
-                def forward(self, inputs):
-                    x = self.linear_0(inputs)
-                    x = self.linear_1(inputs)
-                    return x
-
-            model = ImperativeModel()
-            imperative_qat = ImperativeQuantAware(
-                weight_quantize_type='abs_max',
-                activation_quantize_type='moving_average_abs_max')
-
-            # Add the fake quant logical.
-            # The original model will be rewrite.
-            #
-            # There is only one Layer(self.linear1) would be added the
-            # fake quant logical.
-            imperative_qat.quantize(model)
         """
         assert isinstance(model, dygraph.Layer), \
             "The model must be the instance of dygraph.Layer."
@@ -268,18 +232,17 @@ class ImperativeQuantizeInputs(object):
     logic both for activation inputs and weight inputs.
     """
 
-    def __init__(
-            self,
-            quantizable_layer_type=['Conv2D', 'Linear', 'Conv2DTranspose'],
-            weight_quantize_type='abs_max',
-            activation_quantize_type='moving_average_abs_max',
-            weight_bits=8,
-            activation_bits=8,
-            moving_rate=0.9,
-            weight_preprocess_layer=None,
-            act_preprocess_layer=None,
-            weight_quantize_layer=None,
-            act_quantize_layer=None):
+    def __init__(self,
+                 quantizable_layer_type=['Conv2D', 'Linear'],
+                 weight_quantize_type='abs_max',
+                 activation_quantize_type='moving_average_abs_max',
+                 weight_bits=8,
+                 activation_bits=8,
+                 moving_rate=0.9,
+                 weight_preprocess_layer=None,
+                 act_preprocess_layer=None,
+                 weight_quantize_layer=None,
+                 act_quantize_layer=None):
         """
         The constructor for ImperativeQuantizeInputs. 
 
@@ -340,18 +303,6 @@ def __init__(
         }
 
     def apply(self, model):
-        """
-        Quantize the weights and activations to calculate for specific 
-        layers.
-
-        Args:
-            model(paddle.nn.Layer): The target model which would
-                calculate the input quantization scale.
-
-        Returns:
-            None
-        """
-
         assert isinstance(model, dygraph.Layer), \
             "The model must be the instance of dygraph.Layer."
 
@@ -403,7 +354,7 @@ def apply(self, model):
         output scales for specific layers in the dygraph model.
 
         Args:
-            model(paddle.nn.Layer): The target model which would be
+            model(fluid.dygraph.Layer): The target model which would be
                 calculate the output quantization scale.
 
         Returns:
@@ -593,9 +544,7 @@ def _is_skip_quant_op(self, block, in_op):
         1. the type of input op should be conv2d, depthwise_conv2d or matmul
         2. the previous ops of the input op are not fake_quantize_dequantize ops
         """
-        target_op_types = [
-            "conv2d", "depthwise_conv2d", "matmul", "conv2d_transpose"
-        ]
+        target_op_types = ["conv2d", "depthwise_conv2d", "matmul"]
         if in_op.type not in target_op_types:
             return False
 
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
index 009ce372b4f29c..a9d52c5a87ad36 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
@@ -24,7 +24,6 @@
 from ..quantization_pass import _get_input_name_index
 
 layer_name_map = {
-    'Conv2DTranspose': paddle.nn.Conv2DTranspose,
     'Conv2D': paddle.nn.Conv2D,
     'Linear': paddle.nn.Linear,
     'AdaptiveAvgPool2D': paddle.nn.AdaptiveAvgPool2D,
@@ -47,9 +46,8 @@
 }
 
 # Apply fake quant for the inputs of these layers
-fake_quant_input_layers = [
-    paddle.nn.Conv2D, paddle.nn.Linear, paddle.nn.Conv2DTranspose
-]
+# TODO (jc): support paddle.nn.Conv2DTranspose
+fake_quant_input_layers = [paddle.nn.Conv2D, paddle.nn.Linear]
 
 # Apply fake quant for the output of these layers
 # TODO(jc): fix the problem of adding duplicate fake_quant ops
@@ -67,8 +65,7 @@
 ]
 
 fake_quant_wrap_layers = [
-    quant_layers.QuantizedConv2D, quant_layers.QuantizedLinear,
-    quant_layers.QuantizedConv2DTranspose
+    quant_layers.QuantizedConv2D, quant_layers.QuantizedLinear
 ]
 
 # The weight format of these layers is Cin * Cout * H * W 
@@ -87,9 +84,9 @@
 
 
 def load_variable_data(scope, var_name):
-    """
+    '''
     Load variable value from scope
-    """
+    '''
     var_node = scope.find_var(var_name)
     assert var_node is not None, \
         "Can not find " + var_name + " in the scope."
@@ -123,12 +120,6 @@ def find_parent_layer_and_sub_name(model, name):
     the sub_name of the layer.
     For example, if name is 'block_1/convbn_1/conv_1', the parent layer is
     'block_1/convbn_1' and the sub_name is `conv_1`.
-    Args:
-        model(paddle.nn.Layer): the model to be quantized.
-        name(string): the name of a layer
-
-    Returns:
-        parent_layer, subname
     """
     assert isinstance(model, paddle.nn.Layer), \
             "The model must be the instance of paddle.nn.Layer."
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index 9917730daa543f..c2d7a9bb4d5174 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -1273,12 +1273,17 @@ def _insert_post_channel_dequant_op(self, graph, op_node, quant_axis):
             var_type=output_var_node.type(),
             shape=output_var_node.shape(),
             var_dtype=output_var_node.dtype())
+        if op_node.op().has_attr("x_num_col_dims"):
+            x_num_col_dims = op_node.op().attr("x_num_col_dims")
+        else:
+            x_num_col_dims = 1
         dequant_op_node = graph.create_op_node(
             op_type='fake_channel_wise_dequantize_max_abs',
             attrs={
                 'quant_bits': [self._weight_bits, self._activation_bits],
                 'quant_axis': quant_axis,
-                'op_role': core.op_proto_and_checker_maker.OpRole.Forward
+                'op_role': core.op_proto_and_checker_maker.OpRole.Forward,
+                'x_num_col_dims': x_num_col_dims
             },
             inputs={
                 'X': output_var_node,
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
index 677ccb52e242cf..14fa291ee077c6 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
@@ -28,10 +28,10 @@
 from paddle.fluid.optimizer import AdamOptimizer
 from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware
 from paddle.fluid.dygraph.container import Sequential
-from paddle.nn import Linear, Conv2D, Softmax, Conv2DTranspose
+from paddle.nn import Linear, Conv2D, Softmax
 from paddle.fluid.log_helper import get_logger
 from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
-from paddle.nn.quant.quant_layers import QuantizedConv2D, QuantizedConv2DTranspose
+from paddle.nn.quant.quant_layers import QuantizedConv2D
 
 from imperative_test_utils import fix_model_dict, ImperativeLenet
 
@@ -75,12 +75,6 @@ def test_qat(self):
             data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
             quant_conv1(fluid.dygraph.to_variable(data))
 
-            conv_transpose = Conv2DTranspose(4, 6, (3, 3))
-            quant_conv_transpose = QuantizedConv2DTranspose(conv_transpose)
-            x_var = paddle.uniform(
-                (2, 4, 8, 8), dtype='float32', min=-1.0, max=1.0)
-            quant_conv_transpose(x_var)
-
             seed = 1
             np.random.seed(seed)
             fluid.default_main_program().random_seed = seed
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py
index 270e8ee566ab57..621213beb31cd7 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py
@@ -28,7 +28,6 @@
 from paddle.fluid.dygraph import Conv2D
 from paddle.fluid.dygraph import Pool2D
 from paddle.fluid.dygraph import Linear
-from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose
 from paddle.fluid.log_helper import get_logger
 
 os.environ["CPU_NUM"] = "1"
@@ -101,19 +100,6 @@ def dequantize(x, lower_bound, delta, interval):
         return x
 
 
-class ModelForConv2dT(nn.Layer):
-    def __init__(self, num_classes=10):
-        super(ModelForConv2dT, self).__init__()
-        self.features = nn.Conv2DTranspose(4, 6, (3, 3))
-        self.fc = Linear(input_dim=600, output_dim=num_classes)
-
-    def forward(self, inputs):
-        x = self.features(inputs)
-        x = paddle.flatten(x, 1)
-        x = self.fc(x)
-        return x
-
-
 class ImperativeLenet(paddle.nn.Layer):
     def __init__(self, num_classes=10, classifier_activation='softmax'):
         super(ImperativeLenet, self).__init__()
@@ -182,11 +168,6 @@ def test_quant_aware_training(self):
         imperative_qat.quantize(lenet)
         adam = Adam(learning_rate=0.001, parameters=lenet.parameters())
         dynamic_loss_rec = []
-        #for CI coverage
-        conv_transpose = ModelForConv2dT()
-        imperative_qat.quantize(conv_transpose)
-        x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.)
-        conv_transpose(x_var)
 
         def train(model):
             adam = Adam(learning_rate=0.001, parameters=model.parameters())
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
index 607a3c94f8a4e7..9a21a5a850db97 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
@@ -825,6 +825,8 @@ def _minimize(self,
         opt_info["worker_skipped_ops"] = worker_skipped_ops
         opt_info["use_cvm"] = strategy.get("use_cvm", False)
         opt_info["no_cvm"] = strategy.get("no_cvm", False)
+        opt_info["scale_sparse_gradient_with_batch_size"] = strategy.get(
+            "scale_sparse_gradient_with_batch_size", True)
         opt_info["worker_class"] = strategy.get("worker_class",
                                                 "DownpourWorker")
         opt_info["stat_var_names"] = strategy.get("stat_var_names", [])
diff --git a/python/paddle/fluid/tests/unittests/npu/test_size_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_size_op_npu.py
new file mode 100755
index 00000000000000..80721cbd66a558
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/npu/test_size_op_npu.py
@@ -0,0 +1,141 @@
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import sys
+sys.path.append("..")
+import paddle
+import paddle.fluid as fluid
+from op_test import OpTest
+
+paddle.enable_static()
+
+
+class TestSizeOp(OpTest):  # base "size" op test; subclasses override config()
+    def setUp(self):
+        self.set_npu()
+        self.place = paddle.NPUPlace(0)
+        self.op_type = "size"
+
+        self.config()  # sets self.shape and self.dtype
+        input = np.zeros(self.shape, dtype=self.dtype)
+        self.inputs = {'Input': input}
+        self.outputs = {'Out': np.array([np.size(input)], dtype=np.int64)}
+
+    def config(self):  # default case: shape [1, 2], int32
+        self.shape = [1, 2]
+        self.dtype = np.int32
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+    def set_npu(self):
+        self.__class__.use_npu = True  # flag is set on the class, not the instance
+
+
+class TestSizeOp1(TestSizeOp):  # 1-D float64 input
+    def config(self):
+        self.shape = [2]
+        self.dtype = np.float64
+
+
+class TestSizeOp2(TestSizeOp):  # 2-D float32 input
+    def config(self):
+        self.shape = [2, 3]
+        self.dtype = np.float32
+
+
+class TestSizeOp3(TestSizeOp):  # 3-D float16 input
+    def config(self):
+        self.shape = [2, 3, 100]
+        self.dtype = np.float16
+
+
+class TestSizeOp4(TestSizeOp):  # 1-D boolean input with 1024 elements
+    def config(self):
+        self.shape = [2**10]
+        self.dtype = np.bool_  # np.bool alias was removed in NumPy 1.24
+
+
+class TestSizeOp5(TestSizeOp):  # 4-D int64 input
+    def config(self):
+        self.shape = [7, 8, 9, 10]
+        self.dtype = np.int64
+
+
+class TestSizeOp6(TestSizeOp):  # empty shape list: np.zeros([]) is 0-d, np.size is 1
+    def config(self):
+        self.shape = []
+        self.dtype = np.int64
+
+
+class TestSizeAPI(unittest.TestCase):  # paddle.fluid.layers.size in static and dygraph mode
+    def setUp(self):
+        self.set_npu()
+        self.place = paddle.NPUPlace(0)
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def test_size_static(self):
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+            shape1 = [2, 1, 4, 5]
+            shape2 = [1, 4, 5]
+            x_1 = paddle.fluid.data(shape=shape1, dtype='int32', name='x_1')
+            x_2 = paddle.fluid.data(shape=shape2, dtype='int32', name='x_2')
+            input_1 = np.random.random(shape1).astype("int32")  # values in [0, 1) cast to int32
+            input_2 = np.random.random(shape2).astype("int32")
+            out_1 = paddle.fluid.layers.size(x_1)
+            out_2 = paddle.fluid.layers.size(x_2)
+            exe = paddle.static.Executor(place=self.place)
+            res_1, res_2 = exe.run(feed={
+                "x_1": input_1,
+                "x_2": input_2,
+            },
+                                   fetch_list=[out_1, out_2])
+            assert (np.array_equal(  # fetched result is compared as a 1-element int64 array
+                res_1, np.array([np.size(input_1)]).astype("int64")))
+            assert (np.array_equal(
+                res_2, np.array([np.size(input_2)]).astype("int64")))
+
+    def test_size_imperative(self):
+        paddle.disable_static(self.place)  # dygraph mode on the NPU place
+        input_1 = np.random.random([2, 1, 4, 5]).astype("int32")
+        input_2 = np.random.random([1, 4, 5]).astype("int32")
+        x_1 = paddle.to_tensor(input_1)
+        x_2 = paddle.to_tensor(input_2)
+        out_1 = paddle.fluid.layers.size(x_1)
+        out_2 = paddle.fluid.layers.size(x_2)
+        assert (np.array_equal(out_1.numpy().item(0), np.size(input_1)))
+        assert (np.array_equal(out_2.numpy().item(0), np.size(input_2)))
+        paddle.enable_static()  # not reached if an assert above fails
+
+    def test_error(self):
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+
+            def test_x_type():
+                shape = [1, 4, 5]
+                input_1 = np.random.random(shape).astype("int32")
+                out_1 = paddle.fluid.layers.size(input_1)  # numpy array passed instead of a Variable
+
+            self.assertRaises(TypeError, test_x_type)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/trainer_desc.py b/python/paddle/fluid/trainer_desc.py
index 4eca3a494e25a4..6152bce55ce9f2 100644
--- a/python/paddle/fluid/trainer_desc.py
+++ b/python/paddle/fluid/trainer_desc.py
@@ -124,6 +124,10 @@ def _set_use_cvm(self, use_cvm=False):
     def _set_no_cvm(self, no_cvm=False):
         self.proto_desc.no_cvm = no_cvm
 
+    def _set_scale_sparse_grad_with_batch_size(
+            self, scale_sparse_gradient_with_batch_size=True):  # copies the flag into proto_desc
+        self.proto_desc.scale_sparse_gradient_with_batch_size = scale_sparse_gradient_with_batch_size
+
     def _set_scale_datanorm(self, scale_datanorm=-1):
         self.proto_desc.scale_datanorm = scale_datanorm
 
diff --git a/python/paddle/fluid/trainer_factory.py b/python/paddle/fluid/trainer_factory.py
index 7912ffca84ba41..ed10bee2e063a7 100644
--- a/python/paddle/fluid/trainer_factory.py
+++ b/python/paddle/fluid/trainer_factory.py
@@ -95,6 +95,10 @@ def _create_trainer(self, opt_info=None):
                     trainer._set_use_cvm(opt_info["use_cvm"])
                 if opt_info.get("no_cvm") is not None:
                     trainer._set_no_cvm(opt_info["no_cvm"])
+                if opt_info.get(
+                        "scale_sparse_gradient_with_batch_size") is not None:
+                    trainer._set_scale_sparse_grad_with_batch_size(opt_info[
+                        "scale_sparse_gradient_with_batch_size"])
                 if opt_info.get("scale_datanorm") is not None:
                     trainer._set_scale_datanorm(opt_info["scale_datanorm"])
                 if opt_info.get("adjust_ins_weight") is not None:
diff --git a/python/paddle/nn/quant/quant_layers.py b/python/paddle/nn/quant/quant_layers.py
index 040b04f5e7bf1e..5573683ebd0458 100644
--- a/python/paddle/nn/quant/quant_layers.py
+++ b/python/paddle/nn/quant/quant_layers.py
@@ -31,7 +31,6 @@
     'FakeQuantMovingAverageAbsMax',
     'FakeQuantChannelWiseAbsMax',
     'QuantizedConv2D',
-    'QuantizedConv2DTranspose',
     'QuantizedLinear',
     'MovingAverageAbsMaxScale',
     'MAOutputScaleLayer',
@@ -482,112 +481,6 @@ def forward(self, input):
             data_format=self._data_format)
 
 
-class QuantizedConv2DTranspose(layers.Layer):
-    """
-    The computational logic of QuantizedConv2DTranspose is the same with Conv2DTranspose.
-    The only difference is that its inputs are all fake quantized.
-    
-    Examples:
-       .. code-block:: python
-          import paddle
-          import paddle.nn as nn
-          from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose
-          x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.)
-          conv = nn.Conv2DTranspose(4, 6, (3, 3))
-          conv_quantized = QuantizedConv2DTranspose(conv)
-          y_quantized = conv_quantized(x_var)
-          y_var = conv(x_var)
-          y_quantized_np = y_quantized.numpy()
-          y_np = y_var.numpy()
-          print(y_np.shape, y_quantized_np.shape)
-          # (2, 6, 10, 10), (2, 6, 10, 10)
-    """
-
-    def __init__(self,
-                 layer,
-                 weight_bits=8,
-                 activation_bits=8,
-                 moving_rate=0.9,
-                 weight_quantize_type='abs_max',
-                 activation_quantize_type='abs_max',
-                 weight_pre_layer=None,
-                 act_pre_layer=None,
-                 weight_quant_layer=None,
-                 act_quant_layer=None):
-        r"""
-        Constructor.
-
-        The arguments are the same as ImperativeQuantAware.
-        """
-        super(QuantizedConv2DTranspose, self).__init__()
-        # For Conv2DTranspose
-        self._groups = getattr(layer, '_groups')
-        self._stride = getattr(layer, '_stride')
-        self._padding = getattr(layer, '_padding')
-        self._output_padding = getattr(layer, 'output_padding')
-        self._dilation = getattr(layer, '_dilation')
-        self._data_format = getattr(layer, '_data_format')
-        self.weight = getattr(layer, 'weight')
-        self.bias = getattr(layer, 'bias')
-        # For FakeQuant
-        self._conv2d_transpose_quant_axis = 1
-        if weight_quant_layer is not None:
-            self._fake_quant_weight = weight_quant_layer()
-        else:
-            self._fake_quant_weight = _get_fake_quant_type(
-                weight_quantize_type,
-                name=self.weight.name,
-                moving_rate=moving_rate,
-                quant_bits=weight_bits,
-                dtype=self._dtype,
-                quant_on_weight=True,
-                channel_num=self.weight.shape[
-                    self._conv2d_transpose_quant_axis],
-                quant_axis=self._conv2d_transpose_quant_axis)
-        if act_quant_layer is not None:
-            self._fake_quant_input = act_quant_layer()
-        else:
-            self._fake_quant_input = _get_fake_quant_type(
-                activation_quantize_type,
-                name=layer.full_name(),
-                moving_rate=moving_rate,
-                quant_bits=activation_bits,
-                dtype=self._dtype,
-                quant_on_weight=False)
-
-        self._act_preprocess = act_pre_layer(
-        ) if act_pre_layer is not None else None
-        self._weight_preprocess = weight_pre_layer(
-        ) if weight_pre_layer is not None else None
-
-    def forward(self, input, output_size=None):
-        if self._act_preprocess is not None:
-            input = self._act_preprocess(input)
-        quant_input = self._fake_quant_input(input)
-
-        weight = self.weight
-        if self._weight_preprocess is not None:
-            weight = self._weight_preprocess(self.weight)
-        quant_weight = self._fake_quant_weight(weight)
-
-        if output_size is None:
-            output_padding = self._output_padding
-        else:
-            output_padding = 0
-
-        return F.conv2d_transpose(
-            quant_input,
-            quant_weight,
-            bias=self.bias,
-            padding=self._padding,
-            output_padding=output_padding,
-            stride=self._stride,
-            dilation=self._dilation,
-            groups=self._groups,
-            output_size=output_size,
-            data_format=self._data_format)
-
-
 class QuantizedLinear(layers.Layer):
     """
     The computational logic of QuantizedLinear is the same with Linear.
diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py
index d8cb70c9dd107b..3ec12c11a7045a 100644
--- a/tools/sampcd_processor.py
+++ b/tools/sampcd_processor.py
@@ -440,7 +440,6 @@ def get_filenames(full_test=False):
     '''
     global whl_error
     import paddle
-    import paddle.fluid.contrib.slim.quantization
     whl_error = []
     if full_test:
         get_full_api_from_pr_spec()

From 891d1e5d8c39e11a799fba2bc7bb828710560f0e Mon Sep 17 00:00:00 2001
From: XGZhang <46363693+XGZhang11@users.noreply.github.com>
Date: Tue, 17 Aug 2021 11:07:30 +0800
Subject: [PATCH 14/16] update fake_dequant op

---
 paddle/fluid/operators/fake_dequantize_op.cc | 8 ++++++--
 paddle/fluid/operators/fake_dequantize_op.cu | 2 +-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc
index c71710b79d1fbe..14ae6beb4e4a61 100644
--- a/paddle/fluid/operators/fake_dequantize_op.cc
+++ b/paddle/fluid/operators/fake_dequantize_op.cc
@@ -106,8 +106,7 @@ struct ChannelDequantizeFunctor<platform::CPUDeviceContext, T> {
             }
           }
         }
-      }
-      else {
+      } else {
         int batch_size = in->dims()[0];
         int channel = in->dims()[1];
         const T* scale_one = scales[0]->data<T>();
@@ -278,6 +277,11 @@ REGISTER_OP_CPU_KERNEL(fake_channel_wise_dequantize_max_abs,
                        ops::FakeChannelWiseDequantizeMaxAbsKernel<CPU, double>);
 
 REGISTER_OP_VERSION(fake_channel_wise_dequantize_max_abs)
+    .AddCheckpoint(
+        R"ROC(add new attributes [quant_axis] for applying per-channel "
+        "dequantization to conv2d_tranpose and mul ops.)ROC",
+        paddle::framework::compatible::OpVersionDesc().NewAttr(
+            "quant_axis", "The axis for dequantization.", 0))
     .AddCheckpoint(
         R"ROC(add new attributes [x_num_col_dims] for applying per-channel "
         "dequantization to mul ops.)ROC",
diff --git a/paddle/fluid/operators/fake_dequantize_op.cu b/paddle/fluid/operators/fake_dequantize_op.cu
index b1d2e220426f73..c88a8fe196edf8 100644
--- a/paddle/fluid/operators/fake_dequantize_op.cu
+++ b/paddle/fluid/operators/fake_dequantize_op.cu
@@ -119,7 +119,7 @@ struct ChannelDequantizeFunctor<platform::CUDADeviceContext, T> {
       int iter_size = 1;
       for (int i = 0; i < x_num_col_dims; i++) {
         iter_size *= in->dims()[i];
-        }
+      }
       int channel = in->dims()[x_num_col_dims];
       const T* scale_one = scales[0]->data<T>();
       const T* scale_two = scales[1]->data<T>();

From 5f047ff988ceae6c2e48e14b06c19f23d9c668b4 Mon Sep 17 00:00:00 2001
From: XGZhang <46363693+XGZhang11@users.noreply.github.com>
Date: Tue, 17 Aug 2021 12:11:33 +0000
Subject: [PATCH 15/16] register new attr in enhanced pass

---
 paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc   | 3 +++
 .../compat/fake_channel_wise_dequantize_max_abs.pbtxt         | 4 ++++
 2 files changed, 7 insertions(+)

diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc
index 354db8acf87a73..378d2231e329a3 100644
--- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc
@@ -115,6 +115,9 @@ QuantDequantFusePass::QuantDequantFusePass() {
       .AddAttr("quant_axis")
       .IsIntIn({0, 1})
       .IsOptional()
+      .AddAttr("x_num_col_dims")
+      .IsType<int>()
+      .IsOptional()
       .End();
   AddOpCompat(OpCompat("conv2d"))
       .AddInput("Input")
diff --git a/paddle/fluid/operators/compat/fake_channel_wise_dequantize_max_abs.pbtxt b/paddle/fluid/operators/compat/fake_channel_wise_dequantize_max_abs.pbtxt
index ec80ffaaf32ae1..c32c170ce65ab0 100644
--- a/paddle/fluid/operators/compat/fake_channel_wise_dequantize_max_abs.pbtxt
+++ b/paddle/fluid/operators/compat/fake_channel_wise_dequantize_max_abs.pbtxt
@@ -17,4 +17,8 @@ def {
     name: "quant_axis"
     type: INT
   }
+  attrs {
+    name: "x_num_col_dims"
+    type: INT
+  }
 }

From 76d7f0754a78862de3f71c9f2e784e5da5627a58 Mon Sep 17 00:00:00 2001
From: XGZhang <46363693+XGZhang11@users.noreply.github.com>
Date: Tue, 17 Aug 2021 20:26:27 +0800
Subject: [PATCH 16/16] Add End() before x_num_col_dims attr in quant_conv2d_dequant_fuse_pass.cc

---
 paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc
index 378d2231e329a3..5958728946c2ed 100644
--- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc
@@ -115,6 +115,7 @@ QuantDequantFusePass::QuantDequantFusePass() {
       .AddAttr("quant_axis")
       .IsIntIn({0, 1})
       .IsOptional()
+      .End()
       .AddAttr("x_num_col_dims")
       .IsType<int>()
       .IsOptional()