From f4f31f2396a7e6b168ece6527187da2be8911442 Mon Sep 17 00:00:00 2001 From: XGZhang <46363693+XGZhang11@users.noreply.github.com> Date: Mon, 2 Aug 2021 08:35:52 +0000 Subject: [PATCH 01/16] support quantization of conv2d_transpose --- .../slim/quantization/imperative/qat.py | 62 ++++++++----- .../slim/quantization/imperative/utils.py | 18 +++- python/paddle/nn/quant/quant_layers.py | 87 +++++++++++++++++++ 3 files changed, 140 insertions(+), 27 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index b8c0e47e9bbc26..32a3ebfe047030 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -42,17 +42,18 @@ class ImperativeQuantAware(object): Applying quantization aware training (QAT) to the dgraph model. """ - def __init__(self, - quantizable_layer_type=['Conv2D', 'Linear'], - weight_quantize_type='abs_max', - activation_quantize_type='moving_average_abs_max', - weight_bits=8, - activation_bits=8, - moving_rate=0.9, - weight_preprocess_layer=None, - act_preprocess_layer=None, - weight_quantize_layer=None, - act_quantize_layer=None): + def __init__( + self, + quantizable_layer_type=['Conv2D', 'Linear', 'Conv2DTranspose'], + weight_quantize_type='abs_max', + activation_quantize_type='moving_average_abs_max', + weight_bits=8, + activation_bits=8, + moving_rate=0.9, + weight_preprocess_layer=None, + act_preprocess_layer=None, + weight_quantize_layer=None, + act_quantize_layer=None): """ The constructor for ImperativeQuantAware. @@ -232,17 +233,18 @@ class ImperativeQuantizeInputs(object): logic both for activation inputs and weight inputs. """ - def __init__(self, - quantizable_layer_type=['Conv2D', 'Linear'], - weight_quantize_type='abs_max', - activation_quantize_type='moving_average_abs_max', - weight_bits=8, - activation_bits=8, - moving_rate=0.9, - weight_preprocess_layer=None, - act_preprocess_layer=None, - weight_quantize_layer=None, - act_quantize_layer=None): + def __init__( + self, + quantizable_layer_type=['Conv2D', 'Linear', 'Conv2DTranspose'], + weight_quantize_type='abs_max', + activation_quantize_type='moving_average_abs_max', + weight_bits=8, + activation_bits=8, + moving_rate=0.9, + weight_preprocess_layer=None, + act_preprocess_layer=None, + weight_quantize_layer=None, + act_quantize_layer=None): """ The constructor for ImperativeQuantizeInputs. @@ -303,6 +305,18 @@ def __init__(self, } def apply(self, model): + """ + Quantize the weights and activations to calculate for specific + layers in the dygraph model. + + Args: + model(fluid.dygraph.Layer): The target model which would + calculate the input quantization scale. + + Returns: + None + """ + assert isinstance(model, dygraph.Layer), \ "The model must be the instance of dygraph.Layer." @@ -544,7 +558,9 @@ def _is_skip_quant_op(self, block, in_op): 1. the type of input op should be conv2d, depthwise_conv2d or matmul 2. the previous ops of the input op are not fake_quantize_dequantize ops """ - target_op_types = ["conv2d", "depthwise_conv2d", "matmul"] + target_op_types = [ + "conv2d", "depthwise_conv2d", "matmul", "conv2d_transpose" + ] if in_op.type not in target_op_types: return False diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py index a9d52c5a87ad36..5a98ac80549f18 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py @@ -24,6 +24,7 @@ from ..quantization_pass import _get_input_name_index layer_name_map = { + 'Conv2DTranspose': paddle.nn.Conv2DTranspose, 'Conv2D': paddle.nn.Conv2D, 'Linear': paddle.nn.Linear, 'AdaptiveAvgPool2D': paddle.nn.AdaptiveAvgPool2D, @@ -47,7 +48,9 @@ # Apply fake quant for the inputs of these layers # TODO (jc): support paddle.nn.Conv2DTranspose -fake_quant_input_layers = [paddle.nn.Conv2D, paddle.nn.Linear] +fake_quant_input_layers = [ + paddle.nn.Conv2D, paddle.nn.Linear, paddle.nn.Conv2DTranspose +] # Apply fake quant for the output of these layers # TODO(jc): fix the problem of adding duplicate fake_quant ops @@ -65,7 +68,8 @@ ] fake_quant_wrap_layers = [ - quant_layers.QuantizedConv2D, quant_layers.QuantizedLinear + quant_layers.QuantizedConv2D, quant_layers.QuantizedLinear, + quant_layers.QuantizedConv2DTranspose ] # The weight format of these layers is Cin * Cout * H * W @@ -84,9 +88,9 @@ def load_variable_data(scope, var_name): - ''' + """ Load variable value from scope - ''' + """ var_node = scope.find_var(var_name) assert var_node is not None, \ "Can not find " + var_name + " in the scope." @@ -120,6 +124,12 @@ def find_parent_layer_and_sub_name(model, name): the sub_name of the layer. For example, if name is 'block_1/convbn_1/conv_1', the parent layer is 'block_1/convbn_1' and the sub_name is `conv_1`. + Args: + model(fluid.dygraph.Layer): the model to be quantized. + name(string): the name of a layer + + Returns: + parent_layer, subname """ assert isinstance(model, paddle.nn.Layer), \ "The model must be the instance of paddle.nn.Layer." diff --git a/python/paddle/nn/quant/quant_layers.py b/python/paddle/nn/quant/quant_layers.py index 5573683ebd0458..cc98f73174b4b3 100644 --- a/python/paddle/nn/quant/quant_layers.py +++ b/python/paddle/nn/quant/quant_layers.py @@ -31,6 +31,7 @@ 'FakeQuantMovingAverageAbsMax', 'FakeQuantChannelWiseAbsMax', 'QuantizedConv2D', + 'QuantizedConv2DTranspose', 'QuantizedLinear', 'MovingAverageAbsMaxScale', 'MAOutputScaleLayer', @@ -481,6 +482,92 @@ def forward(self, input): data_format=self._data_format) +class QuantizedConv2DTranspose(layers.Layer): + """ + The computational logic of QuantizedConv2DTranspose is the same with Conv2DTranspose. + The only difference is that its inputs are all fake quantized. + """ + + def __init__(self, + layer, + weight_bits=8, + activation_bits=8, + moving_rate=0.9, + weight_quantize_type='abs_max', + activation_quantize_type='abs_max', + weight_pre_layer=None, + act_pre_layer=None, + weight_quant_layer=None, + act_quant_layer=None): + super(QuantizedConv2DTranspose, self).__init__() + # For Conv2DTranspose + self._groups = getattr(layer, '_groups') + self._stride = getattr(layer, '_stride') + self._padding = getattr(layer, '_padding') + self._output_padding = getattr(layer, 'output_padding') + self._dilation = getattr(layer, '_dilation') + self._data_format = getattr(layer, '_data_format') + self.weight = getattr(layer, 'weight') + self.bias = getattr(layer, 'bias') + # For FakeQuant + self._conv2d_transpose_quant_axis = 1 + if weight_quant_layer is not None: + self._fake_quant_weight = weight_quant_layer() + else: + self._fake_quant_weight = _get_fake_quant_type( + weight_quantize_type, + name=self.weight.name, + moving_rate=moving_rate, + quant_bits=weight_bits, + dtype=self._dtype, + quant_on_weight=True, + channel_num=self.weight.shape[ + self._conv2d_transpose_quant_axis], + quant_axis=self._conv2d_transpose_quant_axis) + if act_quant_layer is not None: + self._fake_quant_input = act_quant_layer() + else: + self._fake_quant_input = _get_fake_quant_type( + activation_quantize_type, + name=layer.full_name(), + moving_rate=moving_rate, + quant_bits=activation_bits, + dtype=self._dtype, + quant_on_weight=False) + + self._act_preprocess = act_pre_layer( + ) if act_pre_layer is not None else None + self._weight_preprocess = weight_pre_layer( + ) if weight_pre_layer is not None else None + + def forward(self, input, output_size=None): + if self._act_preprocess is not None: + input = self._act_preprocess(input) + quant_input = self._fake_quant_input(input) + + weight = self.weight + if self._weight_preprocess is not None: + weight = self._weight_preprocess(self.weight) + quant_weight = self._fake_quant_weight(weight) + + if output_size is None: + output_padding = self._output_padding + else: + output_padding = 0 + + return F.conv2d_transpose( + quant_input, + quant_weight, + bias=self.bias, + padding=self._padding, + output_padding=output_padding, + stride=self._stride, + dilation=self._dilation, + groups=self._groups, + output_size=output_size, + data_format=self._data_format) + + class QuantizedLinear(layers.Layer): """ The computational logic of QuantizedLinear is the same with Linear. From ac21a6041534a459c3dce4a0a75f375cb50e6e17 Mon Sep 17 00:00:00 2001 From: XGZhang <46363693+XGZhang11@users.noreply.github.com> Date: Thu, 5 Aug 2021 11:17:43 +0000 Subject: [PATCH 02/16] fix quantization bugs --- .../contrib/slim/quantization/post_training_quantization.py | 2 ++ .../fluid/contrib/slim/quantization/quantization_pass.py | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index 5996e752c8c22d..5272d9f59903d7 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -578,6 +578,8 @@ def _sample_mse(self): var_tensor = _load_variable_data(self._scope, var_name) var_tensor = var_tensor.flatten() abs_max_value = float(np.max(np.abs(var_tensor))) + if abs_max_value == 0.0: + abs_max_value = 1e-8 s = 0.3 if var_name not in self._best_mse_loss: self._best_mse_loss[var_name] = float('inf') diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index b3b12a477e2a0a..857486b3fc46cc 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -1312,6 +1312,8 @@ def _insert_post_dequant_op(self, graph, op_node): assert self._is_float( scale_v), 'The scale of parameter %s is not a float.' % ( original_var_name) + if scale_v == 0.0: + scale_v = 1e-8 max_range *= param_range / scale_v else: max_range *= act_range @@ -1413,6 +1415,8 @@ def _clip(x, scale): x[:, i] = _clip(x[:, i], s) x[:, i] = np.round(x[:, i] / s * bnt) else: + if scale == 0.0: + scale = 1e-8 x = _clip(x, scale) x = np.round(x / scale * bnt) return x From 350048ef509d720d31782e607f571e8e0719e4f6 Mon Sep 17 00:00:00 2001 From: XGZhang <46363693+XGZhang11@users.noreply.github.com> Date: Sun, 8 Aug 2021 16:27:40 +0800 Subject: [PATCH 03/16] Update post_training_quantization.py --- .../contrib/slim/quantization/post_training_quantization.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index 5272d9f59903d7..5996e752c8c22d 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -578,8 +578,6 @@ def _sample_mse(self): var_tensor = _load_variable_data(self._scope, var_name) var_tensor = var_tensor.flatten() abs_max_value = float(np.max(np.abs(var_tensor))) - if abs_max_value == 0.0: - abs_max_value = 1e-8 s = 0.3 if var_name not in self._best_mse_loss: self._best_mse_loss[var_name] = float('inf') From cdfa3fe87f6649a5083ae9c4a8475ae5d87ac98d Mon Sep 17 00:00:00 2001 From: XGZhang <46363693+XGZhang11@users.noreply.github.com> Date: Sun, 8 Aug 2021 16:28:51 +0800 Subject: [PATCH 04/16] Update quantization_pass.py --- .../fluid/contrib/slim/quantization/quantization_pass.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index 857486b3fc46cc..b3b12a477e2a0a 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -1312,8 +1312,6 @@ def _insert_post_dequant_op(self, graph, op_node): assert self._is_float( scale_v), 'The scale of parameter %s is not a float.' % ( original_var_name) - if scale_v == 0.0: - scale_v = 1e-8 max_range *= param_range / scale_v else: max_range *= act_range @@ -1415,8 +1413,6 @@ def _clip(x, scale): x[:, i] = _clip(x[:, i], s) x[:, i] = np.round(x[:, i] / s * bnt) else: - if scale == 0.0: - scale = 1e-8 x = _clip(x, scale) x = np.round(x / scale * bnt) return x From 4b047da84ab21cbd0748aeeff1c8bc0a1dd4d376 Mon Sep 17 00:00:00 2001 From: XGZhang <46363693+XGZhang11@users.noreply.github.com> Date: Mon, 9 Aug 2021 08:28:11 +0000 Subject: [PATCH 05/16] update docs --- .../paddle/fluid/contrib/slim/quantization/imperative/qat.py | 4 ++-- .../fluid/contrib/slim/quantization/imperative/utils.py | 3 +-- python/paddle/nn/quant/quant_layers.py | 5 +++++ 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index 32a3ebfe047030..aa3269b0f2541d 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -307,10 +307,10 @@ def __init__( def apply(self, model): """ Quantize the weights and activations to calculate for specific - layers in the dygraph model. + layers. Args: - model(fluid.dygraph.Layer): The target model which would + model(paddle.nn.Layer): The target model which would calculate the input quantization scale. Returns: diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py index 5a98ac80549f18..009ce372b4f29c 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py @@ -47,7 +47,6 @@ } # Apply fake quant for the inputs of these layers -# TODO (jc): support paddle.nn.Conv2DTranspose fake_quant_input_layers = [ paddle.nn.Conv2D, paddle.nn.Linear, paddle.nn.Conv2DTranspose ] @@ -125,7 +124,7 @@ def find_parent_layer_and_sub_name(model, name): For example, if name is 'block_1/convbn_1/conv_1', the parent layer is 'block_1/convbn_1' and the sub_name is `conv_1`. Args: - model(fluid.dygraph.Layer): the model to be quantized. + model(paddle.nn.Layer): the model to be quantized. name(string): the name of a layer Returns: diff --git a/python/paddle/nn/quant/quant_layers.py b/python/paddle/nn/quant/quant_layers.py index cc98f73174b4b3..9d07febd955481 100644 --- a/python/paddle/nn/quant/quant_layers.py +++ b/python/paddle/nn/quant/quant_layers.py @@ -499,6 +499,11 @@ def __init__(self, act_pre_layer=None, weight_quant_layer=None, act_quant_layer=None): + r""" + Constructor. + + The arguments are the same as ImperativeQuantAware. + """ super(QuantizedConv2DTranspose, self).__init__() # For Conv2DTranspose self._groups = getattr(layer, '_groups') From e5ea4eb4c2430653b47ebaa53c7c77f5bb3fe976 Mon Sep 17 00:00:00 2001 From: XGZhang <46363693+XGZhang11@users.noreply.github.com> Date: Mon, 9 Aug 2021 09:12:13 +0000 Subject: [PATCH 06/16] add tests for quantized_conv2d_transpose --- .../fluid/contrib/slim/tests/test_imperative_qat.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py index 14fa291ee077c6..2dc118aa7e1a9f 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py @@ -28,10 +28,10 @@ from paddle.fluid.optimizer import AdamOptimizer from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware from paddle.fluid.dygraph.container import Sequential -from paddle.nn import Linear, Conv2D, Softmax +from paddle.nn import Linear, Conv2D, Softmax, Conv2DTranspose from paddle.fluid.log_helper import get_logger from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX -from paddle.nn.quant.quant_layers import QuantizedConv2D +from paddle.nn.quant.quant_layers import QuantizedConv2D, QuantizedConv2DTranspose from imperative_test_utils import fix_model_dict, ImperativeLenet @@ -74,6 +74,11 @@ def test_qat(self): quant_conv1 = QuantizedConv2D(conv1) data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') quant_conv1(fluid.dygraph.to_variable(data)) + + conv_transpose = Conv2DTranspose(4, 6, (3, 3)) + quant_conv_transpose = QuantizedConv2DTranspose(conv_transpose) + x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.) + quant_conv_transpose(x_var) seed = 1 np.random.seed(seed) From 3231853193666ca376e56745d4d98a0912b9f739 Mon Sep 17 00:00:00 2001 From: XGZhang <46363693+XGZhang11@users.noreply.github.com> Date: Mon, 9 Aug 2021 10:17:21 +0000 Subject: [PATCH 07/16] update codestyle --- .../paddle/fluid/contrib/slim/tests/test_imperative_qat.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py index 2dc118aa7e1a9f..bec486185ba171 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py @@ -74,10 +74,11 @@ def test_qat(self): quant_conv1 = QuantizedConv2D(conv1) data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') quant_conv1(fluid.dygraph.to_variable(data)) - + conv_transpose = Conv2DTranspose(4, 6, (3, 3)) quant_conv_transpose = QuantizedConv2DTranspose(conv_transpose) - x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.) + x_var = paddle.uniform( + (2, 4, 8, 8), dtype='float32', min=-1., max=1.) quant_conv_transpose(x_var) seed = 1 From da48df7ea923cbf86c03561dc57a312a578c9ba8 Mon Sep 17 00:00:00 2001 From: XGZhang <46363693+XGZhang11@users.noreply.github.com> Date: Mon, 9 Aug 2021 10:20:50 +0000 Subject: [PATCH 08/16] update docs --- .../paddle/fluid/contrib/slim/quantization/imperative/qat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index aa3269b0f2541d..8ebad9974ace4f 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -213,7 +213,7 @@ def quantize(self, model): the out_scale value of outputs would be calculated. Args: - model(fluid.dygraph.Layer): the model to be quantized. + model(paddle.nn.Layer): the model to be quantized. Returns: None """ @@ -368,7 +368,7 @@ def apply(self, model): output scales for specific layers in the dygraph model. Args: - model(fluid.dygraph.Layer): The target model which would be + model(paddle.nn.Layer): The target model which would be calculate the output quantization scale. Returns: From 43976beaba3c1381e03e955cb1a50f5f442efdb0 Mon Sep 17 00:00:00 2001 From: XGZhang <46363693+XGZhang11@users.noreply.github.com> Date: Sat, 14 Aug 2021 12:11:47 +0000 Subject: [PATCH 09/16] update tests and conv2dtranspose layer --- .../contrib/slim/tests/test_imperative_qat.py | 2 +- .../tests/test_imperative_qat_user_defined.py | 20 +++++++++++++++++++ python/paddle/nn/quant/quant_layers.py | 15 ++++++++++++++ 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py index bec486185ba171..677ccb52e242cf 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py @@ -78,7 +78,7 @@ def test_qat(self): conv_transpose = Conv2DTranspose(4, 6, (3, 3)) quant_conv_transpose = QuantizedConv2DTranspose(conv_transpose) x_var = paddle.uniform( - (2, 4, 8, 8), dtype='float32', min=-1., max=1.) + (2, 4, 8, 8), dtype='float32', min=-1.0, max=1.0) quant_conv_transpose(x_var) seed = 1 diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py index 621213beb31cd7..186789ff03bad0 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py @@ -28,6 +28,7 @@ from paddle.fluid.dygraph import Conv2D from paddle.fluid.dygraph import Pool2D from paddle.fluid.dygraph import Linear +from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose from paddle.fluid.log_helper import get_logger os.environ["CPU_NUM"] = "1" @@ -100,6 +101,19 @@ def dequantize(x, lower_bound, delta, interval): return x +class ModelForConv2dT(nn.Layer): + def __init__(self, num_classes=10): + super(ModelForConv2dT, self).__init__() + self.features = nn.Conv2DTranspose(4, 6, (3, 3)) + self.fc = Linear(input_dim=600, output_dim=num_classes) + + def forward(self, inputs): + x = self.features(inputs) + x = paddle.flatten(x, 1) + x = self.fc(x) + return x + + class ImperativeLenet(paddle.nn.Layer): def __init__(self, num_classes=10, classifier_activation='softmax'): super(ImperativeLenet, self).__init__() @@ -168,6 +182,12 @@ def test_quant_aware_training(self): imperative_qat.quantize(lenet) adam = Adam(learning_rate=0.001, parameters=lenet.parameters()) dynamic_loss_rec = [] + #for CI coverage + conv_transpose = ModelForConv2dT() + imperative_qat.quantize(conv_transpose) + x_var = paddle.uniform( + (2, 4, 8, 8), dtype='float32', min=-1.0, max=1.0) + conv_transpose(x_var) def train(model): adam = Adam(learning_rate=0.001, parameters=model.parameters()) diff --git a/python/paddle/nn/quant/quant_layers.py b/python/paddle/nn/quant/quant_layers.py index 9d07febd955481..040b04f5e7bf1e 100644 --- a/python/paddle/nn/quant/quant_layers.py +++ b/python/paddle/nn/quant/quant_layers.py @@ -486,6 +486,21 @@ class QuantizedConv2DTranspose(layers.Layer): """ The computational logic of QuantizedConv2DTranspose is the same with Conv2DTranspose. The only difference is that its inputs are all fake quantized. + + Examples: + .. code-block:: python + import paddle + import paddle.nn as nn + from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose + x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.) + conv = nn.Conv2DTranspose(4, 6, (3, 3)) + conv_quantized = QuantizedConv2DTranspose(conv) + y_quantized = conv_quantized(x_var) + y_var = conv(x_var) + y_quantized_np = y_quantized.numpy() + y_np = y_var.numpy() + print(y_np.shape, y_quantized_np.shape) + # (2, 6, 10, 10), (2, 6, 10, 10) """ def __init__(self, From 8ec36b6fe5ffbf6a3c6d253652cf2e1031804b59 Mon Sep 17 00:00:00 2001 From: XGZhang <46363693+XGZhang11@users.noreply.github.com> Date: Sat, 14 Aug 2021 18:12:44 +0000 Subject: [PATCH 10/16] update quant tests --- .../contrib/slim/tests/test_imperative_qat_user_defined.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py index 186789ff03bad0..270e8ee566ab57 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py @@ -106,7 +106,7 @@ def __init__(self, num_classes=10): super(ModelForConv2dT, self).__init__() self.features = nn.Conv2DTranspose(4, 6, (3, 3)) self.fc = Linear(input_dim=600, output_dim=num_classes) - + def forward(self, inputs): x = self.features(inputs) x = paddle.flatten(x, 1) @@ -183,10 +183,9 @@ def test_quant_aware_training(self): adam = Adam(learning_rate=0.001, parameters=lenet.parameters()) dynamic_loss_rec = [] #for CI coverage - conv_transpose = ModelForConv2dT() + conv_transpose = ModelForConv2dT() imperative_qat.quantize(conv_transpose) - x_var = paddle.uniform( - (2, 4, 8, 8), dtype='float32', min=-1.0, max=1.0) + x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.) conv_transpose(x_var) def train(model): From fc74ab00f6333ef6b8cc5c946efa26c2c449d731 Mon Sep 17 00:00:00 2001 From: XGZhang <46363693+XGZhang11@users.noreply.github.com> Date: Mon, 16 Aug 2021 04:35:08 +0000 Subject: [PATCH 11/16] update sampcd_processor for tests --- tools/sampcd_processor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py index 3ec12c11a7045a..d8cb70c9dd107b 100644 --- a/tools/sampcd_processor.py +++ b/tools/sampcd_processor.py @@ -440,6 +440,7 @@ def get_filenames(full_test=False): ''' global whl_error import paddle + import paddle.fluid.contrib.slim.quantization whl_error = [] if full_test: get_full_api_from_pr_spec() From ccd16757c122d20b4a28a6622bf2ef86bb1333a5 Mon Sep 17 00:00:00 2001 From: XGZhang <46363693+XGZhang11@users.noreply.github.com> Date: Mon, 16 Aug 2021 05:58:51 +0000 Subject: [PATCH 12/16] update code examples --- .../slim/quantization/imperative/qat.py | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index 8ebad9974ace4f..6208b43c9e9e48 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -216,6 +216,41 @@ def quantize(self, model): model(paddle.nn.Layer): the model to be quantized. Returns: None + + Examples: + .. code-block:: python + + import paddle + from paddle.fluid.contrib.slim.quantization \ + import ImperativeQuantAware + + class ImperativeModel(paddle.nn.Layer): + def __init__(self): + super(ImperativeModel, self).__init__() + # self.linear_0 would skip the quantization. + self.linear_0 = paddle.nn.Linear(784, 400) + self.linear_0.skip_quant = True + + # self.linear_1 would not skip the quantization. + self.linear_1 = paddle.nn.Linear(400, 10) + self.linear_1.skip_quant = False + + def forward(self, inputs): + x = self.linear_0(inputs) + x = self.linear_1(inputs) + return x + + model = ImperativeModel() + imperative_qat = ImperativeQuantAware( + weight_quantize_type='abs_max', + activation_quantize_type='moving_average_abs_max') + + # Add the fake quant logical. + # The original model will be rewrite. + # + # There is only one Layer(self.linear1) would be added the + # fake quant logical. + imperative_qat.quantize(model) """ assert isinstance(model, dygraph.Layer), \ "The model must be the instance of dygraph.Layer." From a5b7c710b9848dd36412dae7be067baf091f329e Mon Sep 17 00:00:00 2001 From: XGZhang <46363693+XGZhang11@users.noreply.github.com> Date: Mon, 16 Aug 2021 12:47:12 +0000 Subject: [PATCH 13/16] fix channel_wise quantization for ernie --- cmake/cupti.cmake | 1 + paddle/fluid/framework/device_worker.h | 1 + paddle/fluid/framework/downpour_worker.cc | 8 +- paddle/fluid/framework/downpour_worker_opt.cc | 4 +- paddle/fluid/framework/fleet/fleet_wrapper.cc | 5 +- paddle/fluid/framework/fleet/fleet_wrapper.h | 3 +- .../framework/ir/graph_pattern_detector.cc | 6 +- .../framework/ir/mkldnn/cpu_quantize_pass.cc | 19 +- .../ir/mkldnn/cpu_quantize_pass_tester.cc | 7 +- .../cpu_quantize_placement_pass_tester.cc | 8 +- paddle/fluid/framework/trainer_desc.proto | 1 + .../collective/c_comm_init_hccl_op.cc | 2 + .../mkldnn/elementwise_mkldnn_op.h | 19 +- .../mkldnn/elementwise_mul_mkldnn_op.cc | 10 +- paddle/fluid/operators/fake_dequantize_op.cc | 81 ++- paddle/fluid/operators/fake_dequantize_op.cu | 17 +- paddle/fluid/operators/fake_dequantize_op.h | 12 +- paddle/fluid/operators/interpolate_v2_op.cu | 9 +- .../operators/mkldnn/activation_mkldnn_op.cc | 11 +- .../operators/mkldnn/caching_tests.cmake | 7 +- .../fluid/operators/mkldnn/scale_mkldnn_op.cc | 8 +- .../operators/mkldnn/softmax_mkldnn_op.cc | 105 ++-- .../operators/mkldnn/test_mkldnn_caching.cc | 84 ++-- paddle/fluid/operators/size_op_npu.cc | 51 ++ paddle/fluid/platform/mkldnn_reuse.h | 476 ++++++++++++------ paddle/fluid/pybind/imperative.cc | 6 +- .../slim/quantization/imperative/qat.py | 101 +--- .../slim/quantization/imperative/utils.py | 19 +- .../slim/quantization/quantization_pass.py | 7 +- .../contrib/slim/tests/test_imperative_qat.py | 10 +- .../tests/test_imperative_qat_user_defined.py | 19 - .../pslib/optimizer_factory.py | 2 + .../tests/unittests/npu/test_size_op_npu.py | 141 ++++++ python/paddle/fluid/trainer_desc.py | 4 + python/paddle/fluid/trainer_factory.py | 4 + python/paddle/nn/quant/quant_layers.py | 107 ---- tools/sampcd_processor.py | 1 - 37 files changed, 806 insertions(+), 570 deletions(-) create mode 100644 paddle/fluid/operators/size_op_npu.cc create mode 100755 python/paddle/fluid/tests/unittests/npu/test_size_op_npu.py diff --git a/cmake/cupti.cmake b/cmake/cupti.cmake index 17626688531e61..2d7b1917b68731 100644 --- a/cmake/cupti.cmake +++ b/cmake/cupti.cmake @@ -9,6 +9,7 @@ find_path(CUPTI_INCLUDE_DIR cupti.h $ENV{CUPTI_ROOT} $ENV{CUPTI_ROOT}/include ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/include ${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/include + ${CUDA_TOOLKIT_ROOT_DIR}/targets/aarch64-linux/include NO_DEFAULT_PATH ) diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 45efa43ccb74bc..6dd6fed0151585 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -212,6 +212,7 @@ class DeviceWorker { FetchConfig fetch_config_; bool use_cvm_; bool no_cvm_; + bool scale_sparse_gradient_with_batch_size_; TrainerDesc trainer_desc_; // dump params or grads for debug diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc index ad3f27f03fa143..11f70acb73aa7f 100644 --- a/paddle/fluid/framework/downpour_worker.cc +++ b/paddle/fluid/framework/downpour_worker.cc @@ -89,6 +89,8 @@ void DownpourWorker::Initialize(const TrainerDesc& desc) { use_cvm_ = desc.use_cvm(); // for sparse value accessor, embedding only no_cvm_ = desc.no_cvm(); + scale_sparse_gradient_with_batch_size_ = + desc.scale_sparse_gradient_with_batch_size(); scale_datanorm_ = desc.scale_datanorm(); dump_slot_ = desc.dump_slot(); adjust_ins_weight_config_ = desc.adjust_ins_weight_config(); @@ -591,7 +593,8 @@ void DownpourWorker::TrainFilesWithProfiler() { *thread_scope_, tid, features_[tid], feature_labels_[tid], sparse_key_names_[tid], sparse_grad_names_[tid], table.emb_dim(), &feature_grads_[tid], &push_sparse_status_, cur_batch, use_cvm_, - dump_slot_, &sparse_push_keys_[tid], no_cvm_); + dump_slot_, &sparse_push_keys_[tid], no_cvm_, + scale_sparse_gradient_with_batch_size_); timeline.Pause(); push_sparse_time += timeline.ElapsedSec(); total_time += timeline.ElapsedSec(); @@ -866,7 +869,8 @@ void DownpourWorker::TrainFiles() { *thread_scope_, tid, features_[tid], feature_labels_[tid], sparse_key_names_[tid], sparse_grad_names_[tid], table.emb_dim(), &feature_grads_[tid], &push_sparse_status_, cur_batch, use_cvm_, - dump_slot_, &sparse_push_keys_[tid], no_cvm_); + dump_slot_, &sparse_push_keys_[tid], no_cvm_, + scale_sparse_gradient_with_batch_size_); } } diff --git a/paddle/fluid/framework/downpour_worker_opt.cc b/paddle/fluid/framework/downpour_worker_opt.cc index afe6ddfa3d9a63..ed0a9d9107e79d 100644 --- a/paddle/fluid/framework/downpour_worker_opt.cc +++ b/paddle/fluid/framework/downpour_worker_opt.cc @@ -450,11 +450,13 @@ void DownpourWorkerOpt::TrainFiles() { break; } } + bool scale_sparse_gradient_with_batch_size_ = true; fleet_ptr_->PushSparseVarsWithLabelAsync( *thread_scope_, tid, features_[tid], feature_labels_[tid], sparse_key_names_[tid], sparse_grad_names_[tid], table.emb_dim(), &feature_grads_[tid], &push_sparse_status_, cur_batch, use_cvm_, - dump_slot_, &sparse_push_keys_[tid], no_cvm_); + dump_slot_, &sparse_push_keys_[tid], no_cvm_, + scale_sparse_gradient_with_batch_size_); } } diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index bb318e59e46e41..dc5e24ef5de42f 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -870,7 +870,8 @@ void FleetWrapper::PushSparseVarsWithLabelAsync( std::vector>* push_values, std::vector<::std::future>* push_sparse_status, const int batch_size, const bool use_cvm, const bool dump_slot, - std::vector* sparse_push_keys, const bool no_cvm) { + std::vector* sparse_push_keys, const bool no_cvm, + const bool scale_sparse_gradient_with_batch_size) { #ifdef PADDLE_WITH_PSLIB int offset = 2; int slot_offset = 0; @@ -939,7 +940,7 @@ void FleetWrapper::PushSparseVarsWithLabelAsync( } float* g = g_tensor->data(); - if (scale_sparse_gradient_with_batch_size_ && grad_dim > 0) { + if (scale_sparse_gradient_with_batch_size && grad_dim > 0) { int dim = emb_dim; Eigen::Map< Eigen::Matrix> diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index 09f7801b19f988..c1db06a298c861 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -209,7 +209,8 @@ class FleetWrapper { std::vector>* push_values, std::vector<::std::future>* push_sparse_status, const int batch_size, const bool use_cvm, const bool dump_slot, - std::vector* sparse_push_keys, const bool no_cvm); + std::vector* sparse_push_keys, const bool no_cvm, + const bool scale_sparse_gradient_with_batch_size); // Push sparse variables to server in async mode void PushSparseFromTensorWithLabelAsync( diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 70e48755dcd1e3..b4c94010e480a7 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2249,9 +2249,9 @@ PDNode *patterns::MultipleQuantize::operator()() { PDNode *patterns::QuantizePlacement::operator()( const std::unordered_set &quantize_enabled_op_types) { std::unordered_set supported_op_types = - std::unordered_set( - {"concat", "conv2d", "elementwise_add", "fc", "matmul", "pool2d", - "prior_box", "relu", "reshape2", "transpose2", "fusion_gru"}); + std::unordered_set({"concat", "conv2d", "elementwise_add", + "fc", "matmul", "pool2d", "prior_box", + "reshape2", "transpose2", "fusion_gru"}); if (!quantize_enabled_op_types.empty()) { supported_op_types = quantize_enabled_op_types; } diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 3c06c9ee41d2a2..f50cd0a01d204d 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -770,7 +770,8 @@ void CPUQuantizePass::QuantizeElementwiseAdd(Graph* graph) const { GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, elementwise_add_pattern); - if (!AreScalesPresentForNodes({elementwise_add_x, elementwise_add_y})) { + if (!AreScalesPresentForNodes( + {elementwise_add_x, elementwise_add_y, elementwise_add_out})) { LogCannotQuantizeOp(elementwise_add_op); return; } @@ -793,16 +794,12 @@ void CPUQuantizePass::QuantizeElementwiseAdd(Graph* graph) const { QuantizeInput(g, elementwise_add_op, elementwise_add_y, "Y", input_y_scale, is_y_unsigned, "Scale_y"); - // if quantization scale is missing for output tensor, return fp32 data - if (AreScalesPresentForNodes({elementwise_add_out})) { - bool is_output_unsigned{false}; - auto output_scale = - GetScaleValueForNode(elementwise_add_out, &is_output_unsigned); - DequantizeOutput(g, elementwise_add_op, elementwise_add_out, "Out", - output_scale, is_output_unsigned, "Scale_out"); - } else { - elementwise_add_op->Op()->SetAttr("force_fp32_output", true); - } + bool is_output_unsigned{false}; + auto output_scale = + GetScaleValueForNode(elementwise_add_out, &is_output_unsigned); + + DequantizeOutput(g, elementwise_add_op, elementwise_add_out, "Out", + output_scale, is_output_unsigned, "Scale_out"); ++quantize_elementwise_add_count; }; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index adb431fdb097f5..6fcea6a66cc5d1 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -854,13 +854,12 @@ TEST(CpuQuantizePass, elementwise_add) { TEST(CpuQuantizePass, elementwise_add_output_scale_missing) { int elementwise_add_count = 1; - int quant_count = 2; + int quant_count = 0; int dequant_count = 2; - // 2 Quant + 2 IN - int added_nodes_count = 4; + int added_nodes_count = 0; MainTestElementwiseAdd(BuildProgramDescElementwiseAdd(), elementwise_add_count, quant_count, dequant_count, - added_nodes_count, 2.0f * 127, true); + added_nodes_count, 1.f, true); } TEST(CpuQuantizePass, elementwise_add_unsigned_and_signed_input) { diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc index 761defc25ff5c8..daf913bf7d80d1 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc @@ -131,13 +131,13 @@ TEST(QuantizerPlacementPass, enabled_conv_excluded_one) { } TEST(QuantizerPlacementPass, empty_list) { - // all operators quantized - MainTest({}, {}, 6); + // all operators except relu should be quantized + MainTest({}, {}, 5); } TEST(QuantizerPlacementPass, default_attr_value) { - // all operators quantized - DefaultAttrTest(6); + // all operators except relu should be quantized + DefaultAttrTest(5); } } // namespace ir diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto index 504885ff5ccbce..6f487d6984cc43 100644 --- a/paddle/fluid/framework/trainer_desc.proto +++ b/paddle/fluid/framework/trainer_desc.proto @@ -61,6 +61,7 @@ message TrainerDesc { optional bool use_ps_gpu = 32 [ default = false ]; optional string user_define_dump_filename = 33; + optional bool scale_sparse_gradient_with_batch_size = 34 [ default = true ]; // device worker parameters optional HogwildWorkerParameter hogwild_param = 101; diff --git a/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc index 3df0595525941a..7dec645b5b3ad8 100644 --- a/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc @@ -87,6 +87,8 @@ class CCommInitOpAscend : public framework::OperatorBase { } PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( buff, size, HCCL_DATA_TYPE_FP32, 0, comm->comm(), stream)); + // Synchronize stream to find hccl error in time. + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream)); VLOG(3) << "Build connection successful."; #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index ddad70a6a5f31c..ffcdc079985fa6 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -47,13 +47,24 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { float scale_o = ctx.Attr("Scale_out"); int axis = ctx.Attr("axis"); - platform::BinaryMKLDNNHandler handler( - BINARY_OP, axis, dev_ctx, mkldnn_engine, ctx.GetPlace(), x, y, z, - scale_x, scale_y, scale_o, ctx.OutputName("Out")); + platform::BinaryMKLDNNHandler handler(BINARY_OP, axis, mkldnn_engine, + ctx.GetPlace(), x, y, z, scale_x, + scale_y, scale_o); const auto src_x_memory = handler.AcquireSrcMemory(x); const auto src_y_memory = handler.AcquireSecondSrcMemory(y); - const auto dst_memory = handler.AcquireDstMemory(z); + // (jczaja) For Inplace src and dst should be the same memory object. + // So x should share buffer with z. But UT mechanics is testing inplace + // execution for this op not checking that x can be bradcasted to match in + // shape y tensor. + // This is wrong as when x is to be broadcasted then z(out) will match the + // shape of y which is bigger than x. Hence if x is smaller in shape than z + // and they share a buffer (of + // shape x) then this buffer is not big enough to hold result of elementwise + // operation. + auto dst_memory = (x->numel() == z->numel() && x->IsSharedBufferWith(*z)) + ? src_x_memory + : handler.AcquireDstMemory(z); const auto binary_prim = handler.AcquireForwardPrimitive(); diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc index 1c246e8d189370..af4aab8047888a 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc @@ -48,9 +48,8 @@ class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel { if (dx) { // dx = dout*y platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, dev_ctx, mkldnn_engine, - ctx.GetPlace(), dout, y, dx, 1.0f, 1.0f, 1.0f, - ctx.InputName(framework::GradVarName("Out"))); + dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), + dout, y, dx, 1.0f, 1.0f, 1.0f); const auto src_dout_memory = handler.AcquireSrcMemory(dout); const auto src_y_memory = handler.AcquireSecondSrcMemory(y); @@ -75,9 +74,8 @@ class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel { // Handler is having nullptr passed instead of output tensor as // we want Dst buffer to be allocated by oneDNN not to use Tensor platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, dev_ctx, mkldnn_engine, - ctx.GetPlace(), dout, x, nullptr, 1.0f, 1.0f, 1.0f, - ctx.InputName(framework::GradVarName("Out"))); + dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), + dout, x, nullptr, 1.0f, 1.0f, 1.0f); const auto src_dout_memory = handler.AcquireSrcMemory(dout); const auto src_x_memory = handler.AcquireSecondSrcMemory(x); diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc index b70fe78e1a5282..c71710b79d1fbe 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cc +++ b/paddle/fluid/operators/fake_dequantize_op.cc @@ -39,7 +39,7 @@ struct ChannelDequantizeFunctor { void operator()(const platform::CPUDeviceContext& dev_ctx, const framework::Tensor* in, const framework::Tensor** scales, const int scale_num, T max_range, const int quant_axis, - framework::Tensor* out) { + const int x_num_col_dims, framework::Tensor* out) { if (scale_num == 1) { // Dequant op is before quantized op // Dequantize the weight of quantized op @@ -81,23 +81,51 @@ struct ChannelDequantizeFunctor { } else if (scale_num == 2) { // Dequant op is after quantized op // Dequantize the output tensor of quantized op - int batch_size = in->dims()[0]; - int channel = in->dims()[1]; - const T* scale_one = scales[0]->data(); - const T* scale_two = scales[1]->data(); - for (int i = 0; i < batch_size; i++) { - framework::Tensor one_batch_in = in->Slice(i, i + 1).Resize( - framework::slice_ddim(in->dims(), 1, in->dims().size())); - framework::Tensor one_batch_out = out->Slice(i, i + 1).Resize( - framework::slice_ddim(out->dims(), 1, out->dims().size())); - for (int j = 0; j < channel; j++) { - T s = scale_one[j]; - framework::Tensor one_channel_in = one_batch_in.Slice(j, j + 1); - framework::Tensor one_channel_out = one_batch_out.Slice(j, j + 1); - auto in_e = framework::EigenVector::Flatten(one_channel_in); - auto out_e = framework::EigenVector::Flatten(one_channel_out); - auto& dev = *dev_ctx.eigen_device(); - out_e.device(dev) = in_e * s * scale_two[0] / max_range; + if (x_num_col_dims > 1) { + auto in_dims = in->dims(); + const int64_t channel = in_dims[x_num_col_dims]; + const T* scale_one = scales[0]->data(); + const T* scale_two = scales[1]->data(); + int64_t out_iter = 1; + for (int i = 0; i < x_num_col_dims; i++) { + out_iter *= in_dims[i]; + } + int64_t step_i = in->numel() / out_iter; + int64_t step_j = in->numel() / (out_iter * channel); + auto* in_data = in->data(); + auto* out_data = out->mutable_data(dev_ctx.GetPlace()); + for (int64_t i = 0; i < out_iter; i++) { + for (int64_t j = 0; j < channel; j++) { + auto* cur_in = in_data + i * step_i + j * step_j; + auto* cur_out = out_data + i * step_i + j * step_j; + T s = scale_one[j]; + for (int64_t k = 0; k < step_j; k++) { + *cur_out = (*cur_in) * s * scale_two[0] / max_range; + ++cur_in; + ++cur_out; + } + } + } + } + else { + int batch_size = in->dims()[0]; + int channel = in->dims()[1]; + const T* scale_one = scales[0]->data(); + const T* scale_two = scales[1]->data(); + for (int i = 0; i < batch_size; i++) { + framework::Tensor one_batch_in = in->Slice(i, i + 1).Resize( + framework::slice_ddim(in->dims(), 1, in->dims().size())); + framework::Tensor one_batch_out = out->Slice(i, i + 1).Resize( + framework::slice_ddim(out->dims(), 1, out->dims().size())); + for (int j = 0; j < channel; j++) { + T s = scale_one[j]; + framework::Tensor one_channel_in = one_batch_in.Slice(j, j + 1); + framework::Tensor one_channel_out = one_batch_out.Slice(j, j + 1); + auto in_e = framework::EigenVector::Flatten(one_channel_in); + auto out_e = framework::EigenVector::Flatten(one_channel_out); + auto& dev = *dev_ctx.eigen_device(); + out_e.device(dev) = in_e * s * scale_two[0] / max_range; + } } } } @@ -199,7 +227,16 @@ class FakeChannelWiseDequantizeMaxAbsOpMaker "the received is %d", quant_axis)); }); - + AddAttr("x_num_col_dims", + "The x_num_col_dims of mul. Only used for mul or matmul.") + .SetDefault(1) + .AddCustomChecker([](const int& x_num_col_dims) { + PADDLE_ENFORCE_EQ(x_num_col_dims == 0, false, + platform::errors::InvalidArgument( + "'x_num_col_dims' should be larger than 0, but " + "the received is %d", + x_num_col_dims)); + }); AddComment(R"DOC( FakeChannelWiseDequantizeMaxAbsOp operator. @@ -242,7 +279,7 @@ REGISTER_OP_CPU_KERNEL(fake_channel_wise_dequantize_max_abs, REGISTER_OP_VERSION(fake_channel_wise_dequantize_max_abs) .AddCheckpoint( - R"ROC(add new attributes [quant_axis] for applying per-channel " - "dequantization to conv2d_tranpose and mul ops.)ROC", + R"ROC(add new attributes [x_num_col_dims] for applying per-channel " + "dequantization to mul ops.)ROC", paddle::framework::compatible::OpVersionDesc().NewAttr( - "quant_axis", "The axis for dequantization.", 0)); + "x_num_col_dims", "The x_num_col_dims for dequantization.", 1)); diff --git a/paddle/fluid/operators/fake_dequantize_op.cu b/paddle/fluid/operators/fake_dequantize_op.cu index a89c430c7ab24e..b1d2e220426f73 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cu +++ b/paddle/fluid/operators/fake_dequantize_op.cu @@ -77,9 +77,9 @@ __global__ void DequantizeOneScaleQuantAxis1(const T* in, const T* scale, template __global__ void DequantizeTwoScale(const T* in, const T* scale_one, const T* scale_two, T max_range, int num, - int batch_size, int channel, T* out) { + int iter_size, int channel, T* out) { int tid = threadIdx.x; - int channel_size = num / (batch_size * channel); + int channel_size = num / (iter_size * channel); int scale_index = blockIdx.x % channel; const T* in_c = in + blockIdx.x * channel_size; T* out_c = out + blockIdx.x * channel_size; @@ -93,7 +93,7 @@ struct ChannelDequantizeFunctor { void operator()(const platform::CUDADeviceContext& dev_ctx, const framework::Tensor* in, const framework::Tensor** scales, const int scale_num, T max_range, const int quant_axis, - framework::Tensor* out) { + const int x_num_col_dims, framework::Tensor* out) { auto in_dims = in->dims(); const T* in_data = in->data(); T* out_data = out->mutable_data(dev_ctx.GetPlace()); @@ -116,14 +116,17 @@ struct ChannelDequantizeFunctor { } else if (scale_num == 2) { // Not need to consider quant_axis int num = in->numel(); - int batch_size = in->dims()[0]; - int channel = in->dims()[1]; + int iter_size = 1; + for (int i = 0; i < x_num_col_dims; i++) { + iter_size *= in->dims()[i]; + } + int channel = in->dims()[x_num_col_dims]; const T* scale_one = scales[0]->data(); const T* scale_two = scales[1]->data(); int block = 1024; - int grid = batch_size * channel; + int grid = iter_size * channel; DequantizeTwoScale<<>>( - in_data, scale_one, scale_two, max_range, num, batch_size, channel, + in_data, scale_one, scale_two, max_range, num, iter_size, channel, out_data); } } diff --git a/paddle/fluid/operators/fake_dequantize_op.h b/paddle/fluid/operators/fake_dequantize_op.h index 6ddb12771fd517..4485edcafba0db 100644 --- a/paddle/fluid/operators/fake_dequantize_op.h +++ b/paddle/fluid/operators/fake_dequantize_op.h @@ -33,7 +33,8 @@ template struct ChannelDequantizeFunctor { void operator()(const DeviceContext& dev_ctx, const framework::Tensor* in, const framework::Tensor** scales, const int scale_num, - T max_range, const int quant_axis, framework::Tensor* out); + T max_range, const int quant_axis, const int x_num_col_dims, + framework::Tensor* out); }; template @@ -64,6 +65,7 @@ class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel { auto quant_bits = ctx.Attr>("quant_bits"); auto quant_axis = ctx.Attr("quant_axis"); + auto x_num_col_dims = ctx.Attr("x_num_col_dims"); int max_range = 1; auto& dev_ctx = ctx.template device_context(); @@ -80,11 +82,11 @@ class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel { max_range *= (std::pow(2, quant_bits[0] - 1) - 1); } else if (scale_num == 2) { PADDLE_ENFORCE_EQ( - scales[0]->numel(), in->dims()[1], + scales[0]->numel(), in->dims()[x_num_col_dims], platform::errors::PreconditionNotMet( "The number of first scale values must be the same with " - "second dimension value of Input(X) when the `Scales` has two " - "elements, but %ld != %ld here.", + "corresponding dimension value of Input(X) when the `Scales` " + "has two elements, but %ld != %ld here.", scales[0]->numel(), in->dims()[1])); PADDLE_ENFORCE_EQ(scales[1]->numel(), 1, platform::errors::PreconditionNotMet( @@ -96,7 +98,7 @@ class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel { } ChannelDequantizeFunctor()( dev_ctx, in, scales.data(), scale_num, static_cast(max_range), - quant_axis, out); + quant_axis, x_num_col_dims, out); } }; diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index 6745592c5c1a8b..d335e1a2f9d58b 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -1186,7 +1186,14 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); } else if ("bilinear" == interp_method) { - KeBilinearInterpFw<<<<>>( input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_layout); diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index 3b92d2e2d88913..d992890adeec3e 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -79,15 +79,15 @@ void eltwise_forward(const framework::ExecutionContext &ctx, paddle::platform::errors::PreconditionNotMet( "Operator DNNL eletwise_forward must use CPUPlace")); auto &dev_ctx = ctx.template device_context(); + const auto &mkldnn_engine = dev_ctx.GetEngine(); const auto *x = ctx.Input("X"); auto *y = ctx.Output("Out"); bool is_inplaced = x->IsSharedBufferWith(*y); - platform::ActivationMKLDNNHandler handler(algorithm, ctx, dev_ctx, - ctx.GetPlace(), x, - ctx.InputName("X"), is_inplaced); + platform::ActivationMKLDNNHandler handler(algorithm, ctx, mkldnn_engine, + ctx.GetPlace(), x); auto src_memory_p = handler.AcquireSrcMemory(x); auto dst_memory_p = is_inplaced ? src_memory_p : handler.AcquireDstMemory(y); @@ -106,13 +106,14 @@ template void eltwise_grad(const framework::ExecutionContext &ctx, mkldnn::algorithm algorithm) { auto &dev_ctx = ctx.template device_context(); + const auto &mkldnn_engine = dev_ctx.GetEngine(); const auto *x = ctx.Input("X"); const auto *diff_y = ctx.Input(framework::GradVarName("Out")); auto *diff_x = ctx.Output(framework::GradVarName("X")); - platform::ActivationMKLDNNHandler handler( - algorithm, ctx, dev_ctx, ctx.GetPlace(), x, diff_y, ctx.InputName("X")); + platform::ActivationMKLDNNHandler handler(algorithm, ctx, mkldnn_engine, + ctx.GetPlace(), x, diff_y); auto src_memory_p = handler.AcquireBackwardSrcMemory(x); auto diff_dst_memory_p = handler.AcquireDiffDstMemory(diff_y); diff --git a/paddle/fluid/operators/mkldnn/caching_tests.cmake b/paddle/fluid/operators/mkldnn/caching_tests.cmake index 4130c295b203eb..f48a5d822f8dc8 100644 --- a/paddle/fluid/operators/mkldnn/caching_tests.cmake +++ b/paddle/fluid/operators/mkldnn/caching_tests.cmake @@ -1 +1,6 @@ -cc_test(test_mkldnn_caching SRCS mkldnn/test_mkldnn_caching.cc DEPS op_registry elementwise_mul_op elementwise_add_op activation_op softmax_op softmax scope device_context enforce) +set(TEST_MKLDNN_CACHING_DEPS op_registry elementwise_mul_op elementwise_add_op activation_op softmax_op conv_op im2col vol2col softmax scope device_context enforce) +if (WITH_GPU OR WITH_ROCM) + set(TEST_MKLDNN_CACHING_DEPS ${TEST_MKLDNN_CACHING_DEPS} depthwise_conv) +endif() +cc_test(test_mkldnn_caching SRCS mkldnn/test_mkldnn_caching.cc DEPS ${TEST_MKLDNN_CACHING_DEPS}) + diff --git a/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc index ae17048b5d568b..84ac14d04b85b3 100644 --- a/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc @@ -29,6 +29,7 @@ class ScaleMKLDNNKernel : public framework::OpKernel { void RunKernel(const framework::ExecutionContext& ctx) const { const auto& dev_ctx = ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); auto* x = ctx.Input("X"); auto* out = ctx.Output("Out"); @@ -36,11 +37,12 @@ class ScaleMKLDNNKernel : public framework::OpKernel { bool is_inplaced = x->IsSharedBufferWith(*out); platform::ActivationMKLDNNHandler handler( - mkldnn::algorithm::eltwise_linear, ctx, dev_ctx, ctx.GetPlace(), x, - ctx.InputName("X"), is_inplaced); + mkldnn::algorithm::eltwise_linear, ctx, mkldnn_engine, ctx.GetPlace(), + x); auto src_memory_p = handler.AcquireSrcMemory(x); - auto dst_memory_p = handler.AcquireDstMemory(out); + auto dst_memory_p = + is_inplaced ? src_memory_p : handler.AcquireDstMemory(out); auto activation_p = handler.AcquireForwardPrimitive(); auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index e065800e4d1c71..b0f27719bf9adc 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -32,69 +32,56 @@ using platform::to_void_cast; template class SoftmaxMKLDNNHandler - : public platform::MKLDNNHandlerT { + : public platform::MKLDNNHandlerNoCachingT { public: - SoftmaxMKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, - const mkldnn::engine mkldnn_engine, + SoftmaxMKLDNNHandler(const mkldnn::engine mkldnn_engine, platform::Place cpu_place, const Tensor* input, - Tensor* output, const int axis, - const std::string uniq_name, bool is_inplaced) - : platform::MKLDNNHandlerT( - dev_ctx, mkldnn_engine, cpu_place, - // Softmax may be inplace then uniq_name is no longer unique - is_inplaced ? platform::CreateKey( - dev_ctx, framework::vectorize(input->dims()), - axis, uniq_name) - : platform::CreateKey( - dev_ctx, framework::vectorize(input->dims()), - uniq_name)) { - if (!this->isCached()) { - PADDLE_ENFORCE_EQ( - input->dims(), output->dims(), - platform::errors::InvalidArgument( - "The shape of input and output tensor must be identical.")); - - auto softmax_tz = framework::vectorize(input->dims()); - auto md = memory::desc(softmax_tz, platform::MKLDNNGetDataType(), - input->format()); - - this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, md, - axis); - } + Tensor* output, const int axis) + : platform::MKLDNNHandlerNoCachingT( + mkldnn_engine, cpu_place) { + PADDLE_ENFORCE_EQ( + input->dims(), output->dims(), + platform::errors::InvalidArgument( + "The shape of input and output tensor must be identical.")); + + auto softmax_tz = framework::vectorize(input->dims()); + auto md = memory::desc(softmax_tz, platform::MKLDNNGetDataType(), + input->format()); + + this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, md, + axis); } SoftmaxMKLDNNHandler(const framework::ExecutionContext& ctx, - const MKLDNNDeviceContext& dev_ctx, + const mkldnn::engine mkldnn_engine, platform::Place cpu_place, const Tensor* out, const Tensor* out_grad, Tensor* in_x_grad, const std::string& unique_name) - : platform::MKLDNNHandlerT( - dev_ctx, dev_ctx.GetEngine(), cpu_place, - platform::CreateKey(dev_ctx, framework::vectorize(out->dims()), - unique_name)) { - if (!this->isBwdCached()) { - PADDLE_ENFORCE_EQ( - out_grad->dims(), in_x_grad->dims(), - platform::errors::InvalidArgument("The shape of softmax_grad's input " - "and output must be identical.")); - - auto dims = out_grad->dims(); // input and output share the same shape - const int axis = CanonicalAxis(ctx.Attr("axis"), dims.size()); - auto softmax_tz = framework::vectorize(dims); - - auto data_softmax_md = MKLDNNMemDesc( - softmax_tz, platform::MKLDNNGetDataType(), out->format()); - auto diff_softmax_md = MKLDNNMemDesc( - softmax_tz, platform::MKLDNNGetDataType(), out_grad->format()); - - this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, - data_softmax_md, axis); - this->AcquireBackwardPrimitiveDescriptor(diff_softmax_md, data_softmax_md, - axis); - } + : platform::MKLDNNHandlerNoCachingT( + mkldnn_engine, cpu_place) { + PADDLE_ENFORCE_EQ(out_grad->dims(), in_x_grad->dims(), + platform::errors::InvalidArgument( + "The shape of softmax_grad's input " + "and output must be identical, but shapes differ, " + "out_grad: %s in_grad: %s", + out_grad->dims(), in_x_grad->dims())); + + auto dims = out_grad->dims(); // input and output share the same shape + const int axis = CanonicalAxis(ctx.Attr("axis"), dims.size()); + auto softmax_tz = framework::vectorize(dims); + + auto data_softmax_md = MKLDNNMemDesc( + softmax_tz, platform::MKLDNNGetDataType(), out->format()); + auto diff_softmax_md = MKLDNNMemDesc( + softmax_tz, platform::MKLDNNGetDataType(), out_grad->format()); + + this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, + data_softmax_md, axis); + this->AcquireBackwardPrimitiveDescriptor(diff_softmax_md, data_softmax_md, + axis); } }; @@ -111,9 +98,8 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { const int axis = CanonicalAxis(ctx.Attr("axis"), input->dims().size()); - SoftmaxMKLDNNHandler handler(dev_ctx, mkldnn_engine, ctx.GetPlace(), - input, output, axis, ctx.OutputName("Out"), - is_inplaced); + SoftmaxMKLDNNHandler handler(mkldnn_engine, ctx.GetPlace(), input, + output, axis); auto softmax_src_memory_p = handler.AcquireSrcMemory(input); // For Inplace src and and dst are the same memory object @@ -149,11 +135,12 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { paddle::platform::errors::PreconditionNotMet( "Operator DNNL SoftmaxGrad must use CPUPlace")); auto& dev_ctx = ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); const Tensor* output = ctx.Input("Out"); auto* out_grad = ctx.template Input(framework::GradVarName("Out")); auto* in_x_grad = ctx.template Output(framework::GradVarName("X")); - SoftmaxMKLDNNHandler handler(ctx, dev_ctx, ctx.GetPlace(), output, + SoftmaxMKLDNNHandler handler(ctx, mkldnn_engine, ctx.GetPlace(), output, out_grad, in_x_grad, ctx.InputName("Out")); auto dst_memory_p = handler.AcquireDstMemory(output); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index cad4f47ec14022..7251653793f899 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -33,6 +33,8 @@ USE_OP(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); +USE_OP(conv2d); +USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32); namespace paddle { namespace operators { @@ -64,16 +66,19 @@ class CacheTester { template void RunOperator(const platform::Place &place, const std::string &op_type, - const framework::DDim &dims, const std::string &output_name, - bool inplace = false) { + const framework::DDim &dims, const std::string &first_input) { framework::Scope scope; std::map num_inputs = {{"softmax", 1}, {"relu", 1}, + {"conv2d", 2}, {"elementwise_add", 2}, {"elementwise_mul", 2}}; - std::string first_input = inplace == true ? output_name : "x"; + std::string first_input_var_name = (op_type == "conv2d") ? "Input" : "X"; + std::string second_input_var_name = (op_type == "conv2d") ? "Filter" : "Y"; + std::string output_var_name = (op_type == "conv2d") ? "Output" : "Out"; + std::string output_name = "output"; std::vector input_names = { {first_input, scope.Var(first_input)->GetMutable()}, @@ -113,71 +118,40 @@ void RunOperator(const platform::Place &place, const std::string &op_type, auto &pool = platform::DeviceContextPool::Instance(); - auto op = num_inputs[op_type] > 1 - ? framework::OpRegistry::CreateOp( - op_type, {{"X", {first_input}}, {"Y", {"x1"}}}, - {{"Out", {output_name}}}, {{"use_mkldnn", {true}}}) - : framework::OpRegistry::CreateOp( - op_type, {{"X", {first_input}}}, {{"Out", {output_name}}}, - {{"use_mkldnn", {true}}}); + auto op = + num_inputs[op_type] > 1 + ? framework::OpRegistry::CreateOp( + op_type, {{first_input_var_name, {first_input}}, + {second_input_var_name, {"x1"}}}, + {{output_var_name, {output_name}}}, {{"use_mkldnn", {true}}}) + : framework::OpRegistry::CreateOp( + op_type, {{first_input_var_name, {first_input}}}, + {{output_var_name, {output_name}}}, {{"use_mkldnn", {true}}}); op->Run(scope, place); pool.Get(place)->Wait(); } -TEST(test_softmax_reuse_cache, cpu_place) { - framework::DDim dims({32, 64}); +TEST(test_conv2d_reuse_cache, cpu_place) { + framework::DDim dims({1, 16, 32, 64}); platform::CPUPlace p; CacheTester ct; - RunOperator(p, "softmax", dims, "softmax_out"); - RunOperator(p, "softmax", dims, "softmax_out"); - PADDLE_ENFORCE_EQ(ct.Analyze(4), true, + RunOperator(p, "conv2d", dims, "input_signal"); + RunOperator(p, "conv2d", dims, "input_signal"); + PADDLE_ENFORCE_EQ(ct.Analyze(9), true, platform::errors::InvalidArgument( - "Wrong number of cached oneDNN objects")); + "Invalid number of cached oneDNN objects")); } -TEST(test_softmax_noreuse_cache, cpu_place) { - framework::DDim dims({32, 64}); +TEST(test_conv2d_noreuse_cache, cpu_place) { + framework::DDim dims({1, 16, 32, 64}); platform::CPUPlace p; CacheTester ct; - RunOperator(p, "softmax", dims, "softmax_out"); - RunOperator(p, "softmax", dims, "softmax_out2"); - PADDLE_ENFORCE_EQ(ct.Analyze(8), true, + RunOperator(p, "conv2d", dims, "input_signal"); + RunOperator(p, "conv2d", dims, "input_signal2"); + PADDLE_ENFORCE_EQ(ct.Analyze(18), true, platform::errors::InvalidArgument( - "Wrong number of cached oneDNN objects")); -} - -TEST(test_softmax_inplace_cache, cpu_place) { - framework::DDim dims({32, 64}); - platform::CPUPlace p; - CacheTester ct; - RunOperator(p, "softmax", dims, "softmax_out"); - RunOperator(p, "softmax", dims, "softmax_out", true); - PADDLE_ENFORCE_EQ(ct.Analyze(7), true, - platform::errors::InvalidArgument( - "Wrong number of cached oneDNN objects")); -} - -TEST(test_relu_inplace_cache, cpu_place) { - framework::DDim dims({32, 64}); - platform::CPUPlace p; - CacheTester ct; - RunOperator(p, "relu", dims, "relu_out"); - RunOperator(p, "relu", dims, "relu_out", true); - PADDLE_ENFORCE_EQ(ct.Analyze(7), true, - platform::errors::InvalidArgument( - "Wrong number of cached oneDNN objects")); -} - -TEST(test_elementwise_add_reuse_cache, cpu_place) { - framework::DDim dims({32, 64}); - platform::CPUPlace p; - CacheTester ct; - RunOperator(p, "elementwise_add", dims, "elementwise_add_out"); - RunOperator(p, "relu", dims, "elementwise_add_out", true); - PADDLE_ENFORCE_EQ(ct.Analyze(8), true, - platform::errors::InvalidArgument( - "Wrong number of cached oneDNN objects")); + "Invalid number of cached oneDNN objects")); } } // namespace operators diff --git a/paddle/fluid/operators/size_op_npu.cc b/paddle/fluid/operators/size_op_npu.cc new file mode 100644 index 00000000000000..4e9c2ec482e927 --- /dev/null +++ b/paddle/fluid/operators/size_op_npu.cc @@ -0,0 +1,51 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/mul_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class SizeNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("Input"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + Tensor cpu_tensor; + auto cpu_data = + cpu_tensor.mutable_data(out->dims(), platform::CPUPlace()); + cpu_data[0] = x->numel(); + TensorCopy(cpu_tensor, ctx.GetPlace(), + ctx.template device_context(), out); + ctx.template device_context().Wait(); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + size, ops::SizeNPUKernel, + ops::SizeNPUKernel, + ops::SizeNPUKernel, + ops::SizeNPUKernel, + ops::SizeNPUKernel, + ops::SizeNPUKernel); diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index f63d45d7ff6ae6..95b8e0c610b1d4 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -34,6 +34,211 @@ using framework::Tensor; using user_function = std::function(const float*)>; using memory = mkldnn::memory; +template +class MKLDNNHandlerNoCachingT { + public: + MKLDNNHandlerNoCachingT(mkldnn::engine engine, platform::Place cpu_place) + : engine_(engine), place_(cpu_place), fwd_pd_(nullptr), bwd_pd_(nullptr) { + platform::MKLDNNDeviceContext::tls().log_lib_version(); + } + + std::shared_ptr AcquireForwardPrimitive() { + return std::make_shared(*fwd_pd_); + } + + std::shared_ptr AcquireBackwardPrimitive() { + return std::make_shared(*bwd_pd_); + } + + std::shared_ptr AcquireBackwardWeightsPrimitive() { + PADDLE_ENFORCE_NOT_NULL( + bwd_w_pd_, platform::errors::Unavailable("BWD_PD should be set when " + "getting BWD prim .")); + return std::make_shared(*bwd_w_pd_); + } + + std::shared_ptr AcquireSrcMemory( + const framework::Tensor* input) { + const T* input_data = input->data(); + return this->AcquireMemoryFromPrimitive(fwd_pd_->src_desc(), + to_void_cast(input_data)); + } + + template + std::shared_ptr AcquireDstMemory(framework::Tensor* output) { + T_out* ptr = + output->mutable_data(place_, fwd_pd_->dst_desc().get_size()); + return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), ptr); + } + + template + std::shared_ptr AcquireDstMemory(void) { + return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc()); + } + + template + std::shared_ptr AcquireDstMemory( + const framework::Tensor* output) { + const T_out* output_data = output->data(); + return this->AcquireMemoryFromPrimitive(bwd_pd_->dst_desc(), + to_void_cast(output_data)); + } + + std::shared_ptr AcquireDiffDstMemory( + const framework::Tensor* diffdst) { + const T* ptr = diffdst->data(); + return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_dst_desc(), + to_void_cast(ptr)); + } + + std::shared_ptr AcquireDiffSrcMemory( + framework::Tensor* diffsrc) { + T* ptr = + diffsrc->mutable_data(place_, bwd_pd_->diff_src_desc().get_size()); + return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_src_desc(), ptr); + } + + // Buffer of given Tensor is used for oneDNN computation + std::shared_ptr AcquireDiffWeightsMemory( + framework::Tensor* diff_weights) { + PADDLE_ENFORCE_NOT_NULL( + bwd_w_pd_, + platform::errors::Unavailable( + "BWD_W_PD should be set when getting BWD grad of weights.")); + T* ptr = diff_weights->mutable_data( + place_, bwd_w_pd_->diff_weights_desc().get_size()); + return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(), + ptr); + } + + // Buffer is allocated by oneDNN to store computation results + std::shared_ptr AcquireDiffWeightsMemory(void) { + PADDLE_ENFORCE_NOT_NULL( + bwd_w_pd_, + platform::errors::Unavailable( + "BWD_W_PD should be set when getting BWD grad of weights.")); + return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc()); + } + + protected: + // If your primitive descriptor requires attributes, pass them as a + // first argument and paramters to descriptor constructor in the following + // arguments. Otherwise, all arguments will be forwarded to descriptor + // constructor, including the first one. + template + void AcquireForwardPrimitiveDescriptor(Arg&& first_arg, Args&&... args) { + CreateForwardPrimitiveDescriptor(first_arg, std::forward(args)...); + } + + // Using sfinae to specialise variadic function. Workaround for not having + // if constexpr in C++ 11. + template + typename std::enable_if::type, + dnnl::primitive_attr>::value>::type + CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) { + auto fwd_desc = typename TForward::desc(std::forward(args)...); + fwd_pd_ = std::make_shared( + fwd_desc, first, engine_); + } + + template + typename std::enable_if::type, + dnnl::primitive_attr>::value>::type + CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) { + auto fwd_desc = typename TForward::desc(std::forward(first), + std::forward(args)...); + fwd_pd_ = + std::make_shared(fwd_desc, engine_); + } + + template + void AcquireBackwardPrimitiveDescriptor(Args&&... args) { + // fwd_pd_ is set during grad by calling + // AcquireForwardPrimitiveDescriptor + PADDLE_ENFORCE_NOT_NULL(fwd_pd_, + platform::errors::Unavailable( + "Get MKLDNN Forward primitive %s failed.")); + auto bwd_desc = typename TBackward::desc(std::forward(args)...); + bwd_pd_ = std::make_shared( + bwd_desc, engine_, *fwd_pd_); + } + + template + void AcquireBackwardWeightsPrimitiveDescriptor(Args&&... args) { + // fwd_pd_ is set during grad by calling + // AcquireForwardPrimitiveDescriptor + PADDLE_ENFORCE_NOT_NULL(fwd_pd_, + platform::errors::Unavailable( + "Get MKLDNN Forward primitive %s failed.")); + auto bwd_desc = + typename TBackward_params::desc(std::forward(args)...); + bwd_w_pd_ = std::make_shared( + bwd_desc, engine_, *fwd_pd_); + } + + std::shared_ptr AcquireMemoryFromPrimitive( + mkldnn::memory::desc md, void* ptr) { + return std::make_shared(md, engine_, ptr); + } + + std::shared_ptr AcquireMemoryFromPrimitive( + mkldnn::memory::desc md) { + return std::make_shared(md, engine_); + } + + void AcquireReorder(const std::shared_ptr& user_memory_p, + const std::shared_ptr& target_memory_p) { + auto reorder_p = + std::make_shared(*user_memory_p, *target_memory_p); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + + platform::RecordEvent record_reorder("int_reorder", + platform::EventRole::kUniqueOp); + reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, + {MKLDNN_ARG_TO, *target_memory_p}}); + astream.wait(); + } + + template + std::shared_ptr AcquireMemoryWithReorder( + const mkldnn::memory::desc& user_md, + const mkldnn::memory::desc& target_md, void* ptr, + const std::string& suffix, bool is_persistent = false, + std::function(const F*)> custom_reorder_func = {}) { + std::shared_ptr target_memory_p; + if (custom_reorder_func) { + auto reordered_data = + custom_reorder_func(reinterpret_cast(ptr)); + ptr = reinterpret_cast(reordered_data.get()); + } + auto user_memory_p = std::make_shared(user_md, engine_, ptr); + if (user_md != target_md) { + target_memory_p = std::make_shared(target_md, engine_); + auto reorder_p = + std::make_shared(*user_memory_p, *target_memory_p); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + platform::RecordEvent record_reorder("int_reorder", + platform::EventRole::kUniqueOp); + reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, + {MKLDNN_ARG_TO, *target_memory_p}}); + astream.wait(); + } else { + target_memory_p = user_memory_p; + } + return target_memory_p; + } + + mkldnn::engine engine_; + platform::Place place_; + std::shared_ptr fwd_pd_; + std::shared_ptr bwd_pd_; + std::shared_ptr bwd_w_pd_; +}; + template @@ -79,7 +284,7 @@ class MKLDNNHandlerT { std::static_pointer_cast(dev_ctx_.GetBlob(key_p)); if (backward_p == nullptr) { PADDLE_ENFORCE_NOT_NULL(bwd_w_pd_, platform::errors::Unavailable( - "Error: BWD_PD should be set when " + "BWD_PD should be set when " "getting BWD prim witk key: %s .", key_p)); backward_p = std::make_shared(*bwd_w_pd_); @@ -138,7 +343,7 @@ class MKLDNNHandlerT { PADDLE_ENFORCE_NOT_NULL( bwd_w_pd_, platform::errors::Unavailable( - "Error: BWD_W_PD should be set when getting BWD grad of weights.")); + "BWD_W_PD should be set when getting BWD grad of weights.")); T* ptr = diff_weights->mutable_data( place_, bwd_w_pd_->diff_weights_desc().get_size()); return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(), ptr, @@ -150,7 +355,7 @@ class MKLDNNHandlerT { PADDLE_ENFORCE_NOT_NULL( bwd_w_pd_, platform::errors::Unavailable( - "Error: BWD_W_PD should be set when getting BWD grad of weights.")); + "BWD_W_PD should be set when getting BWD grad of weights.")); return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(), "@diff_wei_mem_p"); } @@ -589,70 +794,70 @@ class MKLDNNHandler { }; template -class BinaryMKLDNNHandler : public platform::MKLDNNHandlerT { +class BinaryMKLDNNHandler + : public platform::MKLDNNHandlerNoCachingT { public: BinaryMKLDNNHandler(const dnnl::algorithm algo, const int axis, - const MKLDNNDeviceContext& dev_ctx, const mkldnn::engine engine, platform::Place cpu_place, const Tensor* x, const Tensor* y, Tensor* z, - float scale_x, float scale_y, float scale_z, - const std::string& uniq_name) - : platform::MKLDNNHandlerT( - dev_ctx, engine, cpu_place, - platform::CreateKey(dev_ctx, framework::vectorize(x->dims()), - uniq_name)) { - if (!this->isCached()) { - PADDLE_ENFORCE_EQ( - x->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument("Wrong layout set for X tensor.")); - PADDLE_ENFORCE_NE( - x->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument("Wrong format set for X tensor.")); - - PADDLE_ENFORCE_EQ( - y->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument("Wrong layout set for Y tensor.")); - PADDLE_ENFORCE_NE( - y->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument("Wrong format set for Y tensor.")); - - const auto src_x_tz = framework::vectorize(x->dims()); - const auto src_y_tz = framework::vectorize(y->dims()); - // if output tensor(z) is nullptr then we are computing into oneDNN - // managed buffer - auto rankdiff = x->dims().size() - y->dims().size(); - const auto dst_tz = (z == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz) - : framework::vectorize(z->dims()); - - auto src0_md = dnnl::memory::desc( - src_x_tz, platform::MKLDNNGetDataType(), x->format()); - auto src1_md = dnnl::memory::desc( - src_y_tz, platform::MKLDNNGetDataType(), y->format()); - if (rankdiff > 0) { // Second input is of smaller rank than first - std::vector dims1_ex(rankdiff, 1); - dims1_ex.insert(next(dims1_ex.begin(), (axis == -1 ? rankdiff : axis)), - src_y_tz.begin(), src_y_tz.end()); - src1_md = src1_md.reshape(dims1_ex); - } else if (rankdiff < 0) { // First input is of smaller than second - std::vector dims0_ex(-rankdiff, 1); - dims0_ex.insert(next(dims0_ex.begin(), (axis == -1 ? -rankdiff : axis)), - src_x_tz.begin(), src_x_tz.end()); - src0_md = src0_md.reshape(dims0_ex); - } - const auto dst_md = memory::desc(dst_tz, platform::MKLDNNGetDataType(), - MKLDNNMemoryFormat::any); - - auto attributes = CreateAttributes(algo, scale_x, scale_y, scale_z); - this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, - src1_md, dst_md); + float scale_x, float scale_y, float scale_z) + : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { + PADDLE_ENFORCE_EQ( + x->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument( + "Wrong layout set for X tensor. Expected: %d (kMKLDNN), Actual: %d", + DataLayout::kMKLDNN, x->layout())); + PADDLE_ENFORCE_NE(x->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument( + "Wrong format set for X tensor : %d (undef)", + static_cast(x->format()))); + + PADDLE_ENFORCE_EQ( + y->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument( + "Wrong layout set for Y tensor. Expected: %d (kMKLDNN), Actual: %d", + DataLayout::kMKLDNN, y->layout())); + PADDLE_ENFORCE_NE(y->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument( + "Wrong format set for Y tensor : %d (undef)", + static_cast(y->format()))); + + const auto src_x_tz = framework::vectorize(x->dims()); + const auto src_y_tz = framework::vectorize(y->dims()); + // if output tensor(z) is nullptr then we are computing into oneDNN + // managed buffer + auto rankdiff = x->dims().size() - y->dims().size(); + const auto dst_tz = (z == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz) + : framework::vectorize(z->dims()); + + auto src0_md = dnnl::memory::desc( + src_x_tz, platform::MKLDNNGetDataType(), x->format()); + auto src1_md = dnnl::memory::desc( + src_y_tz, platform::MKLDNNGetDataType(), y->format()); + if (rankdiff > 0) { // Second input is of smaller rank than first + std::vector dims1_ex(rankdiff, 1); + dims1_ex.insert(next(dims1_ex.begin(), (axis == -1 ? rankdiff : axis)), + src_y_tz.begin(), src_y_tz.end()); + src1_md = src1_md.reshape(dims1_ex); + } else if (rankdiff < 0) { // First input is of smaller than second + std::vector dims0_ex(-rankdiff, 1); + dims0_ex.insert(next(dims0_ex.begin(), (axis == -1 ? -rankdiff : axis)), + src_x_tz.begin(), src_x_tz.end()); + src0_md = src0_md.reshape(dims0_ex); } + const auto dst_md = memory::desc(dst_tz, platform::MKLDNNGetDataType(), + MKLDNNMemoryFormat::any); + + auto attributes = CreateAttributes(algo, scale_x, scale_y, scale_z); + this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, src1_md, + dst_md); } std::shared_ptr AcquireSecondSrcMemory( const framework::Tensor* input) { const T* input_data = input->data(); - return this->AcquireMemoryFromPrimitive( - this->fwd_pd_->src1_desc(), to_void_cast(input_data), "@src1_mem_p"); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->src1_desc(), + to_void_cast(input_data)); } private: @@ -775,111 +980,95 @@ class ReductionMKLDNNHandler template class ActivationMKLDNNHandler - : public MKLDNNHandlerT { + : public MKLDNNHandlerNoCachingT { public: ActivationMKLDNNHandler(mkldnn::algorithm algorithm, const framework::ExecutionContext& ctx, - const MKLDNNDeviceContext& dev_ctx, Place cpu_place, - const framework::Tensor* in_x, - const std::string& unique_name, bool is_inplaced) - : platform::MKLDNNHandlerT( - dev_ctx, dev_ctx.GetEngine(), cpu_place, - is_inplaced ? platform::CreateKey( - dev_ctx, framework::vectorize(in_x->dims()), "a", - algorithm, unique_name) - : platform::CreateKey( - dev_ctx, framework::vectorize(in_x->dims()), "a", - unique_name)) { - if (!this->isCached()) { - float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; - float beta = ctx.HasAttr("beta") ? ctx.Attr("beta") : 0; - // eltwise_linear means we are in scale op - if (algorithm == mkldnn::algorithm::eltwise_linear) { - bool bias_after_scale = ctx.Attr("bias_after_scale"); - auto* scale_tensor = ctx.Input("ScaleTensor"); - alpha = (scale_tensor == nullptr) ? ctx.Attr("scale") - : (float)*(scale_tensor->data()); - beta = ctx.Attr("bias"); - // if bias_after_scale == true - // out = scale*X + bias - // else - // out = scale*(X + bias) = scale*X + scale*bias - if (!bias_after_scale) beta *= alpha; - } else { - // paddle uses beta but mkldnn uses alpha for swish - if (algorithm == mkldnn::algorithm::eltwise_swish) { - std::swap(alpha, beta); - } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { - alpha = ctx.Attr("threshold"); - } + const mkldnn::engine engine, Place cpu_place, + const framework::Tensor* in_x) + : platform::MKLDNNHandlerNoCachingT(engine, + cpu_place) { + float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; + float beta = ctx.HasAttr("beta") ? ctx.Attr("beta") : 0; + // eltwise_linear means we are in scale op + if (algorithm == mkldnn::algorithm::eltwise_linear) { + bool bias_after_scale = ctx.Attr("bias_after_scale"); + auto* scale_tensor = ctx.Input("ScaleTensor"); + alpha = (scale_tensor == nullptr) ? ctx.Attr("scale") + : (float)*(scale_tensor->data()); + beta = ctx.Attr("bias"); + // if bias_after_scale == true + // out = scale*X + bias + // else + // out = scale*(X + bias) = scale*X + scale*bias + if (!bias_after_scale) beta *= alpha; + } else { + // paddle uses beta but mkldnn uses alpha for swish + if (algorithm == mkldnn::algorithm::eltwise_swish) { + std::swap(alpha, beta); + } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { + alpha = ctx.Attr("threshold"); } + } - PADDLE_ENFORCE(in_x->dims().size() >= 1 || in_x->dims().size() <= 6, - platform::errors::Unimplemented( - "Input dimension size can be 1, 2, 3, 4, " - "5, or 6, but now the dimension size is", - in_x->dims().size())); + PADDLE_ENFORCE(in_x->dims().size() >= 1 || in_x->dims().size() <= 6, + platform::errors::Unimplemented( + "Input dimension size can be 1, 2, 3, 4, " + "5, or 6, but now the dimension size is", + in_x->dims().size())); - auto src_tz = framework::vectorize(in_x->dims()); - auto src_fmt = - src_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format(); - auto md = mkldnn::memory::desc(src_tz, platform::MKLDNNGetDataType(), - src_fmt); + auto src_tz = framework::vectorize(in_x->dims()); + auto src_fmt = src_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format(); + auto md = + mkldnn::memory::desc(src_tz, platform::MKLDNNGetDataType(), src_fmt); - this->AcquireForwardPrimitiveDescriptor( - mkldnn::prop_kind::forward_training, algorithm, md, alpha, beta); - } + this->AcquireForwardPrimitiveDescriptor(mkldnn::prop_kind::forward_training, + algorithm, md, alpha, beta); } ActivationMKLDNNHandler(mkldnn::algorithm algorithm, const framework::ExecutionContext& ctx, - const MKLDNNDeviceContext& dev_ctx, Place cpu_place, - const framework::Tensor* in_x, const Tensor* out_grad, - const std::string& unique_name) - : platform::MKLDNNHandlerT( - dev_ctx, dev_ctx.GetEngine(), cpu_place, - platform::CreateKey(dev_ctx, framework::vectorize(in_x->dims()), - "a", unique_name)) { - if (!this->isBwdCached()) { - float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; - float beta = ctx.HasAttr("beta") ? ctx.Attr("beta") : 0; - - // paddle uses beta but mkldnn uses alpha for swish - if (algorithm == mkldnn::algorithm::eltwise_swish) { - std::swap(alpha, beta); - } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { - alpha = ctx.Attr("threshold"); - } + const mkldnn::engine engine, Place cpu_place, + const framework::Tensor* in_x, const Tensor* out_grad) + : platform::MKLDNNHandlerNoCachingT(engine, + cpu_place) { + float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; + float beta = ctx.HasAttr("beta") ? ctx.Attr("beta") : 0; + + // paddle uses beta but mkldnn uses alpha for swish + if (algorithm == mkldnn::algorithm::eltwise_swish) { + std::swap(alpha, beta); + } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { + alpha = ctx.Attr("threshold"); + } - auto diff_dst_tz = framework::vectorize(out_grad->dims()); + auto diff_dst_tz = framework::vectorize(out_grad->dims()); - auto src_fmt = - diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format(); - auto diff_fmt = - diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : out_grad->format(); + auto src_fmt = + diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format(); + auto diff_fmt = + diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : out_grad->format(); - auto dims = framework::vectorize(in_x->dims()); - auto diff_dst_md = platform::MKLDNNMemDesc( - dims, platform::MKLDNNGetDataType(), diff_fmt); - auto src_md = platform::MKLDNNMemDesc( - dims, platform::MKLDNNGetDataType(), src_fmt); + auto dims = framework::vectorize(in_x->dims()); + auto diff_dst_md = platform::MKLDNNMemDesc( + dims, platform::MKLDNNGetDataType(), diff_fmt); + auto src_md = platform::MKLDNNMemDesc( + dims, platform::MKLDNNGetDataType(), src_fmt); - this->AcquireForwardPrimitiveDescriptor( - mkldnn::prop_kind::forward_training, algorithm, src_md, alpha, beta); - this->AcquireBackwardPrimitiveDescriptor(algorithm, diff_dst_md, src_md, - alpha, beta); - } + this->AcquireForwardPrimitiveDescriptor(mkldnn::prop_kind::forward_training, + algorithm, src_md, alpha, beta); + this->AcquireBackwardPrimitiveDescriptor(algorithm, diff_dst_md, src_md, + alpha, beta); } std::shared_ptr AcquireBackwardSrcMemory( const framework::Tensor* input) { const T* input_data = input->data(); return this->AcquireMemoryFromPrimitive(this->bwd_pd_->src_desc(), - to_void_cast(input_data), - "@bwd-src_mem_p"); + to_void_cast(input_data)); } }; @@ -1430,11 +1619,6 @@ using ConvMKLDNNHandler = mkldnn::convolution_backward_data, mkldnn::convolution_backward_weights>; -using ConvTransposeMKLDNNHandler = - ConvMKLDNNTemplateHandler; - template static std::shared_ptr SetDstMemory( const framework::ExecutionContext& ctx, framework::Tensor* output, diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 0b6af3b542395d..6c4213979a46be 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -785,7 +785,8 @@ void BindImperative(py::module *m_ptr) { const int size = PyTuple_GET_SIZE(index_ptr); for (int dim = 0; dim < size; ++dim) { PyObject *slice_item = PyTuple_GetItem(index_ptr, dim); - if (!(PyCheckInteger(slice_item) || PySlice_Check(slice_item))) { + if (!(PyCheckInteger(slice_item) || PySlice_Check(slice_item) || + slice_item == Py_Ellipsis || slice_item == Py_None)) { parse_index = false; break; } @@ -807,7 +808,8 @@ void BindImperative(py::module *m_ptr) { {"starts", starts}, {"ends", ends}, {"steps", steps}, - {"decrease_axes", decrease_axes}}; + {"decrease_axes", decrease_axes}, + {"none_axes", none_axes}}; imperative::NameVarBaseMap ins = {{"Input", {self}}}; imperative::NameVarBaseMap outs = {{"Out", {self}}}; diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index 6208b43c9e9e48..b8c0e47e9bbc26 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -42,18 +42,17 @@ class ImperativeQuantAware(object): Applying quantization aware training (QAT) to the dgraph model. """ - def __init__( - self, - quantizable_layer_type=['Conv2D', 'Linear', 'Conv2DTranspose'], - weight_quantize_type='abs_max', - activation_quantize_type='moving_average_abs_max', - weight_bits=8, - activation_bits=8, - moving_rate=0.9, - weight_preprocess_layer=None, - act_preprocess_layer=None, - weight_quantize_layer=None, - act_quantize_layer=None): + def __init__(self, + quantizable_layer_type=['Conv2D', 'Linear'], + weight_quantize_type='abs_max', + activation_quantize_type='moving_average_abs_max', + weight_bits=8, + activation_bits=8, + moving_rate=0.9, + weight_preprocess_layer=None, + act_preprocess_layer=None, + weight_quantize_layer=None, + act_quantize_layer=None): """ The constructor for ImperativeQuantAware. @@ -213,44 +212,9 @@ def quantize(self, model): the out_scale value of outputs would be calculated. Args: - model(paddle.nn.Layer): the model to be quantized. + model(fluid.dygraph.Layer): the model to be quantized. Returns: None - - Examples: - .. code-block:: python - - import paddle - from paddle.fluid.contrib.slim.quantization \ - import ImperativeQuantAware - - class ImperativeModel(paddle.nn.Layer): - def __init__(self): - super(ImperativeModel, self).__init__() - # self.linear_0 would skip the quantization. - self.linear_0 = paddle.nn.Linear(784, 400) - self.linear_0.skip_quant = True - - # self.linear_1 would not skip the quantization. - self.linear_1 = paddle.nn.Linear(400, 10) - self.linear_1.skip_quant = False - - def forward(self, inputs): - x = self.linear_0(inputs) - x = self.linear_1(inputs) - return x - - model = ImperativeModel() - imperative_qat = ImperativeQuantAware( - weight_quantize_type='abs_max', - activation_quantize_type='moving_average_abs_max') - - # Add the fake quant logical. - # The original model will be rewrite. - # - # There is only one Layer(self.linear1) would be added the - # fake quant logical. - imperative_qat.quantize(model) """ assert isinstance(model, dygraph.Layer), \ "The model must be the instance of dygraph.Layer." @@ -268,18 +232,17 @@ class ImperativeQuantizeInputs(object): logic both for activation inputs and weight inputs. """ - def __init__( - self, - quantizable_layer_type=['Conv2D', 'Linear', 'Conv2DTranspose'], - weight_quantize_type='abs_max', - activation_quantize_type='moving_average_abs_max', - weight_bits=8, - activation_bits=8, - moving_rate=0.9, - weight_preprocess_layer=None, - act_preprocess_layer=None, - weight_quantize_layer=None, - act_quantize_layer=None): + def __init__(self, + quantizable_layer_type=['Conv2D', 'Linear'], + weight_quantize_type='abs_max', + activation_quantize_type='moving_average_abs_max', + weight_bits=8, + activation_bits=8, + moving_rate=0.9, + weight_preprocess_layer=None, + act_preprocess_layer=None, + weight_quantize_layer=None, + act_quantize_layer=None): """ The constructor for ImperativeQuantizeInputs. @@ -340,18 +303,6 @@ def __init__( } def apply(self, model): - """ - Quantize the weights and activations to calculate for specific - layers. - - Args: - model(paddle.nn.Layer): The target model which would - calculate the input quantization scale. - - Returns: - None - """ - assert isinstance(model, dygraph.Layer), \ "The model must be the instance of dygraph.Layer." @@ -403,7 +354,7 @@ def apply(self, model): output scales for specific layers in the dygraph model. Args: - model(paddle.nn.Layer): The target model which would be + model(fluid.dygraph.Layer): The target model which would be calculate the output quantization scale. Returns: @@ -593,9 +544,7 @@ def _is_skip_quant_op(self, block, in_op): 1. the type of input op should be conv2d, depthwise_conv2d or matmul 2. the previous ops of the input op are not fake_quantize_dequantize ops """ - target_op_types = [ - "conv2d", "depthwise_conv2d", "matmul", "conv2d_transpose" - ] + target_op_types = ["conv2d", "depthwise_conv2d", "matmul"] if in_op.type not in target_op_types: return False diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py index 009ce372b4f29c..a9d52c5a87ad36 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py @@ -24,7 +24,6 @@ from ..quantization_pass import _get_input_name_index layer_name_map = { - 'Conv2DTranspose': paddle.nn.Conv2DTranspose, 'Conv2D': paddle.nn.Conv2D, 'Linear': paddle.nn.Linear, 'AdaptiveAvgPool2D': paddle.nn.AdaptiveAvgPool2D, @@ -47,9 +46,8 @@ } # Apply fake quant for the inputs of these layers -fake_quant_input_layers = [ - paddle.nn.Conv2D, paddle.nn.Linear, paddle.nn.Conv2DTranspose -] +# TODO (jc): support paddle.nn.Conv2DTranspose +fake_quant_input_layers = [paddle.nn.Conv2D, paddle.nn.Linear] # Apply fake quant for the output of these layers # TODO(jc): fix the problem of adding duplicate fake_quant ops @@ -67,8 +65,7 @@ ] fake_quant_wrap_layers = [ - quant_layers.QuantizedConv2D, quant_layers.QuantizedLinear, - quant_layers.QuantizedConv2DTranspose + quant_layers.QuantizedConv2D, quant_layers.QuantizedLinear ] # The weight format of these layers is Cin * Cout * H * W @@ -87,9 +84,9 @@ def load_variable_data(scope, var_name): - """ + ''' Load variable value from scope - """ + ''' var_node = scope.find_var(var_name) assert var_node is not None, \ "Can not find " + var_name + " in the scope." @@ -123,12 +120,6 @@ def find_parent_layer_and_sub_name(model, name): the sub_name of the layer. For example, if name is 'block_1/convbn_1/conv_1', the parent layer is 'block_1/convbn_1' and the sub_name is `conv_1`. - Args: - model(paddle.nn.Layer): the model to be quantized. - name(string): the name of a layer - - Returns: - parent_layer, subname """ assert isinstance(model, paddle.nn.Layer), \ "The model must be the instance of paddle.nn.Layer." diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index 9917730daa543f..c2d7a9bb4d5174 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -1273,12 +1273,17 @@ def _insert_post_channel_dequant_op(self, graph, op_node, quant_axis): var_type=output_var_node.type(), shape=output_var_node.shape(), var_dtype=output_var_node.dtype()) + if op_node.op().has_attr("x_num_col_dims"): + x_num_col_dims = op_node.op().attr("x_num_col_dims") + else: + x_num_col_dims = 1 dequant_op_node = graph.create_op_node( op_type='fake_channel_wise_dequantize_max_abs', attrs={ 'quant_bits': [self._weight_bits, self._activation_bits], 'quant_axis': quant_axis, - 'op_role': core.op_proto_and_checker_maker.OpRole.Forward + 'op_role': core.op_proto_and_checker_maker.OpRole.Forward, + 'x_num_col_dims': x_num_col_dims }, inputs={ 'X': output_var_node, diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py index 677ccb52e242cf..14fa291ee077c6 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py @@ -28,10 +28,10 @@ from paddle.fluid.optimizer import AdamOptimizer from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware from paddle.fluid.dygraph.container import Sequential -from paddle.nn import Linear, Conv2D, Softmax, Conv2DTranspose +from paddle.nn import Linear, Conv2D, Softmax from paddle.fluid.log_helper import get_logger from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX -from paddle.nn.quant.quant_layers import QuantizedConv2D, QuantizedConv2DTranspose +from paddle.nn.quant.quant_layers import QuantizedConv2D from imperative_test_utils import fix_model_dict, ImperativeLenet @@ -75,12 +75,6 @@ def test_qat(self): data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') quant_conv1(fluid.dygraph.to_variable(data)) - conv_transpose = Conv2DTranspose(4, 6, (3, 3)) - quant_conv_transpose = QuantizedConv2DTranspose(conv_transpose) - x_var = paddle.uniform( - (2, 4, 8, 8), dtype='float32', min=-1.0, max=1.0) - quant_conv_transpose(x_var) - seed = 1 np.random.seed(seed) fluid.default_main_program().random_seed = seed diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py index 270e8ee566ab57..621213beb31cd7 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py @@ -28,7 +28,6 @@ from paddle.fluid.dygraph import Conv2D from paddle.fluid.dygraph import Pool2D from paddle.fluid.dygraph import Linear -from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose from paddle.fluid.log_helper import get_logger os.environ["CPU_NUM"] = "1" @@ -101,19 +100,6 @@ def dequantize(x, lower_bound, delta, interval): return x -class ModelForConv2dT(nn.Layer): - def __init__(self, num_classes=10): - super(ModelForConv2dT, self).__init__() - self.features = nn.Conv2DTranspose(4, 6, (3, 3)) - self.fc = Linear(input_dim=600, output_dim=num_classes) - - def forward(self, inputs): - x = self.features(inputs) - x = paddle.flatten(x, 1) - x = self.fc(x) - return x - - class ImperativeLenet(paddle.nn.Layer): def __init__(self, num_classes=10, classifier_activation='softmax'): super(ImperativeLenet, self).__init__() @@ -182,11 +168,6 @@ def test_quant_aware_training(self): imperative_qat.quantize(lenet) adam = Adam(learning_rate=0.001, parameters=lenet.parameters()) dynamic_loss_rec = [] - #for CI coverage - conv_transpose = ModelForConv2dT() - imperative_qat.quantize(conv_transpose) - x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.) - conv_transpose(x_var) def train(model): adam = Adam(learning_rate=0.001, parameters=model.parameters()) diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py index 607a3c94f8a4e7..9a21a5a850db97 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py @@ -825,6 +825,8 @@ def _minimize(self, opt_info["worker_skipped_ops"] = worker_skipped_ops opt_info["use_cvm"] = strategy.get("use_cvm", False) opt_info["no_cvm"] = strategy.get("no_cvm", False) + opt_info["scale_sparse_gradient_with_batch_size"] = strategy.get( + "scale_sparse_gradient_with_batch_size", True) opt_info["worker_class"] = strategy.get("worker_class", "DownpourWorker") opt_info["stat_var_names"] = strategy.get("stat_var_names", []) diff --git a/python/paddle/fluid/tests/unittests/npu/test_size_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_size_op_npu.py new file mode 100755 index 00000000000000..80721cbd66a558 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_size_op_npu.py @@ -0,0 +1,141 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import sys +sys.path.append("..") +import paddle +import paddle.fluid as fluid +from op_test import OpTest + +paddle.enable_static() + + +class TestSizeOp(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "size" + + self.config() + input = np.zeros(self.shape, dtype=self.dtype) + self.inputs = {'Input': input} + self.outputs = {'Out': np.array([np.size(input)], dtype=np.int64)} + + def config(self): + self.shape = [1, 2] + self.dtype = np.int32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def set_npu(self): + self.__class__.use_npu = True + + +class TestSizeOp1(TestSizeOp): + def config(self): + self.shape = [2] + self.dtype = np.float64 + + +class TestSizeOp2(TestSizeOp): + def config(self): + self.shape = [2, 3] + self.dtype = np.float32 + + +class TestSizeOp3(TestSizeOp): + def config(self): + self.shape = [2, 3, 100] + self.dtype = np.float16 + + +class TestSizeOp4(TestSizeOp): + def config(self): + self.shape = [2**10] + self.dtype = np.bool + + +class TestSizeOp5(TestSizeOp): + def config(self): + self.shape = [7, 8, 9, 10] + self.dtype = np.int64 + + +class TestSizeOp6(TestSizeOp): + def config(self): + self.shape = [] + self.dtype = np.int64 + + +class TestSizeAPI(unittest.TestCase): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + + def set_npu(self): + self.__class__.use_npu = True + + def test_size_static(self): + main_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(main_program, startup_program): + shape1 = [2, 1, 4, 5] + shape2 = [1, 4, 5] + x_1 = paddle.fluid.data(shape=shape1, dtype='int32', name='x_1') + x_2 = paddle.fluid.data(shape=shape2, dtype='int32', name='x_2') + input_1 = np.random.random(shape1).astype("int32") + input_2 = np.random.random(shape2).astype("int32") + out_1 = paddle.fluid.layers.size(x_1) + out_2 = paddle.fluid.layers.size(x_2) + exe = paddle.static.Executor(place=self.place) + res_1, res_2 = exe.run(feed={ + "x_1": input_1, + "x_2": input_2, + }, + fetch_list=[out_1, out_2]) + assert (np.array_equal( + res_1, np.array([np.size(input_1)]).astype("int64"))) + assert (np.array_equal( + res_2, np.array([np.size(input_2)]).astype("int64"))) + + def test_size_imperative(self): + paddle.disable_static(self.place) + input_1 = np.random.random([2, 1, 4, 5]).astype("int32") + input_2 = np.random.random([1, 4, 5]).astype("int32") + x_1 = paddle.to_tensor(input_1) + x_2 = paddle.to_tensor(input_2) + out_1 = paddle.fluid.layers.size(x_1) + out_2 = paddle.fluid.layers.size(x_2) + assert (np.array_equal(out_1.numpy().item(0), np.size(input_1))) + assert (np.array_equal(out_2.numpy().item(0), np.size(input_2))) + paddle.enable_static() + + def test_error(self): + main_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(main_program, startup_program): + + def test_x_type(): + shape = [1, 4, 5] + input_1 = np.random.random(shape).astype("int32") + out_1 = paddle.fluid.layers.size(input_1) + + self.assertRaises(TypeError, test_x_type) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/trainer_desc.py b/python/paddle/fluid/trainer_desc.py index 4eca3a494e25a4..6152bce55ce9f2 100644 --- a/python/paddle/fluid/trainer_desc.py +++ b/python/paddle/fluid/trainer_desc.py @@ -124,6 +124,10 @@ def _set_use_cvm(self, use_cvm=False): def _set_no_cvm(self, no_cvm=False): self.proto_desc.no_cvm = no_cvm + def _set_scale_sparse_grad_with_batch_size( + self, scale_sparse_gradient_with_batch_size=True): + self.proto_desc.scale_sparse_gradient_with_batch_size = scale_sparse_gradient_with_batch_size + def _set_scale_datanorm(self, scale_datanorm=-1): self.proto_desc.scale_datanorm = scale_datanorm diff --git a/python/paddle/fluid/trainer_factory.py b/python/paddle/fluid/trainer_factory.py index 7912ffca84ba41..ed10bee2e063a7 100644 --- a/python/paddle/fluid/trainer_factory.py +++ b/python/paddle/fluid/trainer_factory.py @@ -95,6 +95,10 @@ def _create_trainer(self, opt_info=None): trainer._set_use_cvm(opt_info["use_cvm"]) if opt_info.get("no_cvm") is not None: trainer._set_no_cvm(opt_info["no_cvm"]) + if opt_info.get( + "scale_sparse_gradient_with_batch_size") is not None: + trainer._set_scale_sparse_grad_with_batch_size(opt_info[ + "scale_sparse_gradient_with_batch_size"]) if opt_info.get("scale_datanorm") is not None: trainer._set_scale_datanorm(opt_info["scale_datanorm"]) if opt_info.get("adjust_ins_weight") is not None: diff --git a/python/paddle/nn/quant/quant_layers.py b/python/paddle/nn/quant/quant_layers.py index 040b04f5e7bf1e..5573683ebd0458 100644 --- a/python/paddle/nn/quant/quant_layers.py +++ b/python/paddle/nn/quant/quant_layers.py @@ -31,7 +31,6 @@ 'FakeQuantMovingAverageAbsMax', 'FakeQuantChannelWiseAbsMax', 'QuantizedConv2D', - 'QuantizedConv2DTranspose', 'QuantizedLinear', 'MovingAverageAbsMaxScale', 'MAOutputScaleLayer', @@ -482,112 +481,6 @@ def forward(self, input): data_format=self._data_format) -class QuantizedConv2DTranspose(layers.Layer): - """ - The computational logic of QuantizedConv2DTranspose is the same with Conv2DTranspose. - The only difference is that its inputs are all fake quantized. - - Examples: - .. code-block:: python - import paddle - import paddle.nn as nn - from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose - x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.) - conv = nn.Conv2DTranspose(4, 6, (3, 3)) - conv_quantized = QuantizedConv2DTranspose(conv) - y_quantized = conv_quantized(x_var) - y_var = conv(x_var) - y_quantized_np = y_quantized.numpy() - y_np = y_var.numpy() - print(y_np.shape, y_quantized_np.shape) - # (2, 6, 10, 10), (2, 6, 10, 10) - """ - - def __init__(self, - layer, - weight_bits=8, - activation_bits=8, - moving_rate=0.9, - weight_quantize_type='abs_max', - activation_quantize_type='abs_max', - weight_pre_layer=None, - act_pre_layer=None, - weight_quant_layer=None, - act_quant_layer=None): - r""" - Constructor. - - The arguments are the same as ImperativeQuantAware. - """ - super(QuantizedConv2DTranspose, self).__init__() - # For Conv2DTranspose - self._groups = getattr(layer, '_groups') - self._stride = getattr(layer, '_stride') - self._padding = getattr(layer, '_padding') - self._output_padding = getattr(layer, 'output_padding') - self._dilation = getattr(layer, '_dilation') - self._data_format = getattr(layer, '_data_format') - self.weight = getattr(layer, 'weight') - self.bias = getattr(layer, 'bias') - # For FakeQuant - self._conv2d_transpose_quant_axis = 1 - if weight_quant_layer is not None: - self._fake_quant_weight = weight_quant_layer() - else: - self._fake_quant_weight = _get_fake_quant_type( - weight_quantize_type, - name=self.weight.name, - moving_rate=moving_rate, - quant_bits=weight_bits, - dtype=self._dtype, - quant_on_weight=True, - channel_num=self.weight.shape[ - self._conv2d_transpose_quant_axis], - quant_axis=self._conv2d_transpose_quant_axis) - if act_quant_layer is not None: - self._fake_quant_input = act_quant_layer() - else: - self._fake_quant_input = _get_fake_quant_type( - activation_quantize_type, - name=layer.full_name(), - moving_rate=moving_rate, - quant_bits=activation_bits, - dtype=self._dtype, - quant_on_weight=False) - - self._act_preprocess = act_pre_layer( - ) if act_pre_layer is not None else None - self._weight_preprocess = weight_pre_layer( - ) if weight_pre_layer is not None else None - - def forward(self, input, output_size=None): - if self._act_preprocess is not None: - input = self._act_preprocess(input) - quant_input = self._fake_quant_input(input) - - weight = self.weight - if self._weight_preprocess is not None: - weight = self._weight_preprocess(self.weight) - quant_weight = self._fake_quant_weight(weight) - - if output_size is None: - output_padding = self._output_padding - else: - output_padding = 0 - - return F.conv2d_transpose( - quant_input, - quant_weight, - bias=self.bias, - padding=self._padding, - output_padding=output_padding, - stride=self._stride, - dilation=self._dilation, - groups=self._groups, - output_size=output_size, - data_format=self._data_format) - - class QuantizedLinear(layers.Layer): """ The computational logic of QuantizedLinear is the same with Linear. diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py index d8cb70c9dd107b..3ec12c11a7045a 100644 --- a/tools/sampcd_processor.py +++ b/tools/sampcd_processor.py @@ -440,7 +440,6 @@ def get_filenames(full_test=False): ''' global whl_error import paddle - import paddle.fluid.contrib.slim.quantization whl_error = [] if full_test: get_full_api_from_pr_spec() From 891d1e5d8c39e11a799fba2bc7bb828710560f0e Mon Sep 17 00:00:00 2001 From: XGZhang <46363693+XGZhang11@users.noreply.github.com> Date: Tue, 17 Aug 2021 11:07:30 +0800 Subject: [PATCH 14/16] update fake_dequant op --- paddle/fluid/operators/fake_dequantize_op.cc | 8 ++++++-- paddle/fluid/operators/fake_dequantize_op.cu | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc index c71710b79d1fbe..14ae6beb4e4a61 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cc +++ b/paddle/fluid/operators/fake_dequantize_op.cc @@ -106,8 +106,7 @@ struct ChannelDequantizeFunctor { } } } - } - else { + } else { int batch_size = in->dims()[0]; int channel = in->dims()[1]; const T* scale_one = scales[0]->data(); @@ -278,6 +277,11 @@ REGISTER_OP_CPU_KERNEL(fake_channel_wise_dequantize_max_abs, ops::FakeChannelWiseDequantizeMaxAbsKernel); REGISTER_OP_VERSION(fake_channel_wise_dequantize_max_abs) + .AddCheckpoint( + R"ROC(add new attributes [quant_axis] for applying per-channel " + "dequantization to conv2d_tranpose and mul ops.)ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "quant_axis", "The axis for dequantization.", 0)) .AddCheckpoint( R"ROC(add new attributes [x_num_col_dims] for applying per-channel " "dequantization to mul ops.)ROC", diff --git a/paddle/fluid/operators/fake_dequantize_op.cu b/paddle/fluid/operators/fake_dequantize_op.cu index b1d2e220426f73..c88a8fe196edf8 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cu +++ b/paddle/fluid/operators/fake_dequantize_op.cu @@ -119,7 +119,7 @@ struct ChannelDequantizeFunctor { int iter_size = 1; for (int i = 0; i < x_num_col_dims; i++) { iter_size *= in->dims()[i]; - } + } int channel = in->dims()[x_num_col_dims]; const T* scale_one = scales[0]->data(); const T* scale_two = scales[1]->data(); From 5f047ff988ceae6c2e48e14b06c19f23d9c668b4 Mon Sep 17 00:00:00 2001 From: XGZhang <46363693+XGZhang11@users.noreply.github.com> Date: Tue, 17 Aug 2021 12:11:33 +0000 Subject: [PATCH 15/16] register new attr in enhanced pass --- paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc | 3 +++ .../compat/fake_channel_wise_dequantize_max_abs.pbtxt | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index 354db8acf87a73..378d2231e329a3 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -115,6 +115,9 @@ QuantDequantFusePass::QuantDequantFusePass() { .AddAttr("quant_axis") .IsIntIn({0, 1}) .IsOptional() + .AddAttr("x_num_col_dims") + .IsType() + .IsOptional() .End(); AddOpCompat(OpCompat("conv2d")) .AddInput("Input") diff --git a/paddle/fluid/operators/compat/fake_channel_wise_dequantize_max_abs.pbtxt b/paddle/fluid/operators/compat/fake_channel_wise_dequantize_max_abs.pbtxt index ec80ffaaf32ae1..c32c170ce65ab0 100644 --- a/paddle/fluid/operators/compat/fake_channel_wise_dequantize_max_abs.pbtxt +++ b/paddle/fluid/operators/compat/fake_channel_wise_dequantize_max_abs.pbtxt @@ -17,4 +17,8 @@ def { name: "quant_axis" type: INT } + attrs { + name: "x_num_col_dims" + type: INT + } } From 76d7f0754a78862de3f71c9f2e784e5da5627a58 Mon Sep 17 00:00:00 2001 From: XGZhang <46363693+XGZhang11@users.noreply.github.com> Date: Tue, 17 Aug 2021 20:26:27 +0800 Subject: [PATCH 16/16] Update quant_conv2d_dequant_fuse_pass.cc --- paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index 378d2231e329a3..5958728946c2ed 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -115,6 +115,7 @@ QuantDequantFusePass::QuantDequantFusePass() { .AddAttr("quant_axis") .IsIntIn({0, 1}) .IsOptional() + .End() .AddAttr("x_num_col_dims") .IsType() .IsOptional()