From 270699e6478d1314b4f723bc603856d54f0bf59a Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Wed, 24 Mar 2021 10:46:12 +0800 Subject: [PATCH 01/12] [ROCM] fix test_matmul_v2_op (#31802) --- paddle/fluid/operators/dot_op.h | 2 +- python/paddle/fluid/tests/unittests/test_matmul_v2_op.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h index 0b0b7f69b9d849..1b607922eda1d8 100644 --- a/paddle/fluid/operators/dot_op.h +++ b/paddle/fluid/operators/dot_op.h @@ -160,7 +160,7 @@ struct DotGradFunction> { const Tensor* tensor_dout, Tensor* tensor_dx, Tensor* tensor_dy, const paddle::framework::ExecutionContext& ctx) { -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__HIPCC__) if (1 == tensor_dout->dims().size()) { auto dout = framework::EigenVector::Flatten(*tensor_dout); diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py index 761d318d7b8a3d..efcc0e4cfe3232 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py @@ -67,7 +67,7 @@ def config(self): self.trans_y = False def init_kernel_type(self): - self.dtype = "float64" + self.dtype = "float32" if core.is_compiled_with_rocm() else "float64" def setUp(self): self.init_kernel_type() @@ -91,7 +91,10 @@ def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X', 'Y'], 'Out') + if core.is_compiled_with_rocm(): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=1e-2) + else: + self.check_grad(['X', 'Y'], 'Out') class TestMatMuklOp2(TestMatMulV2Op): From 68497e7b39a13939f1a466f56874fc5aa984878a Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 24 Mar 2021 14:26:51 +0800 Subject: [PATCH 02/12] change trainable to stop_gradient in optimizer (#31823) --- python/paddle/optimizer/adam.py | 2 +- 
python/paddle/optimizer/adamax.py | 2 +- python/paddle/optimizer/optimizer.py | 19 ++++++++++--------- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index b0c05cf8de76c2..0cafbda893dd2f 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -351,7 +351,7 @@ def step(self): """ params_grads = [] for param in self._parameter_list: - if not param.trainable: + if param.stop_gradient: continue if param._grad_ivar() is not None: grad_var = param._grad_ivar() diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py index bd65fc19c32aaf..4a6c2278a46f40 100644 --- a/python/paddle/optimizer/adamax.py +++ b/python/paddle/optimizer/adamax.py @@ -184,7 +184,7 @@ def _finish_update(self, block, parameters_and_grads): """ assert isinstance(block, framework.Block) for param, grad in parameters_and_grads: - if grad is None or param.trainable is False: + if grad is None or param.stop_gradient is True: continue with param.block.program._optimized_guard( [param, grad]), name_scope('adamax'): diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 212dad7c77cb4f..b37d1726064113 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -542,7 +542,7 @@ def _get_accumulator(self, name, param): def _update_param_device_map(self, parameters_and_grads, target_block): for param_and_grad in parameters_and_grads: - if param_and_grad[0].trainable is True: + if param_and_grad[0].stop_gradient is False: param_name = param_and_grad[0].name ops = target_block.ops device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName( @@ -598,14 +598,14 @@ def _create_optimization_pass(self, parameters_and_grads): self._update_param_device_map(parameters_and_grads, target_block) self._create_accumulators( target_block, - [p[0] for p in parameters_and_grads if p[0].trainable]) + [p[0] for 
p in parameters_and_grads if not p[0].stop_gradient]) self._create_global_learning_rate() if framework.in_dygraph_mode(): for param_and_grad in parameters_and_grads: if param_and_grad[1] is None: continue - if param_and_grad[0].trainable is True: + if param_and_grad[0].stop_gradient is False: self._append_optimize_op(target_block, param_and_grad) else: for param_and_grad in parameters_and_grads: @@ -613,7 +613,7 @@ def _create_optimization_pass(self, parameters_and_grads): continue with param_and_grad[0].block.program._optimized_guard( param_and_grad), name_scope("optimizer"): - if param_and_grad[0].trainable is True: + if param_and_grad[0].stop_gradient is False: device = self._get_device_for_param(param_and_grad[0] .name) with device_guard(device): @@ -689,7 +689,7 @@ def backward(self, params_grads = [] for param in parameter_list: - if not param.trainable: + if param.stop_gradient: continue if param._grad_ivar() is not None: # create gradient tensor @@ -789,8 +789,9 @@ def _apply_optimize(self, loss, startup_program, params_grads): def _get_no_grad_set(self, loss, no_grad_set=None): no_grad_set = _get_no_grad_set_name(no_grad_set) parameters = loss.block.program.global_block().all_parameters() - param_no_trainable = set( - [param.name for param in parameters if param.trainable is False]) + param_no_trainable = set([ + param.name for param in parameters if param.stop_gradient is True + ]) # If the parameter is no trainable, it should not have a gradient. 
no_grad_set.update(param_no_trainable) @@ -825,7 +826,7 @@ def clear_grad(self): """ for p in self._parameter_list: - if p.trainable: + if not p.stop_gradient: p.clear_gradient() @imperative_base.no_grad @@ -920,7 +921,7 @@ def step(self): """ params_grads = [] for param in self._parameter_list: - if not param.trainable: + if param.stop_gradient: continue if param._grad_ivar() is not None: grad_var = param._grad_ivar() From 84a551380efa7feffc496112a1b746ab7d0617d1 Mon Sep 17 00:00:00 2001 From: cc <52520497+juncaipeng@users.noreply.github.com> Date: Wed, 24 Mar 2021 14:40:14 +0800 Subject: [PATCH 03/12] [dygraph qat] Refine saving output scale to infer program (#31784) * Refine saving output scale to infer program --- .../slim/quantization/imperative/qat.py | 229 ++++++++++-------- .../slim/quantization/imperative/utils.py | 34 ++- .../slim/tests/test_imperative_out_scale.py | 23 +- 3 files changed, 166 insertions(+), 120 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index 68b4cfdc661b4a..ea2e8e073b5084 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -251,8 +251,8 @@ def __init__(self, super(ImperativeQuantizeInputs, self).__init__() self._quantizable_layer_type = tuple( - utils.supported_quant_layers_map[layer] - if layer in utils.supported_quant_layers_map else layer + utils.quant_input_layers_map[layer] + if layer in utils.quant_input_layers_map else layer for layer in quantizable_layer_type) for layer in self._quantizable_layer_type: assert not isinstance(layer, str), \ @@ -324,12 +324,11 @@ def apply(self, model): target = name[last_idx:idx] quant_layer = self._get_quantized_layer(layer) - setattr(quant_layer, "layer_name", layer.full_name()) setattr(obj, target, quant_layer) def _get_quantized_layer(self, layer): quant_layer_name = None - for key, 
value in utils.supported_quant_layers_map.items(): + for key, value in utils.quant_input_layers_map.items(): if isinstance(layer, value): quant_layer_name = 'Quantized' + key break @@ -372,6 +371,9 @@ def apply(self, model): """ assert isinstance(model, dygraph.Layer), \ "The model must be the instance of dygraph.Layer." + + # Calculate the target ops's output scale, and don't consider + # the skip_quant attr for _, layer in model.named_sublayers(): if self._is_target_layer(layer): self._init_scale_params(layer) @@ -411,24 +413,21 @@ def save_quantized_model(self, layer, path, input_spec=None, **config): assert isinstance(layer, dygraph.Layer), \ "The model must be the instance of dygraph.Layer." - # remove handles and collect output scales + self._gather_output_scale(layer) + with dygraph.guard(): layer.eval() for handle in self._register_hook_handle_list: handle.remove() - for _, sub_layer in layer.named_sublayers(): - if self._is_target_layer(sub_layer): - if hasattr(sub_layer, "layer_name"): - layer_name = sub_layer.layer_name - else: - layer_name = sub_layer.full_name() - if hasattr(sub_layer, "_quant_out_scale"): - self._out_scale_dict[layer_name] = float( - sub_layer._quant_out_scale) - - # save the quantized model that doesn't have output scales paddle.jit.save(layer=layer, path=path, input_spec=input_spec, **config) + if len(self._out_scale_dict) == 0: + warnings.warn("Warning: No Layer of the model while to be " \ + "saved contains the out_threshold attribute, so the " \ + "generated inference model would not contain the " \ + "out_threshold.") + return + # load static model is_dynamic_mode = False if paddle.in_dynamic_mode(): @@ -443,79 +442,26 @@ def save_quantized_model(self, layer, path, input_spec=None, **config): basename = os.path.basename(path) model_filename = basename + INFER_MODEL_SUFFIX params_filename = basename + INFER_PARAMS_SUFFIX - [inference_program, feed_target_names, fetch_targets] = ( + + [infer_program, feed_target_names, 
fetch_targets] = ( load_inference_model( dirname=dirname, executor=exe, model_filename=model_filename, params_filename=params_filename)) + # TODO(jc): analyse whether the dygraph model has + # several blocks before applying qat + assert infer_program.num_blocks == 1, \ + "Quantization aware training (QAT) requires the program " \ + "only has a block for now. When the model has if-else or " \ + "while, the program will have several blocks." + # set output scales to the static model - check_behind_op = False - op_count = 0 - ops_list = [key for key, _ in self._out_scale_dict.items()] - if len(ops_list) == 0: - warnings.warn( - "Warning: No Layer of the model while to be saved contains " - "the out_threshold attribute, so the generated inference " - "model would not contain the out_threshold.") - else: - # Because the Layer in dygraph may correspond to multiple ops - # in static program after being saved. To ensure correctness, - # the outscale collected for output of dygraph Layer can only - # be set to the last op in the corresponding ops in static program. 
- # - # We can judge the execution order of the ops which corresponding - # to dygraph Layer by check_behind_op - forward_op = None - for block in inference_program.blocks: - for op in block.ops: - if op.type in utils.op_real_in_out_name: - if op_count > len(ops_list): - warnings.warn( - "The number of Layer which has " - "out_threshold attribute should be bigger than " - "the op in inference model") - break - if check_behind_op: - check_behind_op = False - if op.type == "elementwise_add": - if self._is_op_matched(ops_list[op_count], op, - block): - op._set_attr("out_threshold", - self._out_scale_dict[ops_list[ - op_count]]) - op_count += 1 - forward_op = None - continue - else: - if forward_op is None: - raise ValueError( - "forward_op should not be None") - if self._is_op_matched(ops_list[op_count], - forward_op, block): - forward_op._set_attr( - "out_threshold", self._out_scale_dict[ - ops_list[op_count]]) - op_count += 1 - forward_op = None - - if op.type in ["conv2d", "depthwise_conv2d", "matmul"]: - check_behind_op = True - forward_op = op - continue - if op_count >= len(ops_list): - warnings.warn( - "The number of Layer which has out_threshold attribute should be bigger than the op in inference model" - ) - break - if self._is_op_matched(ops_list[op_count], op, block): - op._set_attr( - "out_threshold", - self._out_scale_dict[ops_list[op_count]]) - op_count += 1 - - self._set_skip_quant_attr(inference_program) + self._save_output_scale(infer_program) + + # process skip quant + self._set_skip_quant_attr(infer_program) # save the final quantized model that has output scales save_inference_model( @@ -523,16 +469,75 @@ def save_quantized_model(self, layer, path, input_spec=None, **config): feeded_var_names=feed_target_names, target_vars=fetch_targets, executor=exe, - main_program=inference_program.clone(), + main_program=infer_program.clone(), model_filename=model_filename, params_filename=params_filename) if is_dynamic_mode: paddle.disable_static() + def 
_gather_output_scale(self, layer): + """ + Gather all output scales to self._out_scale_dict + """ + with dygraph.guard(): + layer.eval() + for _, sub_layer in layer.named_sublayers(): + if self._is_target_layer(sub_layer): + layer_name = sub_layer.full_name() + if hasattr(sub_layer, "_quant_out_scale"): + self._out_scale_dict[layer_name] = float( + sub_layer._quant_out_scale) + + def _save_output_scale(self, infer_program): + """ + Save all output scales to the corresponding ops in static + inference program. + + Because the Layer in dygraph may correspond to multiple ops + in static program after being saved. To ensure correctness, + the outscale collected for output of dygraph Layer can only + be set to the last op in the corresponding ops in static program. + """ + assert infer_program.num_blocks == 1, \ + "The inference program should only have a block." + + global_block = infer_program.global_block() + target_ops = global_block.ops + + scale_idx = 0 + op_idx = 0 + attr_name = "out_threshold" + + for scale_name, scale_value in self._out_scale_dict.items(): + while True: + if op_idx >= len(target_ops): + break + + op = target_ops[op_idx] + if not self._is_scale_op_matched(scale_name, op, global_block): + op_idx += 1 + else: + if op.type in utils.weight_op_types \ + and op_idx + 1 < len(target_ops) \ + and target_ops[op_idx+1].type == "elementwise_add": + target_ops[op_idx + 1]._set_attr(attr_name, scale_value) + op_idx += 2 + else: + op._set_attr(attr_name, scale_value) + op_idx += 1 + scale_idx += 1 + break + + if scale_idx != len(self._out_scale_dict): + _logger.warning("Warning: the model have %s output scales, "\ + "but it only saves %s output scales." 
\ + % (len(self._out_scale_dict), scale_idx)) + def _is_target_layer(self, layer): - return isinstance(layer, utils.out_scale_layers_list) \ - or 'quantized_' in layer.full_name() + return isinstance(layer, tuple(utils.quant_output_layers_map.values())) \ + or ('quantized_' in layer.full_name() and \ + 'quantized_noweight' not in layer.full_name()) def _init_scale_params(self, layer, name=None): """ @@ -570,27 +575,39 @@ def _create_param(in_layer, first_name, last_name, dtype): layer._quant_out_accum = _create_param(layer, name, "accum", dtype) layer._quant_out_accum.stop_gradient = True - # Judge whether the op in program matches the Layer in dynamic model - def _is_op_matched(self, layer_name, op, block): - output_var_names = quantization_pass._get_op_output_var_names(op) - for output_var_name in output_var_names: - output_var_tensor = block.var(output_var_name) - if output_var_tensor.dtype not in [ - core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32 - ]: - return False - - # Because the naming styles of static and dynamic graph are different, - # in order to avoid mistakes, we unify the name here. - op_type = output_var_names[0].split(".")[0] - op_type = op_type.rsplit("_", 1)[0] - if op_type == 'depthwise_conv2d': - op_type = 'conv2d' - if 'prelu' in op_type: - op_type = op_type.replace('prelu', 'p_re_lu') - if 'relu' in op_type: - op_type = op_type.replace('relu', 're_lu') - return op_type in layer_name + def _is_scale_op_matched(self, scale_name, op, block): + """ + Based on the op name and attrs to judge whether the op in + program matches the scale_name. We must know the corresponding + name between dgraph and static model. 
+ """ + fp_type = [core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32] + if op.type in quantization_pass._op_real_in_out_name.keys(): + output_var_names = quantization_pass._get_op_output_var_names(op) + for output_var_name in output_var_names: + output_var_tensor = block.var(output_var_name) + if output_var_tensor.dtype not in fp_type: + return False + + # corresponding_map: [name, op_types, function] + # Note that, the items have priority in corresponding_map + corresponding_map = [ + ['conv2d_tranpose', ['conv2d_transpose', \ + 'depthwise_conv2d_transpose'], None], + ['conv2d', ['conv2d', 'depthwise_conv2d'], None], + ['linear', ['matmul'], None], + ['re_lu6', ['relu6'], None], + ['p_re_lu', ['prelu'], None], + ['leaky_re_lu', ['leaky_relu'], None], + ['re_lu', ['relu'], None], + ] + + for item in corresponding_map: + if item[0] in scale_name: + return (op.type in item[1]) and \ + (len(item) == 2 or item[2] is None or item[2](op)) + + return op.type in scale_name def _set_skip_quant_attr(self, program): block = program.global_block() diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py index 3bf655265c6f22..090f6cda389af2 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py @@ -30,7 +30,7 @@ "swish": [["X"], ["Out"]], } -supported_quant_layers_map = { +quant_input_layers_map = { 'Conv2D': paddle.nn.Conv2D, 'Linear': paddle.nn.Linear, 'AdaptiveAvgPool2D': paddle.nn.AdaptiveAvgPool2D, @@ -58,8 +58,30 @@ "fake_quantize_dequantize_moving_average_abs_max" ] -out_scale_layers_list = ( - paddle.nn.Conv2D, paddle.nn.Linear, paddle.nn.MaxPool2D, - paddle.nn.BatchNorm, paddle.nn.BatchNorm2D, paddle.nn.SyncBatchNorm, - paddle.nn.LeakyReLU, paddle.nn.PReLU, paddle.nn.ReLU, paddle.nn.ReLU6, - paddle.nn.Sigmoid, paddle.nn.Softmax, paddle.nn.Tanh, paddle.nn.Swish) 
+quant_output_layers_map = { + 'Conv2D': paddle.nn.Conv2D, + 'Conv2DTranspose': paddle.nn.Conv2DTranspose, + 'Linear': paddle.nn.Linear, + 'AdaptiveAvgPool2D': paddle.nn.AdaptiveAvgPool2D, + 'AdaptiveMaxPool2D': paddle.nn.AdaptiveMaxPool2D, + 'AvgPool2D': paddle.nn.AvgPool2D, + 'MaxPool2D': paddle.nn.MaxPool2D, + 'BatchNorm': paddle.nn.BatchNorm, + 'BatchNorm2D': paddle.nn.BatchNorm2D, + 'SyncBatchNorm': paddle.nn.SyncBatchNorm, + 'ELU': paddle.nn.ELU, + 'GELU': paddle.nn.GELU, + 'LeakyReLU': paddle.nn.LeakyReLU, + 'PReLU': paddle.nn.PReLU, + 'ReLU': paddle.nn.ReLU, + 'ReLU6': paddle.nn.ReLU6, + 'Sigmoid': paddle.nn.Sigmoid, + 'Softmax': paddle.nn.Softmax, + 'Tanh': paddle.nn.Tanh, + 'Swish': paddle.nn.Swish, +} + +weight_op_types = [ + "conv2d", "depthwise_conv2d", "matmul", "conv2d_transpose", + "depthwise_conv2d_transpose" +] diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py index ed29375d22bb9f..600174e503feb2 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py @@ -33,7 +33,6 @@ from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.nn.layer import ReLU, LeakyReLU, Sigmoid, Softmax, PReLU from paddle.nn import Linear, Conv2D, Softmax, BatchNorm2D, MaxPool2D -from paddle.fluid.dygraph.nn import Pool2D from paddle.fluid.log_helper import get_logger from paddle.fluid.dygraph import nn @@ -131,8 +130,8 @@ def __init__(self, num_classes=10): bias_attr=False), BatchNorm2D(6), ReLU(), - Pool2D( - pool_size=2, pool_type='max', pool_stride=2), + MaxPool2D( + kernel_size=2, stride=2), Conv2D( in_channels=6, out_channels=16, @@ -357,7 +356,6 @@ def _build_static_lenet(main, startup, is_test=False, seed=1000): "diff({}) at {}, dynamic loss = {}, static loss = {}". 
format(diff, i, loss_d, loss_s)) break - self.assertTrue( np.allclose( np.array(dynamic_loss_rec), @@ -398,10 +396,15 @@ def _build_static_lenet(main, startup, is_test=False, seed=1000): if dynamic_ops[i].has_attr("out_threshold"): op_count += 1 self.assertTrue(dynamic_ops[i].type == static_ops[i].type) + if dynamic_ops[i].attr("out_threshold") != static_ops[i].attr( + "out_threshold"): + _logger.info(dynamic_ops[i].attr("out_threshold")) + _logger.info(static_ops[i].attr("out_threshold")) self.assertTrue(dynamic_ops[i].attr("out_threshold") == static_ops[i].attr("out_threshold")) - self.assertTrue(op_count == 13) + _logger.info("op_cout: {}".format(op_count)) + self.assertTrue(op_count == 14) class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase): @@ -470,7 +473,9 @@ def test_save_quantized_model(self): self.assertTrue(dynamic_ops[i].type == static_ops[i].type) self.assertTrue(dynamic_ops[i].attr("out_threshold") == static_ops[i].attr("out_threshold")) - self.assertTrue(op_count == 13) + + _logger.info("op_cout: {}".format(op_count)) + self.assertTrue(op_count == 14) class TestSaveQuantizedModel_Warning(unittest.TestCase): @@ -490,8 +495,10 @@ def test_warning(self): shape=[None, 1, 28, 28], dtype='float32') ]) - warning_message = "Warning: No Layer of the model while to be saved contains the out_threshold attribute, " \ - "so the generated inference model would not contain the out_threshold." + warning_message = "Warning: No Layer of the model while to be " \ + "saved contains the out_threshold attribute, so the " \ + "generated inference model would not contain the " \ + "out_threshold." 
num = get_vaild_warning_num(warning_message, w) assert num == 1 From f2cfc0f46d8b47f743320b8037d6f309a097d294 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Wed, 24 Mar 2021 15:24:46 +0800 Subject: [PATCH 04/12] [CustomOp]Avoid raising warning while import paddle (#31804) --- python/paddle/utils/cpp_extension/cpp_extension.py | 6 +++--- python/paddle/utils/cpp_extension/extension_utils.py | 6 ------ 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/python/paddle/utils/cpp_extension/cpp_extension.py b/python/paddle/utils/cpp_extension/cpp_extension.py index d84ae67fff8d67..ea4c85e20db764 100644 --- a/python/paddle/utils/cpp_extension/cpp_extension.py +++ b/python/paddle/utils/cpp_extension/cpp_extension.py @@ -400,14 +400,14 @@ def unix_custom_single_compiler(obj, src, ext, cc_args, extra_postargs, # ncvv compile CUDA source if is_cuda_file(src): if core.is_compiled_with_rocm(): - assert ROCM_HOME is not None + assert ROCM_HOME is not None, "Not found ROCM runtime, please use `export ROCM_PATH= XXX` to specific it." hipcc_cmd = os.path.join(ROCM_HOME, 'bin', 'hipcc') self.compiler.set_executable('compiler_so', hipcc_cmd) # {'nvcc': {}, 'cxx: {}} if isinstance(cflags, dict): cflags = cflags['hipcc'] else: - assert CUDA_HOME is not None + assert CUDA_HOME is not None, "Not found CUDA runtime, please use `export CUDA_HOME= XXX` to specific it." nvcc_cmd = os.path.join(CUDA_HOME, 'bin', 'nvcc') self.compiler.set_executable('compiler_so', nvcc_cmd) # {'nvcc': {}, 'cxx: {}} @@ -470,7 +470,7 @@ def win_custom_spawn(cmd): src = src_list[0] obj = obj_list[0] if is_cuda_file(src): - assert CUDA_HOME is not None + assert CUDA_HOME is not None, "Not found CUDA runtime, please use `export CUDA_HOME= XXX` to specific it." 
nvcc_cmd = os.path.join(CUDA_HOME, 'bin', 'nvcc') if isinstance(self.cflags, dict): cflags = self.cflags['nvcc'] diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index 1ff42a7bcbc0d1..7d6bcc4d564c93 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -461,9 +461,6 @@ def find_cuda_home(): if cuda_home and not os.path.exists( cuda_home) and core.is_compiled_with_cuda(): cuda_home = None - warnings.warn( - "Not found CUDA runtime, please use `export CUDA_HOME= XXX` to specific it." - ) return cuda_home @@ -494,9 +491,6 @@ def find_rocm_home(): if rocm_home and not os.path.exists( rocm_home) and core.is_compiled_with_rocm(): rocm_home = None - warnings.warn( - "Not found ROCM runtime, please use `export ROCM_PATH= XXX` to specific it." - ) return rocm_home From e5f7a834d4200ad9d7e8b748d2d96fc7faeb0e63 Mon Sep 17 00:00:00 2001 From: Wojciech Uss Date: Wed, 24 Mar 2021 08:41:47 +0100 Subject: [PATCH 05/12] fix cache key in concat oneDNN kernel (#31820) * fix cache key in concat oneDNN kernel * key simplified --- .../operators/mkldnn/concat_mkldnn_op.cc | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index 4beb7ad017851b..df1b5af121da93 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -71,6 +71,15 @@ static const std::vector ReduceMultiInput( return reduced; } +static const std::vector GetDimsForKey( + const std::vector& inputs) { + auto dims_key = paddle::framework::vectorize(inputs[0]->dims()); + for (auto it = std::next(inputs.begin()); it != inputs.end(); ++it) { + dims_key.push_back((*it)->dims()[0]); + } + return dims_key; +} + template class ConcatPrimitiveFactory { public: @@ -134,6 +143,8 @@ template class 
ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { + // If any of the multiple inputs of concat has an input size of 0, the + // actual size of the multi_input will change auto multi_input = ReduceMultiInput(ctx.MultiInput("X")); EnforceLayouts(multi_input); Tensor* output = ctx.Output("Out"); @@ -156,12 +167,9 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { paddle::framework::ToMKLDNNDataType(multi_input[0]->type()); ConcatPrimitiveFactory prim_creator; - // If one of the multiple inputs of concat has an input size of 0, the - // actual size of the multi_input will change - std::string key = platform::CreateKey( - dev_ctx, paddle::framework::vectorize(multi_input[0]->dims()), - multi_input.size(), ctx.OutputName("Out"), dt, - platform::ThreadIDasStr()); + std::string key = + platform::CreateKey(dev_ctx, GetDimsForKey(multi_input), + multi_input.size(), ctx.OutputName("Out"), dt); key = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key); const std::string key_prim = key + "@concat_p"; From 649868ffb262bdba89741eca93e7c7cb8632b9e2 Mon Sep 17 00:00:00 2001 From: Huihuang Zheng Date: Wed, 24 Mar 2021 16:37:16 +0800 Subject: [PATCH 06/12] [Dy2stat] Fix the bug that loop_body_func may return single element (#31806) Our old `loop_body` function may return single element when `loop_vars` just contains only 1 element, which can cause bug. The key point of this PR is forcing `loop_body` functions always return tuple. 
--- .../dygraph_to_static/loop_transformer.py | 2 +- .../fluid/dygraph/dygraph_to_static/utils.py | 12 ++++++-- .../dygraph_to_static/test_for_enumerate.py | 29 +++++++++++++++++-- 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py index b7ef000938a151..bd89a79c805c98 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py @@ -594,7 +594,7 @@ def get_for_stmt_nodes(self, node): # append return values for loop body body_stmts.append( gast.Return(value=generate_name_node( - loop_var_names, ctx=gast.Load()))) + loop_var_names, ctx=gast.Load(), gen_tuple_if_single=True))) body_func_node = gast.FunctionDef( name=unique_name.generate(FOR_BODY_PREFIX), args=gast.arguments( diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py index 1071fc1350bfeb..624ca085ac6c2d 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py @@ -381,9 +381,15 @@ def get_attribute_full_name(node): return astor.to_source(gast.gast_to_ast(node)).strip() -def generate_name_node(name_ids, ctx=gast.Load()): +def generate_name_node(name_ids, ctx=gast.Load(), gen_tuple_if_single=False): """ - Generate list or gast.Tuple of ast.Name for Return statement. + If name_ids is list or tuple or set with multiple strings, this function + generates gast.Tuple of gast.Name. + If the name_ids is single string or contains only 1 string, this function + returns gast.Name if gen_tuple_if_single==False else returns gast.Tuple + with only one gast.Name + + This function is used at several gast.Return statements. 
""" if isinstance(name_ids, six.string_types): name_ids = [name_ids] @@ -395,7 +401,7 @@ def generate_name_node(name_ids, ctx=gast.Load()): id=name_id, ctx=ctx, annotation=None, type_comment=None) for name_id in name_ids ] - if len(gast_names) == 1: + if len(gast_names) == 1 and not gen_tuple_if_single: name_node = gast_names[0] else: name_node = gast.Tuple(elts=gast_names, ctx=ctx) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py index c28997c5c1c673..517cff39a276f4 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py @@ -233,6 +233,7 @@ def for_iter_var_idx(x_array): return z +# 17. for a,b,c in z: (a, b, c) is a tuple @paddle.jit.to_static def for_tuple_as_iter_var(x_array): x = paddle.to_tensor(x_array) @@ -250,6 +251,7 @@ def for_tuple_as_iter_var(x_array): return a_result, b_result, c_result +# 18. for t in enumerate(collection): t is tuple of (idx, element) @paddle.jit.to_static def for_tuple_as_enumerate_iter(x_array): x = paddle.to_tensor(x_array) @@ -263,6 +265,7 @@ def for_tuple_as_enumerate_iter(x_array): return a_result +# 19. for i, (a, b, c, d, e) in enumerate(collection): (a, b, c, d, e) is a tuple @paddle.jit.to_static def for_tuple_as_enumerate_value(x_array): x = paddle.to_tensor(x_array) @@ -284,6 +287,23 @@ def for_tuple_as_enumerate_value(x_array): return a_result +# 20. 
test for function in a class +class ForwardContainsForLayer(paddle.nn.Layer): + def __init__(self): + super(ForwardContainsForLayer, self).__init__() + self.high = 5 + self.low = 3 + + @paddle.jit.to_static + def forward(self, x): + # just for test case, x is useless in this method + y = paddle.zeros([10, 2, 3]) + z = [] + for i in range(self.high - self.low): + z.append(y[i].clone()) + return z + + class TestTransformBase(unittest.TestCase): def setUp(self): self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda( @@ -313,11 +333,11 @@ def get_static_output(self): class TestTransform(TestTransformBase): def transformed_result_compare(self): dy_outs = self.get_dygraph_output() - if not isinstance(dy_outs, tuple): + if not isinstance(dy_outs, (tuple, list)): dy_outs = (dy_outs, ) st_outs = self.get_static_output() - if not isinstance(st_outs, tuple): + if not isinstance(st_outs, (tuple, list)): st_outs = (st_outs, ) for x, y in zip(dy_outs, st_outs): @@ -446,5 +466,10 @@ def set_test_func(self): self.dygraph_func = for_tuple_as_enumerate_value +class TestForwardContainsForLayer(TestForIterVarNumpy): + def set_test_func(self): + self.dygraph_func = ForwardContainsForLayer() + + if __name__ == '__main__': unittest.main() From 5d89ec36dc36c3b09a3972db326a2d41c4a330a5 Mon Sep 17 00:00:00 2001 From: parap1uie-s Date: Wed, 24 Mar 2021 17:25:00 +0800 Subject: [PATCH 07/12] Update pooling.py (#31829) Fix default argument of nn.MaxPool3D() --- python/paddle/nn/layer/pooling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index 0f3c4449a3f20d..5830af3a182d4f 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -589,8 +589,8 @@ class MaxPool3D(layers.Layer): def __init__(self, kernel_size, - stride, - padding, + stride=None, + padding=0, return_mask=False, ceil_mode=False, data_format="NCDHW", From e7f28d6c0db54eb9c9a810612300b526687e56a6 
Mon Sep 17 00:00:00 2001 From: winter-wang <78149749+winter-wang@users.noreply.github.com> Date: Wed, 24 Mar 2021 18:19:51 +0800 Subject: [PATCH 08/12] fix runtime crash when rnn model inference, test=develop (#31833) --- .../analysis/passes/memory_optimize_pass.cc | 1 + paddle/fluid/operators/recurrent_op.cc | 25 +++++++++---------- python/paddle/nn/functional/norm.py | 9 ++++--- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc index 5e6960c4c7e8c0..fdfd2c60af0c16 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -103,6 +103,7 @@ void MemoryOptimizePass::CollectVarMemorySize( "merge_lod_tensor", "equal", "sequence_pool", + "recurrent", "lod_reset"}; for (auto* tmp : node->inputs) { CHECK(tmp->IsOp()); diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index 9766008963be00..92e5e4a0cd120f 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -210,9 +210,10 @@ void RecurrentOp::RunImpl(const framework::Scope &scope, auto *block = Attr(kStepBlock); auto *program = block->Program(); - auto ctx = executor.Prepare( - *program, block->ID(), Attr>( - kSkipEagerDeletionVars) /*skip_ref_cnt_vars*/); + auto ctx = executor.Prepare(*program, block->ID(), + Attr>( + kSkipEagerDeletionVars), /*skip_ref_cnt_vars*/ + true); static std::mutex mutex; std::lock_guard lock(mutex); @@ -255,16 +256,6 @@ void RecurrentOp::RunImpl(const framework::Scope &scope, // Link inside::output -> outside::output // outside::output[seq_offset: seq_offset + 1] = inside::output executor.CreateVariables(ctx->prog_, &cur_scope, ctx->block_id_); - if (i > 0) { - LinkTensorWithCallback(scope, Outputs(kOutputs), cur_scope, - Outputs(kOutputs), - [&](const framework::LoDTensor 
&src_tensor, - framework::LoDTensor *dst_tensor) { - framework::Tensor src_slice = - src_tensor.Slice(seq_offset, seq_offset + 1); - dst_tensor->ShareDataWith(src_slice); - }); - } // Linked now, execute! executor.RunPreparedContext(ctx.get(), &cur_scope, @@ -284,6 +275,14 @@ void RecurrentOp::RunImpl(const framework::Scope &scope, // early. framework::TensorCopy(src_tensor, place, dev_ctx, &dst_out); }); + } else { + LinkTensorWithCallback( + cur_scope, Outputs(kOutputs), scope, Outputs(kOutputs), + [&](const framework::LoDTensor &src_tensor, + framework::LoDTensor *dst_tensor) { + auto dst_out = dst_tensor->Slice(seq_offset, seq_offset + 1); + framework::TensorCopy(src_tensor, place, dev_ctx, &dst_out); + }); } scopes.ForwardNext(); diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 03ba78e12f6376..54824233f70762 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -188,10 +188,10 @@ def batch_norm(x, if in_dygraph_mode(): # for dygraph need tuple - attrs = ("momentum", momentum, "epsilon", epsilon, "data_layout", - data_format, "use_mkldnn", False, "fuse_with_relu", False, - "use_global_stats", use_global_stats, "trainable_statistics", - trainable_statistics) + attrs = ("momentum", momentum, "epsilon", epsilon, "is_test", + not training, "data_layout", data_format, "use_mkldnn", False, + "fuse_with_relu", False, "use_global_stats", use_global_stats, + "trainable_statistics", trainable_statistics) batch_norm_out, _, _, _, _, _ = core.ops.batch_norm( x, weight, bias, running_mean, running_var, mean_out, variance_out, *attrs) @@ -205,6 +205,7 @@ def batch_norm(x, attrs = { "momentum": momentum, "epsilon": epsilon, + "is_test": not training, "data_layout": data_format, "use_mkldnn": False, "fuse_with_relu": False, From 6472d62093c49e76cfcc5fc93224a4be4b1f063b Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Thu, 25 Mar 2021 08:57:24 
+0800 Subject: [PATCH 09/12] Revert "add relu forward kernel and backward kernel (#31613)" (#31853) --- paddle/fluid/operators/activation_op.cu | 284 +----------------------- 1 file changed, 1 insertion(+), 283 deletions(-) diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 29498da0f026f5..2033081af224a4 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -10,276 +10,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/math/math_cuda_utils.h" -#include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/float16.h" -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using float16 = paddle::platform::float16; - -template -struct CudaVecType { - using type = T; - static constexpr int vecsize = 1; -}; - -template <> -struct CudaVecType { - using type = __half2; - static constexpr int vecsize = 2; -}; - -template <> -struct CudaVecType { - using type = float4; - static constexpr int vecsize = 4; -}; - -template -class BaseGPUFunctor { - public: - using ELEMENT_TYPE = T; -}; - -/* ========================================================================== */ - -/* =========================== relu forward ============================ */ -template -class ReluGPUFuctor : public BaseGPUFunctor { - private: - T zero_; - - public: - ReluGPUFuctor() { zero_ = static_cast(0.0f); } - - // for relu forward when T is double - __device__ __forceinline__ typename CudaVecType::type Compute( - const typename CudaVecType::type* x); - - // when num % vecsize != 0 this func will be used - __device__ __forceinline__ T ComputeRemainder(const T x) { - return x > zero_ ? 
x : zero_; - } -}; - -template <> -__device__ __forceinline__ CudaVecType::type -ReluGPUFuctor::Compute(const CudaVecType::type* x) { -// relu forward : out = max(x, 0) -#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300 - return __ldg(x) > zero_ ? __ldg(x) : zero_; -#else - return (*x) > zero_ ? (*x) : zero_; -#endif -} - -template <> -__device__ __forceinline__ CudaVecType::type -ReluGPUFuctor::Compute(const CudaVecType::type* xx) { - // relu forward : out = max(xx, 0) - return make_float4((xx->x > zero_) * (xx->x), (xx->y > zero_) * (xx->y), - (xx->z > zero_) * (xx->z), (xx->w > zero_) * (xx->w)); -} - -template <> -__device__ __forceinline__ CudaVecType::type -ReluGPUFuctor::Compute(const CudaVecType::type* in) { -// relu forward : out = max(in, 0) -#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300 - const half2 kzero = __float2half2_rn(0.0f); - return __hmul2(__hgt2(__ldg(in), kzero), __ldg(in)); -#else - const float2 xx = __half22float2(*in); - return __floats2half2_rn((xx.x > 0.0f) * static_cast(xx.x), - (xx.y > 0.0f) * static_cast(xx.y)); -#endif -} -/* ========================================================================== */ - -/* =========================== relu backward ============================ - */ - -template -class ReluGradGPUFunctor : public BaseGPUFunctor { - private: - T zero_; - - public: - ReluGradGPUFunctor() { zero_ = static_cast(0.0f); } - - // for relu backward when T is double - __device__ __forceinline__ typename CudaVecType::type Compute( - const typename CudaVecType::type* out, - const typename CudaVecType::type* dout); - - // when num % vecsize != 0 this func will be used - __device__ __forceinline__ T ComputeRemainder(const T out, const T dout) { - // relu backward : dx = out > 0 ? dout : 0; - return out > zero_ ? 
dout : zero_; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; - -template <> -__device__ __forceinline__ CudaVecType::type -ReluGradGPUFunctor::Compute(const CudaVecType::type* out, - const CudaVecType::type* dout) { -// relu backward : dx = out > 0 ? dout : 0; -#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300 - return __ldg(out) > zero_ ? __ldg(dout) : zero_; -#else - return (*out) > zero_ ? (*dout) : zero_; -#endif -} - -template <> -__device__ __forceinline__ CudaVecType::type -ReluGradGPUFunctor::Compute(const CudaVecType::type* out, - const CudaVecType::type* dout) { - // relu backward : dx = out > 0 ? dout : 0; - return make_float4((out->x > zero_) * (dout->x), (out->y > zero_) * (dout->y), - (out->z > zero_) * (dout->z), - (out->w > zero_) * (dout->w)); -} - -template <> -__device__ __forceinline__ CudaVecType::type -ReluGradGPUFunctor::Compute(const CudaVecType::type* out, - const CudaVecType::type* dout) { -// relu backward : dx = out > 0 ? 
dout : 0; -#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300 - const half2 kzero = __float2half2_rn(0.0f); - return __hmul2(__hgt2(__ldg(out), kzero), __ldg(dout)); -#else - const float2 xx = __half22float2(*out); - const float2 yy = __half22float2(*dout); - return __floats2half2_rn((xx.x > 0.0f) * static_cast(yy.x), - (xx.y > 0.0f) * static_cast(yy.y)); -#endif -} - -/* ========================================================================== */ - -template -__global__ void ActivationGradKernelVec(const T* forward_data, const T* dout, - T* dx, int num, Functor functor) { - using VecType = typename CudaVecType::type; - constexpr int vecsize = CudaVecType::vecsize; - int idx = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - int loop = num / vecsize; - int tail = num % vecsize; - const VecType* in_forward = reinterpret_cast(forward_data); - const VecType* in_dout = reinterpret_cast(dout); - VecType* out = reinterpret_cast(dx); - - for (int i = idx; i < loop; i += stride) { - out[i] = functor.Compute((in_forward + i), (in_dout + i)); - } - - while (idx == loop && tail) { - dx[num - tail] = - functor.ComputeRemainder(forward_data[num - tail], dout[num - tail]); - --tail; - } -} - -template -__global__ void ActivationkernelVec(const T* src, T* dst, int num, - Functor functor) { - constexpr int vecsize = CudaVecType::vecsize; - using VecType = typename CudaVecType::type; - int idx = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - int loop = num / vecsize; - int tail = num % vecsize; - const VecType* in = reinterpret_cast(src); - VecType* out = reinterpret_cast(dst); - - for (int i = idx; i < loop; i += stride) { - out[i] = functor.Compute((in + i)); - } - - while (idx == loop && tail) { - dst[num - tail] = functor.ComputeRemainder(src[num - tail]); - --tail; - } -} - -template -class ActivationGPUKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - 
void Compute(const framework::ExecutionContext& context) const override { - const framework::Tensor* in_x = nullptr; - framework::Tensor* out = nullptr; - ExtractActivationTensor(context, &in_x, &out); - auto& dev_ctx = context.template device_context(); - - int num = in_x->numel(); - const T* input_data = in_x->data(); - T* output_data = out->mutable_data(dev_ctx.GetPlace(), - static_cast(num * sizeof(T))); - - int block = 512; -#ifdef __HIPCC__ - block = 256; -#endif - Functor functor; - constexpr int vecsize = CudaVecType::vecsize; - int grid = max((num / vecsize + block - 1) / block, 1); - ActivationkernelVec<<>>(input_data, output_data, - num, functor); - } -}; - -template -class ActivationGradGPUKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& context) const override { - const framework::Tensor *x, *out, *d_out; - framework::Tensor* d_x = nullptr; - x = out = d_out = nullptr; - ExtractActivationGradTensor(context, &x, &out, &d_out, - &d_x); - int numel = d_out->numel(); - auto& dev_ctx = context.template device_context(); - auto* dx_data = d_x->mutable_data( - dev_ctx.GetPlace(), static_cast(numel * sizeof(T))); - auto* dout_data = d_out->data(); - - auto* forward_data = dout_data; - if (static_cast(Functor::FwdDeps()) == static_cast(kDepOut)) { - // Only need forward output Out - forward_data = out->data(); - } else if (static_cast(Functor::FwdDeps()) == - static_cast(kDepX)) { - // Only need forward input X - forward_data = x->data(); - } - - int block = 512; -#ifdef __HIPCC__ - block = 256; -#endif - Functor functor; - constexpr int vecsize = CudaVecType::vecsize; - int grid = max((numel / vecsize + block - 1) / block, 1); - ActivationGradKernelVec<<>>( - forward_data, dout_data, dx_data, numel, functor); - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; @@ -328,21 +60,7 @@ 
REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ /* =========================== relu register ============================ */ -REGISTER_OP_CUDA_KERNEL( - relu, ops::ActivationGPUKernel>, - ops::ActivationGPUKernel>, - ops::ActivationGPUKernel>); - -REGISTER_OP_CUDA_KERNEL( - relu_grad, ops::ActivationGradGPUKernel>, - ops::ActivationGradGPUKernel>, - ops::ActivationGradGPUKernel>); +REGISTER_ACTIVATION_CUDA_KERNEL(relu, Relu, ReluCUDAFunctor, ReluGradFunctor); REGISTER_OP_CUDA_KERNEL( relu_grad_grad, From 511e204e620f3c6e3df2018746c52c5bf2386a59 Mon Sep 17 00:00:00 2001 From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com> Date: Thu, 25 Mar 2021 11:24:01 +0800 Subject: [PATCH 10/12] LRScheduler.get_lr should not update lr in LinearWarmup (#31843) --- .../fluid/tests/unittests/test_lr_scheduler.py | 12 ++++++++++++ python/paddle/optimizer/lr.py | 5 ++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py index 8c6383cd6ef523..04a0d47e47c86b 100644 --- a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py +++ b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py @@ -537,6 +537,18 @@ def test_scheduler(self): self._test_dygraph(python_func, paddle_api, kwarg, place) paddle.enable_static() + def test_linear_warmp(self): + natural_lr = paddle.optimizer.lr.NaturalExpDecay( + learning_rate=0.5, gamma=0.1) + natural_lr_warmup = paddle.optimizer.lr.LinearWarmup( + learning_rate=natural_lr, warmup_steps=10, start_lr=0.0, end_lr=0.1) + for idx in range(30): + if idx >= 10: + self.assertEqual(natural_lr_warmup.get_lr(), + natural_lr.get_lr()) + natural_lr.step() + natural_lr_warmup.step() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index 5085911ce927a3..484b4fb7246a76 100644 --- 
a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -786,9 +786,8 @@ def get_lr(self): self.last_epoch) / float(self.warmup_steps) + self.start_lr else: if isinstance(self.learning_rate, LRScheduler): - lr_value = self.learning_rate() - self.learning_rate.step() - return lr_value + self.learning_rate.step(self.last_epoch - self.warmup_steps) + return self.learning_rate() return self.learning_rate From 27f2d8df8e48847f62e31e627ee25ac2102f27fc Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 25 Mar 2021 11:36:16 +0800 Subject: [PATCH 11/12] Polish two error messages (#31852) * polish two error messages * polish details --- paddle/fluid/operators/detection/polygon_box_transform_op.cu | 3 ++- paddle/fluid/operators/matmul_op.cc | 2 +- paddle/fluid/operators/nll_loss_op.h | 5 ++++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cu b/paddle/fluid/operators/detection/polygon_box_transform_op.cu index 337a76f9f976f8..5977a434a6023f 100644 --- a/paddle/fluid/operators/detection/polygon_box_transform_op.cu +++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cu @@ -45,7 +45,8 @@ class PolygonBoxTransformOpCUDAKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE_EQ( platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument("It must use CUDAPlace.")); + platform::errors::InvalidArgument( + "The polygon_box_transform operator needs to be executed on GPU.")); auto* in = ctx.Input("Input"); auto in_dims = in->dims(); const T* in_data = in->data(); diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 9b64e99c944725..c12aecc9ba5160 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -587,7 +587,7 @@ class MatMulOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(mat_dim_x.width_, 
mat_dim_y.height_, platform::errors::InvalidArgument( "Input X's width should be equal to the Y's height, " - "but received X's shape: [%s]," + "but received X's shape: [%s], " "Y's shape: [%s].", dim_x, dim_y)); #endif diff --git a/paddle/fluid/operators/nll_loss_op.h b/paddle/fluid/operators/nll_loss_op.h index e93d5792205900..be6f4422d4ac6a 100644 --- a/paddle/fluid/operators/nll_loss_op.h +++ b/paddle/fluid/operators/nll_loss_op.h @@ -36,7 +36,10 @@ static void nll_loss_1D(T* out_data, T* total_weight_data, const T* x_data, } PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, platform::errors::InvalidArgument( - "label should not be out of bounds.")); + "Label value is out of range. " + "Expected label value in range of [0, %d), but " + "received value is %d.", + n_classes, cur_label)); const auto cur_weight = weight_data ? weight_data[cur_label] : static_cast(1); From bf09dcb346c9aa4c20fbfaf520ab781d4f640346 Mon Sep 17 00:00:00 2001 From: Kaipeng Deng Date: Thu, 25 Mar 2021 14:08:22 +0800 Subject: [PATCH 12/12] add GPU tensor notice & update default_collate_fn/default_convert_fn. 
test=develop (#31763) --- python/paddle/fluid/dataloader/collate.py | 47 +++++++++++++++++------ python/paddle/fluid/reader.py | 6 +++ 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/python/paddle/fluid/dataloader/collate.py b/python/paddle/fluid/dataloader/collate.py index ddc010d04280c8..8e90b308b393ed 100644 --- a/python/paddle/fluid/dataloader/collate.py +++ b/python/paddle/fluid/dataloader/collate.py @@ -27,24 +27,31 @@ def default_collate_fn(batch): """ Default batch collating function for :code:`paddle.io.DataLoader`, - batch should be a list of samples, and each sample should be a list - of fields as follows: + get input data as a list of sample datas, each element in list + is the data of a sample, and sample data should be composed of list, + dictionary, string, number, numpy array and paddle.Tensor, this + function will parse input data recursively and stack number, + numpy array and paddle.Tensor datas as batch datas. e.g. for + following input data: + + [{'image': np.array(shape=[3, 224, 224]), 'label': 1}, + {'image': np.array(shape=[3, 224, 224]), 'label': 3}, + {'image': np.array(shape=[3, 224, 224]), 'label': 4}, + {'image': np.array(shape=[3, 224, 224]), 'label': 5},] - [[filed1, filed2, ...], [filed1, filed2, ...], ...] - This default collate function zipped each filed together and stack - each filed as the batch field as follows: + This default collate function zipped each number and numpy array + field together and stack each field as the batch field as follows: + + {'image': np.array(shape=[4, 3, 224, 224]), 'label': np.array([1, 3, 4, 5])} - [batch_filed1, batch_filed2, ...] Args: - batch(list of list of numpy array|paddle.Tensor): the batch data, each fields - should be a numpy array, each sample should be a list of - fileds, and batch should be a list of sample. + batch(list of sample data): batch should be a list of sample data.
Returns: - a list of numpy array|Paddle.Tensor: collated batch of input batch data, - fields data type as same as fields in each sample. + Batched data: batched each number, numpy array and paddle.Tensor + in input data. """ sample = batch[0] if isinstance(sample, np.ndarray): @@ -75,6 +82,24 @@ def default_collate_fn(batch): def default_convert_fn(batch): + """ + Default batch converting function for :code:`paddle.io.DataLoader`. + get input data as a list of sample datas, each element in list + is the data of a sample, and sample data should be composed of list, + dictionary, string, number, numpy array and paddle.Tensor. + + .. note:: + This function is default :attr:`collate_fn` in **Disable + automatic batching** mode, for **Disable automatic batching** + mode, please see :attr:`paddle.io.DataLoader` + + Args: + batch(list of sample data): batch should be a list of sample data. + + Returns: + Batched data: batched each number, numpy array and paddle.Tensor + in input data. + """ if isinstance(batch, (paddle.Tensor, np.ndarray)): return batch elif isinstance(batch, (str, bytes)): diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index be196b73edd698..9f2b2127aa7043 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -165,6 +165,12 @@ class DataLoader(object): For :code:`batch_sampler` please see :code:`paddle.io.BatchSampler` + .. note:: + GPU tensor operation is not supported in subprocess currently, + please don't use GPU tensor operations in pipeline which will + be performed in subprocess, such as dataset transforms, collate_fn, + etc. Numpy array and CPU tensor operation is supported. + **Disable automatic batching** In certain cases such as some NLP tasks, instead of automatic batching,