From 94108314984ddbf7f1774764d142cd31610a1f35 Mon Sep 17 00:00:00 2001 From: 0x45f Date: Tue, 14 May 2024 07:30:20 +0000 Subject: [PATCH 1/5] [PIR AMP]Fix opt.minimize error when not using grad scaler --- python/paddle/amp/auto_cast.py | 27 +++++++++--------- python/paddle/static/amp/decorator.py | 2 ++ test/amp/test_pir_amp.py | 40 +++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 13 deletions(-) diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py index 0f67084da733ec..afb5a916db58ce 100644 --- a/python/paddle/amp/auto_cast.py +++ b/python/paddle/amp/auto_cast.py @@ -26,7 +26,6 @@ ) from paddle.base.wrapped_decorator import signature_safe_contextmanager from paddle.static.amp.decorator import OptimizerWithMixedPrecision -from paddle.static.amp.fp16_lists import AutoMixedPrecisionLists from .amp_lists import black_list, white_list @@ -1035,15 +1034,15 @@ def decorate( amp_lists=None, level=level, dtype=dtype, - init_loss_scaling=2.0**16, - incr_every_n_steps=2000, - decr_every_n_nan_or_inf=1, - incr_ratio=2.0, - decr_ratio=0.5, + init_loss_scaling=1.0, + incr_every_n_steps=None, + decr_every_n_nan_or_inf=None, + incr_ratio=None, + decr_ratio=None, use_dynamic_loss_scaling=False, use_amp_guard=None, use_master_grad=master_grad, - use_promote=True, + use_promote=None, ) return models, optimizers elif level == 'O2': @@ -1057,16 +1056,18 @@ def decorate( else: optimizers = OptimizerWithMixedPrecision( optimizer=optimizers, - amp_lists=AutoMixedPrecisionLists(dtype=dtype), + amp_lists=None, level=level, dtype=dtype, - init_loss_scaling=2**15, + init_loss_scaling=1.0, + incr_every_n_steps=None, + decr_every_n_nan_or_inf=None, + incr_ratio=None, + decr_ratio=None, use_dynamic_loss_scaling=False, - incr_every_n_steps=1000, - decr_every_n_nan_or_inf=2, - incr_ratio=2.0, - decr_ratio=0.8, + use_amp_guard=None, use_master_grad=master_grad, + use_promote=None, ) return models, optimizers else: diff --git 
a/python/paddle/static/amp/decorator.py b/python/paddle/static/amp/decorator.py index 877a855bcb95e7..0337ffe7b43cdd 100644 --- a/python/paddle/static/amp/decorator.py +++ b/python/paddle/static/amp/decorator.py @@ -271,6 +271,8 @@ def backward( self._train_program, startup_program ): self._init_amp_var() + if self._scaled_loss is None: + self._scaled_loss = loss params_grads = self._optimizer.backward( self._scaled_loss, startup_program, diff --git a/test/amp/test_pir_amp.py b/test/amp/test_pir_amp.py index 6f30a1e8078619..2c539913a7d850 100644 --- a/test/amp/test_pir_amp.py +++ b/test/amp/test_pir_amp.py @@ -64,6 +64,46 @@ def test_linear_amp_o1(self): np.testing.assert_equal(len(_white_list), 0) np.testing.assert_equal(len(_black_list), 0) + def test_linear_amp_o2_without_scaler(self): + if not core.is_compiled_with_cuda(): + return + with paddle.pir_utils.IrGuard(): + startup = paddle.static.Program() + main = paddle.static.Program() + with paddle.static.program_guard(main, startup): + x = paddle.static.data('x', [3, 4], 'float32') + linear = paddle.nn.Linear(4, 5) + optimizer = paddle.optimizer.Adam( + learning_rate=0.001, parameters=linear.parameters() + ) + linear, optimizer = paddle.amp.decorate( + models=linear, + optimizers=optimizer, + level='O2', + master_weight=True, + master_grad=True, + ) + + with paddle.amp.auto_cast( + level='O2', dtype='float16', use_promote=True + ): + out = linear(x) + loss = paddle.mean(out) + optimizer.minimize(loss) + cast_op_count = 0 + for op in main.global_block().ops: + if op.name() == 'pd_op.cast': + cast_op_count += 1 + np.testing.assert_equal(cast_op_count, 3) + place = paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) + exe.run(startup) + result = exe.run( + main, + feed={'x': np.random.rand(3, 4).astype('float32')}, + fetch_list=[loss], + ) + def test_linear_amp_o2(self): if not core.is_compiled_with_cuda(): return From 85d5614b49532b7729e34a44bdf222b1242cccc6 Mon Sep 17 00:00:00 2001 From: chenzhiyang 
<1792266893@qq.com> Date: Tue, 14 May 2024 11:48:43 +0000 Subject: [PATCH 2/5] fix save_load bf16 --- .../pir/transforms/pd_op_to_kernel_pass.cc | 1 + .../legacy_test/test_static_save_load_bf16.py | 140 ++++++++++++++++++ 2 files changed, 141 insertions(+) diff --git a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc index d81f0e4ed912d9..67325e6b2c3feb 100644 --- a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc +++ b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc @@ -3048,6 +3048,7 @@ void ProcessBlock( op_item = op_item_inner; op_info_parser = GetOpYamlInfoParser(op_item_inner); kernel_key.set_backend(phi::Backend::ONEDNN); + kernel_key.set_layout(phi::DataLayout::ONEDNN); } } else if (FLAGS_use_mkldnn && kernel_key.backend() == phi::Backend::CPU && !op_item->HasTrait() && diff --git a/test/legacy_test/test_static_save_load_bf16.py b/test/legacy_test/test_static_save_load_bf16.py index d898136bbde6ab..b8351c91eb6293 100644 --- a/test/legacy_test/test_static_save_load_bf16.py +++ b/test/legacy_test/test_static_save_load_bf16.py @@ -26,6 +26,8 @@ import paddle from paddle import base from paddle.base import core, framework +from paddle.framework.io_utils import is_pir_fetch_var +from paddle.pir_utils import IrGuard @unittest.skipIf( @@ -124,6 +126,7 @@ def test_ptb_rnn_cpu_bfloat16(self): # get value before save main_program = framework.default_main_program() + # print(main_program) base_map = {} for var in main_program.list_vars(): if isinstance(var, framework.Parameter) or var.persistable: @@ -162,6 +165,143 @@ def test_ptb_rnn_cpu_bfloat16(self): base_t = base_map[var.name] np.testing.assert_array_equal(new_t, base_t) + def test_ptb_rnn_cpu_bfloat16_pir(self): + with IrGuard(): + seed = 90 + hidden_size = 10 + vocab_size = 500 + num_layers = 1 + num_steps = 3 + init_scale = 0.1 + batch_size = 4 + batch_num = 100 + + with new_program_scope(): + paddle.seed(seed) + ptb_model = PtbModel( + 
"ptb_model", + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + num_steps=num_steps, + init_scale=init_scale, + ) + + place = self.set_place() + exe = base.Executor(place) + sgd = paddle.optimizer.SGD(learning_rate=1e-3) + x = paddle.static.data( + name="x", shape=[-1, num_steps], dtype='int64' + ) + y = paddle.static.data(name="y", shape=[-1, 1], dtype='float32') + init_hidden = paddle.static.data( + name="init_hidden", shape=[-1, 1], dtype='float32' + ) + init_cell = paddle.static.data( + name="init_cell", shape=[-1, 1], dtype='float32' + ) + + ptb_model, sgd = paddle.amp.decorate( + models=ptb_model, + optimizers=sgd, + level="O2", + dtype='bfloat16', + ) + + with paddle.amp.auto_cast( + enable=True, + level='O2', + dtype='bfloat16', + custom_white_list={'slice'}, + custom_black_list={'transpose2', 'concat'}, + ): + ( + static_loss, + static_last_hidden, + static_last_cell, + ) = ptb_model(x, y, init_hidden, init_cell) + # NOTE:something wrong with grad scaler, fix later + # scaler = paddle.amp.GradScaler( + # init_loss_scaling=2.0**16 + # ) + # scaled = scaler.scale(static_loss) + # scaler.minimize(sgd, scaled) + # sgd.minimize(static_loss) + exe.run(paddle.static.default_startup_program()) + + for i in range(batch_num): + x_data = np.arange(12).reshape(4, 3).astype('int64') + y_data = np.arange(1, 13).reshape(4, 3).astype('int64') + x_data = x_data.reshape((-1, num_steps, 1)) + y_data = y_data.reshape((-1, 1)) + # TODO investigate initializing model with "float32" instead of "uint16" as it was before + # slice_op PR(datatypes in model graph are different than datatypes during runtime because of that) + init_hidden_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='uint16' + ) + init_cell_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='uint16' + ) + + fetch_list = [ + static_loss, + static_last_hidden, + static_last_cell, + ] + + out = exe.run( + paddle.static.default_main_program(), + feed={ + "x": 
x_data, + "y": y_data, + "init_hidden": init_hidden_data, + "init_cell": init_cell_data, + }, + fetch_list=fetch_list, + ) + + # get value before save + main_program = paddle.static.default_main_program() + base_map = {} + for var in main_program.list_vars(): + if var.persistable and not is_pir_fetch_var(var): + t = np.array( + base.global_scope().find_var(var.name).get_tensor() + ) + # make sure all the parameter or optimizer var have been update + self.assertTrue(np.sum(np.abs(t)) != 0) + base_map[var.name] = t + save_dir = os.path.join(self.temp_dir.name, "test_1") + paddle.static.save(main_program, save_dir) + + # set var to zero + for var in main_program.list_vars(): + if var.persistable and not is_pir_fetch_var(var): + ten = ( + base.global_scope().find_var(var.name).get_tensor() + ) + ten.set(np.zeros_like(np.array(ten)), place) + + new_t = np.array( + base.global_scope().find_var(var.name).get_tensor() + ) + # make sure all the parameter or optimizer var have been set to zero + self.assertTrue(np.sum(np.abs(new_t)) == 0) + + paddle.static.load( + main_program, + os.path.join(self.temp_dir.name, "test_1.pdparams"), + exe, + ) + + for var in main_program.list_vars(): + if var.persistable and not is_pir_fetch_var(var): + new_t = np.array( + base.global_scope().find_var(var.name).get_tensor() + ) + base_t = base_map[var.name] + np.testing.assert_array_equal(new_t, base_t) + if __name__ == '__main__': paddle.enable_static() From 8816ed38da806fe84bf134370f000fb3c2aaf5f8 Mon Sep 17 00:00:00 2001 From: chenzhiyang <1792266893@qq.com> Date: Thu, 16 May 2024 07:05:04 +0000 Subject: [PATCH 3/5] fix --- test/legacy_test/test_static_save_load_bf16.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/test/legacy_test/test_static_save_load_bf16.py b/test/legacy_test/test_static_save_load_bf16.py index b8351c91eb6293..998d31e7049188 100644 --- a/test/legacy_test/test_static_save_load_bf16.py +++ b/test/legacy_test/test_static_save_load_bf16.py @@ 
-126,7 +126,6 @@ def test_ptb_rnn_cpu_bfloat16(self): # get value before save main_program = framework.default_main_program() - # print(main_program) base_map = {} for var in main_program.list_vars(): if isinstance(var, framework.Parameter) or var.persistable: @@ -214,19 +213,14 @@ def test_ptb_rnn_cpu_bfloat16_pir(self): dtype='bfloat16', custom_white_list={'slice'}, custom_black_list={'transpose2', 'concat'}, + use_promote=True, ): ( static_loss, static_last_hidden, static_last_cell, ) = ptb_model(x, y, init_hidden, init_cell) - # NOTE:something wrong with grad scaler, fix later - # scaler = paddle.amp.GradScaler( - # init_loss_scaling=2.0**16 - # ) - # scaled = scaler.scale(static_loss) - # scaler.minimize(sgd, scaled) - # sgd.minimize(static_loss) + sgd.minimize(static_loss) exe.run(paddle.static.default_startup_program()) for i in range(batch_num): From e36b77fadc3592bafb7320b25349df65e7130d49 Mon Sep 17 00:00:00 2001 From: Wang Huan Date: Fri, 17 May 2024 06:44:02 +0000 Subject: [PATCH 4/5] refine --- .../pir/transforms/pd_op_to_kernel_pass.cc | 59 +++++++++++++++++-- 1 file changed, 55 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc index 67325e6b2c3feb..53e6b31eec940c 100644 --- a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc +++ b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc @@ -511,9 +511,7 @@ static pir::Value AddOneDNN2PaddleLayoutTransferOp( } block->push_back(op); - auto new_in = op->result(0); - - return new_in; + return op->result(0); } #endif @@ -1253,6 +1251,12 @@ phi::KernelKey GetKernelKey( kernel_backend = paddle::experimental::ParseBackend(place); } +#ifdef PADDLE_WITH_DNNL + if (kernel_backend != phi::Backend::ONEDNN && + kernel_layout == phi::DataLayout::ONEDNN) { + kernel_layout = phi::DataLayout::ANY; + } +#endif phi::KernelKey res(kernel_backend, kernel_layout, kernel_dtype); // kernel backend infered incorrectly from memcpy 
op operands, @@ -1284,6 +1288,11 @@ phi::KernelKey GetKernelKey( if (NeedFallBackCpu((op), kernel_fn_str, res)) { res.set_backend(phi::Backend::CPU); +#ifdef PADDLE_WITH_DNNL + if (res.layout() == phi::DataLayout::ONEDNN) { + res.set_layout(phi::DataLayout::ANY); + } +#endif VLOG(8) << "kernel backend must be on CPU when need fallback"; } @@ -2375,6 +2384,38 @@ std::vector BuildInputs( new_in = AddOneDNN2PaddleLayoutTransferOp( new_in, phi::DataLayout::ANY, block); } + } else if (new_in_type.isa() && + new_in.defining_op()->isa<::pir::CombineOp>()) { + bool need_replace_combine_op = false; + std::vector new_vec_inputs; + std::vector types_in_vec; + for (auto& in : new_in.defining_op()->operands()) { + auto in_value = in.source(); + if (in_value.type().isa()) { + if (in_value.type() + .dyn_cast() + .data_layout() == phi::DataLayout::ONEDNN) { + need_replace_combine_op = true; + in_value = AddOneDNN2PaddleLayoutTransferOp( + in_value, phi::DataLayout::ANY, block); + } + new_vec_inputs.push_back(in_value); + types_in_vec.push_back(in_value.type()); + } + } + if (need_replace_combine_op) { + std::string combine_op_name(pir::CombineOp::name()); + pir::OpInfo op_info = ctx->GetRegisteredOpInfo(combine_op_name); + + pir::Type target_vec_type = pir::VectorType::get(ctx, types_in_vec); + pir::Operation* operation = pir::Operation::Create( + new_vec_inputs, {}, {target_vec_type}, op_info); + new_in.defining_op()->ReplaceAllUsesWith(operation->results()); + block->erase(*new_in.defining_op()); + + new_in = operation->result(0); + block->push_back(operation); + } } } #endif @@ -3052,13 +3093,23 @@ void ProcessBlock( } } else if (FLAGS_use_mkldnn && kernel_key.backend() == phi::Backend::CPU && !op_item->HasTrait() && - SupportsMKLDNN(kernel_name, phi::DataType::BFLOAT16)) { + SupportsMKLDNN(kernel_name, kernel_key.dtype())) { // Support FLAGS_use_mkldnn auto op_item_inner = PdOp2OneDNNOp(op_item, block, ctx); if (op_item_inner != op_item) { op_item = op_item_inner; 
op_info_parser = GetOpYamlInfoParser(op_item_inner); kernel_key.set_backend(phi::Backend::ONEDNN); + kernel_key.set_layout(phi::DataLayout::ONEDNN); + } + } else if (kernel_key.backend() == phi::Backend::ONEDNN && + !op_item->HasTrait()) { + auto op_item_inner = PdOp2OneDNNOp(op_item, block, ctx); + if (op_item_inner != op_item) { + op_item = op_item_inner; + op_info_parser = GetOpYamlInfoParser(op_item_inner); + kernel_key.set_backend(phi::Backend::ONEDNN); + kernel_key.set_layout(phi::DataLayout::ONEDNN); } } #endif From 6e577ff55a9cd7ba8e293a0d11adf1f0bf7ea3d3 Mon Sep 17 00:00:00 2001 From: chenzhiyang <1792266893@qq.com> Date: Fri, 17 May 2024 08:46:05 +0000 Subject: [PATCH 5/5] bf16 test solved --- test/legacy_test/test_static_save_load_bf16.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/test/legacy_test/test_static_save_load_bf16.py b/test/legacy_test/test_static_save_load_bf16.py index 998d31e7049188..fe088936f671f3 100644 --- a/test/legacy_test/test_static_save_load_bf16.py +++ b/test/legacy_test/test_static_save_load_bf16.py @@ -211,7 +211,6 @@ def test_ptb_rnn_cpu_bfloat16_pir(self): enable=True, level='O2', dtype='bfloat16', - custom_white_list={'slice'}, custom_black_list={'transpose2', 'concat'}, use_promote=True, ): @@ -228,13 +227,11 @@ def test_ptb_rnn_cpu_bfloat16_pir(self): y_data = np.arange(1, 13).reshape(4, 3).astype('int64') x_data = x_data.reshape((-1, num_steps, 1)) y_data = y_data.reshape((-1, 1)) - # TODO investigate initializing model with "float32" instead of "uint16" as it was before - # slice_op PR(datatypes in model graph are different than datatypes during runtime because of that) init_hidden_data = np.zeros( - (num_layers, batch_size, hidden_size), dtype='uint16' + (num_layers, batch_size, hidden_size), dtype='float32' ) init_cell_data = np.zeros( - (num_layers, batch_size, hidden_size), dtype='uint16' + (num_layers, batch_size, hidden_size), dtype='float32' ) fetch_list = [