From 12c19322178b876042ad35b15c754f27ae8d2c9a Mon Sep 17 00:00:00 2001 From: jerrywgz Date: Wed, 28 Jul 2021 13:28:43 +0000 Subject: [PATCH 1/2] fix lr in param group --- python/paddle/optimizer/adamw.py | 7 +++---- python/paddle/optimizer/optimizer.py | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index c3cffa2998f6cc..316bf2ba2f5813 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -184,10 +184,9 @@ def _append_decoupled_weight_decay(self, block, param_and_grad): Raises: Exception: The type of coeff and parameter is not consistent. """ - if not isinstance(param_and_grad, dict): - param, grad = param_and_grad - else: - param, grad = self._update_param_group(param_and_grad) + if isinstance(param_and_grad, dict): + param_and_grad = self._update_param_group(param_and_grad) + param, grad = param_and_grad if self._apply_decay_param_fun is not None \ and not self._apply_decay_param_fun(param.name): diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 93b618b7c9edc0..5c16c4fb5c57e6 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -205,7 +205,6 @@ def __init__(self, self._param_device_map = dict() self.clear_gradients = self.clear_grad self._default_dict = { - 'learning_rate': self._learning_rate, 'weight_decay': self.regularization, 'grad_clip': self._grad_clip } @@ -1184,7 +1183,8 @@ def _add_param_group(self, param_group): else: regularization = weight_decay param.regularizer = regularization - param.optimize_attr['learning_rate'] = param_group['learning_rate'] + param.optimize_attr['learning_rate'] = param_group.get( + 'learning_rate', 1.) self._param_groups.append(param_group) From a1b7941916a13f8c0ac240d3641e405f6866a9f3 Mon Sep 17 00:00:00 2001 From: jerrywgz Date: Wed, 28 Jul 2021 13:39:01 +0000 Subject: [PATCH 2/2] add unittest for adamw --- .../fluid/tests/unittests/test_adamw_op.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py index ce01ca042c123d..fdc3682f4b1bfb 100644 --- a/python/paddle/fluid/tests/unittests/test_adamw_op.py +++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py @@ -147,5 +147,33 @@ def test_adamw_op_dygraph(self): adam.clear_gradients() +class TestAdamWOpGroupWithLR(TestAdamWOp): + def test_adamw_op_dygraph(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear_1 = paddle.nn.Linear(13, 5) + linear_2 = paddle.nn.Linear(5, 3) + adam = paddle.optimizer.AdamW( + learning_rate=paddle.optimizer.lr.PiecewiseDecay( + boundaries=[3, 6], values=[0.1, 0.2, 0.3]), + parameters=[{ + 'params': linear_1.parameters(), + 'learning_rate': 0.1, + }, { + 'params': linear_2.parameters(), + 'weight_decay': 0.001, + }], + apply_decay_param_fun=lambda name: True, + weight_decay=0.01) + + for _ in range(2): + out = linear_1(a) + out = linear_2(out) + out.backward() + adam.step() + adam.clear_gradients() + + if __name__ == "__main__": unittest.main()