 from .initializer import Constant
 from .layer_helper import LayerHelper
 from .layers import ops
-from .regularizer import append_regularization_ops
 from .dygraph import base as imperative_base
 from .dygraph import no_grad
 from .dygraph.learning_rate_scheduler import LearningRateDecay, _LearningRateEpochDecay
@@ -884,6 +883,93 @@ def backward(self,
                                            act_no_grad_set, callbacks)
         return params_grads
 
+    def _create_regularization_of_grad(self, param, grad, regularization=None):
+        """ Create and add backward regularization Operators
+
+        Function helper of append_regularization_ops.
+        """
+        # If no gradient or no regularization is specified, then we don't need to do anything
+        if grad is None or ((not hasattr(param, 'regularizer') or
+                             (hasattr(param, 'regularizer') and
+                              param.regularizer is None)) and
+                            regularization is None):
+            return grad
+        regularization_term = None
+        if hasattr(param, 'regularizer') and param.regularizer is not None:
+            # Add variable for regularization term in grad block
+            regularization_term = param.regularizer(param, grad, grad.block)
+        elif regularization is not None:
+            regularization_term = regularization(param, grad, grad.block)
+
+        assert regularization_term is not None
+
+        new_grad = grad
+        if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
+            # FIXME(zcd): If the grad is SELECTED_ROWS, after regularization,
+            # the grad's type and name will be changed. But the gradient's name
+            # is used in ParallelExecutor Reduce mode, so I add a flag for
+            # the new_grad here.
+            new_grad = grad.block.create_var(
+                name=grad.name + core.kNewGradSuffix(),
+                dtype=param.dtype,
+                shape=param.shape,
+                lod_level=param.lod_level,
+                type=core.VarDesc.VarType.LOD_TENSOR)
+
+        inputs = {"X": [grad, regularization_term]}
+        outputs = {"Out": [new_grad]}
+        if framework.in_dygraph_mode():
+            new_grad = core.ops.sum([grad, regularization_term])
+        else:
+            grad.block.append_op(type='sum', inputs=inputs, outputs=outputs)
+
+        return new_grad
+
+    def append_regularization_ops(self,
+                                  parameters_and_grads,
+                                  regularization=None):
+        r"""Create and add backward regularization Operators
+
+        Creates and adds backward regularization operators in the BlockDesc.
+        This will add gradients of the regularizer function to the gradients
+        of the parameters and return these modified gradients. This is the
+        same as implementing weight decay in optimizers for regularization.
+
+        Args:
+            parameters_and_grads: A list of (parameters, gradients) pairs
+                that need to be regularized.
+            regularization: A global regularizer. If a parameter has no
+                regularizer of its own, this global regularizer is applied
+                to it.
+
+        Returns:
+            list[(Variable, Variable)]: list of (parameters, gradients) \
+            pairs with the regularized gradient
+
+        Raises:
+            Exception: Unknown regularization type
+        """
+        params_and_grads = []
+        if framework.in_dygraph_mode():
+            for param, grad in parameters_and_grads:
+                new_grad = self._create_regularization_of_grad(param, grad,
+                                                               regularization)
+                params_and_grads.append((param, new_grad))
+        else:
+            repeate_regularizer = False
+            with framework.name_scope('regularization'):
+                for param, grad in parameters_and_grads:
+                    if not repeate_regularizer and param.regularizer is not None and regularization is not None:
+                        repeate_regularizer = True
+                        logging.info(
+                            "If regularizer of a Parameter has been set by 'fluid.ParamAttr' or 'fluid.WeightNormParamAttr' already. "
+                            "The Regularization[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!"
+                            % regularization.__str__())
+                    with param.block.program._optimized_guard([param, grad]):
+                        new_grad = self._create_regularization_of_grad(
+                            param, grad, regularization)
+                        params_and_grads.append((param, new_grad))
+        return params_and_grads
+
     def apply_gradients(self, params_grads):
         """
         Second part of `minimize`, appending optimization operators for
@@ -916,8 +1002,8 @@ def apply_gradients(self, params_grads):
         params_grads = append_gradient_clip_ops(params_grads)
 
         # Add regularization if any
-        params_grads = append_regularization_ops(params_grads,
-                                                 self.regularization)
+        params_grads = self.append_regularization_ops(params_grads,
+                                                      self.regularization)
 
         optimize_ops = self._create_optimization_pass(params_grads)
         return optimize_ops
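
Not part of the patch: a minimal numpy sketch of what the appended `sum` op computes. For an L2-style regularizer, the regularization term is `coeff * param`, and `_create_regularization_of_grad` combines it with the original gradient; the coefficient and array values below are illustrative, not from the diff.

    import numpy as np

    # Hypothetical parameter, gradient, and decay coefficient.
    param = np.array([0.5, -1.0, 2.0], dtype=np.float32)
    grad = np.array([0.1, 0.2, -0.3], dtype=np.float32)
    l2_coeff = 0.01

    # The regularizer callable returns the term; the 'sum' op appended by
    # _create_regularization_of_grad then yields grad + regularization_term.
    regularization_term = l2_coeff * param
    new_grad = grad + regularization_term
    print(new_grad)  # [ 0.105  0.19  -0.28 ]
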
@@ -939,8 +1025,8 @@ def apply_optimize(self, loss, startup_program, params_grads):
                                framework.default_startup_program()):
                 if self._grad_clip is not None:
                     params_grads = self._grad_clip(params_grads)
-                params_grads = append_regularization_ops(params_grads,
-                                                         self.regularization)
+                params_grads = self.append_regularization_ops(
+                    params_grads, self.regularization)
                 optimize_ops = self._create_optimization_pass(params_grads)
         else:
             program = loss.block.program
@@ -1674,8 +1760,8 @@ def apply_gradients(self, params_grads):
         not_dgc_params_grads = append_gradient_clip_ops(
             not_dgc_params_grads)
 
-        not_dgc_params_grads = append_regularization_ops(not_dgc_params_grads,
-                                                         self.regularization)
+        not_dgc_params_grads = self.append_regularization_ops(
+            not_dgc_params_grads, self.regularization)
 
         params_grads = not_dgc_params_grads + dgc_params_grads
         params_grads = sorted(params_grads, key=lambda x: x[0].name)
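
Not part of the patch: a hedged usage sketch of the precedence rule that `append_regularization_ops` logs about, assuming the Fluid 1.x static-graph API; layer sizes and decay coefficients are illustrative.

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')

    # This parameter carries its own regularizer via ParamAttr, so the
    # optimizer-level regularization below is skipped for it (an info log
    # is emitted once).
    hidden = fluid.layers.fc(
        input=x,
        size=64,
        param_attr=fluid.ParamAttr(
            regularizer=fluid.regularizer.L2Decay(1e-4)))

    # This layer's parameters have no regularizer of their own, so the
    # optimizer-level L1Decay is applied to them.
    pred = fluid.layers.fc(input=hidden, size=1)

    loss = fluid.layers.mean(
        fluid.layers.square_error_cost(input=pred, label=y))
    sgd = fluid.optimizer.SGD(
        learning_rate=0.01,
        regularization=fluid.regularizer.L1Decay(1e-4))
    sgd.minimize(loss)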