 from .initializer import Constant
 from .layer_helper import LayerHelper
 from .layers import ops
-from .regularizer import append_regularization_ops
 from .dygraph import base as imperative_base
 from .dygraph import no_grad
 from .dygraph.learning_rate_scheduler import LearningRateDecay, _LearningRateEpochDecay
@@ -805,6 +804,93 @@ def backward(self,
                                            act_no_grad_set, callbacks)
         return params_grads
 
+    def _create_regularization_of_grad(self, param, grad, regularization=None):
+        """ Create and add backward regularization Operators
+
+        Function helper of append_regularization_ops.
+        """
+        # If no gradient or no regularization is specified, then we don't need to do anything
+        if grad is None or ((not hasattr(param, 'regularizer') or
+                             (hasattr(param, 'regularizer') and
+                              param.regularizer is None)) and
+                            regularization is None):
+            return grad
+        regularization_term = None
+        if hasattr(param, 'regularizer') and param.regularizer is not None:
+            # Add variable for regularization term in grad block
+            regularization_term = param.regularizer(param, grad, grad.block)
+        elif regularization is not None:
+            regularization_term = regularization(param, grad, grad.block)
+
+        assert regularization_term is not None
+
+        new_grad = grad
+        if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
+            # FIXME(zcd): If the grad is SELECTED_ROWS, after regularization,
+            # the grad's type and name will be changed. But the gradient's name
+            # is used in ParallelExecutor Reduce mode, so I add a flag for
+            # the new_grad here.
+            new_grad = grad.block.create_var(
+                name=grad.name + core.kNewGradSuffix(),
+                dtype=param.dtype,
+                shape=param.shape,
+                lod_level=param.lod_level,
+                type=core.VarDesc.VarType.LOD_TENSOR)
+
+        inputs = {"X": [grad, regularization_term]}
+        outputs = {"Out": [new_grad]}
+        if framework.in_dygraph_mode():
+            new_grad = core.ops.sum([grad, regularization_term])
+        else:
+            grad.block.append_op(type='sum', inputs=inputs, outputs=outputs)
+
+        return new_grad
+
+    def append_regularization_ops(self,
+                                  parameters_and_grads,
+                                  regularization=None):
+        r"""Create and add backward regularization Operators
+
+        Creates and adds backward regularization operators in the BlockDesc.
+        This will add gradients of the regularizer function to the gradients
+        of the parameters and return these modified gradients. This is the
+        same as implementing weight decay in optimizers for regularization.
+
+        Args:
+            parameters_and_grads: A list of (parameters, gradients) pairs
+                                  that need to be regularized.
+            regularization: A global regularizer. If the regularizer of a
+                            parameter is not set, this one will be applied.
+
+        Returns:
+            list[(Variable, Variable)]: list of (parameters, gradients) \
+            pair with the regularized gradient
+
+        Raises:
+            Exception: Unknown regularization type
+        """
+        params_and_grads = []
+        if framework.in_dygraph_mode():
+            for param, grad in parameters_and_grads:
+                new_grad = self._create_regularization_of_grad(param, grad,
+                                                               regularization)
+                params_and_grads.append((param, new_grad))
+        else:
+            repeate_regularizer = False
+            with framework.name_scope('regularization'):
+                for param, grad in parameters_and_grads:
+                    if not repeate_regularizer and param.regularizer is not None and regularization is not None:
+                        repeate_regularizer = True
+                        logging.info(
885+ "If regularizer of a Parameter has been set by 'fluid.ParamAttr' or 'fluid.WeightNormParamAttr' already. "
886+ "The Regularization[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!"
+                            % regularization.__str__())
+                    with param.block.program._optimized_guard([param, grad]):
+                        new_grad = self._create_regularization_of_grad(
+                            param, grad, regularization)
+                        params_and_grads.append((param, new_grad))
+        return params_and_grads
+
     def apply_gradients(self, params_grads):
         """
         Second part of `minimize`, appending optimization operators for
@@ -837,8 +923,8 @@ def apply_gradients(self, params_grads):
             params_grads = append_gradient_clip_ops(params_grads)
 
         # Add regularization if any
-        params_grads = append_regularization_ops(params_grads,
-                                                 self.regularization)
+        params_grads = self.append_regularization_ops(params_grads,
+                                                      self.regularization)
 
         optimize_ops = self._create_optimization_pass(params_grads)
         return optimize_ops
@@ -860,8 +946,8 @@ def apply_optimize(self, loss, startup_program, params_grads):
                                framework.default_startup_program()):
                 if self._grad_clip is not None:
                     params_grads = self._grad_clip(params_grads)
-                params_grads = append_regularization_ops(params_grads,
-                                                         self.regularization)
+                params_grads = self.append_regularization_ops(
+                    params_grads, self.regularization)
                 optimize_ops = self._create_optimization_pass(params_grads)
         else:
             program = loss.block.program
@@ -1595,8 +1681,8 @@ def apply_gradients(self, params_grads):
         not_dgc_params_grads = append_gradient_clip_ops(
             not_dgc_params_grads)
 
-        not_dgc_params_grads = append_regularization_ops(not_dgc_params_grads,
-                                                         self.regularization)
+        not_dgc_params_grads = self.append_regularization_ops(
+            not_dgc_params_grads, self.regularization)
 
         params_grads = not_dgc_params_grads + dgc_params_grads
         params_grads = sorted(params_grads, key=lambda x: x[0].name)
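
For context, a minimal sketch of how the relocated regularization path is exercised after this change. The network, optimizer, and L2Decay coefficient below are illustrative choices, not part of the diff, and assume the fluid 1.x static-graph API:

    import paddle.fluid as fluid

    main_prog = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(main_prog, startup_prog):
        x = fluid.data(name='x', shape=[None, 13], dtype='float32')
        y = fluid.data(name='y', shape=[None, 1], dtype='float32')
        pred = fluid.layers.fc(input=x, size=1)
        loss = fluid.layers.mean(
            fluid.layers.square_error_cost(input=pred, label=y))

        # The regularization argument becomes self.regularization on the
        # optimizer; optimizer.append_regularization_ops() (added above) is
        # called from apply_gradients() during minimize(). A per-parameter
        # regularizer set via fluid.ParamAttr would still take precedence.
        sgd = fluid.optimizer.SGD(
            learning_rate=0.01,
            regularization=fluid.regularizer.L2Decay(regularization_coeff=1e-4))
        sgd.minimize(loss)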