 from .initializer import Constant
 from .layer_helper import LayerHelper
 from .layers import ops
-from .regularizer import append_regularization_ops
 from .dygraph import base as imperative_base
 from .dygraph import no_grad
 from .dygraph.learning_rate_scheduler import LearningRateDecay, _LearningRateEpochDecay
@@ -884,6 +883,93 @@ def backward(self,
                                            act_no_grad_set, callbacks)
         return params_grads
 
+    def _create_regularization_of_grad(self, param, grad, regularization=None):
+        """ Create and add backward regularization Operators
+
+        Function helper of append_regularization_ops.
+        """
+        # If no gradient or no regularization is specified, then we don't need to do anything
+        if grad is None or ((not hasattr(param, 'regularizer') or
+                             (hasattr(param, 'regularizer') and
+                              param.regularizer is None)) and
+                            regularization is None):
+            return grad
+        regularization_term = None
+        if hasattr(param, 'regularizer') and param.regularizer is not None:
+            # Add variable for regularization term in grad block
+            regularization_term = param.regularizer(param, grad, grad.block)
+        elif regularization is not None:
+            regularization_term = regularization(param, grad, grad.block)
+
+        assert regularization_term is not None
+
+        new_grad = grad
+        if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
+            # FIXME(zcd): If the grad is SELECTED_ROWS, after regularization,
+            # the grad's type and name will be changed. But the gradient's name
+            # is used in ParallelExecutor Reduce mode, so I add a flag for
+            # the new_grad here.
+            new_grad = grad.block.create_var(
+                name=grad.name + core.kNewGradSuffix(),
+                dtype=param.dtype,
+                shape=param.shape,
+                lod_level=param.lod_level,
+                type=core.VarDesc.VarType.LOD_TENSOR)
+
+        inputs = {"X": [grad, regularization_term]}
+        outputs = {"Out": [new_grad]}
+        if framework.in_dygraph_mode():
+            new_grad = core.ops.sum([grad, regularization_term])
+        else:
+            grad.block.append_op(type='sum', inputs=inputs, outputs=outputs)
+
+        return new_grad
+
+    def append_regularization_ops(self,
+                                  parameters_and_grads,
+                                  regularization=None):
+        r"""Create and add backward regularization Operators
+
+        Creates and adds backward regularization operators in the BlockDesc.
+        This will add gradients of the regularizer function to the gradients
+        of the parameters and return these modified gradients. This is the
+        same as implementing weight decay in optimizers for regularization.
+
+        Args:
+            parameters_and_grads: A list of (parameters, gradients) pairs
+                that need to be regularized.
+            regularization: A global regularizer. If a parameter has no
+                regularizer of its own, this global regularizer is applied
+                to it.
+
+        Returns:
+            list[(Variable, Variable)]: list of (parameters, gradients) \
+            pairs with the regularized gradient
+
+        Raises:
+            Exception: Unknown regularization type
+        """
+        params_and_grads = []
+        if framework.in_dygraph_mode():
+            for param, grad in parameters_and_grads:
+                new_grad = self._create_regularization_of_grad(param, grad,
+                                                               regularization)
+                params_and_grads.append((param, new_grad))
+        else:
+            repeate_regularizer = False
+            with framework.name_scope('regularization'):
+                for param, grad in parameters_and_grads:
+                    if not repeate_regularizer and param.regularizer is not None and regularization is not None:
+                        repeate_regularizer = True
+                        logging.info(
+                            "If regularizer of a Parameter has been set by 'fluid.ParamAttr' or 'fluid.WeightNormParamAttr' already. "
+                            "The Regularization[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!"
+                            % regularization.__str__())
+                    with param.block.program._optimized_guard([param, grad]):
+                        new_grad = self._create_regularization_of_grad(
+                            param, grad, regularization)
+                        params_and_grads.append((param, new_grad))
+        return params_and_grads
+
     def apply_gradients(self, params_grads):
         """
         Second part of `minimize`, appending optimization operators for
@@ -916,8 +1002,8 @@ def apply_gradients(self, params_grads):
         params_grads = append_gradient_clip_ops(params_grads)
 
         # Add regularization if any
-        params_grads = append_regularization_ops(params_grads,
-                                                 self.regularization)
+        params_grads = self.append_regularization_ops(params_grads,
+                                                      self.regularization)
 
         optimize_ops = self._create_optimization_pass(params_grads)
         return optimize_ops
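
Not part of the patch: a minimal numpy sketch of what the appended `sum` op computes. For an L2-style regularizer, the regularization term is `coeff * param`, and `_create_regularization_of_grad` combines it with the original gradient; the coefficient and array values below are illustrative, not from the diff.

    import numpy as np

    # Hypothetical parameter, gradient, and decay coefficient.
    param = np.array([0.5, -1.0, 2.0], dtype=np.float32)
    grad = np.array([0.1, 0.2, -0.3], dtype=np.float32)
    l2_coeff = 0.01

    # The regularizer callable returns the term; the 'sum' op appended by
    # _create_regularization_of_grad then yields grad + regularization_term.
    regularization_term = l2_coeff * param
    new_grad = grad + regularization_term
    print(new_grad)  # [ 0.105  0.19  -0.28 ]
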
@@ -939,8 +1025,8 @@ def apply_optimize(self, loss, startup_program, params_grads):
                                framework.default_startup_program()):
                 if self._grad_clip is not None:
                     params_grads = self._grad_clip(params_grads)
-                params_grads = append_regularization_ops(params_grads,
-                                                         self.regularization)
+                params_grads = self.append_regularization_ops(
+                    params_grads, self.regularization)
                 optimize_ops = self._create_optimization_pass(params_grads)
         else:
             program = loss.block.program
@@ -1674,8 +1760,8 @@ def apply_gradients(self, params_grads):
         not_dgc_params_grads = append_gradient_clip_ops(
             not_dgc_params_grads)
 
-        not_dgc_params_grads = append_regularization_ops(not_dgc_params_grads,
-                                                         self.regularization)
+        not_dgc_params_grads = self.append_regularization_ops(
+            not_dgc_params_grads, self.regularization)
 
         params_grads = not_dgc_params_grads + dgc_params_grads
         params_grads = sorted(params_grads, key=lambda x: x[0].name)
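
Not part of the patch: a hedged usage sketch of the precedence rule that `append_regularization_ops` logs about, assuming the Fluid 1.x static-graph API; layer sizes and decay coefficients are illustrative.

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')

    # This parameter carries its own regularizer via ParamAttr, so the
    # optimizer-level regularization below is skipped for it (an info log
    # is emitted once).
    hidden = fluid.layers.fc(
        input=x,
        size=64,
        param_attr=fluid.ParamAttr(
            regularizer=fluid.regularizer.L2Decay(1e-4)))

    # This layer's parameters have no regularizer of their own, so the
    # optimizer-level L1Decay is applied to them.
    pred = fluid.layers.fc(input=hidden, size=1)

    loss = fluid.layers.mean(
        fluid.layers.square_error_cost(input=pred, label=y))
    sgd = fluid.optimizer.SGD(
        learning_rate=0.01,
        regularization=fluid.regularizer.L1Decay(1e-4))
    sgd.minimize(loss)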