@@ -15,6 +15,7 @@
 from .optimizer import Optimizer
 from .adam import Adam
 from ..fluid import framework
+from ..fluid.dygraph import base as imperative_base
 import paddle
 from paddle.fluid.dygraph.parallel import apply_collective_grads
 
@@ -171,13 +172,14 @@ def _scale_parameters(self, params_and_grads):
                 learning_rate = self._learning_rate()
             with param.block.program._optimized_guard(
                     [param, grad]), framework.name_scope('weight decay'):
+                scaled_params.append(
+                    (param, grad, param * self._coeff * learning_rate))
                 if param.name not in self._params_name:
-                    scaled_params.append(
-                        (param, grad, param * self._coeff * learning_rate))
                     self._params_name.add(param.name)
                 param = param * self._coeff
         return scaled_params
 
+    @imperative_base.no_grad
     def minimize(self,
                  loss,
                  startup_program=None,
@@ -207,6 +209,7 @@ def minimize(self,
         return optimize_ops, params_grads
 
     @framework.dygraph_only
+    @imperative_base.no_grad
     def step(self):
         if paddle.distributed.get_world_size() > 1:
             apply_collective_grads(self._parameter_list)
@@ -227,7 +230,7 @@ def step(self):
                     [param, grad]), framework.name_scope('weight decay'):
                 updated_param = paddle.fluid.layers.elementwise_sub(
                     x=param, y=scaled_param)
-                param.set_value(updated_param.numpy())
+                paddle.fluid.layers.assign(input=updated_param, output=param)
         self._apply_optimize(
             loss=None, startup_program=None, params_grads=params_grads)
 
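Note (not part of the patch): both hunks apply AdamW-style decoupled weight decay, where each parameter is shrunk by coeff * learning_rate * param inside a 'weight decay' name scope, separately from the gradient-based Adam update that _apply_optimize runs afterwards. The standalone NumPy sketch below illustrates that update rule only; the function name adamw_step and the moment buffers m/v are hypothetical and are not part of Paddle's API.

# Standalone illustrative sketch -- not PaddlePaddle code.
# adamw_step, m, v are hypothetical names; defaults mirror common AdamW settings.
import numpy as np

def adamw_step(param, grad, m, v, t, lr=1e-3, coeff=0.01,
               beta1=0.9, beta2=0.999, eps=1e-8):
    # Decoupled weight decay: subtract lr * coeff * param from the parameter,
    # independently of the gradient (the 'weight decay' step in the hunks above).
    param = param - lr * coeff * param

    # Plain Adam moment updates with bias correction.
    m = beta1 * m + (1 - beta1) * grad
    v = beta2 * v + (1 - beta2) * grad ** 2
    m_hat = m / (1 - beta1 ** t)
    v_hat = v / (1 - beta2 ** t)
    param = param - lr * m_hat / (np.sqrt(v_hat) + eps)
    return param, m, v

# Minimal usage: one update step on a toy parameter vector.
p, m, v = np.ones(3), np.zeros(3), np.zeros(3)
p, m, v = adamw_step(p, grad=np.full(3, 0.5), m=m, v=v, t=1)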