
Commit f16c7e9

fix cpu device, test=allcase
1 parent c5a067e commit f16c7e9

File tree

2 files changed (+10 -7 lines)

python/paddle/fluid/optimizer.py

Lines changed: 8 additions & 0 deletions
@@ -4634,6 +4634,9 @@ def _add_op_device_attr_for_op(self, op, idx, block):
               op.type == 'elementwise_div'):
             device = f"{self._device}:all"
             op._set_attr(self._op_device_key, device)
+        elif self._is_weight_decay_op(op) and op.type == 'scale':
+            # set AdamW decay_coeff to device:all
+            op._set_attr(self._op_device_key, f"{self._device}:all")
         elif op.type == "alloc_float_status":
             op._set_attr(self._op_device_key, f"{self._device}:all")
         else:
@@ -5267,6 +5270,11 @@ def _is_regularization_op(self, op):
         return op.desc.has_attr("op_namescope") \
             and op.desc.attr("op_namescope").startswith("/regularization")
 
+    def _is_weight_decay_op(self, op):
+        # in AdamW namescope is /optimizer_*/weight decay/
+        return op.desc.has_attr("op_namescope") \
+            and 'weight decay' in op.desc.attr("op_namescope")
+
     def _get_input_output_info(self, block):
         '''
         Get info of op input and output.
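
For orientation, a minimal sketch (not part of this commit) of how the two additions above work together: _is_weight_decay_op finds ops created under AdamW's "weight decay" namescope, and the matching 'scale' op is tagged to run on every pipeline stage. The standalone function, the mock op interface, and the "op_device" attribute key below are assumptions for illustration.

def tag_adamw_decay_scale_op(op, device, op_device_key="op_device"):
    # AdamW computes decay_coeff = 1 - lr * coeff under a ".../weight decay/"
    # namescope, which shows up in the program as a 'scale' op.
    in_weight_decay_scope = (op.desc.has_attr("op_namescope")
                             and 'weight decay' in op.desc.attr("op_namescope"))
    if in_weight_decay_scope and op.type == 'scale':
        # weight decay is applied on every pipeline stage, so the op must be
        # placed on all devices rather than pinned to a single stage
        op._set_attr(op_device_key, f"{device}:all")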

python/paddle/optimizer/adamw.py

Lines changed: 2 additions & 7 deletions
@@ -161,12 +161,6 @@ def __init__(self,
         self._coeff = coeff
         self._lr_to_coeff = dict()
 
-        self._device = "cpu"
-        if core.is_compiled_with_npu():
-            self._device = "npu"
-        elif core.is_compiled_with_cuda():
-            self._device = "gpu"
-
         super(AdamW, self).__init__(
             learning_rate=learning_rate,
             parameters=parameters,
parameters=parameters,
@@ -218,7 +212,8 @@ def _append_decoupled_weight_decay(self, block, param_and_grad):
             # we do this in _create_optimization_pass
             decay_coeff = self._lr_to_coeff.get(learning_rate, None)
             if decay_coeff is None:
-                with paddle.static.device_guard("{}:all".format(self._device)):
+                # NOTE(wangxi): for pipeline to set device:all
+                with paddle.static.device_guard(None):
                     decay_coeff = 1.0 - learning_rate * self._coeff
                     self._lr_to_coeff[learning_rate] = decay_coeff
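
The adamw.py change relies on paddle.static.device_guard accepting None, which leaves the op's device attribute unset so a later pass (here, the pipeline optimizer) can decide placement. Below is a hedged usage sketch under static graph mode; the program setup and the learning-rate variable name are made up for illustration.

import paddle

paddle.enable_static()
main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    # hypothetical learning-rate variable, mirroring the decay_coeff line above
    lr = paddle.static.create_global_var(
        shape=[1], value=0.001, dtype='float32', name='learning_rate_0')
    coeff = 0.01
    with paddle.static.device_guard(None):
        # no device is pinned here, so the pipeline pass is free to tag the
        # resulting 'scale' op with "<device>:all" later on
        decay_coeff = 1.0 - lr * coeff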
