From 8c26c389459d32f1c0782650ba00b9d0d2785239 Mon Sep 17 00:00:00 2001 From: WangXi Date: Thu, 26 Aug 2021 12:38:42 +0800 Subject: [PATCH 1/2] fix npu clear float status in pipeline --- python/paddle/fluid/optimizer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 478ea75472717a..46264af14ed324 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -4659,6 +4659,9 @@ def _add_op_device_attr_for_op(self, op, idx, block): op._set_attr(self._op_device_key, f"{self._device}:all") elif op.type == "alloc_float_status" or op.type == "clear_float_status": op._set_attr(self._op_device_key, f"{self._device}:all") + # NOTE(wangxi): NPU should only clear the float status + # once at each batch step + op._set_attr(self._op_role_key, self._op_role.LRSched) else: other_known_ops = [ 'update_loss_scaling', 'reduce_any', 'concat', 'sum', From 45a2af0c1eda6e56459ebc2318ceb5844d83dd78 Mon Sep 17 00:00:00 2001 From: WangXi Date: Thu, 26 Aug 2021 15:07:30 +0800 Subject: [PATCH 2/2] fix pipeline float status uninitialized --- python/paddle/fluid/optimizer.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 46264af14ed324..eb3d559ddcde9e 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -4654,18 +4654,22 @@ def _add_op_device_attr_for_op(self, op, idx, block): op.type == 'elementwise_div'): device = f"{self._device}:all" op._set_attr(self._op_device_key, device) - elif self._is_weight_decay_op(op) and op.type == 'scale': - # set AdamW decay_coeff to device:all - op._set_attr(self._op_device_key, f"{self._device}:all") elif op.type == "alloc_float_status" or op.type == "clear_float_status": op._set_attr(self._op_device_key, f"{self._device}:all") # NOTE(wangxi): NPU should only clear the float status # once at each batch step 
op._set_attr(self._op_role_key, self._op_role.LRSched) + + float_status_name = op.output_arg_names[0] + float_status_var = block.var(float_status_name) + # FIXME(wangxi): the pipeline lr schedule executes in sub_scope(0) + # while the update executes in sub_scope(last_micro_step), so the + # float status var is made persistable to live in the global scope + float_status_var.persistable = True else: other_known_ops = [ 'update_loss_scaling', 'reduce_any', 'concat', 'sum', - 'check_finite_and_unscale', 'alloc_float_status', 'memcpy' + 'check_finite_and_unscale', 'memcpy' ] assert op.type in other_known_ops, "For other ops without " \ "op_device set, they must be one of {}, but it " \