From 6ec8cf3270959a14bfd1e00c5246b31dc27b6b04 Mon Sep 17 00:00:00 2001
From: WangXi <wangxi16@baidu.com>
Date: Tue, 3 Aug 2021 12:47:47 +0000
Subject: [PATCH 1/3] optimize global gradient clip

---
 python/paddle/fluid/clip.py                   | 36 ++++++++++++-------
 .../tests/unittests/test_gradient_clip.py     | 22 ++++++++----
 2 files changed, 39 insertions(+), 19 deletions(-)

diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py
index 8fd01509331e20..ff76c6bae7033b 100644
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -19,6 +19,7 @@
 import warnings
 
 import functools
+import paddle
 from . import layers
 from . import framework
 from . import core
@@ -416,8 +417,8 @@ def _dygraph_clip(self, params_grads):
             if g.type == core.VarDesc.VarType.SELECTED_ROWS:
                 merge_grad = layers.merge_selected_rows(g)
                 merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
-            square = layers.square(merge_grad)
-            sum_square = layers.reduce_sum(square)
+
+            sum_square = paddle.square(paddle.norm(merge_grad))
             sum_square_list.append(sum_square)
 
         # all parameters have been filterd out
@@ -439,6 +440,7 @@ def _dygraph_clip(self, params_grads):
             if getattr(p, 'need_clip', True) is False:
                 params_and_grads.append((p, g))
                 continue
+            # TODO(wangxi): use place elementwise_mul
             new_grad = layers.elementwise_mul(x=g, y=clip_var)
             params_and_grads.append((p, new_grad))
 
@@ -460,8 +462,8 @@ def _static_clip(self, params_grads):
                         merge_grad = layers.get_tensor_from_selected_rows(
                             merge_grad)
 
-                    square = layers.square(merge_grad)
-                    sum_square = layers.reduce_sum(input=square)
+                    # maybe need a reduce_square_sum op
+                    sum_square = paddle.square(paddle.norm(merge_grad))
                     sum_square_list.append(sum_square)
 
             # all parameters have been filterd out
@@ -489,9 +491,14 @@ def _static_clip(self, params_grads):
                     continue
 
                 with p.block.program._optimized_guard([p, g]):
-                    new_grad = layers.elementwise_mul(x=g, y=scale_var)
-                param_new_grad_name_dict[p.name] = new_grad.name
-                params_and_grads.append((p, new_grad))
+                    # in-place scale: Out is g, no new grad var
+                    p.block.append_op(
+                        type='elementwise_mul',
+                        inputs={'X': g,
+                                'Y': scale_var},
+                        outputs={'Out': g})
+                param_new_grad_name_dict[p.name] = g.name
+                params_and_grads.append((p, g))
 
         _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
         return params_and_grads
@@ -513,8 +520,7 @@ def _process_context(self, context, param, grad):
             merge_grad = layers.merge_selected_rows(grad)
             merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
 
-        square = layers.square(merge_grad)
-        local_norm_var = layers.reduce_sum(input=square)
+        local_norm_var = paddle.square(paddle.norm(merge_grad))
         context[self.group_name].append(local_norm_var)
 
         self.context = context
@@ -532,10 +538,14 @@ def _create_operators(self, param, grad):
             assert group_scale_var.shape == (1, )
             self.context[group_scale_name] = group_scale_var
 
-        new_grad = layers.elementwise_mul(
-            x=grad, y=self.context[group_scale_name])
+        # in-place scale: Out is grad, no new grad var
+        param.block.append_op(
+            type='elementwise_mul',
+            inputs={'X': grad,
+                    'Y': self.context[group_scale_name]},
+            outputs={'Out': grad})
 
-        return param, new_grad
+        return param, grad
 
 
 @framework.dygraph_not_support
@@ -709,7 +719,7 @@ def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict):
             continue
         block_id_list.append(block_id)
         for op in param.block.program.global_block().ops:
-            if 'op_namescope' in op.all_attrs() and "gradient_clip" in op.attr(
+            if op.has_attr("op_namescope") and "gradient_clip" in op.attr(
                     "op_namescope") and op.attr('op_role_var'):
                 param_name = op.attr('op_role_var')[0]
                 if param_name in param_new_grad_name_dict:
diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py
index 14f5d4a41a1fed..e4d60bc7cb0d42 100644
--- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py
+++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py
@@ -22,6 +22,8 @@
 import six
 from fake_reader import fake_imdb_reader
 
+paddle.enable_static()
+
 
 def bow_net(data,
             label,
@@ -149,7 +151,10 @@ def clip_gradient(self, params_grads):
     def check_clip_result(self, out, out_clip):
         global_norm = 0
         for v in out:
-            global_norm += np.sum(np.power(v, 2))
+            # if encounter numerical accuracy problem, use
+            # global_norm += np.square(np.linalg.norm(v))
+            # and maybe paddle need a reduce_square_sum op better
+            global_norm += np.sum(np.square(v))
         global_norm = np.sqrt(global_norm)
         scale = self.clip_norm / np.maximum(self.clip_norm, global_norm)
         res = []
@@ -160,7 +165,8 @@ def check_clip_result(self, out, out_clip):
             self.assertTrue(
                 np.allclose(
                     a=u, b=v, rtol=1e-5, atol=1e-8),
-                "gradient clip by global norm has wrong results!")
+                "gradient clip by global norm has wrong results!, \nu={}\nv={}\ndiff={}".
+                format(u, v, u - v))
 
     # test whether the ouput is right when use 'set_gradient_clip'
     def test_old_gradient_clip(self):
@@ -210,12 +216,16 @@ def test_none_grad(self):
         params_grads = [(x, None), (x, y), (y, x)]
         params_grads = clip(params_grads)
         self.assertTrue(
-            len(clip(params_grads)) == 2,
+            len(params_grads) == 2,
             "ClipByGlobalNorm: when grad is None, it shouldn't be returned by gradient clip!"
         )
-        self.assertTrue(
-            params_grads[0][1].name != 'y',
-            "ClipByGlobalNorm: param_grad (x, y) should be clipped!")
+
+        ops = [op.type for op in x.block.ops]
+        self.assertListEqual(ops, [
+            'frobenius_norm', 'square', 'frobenius_norm', 'square', 'sum',
+            'sqrt', 'fill_constant', 'elementwise_max', 'elementwise_div',
+            'elementwise_mul', 'elementwise_mul'
+        ])
 
     # raise typeError
     def test_tpyeError(self):

From d8de7ed2ce9847fb54104ff09ecb2533635950ea Mon Sep 17 00:00:00 2001
From: WangXi <wangxi16@baidu.com>
Date: Wed, 4 Aug 2021 14:54:18 +0800
Subject: [PATCH 2/3] use squared_l2_norm, fix ci

---
 python/paddle/fluid/clip.py                   | 30 +++++++++++++++----
 .../test_fleet_sharding_meta_optimizer.py     |  4 +--
 .../tests/unittests/test_gradient_clip.py     |  7 ++---
 3 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py
index ff76c6bae7033b..0a786d5178bd65 100644
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -25,6 +25,9 @@
 from . import core
 from . import name_scope
 from .dygraph import base as imperative_base
+from .data_feeder import check_variable_and_dtype
+from .framework import in_dygraph_mode
+from .layer_helper import LayerHelper
 
 __all__ = [
     'set_gradient_clip', 'ErrorClipByValue', 'ClipGradByValue',
@@ -32,6 +35,24 @@
 ]
 
 
+def _squared_l2_norm(x):
+    r"""
+    This OP returns the squared L2 norm of a tensor.
+    """
+    if in_dygraph_mode():
+        core.ops.squared_l2_norm(x)
+
+    op_type = 'squared_l2_norm'
+    check_variable_and_dtype(x, 'x', ['float32'], op_type)
+    helper = LayerHelper(op_type, **locals())
+    out = helper.create_variable_for_type_inference(x.dtype)
+
+    inputs = {"X": x}
+    outputs = {'Out': out}
+    helper.append_op(type=op_type, inputs=inputs, outputs=outputs)
+    return out
+
+
 class BaseErrorClipAttr(object):
     def __str__(self):
         raise NotImplementedError()
@@ -418,7 +439,7 @@ def _dygraph_clip(self, params_grads):
                 merge_grad = layers.merge_selected_rows(g)
                 merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
 
-            sum_square = paddle.square(paddle.norm(merge_grad))
+            sum_square = _squared_l2_norm(merge_grad)
             sum_square_list.append(sum_square)
 
         # all parameters have been filterd out
@@ -440,7 +461,7 @@ def _dygraph_clip(self, params_grads):
             if getattr(p, 'need_clip', True) is False:
                 params_and_grads.append((p, g))
                 continue
-            # TODO(wangxi): use place elementwise_mul
+            # TODO(wangxi): use inplace elementwise_mul
             new_grad = layers.elementwise_mul(x=g, y=clip_var)
             params_and_grads.append((p, new_grad))
 
@@ -462,8 +483,7 @@ def _static_clip(self, params_grads):
                         merge_grad = layers.get_tensor_from_selected_rows(
                             merge_grad)
 
-                    # maybe need a reduce_square_sum op
-                    sum_square = paddle.square(paddle.norm(merge_grad))
+                    sum_square = _squared_l2_norm(merge_grad)
                     sum_square_list.append(sum_square)
 
             # all parameters have been filterd out
@@ -520,7 +540,7 @@ def _process_context(self, context, param, grad):
             merge_grad = layers.merge_selected_rows(grad)
             merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
 
-        local_norm_var = paddle.square(paddle.norm(merge_grad))
+        local_norm_var = _squared_l2_norm(merge_grad)
         context[self.group_name].append(local_norm_var)
 
         self.context = context
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py
index 1387827736560e..36aa5404f55110 100755
--- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py
@@ -264,8 +264,8 @@ def test_sharding_gradient_clip(self):
             'elementwise_add_grad', 'mul_grad', 'tanh_grad',
             'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream',
             'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum',
-            'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', 'square',
-            'reduce_sum', 'square', 'reduce_sum', 'square', 'reduce_sum', 'sum',
+            'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream',
+            'squared_l2_norm', 'squared_l2_norm', 'squared_l2_norm', 'sum',
             'c_allreduce_sum', 'sqrt', 'fill_constant', 'elementwise_max',
             'elementwise_div', 'elementwise_mul', 'elementwise_mul',
             'elementwise_mul', 'momentum', 'momentum', 'momentum'
diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py
index e4d60bc7cb0d42..9b6dbc00f7c565 100644
--- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py
+++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py
@@ -151,9 +151,6 @@ def clip_gradient(self, params_grads):
     def check_clip_result(self, out, out_clip):
         global_norm = 0
         for v in out:
-            # if encounter numerical accuracy problem, use
-            # global_norm += np.square(np.linalg.norm(v))
-            # and maybe paddle need a reduce_square_sum op better
             global_norm += np.sum(np.square(v))
         global_norm = np.sqrt(global_norm)
         scale = self.clip_norm / np.maximum(self.clip_norm, global_norm)
@@ -222,8 +219,8 @@ def test_none_grad(self):
 
         ops = [op.type for op in x.block.ops]
         self.assertListEqual(ops, [
-            'frobenius_norm', 'square', 'frobenius_norm', 'square', 'sum',
-            'sqrt', 'fill_constant', 'elementwise_max', 'elementwise_div',
+            'squared_l2_norm', 'squared_l2_norm', 'sum', 'sqrt',
+            'fill_constant', 'elementwise_max', 'elementwise_div',
             'elementwise_mul', 'elementwise_mul'
         ])
 

From d1d500a6e06b928f7ab685bafc32317a3e3969ec Mon Sep 17 00:00:00 2001
From: WangXi <wangxi16@baidu.com>
Date: Wed, 4 Aug 2021 19:47:31 +0800
Subject: [PATCH 3/3] fix _squared_l2_norm: npu/xpu fallback, dygraph return

---
 python/paddle/fluid/clip.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py
index 0a786d5178bd65..fbe524376e592e 100644
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -39,8 +39,14 @@ def _squared_l2_norm(x):
     r"""
     This OP returns the squared L2 norm of a tensor.
     """
+
+    if core.is_compiled_with_npu() or core.is_compiled_with_xpu():
+        square = layers.square(x)
+        sum_square = layers.reduce_sum(square)
+        return sum_square
+
     if in_dygraph_mode():
-        core.ops.squared_l2_norm(x)
+        return core.ops.squared_l2_norm(x)
 
     op_type = 'squared_l2_norm'
     check_variable_and_dtype(x, 'x', ['float32'], op_type)