From 1e5695c66fb7ed3258db83f609e9f217852702fb Mon Sep 17 00:00:00 2001
From: pangyoki
Date: Wed, 30 Jun 2021 12:47:02 +0000
Subject: [PATCH 1/4] delete useless GELU in gelu npu op

---
 paddle/fluid/operators/gelu_op_npu.cc | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/paddle/fluid/operators/gelu_op_npu.cc b/paddle/fluid/operators/gelu_op_npu.cc
index 6e60926cc7951a..79c477f8a1bd4a 100644
--- a/paddle/fluid/operators/gelu_op_npu.cc
+++ b/paddle/fluid/operators/gelu_op_npu.cc
@@ -61,13 +61,7 @@ class GeluGradNPUKernel : public framework::OpKernel<T> {
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
 
-    Tensor out(x->type());
-    out.mutable_data<T>(x->dims(), place);
-    const auto& runner_out = NpuOpRunner("Gelu", {*x}, {out}, {});
-    runner_out.Run(stream);
-
-    const auto& runner_dx =
-        NpuOpRunner("GeluGrad", {*dout, *x, out}, {*dx}, {});
+    const auto& runner_dx = NpuOpRunner("GeluGrad", {*dout, *x, *x}, {*dx}, {});
     runner_dx.Run(stream);
   }
 };

From 084173c5487138015cccad68582868b1d6da92f8 Mon Sep 17 00:00:00 2001
From: pangyoki
Date: Wed, 30 Jun 2021 23:29:33 +0000
Subject: [PATCH 2/4] add description

---
 paddle/fluid/operators/gelu_op_npu.cc | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/gelu_op_npu.cc b/paddle/fluid/operators/gelu_op_npu.cc
index 79c477f8a1bd4a..74ba3be14c8cae 100644
--- a/paddle/fluid/operators/gelu_op_npu.cc
+++ b/paddle/fluid/operators/gelu_op_npu.cc
@@ -61,7 +61,16 @@ class GeluGradNPUKernel : public framework::OpKernel<T> {
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
 
-    const auto& runner_dx = NpuOpRunner("GeluGrad", {*dout, *x, *x}, {*dx}, {});
+    // NOTE(pangyoki): In the original implementation of GeluGrad op, the input
+    // is
+    // {*dout, *x, out}, where out = Gelu(x). However, we find that variable
+    // `out`
+    // was not actually used. In order to improve performance, the useless GELU
+    // operation was deleted.
+    // We directly use `*dout` as a placeholder to replace `out`, it will not be
+    // used in calculations.
+    const auto& runner_dx =
+        NpuOpRunner("GeluGrad", {*dout, *x, *dout}, {*dx}, {});
     runner_dx.Run(stream);
   }
 };

From 2b30114380e0b33d586b2cf695cca886c47959d2 Mon Sep 17 00:00:00 2001
From: pangyoki
Date: Wed, 30 Jun 2021 23:32:35 +0000
Subject: [PATCH 3/4] fix format

---
 paddle/fluid/operators/gelu_op_npu.cc | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/paddle/fluid/operators/gelu_op_npu.cc b/paddle/fluid/operators/gelu_op_npu.cc
index 74ba3be14c8cae..4db82e96cfae7c 100644
--- a/paddle/fluid/operators/gelu_op_npu.cc
+++ b/paddle/fluid/operators/gelu_op_npu.cc
@@ -62,13 +62,11 @@ class GeluGradNPUKernel : public framework::OpKernel<T> {
             .stream();
 
     // NOTE(pangyoki): In the original implementation of GeluGrad op, the input
-    // is
-    // {*dout, *x, out}, where out = Gelu(x). However, we find that variable
-    // `out`
-    // was not actually used. In order to improve performance, the useless GELU
-    // operation was deleted.
-    // We directly use `*dout` as a placeholder to replace `out`, it will not be
-    // used in calculations.
+    // is {*dout, *x, out}, where out = Gelu(x). However, we find that variable
+    // `out` was not actually used. In order to improve performance, the
+    // useless GELU operation was deleted.
+    // We directly use `*dout` as a placeholder to replace `out`, it will not
+    // be used in calculations.
     const auto& runner_dx =
         NpuOpRunner("GeluGrad", {*dout, *x, *dout}, {*dx}, {});
     runner_dx.Run(stream);

From 13741a1bf44fa2ad4a40fae1d91caa454a30a650 Mon Sep 17 00:00:00 2001
From: pangyoki
Date: Fri, 2 Jul 2021 06:36:11 +0000
Subject: [PATCH 4/4] add check_grad in gelu unittest

---
 .../fluid/tests/unittests/npu/test_gelu_op_npu.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/npu/test_gelu_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_gelu_op_npu.py
index efa1918206b035..d811aaf228ddf5 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_gelu_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_gelu_op_npu.py
@@ -58,12 +58,9 @@ def init_dtype(self):
     def test_check_output(self):
         self.check_output_with_place(self.place, check_dygraph=False, atol=1e-3)
 
-    # TODO(ascendrc): Add grad test
-    # def test_check_grad(self):
-    #     if self.dtype == np.float16:
-    #         return
-    #     self.check_grad(['X'], 'Out')
-    #
+    def test_check_grad(self):
+        self.check_grad_with_place(
+            self.place, ['X'], 'Out', check_dygraph=False)
 
 
 @unittest.skipIf(not paddle.is_compiled_with_npu(),
@@ -115,10 +112,10 @@ def _test(self, run_npu=True):
                 name="label", shape=[32, 1], dtype='int64')
 
             c = paddle.multiply(a, b)
-            d = fluid.layers.gelu(c)
-            fc_1 = fluid.layers.fc(input=d, size=128)
-            prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
+            fc_1 = fluid.layers.fc(input=c, size=128)
+            fc_1_gelu = fluid.layers.gelu(fc_1)
+            prediction = fluid.layers.fc(input=fc_1_gelu, size=2, act='softmax')
 
             cost = fluid.layers.cross_entropy(input=prediction, label=label)
             loss = fluid.layers.reduce_mean(cost)
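
Not part of the patches above: a minimal, standard-library-only Python sketch of
why GeluGrad can treat its third input as a placeholder. The exact GELU gradient,
d/dx [x * Phi(x)] = Phi(x) + x * phi(x), depends only on x and never on
out = Gelu(x). The erf-based formula below is used purely for illustration and
assumes nothing about the NPU kernel's internal approximation.

import math

def gelu(x):
    # gelu(x) = x * Phi(x), where Phi is the standard normal CDF
    return x * 0.5 * (1.0 + math.erf(x / math.sqrt(2.0)))

def gelu_grad(x):
    # d/dx [x * Phi(x)] = Phi(x) + x * phi(x); only x appears, never gelu(x),
    # which is why computing the forward Gelu inside the grad kernel was useless.
    cdf = 0.5 * (1.0 + math.erf(x / math.sqrt(2.0)))
    pdf = math.exp(-0.5 * x * x) / math.sqrt(2.0 * math.pi)
    return cdf + x * pdf

if __name__ == "__main__":
    # Check the analytic gradient against a central finite difference.
    eps = 1e-6
    for x in (-2.0, -0.5, 0.0, 0.5, 2.0):
        numeric = (gelu(x + eps) - gelu(x - eps)) / (2.0 * eps)
        assert abs(numeric - gelu_grad(x)) < 1e-5, (x, numeric, gelu_grad(x))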