From 713d5b3c64604d25fe5ec73d69d838fcfaf5fc79 Mon Sep 17 00:00:00 2001 From: Jakub Piasecki Date: Fri, 16 Jul 2021 19:08:20 +0200 Subject: [PATCH 1/4] added sigmoid BF16 FWD/BWD and gelu BF16 BWD --- .../operators/mkldnn/activation_mkldnn_op.cc | 7 +- .../mkldnn/test_activation_mkldnn_op.py | 74 ++++++++++++------- .../tests/unittests/test_activation_op.py | 3 +- 3 files changed, 55 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index 177e539c4b6c29..d9d7b045f0ff42 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -251,7 +251,9 @@ namespace ops = paddle::operators; ops::MKLDNNActivationKernel>); \ REGISTER_OP_KERNEL( \ act_type##_grad, MKLDNN, ::paddle::platform::CPUPlace, \ - ops::MKLDNNActivationGradKernel>); + ops::MKLDNNActivationGradKernel>, \ + ops::MKLDNNActivationGradKernel< \ + ops::grad_functor>); #define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \ __macro(relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \ @@ -259,7 +261,6 @@ namespace ops = paddle::operators; __macro(leaky_relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \ __macro(swish, SwishMKLDNNFunctor, SwishMKLDNNGradFunctor); \ __macro(hardswish, HardSwishMKLDNNFunctor, HardSwishMKLDNNGradFunctor); \ - __macro(sigmoid, SigmoidMKLDNNFunctor, SigmoidMKLDNNGradFunctor); \ __macro(tanh, TanhMKLDNNFunctor, TanhMKLDNNGradFunctor); \ __macro(sqrt, SqrtMKLDNNFunctor, SqrtMKLDNNGradFunctor); \ __macro(abs, AbsMKLDNNFunctor, AbsMKLDNNGradFunctor); @@ -267,3 +268,5 @@ namespace ops = paddle::operators; FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL); REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(gelu, GeluMKLDNNFunctor, GeluMKLDNNGradFunctor); +REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(sigmoid, SigmoidMKLDNNFunctor, + SigmoidMKLDNNGradFunctor); \ No newline at end of file diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py index 75348cd53e1b80..6190086d97e801 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py @@ -16,9 +16,9 @@ import unittest import numpy as np -from scipy.special import expit +from scipy.special import expit, erf import paddle.fluid.core as core -from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 +from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool, convert_float_to_uint16 from paddle.fluid.tests.unittests.test_activation_op import TestActivation, TestRelu, TestTanh, TestSqrt, TestAbs, TestLeakyRelu, TestSwish, TestHardSwish, TestRelu6, TestSigmoid from paddle.fluid.tests.unittests.test_gelu_op import gelu from mkldnn_op_test import check_if_mkldnn_primitives_exist_in_bwd @@ -79,46 +79,70 @@ def setUp(self): self.attrs = {"use_mkldnn": True, "approximate": True} -@unittest.skipIf(not core.supports_bfloat16(), - "place does not support BF16 evaluation") -class TestMKLDNNGeluBf16Dim2(TestActivation): +#Use it as a base class for BF16 activation tests, just override necessary functions +class TestMKLDNNSigmoidBF16Op(TestActivation): + @OpTestTool.skip_if_not_cpu_bf16() + def config(self): + self.op_type = "sigmoid" + self.op_func = lambda x: (1 / (1 + np.exp(-x))) + self.op_grad_func = lambda dout, x: (dout * self.op_func(x)) * (1 - self.op_func(x)) + + def 
set_attrs(self): + self.attrs = {"use_mkldnn": True} + + def init_data(self): + self.x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype(np.float32) + def setUp(self): - self.op_type = "gelu" self.dtype = np.uint16 + self.init_data() + self.config() + self.out = self.op_func(self.x) - x = np.random.uniform(-1, 1, [11, 17]).astype(np.float32) - out = convert_float_to_uint16(gelu(x, False)) + self.inputs = {'X': convert_float_to_uint16(self.x)} + self.outputs = {'Out': self.out} + self.set_attrs() - self.inputs = {'X': convert_float_to_uint16(x)} - self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True} + def calculate_grads(self): + self.dx = self.op_grad_func(self.out, self.x) def test_check_output(self): self.check_output_with_place(core.CPUPlace()) def test_check_grad(self): - pass + self.calculate_grads() + self.check_grad_with_place( + core.CPUPlace(), ["X"], + "Out", + user_defined_grads=[self.dx], + user_defined_grad_outputs=[convert_float_to_uint16(self.out)]) -@unittest.skipIf(not core.supports_bfloat16(), - "place does not support BF16 evaluation") -class TestMKLDNNGeluBf16Dim2Approx(TestActivation): - def setUp(self): +class TestMKLDNNGeluErfBF16Op(TestMKLDNNSigmoidBF16Op): + def config(self): self.op_type = "gelu" - self.dtype = np.uint16 + self.op_func = lambda x: gelu(x, False) + self.op_grad_func = lambda dout, x: (dout * (0.5 + 0.5 * erf(x / np.sqrt(2)) + (x / np.sqrt(2 * np.pi) * np.exp(-0.5 * np.power(x, 2))))) - x = np.random.uniform(-1, 1, [11, 17]).astype(np.float32) - out = convert_float_to_uint16(gelu(x, True)) - self.inputs = {'X': convert_float_to_uint16(x)} - self.outputs = {'Out': out} +class TestMKLDNNGeluErfDim2BF16Op(TestMKLDNNGeluErfBF16Op): + def init_data(self): + self.x = np.random.uniform(-1, 1, [11, 17]).astype(np.float32) + + +class TestMKLDNNGeluTanhBF16Op(TestMKLDNNSigmoidBF16Op): + def config(self): + self.op_type = "gelu" + self.op_func = lambda x: gelu(x, True) + self.op_grad_func = lambda dout, x: (dout * 0.5 * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3)))) * (1 + np.sqrt(2 / np.pi) * (x + 0.134145 * np.power(x, 3)) * (1 - np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3)))))) + + def set_attrs(self): self.attrs = {"use_mkldnn": True, "approximate": True} - def test_check_output(self): - self.check_output_with_place(core.CPUPlace()) - def test_check_grad(self): - pass +class TestMKLDNNGeluTanhDim2BF16Op(TestMKLDNNGeluTanhBF16Op): + def init_data(self): + self.x = np.random.uniform(-1, 1, [11, 17]).astype(np.float32) class TestMKLDNNTanhDim2(TestTanh): diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 98d2493257d614..346accac01cc70 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -18,7 +18,7 @@ import numpy as np from scipy.special import expit, erf -from op_test import OpTest, convert_float_to_uint16 +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16, skip_check_grad_ci import paddle import paddle.nn as nn import paddle.nn.functional as F @@ -1619,7 +1619,6 @@ def setUp(self): self.op_type = 'hard_swish' self.init_dtype() - from op_test import skip_check_grad_ci skip_check_grad_ci(reason="not implemented yet") np.random.seed(1024) From 5da3813a097dda4874e04a15714b5ec0041f0cf8 Mon Sep 17 00:00:00 2001 From: Jakub Piasecki Date: Fri, 16 Jul 2021 19:12:54 +0200 Subject: [PATCH 2/4] added newline at EOF --- 
paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index d9d7b045f0ff42..3b92d2e2d88913 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -269,4 +269,4 @@ FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL); REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(gelu, GeluMKLDNNFunctor, GeluMKLDNNGradFunctor); REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(sigmoid, SigmoidMKLDNNFunctor, - SigmoidMKLDNNGradFunctor); \ No newline at end of file + SigmoidMKLDNNGradFunctor); From 94a702b584bcdf55e64e4a701189d5afbc7b715e Mon Sep 17 00:00:00 2001 From: Jakub Piasecki Date: Wed, 21 Jul 2021 22:28:59 +0200 Subject: [PATCH 3/4] switched from lambdas to local functions --- .../mkldnn/test_activation_mkldnn_op.py | 30 +++++++++++++++---- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py index 6190086d97e801..1bcf0388e5b765 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py @@ -84,8 +84,12 @@ class TestMKLDNNSigmoidBF16Op(TestActivation): @OpTestTool.skip_if_not_cpu_bf16() def config(self): self.op_type = "sigmoid" - self.op_func = lambda x: (1 / (1 + np.exp(-x))) - self.op_grad_func = lambda dout, x: (dout * self.op_func(x)) * (1 - self.op_func(x)) + + def op_func(self, x): + return 1 / (1 + np.exp(-x)) + + def op_grad_func(self, dout, x): + return dout * self.op_func(x) * (1 - self.op_func(x)) def set_attrs(self): self.attrs = {"use_mkldnn": True} @@ -121,8 +125,14 @@ def test_check_grad(self): class TestMKLDNNGeluErfBF16Op(TestMKLDNNSigmoidBF16Op): def config(self): self.op_type = "gelu" - self.op_func = lambda x: gelu(x, False) - self.op_grad_func = lambda dout, x: (dout * (0.5 + 0.5 * erf(x / np.sqrt(2)) + (x / np.sqrt(2 * np.pi) * np.exp(-0.5 * np.power(x, 2))))) + + def op_func(self, x): + return gelu(x, False) + + def op_grad_func(self, dout, x): + return (dout * + (0.5 + 0.5 * erf(x / np.sqrt(2)) + + (x / np.sqrt(2 * np.pi) * np.exp(-0.5 * np.power(x, 2))))) class TestMKLDNNGeluErfDim2BF16Op(TestMKLDNNGeluErfBF16Op): @@ -133,8 +143,16 @@ def init_data(self): class TestMKLDNNGeluTanhBF16Op(TestMKLDNNSigmoidBF16Op): def config(self): self.op_type = "gelu" - self.op_func = lambda x: gelu(x, True) - self.op_grad_func = lambda dout, x: (dout * 0.5 * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3)))) * (1 + np.sqrt(2 / np.pi) * (x + 0.134145 * np.power(x, 3)) * (1 - np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3)))))) + + def op_func(self, x): + return gelu(x, True) + + def op_grad_func(self, dout, x): + grad_part = np.tanh( + np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))) + return dout * 0.5 * (1 + grad_part) * (1 + np.sqrt(2 / np.pi) * + (x + 0.134145 * np.power(x, 3)) * + (1 - grad_part)) def set_attrs(self): self.attrs = {"use_mkldnn": True, "approximate": True} From e89f92b1c0856e2742bf0e2a8ed775f528cf60f6 Mon Sep 17 00:00:00 2001 From: Jakub Piasecki Date: Wed, 21 Jul 2021 22:39:50 +0200 Subject: [PATCH 4/4] changed function names --- .../mkldnn/test_activation_mkldnn_op.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py index 1bcf0388e5b765..7c73eda2ca8fbe 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py @@ -85,11 +85,11 @@ class TestMKLDNNSigmoidBF16Op(TestActivation): def config(self): self.op_type = "sigmoid" - def op_func(self, x): + def op_forward(self, x): return 1 / (1 + np.exp(-x)) - def op_grad_func(self, dout, x): - return dout * self.op_func(x) * (1 - self.op_func(x)) + def op_grad(self, dout, x): + return dout * self.op_forward(x) * (1 - self.op_forward(x)) def set_attrs(self): self.attrs = {"use_mkldnn": True} @@ -101,14 +101,14 @@ def setUp(self): self.dtype = np.uint16 self.init_data() self.config() - self.out = self.op_func(self.x) + self.out = self.op_forward(self.x) self.inputs = {'X': convert_float_to_uint16(self.x)} self.outputs = {'Out': self.out} self.set_attrs() def calculate_grads(self): - self.dx = self.op_grad_func(self.out, self.x) + self.dx = self.op_grad(self.out, self.x) def test_check_output(self): self.check_output_with_place(core.CPUPlace()) @@ -126,10 +126,10 @@ class TestMKLDNNGeluErfBF16Op(TestMKLDNNSigmoidBF16Op): def config(self): self.op_type = "gelu" - def op_func(self, x): + def op_forward(self, x): return gelu(x, False) - def op_grad_func(self, dout, x): + def op_grad(self, dout, x): return (dout * (0.5 + 0.5 * erf(x / np.sqrt(2)) + (x / np.sqrt(2 * np.pi) * np.exp(-0.5 * np.power(x, 2))))) @@ -144,10 +144,10 @@ class TestMKLDNNGeluTanhBF16Op(TestMKLDNNSigmoidBF16Op): def config(self): self.op_type = "gelu" - def op_func(self, x): + def op_forward(self, x): return gelu(x, True) - def op_grad_func(self, dout, x): + def op_grad(self, dout, x): grad_part = np.tanh( np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))) return dout * 0.5 * (1 + grad_part) * (1 + np.sqrt(2 / np.pi) *
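
Reviewer note, outside the patch series above: the user_defined_grads fed to check_grad_with_place are hand-derived formulas, so a quick independent check is useful. The sketch below assumes only NumPy and SciPy; the helper names (sigmoid_fwd, gelu_erf_grad, numeric_grad, and so on) are illustrative and do not exist in Paddle, and the forward functions restate the standard sigmoid/GELU definitions instead of importing Paddle's gelu helper. It compares each op_grad formula from the tests against a central finite difference of the matching forward pass, with dout fixed to ones so the gradient reduces to the plain derivative.

import numpy as np
from scipy.special import erf


def sigmoid_fwd(x):
    return 1 / (1 + np.exp(-x))


def sigmoid_grad(dout, x):
    # Same formula as TestMKLDNNSigmoidBF16Op.op_grad in the patch.
    return dout * sigmoid_fwd(x) * (1 - sigmoid_fwd(x))


def gelu_erf_fwd(x):
    # Standard erf-based GELU (the non-approximate variant).
    return 0.5 * x * (1 + erf(x / np.sqrt(2)))


def gelu_erf_grad(dout, x):
    # Same formula as TestMKLDNNGeluErfBF16Op.op_grad in the patch.
    return dout * (0.5 + 0.5 * erf(x / np.sqrt(2)) +
                   x / np.sqrt(2 * np.pi) * np.exp(-0.5 * np.power(x, 2)))


def gelu_tanh_fwd(x):
    # Standard tanh-approximated GELU (the approximate variant).
    return 0.5 * x * (1 + np.tanh(
        np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))))


def gelu_tanh_grad(dout, x):
    # Same formula as TestMKLDNNGeluTanhBF16Op.op_grad in the patch.
    grad_part = np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3)))
    return dout * 0.5 * (1 + grad_part) * (1 + np.sqrt(2 / np.pi) *
                                           (x + 0.134145 * np.power(x, 3)) *
                                           (1 - grad_part))


def numeric_grad(fwd, x, eps=1e-4):
    # Central finite difference; valid because every op here is element-wise.
    return (fwd(x + eps) - fwd(x - eps)) / (2 * eps)


if __name__ == "__main__":
    x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype(np.float64)
    dout = np.ones_like(x)
    checks = [("sigmoid", sigmoid_fwd, sigmoid_grad),
              ("gelu (erf)", gelu_erf_fwd, gelu_erf_grad),
              ("gelu (tanh)", gelu_tanh_fwd, gelu_tanh_grad)]
    for name, fwd, grad in checks:
        np.testing.assert_allclose(
            grad(dout, x), numeric_grad(fwd, x), rtol=1e-5, atol=1e-6)
        print(name, "analytic gradient matches finite differences")

With float64 inputs and eps = 1e-4 the finite-difference error is roughly of order eps squared, so the tolerances above leave a comfortable margin over the hand-derived gradients used in the BF16 tests.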