Skip to content

Commit b751514

Browse files
committed
Add bfloat16 support for square, sin, cos and elementwise_pow kernels on XPU
1 parent 9952846 commit b751514

5 files changed

Lines changed: 94 additions & 82 deletions

File tree

paddle/phi/backends/xpu/xpu3_op_list.cc

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -293,7 +293,9 @@ XPUOpMap& get_kl3_ops() {
293293
phi::DataType::INT32,
294294
phi::DataType::INT64})},
295295
{"elementwise_pow",
296-
XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
296+
XPUKernelSet({phi::DataType::FLOAT32,
297+
phi::DataType::FLOAT16,
298+
phi::DataType::BFLOAT16})},
297299
{"elementwise_sub_grad",
298300
XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
299301
{"elementwise_sub",
@@ -885,7 +887,9 @@ XPUOpMap& get_kl3_ops() {
885887
{"square_grad",
886888
XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
887889
{"square",
888-
XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
890+
XPUKernelSet({phi::DataType::FLOAT32,
891+
phi::DataType::FLOAT16,
892+
phi::DataType::BFLOAT16})},
889893
{"squared_l2_norm",
890894
XPUKernelSet({phi::DataType::FLOAT32,
891895
phi::DataType::FLOAT16,
@@ -1136,9 +1140,15 @@ XPUOpMap& get_kl3_ops() {
11361140
phi::DataType::FLOAT32,
11371141
phi::DataType::FLOAT16,
11381142
phi::DataType::BFLOAT16})},
1139-
{"sin", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
1143+
{"sin",
1144+
XPUKernelSet({phi::DataType::FLOAT32,
1145+
phi::DataType::FLOAT16,
1146+
phi::DataType::BFLOAT16})},
11401147
{"sin_grad", XPUKernelSet({phi::DataType::FLOAT32})},
1141-
{"cos", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
1148+
{"cos",
1149+
XPUKernelSet({phi::DataType::FLOAT32,
1150+
phi::DataType::FLOAT16,
1151+
phi::DataType::BFLOAT16})},
11421152
{"cos_grad", XPUKernelSet({phi::DataType::FLOAT32})},
11431153
{"linspace",
11441154
XPUKernelSet({phi::DataType::FLOAT32,

paddle/phi/kernels/xpu/activation_kernel.cc

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -624,19 +624,34 @@ PD_REGISTER_KERNEL(sqrt,
624624
PD_REGISTER_KERNEL(
625625
tanh, XPU, ALL_LAYOUT, phi::TanhKernel, float, phi::dtype::float16) {}
626626

627-
PD_REGISTER_KERNEL(
628-
square, XPU, ALL_LAYOUT, phi::SquareKernel, float, phi::dtype::float16) {}
627+
PD_REGISTER_KERNEL(square,
628+
XPU,
629+
ALL_LAYOUT,
630+
phi::SquareKernel,
631+
float,
632+
phi::dtype::float16,
633+
phi::dtype::bfloat16) {}
629634

630635
PD_REGISTER_KERNEL(
631636
log, XPU, ALL_LAYOUT, phi::LogKernel, float, phi::dtype::float16) {}
632637

633638
PD_REGISTER_KERNEL(
634639
relu6, XPU, ALL_LAYOUT, phi::Relu6Kernel, float, phi::dtype::float16) {}
635640

636-
PD_REGISTER_KERNEL(
637-
sin, XPU, ALL_LAYOUT, phi::SinKernel, float, phi::dtype::float16) {}
638-
PD_REGISTER_KERNEL(
639-
cos, XPU, ALL_LAYOUT, phi::CosKernel, float, phi::dtype::float16) {}
641+
PD_REGISTER_KERNEL(sin,
642+
XPU,
643+
ALL_LAYOUT,
644+
phi::SinKernel,
645+
float,
646+
phi::dtype::float16,
647+
phi::dtype::bfloat16) {}
648+
PD_REGISTER_KERNEL(cos,
649+
XPU,
650+
ALL_LAYOUT,
651+
phi::CosKernel,
652+
float,
653+
phi::dtype::float16,
654+
phi::dtype::bfloat16) {}
640655

641656
#define PD_REGISTER_ACTIVATION_KERNEL(name, func) \
642657
PD_REGISTER_KERNEL(name, XPU, ALL_LAYOUT, phi::func, float) {}

paddle/phi/kernels/xpu/elementwise_kernel.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,4 +114,5 @@ PD_REGISTER_KERNEL(elementwise_pow,
114114
ALL_LAYOUT,
115115
phi::ElementwisePowKernel,
116116
float,
117-
phi::dtype::float16) {}
117+
phi::dtype::float16,
118+
phi::dtype::bfloat16) {}

test/xpu/test_activation_op_xpu.py

Lines changed: 31 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -521,34 +521,39 @@ def set_case(self):
521521
self.op_type = "square"
522522
self.dtype = self.in_type
523523
self.init_config()
524+
if self.dtype == np.uint16:
525+
# bfloat16 actually
526+
self.x = convert_float_to_uint16(self.tmp_x)
527+
else:
528+
self.x = self.tmp_x.astype(self.dtype)
524529
out = np.square(self.x)
525530

526531
self.attrs = {'use_xpu': True}
527532
self.inputs = {'X': OpTest.np_dtype_to_base_dtype(self.x)}
528533
self.outputs = {'Out': out}
529534

530535
def init_config(self):
531-
self.x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
536+
self.tmp_x = np.random.uniform(-1, 1, [11, 17])
532537

533538
class XPUTestSquare_ZeroDim(XPUTestSquare):
534539
def init_config(self):
535-
self.x = np.random.uniform(-2, 2, []).astype(self.dtype)
540+
self.tmp_x = np.random.uniform(-2, 2, [])
536541

537542
class XPUTestSquare2(XPUTestSquare):
538543
def init_config(self):
539-
self.x = np.random.uniform(-2, 2, [100]).astype(self.dtype)
544+
self.tmp_x = np.random.uniform(-2, 2, [100])
540545

541546
class XPUTestSquare3(XPUTestSquare):
542547
def init_config(self):
543-
self.x = np.random.uniform(-2, 2, [1, 15, 19]).astype(self.dtype)
548+
self.tmp_x = np.random.uniform(-2, 2, [1, 15, 19])
544549

545550
class XPUTestSquare4(XPUTestSquare):
546551
def init_config(self):
547-
self.x = np.random.uniform(-2, 2, [100, 10]).astype(self.dtype)
552+
self.tmp_x = np.random.uniform(-2, 2, [100, 10])
548553

549554
class XPUTestSquare5(XPUTestSquare):
550555
def init_config(self):
551-
self.x = np.random.uniform(-2, 2, [1, 2, 5, 17]).astype(self.dtype)
556+
self.tmp_x = np.random.uniform(-2, 2, [1, 2, 5, 17])
552557

553558

554559
support_types = get_xpu_op_support_types('square')
@@ -1297,38 +1302,35 @@ def set_case(self):
12971302
self.dtype = self.in_type
12981303

12991304
self.init_config()
1305+
if self.dtype == np.uint16:
1306+
# bfloat16 actually
1307+
self.x = convert_float_to_uint16(self.tmp_x)
1308+
else:
1309+
self.x = self.tmp_x.astype(self.dtype)
13001310
out = np.sin(self.x)
13011311

13021312
self.inputs = {'X': self.x}
13031313
self.outputs = {'Out': out}
13041314
self.attrs = {'use_xpu': True}
13051315

13061316
def init_config(self):
1307-
self.x = np.random.uniform(-np.pi, np.pi, [11, 17]).astype(
1308-
self.dtype
1309-
)
1317+
self.tmp_x = np.random.uniform(-np.pi, np.pi, [11, 17])
13101318

13111319
class XPUTestSin_ZeroDim(XPUTestSinBase):
13121320
def init_config(self):
1313-
self.x = np.random.uniform(-np.pi, np.pi, []).astype(self.dtype)
1321+
self.tmp_x = np.random.uniform(-np.pi, np.pi, [])
13141322

13151323
class XPUTestSin2(XPUTestSinBase):
13161324
def init_config(self):
1317-
self.x = np.random.uniform(-np.pi, np.pi, [1024, 8]).astype(
1318-
self.dtype
1319-
)
1325+
self.tmp_x = np.random.uniform(-np.pi, np.pi, [1024, 8])
13201326

13211327
class XPUTestSin3(XPUTestSinBase):
13221328
def init_config(self):
1323-
self.x = np.random.uniform(-np.pi, np.pi, [4, 512, 15, 15]).astype(
1324-
self.dtype
1325-
)
1329+
self.tmp_x = np.random.uniform(-np.pi, np.pi, [4, 512, 15, 15])
13261330

13271331
class XPUTestSin4(XPUTestSinBase):
13281332
def init_config(self):
1329-
self.x = np.random.uniform(-np.pi, np.pi, [4, 256, 22, 22]).astype(
1330-
self.dtype
1331-
)
1333+
self.tmp_x = np.random.uniform(-np.pi, np.pi, [4, 256, 22, 22])
13321334

13331335

13341336
support_types = get_xpu_op_support_types('sin')
@@ -1347,38 +1349,35 @@ def set_case(self):
13471349
self.dtype = self.in_type
13481350

13491351
self.init_config()
1352+
if self.dtype == np.uint16:
1353+
# bfloat16 actually
1354+
self.x = convert_float_to_uint16(self.tmp_x)
1355+
else:
1356+
self.x = self.tmp_x.astype(self.dtype)
13501357
out = np.cos(self.x)
13511358

13521359
self.inputs = {'X': self.x}
13531360
self.outputs = {'Out': out}
13541361
self.attrs = {'use_xpu': True}
13551362

13561363
def init_config(self):
1357-
self.x = np.random.uniform(-np.pi, np.pi, [11, 17]).astype(
1358-
self.dtype
1359-
)
1364+
self.tmp_x = np.random.uniform(-np.pi, np.pi, [11, 17])
13601365

13611366
class XPUTestCos_ZeroDim(XPUTestCosBase):
13621367
def init_config(self):
1363-
self.x = np.random.uniform(-np.pi, np.pi, []).astype(self.dtype)
1368+
self.tmp_x = np.random.uniform(-np.pi, np.pi, [])
13641369

13651370
class XPUTestCos2(XPUTestCosBase):
13661371
def init_config(self):
1367-
self.x = np.random.uniform(-np.pi, np.pi, [1024, 8]).astype(
1368-
self.dtype
1369-
)
1372+
self.tmp_x = np.random.uniform(-np.pi, np.pi, [1024, 8])
13701373

13711374
class XPUTestCos3(XPUTestCosBase):
13721375
def init_config(self):
1373-
self.x = np.random.uniform(-np.pi, np.pi, [4, 512, 15, 15]).astype(
1374-
self.dtype
1375-
)
1376+
self.tmp_x = np.random.uniform(-np.pi, np.pi, [4, 512, 15, 15])
13761377

13771378
class XPUTestCos4(XPUTestCosBase):
13781379
def init_config(self):
1379-
self.x = np.random.uniform(-np.pi, np.pi, [4, 256, 22, 22]).astype(
1380-
self.dtype
1381-
)
1380+
self.tmp_x = np.random.uniform(-np.pi, np.pi, [4, 256, 22, 22])
13821381

13831382

13841383
support_types = get_xpu_op_support_types('cos')

test/xpu/test_elementwise_pow_op_xpu.py

Lines changed: 26 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
create_test_class,
2121
get_xpu_op_support_types,
2222
)
23-
from op_test import OpTest, skip_check_grad_ci
23+
from op_test import OpTest, convert_float_to_uint16, skip_check_grad_ci
2424
from op_test_xpu import XPUOpTest
2525

2626
import paddle
@@ -40,73 +40,60 @@ def setUp(self):
4040
self.dtype = self.in_type
4141
self.__class__.no_need_check_grad = True
4242
self.compute_input_output()
43-
44-
def compute_input_output(self):
43+
if self.dtype == np.uint16:
44+
# bfloat16 actually
45+
self.x = convert_float_to_uint16(self.tmp_x)
46+
self.y = convert_float_to_uint16(self.tmp_y)
47+
else:
48+
self.x = self.tmp_x.astype(self.dtype)
49+
self.y = self.tmp_y.astype(self.dtype)
4550
self.inputs = {
46-
'X': np.random.uniform(1, 2, [20, 5]).astype(self.dtype),
47-
'Y': np.random.uniform(1, 2, [20, 5]).astype(self.dtype),
51+
'X': self.x,
52+
'Y': self.y,
4853
}
4954
self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
5055

56+
def compute_input_output(self):
57+
self.tmp_x = np.random.uniform(1, 2, [20, 5])
58+
self.tmp_y = np.random.uniform(1, 2, [20, 5])
59+
5160
def test_check_output(self):
5261
if paddle.is_compiled_with_xpu():
5362
place = paddle.XPUPlace(0)
5463
self.check_output_with_place(place, check_dygraph=False)
5564

5665
class TestElementwisePowOp_big_shape_1(TestElementwisePowOp):
5766
def compute_input_output(self):
58-
self.inputs = {
59-
'X': np.random.uniform(1, 2, [10, 10]).astype(self.dtype),
60-
'Y': np.random.uniform(0.1, 1, [10, 10]).astype(self.dtype),
61-
}
62-
self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
67+
self.tmp_x = np.random.uniform(1, 2, [10, 10])
68+
self.tmp_y = np.random.uniform(0.1, 1, [10, 10])
6369

6470
class TestElementwisePowOp_big_shape_2(TestElementwisePowOp):
6571
def compute_input_output(self):
66-
self.inputs = {
67-
'X': np.random.uniform(1, 2, [10, 10]).astype(self.dtype),
68-
'Y': np.random.uniform(0.2, 2, [10, 10]).astype(self.dtype),
69-
}
70-
self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
72+
self.tmp_x = np.random.uniform(1, 2, [10, 10])
73+
self.tmp_y = np.random.uniform(0.2, 2, [10, 10])
7174

7275
@skip_check_grad_ci(
7376
reason="[skip shape check] Use y_shape(1) to test broadcast."
7477
)
7578
class TestElementwisePowOp_scalar(TestElementwisePowOp):
7679
def compute_input_output(self):
77-
self.inputs = {
78-
'X': np.random.uniform(0.1, 1, [3, 3, 4]).astype(self.dtype),
79-
'Y': np.random.uniform(0.1, 1, [1]).astype(self.dtype),
80-
}
81-
self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
80+
self.tmp_x = np.random.uniform(0.1, 1, [3, 3, 4])
81+
self.tmp_y = np.random.uniform(0.1, 1, [1])
8282

8383
class TestElementwisePowOp_tensor(TestElementwisePowOp):
8484
def compute_input_output(self):
85-
self.inputs = {
86-
'X': np.random.uniform(0.1, 1, [100]).astype(self.dtype),
87-
'Y': np.random.uniform(1, 3, [100]).astype(self.dtype),
88-
}
89-
self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
85+
self.tmp_x = np.random.uniform(0.1, 1, [100])
86+
self.tmp_y = np.random.uniform(1, 3, [100])
9087

9188
class TestElementwisePowOp_broadcast_0(TestElementwisePowOp):
9289
def compute_input_output(self):
93-
self.inputs = {
94-
'X': np.random.uniform(0.1, 1, [2, 1, 100]).astype(self.dtype),
95-
'Y': np.random.uniform(0.1, 1, [100]).astype(self.dtype),
96-
}
97-
self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
90+
self.tmp_x = np.random.uniform(0.1, 1, [2, 1, 100])
91+
self.tmp_y = np.random.uniform(0.1, 1, [100])
9892

9993
class TestElementwisePowOp_broadcast_4(TestElementwisePowOp):
10094
def compute_input_output(self):
101-
self.inputs = {
102-
'X': np.random.uniform(0.1, 1, [2, 10, 3, 5]).astype(
103-
self.dtype
104-
),
105-
'Y': np.random.uniform(0.1, 1, [2, 10, 1, 5]).astype(
106-
self.dtype
107-
),
108-
}
109-
self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
95+
self.tmp_x = np.random.uniform(0.1, 1, [2, 10, 3, 5])
96+
self.tmp_y = np.random.uniform(0.1, 1, [2, 10, 1, 5])
11097

11198
class TestElementwisePowOpInt(OpTest):
11299
def setUp(self):

0 commit comments

Comments
 (0)