Merged
2 changes: 1 addition & 1 deletion cmake/external/xpu.cmake
@@ -32,7 +32,7 @@ if(NOT DEFINED XPU_XDNN_BASE_DATE)
set(XPU_XDNN_BASE_DATE "20240327")
endif()
if(NOT DEFINED XPU_XHPC_BASE_DATE)
set(XPU_XHPC_BASE_DATE "20240511")
set(XPU_XHPC_BASE_DATE "20240514")
endif()
set(XPU_XCCL_BASE_VERSION "1.2.0.5")
if(NOT DEFINED XPU_XFT_BASE_VERSION)
5 changes: 5 additions & 0 deletions paddle/phi/common/type_promotion.h
@@ -145,6 +145,11 @@ inline bool NeedTypePromotion(const std::string& op_name,
(y == phi::DataType::BFLOAT16 || y == phi::DataType::FLOAT16)) {
return false;
}
#elif defined(PADDLE_WITH_XPU)
if ((op_name == "add" || op_name == "add_") && x == DataType::FLOAT32 &&
Contributor Author
@Aurelius84 @zxcd Could you check whether the CUDA branch is missing the add_ operator? If it covered add_ as well, the XPU and CUDA branches could be merged. The original need for add_(fp32, bf16/fp16) came from the inplace add used by main_grad: https://github.com/PaddlePaddle/Paddle/pull/54415/files#diff-1c8fea91efa1fa5f00f4b766fb11723120b74ec26365724cca4be8a0792946d9R62

(y == phi::DataType::BFLOAT16 || y == phi::DataType::FLOAT16)) {
return false;
}
#endif

if ((is_support_float(x) && is_support_float(y)) ||
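On the merge question in the review comment above: if the CUDA branch also handled add_, the two preprocessor branches could in principle collapse into one shared check, roughly as sketched below. This is an illustration only; the exact CUDA guard macro and the surrounding branch layout in type_promotion.h are assumptions, not shown in this diff.

// Hypothetical merged check, not part of this PR.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU)
  if ((op_name == "add" || op_name == "add_") && x == DataType::FLOAT32 &&
      (y == phi::DataType::BFLOAT16 || y == phi::DataType::FLOAT16)) {
    return false;
  }
#endif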
50 changes: 38 additions & 12 deletions paddle/phi/kernels/xpu/elementwise_add_kernel.cc
@@ -37,18 +37,44 @@ void AddKernel(const Context& dev_ctx,
if (x.dtype() == phi::DataType::FLOAT32 &&
(y.dtype() == phi::DataType::BFLOAT16 ||
y.dtype() == phi::DataType::FLOAT16)) {
using Type = DataTypeToCppType<phi::DataType::FLOAT32>::type;
using XPUType = typename XPUTypeTrait<Type>::Type;
auto f = [](xpu::Context* ctx,
const XPUType* x,
const XPUType* y,
XPUType* z,
const std::vector<int>& xshape,
const std::vector<int>& yshape) {
return xpu::broadcast_add<XPUType>(ctx, x, y, z, xshape, yshape);
};
auto casted_y = phi::Cast<T>(dev_ctx, y, phi::DataType::FLOAT32);
XPUElementwise<Type, XPUType>(dev_ctx, x, casted_y, -1, out, f);
auto dev_version =
phi::backends::xpu::get_xpu_version(dev_ctx.GetPlace().GetDeviceId());
if (dev_version >= phi::backends::xpu::XPUVersion::XPU3 &&
x.dims() == y.dims()) {
dev_ctx.template Alloc<float>(out);

const float* x_data = x.data<float>();
float* z_data = out->data<float>();

int ret = xpu::SUCCESS;
if (y.dtype() == phi::DataType::BFLOAT16) {
using YType = DataTypeToCppType<phi::DataType::BFLOAT16>::type;
using XPUYType = typename XPUTypeTrait<YType>::Type;
auto y_data = reinterpret_cast<const XPUYType*>(y.data<YType>());
ret = xpu::add_mul_type<float, XPUYType, float>(
dev_ctx.x_context(), x_data, y_data, z_data, x.numel());
} else {
using YType = DataTypeToCppType<phi::DataType::FLOAT16>::type;
using XPUYType = typename XPUTypeTrait<YType>::Type;
auto y_data = reinterpret_cast<const XPUYType*>(y.data<YType>());
ret = xpu::add_mul_type<float, XPUYType, float>(
dev_ctx.x_context(), x_data, y_data, z_data, x.numel());
}
PADDLE_ENFORCE_XDNN_SUCCESS(ret, "add_mul_type");
Contributor
Does the error message here support the common::errors::XX error-type hint? If so, it would be good to add an explicit error type.

Contributor Author
OK, I'll look into it.

Contributor Author
The original XPUElementwise also calls PADDLE_ENFORCE_XDNN_SUCCESS, and other XPU operators use PADDLE_ENFORCE_XDNN_SUCCESS as well. As for the common::errors::XX error-message suggestion, we can do a dedicated batch change later and see whether it can be folded into the PADDLE_ENFORCE_XDNN_SUCCESS macro itself.
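For reference, a rough sketch of what an explicit error type could look like at this call site, assuming common::errors::External is available in this file; the PR itself keeps PADDLE_ENFORCE_XDNN_SUCCESS for consistency with other XPU kernels.

// Hypothetical alternative to PADDLE_ENFORCE_XDNN_SUCCESS, not part of this PR.
PADDLE_ENFORCE_EQ(
    ret,
    xpu::SUCCESS,
    common::errors::External(
        "XPU API add_mul_type returned error code %d.", ret));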

} else {
using Type = DataTypeToCppType<phi::DataType::FLOAT32>::type;
using XPUType = typename XPUTypeTrait<Type>::Type;
auto f = [](xpu::Context* ctx,
const XPUType* x,
const XPUType* y,
XPUType* z,
const std::vector<int>& xshape,
const std::vector<int>& yshape) {
return xpu::broadcast_add<XPUType>(ctx, x, y, z, xshape, yshape);
};
auto casted_y = phi::Cast<T>(dev_ctx, y, phi::DataType::FLOAT32);
XPUElementwise<Type, XPUType>(dev_ctx, x, casted_y, -1, out, f);
}
} else {
using XPUType = typename XPUTypeTrait<T>::Type;

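As a side note on the new fast path (not something this PR changes): the BF16 and FP16 branches above differ only in the element type of y, so a small helper template could factor out the duplication. A rough sketch, assuming the same headers and type traits already used by this kernel:

// Hypothetical helper, not part of this PR. YCppType is the host-side element
// type of y, e.g. DataTypeToCppType<phi::DataType::BFLOAT16>::type.
template <typename YCppType>
int AddFp32WithLowPrecisionY(xpu::Context* ctx,
                             const float* x_data,
                             const phi::DenseTensor& y,
                             float* z_data,
                             int64_t numel) {
  using XPUYType = typename XPUTypeTrait<YCppType>::Type;
  auto y_data = reinterpret_cast<const XPUYType*>(y.data<YCppType>());
  return xpu::add_mul_type<float, XPUYType, float>(
      ctx, x_data, y_data, z_data, numel);
}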
8 changes: 6 additions & 2 deletions python/paddle/base/framework.py
@@ -8246,8 +8246,8 @@ def add_cast_for_type_promotion(op, block, idx, var_name, out_dtype):


def can_skip_promote(op, device):
# Only GPU elementwise_add kernel supports the pattern "float + half".
if device != 'GPU':
# Only GPU/XPU elementwise_add kernel supports the pattern "float + half".
if device not in ['GPU', 'XPU']:
return False
if op.type != "elementwise_add":
return False
@@ -8268,6 +8268,10 @@ def process_type_promotion(program):
_current_expected_place(), core.CUDAPlace
):
device = 'GPU'
elif core.is_compiled_with_xpu() and isinstance(
_current_expected_place(), core.XPUPlace
):
device = 'XPU'
org_program = program
if program is None:
program = default_main_program()
5 changes: 4 additions & 1 deletion test/xpu/test_elementwise_add_op_xpu.py
@@ -317,7 +317,10 @@ def _float32_bfloat16_or_float16_add(self, y_dtype):
val_range = 10000
shapes = []
for i in range(test_num):
shape = [np.random.randint(val_range), np.random.randint(val_range)]
shape = [
np.random.randint(1, val_range),
np.random.randint(1, val_range),
]
shapes.append(shape)

for i, shape in enumerate(shapes):