Merged
Changes from all commits (27 commits)
dc5642c  Merge fused_mt branch (penPenf28, May 7, 2024)
10e2022  Adjusted fuse_mt_int8 (penPenf28, May 8, 2024)
6bbda60  Revert attention_layer_norm.h (penPenf28, May 8, 2024)
f4b8f6d  Revert paddle/phi/kernels/fusion/gpu/fmha_ref.h (penPenf28, May 8, 2024)
1d5eed5  Add win support and refine format. (penPenf28, May 9, 2024)
7548bfc  Reformat for win. (penPenf28, May 9, 2024)
f280bc3  Removed redundant files, now only supports flash_attn_v2 and variable… (penPenf28, May 11, 2024)
0174777  Refine static_fused_ft test (penPenf28, May 13, 2024)
c610938  Refine fused_mt related testcase (penPenf28, May 14, 2024)
c49c8eb  Remove custom_adll_reduce (penPenf28, May 14, 2024)
c01e19d  Remove operator cublaslt and revert parallel test (penPenf28, May 14, 2024)
b325bb6  Refine empty seq_len (penPenf28, May 14, 2024)
fb533c1  Refine ft (penPenf28, May 14, 2024)
2733371  Refine ft_static test (penPenf28, May 15, 2024)
25057b9  Remove float32 support and static parallel ft test (penPenf28, May 15, 2024)
6f0130f  Refine type static error. (penPenf28, May 16, 2024)
5f07582  Fix doc type error (penPenf28, May 16, 2024)
bc36efe  Fuse_mt code format (penPenf28, May 16, 2024)
235c856  Remove some redundant code (penPenf28, May 17, 2024)
e4cb5c3  Remove redundant attention_layer_norm.h (penPenf28, May 17, 2024)
b4cb3aa  Remove redundant code in ft_op (penPenf28, May 17, 2024)
0480c13  Remove Redundant code and skip fuse_mt doctest (penPenf28, May 18, 2024)
9c64e8e  Remove redundant fmha_ref mmha_util and other code (penPenf28, May 19, 2024)
0445842  Remove redundant kernel (penPenf28, May 20, 2024)
bedda56  Remove redundant file (penPenf28, May 20, 2024)
137ffbc  Refine fuse_mt code (penPenf28, May 20, 2024)
6e41213  Refine cublaslt comment (penPenf28, May 21, 2024)
4 changes: 4 additions & 0 deletions paddle/fluid/inference/api/analysis_predictor.h
@@ -29,6 +29,8 @@
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/resource_manager.h"
 #include "paddle/fluid/platform/device/gpu/gpu_types.h"
+#include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/common/bfloat16.h"
 #include "paddle/utils/string/printf.h"

 #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
@@ -45,6 +47,8 @@
 #include "paddle/pir/include/core/program.h"

 namespace paddle_infer {
+using float16 = paddle::platform::float16;
+using bfloat16 = phi::dtype::bfloat16;
 namespace experimental {
 class InternalUtils;
 };
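The two using declarations above make the half-precision element types nameable as paddle_infer::float16 and paddle_infer::bfloat16, so client code does not need to include phi or platform headers itself. Below is a minimal caller-side sketch of what this enables, assuming a predictor built with this change; the output name "logits" is a placeholder and real code should query GetOutputNames().

#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

void FetchBf16Output(paddle_infer::Predictor* predictor) {
  // "logits" is a hypothetical output name; look it up via GetOutputNames().
  auto out = predictor->GetOutputHandle("logits");
  int numel = 1;
  for (int d : out->shape()) numel *= d;

  // The alias lets callers spell the element type without extra includes.
  std::vector<paddle_infer::bfloat16> host(numel);
  out->CopyToCpu(host.data());  // resolves to the bfloat16 instantiation
}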
2 changes: 2 additions & 0 deletions paddle/fluid/inference/api/api.cc
@@ -28,6 +28,8 @@ int PaddleDtypeSize(PaddleDType dtype) {
   switch (dtype) {
     case PaddleDType::FLOAT32:
       return sizeof(float);
+    case PaddleDType::BFLOAT16:
+      return sizeof(uint16_t);
     case PaddleDType::INT64:
       return sizeof(int64_t);
     case PaddleDType::INT32:
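PaddleDtypeSize is the element-size helper used when sizing feed and fetch buffers, and bfloat16 is a 16-bit format, hence sizeof(uint16_t). A hedged sketch of the kind of call site it serves, assuming PaddleDtypeSize is exported through paddle_api.h as it is used here; the helper NumBytes below is illustrative, not part of the API.

#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

// Illustrative only: total byte size of a tensor with the given shape and dtype.
size_t NumBytes(const std::vector<int>& shape, paddle::PaddleDType dtype) {
  size_t numel = std::accumulate(shape.begin(), shape.end(),
                                 static_cast<size_t>(1), std::multiplies<size_t>());
  return numel * paddle::PaddleDtypeSize(dtype);  // 2 bytes per element for BFLOAT16
}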
2 changes: 2 additions & 0 deletions paddle/fluid/inference/api/api_impl.cc
@@ -221,6 +221,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
      input_ptr = input.mutable_data<float>(ddim, place_);
    } else if (inputs[i].dtype == PaddleDType::INT32) {
      input_ptr = input.mutable_data<int32_t>(ddim, place_);
+    } else if (inputs[i].dtype == PaddleDType::BFLOAT16) {
+      input_ptr = input.mutable_data<bfloat16>(ddim, place_);
    } else {
      LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
      return false;
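With this extra branch, NativePaddlePredictor::SetFeed accepts BFLOAT16 inputs. A rough sketch of building such a feed tensor on the caller side; the input name, shape, and the assumption that bf16 values arrive as raw 16-bit words are all illustrative.

#include <cstdint>
#include <cstring>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

paddle::PaddleTensor MakeBf16Feed(const std::vector<uint16_t>& bf16_bits) {
  paddle::PaddleTensor t;
  t.name = "x";  // hypothetical input name
  t.shape = {1, static_cast<int>(bf16_bits.size())};
  t.dtype = paddle::PaddleDType::BFLOAT16;
  t.data.Resize(bf16_bits.size() * sizeof(uint16_t));
  std::memcpy(t.data.data(), bf16_bits.data(), t.data.length());
  return t;  // then: predictor->Run({t}, &outputs);
}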
1 change: 1 addition & 0 deletions paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -821,6 +821,7 @@ template void Tensor::ORTCopyToCpu<int32_t>(int32_t *data) const;
 template void Tensor::ORTCopyToCpu<uint8_t>(uint8_t *data) const;
 template void Tensor::ORTCopyToCpu<int8_t>(int8_t *data) const;
 template void Tensor::ORTCopyToCpu<float16>(float16 *data) const;
+template void Tensor::ORTCopyToCpu<bfloat16>(bfloat16 *data) const;
 #endif

 namespace experimental {
7 changes: 4 additions & 3 deletions paddle/fluid/operators/fused/attn_gemm_int8.h
@@ -16,12 +16,12 @@ limitations under the License. */

 #include <iostream>
 #include <vector>
-#include "paddle/fluid/operators/fused/cublaslt.h"
 #include "paddle/fluid/operators/fused/quant_dequant_kernel.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
+#include "paddle/phi/kernels/funcs/cublaslt.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"

 namespace paddle {
@@ -35,7 +35,8 @@ class AttnMatmulINT8 {
   AttnMatmulINT8(
       const phi::GPUContext& dev_ctx, int m, int n, int k, bool compute_bias)
       : dev_ctx_(dev_ctx), m_(m), n_(n), k_(k), compute_bias_(compute_bias) {
-    auto helper = std::make_shared<CublasLtHelper>(m, k, n);
+    auto helper = std::make_shared<phi::CublasLtHelper>(
+        m, k, n, dev_ctx.cublaslt_handle());
     helpers_.emplace_back(helper);
     gpu_config_ = std::make_unique<GpuLaunchConfig>(
         phi::backends::gpu::GetGpuLaunchConfig1D(
@@ -186,7 +187,7 @@
   int k_;  // k

   int compute_bias_;
-  std::vector<std::shared_ptr<CublasLtHelper>> helpers_;
+  std::vector<std::shared_ptr<phi::CublasLtHelper>> helpers_;
   std::unique_ptr<GpuLaunchConfig> gpu_config_;
 };
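The constructor now obtains its GEMM helper from phi::CublasLtHelper and passes in the cublasLt handle owned by the device context, rather than having the removed operator-side CublasLtHelper create its own handle. A rough sketch of how a fused kernel might instantiate the wrapper after this change; the function, dimensions, and dtype below are illustrative only and not taken from this PR.

#include "paddle/fluid/operators/fused/attn_gemm_int8.h"

// Hypothetical call site for illustration.
void BuildQkvGemm(const phi::GPUContext& dev_ctx,
                  int token_num, int hidden, int out_dim) {
  // m = rows of the activation, n = output width, k = reduction dimension.
  paddle::operators::AttnMatmulINT8<phi::dtype::float16> qkv_gemm(
      dev_ctx, /*m=*/token_num, /*n=*/out_dim, /*k=*/hidden, /*compute_bias=*/true);
  // Internally this now calls
  //   std::make_shared<phi::CublasLtHelper>(m, k, n, dev_ctx.cublaslt_handle());
  // so the int8 GEMM shares the handle managed by the GPU device context.
  (void)qkv_gemm;
}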