PaddlePaddle · zhupengyang · Dec 27, 2021 · Dec 14, 2021 · Dec 15, 2021 · Dec 16, 2021
@@ -5,4 +5,6 @@ endif()
 if(LITE_WITH_XTCL)
   lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
 endif()
-lite_cc_library(target_wrapper_xpu SRCS target_wrapper.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
+lite_cc_library(xpu_scratch SRCS xpu_scratch.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})  # named in DEPS by both targets below
+lite_cc_library(xpu_quantizer SRCS xpu_quantizer.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs} xpu_scratch)
+lite_cc_library(target_wrapper_xpu SRCS target_wrapper.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs} xpu_scratch xpu_quantizer)  # target_wrapper.h includes xpu_scratch.h and xpu_quantizer.h
@@ -20,60 +20,32 @@
 namespace paddle {
 namespace lite {
 
-void XPUScratchPad::Reserve(size_t new_size) {
-  if (new_size <= size_) {
-    return;
-  }
-  TargetWrapperXPU::Free(addr_);
-  addr_ = TargetWrapperXPU::Malloc(new_size);
-  size_ = new_size;
-}
-
-void XPUScratchPadDeleter::operator()(XPUScratchPad* sp) const {
-  TargetWrapperXPU::Free(sp->addr_);
-  delete sp;
-}
-
 XPUL3CacheBlock* TargetWrapperXPU::CreateL3CacheBlock() {
   l3_block_dict.push_back(new XPUL3CacheBlock());
   return l3_block_dict.back();
 }
 
-void* TargetWrapperXPU::Malloc(size_t size) {
-  void* ptr{nullptr};
-  if (size > 0) {
-    XPU_CALL(xpu_malloc(&ptr, size));
-  }
-  return ptr;
-}
-
-void TargetWrapperXPU::Free(void* ptr) {
-  XPU_CALL(xpu_wait());
-  XPU_CALL(xpu_free(ptr));
-}
-
 void TargetWrapperXPU::MemcpySync(void* dst,
                                   const void* src,
                                   size_t size,
                                   IoDirection dir) {
   switch (dir) {
     case IoDirection::HtoD:
-      XPU_CALL(xpu_wait());
-      XPU_CALL(xpu_memcpy(dst, src, size, XPU_HOST_TO_DEVICE));
+      XPUMemory::MemcpyHtoDSync(dst, src, size);  // confirm: keeps xpu_wait()
       break;
     case IoDirection::DtoH:
-      XPU_CALL(xpu_wait());
-      XPU_CALL(xpu_memcpy(dst, src, size, XPU_DEVICE_TO_HOST));
+      XPUMemory::MemcpyDtoHSync(dst, src, size);  // confirm: keeps xpu_wait()
       break;
     default:
       LOG(FATAL) << "Unsupported IoDirection " << static_cast<int>(dir);
   }
 }
 
-XPUScratchPadGuard TargetWrapperXPU::MallocScratchPad(size_t size) {
-  void* ptr = TargetWrapperXPU::Malloc(size);
-  CHECK(ptr) << "XPU Malloc Fail, Malloc Size is: " << size;
-  return XPUScratchPadGuard(new XPUScratchPad(ptr, size));
+template <typename Tcpu, typename Txpu>  // Tcpu: element type of cpu_data
+XPUQuantData TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight(
+    const Tcpu* cpu_data, const DDimLite& dims, bool data_transpose) {
+  CHECK(quantizer_);  // nullptr until allocated (see GetRawContext())
+  return quantizer_->quant<Tcpu, Txpu>(cpu_data, dims, data_transpose);
 }
 
 void TargetWrapperXPU::ScatterL3Cache(
@@ -167,6 +139,19 @@ void TargetWrapperXPU::FreeL3Cache() {
   }
 }
 
+template XPUQuantData  // explicit instantiation: template body is in this .cc
+TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight<float, float>(
+    const float*, const DDimLite&, bool);
+template XPUQuantData
+TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight<float, int16_t>(
+    const float*, const DDimLite&, bool);
+template XPUQuantData
+TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight<float, int8_t>(
+    const float*, const DDimLite&, bool);
+template XPUQuantData
+TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight<int8_t, int8_t>(
+    const int8_t*, const DDimLite&, bool);
+
 // xpu context
 LITE_THREAD_LOCAL xdnn::Context* TargetWrapperXPU::tls_raw_ctx_{nullptr};
 // multi encoder config
@@ -190,6 +175,8 @@ LITE_THREAD_LOCAL std::vector<XPUL3CacheBlock*> TargetWrapperXPU::l3_block_dict;
 std::mutex TargetWrapperXPU::mutex_l3_;
 // l3 planner
 LITE_THREAD_LOCAL XPUL3Planner* TargetWrapperXPU::l3_planner_{nullptr};
+// xpu quantizer; starts null, allocated lazily (see GetRawContext())
+LITE_THREAD_LOCAL XPUQuantizer* TargetWrapperXPU::quantizer_{nullptr};
 
 }  // namespace lite
 }  // namespace paddle
@@ -16,23 +16,19 @@
 
 #include <algorithm>
 #include <map>
-#include <memory>
 #include <mutex>  // NOLINT
 #include <string>
 #include <vector>
 #include "lite/backends/xpu/xpu_header_sitter.h"
 #include "lite/backends/xpu/xpu_l3_cache_block.h"
 #include "lite/backends/xpu/xpu_l3_strategy.h"
+#include "lite/backends/xpu/xpu_quantizer.h"
+#include "lite/backends/xpu/xpu_scratch.h"
+#include "lite/core/dim.h"
 #include "lite/core/target_wrapper.h"
 #include "lite/utils/log/cp_logging.h"
 #include "lite/utils/macros.h"
 
-#define XPU_CALL(func)                                        \
-  {                                                           \
-    auto e = (func);                                          \
-    CHECK_EQ(e, 0) << "XPU: (" << #func << ") returns " << e; \
-  }
-
 namespace paddle {
 namespace lite {
 
@@ -45,35 +41,28 @@ const int XPU_MAX_LOD_SEQ_LEN = 512;
 
 using TargetWrapperXPU = TargetWrapper<TARGET(kXPU)>;
 
-struct XPUScratchPad {
-  XPUScratchPad(void* addr, size_t size) : addr_(addr), size_(size) {}
-  // XXX(miaotianxiang): |size_| increases monotonically
-  void Reserve(size_t new_size);
-  void* addr_{nullptr};
-  size_t size_{0};
-};
-
-struct XPUScratchPadDeleter {
-  void operator()(XPUScratchPad* sp) const;
-};
-
-using XPUScratchPadGuard = std::unique_ptr<XPUScratchPad, XPUScratchPadDeleter>;
-
 template <>
 class TargetWrapper<TARGET(kXPU)> {
  public:
   static size_t num_devices() { return 1; }
   static size_t maximum_stream() { return 0; }
 
-  static void* Malloc(size_t size);
-  static void Free(void* ptr);
+  static void* Malloc(size_t size) { return XPUMemory::Malloc(size); }
+  static void Free(void* ptr) { XPUMemory::Free(ptr); }
 
   static void MemcpySync(void* dst,
                          const void* src,
                          size_t size,
                          IoDirection dir);
 
-  static XPUScratchPadGuard MallocScratchPad(size_t size);
+  static XPUScratchPadGuard MallocScratchPad(size_t size) {
+    return XPUMemory::MallocScratchPad(size);  // confirm: keeps CHECK(ptr)
+  }
+
+  template <typename Tcpu, typename Txpu>  // body + instantiations in the .cc
+  static XPUQuantData ConvertCPUWeightToXPUQuantWeight(const Tcpu* cpu_data,
+                                                       const DDimLite& dims,
+                                                       bool data_transpose);
 
   static xdnn::Context* GetRawContext() {
     if (tls_raw_ctx_ == nullptr) {
@@ -83,6 +72,10 @@ class TargetWrapper<TARGET(kXPU)> {
         l3_planner_ = new XPUL3Planner;
       }
       CHECK(l3_planner_);
+      if (quantizer_ == nullptr) {
+        quantizer_ = new XPUQuantizer();
+      }
+      CHECK(quantizer_);
       if (conv_autotune) {
         tls_raw_ctx_->_xpu1_conv_selector.set_autotune_loop(true);
         tls_raw_ctx_->_xpu1_conv_selector.set_inference_mode(true);
@@ -101,7 +94,7 @@ class TargetWrapper<TARGET(kXPU)> {
       }
       CHECK_LE(shared_l3_size, max_l3_size);
       if (local_gm_size > 0) {
-        VLOG(3) << "Try To Malloc Local GM Workspace Size is" << local_gm_size;
+        VLOG(3) << "Try To Malloc Local GM Workspace Size is " << local_gm_size;
         void* local_gm_ptr = nullptr;
         int ret =
             xpu_malloc(reinterpret_cast<void**>(&local_gm_ptr), local_gm_size);
@@ -166,6 +159,7 @@ class TargetWrapper<TARGET(kXPU)> {
   static void* shared_l3_ptr_;
   static std::mutex mutex_l3_;
   static LITE_THREAD_LOCAL XPUL3Planner* l3_planner_;
+  static LITE_THREAD_LOCAL XPUQuantizer* quantizer_;  // set in GetRawContext()
 };
 
 }  // namespace lite