Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions lite/backends/xpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ endif()
if(LITE_WITH_XTCL)
lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
endif()
lite_cc_library(xpu_quantizer SRCS xpu_quantizer.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
lite_cc_library(target_wrapper_xpu SRCS target_wrapper.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
9 changes: 9 additions & 0 deletions lite/backends/xpu/target_wrapper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,13 @@ XPUScratchPadGuard TargetWrapperXPU::MallocScratchPad(size_t size) {
return XPUScratchPadGuard(new XPUScratchPad(ptr, size));
}

// Converts (and, when Tcpu != Txpu, quantizes) a CPU-resident weight into
// XPU device memory through the process-wide quantizer. Conversion results
// are cached inside the quantizer, keyed by data pointer, shape, precision
// pair and transpose flag, so repeated calls with the same weight are cheap.
// NOTE(review): this template is defined in the .cc file, so it presumably
// relies on explicit instantiations further down in this file — confirm.
template <typename Tcpu, typename Txpu>
XPUQuantData TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight(
    const Tcpu* cpu_data, const DDimLite& dims, bool data_transpose) {
  // quantizer_ is created lazily in GetRawContext(); callers must have
  // obtained a context before converting weights.
  CHECK(quantizer_);
  return quantizer_->quant<Tcpu, Txpu>(cpu_data, dims, data_transpose);
}

void TargetWrapperXPU::ScatterL3Cache(
void* l3_ptr,
size_t l3_size,
Expand Down Expand Up @@ -190,6 +197,8 @@ LITE_THREAD_LOCAL std::vector<XPUL3CacheBlock*> TargetWrapperXPU::l3_block_dict;
std::mutex TargetWrapperXPU::mutex_l3_;
// l3 planner
LITE_THREAD_LOCAL XPUL3Planner* TargetWrapperXPU::l3_planner_{nullptr};
// xpu quantizer
LITE_THREAD_LOCAL XPUQuantizer* TargetWrapperXPU::quantizer_{nullptr};

} // namespace lite
} // namespace paddle
30 changes: 14 additions & 16 deletions lite/backends/xpu/target_wrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,15 @@

#include <algorithm>
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/backends/xpu/xpu_l3_cache_block.h"
#include "lite/backends/xpu/xpu_l3_strategy.h"
#include "lite/backends/xpu/xpu_quantizer.h"
#include "lite/backends/xpu/xpu_scratch.h"
#include "lite/core/dim.h"
#include "lite/core/target_wrapper.h"
#include "lite/utils/log/cp_logging.h"
#include "lite/utils/macros.h"
Expand All @@ -45,20 +47,6 @@ const int XPU_MAX_LOD_SEQ_LEN = 512;

using TargetWrapperXPU = TargetWrapper<TARGET(kXPU)>;

struct XPUScratchPad {
XPUScratchPad(void* addr, size_t size) : addr_(addr), size_(size) {}
// XXX(miaotianxiang): |size_| increases monotonically
void Reserve(size_t new_size);
void* addr_{nullptr};
size_t size_{0};
};

struct XPUScratchPadDeleter {
void operator()(XPUScratchPad* sp) const;
};

using XPUScratchPadGuard = std::unique_ptr<XPUScratchPad, XPUScratchPadDeleter>;

template <>
class TargetWrapper<TARGET(kXPU)> {
public:
Expand All @@ -75,6 +63,11 @@ class TargetWrapper<TARGET(kXPU)> {

static XPUScratchPadGuard MallocScratchPad(size_t size);

template <typename Tcpu, typename Txpu>
static XPUQuantData ConvertCPUWeightToXPUQuantWeight(const Tcpu* cpu_data,
const DDimLite& dims,
bool data_transpose);

static xdnn::Context* GetRawContext() {
if (tls_raw_ctx_ == nullptr) {
tls_raw_ctx_ = xdnn::create_context();
Expand All @@ -83,6 +76,10 @@ class TargetWrapper<TARGET(kXPU)> {
l3_planner_ = new XPUL3Planner;
}
CHECK(l3_planner_);
if (quantizer_ == nullptr) {
quantizer_ = new XPUQuantizer();
}
CHECK(quantizer_);
if (conv_autotune) {
tls_raw_ctx_->_xpu1_conv_selector.set_autotune_loop(true);
tls_raw_ctx_->_xpu1_conv_selector.set_inference_mode(true);
Expand All @@ -101,7 +98,7 @@ class TargetWrapper<TARGET(kXPU)> {
}
CHECK_LE(shared_l3_size, max_l3_size);
if (local_gm_size > 0) {
VLOG(3) << "Try To Malloc Local GM Workspace Size is" << local_gm_size;
VLOG(3) << "Try To Malloc Local GM Workspace Size is " << local_gm_size;
void* local_gm_ptr = nullptr;
int ret =
xpu_malloc(reinterpret_cast<void**>(&local_gm_ptr), local_gm_size);
Expand Down Expand Up @@ -166,6 +163,7 @@ class TargetWrapper<TARGET(kXPU)> {
static void* shared_l3_ptr_;
static std::mutex mutex_l3_;
static LITE_THREAD_LOCAL XPUL3Planner* l3_planner_;
static LITE_THREAD_LOCAL XPUQuantizer* quantizer_;
};

} // namespace lite
Expand Down
229 changes: 229 additions & 0 deletions lite/backends/xpu/xpu_quantizer.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/backends/xpu/xpu_quantizer.h"
#include <algorithm>
#include <string>
#include "lite/backends/xpu/math.h"

namespace paddle {
namespace lite {

// Folds the std::hash of `v` into `seed` using the classic
// boost::hash_combine mixing recipe and returns the combined seed.
template <typename T>
static inline size_t hash_combine(size_t seed, const T& v) {
  const size_t value_hash = std::hash<T>{}(v);
  return seed ^ (value_hash + 0x9e3779b9 + (seed << 6) + (seed >> 2));
}

// Builds the weight-cache key from the source pointer, element count,
// precision tag ("<cpu_type><xpu_type>") and transpose flag.
static size_t Hashed(const float* cpu_data,
                     int numel,
                     const std::string& precision,
                     bool trans) {
  size_t key = std::hash<const float*>{}(cpu_data);
  key = hash_combine(key, numel);
  key = hash_combine(key, precision);
  key = hash_combine(key, trans);
  return key;
}

template <typename T>
static inline const std::string CppTypeToString() {
return "unkown";
}
template <>
inline const std::string CppTypeToString<float>() {
return "float";
}
template <>
inline const std::string CppTypeToString<float16>() {
return "float16";
}
template <>
inline const std::string CppTypeToString<int64_t>() {
return "int64_t";
}
template <>
inline const std::string CppTypeToString<int>() {
return "int";
}
template <>
inline const std::string CppTypeToString<int16_t>() {
return "int16_t";
}
template <>
inline const std::string CppTypeToString<int8_t>() {
return "int8_t";
}

// Quantizes `numel` fp32 values from src_ptr into dst_ptr of element type
// T, using max_val as the symmetric-quantization scale. The unspecialized
// template rejects unsupported target types at runtime via CHECK.
template <typename T>
static void QuantFP32ToIntX(const float* src_ptr,
                            T* dst_ptr,
                            float max_val,
                            int numel) {
  CHECK(false) << "Not support for T is " << CppTypeToString<T>();
}
// fp32 -> fp32: no quantization needed, plain element-wise copy
// (max_val is unused here).
template <>
void QuantFP32ToIntX<float>(const float* src_ptr,
                            float* dst_ptr,
                            float max_val,
                            int numel) {
  std::copy(src_ptr, src_ptr + numel, dst_ptr);
}
// fp32 -> int16 symmetric quantization with scale max_val.
template <>
void QuantFP32ToIntX<int16_t>(const float* src_ptr,
                              int16_t* dst_ptr,
                              float max_val,
                              int numel) {
  paddle::lite::xpu::math::ConvertFP32ToInt16(src_ptr, dst_ptr, max_val, numel);
}
// fp32 -> int8 symmetric quantization with scale max_val.
template <>
void QuantFP32ToIntX<int8_t>(const float* src_ptr,
                             int8_t* dst_ptr,
                             float max_val,
                             int numel) {
  paddle::lite::xpu::math::ConvertFP32ToInt8(src_ptr, dst_ptr, max_val, numel);
}

// Fallback overload selected when the CPU weight type is NOT float.
// Quantization is only implemented from fp32 sources, so this always
// aborts with a diagnostic naming the offending type.
template <typename Tcpu,
          typename Txpu,
          typename std::enable_if<!std::is_same<Tcpu, float>::value,
                                  Tcpu>::type* ptr = nullptr>
void ConvertWithQuant(
    const Tcpu* cpu_data,
    const DDimLite& dims,
    bool data_transpose,
    std::unordered_map<size_t,
                       std::pair<XPUScratchPadGuard, XPUScratchPadGuard>>&
        weight_cache_,
    size_t hashed_key) {
  CHECK(false) << "Not support for Tcpu is " << CppTypeToString<Tcpu>();
}

// Quantizes an fp32 CPU weight to Txpu, uploads both the quantized data
// and the replicated max-abs scale to XPU, and stores the two buffers in
// weight_cache_ under hashed_key.
// Fixes: the host staging buffer was a fixed std::vector<int16_t> but was
// written through a Txpu* and copied with numel * sizeof(Txpu) — wrong
// (and non-compiling) for any Txpu other than int16_t; it is now a
// std::vector<Txpu>. The transpose buffer is only allocated when a
// transpose is actually requested, and redundant std::move on by-value
// MallocScratchPad returns was dropped.
template <typename Tcpu,
          typename Txpu,
          typename std::enable_if<std::is_same<Tcpu, float>::value, Tcpu>::type*
              ptr = nullptr>
void ConvertWithQuant(
    const Tcpu* cpu_data,
    const DDimLite& dims,
    bool data_transpose,
    std::unordered_map<size_t,
                       std::pair<XPUScratchPadGuard, XPUScratchPadGuard>>&
        weight_cache_,
    size_t hashed_key) {
  const int numel = dims.production();
  // Optionally transpose a 2-D weight before quantization.
  std::vector<Tcpu> transpose_data;
  const Tcpu* cpu_ptr = cpu_data;
  if (data_transpose) {
    CHECK(dims.size() == 2) << "Not support: dims.size = " << dims.size();
    transpose_data.resize(numel);
    paddle::lite::xpu::math::Transpose<Tcpu>(
        cpu_data, transpose_data.data(), dims[0], dims[1]);
    cpu_ptr = transpose_data.data();
  }
  // Find the absolute max, replicate it to the device-required width and
  // upload it.
  float max_val = paddle::lite::xpu::math::FindMaxAbs(cpu_ptr, numel);
  int max_ptr_size = xdnn::get_max_ptr_size(TargetWrapperXPU::GetRawContext());
  std::vector<float> max_vec(max_ptr_size, max_val);
  XPUScratchPadGuard weight_max_guard =
      TargetWrapperXPU::MallocScratchPad(max_ptr_size * sizeof(float));
  TargetWrapperXPU::MemcpySync(weight_max_guard->addr_,
                               max_vec.data(),
                               max_ptr_size * sizeof(float),
                               IoDirection::HtoD);
  // Quantize on the host into a Txpu-typed buffer, then upload.
  XPUScratchPadGuard quant_weight_guard =
      TargetWrapperXPU::MallocScratchPad(numel * sizeof(Txpu));
  std::vector<Txpu> quant_data_cpu(numel);
  QuantFP32ToIntX<Txpu>(cpu_ptr, quant_data_cpu.data(), max_val, numel);
  TargetWrapperXPU::MemcpySync(quant_weight_guard->addr_,
                               quant_data_cpu.data(),
                               numel * sizeof(Txpu),
                               IoDirection::HtoD);
  // Publish {scale buffer, quantized weight} in the cache.
  weight_cache_[hashed_key] = std::make_pair(std::move(weight_max_guard),
                                             std::move(quant_weight_guard));
}

// Uploads a CPU weight to XPU unchanged (no quantization). The cached
// max-value entry is an empty placeholder (null addr_) so the cache entry
// layout matches the quantized path.
// Fixes: the transpose scratch buffer was allocated (numel elements,
// zero-filled) even when data_transpose was false; it is now allocated
// only on demand. Redundant std::move on the by-value MallocScratchPad
// return was dropped.
template <typename T>
void ConvertWithoutQuant(
    const T* cpu_data,
    const DDimLite& dims,
    bool data_transpose,
    std::unordered_map<size_t,
                       std::pair<XPUScratchPadGuard, XPUScratchPadGuard>>&
        weight_cache_,
    size_t hashed_key) {
  const int numel = dims.production();
  // Optionally transpose a 2-D weight before upload.
  std::vector<T> transpose_data;
  const T* cpu_ptr = cpu_data;
  if (data_transpose) {
    CHECK(dims.size() == 2) << "Not support: dims.size = " << dims.size();
    transpose_data.resize(numel);
    paddle::lite::xpu::math::Transpose<T>(
        cpu_data, transpose_data.data(), dims[0], dims[1]);
    cpu_ptr = transpose_data.data();
  }
  // Copy the (possibly transposed) weight to XPU; no scale buffer needed.
  XPUScratchPadGuard weight_max_guard(new XPUScratchPad(nullptr, 0));
  XPUScratchPadGuard quant_weight_guard =
      TargetWrapperXPU::MallocScratchPad(numel * sizeof(T));
  TargetWrapperXPU::MemcpySync(
      quant_weight_guard->addr_, cpu_ptr, numel * sizeof(T), IoDirection::HtoD);
  // Publish {empty scale placeholder, raw weight} in the cache.
  weight_cache_[hashed_key] = std::make_pair(std::move(weight_max_guard),
                                             std::move(quant_weight_guard));
}

// Returns XPU device pointers {max-abs scale, quantized data} for the
// given CPU weight, converting and caching on first use. The cache key
// hashes the data pointer, element count, precision pair and transpose
// flag, so the same weight is uploaded only once per configuration.
// Fixes: the original performed three hash lookups on the hit path
// (find + two operator[]); a single iterator is reused instead.
template <typename Tcpu, typename Txpu>
XPUQuantData XPUQuantizer::quant(const Tcpu* cpu_data,
                                 const DDimLite& dims,
                                 bool data_transpose) {
  const int numel = dims.production();
  const std::string precision =
      CppTypeToString<Tcpu>() + CppTypeToString<Txpu>();
  const size_t hashed_key = Hashed(cpu_data, numel, precision, data_transpose);
  VLOG(3) << "cpu_data=" << cpu_data << ", numel=" << numel
          << ", precision=" << precision << ", transpose=" << data_transpose
          << ", hashed_key=" << hashed_key;
  auto it = weight_cache_.find(hashed_key);
  if (it == weight_cache_.end()) {
    // Cache miss: quantize only when the host and device types differ.
    const bool need_quant = !std::is_same<Tcpu, Txpu>::value;
    if (need_quant) {
      ConvertWithQuant<Tcpu, Txpu>(
          cpu_data, dims, data_transpose, weight_cache_, hashed_key);
    } else {
      ConvertWithoutQuant<Tcpu>(
          cpu_data, dims, data_transpose, weight_cache_, hashed_key);
    }
    it = weight_cache_.find(hashed_key);
    CHECK(it != weight_cache_.end());
  }
  float* max_ptr = reinterpret_cast<float*>(it->second.first->addr_);
  void* qdata_ptr = it->second.second->addr_;
  return XPUQuantData(max_ptr, qdata_ptr);
}

} // namespace lite
} // namespace paddle
48 changes: 48 additions & 0 deletions lite/backends/xpu/xpu_quantizer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <unordered_map>
#include <utility>
#include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/backends/xpu/xpu_scratch.h"
#include "lite/core/dim.h"
#include "lite/utils/macros.h"

namespace paddle {
namespace lite {

// Pair of XPU device pointers describing one cached weight: the weight
// data itself and the replicated max-abs value used as quantization
// scale. max_ptr_ is null for weights uploaded without quantization
// (their cache entry holds an empty scratch pad).
// Fix: the default constructor re-listed member initializers that the
// in-class initializers already provide; it is now defaulted.
struct XPUQuantData {
  XPUQuantData() = default;
  XPUQuantData(float* max_ptr, void* data_ptr)
      : data_ptr_(data_ptr), max_ptr_(max_ptr) {}
  void* data_ptr_{nullptr};  // weight data on XPU (quantized or raw)
  float* max_ptr_{nullptr};  // max-abs scale buffer on XPU, may be null
};

// Caches CPU-to-XPU weight conversions. quant() uploads (and, when the
// host and device element types differ, quantizes; optionally transposes)
// a weight once, then returns the cached device pointers on later calls
// with the same pointer/shape/precision/transpose combination.
class XPUQuantizer {
 public:
  // Returns the XPU-side {scale, data} pointers for cpu_data. Tcpu is the
  // host element type, Txpu the device element type; data_transpose
  // transposes a 2-D weight before conversion. Defined in the .cc file.
  template <typename Tcpu, typename Txpu>
  XPUQuantData quant(const Tcpu* cpu_data,
                     const DDimLite& dims,
                     bool data_transpose);

 private:
  // cpu data to xpu quant data
  // Key: hash of (data pointer, numel, precision string, transpose flag).
  // Value: {max-abs scale buffer, quantized/raw weight buffer} on XPU.
  std::unordered_map<size_t, std::pair<XPUScratchPadGuard, XPUScratchPadGuard>>
      weight_cache_;
};

} // namespace lite
} // namespace paddle
Loading