Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions lite/backends/xpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ endif()
if(LITE_WITH_XTCL)
lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
endif()
lite_cc_library(xpu_quantizer SRCS xpu_quantizer.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
lite_cc_library(target_wrapper_xpu SRCS target_wrapper.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
9 changes: 9 additions & 0 deletions lite/backends/xpu/target_wrapper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,13 @@ XPUScratchPadGuard TargetWrapperXPU::MallocScratchPad(size_t size) {
return XPUScratchPadGuard(new XPUScratchPad(ptr, size));
}

// Converts (and, when Tcpu != Txpu, quantizes) a CPU-resident weight into
// XPU device memory through the process-wide quantizer. Conversion results
// are cached inside the quantizer, keyed by data pointer, shape, precision
// pair and transpose flag, so repeated calls with the same weight are cheap.
// NOTE(review): this template is defined in the .cc file, so it presumably
// relies on explicit instantiations further down in this file — confirm.
template <typename Tcpu, typename Txpu>
XPUQuantData TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight(
    const Tcpu* cpu_data, const DDimLite& dims, bool data_transpose) {
  // quantizer_ is created lazily in GetRawContext(); callers must have
  // obtained a context before converting weights.
  CHECK(quantizer_);
  return quantizer_->quant<Tcpu, Txpu>(cpu_data, dims, data_transpose);
}

void TargetWrapperXPU::ScatterL3Cache(
void* l3_ptr,
size_t l3_size,
Expand Down Expand Up @@ -190,6 +197,8 @@ LITE_THREAD_LOCAL std::vector<XPUL3CacheBlock*> TargetWrapperXPU::l3_block_dict;
std::mutex TargetWrapperXPU::mutex_l3_;
// l3 planner
LITE_THREAD_LOCAL XPUL3Planner* TargetWrapperXPU::l3_planner_{nullptr};
// xpu quantizer
LITE_THREAD_LOCAL XPUQuantizer* TargetWrapperXPU::quantizer_{nullptr};

} // namespace lite
} // namespace paddle
30 changes: 14 additions & 16 deletions lite/backends/xpu/target_wrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,15 @@

#include <algorithm>
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/backends/xpu/xpu_l3_cache_block.h"
#include "lite/backends/xpu/xpu_l3_strategy.h"
#include "lite/backends/xpu/xpu_quantizer.h"
#include "lite/backends/xpu/xpu_scratch.h"
#include "lite/core/dim.h"
#include "lite/core/target_wrapper.h"
#include "lite/utils/log/cp_logging.h"
#include "lite/utils/macros.h"
Expand All @@ -45,20 +47,6 @@ const int XPU_MAX_LOD_SEQ_LEN = 512;

using TargetWrapperXPU = TargetWrapper<TARGET(kXPU)>;

struct XPUScratchPad {
XPUScratchPad(void* addr, size_t size) : addr_(addr), size_(size) {}
// XXX(miaotianxiang): |size_| increases monotonically
void Reserve(size_t new_size);
void* addr_{nullptr};
size_t size_{0};
};

struct XPUScratchPadDeleter {
void operator()(XPUScratchPad* sp) const;
};

using XPUScratchPadGuard = std::unique_ptr<XPUScratchPad, XPUScratchPadDeleter>;

template <>
class TargetWrapper<TARGET(kXPU)> {
public:
Expand All @@ -75,6 +63,11 @@ class TargetWrapper<TARGET(kXPU)> {

static XPUScratchPadGuard MallocScratchPad(size_t size);

template <typename Tcpu, typename Txpu>
static XPUQuantData ConvertCPUWeightToXPUQuantWeight(const Tcpu* cpu_data,
const DDimLite& dims,
bool data_transpose);

static xdnn::Context* GetRawContext() {
if (tls_raw_ctx_ == nullptr) {
tls_raw_ctx_ = xdnn::create_context();
Expand All @@ -83,6 +76,10 @@ class TargetWrapper<TARGET(kXPU)> {
l3_planner_ = new XPUL3Planner;
}
CHECK(l3_planner_);
if (quantizer_ == nullptr) {
quantizer_ = new XPUQuantizer();
}
CHECK(quantizer_);
if (conv_autotune) {
tls_raw_ctx_->_xpu1_conv_selector.set_autotune_loop(true);
tls_raw_ctx_->_xpu1_conv_selector.set_inference_mode(true);
Expand All @@ -101,7 +98,7 @@ class TargetWrapper<TARGET(kXPU)> {
}
CHECK_LE(shared_l3_size, max_l3_size);
if (local_gm_size > 0) {
VLOG(3) << "Try To Malloc Local GM Workspace Size is" << local_gm_size;
VLOG(3) << "Try To Malloc Local GM Workspace Size is " << local_gm_size;
void* local_gm_ptr = nullptr;
int ret =
xpu_malloc(reinterpret_cast<void**>(&local_gm_ptr), local_gm_size);
Expand Down Expand Up @@ -166,6 +163,7 @@ class TargetWrapper<TARGET(kXPU)> {
static void* shared_l3_ptr_;
static std::mutex mutex_l3_;
static LITE_THREAD_LOCAL XPUL3Planner* l3_planner_;
static LITE_THREAD_LOCAL XPUQuantizer* quantizer_;
};

} // namespace lite
Expand Down
229 changes: 229 additions & 0 deletions lite/backends/xpu/xpu_quantizer.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/backends/xpu/xpu_quantizer.h"
#include <algorithm>
#include <string>
#include "lite/backends/xpu/math.h"

namespace paddle {
namespace lite {

// Folds the std::hash of `v` into `seed` using the classic
// boost::hash_combine mixing recipe and returns the combined seed.
template <typename T>
static inline size_t hash_combine(size_t seed, const T& v) {
  const size_t value_hash = std::hash<T>{}(v);
  return seed ^ (value_hash + 0x9e3779b9 + (seed << 6) + (seed >> 2));
}

// Builds the weight-cache key from the source pointer, element count,
// precision tag ("<cpu_type><xpu_type>") and transpose flag.
static size_t Hashed(const float* cpu_data,
                     int numel,
                     const std::string& precision,
                     bool trans) {
  size_t key = std::hash<const float*>{}(cpu_data);
  key = hash_combine(key, numel);
  key = hash_combine(key, precision);
  key = hash_combine(key, trans);
  return key;
}

template <typename T>
static inline const std::string CppTypeToString() {
return "unkown";
}
template <>
inline const std::string CppTypeToString<float>() {
return "float";
}
template <>
inline const std::string CppTypeToString<float16>() {
return "float16";
}
template <>
inline const std::string CppTypeToString<int64_t>() {
return "int64_t";
}
template <>
inline const std::string CppTypeToString<int>() {
return "int";
}
template <>
inline const std::string CppTypeToString<int16_t>() {
return "int16_t";
}
template <>
inline const std::string CppTypeToString<int8_t>() {
return "int8_t";
}

// Quantizes `numel` fp32 values from src_ptr into dst_ptr of element type
// T, using max_val as the symmetric-quantization scale. The unspecialized
// template rejects unsupported target types at runtime via CHECK.
template <typename T>
static void QuantFP32ToIntX(const float* src_ptr,
                            T* dst_ptr,
                            float max_val,
                            int numel) {
  CHECK(false) << "Not support for T is " << CppTypeToString<T>();
}
// fp32 -> fp32: no quantization needed, plain element-wise copy
// (max_val is unused here).
template <>
void QuantFP32ToIntX<float>(const float* src_ptr,
                            float* dst_ptr,
                            float max_val,
                            int numel) {
  std::copy(src_ptr, src_ptr + numel, dst_ptr);
}
// fp32 -> int16 symmetric quantization with scale max_val.
template <>
void QuantFP32ToIntX<int16_t>(const float* src_ptr,
                              int16_t* dst_ptr,
                              float max_val,
                              int numel) {
  paddle::lite::xpu::math::ConvertFP32ToInt16(src_ptr, dst_ptr, max_val, numel);
}
// fp32 -> int8 symmetric quantization with scale max_val.
template <>
void QuantFP32ToIntX<int8_t>(const float* src_ptr,
                             int8_t* dst_ptr,
                             float max_val,
                             int numel) {
  paddle::lite::xpu::math::ConvertFP32ToInt8(src_ptr, dst_ptr, max_val, numel);
}

// Fallback overload selected when the CPU weight type is NOT float.
// Quantization is only implemented from fp32 sources, so this always
// aborts with a diagnostic naming the offending type.
template <typename Tcpu,
          typename Txpu,
          typename std::enable_if<!std::is_same<Tcpu, float>::value,
                                  Tcpu>::type* ptr = nullptr>
void ConvertWithQuant(
    const Tcpu* cpu_data,
    const DDimLite& dims,
    bool data_transpose,
    std::unordered_map<size_t,
                       std::pair<XPUScratchPadGuard, XPUScratchPadGuard>>&
        weight_cache_,
    size_t hashed_key) {
  CHECK(false) << "Not support for Tcpu is " << CppTypeToString<Tcpu>();
}

// Quantizes an fp32 CPU weight to Txpu, uploads both the quantized data
// and the replicated max-abs scale to XPU, and stores the two buffers in
// weight_cache_ under hashed_key.
// Fixes: the host staging buffer was a fixed std::vector<int16_t> but was
// written through a Txpu* and copied with numel * sizeof(Txpu) — wrong
// (and non-compiling) for any Txpu other than int16_t; it is now a
// std::vector<Txpu>. The transpose buffer is only allocated when a
// transpose is actually requested, and redundant std::move on by-value
// MallocScratchPad returns was dropped.
template <typename Tcpu,
          typename Txpu,
          typename std::enable_if<std::is_same<Tcpu, float>::value, Tcpu>::type*
              ptr = nullptr>
void ConvertWithQuant(
    const Tcpu* cpu_data,
    const DDimLite& dims,
    bool data_transpose,
    std::unordered_map<size_t,
                       std::pair<XPUScratchPadGuard, XPUScratchPadGuard>>&
        weight_cache_,
    size_t hashed_key) {
  const int numel = dims.production();
  // Optionally transpose a 2-D weight before quantization.
  std::vector<Tcpu> transpose_data;
  const Tcpu* cpu_ptr = cpu_data;
  if (data_transpose) {
    CHECK(dims.size() == 2) << "Not support: dims.size = " << dims.size();
    transpose_data.resize(numel);
    paddle::lite::xpu::math::Transpose<Tcpu>(
        cpu_data, transpose_data.data(), dims[0], dims[1]);
    cpu_ptr = transpose_data.data();
  }
  // Find the absolute max, replicate it to the device-required width and
  // upload it.
  float max_val = paddle::lite::xpu::math::FindMaxAbs(cpu_ptr, numel);
  int max_ptr_size = xdnn::get_max_ptr_size(TargetWrapperXPU::GetRawContext());
  std::vector<float> max_vec(max_ptr_size, max_val);
  XPUScratchPadGuard weight_max_guard =
      TargetWrapperXPU::MallocScratchPad(max_ptr_size * sizeof(float));
  TargetWrapperXPU::MemcpySync(weight_max_guard->addr_,
                               max_vec.data(),
                               max_ptr_size * sizeof(float),
                               IoDirection::HtoD);
  // Quantize on the host into a Txpu-typed buffer, then upload.
  XPUScratchPadGuard quant_weight_guard =
      TargetWrapperXPU::MallocScratchPad(numel * sizeof(Txpu));
  std::vector<Txpu> quant_data_cpu(numel);
  QuantFP32ToIntX<Txpu>(cpu_ptr, quant_data_cpu.data(), max_val, numel);
  TargetWrapperXPU::MemcpySync(quant_weight_guard->addr_,
                               quant_data_cpu.data(),
                               numel * sizeof(Txpu),
                               IoDirection::HtoD);
  // Publish {scale buffer, quantized weight} in the cache.
  weight_cache_[hashed_key] = std::make_pair(std::move(weight_max_guard),
                                             std::move(quant_weight_guard));
}

// Uploads a CPU weight to XPU unchanged (no quantization). The cached
// max-value entry is an empty placeholder (null addr_) so the cache entry
// layout matches the quantized path.
// Fixes: the transpose scratch buffer was allocated (numel elements,
// zero-filled) even when data_transpose was false; it is now allocated
// only on demand. Redundant std::move on the by-value MallocScratchPad
// return was dropped.
template <typename T>
void ConvertWithoutQuant(
    const T* cpu_data,
    const DDimLite& dims,
    bool data_transpose,
    std::unordered_map<size_t,
                       std::pair<XPUScratchPadGuard, XPUScratchPadGuard>>&
        weight_cache_,
    size_t hashed_key) {
  const int numel = dims.production();
  // Optionally transpose a 2-D weight before upload.
  std::vector<T> transpose_data;
  const T* cpu_ptr = cpu_data;
  if (data_transpose) {
    CHECK(dims.size() == 2) << "Not support: dims.size = " << dims.size();
    transpose_data.resize(numel);
    paddle::lite::xpu::math::Transpose<T>(
        cpu_data, transpose_data.data(), dims[0], dims[1]);
    cpu_ptr = transpose_data.data();
  }
  // Copy the (possibly transposed) weight to XPU; no scale buffer needed.
  XPUScratchPadGuard weight_max_guard(new XPUScratchPad(nullptr, 0));
  XPUScratchPadGuard quant_weight_guard =
      TargetWrapperXPU::MallocScratchPad(numel * sizeof(T));
  TargetWrapperXPU::MemcpySync(
      quant_weight_guard->addr_, cpu_ptr, numel * sizeof(T), IoDirection::HtoD);
  // Publish {empty scale placeholder, raw weight} in the cache.
  weight_cache_[hashed_key] = std::make_pair(std::move(weight_max_guard),
                                             std::move(quant_weight_guard));
}

// Returns XPU device pointers {max-abs scale, quantized data} for the
// given CPU weight, converting and caching on first use. The cache key
// hashes the data pointer, element count, precision pair and transpose
// flag, so the same weight is uploaded only once per configuration.
// Fixes: the original performed three hash lookups on the hit path
// (find + two operator[]); a single iterator is reused instead.
template <typename Tcpu, typename Txpu>
XPUQuantData XPUQuantizer::quant(const Tcpu* cpu_data,
                                 const DDimLite& dims,
                                 bool data_transpose) {
  const int numel = dims.production();
  const std::string precision =
      CppTypeToString<Tcpu>() + CppTypeToString<Txpu>();
  const size_t hashed_key = Hashed(cpu_data, numel, precision, data_transpose);
  VLOG(3) << "cpu_data=" << cpu_data << ", numel=" << numel
          << ", precision=" << precision << ", transpose=" << data_transpose
          << ", hashed_key=" << hashed_key;
  auto it = weight_cache_.find(hashed_key);
  if (it == weight_cache_.end()) {
    // Cache miss: quantize only when the host and device types differ.
    const bool need_quant = !std::is_same<Tcpu, Txpu>::value;
    if (need_quant) {
      ConvertWithQuant<Tcpu, Txpu>(
          cpu_data, dims, data_transpose, weight_cache_, hashed_key);
    } else {
      ConvertWithoutQuant<Tcpu>(
          cpu_data, dims, data_transpose, weight_cache_, hashed_key);
    }
    it = weight_cache_.find(hashed_key);
    CHECK(it != weight_cache_.end());
  }
  float* max_ptr = reinterpret_cast<float*>(it->second.first->addr_);
  void* qdata_ptr = it->second.second->addr_;
  return XPUQuantData(max_ptr, qdata_ptr);
}

} // namespace lite
} // namespace paddle
48 changes: 48 additions & 0 deletions lite/backends/xpu/xpu_quantizer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <unordered_map>
#include <utility>
#include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/backends/xpu/xpu_scratch.h"
#include "lite/core/dim.h"
#include "lite/utils/macros.h"

namespace paddle {
namespace lite {

// Pair of XPU device pointers describing one cached weight: the weight
// data itself and the replicated max-abs value used as quantization
// scale. max_ptr_ is null for weights uploaded without quantization
// (their cache entry holds an empty scratch pad).
// Fix: the default constructor re-listed member initializers that the
// in-class initializers already provide; it is now defaulted.
struct XPUQuantData {
  XPUQuantData() = default;
  XPUQuantData(float* max_ptr, void* data_ptr)
      : data_ptr_(data_ptr), max_ptr_(max_ptr) {}
  void* data_ptr_{nullptr};  // weight data on XPU (quantized or raw)
  float* max_ptr_{nullptr};  // max-abs scale buffer on XPU, may be null
};

// Caches CPU-to-XPU weight conversions. quant() uploads (and, when the
// host and device element types differ, quantizes; optionally transposes)
// a weight once, then returns the cached device pointers on later calls
// with the same pointer/shape/precision/transpose combination.
class XPUQuantizer {
 public:
  // Returns the XPU-side {scale, data} pointers for cpu_data. Tcpu is the
  // host element type, Txpu the device element type; data_transpose
  // transposes a 2-D weight before conversion. Defined in the .cc file.
  template <typename Tcpu, typename Txpu>
  XPUQuantData quant(const Tcpu* cpu_data,
                     const DDimLite& dims,
                     bool data_transpose);

 private:
  // cpu data to xpu quant data
  // Key: hash of (data pointer, numel, precision string, transpose flag).
  // Value: {max-abs scale buffer, quantized/raw weight buffer} on XPU.
  std::unordered_map<size_t, std::pair<XPUScratchPadGuard, XPUScratchPadGuard>>
      weight_cache_;
};

} // namespace lite
} // namespace paddle
Loading