Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion lite/backends/xpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,6 @@ endif()
# device_xpu is only declared when LITE_WITH_XTCL is enabled.
if(LITE_WITH_XTCL)
lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
endif()
# NOTE(review): target_wrapper_xpu is declared on the next line and again
# further down with two extra DEPS (xpu_scratch, xpu_quantizer). This looks
# like the before/after lines of a diff shown together -- confirm that the
# real CMakeLists.txt keeps only one declaration.
lite_cc_library(target_wrapper_xpu SRCS target_wrapper.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
# xpu_scratch is built from xpu_scratch.cc; xpu_quantizer lists it in DEPS.
lite_cc_library(xpu_scratch SRCS xpu_scratch.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
lite_cc_library(xpu_quantizer SRCS xpu_quantizer.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs} xpu_scratch)
# target_wrapper_xpu depends on both helper libraries declared above.
lite_cc_library(target_wrapper_xpu SRCS target_wrapper.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs} xpu_scratch xpu_quantizer)
58 changes: 23 additions & 35 deletions lite/backends/xpu/target_wrapper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,60 +20,32 @@
namespace paddle {
namespace lite {

// Makes sure the scratch pad holds at least |new_size| bytes. Requests that
// already fit in the current buffer are no-ops. Growing releases the current
// buffer and allocates a fresh one, so earlier contents are not carried over.
void XPUScratchPad::Reserve(size_t new_size) {
  const bool needs_growth = new_size > size_;
  if (needs_growth) {
    // Release the old buffer first, then allocate the larger replacement.
    TargetWrapperXPU::Free(addr_);
    addr_ = TargetWrapperXPU::Malloc(new_size);
    size_ = new_size;
  }
}

// Deleter for XPUScratchPadGuard: releases the buffer referenced by the
// scratch pad through TargetWrapperXPU::Free, then destroys the pad object.
void XPUScratchPadDeleter::operator()(XPUScratchPad* sp) const {
  void* const buffer = sp->addr_;
  TargetWrapperXPU::Free(buffer);
  delete sp;
}

// Creates a new XPUL3CacheBlock, appends it to l3_block_dict and returns the
// same pointer to the caller. The pointer remains stored in l3_block_dict.
XPUL3CacheBlock* TargetWrapperXPU::CreateL3CacheBlock() {
  XPUL3CacheBlock* const block = new XPUL3CacheBlock();
  l3_block_dict.push_back(block);
  return block;
}

// Allocates |size| bytes through xpu_malloc() and returns the resulting
// pointer. A zero-byte request performs no allocation and yields nullptr.
// A non-zero return code from xpu_malloc() trips the CHECK inside XPU_CALL.
void* TargetWrapperXPU::Malloc(size_t size) {
  if (size == 0) {
    return nullptr;
  }
  void* device_ptr = nullptr;
  XPU_CALL(xpu_malloc(&device_ptr, size));
  return device_ptr;
}

// Releases the memory at |ptr| via xpu_free(). xpu_wait() is called first,
// and both calls go through XPU_CALL, so a non-zero return code from either
// one trips its CHECK.
// NOTE(review): the wait is presumably there so pending device work finishes
// before the buffer is released -- confirm against the XPU runtime docs.
void TargetWrapperXPU::Free(void* ptr) {
XPU_CALL(xpu_wait());
XPU_CALL(xpu_free(ptr));
}

// Copies |size| bytes from |src| to |dst| in the direction selected by |dir|
// (IoDirection::HtoD or IoDirection::DtoH). Any other direction is reported
// through LOG(FATAL) together with its integer value.
// NOTE(review): as shown here each supported case first runs
// xpu_wait() + xpu_memcpy() and then also calls the matching
// XPUMemory::Memcpy*Sync helper, i.e. the copy appears to be issued twice.
// This looks like removed and added diff lines displayed together -- confirm
// which of the two forms the real file contains.
void TargetWrapperXPU::MemcpySync(void* dst,
const void* src,
size_t size,
IoDirection dir) {
switch (dir) {
case IoDirection::HtoD:
XPU_CALL(xpu_wait());
XPU_CALL(xpu_memcpy(dst, src, size, XPU_HOST_TO_DEVICE));
XPUMemory::MemcpyHtoDSync(dst, src, size);
break;
case IoDirection::DtoH:
XPU_CALL(xpu_wait());
XPU_CALL(xpu_memcpy(dst, src, size, XPU_DEVICE_TO_HOST));
XPUMemory::MemcpyDtoHSync(dst, src, size);
break;
default:
LOG(FATAL) << "Unsupported IoDirection " << static_cast<int>(dir);
}
}

// Allocates |size| bytes with TargetWrapperXPU::Malloc, CHECKs that the
// returned pointer is non-null, and wraps it in an XPUScratchPadGuard.
// NOTE(review): no closing brace for this function is visible before the
// template below begins; this looks like removed and added diff lines shown
// together -- confirm against the real file.
XPUScratchPadGuard TargetWrapperXPU::MallocScratchPad(size_t size) {
void* ptr = TargetWrapperXPU::Malloc(size);
CHECK(ptr) << "XPU Malloc Fail, Malloc Size is: " << size;
return XPUScratchPadGuard(new XPUScratchPad(ptr, size));
// CHECKs that quantizer_ is set, then forwards |cpu_data|, |dims| and
// |data_transpose| to quantizer_->quant<Tcpu, Txpu>() and returns its result.
template <typename Tcpu, typename Txpu>
XPUQuantData TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight(
const Tcpu* cpu_data, const DDimLite& dims, bool data_transpose) {
CHECK(quantizer_.get());
return quantizer_->quant<Tcpu, Txpu>(cpu_data, dims, data_transpose);
}

void TargetWrapperXPU::ScatterL3Cache(
Expand Down Expand Up @@ -167,6 +139,19 @@ void TargetWrapperXPU::FreeL3Cache() {
}
}

// Explicit instantiations of ConvertCPUWeightToXPUQuantWeight for the
// <Tcpu, Txpu> pairs <float, float>, <float, int16_t>, <float, int8_t> and
// <int8_t, int8_t>. The header only declares the member template, so these
// are presumably the combinations other translation units can link against.
template XPUQuantData
TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight<float, float>(
const float*, const DDimLite&, bool);
template XPUQuantData
TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight<float, int16_t>(
const float*, const DDimLite&, bool);
template XPUQuantData
TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight<float, int8_t>(
const float*, const DDimLite&, bool);
template XPUQuantData
TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight<int8_t, int8_t>(
const int8_t*, const DDimLite&, bool);

// xpu context
LITE_THREAD_LOCAL xdnn::Context* TargetWrapperXPU::tls_raw_ctx_{nullptr};
// multi encoder config
Expand All @@ -190,6 +175,9 @@ LITE_THREAD_LOCAL std::vector<XPUL3CacheBlock*> TargetWrapperXPU::l3_block_dict;
std::mutex TargetWrapperXPU::mutex_l3_;
// l3 planner
LITE_THREAD_LOCAL XPUL3Planner* TargetWrapperXPU::l3_planner_{nullptr};
// xpu quantizer: declared LITE_THREAD_LOCAL and starts out null.
// ConvertCPUWeightToXPUQuantWeight CHECKs that it is non-null before use.
LITE_THREAD_LOCAL std::shared_ptr<XPUQuantizer> TargetWrapperXPU::quantizer_{
nullptr};

} // namespace lite
} // namespace paddle
43 changes: 19 additions & 24 deletions lite/backends/xpu/target_wrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,13 @@
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/backends/xpu/xpu_l3_cache_block.h"
#include "lite/backends/xpu/xpu_l3_strategy.h"
#include "lite/backends/xpu/xpu_quantizer.h"
#include "lite/backends/xpu/xpu_scratch.h"
#include "lite/core/dim.h"
#include "lite/core/target_wrapper.h"
#include "lite/utils/log/cp_logging.h"
#include "lite/utils/macros.h"

// Evaluates |func| exactly once and CHECK-fails, reporting the stringified
// expression and its return value, when the result is not 0.
// The body is wrapped in do { ... } while (0) so the macro expands to a
// single statement: `if (cond) XPU_CALL(f()); else ...` now parses as
// intended, whereas a bare `{ ... };` expansion would detach the `else`.
#define XPU_CALL(func)                                        \
  do {                                                        \
    auto e = (func);                                          \
    CHECK_EQ(e, 0) << "XPU: (" << #func << ") returns " << e; \
  } while (0)

namespace paddle {
namespace lite {

Expand All @@ -45,35 +42,28 @@ const int XPU_MAX_LOD_SEQ_LEN = 512;

using TargetWrapperXPU = TargetWrapper<TARGET(kXPU)>;

// Pairs a buffer address with its size in bytes. The struct declares no
// destructor; releasing the buffer is done by XPUScratchPadDeleter.
struct XPUScratchPad {
// Stores |addr| and |size| as given; performs no allocation itself.
XPUScratchPad(void* addr, size_t size) : addr_(addr), size_(size) {}
// XXX(miaotianxiang): |size_| increases monotonically
// Grows the buffer when |new_size| exceeds |size_|; smaller or equal
// requests leave the pad untouched. Growing frees the old buffer and
// allocates a new one, so existing contents are not copied over.
void Reserve(size_t new_size);
// Address of the buffer; nullptr by default.
void* addr_{nullptr};
// Size of the buffer in bytes; 0 by default.
size_t size_{0};
};

// Custom deleter for XPUScratchPad pointers; the call operator is defined
// out of line.
struct XPUScratchPadDeleter {
void operator()(XPUScratchPad* sp) const;
};

// Owning handle for an XPUScratchPad: a std::unique_ptr that invokes
// XPUScratchPadDeleter when the handle is destroyed or reset.
using XPUScratchPadGuard = std::unique_ptr<XPUScratchPad, XPUScratchPadDeleter>;

template <>
class TargetWrapper<TARGET(kXPU)> {
public:
static size_t num_devices() { return 1; }
static size_t maximum_stream() { return 0; }

static void* Malloc(size_t size);
static void Free(void* ptr);
static void* Malloc(size_t size) { return XPUMemory::Malloc(size); }
static void Free(void* ptr) { XPUMemory::Free(ptr); }

static void MemcpySync(void* dst,
const void* src,
size_t size,
IoDirection dir);

static XPUScratchPadGuard MallocScratchPad(size_t size);
static XPUScratchPadGuard MallocScratchPad(size_t size) {
return XPUMemory::MallocScratchPad(size);
}

template <typename Tcpu, typename Txpu>
static XPUQuantData ConvertCPUWeightToXPUQuantWeight(const Tcpu* cpu_data,
const DDimLite& dims,
bool data_transpose);

static xdnn::Context* GetRawContext() {
if (tls_raw_ctx_ == nullptr) {
Expand All @@ -83,6 +73,10 @@ class TargetWrapper<TARGET(kXPU)> {
l3_planner_ = new XPUL3Planner;
}
CHECK(l3_planner_);
if (quantizer_.get() == nullptr) {
quantizer_.reset(new XPUQuantizer());
}
CHECK(quantizer_.get());
if (conv_autotune) {
tls_raw_ctx_->_xpu1_conv_selector.set_autotune_loop(true);
tls_raw_ctx_->_xpu1_conv_selector.set_inference_mode(true);
Expand All @@ -101,7 +95,7 @@ class TargetWrapper<TARGET(kXPU)> {
}
CHECK_LE(shared_l3_size, max_l3_size);
if (local_gm_size > 0) {
VLOG(3) << "Try To Malloc Local GM Workspace Size is" << local_gm_size;
VLOG(3) << "Try To Malloc Local GM Workspace Size is " << local_gm_size;
void* local_gm_ptr = nullptr;
int ret =
xpu_malloc(reinterpret_cast<void**>(&local_gm_ptr), local_gm_size);
Expand Down Expand Up @@ -166,6 +160,7 @@ class TargetWrapper<TARGET(kXPU)> {
static void* shared_l3_ptr_;
static std::mutex mutex_l3_;
static LITE_THREAD_LOCAL XPUL3Planner* l3_planner_;
static LITE_THREAD_LOCAL std::shared_ptr<XPUQuantizer> quantizer_;
};

} // namespace lite
Expand Down
Loading