Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion lite/backends/xpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,6 @@ endif()
# device_xpu is only declared when LITE_WITH_XTCL is set.
if(LITE_WITH_XTCL)
lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
endif()
# NOTE(review): target_wrapper_xpu is declared twice in this view (here, and
# again below with extra DEPS). This looks like the removed and added sides of
# a diff rendered together -- confirm the real file keeps only one of them.
lite_cc_library(target_wrapper_xpu SRCS target_wrapper.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
# Dependency chain as declared by the DEPS lists below: xpu_quantizer links
# against xpu_scratch, and target_wrapper_xpu links against both of them.
lite_cc_library(xpu_scratch SRCS xpu_scratch.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
lite_cc_library(xpu_quantizer SRCS xpu_quantizer.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs} xpu_scratch)
lite_cc_library(target_wrapper_xpu SRCS target_wrapper.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs} xpu_scratch xpu_quantizer)
57 changes: 22 additions & 35 deletions lite/backends/xpu/target_wrapper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,60 +20,32 @@
namespace paddle {
namespace lite {

// Ensures the scratch pad is at least |new_size| large.
//
// A request that fits in the current allocation is a no-op, so |size_| never
// shrinks. Growing releases the old buffer and allocates a fresh one; the
// previous contents are not copied over.
void XPUScratchPad::Reserve(size_t new_size) {
  const bool must_grow = new_size > size_;
  if (must_grow) {
    // Release first, then allocate, exactly in this order.
    TargetWrapperXPU::Free(addr_);
    void* const fresh_addr = TargetWrapperXPU::Malloc(new_size);
    addr_ = fresh_addr;
    size_ = new_size;
  }
}

// Deleter used by XPUScratchPadGuard: frees the buffer the pad points at via
// TargetWrapperXPU::Free, then destroys the XPUScratchPad object itself.
void XPUScratchPadDeleter::operator()(XPUScratchPad* sp) const {
  void* const pad_buffer = sp->addr_;
  TargetWrapperXPU::Free(pad_buffer);
  delete sp;
}

// Heap-allocates a new XPUL3CacheBlock, appends it to |l3_block_dict| and
// returns the pointer that was just stored.
XPUL3CacheBlock* TargetWrapperXPU::CreateL3CacheBlock() {
  XPUL3CacheBlock* const new_block = new XPUL3CacheBlock();
  l3_block_dict.push_back(new_block);
  return new_block;
}

// Allocates |size| through xpu_malloc and returns the resulting pointer.
//
// A zero-sized request never reaches xpu_malloc and yields nullptr. A non-zero
// return code from xpu_malloc trips the CHECK inside XPU_CALL.
void* TargetWrapperXPU::Malloc(size_t size) {
  if (size == 0) {
    return nullptr;
  }
  void* device_ptr = nullptr;
  XPU_CALL(xpu_malloc(&device_ptr, size));
  return device_ptr;
}

// Releases |ptr| with xpu_free(), after first calling xpu_wait().
// Both calls go through XPU_CALL, so a non-zero return code from either one
// trips a CHECK.
// NOTE(review): the wait presumably drains pending device work before the
// buffer is released -- confirm against the XPU runtime documentation.
void TargetWrapperXPU::Free(void* ptr) {
XPU_CALL(xpu_wait());
XPU_CALL(xpu_free(ptr));
}

// Copies from |src| to |dst| in the direction selected by |dir|; |size| is
// passed through unchanged to the underlying copy routines.
// Only IoDirection::HtoD and IoDirection::DtoH are handled; any other value
// reaches LOG(FATAL).
// NOTE(review): as rendered here each case runs the raw
// xpu_wait()/xpu_memcpy() pair AND the matching XPUMemory helper, i.e. the
// data would be copied twice. This looks like the removed and added sides of
// a diff shown together -- confirm which path the real file keeps.
void TargetWrapperXPU::MemcpySync(void* dst,
const void* src,
size_t size,
IoDirection dir) {
switch (dir) {
case IoDirection::HtoD:
XPU_CALL(xpu_wait());
XPU_CALL(xpu_memcpy(dst, src, size, XPU_HOST_TO_DEVICE));
XPUMemory::MemcpyHtoDSync(dst, src, size);
break;
case IoDirection::DtoH:
XPU_CALL(xpu_wait());
XPU_CALL(xpu_memcpy(dst, src, size, XPU_DEVICE_TO_HOST));
XPUMemory::MemcpyDtoHSync(dst, src, size);
break;
default:
LOG(FATAL) << "Unsupported IoDirection " << static_cast<int>(dir);
}
}

// Allocates |size| via TargetWrapperXPU::Malloc, CHECK-fails when the returned
// pointer is null, and hands the pointer to a new XPUScratchPad owned by the
// returned XPUScratchPadGuard.
// NOTE(review): no closing brace for this body is visible before the template
// that follows; presumably two definitions from different sides of a diff are
// interleaved here -- confirm against the real file.
XPUScratchPadGuard TargetWrapperXPU::MallocScratchPad(size_t size) {
void* ptr = TargetWrapperXPU::Malloc(size);
CHECK(ptr) << "XPU Malloc Fail, Malloc Size is: " << size;
return XPUScratchPadGuard(new XPUScratchPad(ptr, size));
// CHECKs that |quantizer_| is non-null, then forwards |cpu_data|, |dims| and
// |data_transpose| to quantizer_->quant<Tcpu, Txpu>() and returns its result.
template <typename Tcpu, typename Txpu>
XPUQuantData TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight(
const Tcpu* cpu_data, const DDimLite& dims, bool data_transpose) {
CHECK(quantizer_);
return quantizer_->quant<Tcpu, Txpu>(cpu_data, dims, data_transpose);
}

void TargetWrapperXPU::ScatterL3Cache(
Expand Down Expand Up @@ -167,6 +139,19 @@ void TargetWrapperXPU::FreeL3Cache() {
}
}

// Explicit instantiations of ConvertCPUWeightToXPUQuantWeight<Tcpu, Txpu>.
// The template body lives in this .cc file rather than in the header, so each
// <Tcpu, Txpu> pair that callers use has to be instantiated here:
// <float, float>, <float, int16_t>, <float, int8_t> and <int8_t, int8_t>.
template XPUQuantData
TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight<float, float>(
const float*, const DDimLite&, bool);
template XPUQuantData
TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight<float, int16_t>(
const float*, const DDimLite&, bool);
template XPUQuantData
TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight<float, int8_t>(
const float*, const DDimLite&, bool);
template XPUQuantData
TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight<int8_t, int8_t>(
const int8_t*, const DDimLite&, bool);

// xpu context
LITE_THREAD_LOCAL xdnn::Context* TargetWrapperXPU::tls_raw_ctx_{nullptr};
// multi encoder config
Expand All @@ -190,6 +175,8 @@ LITE_THREAD_LOCAL std::vector<XPUL3CacheBlock*> TargetWrapperXPU::l3_block_dict;
std::mutex TargetWrapperXPU::mutex_l3_;
// l3 planner
LITE_THREAD_LOCAL XPUL3Planner* TargetWrapperXPU::l3_planner_{nullptr};
// xpu quantizer: starts out null; GetRawContext() in target_wrapper.h
// allocates an XPUQuantizer when it finds this pointer still null.
LITE_THREAD_LOCAL XPUQuantizer* TargetWrapperXPU::quantizer_{nullptr};

} // namespace lite
} // namespace paddle
44 changes: 19 additions & 25 deletions lite/backends/xpu/target_wrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,23 +16,19 @@

#include <algorithm>
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/backends/xpu/xpu_l3_cache_block.h"
#include "lite/backends/xpu/xpu_l3_strategy.h"
#include "lite/backends/xpu/xpu_quantizer.h"
#include "lite/backends/xpu/xpu_scratch.h"
#include "lite/core/dim.h"
#include "lite/core/target_wrapper.h"
#include "lite/utils/log/cp_logging.h"
#include "lite/utils/macros.h"

// Evaluates |func| exactly once and CHECK-fails when the result is non-zero,
// logging the stringified call together with the value it returned.
//
// The body is wrapped in do { ... } while (0) so that `XPU_CALL(f());` expands
// to a single statement. With a bare `{ ... }` block the caller's trailing
// semicolon becomes an extra empty statement, which breaks an unbraced
// `if (...) XPU_CALL(a()); else XPU_CALL(b());`.
#define XPU_CALL(func)                                        \
  do {                                                        \
    auto e = (func);                                          \
    CHECK_EQ(e, 0) << "XPU: (" << #func << ") returns " << e; \
  } while (0)

namespace paddle {
namespace lite {

Expand All @@ -45,35 +41,28 @@ const int XPU_MAX_LOD_SEQ_LEN = 512;

using TargetWrapperXPU = TargetWrapper<TARGET(kXPU)>;

// Pairs a buffer address with the size recorded for it. Reserve() (defined in
// target_wrapper.cc) reallocates the buffer through TargetWrapperXPU when a
// larger size is requested.
struct XPUScratchPad {
// Stores |addr| and |size| as given; nothing is allocated here.
XPUScratchPad(void* addr, size_t size) : addr_(addr), size_(size) {}
// XXX(miaotianxiang): |size_| increases monotonically
// Requests no larger than |size_| leave the pad untouched.
void Reserve(size_t new_size);
// Current buffer address; null until one is supplied.
void* addr_{nullptr};
// Size recorded for |addr_|.
size_t size_{0};
};

// Custom deleter for XPUScratchPadGuard. Its call operator (defined in
// target_wrapper.cc) frees sp->addr_ through TargetWrapperXPU::Free and then
// deletes |sp|.
struct XPUScratchPadDeleter {
void operator()(XPUScratchPad* sp) const;
};

// Owning handle for an XPUScratchPad; destruction runs XPUScratchPadDeleter.
using XPUScratchPadGuard = std::unique_ptr<XPUScratchPad, XPUScratchPadDeleter>;

template <>
class TargetWrapper<TARGET(kXPU)> {
public:
static size_t num_devices() { return 1; }
static size_t maximum_stream() { return 0; }

static void* Malloc(size_t size);
static void Free(void* ptr);
static void* Malloc(size_t size) { return XPUMemory::Malloc(size); }
static void Free(void* ptr) { XPUMemory::Free(ptr); }

// Copies from |src| to |dst| in the direction given by |dir|. The definition
// in target_wrapper.cc handles IoDirection::HtoD and IoDirection::DtoH and
// hits LOG(FATAL) for any other value.
static void MemcpySync(void* dst,
const void* src,
size_t size,
IoDirection dir);

static XPUScratchPadGuard MallocScratchPad(size_t size);
static XPUScratchPadGuard MallocScratchPad(size_t size) {
return XPUMemory::MallocScratchPad(size);
}

// Forwards |cpu_data|, |dims| and |data_transpose| to the XPUQuantizer held in
// |quantizer_| and returns its XPUQuantData result; CHECK-fails if
// |quantizer_| is null.
// The body lives in target_wrapper.cc, which explicitly instantiates
// <float, float>, <float, int16_t>, <float, int8_t> and <int8_t, int8_t>;
// other <Tcpu, Txpu> pairs need a matching instantiation added there.
template <typename Tcpu, typename Txpu>
static XPUQuantData ConvertCPUWeightToXPUQuantWeight(const Tcpu* cpu_data,
const DDimLite& dims,
bool data_transpose);

static xdnn::Context* GetRawContext() {
if (tls_raw_ctx_ == nullptr) {
Expand All @@ -83,6 +72,10 @@ class TargetWrapper<TARGET(kXPU)> {
l3_planner_ = new XPUL3Planner;
}
CHECK(l3_planner_);
if (quantizer_ == nullptr) {
quantizer_ = new XPUQuantizer();
}
CHECK(quantizer_);
if (conv_autotune) {
tls_raw_ctx_->_xpu1_conv_selector.set_autotune_loop(true);
tls_raw_ctx_->_xpu1_conv_selector.set_inference_mode(true);
Expand All @@ -101,7 +94,7 @@ class TargetWrapper<TARGET(kXPU)> {
}
CHECK_LE(shared_l3_size, max_l3_size);
if (local_gm_size > 0) {
VLOG(3) << "Try To Malloc Local GM Workspace Size is" << local_gm_size;
VLOG(3) << "Try To Malloc Local GM Workspace Size is " << local_gm_size;
void* local_gm_ptr = nullptr;
int ret =
xpu_malloc(reinterpret_cast<void**>(&local_gm_ptr), local_gm_size);
Expand Down Expand Up @@ -166,6 +159,7 @@ class TargetWrapper<TARGET(kXPU)> {
static void* shared_l3_ptr_;
static std::mutex mutex_l3_;
static LITE_THREAD_LOCAL XPUL3Planner* l3_planner_;
static LITE_THREAD_LOCAL XPUQuantizer* quantizer_;
};

} // namespace lite
Expand Down
Loading