Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions paddle/fluid/framework/tensor_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,52 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size,
stream);
}
else if (platform::is_npu_pinned_place(src_place) && // NOLINT
platform::is_npu_place(dst_place)) { /* npu_pinned->npu */
auto src_npu_pinned_place =
BOOST_GET_CONST(platform::NPUPinnedPlace, src_place);
auto dst_npu_place = BOOST_GET_CONST(platform::NPUPlace, dst_place);
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE_EQ(platform::is_npu_place(ctx_place), true,
platform::errors::PreconditionNotMet(
"Device context place mismatch. When copying Tensor "
"data from NPU Pinned memory to NPU memory, current "
"device context place should be NPU."));
auto ctx_npu_place = BOOST_GET_CONST(platform::NPUPlace, ctx_place);
PADDLE_ENFORCE_EQ(dst_npu_place, ctx_npu_place,
platform::errors::PreconditionNotMet(
"The target NPU device and current device context do "
"not match. The target NPU device number is %d, but "
"device context NPU number is %d.",
dst_npu_place.device, ctx_npu_place.device));
auto stream =
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
memory::Copy(dst_npu_place, dst_ptr, src_npu_pinned_place, src_ptr, size,
stream);
}
else if (platform::is_npu_place(src_place) && // NOLINT
platform::is_npu_pinned_place(dst_place)) { /* npu->npu_pinned */
auto src_npu_place = BOOST_GET_CONST(platform::NPUPlace, src_place);
auto dst_npu_pinned_place =
BOOST_GET_CONST(platform::NPUPinnedPlace, dst_place);
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE_EQ(platform::is_npu_place(ctx_place), true,
platform::errors::PreconditionNotMet(
"Device context place mismatch. When copying Tensor "
"data from NPU memory to NPU Pinned memory, current "
"device context place should be NPU."));
auto ctx_npu_place = BOOST_GET_CONST(platform::NPUPlace, ctx_place);
PADDLE_ENFORCE_EQ(src_place, ctx_npu_place,
platform::errors::PreconditionNotMet(
"The source NPU device and current device context do "
"not match. The source NPU device number is %d, but "
"device context NPU number is %d.",
src_npu_place.device, ctx_npu_place.device));
auto stream =
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
memory::Copy(dst_npu_pinned_place, dst_ptr, src_npu_place, src_ptr, size,
stream);
}
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
Expand Down
11 changes: 11 additions & 0 deletions paddle/fluid/memory/allocation/npu_pinned_allocator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,17 @@ void NPUPinnedAllocator::FreeImpl(Allocation *allocation) {
std::lock_guard<std::mutex> lock(mtx_);
void *ptr = allocation->ptr();
auto iter = npu_events_.find(allocation);

// Managed by GC if not called RecordEvent.
if (iter == npu_events_.end()) {
// double free? No such problem has been found so far.
// Or maybe we need a set<Allocation*> to record which
// Allocation managed by GC.
free(ptr);
delete allocation;
return;
}

aclrtEvent event = iter->second;
aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
PADDLE_ENFORCE_NPU_SUCCESS(aclrtQueryEvent(event, &status));
Expand Down
21 changes: 17 additions & 4 deletions paddle/fluid/operators/memcpy_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,16 @@ class SelectedRows;
namespace paddle {
namespace operators {
class MemcpyFunctor {
private:
enum DeviceType {
CPU = 0,
CUDA = 1,
CUDA_PINNED = 2,
XPU = 3,
NPU = 4,
NPU_PINNED = 5,
};

public:
MemcpyFunctor(framework::Variable *out,
const platform::DeviceContext &dev_ctx,
Expand All @@ -45,18 +55,21 @@ class MemcpyFunctor {
void operator()(const framework::LoDTensor &lod_tensor) const {
auto &out_tensor = *out_->GetMutable<framework::LoDTensor>();

if (dst_place_type_ == 2) {
if (dst_place_type_ == DeviceType::CUDA_PINNED) {
framework::TensorCopy(lod_tensor, platform::CUDAPinnedPlace(), dev_ctx_,
&out_tensor);
} else if (dst_place_type_ == 1) {
} else if (dst_place_type_ == DeviceType::CUDA) {
framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_,
&out_tensor);
} else if (dst_place_type_ == 0) {
} else if (dst_place_type_ == DeviceType::CPU) {
framework::TensorCopySync(lod_tensor, platform::CPUPlace(), &out_tensor);
#ifdef PADDLE_WITH_ASCEND_CL
} else if (dst_place_type_ == 4) {
} else if (dst_place_type_ == DeviceType::NPU) { /* npu_pin->npu */
framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_,
&out_tensor);
} else if (dst_place_type_ == DeviceType::NPU_PINNED) { /* npu->npu_pin */
framework::TensorCopy(lod_tensor, platform::NPUPinnedPlace(), dev_ctx_,
&out_tensor);
#endif
} else {
PADDLE_THROW(platform::errors::Unimplemented(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,36 @@
__all__ = []


class PlaceType:
    """Device place identifiers for memcpy-style ops.

    NOTE: the numeric values must stay in sync with the
    ``MemcpyFunctor::DeviceType`` enum in ``memcpy_op.h`` — they are
    passed across the C++ boundary as plain integers.
    """

    CPU = 0
    CUDA = 1
    CUDA_PINNED = 2
    XPU = 3  # not supported yet
    NPU = 4
    NPU_PINNED = 5

    @staticmethod
    def default_device():
        """Return the place type of the default compute device.

        CUDA takes priority over NPU when both are compiled in;
        falls back to CPU otherwise.
        """
        if core.is_compiled_with_cuda():
            return PlaceType.CUDA
        if core.is_compiled_with_npu():
            return PlaceType.NPU
        return PlaceType.CPU

    @staticmethod
    def default_pinned():
        """Return the pinned-memory place type paired with the default device.

        Mirrors :meth:`default_device`: CUDA_PINNED for CUDA builds,
        NPU_PINNED for NPU builds, plain CPU otherwise.
        """
        if core.is_compiled_with_cuda():
            return PlaceType.CUDA_PINNED
        if core.is_compiled_with_npu():
            return PlaceType.NPU_PINNED
        return PlaceType.CPU


class OffloadHelper(object):
cpu_place_type = 0
cuda_place_type = 1
cuda_pinned_place_type = 2
cuda_place_type = PlaceType.default_device()
cuda_pinned_place_type = PlaceType.default_pinned()

def __init__(self, mp_ring_id=None, dp_ring_id=None):
self.mp_ring_id = mp_ring_id
Expand Down