Skip to content

Commit 49660c4

Browse files
committed
[npu][hybrid] support offload
1 parent df7cc45 commit 49660c4

File tree

4 files changed

+102
-6
lines changed

4 files changed

+102
-6
lines changed

paddle/fluid/framework/tensor_util.cc

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,52 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
151151
BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size,
152152
stream);
153153
}
154+
else if (platform::is_npu_pinned_place(src_place) && // NOLINT
155+
platform::is_npu_place(dst_place)) { /* npu_pinned->npu */
156+
auto src_npu_pinned_place =
157+
BOOST_GET_CONST(platform::NPUPinnedPlace, src_place);
158+
auto dst_npu_place = BOOST_GET_CONST(platform::NPUPlace, dst_place);
159+
auto ctx_place = ctx.GetPlace();
160+
PADDLE_ENFORCE_EQ(platform::is_npu_place(ctx_place), true,
161+
platform::errors::PreconditionNotMet(
162+
"Device context place mismatch. When copying Tensor "
163+
"data from NPU Pinned memory to NPU memory, current "
164+
"device context place should be NPU."));
165+
auto ctx_npu_place = BOOST_GET_CONST(platform::NPUPlace, ctx_place);
166+
PADDLE_ENFORCE_EQ(dst_npu_place, ctx_npu_place,
167+
platform::errors::PreconditionNotMet(
168+
"The target NPU device and current device context do "
169+
"not match. The target NPU device number is %d, but "
170+
"device context NPU number is %d.",
171+
dst_npu_place.device, ctx_npu_place.device));
172+
auto stream =
173+
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
174+
memory::Copy(dst_npu_place, dst_ptr, src_npu_pinned_place, src_ptr, size,
175+
stream);
176+
}
177+
else if (platform::is_npu_place(src_place) &&  // NOLINT
         platform::is_npu_pinned_place(dst_place)) { /* npu->npu_pinned */
  auto src_npu_place = BOOST_GET_CONST(platform::NPUPlace, src_place);
  auto dst_npu_pinned_place =
      BOOST_GET_CONST(platform::NPUPinnedPlace, dst_place);
  auto ctx_place = ctx.GetPlace();
  // The async copy below is issued on the NPU stream, so the device
  // context driving it must itself be an NPU context.
  PADDLE_ENFORCE_EQ(platform::is_npu_place(ctx_place), true,
                    platform::errors::PreconditionNotMet(
                        "Device context place mismatch. When copying Tensor "
                        "data from NPU memory to NPU Pinned memory, current "
                        "device context place should be NPU."));
  auto ctx_npu_place = BOOST_GET_CONST(platform::NPUPlace, ctx_place);
  // FIX: compare the extracted NPUPlace (src_npu_place) rather than the
  // generic Place variant, mirroring the npu_pinned->npu branch above
  // which compares dst_npu_place. This also matches the %d fields in the
  // error message, which read src_npu_place.device.
  PADDLE_ENFORCE_EQ(src_npu_place, ctx_npu_place,
                    platform::errors::PreconditionNotMet(
                        "The source NPU device and current device context do "
                        "not match. The source NPU device number is %d, but "
                        "device context NPU number is %d.",
                        src_npu_place.device, ctx_npu_place.device));
  auto stream =
      reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
  // Async D2H copy into pinned host memory on the context's NPU stream.
  memory::Copy(dst_npu_pinned_place, dst_ptr, src_npu_place, src_ptr, size,
               stream);
}
154200
else { // NOLINT
155201
PADDLE_THROW(platform::errors::Unimplemented(
156202
"Copy from %s to %s is not supported.", src_place, dst_place));

paddle/fluid/memory/allocation/npu_pinned_allocator.cc

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,17 @@ void NPUPinnedAllocator::FreeImpl(Allocation *allocation) {
5454
std::lock_guard<std::mutex> lock(mtx_);
5555
void *ptr = allocation->ptr();
5656
auto iter = npu_events_.find(allocation);
57+
58+
// Managed by GC if not called RecordEvent.
59+
if (iter == npu_events_.end()) {
60+
// double free? No such problem has been found so far.
61+
// Or maybe we need a set<Allocation*> to record which
62+
// Allocation managed by GC.
63+
free(ptr);
64+
delete allocation;
65+
return;
66+
}
67+
5768
aclrtEvent event = iter->second;
5869
aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
5970
PADDLE_ENFORCE_NPU_SUCCESS(aclrtQueryEvent(event, &status));

paddle/fluid/operators/memcpy_op.h

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,16 @@ class SelectedRows;
3636
namespace paddle {
3737
namespace operators {
3838
class MemcpyFunctor {
39+
private:
40+
enum DeviceType {
41+
CPU = 0,
42+
CUDA = 1,
43+
CUDA_PINNED = 2,
44+
XPU = 3,
45+
NPU = 4,
46+
NPU_PINNED = 5,
47+
};
48+
3949
public:
4050
MemcpyFunctor(framework::Variable *out,
4151
const platform::DeviceContext &dev_ctx,
@@ -45,18 +55,21 @@ class MemcpyFunctor {
4555
void operator()(const framework::LoDTensor &lod_tensor) const {
4656
auto &out_tensor = *out_->GetMutable<framework::LoDTensor>();
4757

48-
if (dst_place_type_ == 2) {
58+
if (dst_place_type_ == DeviceType::CUDA_PINNED) {
4959
framework::TensorCopy(lod_tensor, platform::CUDAPinnedPlace(), dev_ctx_,
5060
&out_tensor);
51-
} else if (dst_place_type_ == 1) {
61+
} else if (dst_place_type_ == DeviceType::CUDA) {
5262
framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_,
5363
&out_tensor);
54-
} else if (dst_place_type_ == 0) {
64+
} else if (dst_place_type_ == DeviceType::CPU) {
5565
framework::TensorCopySync(lod_tensor, platform::CPUPlace(), &out_tensor);
5666
#ifdef PADDLE_WITH_ASCEND_CL
57-
} else if (dst_place_type_ == 4) {
67+
} else if (dst_place_type_ == DeviceType::NPU) { /* npu_pin->npu */
5868
framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_,
5969
&out_tensor);
70+
} else if (dst_place_type_ == DeviceType::NPU_PINNED) { /* npu->npu_pin */
71+
framework::TensorCopy(lod_tensor, platform::NPUPinnedPlace(), dev_ctx_,
72+
&out_tensor);
6073
#endif
6174
} else {
6275
PADDLE_THROW(platform::errors::Unimplemented(

python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,36 @@
2020
__all__ = []
2121

2222

23+
class PlaceType:
24+
# sync with memcpy op, maybe not a good design
25+
CPU = 0
26+
CUDA = 1
27+
CUDA_PINNED = 2
28+
XPU = 3 # unsupport for now
29+
NPU = 4
30+
NPU_PINNED = 5
31+
32+
@staticmethod
33+
def default_device():
34+
if core.is_compiled_with_cuda():
35+
return PlaceType.CUDA
36+
elif core.is_compiled_with_npu():
37+
return PlaceType.NPU
38+
return PlaceType.CPU
39+
40+
@staticmethod
41+
def default_pinned():
42+
if core.is_compiled_with_cuda():
43+
return PlaceType.CUDA_PINNED
44+
elif core.is_compiled_with_npu():
45+
return PlaceType.NPU_PINNED
46+
return PlaceType.CPU
47+
48+
2349
class OffloadHelper(object):
2450
cpu_place_type = 0
25-
cuda_place_type = 1
26-
cuda_pinned_place_type = 2
51+
cuda_place_type = PlaceType.default_device()
52+
cuda_pinned_place_type = PlaceType.default_pinned()
2753

2854
def __init__(self, mp_ring_id=None, dp_ring_id=None):
2955
self.mp_ring_id = mp_ring_id

0 commit comments

Comments
 (0)