Skip to content

Commit 2a672f6

Browse files
authored
[NPU] enable async copy and add wait before sync operation (#31956)
* enable async copy and add wait before sync operation * remove unnecessary wait * add FillNpuTensorWithConstant * refine * fix fill_constant * make TensorFromVector/TensorToVector sync
1 parent efa85f8 commit 2a672f6

File tree

7 files changed

+78
-38
lines changed

7 files changed

+78
-38
lines changed

paddle/fluid/framework/tensor_util.h

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -160,11 +160,15 @@ void TensorFromVector(const std::vector<T>& src,
160160
}
161161
#endif
162162
#ifdef PADDLE_WITH_ASCEND_CL
163+
// NOTE(zhiqiu): Be careful that aclrtMemcpyAsync is different from
164+
// cudaMemcpyAsync.
165+
// cudaMemcpyAsync is actually "sync" between cpu <-> gpu.
166+
// aclrtMemcpyAsync is really "async" between cpu <-> npu.
167+
// Since vector is on cpu, I think this function should be a "sync" operation,
168+
// so pass nullptr as stream to memory::Copy().
163169
else if (platform::is_npu_place(dst_place)) { // NOLINT
164-
memory::Copy(
165-
BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place,
166-
src_ptr, size,
167-
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
170+
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
171+
src_place, src_ptr, size, nullptr);
168172
}
169173
#endif
170174
}
@@ -203,10 +207,8 @@ inline void TensorFromVector(const std::vector<bool>& src,
203207
#endif
204208
#ifdef PADDLE_WITH_ASCEND_CL
205209
else if (platform::is_npu_place(dst_place)) { // NOLINT
206-
memory::Copy(
207-
BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place,
208-
src_ptr, size,
209-
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
210+
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
211+
src_place, src_ptr, size, nullptr);
210212
}
211213
#endif
212214
delete[] array;
@@ -266,10 +268,9 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
266268
#endif
267269
#ifdef PADDLE_WITH_ASCEND_CL
268270
else if (platform::is_npu_place(src.place())) { // NOLINT
269-
memory::Copy(
270-
dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()),
271-
src_ptr, size,
272-
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
271+
memory::Copy(dst_place, dst_ptr,
272+
BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr,
273+
size, nullptr);
273274
}
274275
#endif
275276
}
@@ -302,10 +303,9 @@ inline void TensorToVector(const Tensor& src,
302303
#endif
303304
#ifdef PADDLE_WITH_ASCEND_CL
304305
else if (platform::is_npu_place(src.place())) { // NOLINT
305-
memory::Copy(
306-
dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()),
307-
src_ptr, size,
308-
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
306+
memory::Copy(dst_place, dst_ptr,
307+
BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr,
308+
size, nullptr);
309309
}
310310
#endif
311311
for (unsigned int i = 0; i < src.numel(); i++) {

paddle/fluid/memory/memcpy.cc

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -209,19 +209,19 @@ void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,
209209

210210
platform::SetNPUDeviceId(dst_place.device);
211211

212-
// NOTE(ascendrc): NPU memcpy async from host to device is a "real" async,
213-
// which is different from CUDA. In Paddle, when async is called, "sync"
214-
// is run actually, which means Paddle doesn't fully supported async.
215-
// TODO(ascendrc): Support NPU memcpy async for better performance.
216-
stream = nullptr;
217-
218212
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
219213
<< dst_place << " by thream(" << stream << ")";
220214

221215
if (stream) {
222216
platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU");
223217
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream);
224218
} else {
219+
// On NPU, async operation after sync operation is ok, while sync operation
220+
// after async is not ok, since the async operation may not be done.
221+
// So, it's needed to do a wait before the sync operation.
222+
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
223+
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
224+
225225
platform::RecordEvent record_event("NpuMemcpySync:CPU->NPU");
226226
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE);
227227
}
@@ -237,19 +237,16 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
237237

238238
platform::SetNPUDeviceId(src_place.device);
239239

240-
// NOTE(ascendrc): NPU memcpy async from device to host is a "real" async,
241-
// which is different from CUDA. In Paddle, when async is called, "sync"
242-
// is run actually, which means Paddle doesn't fully supported async.
243-
// TODO(ascendrc): Support NPU memcpy async for better performance.
244-
stream = nullptr;
245-
246240
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
247241
<< dst_place << " by thream(" << stream << ")";
248242

249243
if (stream) {
250244
platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU");
251245
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream);
252246
} else {
247+
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
248+
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
249+
253250
platform::RecordEvent record_event("GpuMemcpySync:NPU->CPU");
254251
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
255252
}
@@ -272,6 +269,10 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
272269
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
273270
stream);
274271
} else {
272+
platform::DeviceContextPool& pool =
273+
platform::DeviceContextPool::Instance();
274+
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
275+
275276
platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU");
276277
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
277278
}
@@ -286,6 +287,10 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
286287
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
287288
stream);
288289
} else {
290+
platform::DeviceContextPool& pool =
291+
platform::DeviceContextPool::Instance();
292+
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
293+
289294
platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU");
290295
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
291296
}

paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
6262
TensorFromVector(init_y, ctx, tensor_y);
6363
tensor_y->Resize({10, 10});
6464

65-
ctx.Wait();
66-
6765
auto place = ctx.GetPlace();
6866
auto out = scope->Var("Out");
6967
auto tensor_out = out->GetMutable<f::LoDTensor>();
@@ -74,7 +72,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
7472
{{"Out", {"Out"}}}, attrs);
7573

7674
op->Run(*scope, place);
77-
ctx.Wait();
7875

7976
std::vector<T> out_vec;
8077
TensorToVector(*tensor_out, ctx, &out_vec);
@@ -122,8 +119,6 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
122119
TensorFromVector(init_dout, ctx, tensor_dout);
123120
tensor_dout->Resize({2, 3, 5});
124121

125-
ctx.Wait();
126-
127122
// run
128123
f::AttributeMap attrs;
129124
auto op = f::OpRegistry::CreateOp(
@@ -132,7 +127,6 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
132127

133128
auto place = ctx.GetPlace();
134129
op->Run(*scope, place);
135-
ctx.Wait();
136130

137131
std::vector<T> dx_vec;
138132
TensorToVector(*tensor_dx, ctx, &dx_vec);

paddle/fluid/operators/fill_constant_op_npu.cc

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,7 @@ class FillConstantNPUKernel : public framework::OpKernel<T> {
6565

6666
Tensor tensor_tmp(data_type);
6767
tensor_tmp.mutable_data<T>({1}, ctx.GetPlace());
68-
std::vector<T> init = {value};
69-
TensorFromVector(init, ctx.device_context(), &tensor_tmp);
68+
FillNpuTensorWithConstant<T>(&tensor_tmp, value);
7069

7170
out_var->mutable_data<T>(shape, place);
7271
auto runner = NpuOpRunner("FillD", {tensor_tmp}, {*out_var},

paddle/fluid/operators/npu_op_runner.cc

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,10 @@ aclFormat ConvertToNpuFormat(DataLayout layout) {
6464
return iter->second;
6565
}
6666

67-
aclrtStream GetCurrentNPUStream() {
68-
int device_id = platform::GetCurrentNPUDeviceId();
67+
aclrtStream GetCurrentNPUStream(int device_id) {
68+
if (device_id == -1) {
69+
device_id = platform::GetCurrentNPUDeviceId();
70+
}
6971
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
7072
auto *dev_ctx = static_cast<platform::NPUDeviceContext *>(
7173
pool.Get(platform::NPUPlace(device_id)));
@@ -299,5 +301,6 @@ void NpuOpRunner::Run(aclrtStream stream) {
299301
VLOG(4) << "after aclopCompileAndExecute: " << ret;
300302
PADDLE_ENFORCE_NPU_SUCCESS(ret);
301303
}
304+
302305
} // namespace operators
303306
} // namespace paddle

paddle/fluid/operators/npu_op_runner.h

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,44 @@ class NpuOpRunner {
8686

8787
aclDataType ConvertToNpuDtype(framework::proto::VarType::Type dtype);
8888

89+
aclrtStream GetCurrentNPUStream(int device_id = -1);
90+
91+
template <typename T>
92+
void FillNpuTensorWithConstant(Tensor *tensor, T val) {
93+
PADDLE_ENFORCE_EQ(
94+
tensor->IsInitialized(), true,
95+
platform::errors::InvalidArgument("The tensor should be initialized."));
96+
PADDLE_ENFORCE_EQ(
97+
platform::is_npu_place(tensor->place()), true,
98+
platform::errors::InvalidArgument("The tensor should be on NPUPlace."));
99+
// do async for better performance
100+
if (typeid(float) == typeid(T) || typeid(platform::float16) == typeid(T)) {
101+
Tensor tmp(tensor->type());
102+
tmp.Resize(tensor->dims());
103+
tmp.mutable_data<T>(tensor->place());
104+
auto stream = GetCurrentNPUStream(
105+
BOOST_GET_CONST(platform::NPUPlace, tensor->place()).device);
106+
platform::NPUMemsetAsync(tmp.data<void>(), 0, tmp.numel() * sizeof(T),
107+
stream);
108+
auto runner = NpuOpRunner("Power", {tmp}, {*tensor},
109+
{{"power", static_cast<float>(1)},
110+
{"scale", static_cast<float>(0)},
111+
{"shift", static_cast<float>(val)}});
112+
runner.Run(stream);
113+
} else {
114+
T *array = new T[tensor->numel()];
115+
for (unsigned int i = 0; i < tensor->numel(); ++i) {
116+
array[i] = static_cast<T>(val);
117+
}
118+
std::vector<T> vec(tensor->numel(), static_cast<T>(val));
119+
// do sync copy
120+
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, tensor->place()),
121+
tensor->data<void>(), platform::CPUPlace(), array,
122+
tensor->numel() * sizeof(T), nullptr);
123+
delete[] array;
124+
}
125+
}
126+
89127
} // namespace operators
90128
} // namespace paddle
91129
#endif

paddle/fluid/platform/device_context.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,7 @@ NPUDeviceContext::~NPUDeviceContext() {
255255
void NPUDeviceContext::Wait() const {
256256
platform::RecordEvent record_event("NPUDeviceContext/wait");
257257
NPUDeviceGuard guard(place_.device);
258+
VLOG(4) << "NPU context Wait";
258259
PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice());
259260
}
260261

0 commit comments

Comments
 (0)