@@ -43,6 +43,7 @@ BufferedReader::BufferedReader(
       buffer_size_(buffer_size),
       pin_memory_(pin_memory) {
   VLOG(1) << "BufferedReader";
+
 #ifdef PADDLE_WITH_CUDA
   if (platform::is_gpu_place(place_) && !pin_memory) {
     int dev_idx = BOOST_GET_CONST(platform::CUDAPlace, place_).device;
@@ -57,9 +58,25 @@ BufferedReader::BufferedReader(
     stream_ = platform::CudaStreamResourcePool::Instance().New(dev_idx);
   }
 #endif
+
+#ifdef PADDLE_WITH_ASCEND_CL
+  if (platform::is_npu_place(place_)) {
+    int dev_idx = BOOST_GET_CONST(platform::NPUPlace, place_).device;
+    compute_stream_ =
+        ((platform::NPUDeviceContext *)(platform::DeviceContextPool::Instance()
+                                            .Get(place_)))
+            ->stream();
+    events_.resize(buffer_size);
+    for (auto &event : events_) {
+      event = platform::NpuEventResourcePool::Instance().New(dev_idx);
+    }
+    stream_ = platform::NpuStreamResourcePool::Instance().New(dev_idx);
+  }
+#endif
   is_same_place_ = false;
   cpu_buffer_.resize(buffer_size);
   cuda_buffer_.resize(buffer_size);
+  npu_buffer_.resize(buffer_size);
   ReadTillBufferFullAsync();
 }
 
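The constructor hunk above acquires a dedicated NPU copy stream plus one event per buffer slot, mirroring the existing CUDA setup, and ends by pre-filling every slot through `ReadTillBufferFullAsync()`. For orientation, here is a minimal, framework-free sketch of that prefetch pattern; all names are illustrative, and the device stream/event handling is omitted:

```cpp
// Double-buffered prefetch in plain C++: one in-flight async read per slot,
// plus a queue of slot indices in fill order. Illustrative only -- Paddle's
// BufferedReader layers device streams/events on top of this skeleton.
#include <cstddef>
#include <functional>
#include <future>
#include <queue>
#include <vector>

class PrefetchReader {
 public:
  PrefetchReader(std::function<std::vector<float>()> read_fn,
                 size_t buffer_size)
      : read_fn_(std::move(read_fn)),
        buffer_(buffer_size),
        futures_(buffer_size) {
    // Counterpart of ReadTillBufferFullAsync(): schedule one read per slot.
    for (size_t i = 0; i < buffer_size; ++i) ReadAsync(i);
  }

  std::vector<float> ReadNext() {
    const size_t i = position_.front();
    position_.pop();
    futures_[i].wait();  // Block until slot i has been filled.
    std::vector<float> out = std::move(buffer_[i]);
    ReadAsync(i);  // Simplified; see the note below.
    return out;
  }

 private:
  void ReadAsync(size_t i) {
    futures_[i] = std::async(std::launch::async,
                             [this, i] { buffer_[i] = read_fn_(); });
    position_.push(i);
  }

  std::function<std::vector<float>()> read_fn_;
  std::vector<std::vector<float>> buffer_;
  std::vector<std::future<void>> futures_;
  std::queue<size_t> position_;
};
```

Unlike this sketch, the real reader deliberately re-arms the *previous* position rather than the one just consumed (see the comment preserved in the last hunk): computation in fluid is asynchronous, so immediately refilling the just-returned slot could clobber data still in use.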
@@ -186,6 +203,58 @@ void BufferedReader::ReadAsync(size_t i) {
       }
     }
 #endif
+
+#ifdef PADDLE_WITH_ASCEND_CL
+    if (platform::is_npu_place(place_)) {
+      TensorVec &npu = npu_buffer_[i];
+      if (npu.empty()) {
+        npu.resize(cpu.size());
+      } else {
+        PADDLE_ENFORCE_EQ(
+            npu.size(), cpu.size(),
+            platform::errors::InvalidArgument(
+                "Input tensor numbers on NPU and CPU devices do not match. "
+                "The number on NPU is %d, on CPU is %d",
+                npu.size(), cpu.size()));
+      }
+
+      std::vector<void *> npu_ptrs;
+      npu_ptrs.reserve(cpu.size());
+      for (size_t i = 0; i < cpu.size(); ++i) {
+        npu[i].Resize(cpu[i].dims());
+        npu[i].set_layout(cpu[i].layout());
+        npu_ptrs.emplace_back(npu[i].mutable_data(place_, cpu[i].type()));
+      }
+
+      platform::SetNPUDeviceId(
+          BOOST_GET_CONST(platform::NPUPlace, place_).device);
+      PADDLE_ENFORCE_NPU_SUCCESS(
+          aclrtRecordEvent(events_[i].get(), compute_stream_));
+      PADDLE_ENFORCE_NPU_SUCCESS(
+          aclrtStreamWaitEvent(stream_.get(), events_[i].get()));
+
+      platform::RecordEvent record_event("BufferedReader:MemoryCopy");
+      for (size_t i = 0; i < cpu.size(); ++i) {
+        auto cpu_place = cpu[i].place();
+        auto cpu_ptr = cpu[i].data<void>();
+        auto npu_ptr = npu_ptrs[i];
+        auto size =
+            cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type());
+        if (platform::is_npu_place(cpu_place)) {
+          memory::Copy(BOOST_GET_CONST(platform::NPUPlace, place_), npu_ptr,
+                       BOOST_GET_CONST(platform::NPUPlace, cpu_place), cpu_ptr,
+                       size, stream_.get());
+        } else {
+          memory::Copy(BOOST_GET_CONST(platform::NPUPlace, place_), npu_ptr,
+                       BOOST_GET_CONST(platform::CPUPlace, cpu_place), cpu_ptr,
+                       size, stream_.get());
+          PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream_.get()));
+        }
+        npu[i].set_lod(cpu[i].lod());
+      }
+      PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream_.get()));
+    }
+#endif
     return i;
   }));
 }
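The NPU branch added here follows the same shape as the CUDA branch above it: record an event on the compute stream, make the dedicated copy stream wait on that event, enqueue the copies, and synchronize at the end. The sketch below shows that ordering idiom in raw CANN ACL calls, assuming a standard ACL environment; the `CHECK_ACL` macro, device id 0, and buffer size are illustrative, not taken from the patch:

```cpp
// Cross-stream ordering for a host-to-device copy with the ACL runtime:
// an event recorded on the compute stream gates the copy stream.
#include <cstdio>
#include <cstdlib>

#include "acl/acl.h"

#define CHECK_ACL(cmd)                                                \
  do {                                                                \
    aclError err_ = (cmd);                                            \
    if (err_ != ACL_ERROR_NONE) {                                     \
      std::fprintf(stderr, "ACL error %d at %s:%d\n", err_, __FILE__, \
                   __LINE__);                                         \
      std::exit(1);                                                   \
    }                                                                 \
  } while (0)

int main() {
  CHECK_ACL(aclInit(nullptr));
  CHECK_ACL(aclrtSetDevice(0));

  aclrtStream compute_stream = nullptr, copy_stream = nullptr;
  aclrtEvent event = nullptr;
  CHECK_ACL(aclrtCreateStream(&compute_stream));
  CHECK_ACL(aclrtCreateStream(&copy_stream));
  CHECK_ACL(aclrtCreateEvent(&event));

  constexpr size_t kBytes = 1024;
  float host_src[kBytes / sizeof(float)] = {};
  void *dev_dst = nullptr;
  CHECK_ACL(aclrtMalloc(&dev_dst, kBytes, ACL_MEM_MALLOC_HUGE_FIRST));

  // 1. Mark the current tail of the compute stream.
  CHECK_ACL(aclrtRecordEvent(event, compute_stream));
  // 2. The copy stream waits, so the copy cannot overtake kernels that may
  //    still be using the destination buffer.
  CHECK_ACL(aclrtStreamWaitEvent(copy_stream, event));
  // 3. Enqueue the async H2D copy on the dedicated copy stream.
  CHECK_ACL(aclrtMemcpyAsync(dev_dst, kBytes, host_src, kBytes,
                             ACL_MEMCPY_HOST_TO_DEVICE, copy_stream));
  // 4. The source is pageable host memory, so wait for the copy to finish
  //    before the host buffer may be reused or freed.
  CHECK_ACL(aclrtSynchronizeStream(copy_stream));

  CHECK_ACL(aclrtFree(dev_dst));
  CHECK_ACL(aclrtDestroyEvent(event));
  CHECK_ACL(aclrtDestroyStream(copy_stream));
  CHECK_ACL(aclrtDestroyStream(compute_stream));
  CHECK_ACL(aclFinalize());
  return 0;
}
```

The per-iteration `aclrtSynchronizeStream` inside the CPU-to-NPU branch of the patch appears to serve the same purpose as step 4 here: `cpu_buffer_` holds pageable host memory, so each copy must complete before that buffer can safely be touched again.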
@@ -217,9 +286,13 @@ void BufferedReader::ReadNextImpl(std::vector<framework::LoDTensor> *out) {
     return;
   }
 
-  *out = std::move((platform::is_gpu_place(place_) && !is_same_place_)
-                       ? cuda_buffer_[i]
-                       : cpu_buffer_[i]);
+  if (platform::is_gpu_place(place_) && !is_same_place_) {
+    *out = std::move(cuda_buffer_[i]);
+  } else if (platform::is_npu_place(place_) && !is_same_place_) {
+    *out = std::move(npu_buffer_[i]);
+  } else {
+    *out = std::move(cpu_buffer_[i]);
+  }
 
   // Do not push current position into ReadAsync. Push the previous position
   // Since all computation in fluid is async, change the data of
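Note on the last hunk: the two-way ternary becomes an explicit three-way selection, with the NPU branch mirroring the CUDA one. `cpu_buffer_` remains the fallback for the `is_same_place_` case, where, judging from the surrounding logic, the slot already holds tensors in the target place and no staging copy was made.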