
Commit ac33c0c

Add copy from tensor (#34406)
* add api
* temp save
* revert
* copytocpu async ok
* fix style
* copy sync ok
* fix compile error
* fix compile error
* api done
* update python async api
* fix compile
* remove async python api; add c++ async unittest
* remove python async api
* update unittest
* update unittest
* add C++ unittest for copytensor
* add unittest
* update namespace utils to class TensorUtils
* add unittest
* update unittest
* update unittest
* update code style
* update code style
* update unittest
1 parent 223c01f commit ac33c0c

11 files changed: +710 −11 lines

cmake/configure.cmake

Lines changed: 4 additions & 0 deletions
@@ -20,6 +20,10 @@ if(WITH_TESTING)
   add_definitions(-DPADDLE_WITH_TESTING)
 endif(WITH_TESTING)
 
+if(WITH_INFERENCE_API_TEST)
+  add_definitions(-DPADDLE_WITH_INFERENCE_API_TEST)
+endif(WITH_INFERENCE_API_TEST)
+
 if(NOT WITH_PROFILER)
   add_definitions(-DPADDLE_DISABLE_PROFILER)
 endif(NOT WITH_PROFILER)
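For orientation, the new definition is consumed from C++ through the preprocessor. A minimal, purely illustrative sketch; the code actually guarded by this flag elsewhere in the tree is not part of this hunk:

```cpp
// Hypothetical consumer of the definition added above; this branch is
// compiled only when CMake is configured with -DWITH_INFERENCE_API_TEST=ON.
#ifdef PADDLE_WITH_INFERENCE_API_TEST
// e.g. test-only hooks for the inference API unit tests
#endif
```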

paddle/fluid/inference/api/CMakeLists.txt

Lines changed: 3 additions & 2 deletions
@@ -28,14 +28,15 @@ if(WITH_MKLDNN)
 endif()
 
 cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder table_printer)
+cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tensor)
 cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
 
 if(WITH_CRYPTO)
   cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array
-    analysis_config zero_copy_tensor trainer_desc_proto paddle_crypto custom_operator)
+    analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto paddle_crypto custom_operator)
 else()
   cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array
-    analysis_config zero_copy_tensor trainer_desc_proto custom_operator)
+    analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator)
 endif()
 
 if(WIN32)
paddle/fluid/inference/api/details/zero_copy_tensor.cc

Lines changed: 65 additions & 8 deletions
@@ -121,6 +121,8 @@ DataType Tensor::type() const {
   return DataType::FLOAT32;
 }
 
+PlaceType Tensor::place() const { return place_; }
+
 template <typename T>
 void Tensor::CopyFromCpu(const T *data) {
   EAGER_GET_TENSOR;
@@ -185,7 +187,8 @@ void Tensor::CopyFromCpu(const T *data) {
 }
 
 template <typename T>
-void Tensor::CopyToCpu(T *data) {
+void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
+                           void *cb_params) const {
   EAGER_GET_TENSOR;
   auto ele_num = tensor->numel();
   auto *t_data = tensor->data<T>();
@@ -222,7 +225,16 @@ void Tensor::CopyToCpu(T *data) {
 #ifdef PADDLE_WITH_HIP
     hipStreamSynchronize(dev_ctx->stream());
 #else
-    cudaStreamSynchronize(dev_ctx->stream());
+    // async, return stream
+    if (nullptr != exec_stream) {
+      *(static_cast<cudaStream_t *>(exec_stream)) = dev_ctx->stream();
+      // async with callback
+    } else if (cb) {
+      cudaLaunchHostFunc(dev_ctx->stream(), cb, cb_params);
+      // sync
+    } else {
+      cudaStreamSynchronize(dev_ctx->stream());
+    }
 #endif
 #else
     PADDLE_THROW(paddle::platform::errors::Unavailable(
@@ -261,19 +273,61 @@ void Tensor::CopyToCpu(T *data) {
         "The analysis predictor supports CPU, GPU, NPU and XPU now."));
   }
 }
+
+template <typename T>
+void Tensor::CopyToCpu(T *data) const {
+  CopyToCpuImpl<T>(data, nullptr, nullptr, nullptr);
+}
+
+template <typename T>
+void Tensor::CopyToCpuAsync(T *data, void *exec_stream) const {
+  CopyToCpuImpl<T>(data, exec_stream, nullptr, nullptr);
+}
+
+template <typename T>
+void Tensor::CopyToCpuAsync(T *data, CallbackFunc cb, void *cb_params) const {
+  CopyToCpuImpl<T>(data, nullptr, cb, cb_params);
+}
+
 template PD_INFER_DECL void Tensor::CopyFromCpu<float>(const float *data);
 template PD_INFER_DECL void Tensor::CopyFromCpu<int64_t>(const int64_t *data);
 template PD_INFER_DECL void Tensor::CopyFromCpu<int32_t>(const int32_t *data);
 template PD_INFER_DECL void Tensor::CopyFromCpu<uint8_t>(const uint8_t *data);
 template PD_INFER_DECL void Tensor::CopyFromCpu<int8_t>(const int8_t *data);
 template PD_INFER_DECL void Tensor::CopyFromCpu<float16>(const float16 *data);
 
-template PD_INFER_DECL void Tensor::CopyToCpu<float>(float *data);
-template PD_INFER_DECL void Tensor::CopyToCpu<int64_t>(int64_t *data);
-template PD_INFER_DECL void Tensor::CopyToCpu<int32_t>(int32_t *data);
-template PD_INFER_DECL void Tensor::CopyToCpu<uint8_t>(uint8_t *data);
-template PD_INFER_DECL void Tensor::CopyToCpu<int8_t>(int8_t *data);
-template PD_INFER_DECL void Tensor::CopyToCpu<float16>(float16 *data);
+template PD_INFER_DECL void Tensor::CopyToCpu<float>(float *data) const;
+template PD_INFER_DECL void Tensor::CopyToCpu<int64_t>(int64_t *data) const;
+template PD_INFER_DECL void Tensor::CopyToCpu<int32_t>(int32_t *data) const;
+template PD_INFER_DECL void Tensor::CopyToCpu<uint8_t>(uint8_t *data) const;
+template PD_INFER_DECL void Tensor::CopyToCpu<int8_t>(int8_t *data) const;
+template PD_INFER_DECL void Tensor::CopyToCpu<float16>(float16 *data) const;
+
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<float>(
+    float *data, void *exec_stream) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<int64_t>(
+    int64_t *data, void *exec_stream) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<int32_t>(
+    int32_t *data, void *exec_stream) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<uint8_t>(
+    uint8_t *data, void *exec_stream) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<int8_t>(
+    int8_t *data, void *exec_stream) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<float16>(
+    float16 *data, void *exec_stream) const;
+
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<float>(
+    float *data, CallbackFunc cb, void *cb_params) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<int64_t>(
+    int64_t *data, CallbackFunc cb, void *cb_params) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<int32_t>(
+    int32_t *data, CallbackFunc cb, void *cb_params) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<uint8_t>(
+    uint8_t *data, CallbackFunc cb, void *cb_params) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<int8_t>(
+    int8_t *data, CallbackFunc cb, void *cb_params) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<float16>(
+    float16 *data, CallbackFunc cb, void *cb_params) const;
 
 template PD_INFER_DECL float *Tensor::data<float>(PlaceType *place,
                                                   int *size) const;
@@ -285,12 +339,15 @@ template PD_INFER_DECL uint8_t *Tensor::data<uint8_t>(PlaceType *place,
                                                       int *size) const;
 template PD_INFER_DECL int8_t *Tensor::data<int8_t>(PlaceType *place,
                                                     int *size) const;
+template PD_INFER_DECL float16 *Tensor::data<float16>(PlaceType *place,
+                                                      int *size) const;
 
 template PD_INFER_DECL float *Tensor::mutable_data<float>(PlaceType place);
 template PD_INFER_DECL int64_t *Tensor::mutable_data<int64_t>(PlaceType place);
 template PD_INFER_DECL int32_t *Tensor::mutable_data<int32_t>(PlaceType place);
 template PD_INFER_DECL uint8_t *Tensor::mutable_data<uint8_t>(PlaceType place);
 template PD_INFER_DECL int8_t *Tensor::mutable_data<int8_t>(PlaceType place);
+template PD_INFER_DECL float16 *Tensor::mutable_data<float16>(PlaceType place);
 
 Tensor::Tensor(void *scope) : scope_{scope} {
   PADDLE_ENFORCE_NOT_NULL(scope_,
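The hunk above splits the old blocking `CopyToCpu` into a shared `CopyToCpuImpl` with three front ends: the original synchronous copy, an async variant that hands back the CUDA stream the copy was enqueued on, and an async variant that fires a host callback when the copy completes. A minimal usage sketch, assuming the usual `paddle_infer` predictor workflow; the output name and predictor setup are placeholders, and `CallbackFunc` is taken to be compatible with a `void(void*)` host function, as its use with `cudaLaunchHostFunc` suggests:

```cpp
#include <cuda_runtime.h>

#include <vector>

#include "paddle/fluid/inference/api/paddle_inference_api.h"

// Fires on a CUDA driver thread once the device-to-host copy has completed.
static void OnCopyDone(void* /*cb_params*/) {
  // The host buffer handed to CopyToCpuAsync is safe to read from here on.
}

// Assumes the output tensor lives on the GPU; for CPU tensors the copy
// completes synchronously and no stream is handed back.
void FetchOutputWithoutBlocking(paddle_infer::Predictor* predictor) {
  auto out = predictor->GetOutputHandle("output_name");  // placeholder name
  int numel = 1;
  for (int d : out->shape()) numel *= d;
  std::vector<float> host(numel);

  // Variant 1: receive the CUDA stream the copy was enqueued on and
  // synchronize only when the result is actually needed.
  cudaStream_t stream;
  out->CopyToCpuAsync(host.data(), static_cast<void*>(&stream));
  // ... overlap other host-side work here ...
  cudaStreamSynchronize(stream);

  // Variant 2 (alternative): let a host callback signal completion instead.
  out->CopyToCpuAsync(host.data(), OnCopyDone, /*cb_params=*/nullptr);
}
```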
paddle/fluid/inference/api/paddle_infer_contrib.cc (new file)

Lines changed: 190 additions & 0 deletions

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/inference/api/paddle_infer_contrib.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"

namespace paddle_infer {
namespace contrib {

using paddle::PaddleDType;

void* TensorUtils::CudaMallocPinnedMemory(size_t size) {
#if defined(PADDLE_WITH_CUDA)
  void* ptr = nullptr;
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaMallocHost(&ptr, size));
  return ptr;
#else
  return nullptr;
#endif
}

void TensorUtils::CudaFreePinnedMemory(void* ptr) {
#if defined(PADDLE_WITH_CUDA)
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaFreeHost(ptr));
#endif
}

void TensorUtils::CopyTensorImpl(Tensor* p_dst, const Tensor& src,
                                 void* exec_stream, CallbackFunc cb,
                                 void* cb_params) {
  Tensor& dst = *p_dst;
  dst.Reshape(src.shape());
  PADDLE_ENFORCE(
      src.place() == PlaceType::kCPU || src.place() == PlaceType::kGPU,
      paddle::platform::errors::InvalidArgument(
          "CopyTensor only support PlaceType kCPU/kGPU now."));
  PADDLE_ENFORCE(
      dst.place() == PlaceType::kCPU || dst.place() == PlaceType::kGPU,
      paddle::platform::errors::InvalidArgument(
          "CopyTensor only support PlaceType kCPU/kGPU now."));
  // copy to cpu, gpu => cpu or cpu => cpu
  if (dst.place() == PlaceType::kCPU) {
    switch (src.type()) {
      case PaddleDType::INT32:
        src.CopyToCpuImpl(dst.mutable_data<int32_t>(PlaceType::kCPU),
                          exec_stream, cb, cb_params);
        break;
      case PaddleDType::INT64:
        src.CopyToCpuImpl(dst.mutable_data<int64_t>(PlaceType::kCPU),
                          exec_stream, cb, cb_params);
        break;
      case PaddleDType::FLOAT32:
        src.CopyToCpuImpl(dst.mutable_data<float>(PlaceType::kCPU), exec_stream,
                          cb, cb_params);
        break;
      case PaddleDType::UINT8:
        src.CopyToCpuImpl(dst.mutable_data<uint8_t>(PlaceType::kCPU),
                          exec_stream, cb, cb_params);
        break;
      case PaddleDType::INT8:
        src.CopyToCpuImpl(dst.mutable_data<int8_t>(PlaceType::kCPU),
                          exec_stream, cb, cb_params);
        break;
      case PaddleDType::FLOAT16:
        src.CopyToCpuImpl(
            dst.mutable_data<paddle::platform::float16>(PlaceType::kCPU),
            exec_stream, cb, cb_params);
        break;
      default:
        PADDLE_THROW(paddle::platform::errors::Unimplemented(
            "Only INT32, INT64, UINT8, INT8, FLOAT16 and "
            "FLOAT32 is supported in Tensor. Others not implements"));
    }
    // gpu => gpu or cpu => gpu
  } else {
#if defined(PADDLE_WITH_CUDA)
    void* dst_data = nullptr;
    void* src_data = nullptr;
    size_t data_len = 0;
    int data_size = 0;
    PlaceType src_place;
    switch (src.type()) {
      case PaddleDType::INT32:
        dst_data =
            static_cast<void*>(dst.mutable_data<int32_t>(PlaceType::kGPU));
        src_data =
            static_cast<void*>(src.data<int32_t>(&src_place, &data_size));
        data_len = data_size * sizeof(int32_t);
        break;
      case PaddleDType::INT64:
        dst_data =
            static_cast<void*>(dst.mutable_data<int64_t>(PlaceType::kGPU));
        src_data =
            static_cast<void*>(src.data<int64_t>(&src_place, &data_size));
        data_len = data_size * sizeof(int64_t);
        break;
      case PaddleDType::FLOAT32:
        dst_data = static_cast<void*>(dst.mutable_data<float>(PlaceType::kGPU));
        src_data = static_cast<void*>(src.data<float>(&src_place, &data_size));
        data_len = data_size * sizeof(float);
        break;
      case PaddleDType::UINT8:
        dst_data =
            static_cast<void*>(dst.mutable_data<uint8_t>(PlaceType::kGPU));
        src_data =
            static_cast<void*>(src.data<uint8_t>(&src_place, &data_size));
        data_len = data_size * sizeof(uint8_t);
        break;
      case PaddleDType::INT8:
        dst_data =
            static_cast<void*>(dst.mutable_data<int8_t>(PlaceType::kGPU));
        src_data = static_cast<void*>(src.data<int8_t>(&src_place, &data_size));
        data_len = data_size * sizeof(int8_t);
        break;
      case PaddleDType::FLOAT16:
        dst_data = static_cast<void*>(
            dst.mutable_data<paddle::platform::float16>(PlaceType::kGPU));
        src_data = static_cast<void*>(
            src.data<paddle::platform::float16>(&src_place, &data_size));
        data_len = data_size * 2;
        break;
      default:
        PADDLE_THROW(paddle::platform::errors::Unimplemented(
            "Only INT32, INT64, UINT8, INT8, FLOAT16 and "
            "FLOAT32 is supported in Tensor. Others not implements"));
    }

    paddle::platform::DeviceContextPool& pool =
        paddle::platform::DeviceContextPool::Instance();
    paddle::platform::CUDAPlace gpu_place(dst.device_);
    auto* dev_ctx = static_cast<const paddle::platform::CUDADeviceContext*>(
        pool.Get(gpu_place));

    if (src.place() == PlaceType::kCPU) {
      paddle::memory::Copy(gpu_place, static_cast<void*>(dst_data),
                           paddle::platform::CPUPlace(), src_data, data_len,
                           dev_ctx->stream());
    } else {
      paddle::memory::Copy(gpu_place, static_cast<void*>(dst_data),
                           paddle::platform::CUDAPlace(), src_data, data_len,
                           dev_ctx->stream());
    }

    if (nullptr != exec_stream) {
      *(static_cast<cudaStream_t*>(exec_stream)) = dev_ctx->stream();
    } else if (cb) {
      cudaLaunchHostFunc(dev_ctx->stream(), cb, cb_params);
    } else {
      cudaStreamSynchronize(dev_ctx->stream());
    }
#else
    PADDLE_THROW(paddle::platform::errors::Unavailable(
        "Can not copy tensor to GPU CUDA place because paddle is not compiled "
        "with CUDA."));
#endif
  }
  return;
}

void TensorUtils::CopyTensor(Tensor* p_dst, const Tensor& src) {
  CopyTensorImpl(p_dst, src, nullptr, nullptr, nullptr);
}

void TensorUtils::CopyTensorAsync(Tensor* p_dst, const Tensor& src,
                                  void* exec_stream) {
  CopyTensorImpl(p_dst, src, exec_stream, nullptr, nullptr);
}

void TensorUtils::CopyTensorAsync(Tensor* p_dst, const Tensor& src,
                                  CallbackFunc cb, void* cb_params) {
  CopyTensorImpl(p_dst, src, nullptr, cb, cb_params);
}

}  // namespace contrib
}  // namespace paddle_infer
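The new contrib source centralizes tensor-to-tensor copies: `CopyTensorImpl` reshapes the destination, dispatches on the source dtype, and routes GPU-bound copies through `paddle::memory::Copy` on the destination device's stream, with the same sync / stream-return / callback choices as `CopyToCpuImpl`. A sketch of how the public wrappers might be used to chain two predictors; the predictor handles and tensor names are placeholders, not part of this commit:

```cpp
#include "paddle/fluid/inference/api/paddle_infer_contrib.h"

// Chain two predictors by copying the producer's output tensor directly into
// the consumer's input tensor; TensorUtils dispatches on dtype and placement.
// `producer`/`consumer` are assumed to be already-configured predictors.
void ChainPredictors(paddle_infer::Predictor* producer,
                     paddle_infer::Predictor* consumer) {
  auto src = producer->GetOutputHandle("producer_out");  // placeholder name
  auto dst = consumer->GetInputHandle("consumer_in");    // placeholder name

  // Blocking copy: returns once the data has landed in `dst`.
  paddle_infer::contrib::TensorUtils::CopyTensor(dst.get(), *src);

  // Non-blocking alternative for device-resident tensors: the copy is enqueued
  // on the destination device's stream, which is handed back to the caller.
  // cudaStream_t stream;
  // paddle_infer::contrib::TensorUtils::CopyTensorAsync(
  //     dst.get(), *src, static_cast<void*>(&stream));
  // cudaStreamSynchronize(stream);
}
```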
paddle/fluid/inference/api/paddle_infer_contrib.h (new file)

Lines changed: 40 additions & 0 deletions

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/inference/api/paddle_inference_api.h"

namespace paddle_infer {
namespace contrib {

class TensorUtils {
 public:
  static void* CudaMallocPinnedMemory(size_t size);
  static void CudaFreePinnedMemory(void* mem);

  static void CopyTensor(Tensor* p_dst, const Tensor& src);
  static void CopyTensorAsync(Tensor* p_dst, const Tensor& src,
                              void* exec_stream);
  static void CopyTensorAsync(Tensor* p_dst, const Tensor& src, CallbackFunc cb,
                              void* cb_params);

 private:
  static void CopyTensorImpl(Tensor* p_dst, const Tensor& src,
                             void* exec_stream, CallbackFunc cb,
                             void* cb_params);
};

}  // namespace contrib
}  // namespace paddle_infer
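The header also exposes the pinned-memory helpers alongside the copy utilities. Page-locked host buffers are what let the underlying asynchronous device-to-host transfer truly overlap with host work, so a natural pairing is `CudaMallocPinnedMemory` with `CopyToCpuAsync`. A sketch under the same assumptions as above (GPU-resident output, placeholder names):

```cpp
#include <cuda_runtime.h>

#include "paddle/fluid/inference/api/paddle_infer_contrib.h"

// Copy a GPU-resident output into a page-locked host buffer while the caller
// keeps doing other work; the output name is a placeholder.
void CopyOutputToPinnedHost(paddle_infer::Predictor* predictor) {
  auto out = predictor->GetOutputHandle("output_name");
  size_t numel = 1;
  for (int d : out->shape()) numel *= static_cast<size_t>(d);

  // Pinned allocation so the async copy is not silently staged/synchronous.
  float* pinned = static_cast<float*>(
      paddle_infer::contrib::TensorUtils::CudaMallocPinnedMemory(
          numel * sizeof(float)));

  cudaStream_t stream;
  out->CopyToCpuAsync(pinned, static_cast<void*>(&stream));
  // ... other host-side work while the copy is in flight ...
  cudaStreamSynchronize(stream);

  paddle_infer::contrib::TensorUtils::CudaFreePinnedMemory(pinned);
}
```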
