
Commit ac33c0c

Add copy from tensor (#34406)
* add api
* temp save
* revert
* copytocpu async ok
* fix style
* copy sync ok
* fix compile error
* fix compile error
* api done
* update python async api
* fix compile
* remove async python api; add c++ async unittest
* remove python async api
* update unittest
* update unittest
* add C++ unittest for copytensor
* add unittest
* update namespace utils to class TensorUtils
* add unittest
* update unittest
* update unittest
* update code style
* update code style
* update unittest
1 parent 223c01f commit ac33c0c

11 files changed: +710 −11 lines

cmake/configure.cmake

Lines changed: 4 additions & 0 deletions
@@ -20,6 +20,10 @@ if(WITH_TESTING)
   add_definitions(-DPADDLE_WITH_TESTING)
 endif(WITH_TESTING)
 
+if(WITH_INFERENCE_API_TEST)
+  add_definitions(-DPADDLE_WITH_INFERENCE_API_TEST)
+endif(WITH_INFERENCE_API_TEST)
+
 if(NOT WITH_PROFILER)
   add_definitions(-DPADDLE_DISABLE_PROFILER)
 endif(NOT WITH_PROFILER)
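For orientation, the new definition is consumed from C++ through the preprocessor. A minimal, purely illustrative sketch; the code actually guarded by this flag elsewhere in the tree is not part of this hunk:

```cpp
// Hypothetical consumer of the definition added above; this branch is
// compiled only when CMake is configured with -DWITH_INFERENCE_API_TEST=ON.
#ifdef PADDLE_WITH_INFERENCE_API_TEST
// e.g. test-only hooks for the inference API unit tests
#endif
```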

paddle/fluid/inference/api/CMakeLists.txt

Lines changed: 3 additions & 2 deletions
@@ -28,14 +28,15 @@ if(WITH_MKLDNN)
 endif()
 
 cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder table_printer)
+cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tensor)
 cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
 
 if(WITH_CRYPTO)
   cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array
-    analysis_config zero_copy_tensor trainer_desc_proto paddle_crypto custom_operator)
+    analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto paddle_crypto custom_operator)
 else()
   cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array
-    analysis_config zero_copy_tensor trainer_desc_proto custom_operator)
+    analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator)
 endif()
 
 if(WIN32)
paddle/fluid/inference/api/details/zero_copy_tensor.cc

Lines changed: 65 additions & 8 deletions
@@ -121,6 +121,8 @@ DataType Tensor::type() const {
   return DataType::FLOAT32;
 }
 
+PlaceType Tensor::place() const { return place_; }
+
 template <typename T>
 void Tensor::CopyFromCpu(const T *data) {
   EAGER_GET_TENSOR;
@@ -185,7 +187,8 @@ void Tensor::CopyFromCpu(const T *data) {
 }
 
 template <typename T>
-void Tensor::CopyToCpu(T *data) {
+void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
+                           void *cb_params) const {
   EAGER_GET_TENSOR;
   auto ele_num = tensor->numel();
   auto *t_data = tensor->data<T>();
@@ -222,7 +225,16 @@ void Tensor::CopyToCpu(T *data) {
 #ifdef PADDLE_WITH_HIP
     hipStreamSynchronize(dev_ctx->stream());
 #else
-    cudaStreamSynchronize(dev_ctx->stream());
+    // async, return stream
+    if (nullptr != exec_stream) {
+      *(static_cast<cudaStream_t *>(exec_stream)) = dev_ctx->stream();
+      // async with callback
+    } else if (cb) {
+      cudaLaunchHostFunc(dev_ctx->stream(), cb, cb_params);
+      // sync
+    } else {
+      cudaStreamSynchronize(dev_ctx->stream());
+    }
 #endif
 #else
     PADDLE_THROW(paddle::platform::errors::Unavailable(
@@ -261,19 +273,61 @@ void Tensor::CopyToCpu(T *data) {
         "The analysis predictor supports CPU, GPU, NPU and XPU now."));
   }
 }
+
+template <typename T>
+void Tensor::CopyToCpu(T *data) const {
+  CopyToCpuImpl<T>(data, nullptr, nullptr, nullptr);
+}
+
+template <typename T>
+void Tensor::CopyToCpuAsync(T *data, void *exec_stream) const {
+  CopyToCpuImpl<T>(data, exec_stream, nullptr, nullptr);
+}
+
+template <typename T>
+void Tensor::CopyToCpuAsync(T *data, CallbackFunc cb, void *cb_params) const {
+  CopyToCpuImpl<T>(data, nullptr, cb, cb_params);
+}
+
 template PD_INFER_DECL void Tensor::CopyFromCpu<float>(const float *data);
 template PD_INFER_DECL void Tensor::CopyFromCpu<int64_t>(const int64_t *data);
 template PD_INFER_DECL void Tensor::CopyFromCpu<int32_t>(const int32_t *data);
 template PD_INFER_DECL void Tensor::CopyFromCpu<uint8_t>(const uint8_t *data);
 template PD_INFER_DECL void Tensor::CopyFromCpu<int8_t>(const int8_t *data);
 template PD_INFER_DECL void Tensor::CopyFromCpu<float16>(const float16 *data);
 
-template PD_INFER_DECL void Tensor::CopyToCpu<float>(float *data);
-template PD_INFER_DECL void Tensor::CopyToCpu<int64_t>(int64_t *data);
-template PD_INFER_DECL void Tensor::CopyToCpu<int32_t>(int32_t *data);
-template PD_INFER_DECL void Tensor::CopyToCpu<uint8_t>(uint8_t *data);
-template PD_INFER_DECL void Tensor::CopyToCpu<int8_t>(int8_t *data);
-template PD_INFER_DECL void Tensor::CopyToCpu<float16>(float16 *data);
+template PD_INFER_DECL void Tensor::CopyToCpu<float>(float *data) const;
+template PD_INFER_DECL void Tensor::CopyToCpu<int64_t>(int64_t *data) const;
+template PD_INFER_DECL void Tensor::CopyToCpu<int32_t>(int32_t *data) const;
+template PD_INFER_DECL void Tensor::CopyToCpu<uint8_t>(uint8_t *data) const;
+template PD_INFER_DECL void Tensor::CopyToCpu<int8_t>(int8_t *data) const;
+template PD_INFER_DECL void Tensor::CopyToCpu<float16>(float16 *data) const;
+
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<float>(
+    float *data, void *exec_stream) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<int64_t>(
+    int64_t *data, void *exec_stream) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<int32_t>(
+    int32_t *data, void *exec_stream) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<uint8_t>(
+    uint8_t *data, void *exec_stream) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<int8_t>(
+    int8_t *data, void *exec_stream) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<float16>(
+    float16 *data, void *exec_stream) const;
+
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<float>(
+    float *data, CallbackFunc cb, void *cb_params) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<int64_t>(
+    int64_t *data, CallbackFunc cb, void *cb_params) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<int32_t>(
+    int32_t *data, CallbackFunc cb, void *cb_params) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<uint8_t>(
+    uint8_t *data, CallbackFunc cb, void *cb_params) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<int8_t>(
+    int8_t *data, CallbackFunc cb, void *cb_params) const;
+template PD_INFER_DECL void Tensor::CopyToCpuAsync<float16>(
+    float16 *data, CallbackFunc cb, void *cb_params) const;
 
 template PD_INFER_DECL float *Tensor::data<float>(PlaceType *place,
                                                   int *size) const;
@@ -285,12 +339,15 @@ template PD_INFER_DECL uint8_t *Tensor::data<uint8_t>(PlaceType *place,
                                                       int *size) const;
 template PD_INFER_DECL int8_t *Tensor::data<int8_t>(PlaceType *place,
                                                     int *size) const;
+template PD_INFER_DECL float16 *Tensor::data<float16>(PlaceType *place,
+                                                      int *size) const;
 
 template PD_INFER_DECL float *Tensor::mutable_data<float>(PlaceType place);
 template PD_INFER_DECL int64_t *Tensor::mutable_data<int64_t>(PlaceType place);
 template PD_INFER_DECL int32_t *Tensor::mutable_data<int32_t>(PlaceType place);
 template PD_INFER_DECL uint8_t *Tensor::mutable_data<uint8_t>(PlaceType place);
 template PD_INFER_DECL int8_t *Tensor::mutable_data<int8_t>(PlaceType place);
+template PD_INFER_DECL float16 *Tensor::mutable_data<float16>(PlaceType place);
 
 Tensor::Tensor(void *scope) : scope_{scope} {
   PADDLE_ENFORCE_NOT_NULL(scope_,
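The hunk above splits the old blocking `CopyToCpu` into a shared `CopyToCpuImpl` with three front ends: the original synchronous copy, an async variant that hands back the CUDA stream the copy was enqueued on, and an async variant that fires a host callback when the copy completes. A minimal usage sketch, assuming the usual `paddle_infer` predictor workflow; the output name and predictor setup are placeholders, and `CallbackFunc` is taken to be compatible with a `void(void*)` host function, as its use with `cudaLaunchHostFunc` suggests:

```cpp
#include <cuda_runtime.h>

#include <vector>

#include "paddle/fluid/inference/api/paddle_inference_api.h"

// Fires on a CUDA driver thread once the device-to-host copy has completed.
static void OnCopyDone(void* /*cb_params*/) {
  // The host buffer handed to CopyToCpuAsync is safe to read from here on.
}

// Assumes the output tensor lives on the GPU; for CPU tensors the copy
// completes synchronously and no stream is handed back.
void FetchOutputWithoutBlocking(paddle_infer::Predictor* predictor) {
  auto out = predictor->GetOutputHandle("output_name");  // placeholder name
  int numel = 1;
  for (int d : out->shape()) numel *= d;
  std::vector<float> host(numel);

  // Variant 1: receive the CUDA stream the copy was enqueued on and
  // synchronize only when the result is actually needed.
  cudaStream_t stream;
  out->CopyToCpuAsync(host.data(), static_cast<void*>(&stream));
  // ... overlap other host-side work here ...
  cudaStreamSynchronize(stream);

  // Variant 2 (alternative): let a host callback signal completion instead.
  out->CopyToCpuAsync(host.data(), OnCopyDone, /*cb_params=*/nullptr);
}
```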
paddle/fluid/inference/api/paddle_infer_contrib.cc (new file)

Lines changed: 190 additions & 0 deletions

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/inference/api/paddle_infer_contrib.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"

namespace paddle_infer {
namespace contrib {

using paddle::PaddleDType;

void* TensorUtils::CudaMallocPinnedMemory(size_t size) {
#if defined(PADDLE_WITH_CUDA)
  void* ptr = nullptr;
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaMallocHost(&ptr, size));
  return ptr;
#else
  return nullptr;
#endif
}

void TensorUtils::CudaFreePinnedMemory(void* ptr) {
#if defined(PADDLE_WITH_CUDA)
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaFreeHost(ptr));
#endif
}

void TensorUtils::CopyTensorImpl(Tensor* p_dst, const Tensor& src,
                                 void* exec_stream, CallbackFunc cb,
                                 void* cb_params) {
  Tensor& dst = *p_dst;
  dst.Reshape(src.shape());
  PADDLE_ENFORCE(
      src.place() == PlaceType::kCPU || src.place() == PlaceType::kGPU,
      paddle::platform::errors::InvalidArgument(
          "CopyTensor only support PlaceType kCPU/kGPU now."));
  PADDLE_ENFORCE(
      dst.place() == PlaceType::kCPU || dst.place() == PlaceType::kGPU,
      paddle::platform::errors::InvalidArgument(
          "CopyTensor only support PlaceType kCPU/kGPU now."));
  // copy to cpu, gpu => cpu or cpu => cpu
  if (dst.place() == PlaceType::kCPU) {
    switch (src.type()) {
      case PaddleDType::INT32:
        src.CopyToCpuImpl(dst.mutable_data<int32_t>(PlaceType::kCPU),
                          exec_stream, cb, cb_params);
        break;
      case PaddleDType::INT64:
        src.CopyToCpuImpl(dst.mutable_data<int64_t>(PlaceType::kCPU),
                          exec_stream, cb, cb_params);
        break;
      case PaddleDType::FLOAT32:
        src.CopyToCpuImpl(dst.mutable_data<float>(PlaceType::kCPU), exec_stream,
                          cb, cb_params);
        break;
      case PaddleDType::UINT8:
        src.CopyToCpuImpl(dst.mutable_data<uint8_t>(PlaceType::kCPU),
                          exec_stream, cb, cb_params);
        break;
      case PaddleDType::INT8:
        src.CopyToCpuImpl(dst.mutable_data<int8_t>(PlaceType::kCPU),
                          exec_stream, cb, cb_params);
        break;
      case PaddleDType::FLOAT16:
        src.CopyToCpuImpl(
            dst.mutable_data<paddle::platform::float16>(PlaceType::kCPU),
            exec_stream, cb, cb_params);
        break;
      default:
        PADDLE_THROW(paddle::platform::errors::Unimplemented(
            "Only INT32, INT64, UINT8, INT8, FLOAT16 and "
            "FLOAT32 is supported in Tensor. Others not implements"));
    }
    // gpu => gpu or cpu => gpu
  } else {
#if defined(PADDLE_WITH_CUDA)
    void* dst_data = nullptr;
    void* src_data = nullptr;
    size_t data_len = 0;
    int data_size = 0;
    PlaceType src_place;
    switch (src.type()) {
      case PaddleDType::INT32:
        dst_data =
            static_cast<void*>(dst.mutable_data<int32_t>(PlaceType::kGPU));
        src_data =
            static_cast<void*>(src.data<int32_t>(&src_place, &data_size));
        data_len = data_size * sizeof(int32_t);
        break;
      case PaddleDType::INT64:
        dst_data =
            static_cast<void*>(dst.mutable_data<int64_t>(PlaceType::kGPU));
        src_data =
            static_cast<void*>(src.data<int64_t>(&src_place, &data_size));
        data_len = data_size * sizeof(int64_t);
        break;
      case PaddleDType::FLOAT32:
        dst_data = static_cast<void*>(dst.mutable_data<float>(PlaceType::kGPU));
        src_data = static_cast<void*>(src.data<float>(&src_place, &data_size));
        data_len = data_size * sizeof(float);
        break;
      case PaddleDType::UINT8:
        dst_data =
            static_cast<void*>(dst.mutable_data<uint8_t>(PlaceType::kGPU));
        src_data =
            static_cast<void*>(src.data<uint8_t>(&src_place, &data_size));
        data_len = data_size * sizeof(uint8_t);
        break;
      case PaddleDType::INT8:
        dst_data =
            static_cast<void*>(dst.mutable_data<int8_t>(PlaceType::kGPU));
        src_data = static_cast<void*>(src.data<int8_t>(&src_place, &data_size));
        data_len = data_size * sizeof(int8_t);
        break;
      case PaddleDType::FLOAT16:
        dst_data = static_cast<void*>(
            dst.mutable_data<paddle::platform::float16>(PlaceType::kGPU));
        src_data = static_cast<void*>(
            src.data<paddle::platform::float16>(&src_place, &data_size));
        data_len = data_size * 2;
        break;
      default:
        PADDLE_THROW(paddle::platform::errors::Unimplemented(
            "Only INT32, INT64, UINT8, INT8, FLOAT16 and "
            "FLOAT32 is supported in Tensor. Others not implements"));
    }

    paddle::platform::DeviceContextPool& pool =
        paddle::platform::DeviceContextPool::Instance();
    paddle::platform::CUDAPlace gpu_place(dst.device_);
    auto* dev_ctx = static_cast<const paddle::platform::CUDADeviceContext*>(
        pool.Get(gpu_place));

    if (src.place() == PlaceType::kCPU) {
      paddle::memory::Copy(gpu_place, static_cast<void*>(dst_data),
                           paddle::platform::CPUPlace(), src_data, data_len,
                           dev_ctx->stream());
    } else {
      paddle::memory::Copy(gpu_place, static_cast<void*>(dst_data),
                           paddle::platform::CUDAPlace(), src_data, data_len,
                           dev_ctx->stream());
    }

    if (nullptr != exec_stream) {
      *(static_cast<cudaStream_t*>(exec_stream)) = dev_ctx->stream();
    } else if (cb) {
      cudaLaunchHostFunc(dev_ctx->stream(), cb, cb_params);
    } else {
      cudaStreamSynchronize(dev_ctx->stream());
    }
#else
    PADDLE_THROW(paddle::platform::errors::Unavailable(
        "Can not copy tensor to GPU CUDA place because paddle is not compiled "
        "with CUDA."));
#endif
  }
  return;
}

void TensorUtils::CopyTensor(Tensor* p_dst, const Tensor& src) {
  CopyTensorImpl(p_dst, src, nullptr, nullptr, nullptr);
}

void TensorUtils::CopyTensorAsync(Tensor* p_dst, const Tensor& src,
                                  void* exec_stream) {
  CopyTensorImpl(p_dst, src, exec_stream, nullptr, nullptr);
}

void TensorUtils::CopyTensorAsync(Tensor* p_dst, const Tensor& src,
                                  CallbackFunc cb, void* cb_params) {
  CopyTensorImpl(p_dst, src, nullptr, cb, cb_params);
}

}  // namespace contrib
}  // namespace paddle_infer
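The new contrib source centralizes tensor-to-tensor copies: `CopyTensorImpl` reshapes the destination, dispatches on the source dtype, and routes GPU-bound copies through `paddle::memory::Copy` on the destination device's stream, with the same sync / stream-return / callback choices as `CopyToCpuImpl`. A sketch of how the public wrappers might be used to chain two predictors; the predictor handles and tensor names are placeholders, not part of this commit:

```cpp
#include "paddle/fluid/inference/api/paddle_infer_contrib.h"

// Chain two predictors by copying the producer's output tensor directly into
// the consumer's input tensor; TensorUtils dispatches on dtype and placement.
// `producer`/`consumer` are assumed to be already-configured predictors.
void ChainPredictors(paddle_infer::Predictor* producer,
                     paddle_infer::Predictor* consumer) {
  auto src = producer->GetOutputHandle("producer_out");  // placeholder name
  auto dst = consumer->GetInputHandle("consumer_in");    // placeholder name

  // Blocking copy: returns once the data has landed in `dst`.
  paddle_infer::contrib::TensorUtils::CopyTensor(dst.get(), *src);

  // Non-blocking alternative for device-resident tensors: the copy is enqueued
  // on the destination device's stream, which is handed back to the caller.
  // cudaStream_t stream;
  // paddle_infer::contrib::TensorUtils::CopyTensorAsync(
  //     dst.get(), *src, static_cast<void*>(&stream));
  // cudaStreamSynchronize(stream);
}
```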
paddle/fluid/inference/api/paddle_infer_contrib.h (new file)

Lines changed: 40 additions & 0 deletions

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/inference/api/paddle_inference_api.h"

namespace paddle_infer {
namespace contrib {

class TensorUtils {
 public:
  static void* CudaMallocPinnedMemory(size_t size);
  static void CudaFreePinnedMemory(void* mem);

  static void CopyTensor(Tensor* p_dst, const Tensor& src);
  static void CopyTensorAsync(Tensor* p_dst, const Tensor& src,
                              void* exec_stream);
  static void CopyTensorAsync(Tensor* p_dst, const Tensor& src, CallbackFunc cb,
                              void* cb_params);

 private:
  static void CopyTensorImpl(Tensor* p_dst, const Tensor& src,
                             void* exec_stream, CallbackFunc cb,
                             void* cb_params);
};

}  // namespace contrib
}  // namespace paddle_infer
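The header also exposes the pinned-memory helpers alongside the copy utilities. Page-locked host buffers are what let the underlying asynchronous device-to-host transfer truly overlap with host work, so a natural pairing is `CudaMallocPinnedMemory` with `CopyToCpuAsync`. A sketch under the same assumptions as above (GPU-resident output, placeholder names):

```cpp
#include <cuda_runtime.h>

#include "paddle/fluid/inference/api/paddle_infer_contrib.h"

// Copy a GPU-resident output into a page-locked host buffer while the caller
// keeps doing other work; the output name is a placeholder.
void CopyOutputToPinnedHost(paddle_infer::Predictor* predictor) {
  auto out = predictor->GetOutputHandle("output_name");
  size_t numel = 1;
  for (int d : out->shape()) numel *= static_cast<size_t>(d);

  // Pinned allocation so the async copy is not silently staged/synchronous.
  float* pinned = static_cast<float*>(
      paddle_infer::contrib::TensorUtils::CudaMallocPinnedMemory(
          numel * sizeof(float)));

  cudaStream_t stream;
  out->CopyToCpuAsync(pinned, static_cast<void*>(&stream));
  // ... other host-side work while the copy is in flight ...
  cudaStreamSynchronize(stream);

  paddle_infer::contrib::TensorUtils::CudaFreePinnedMemory(pinned);
}
```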
