From c40461a412fc24a2a20536df9ece0b0e70c2f8cf Mon Sep 17 00:00:00 2001
From: wang-xinyu
Date: Thu, 20 Oct 2022 15:49:18 +0800
Subject: [PATCH 1/5] TRT backend use pinned memory

---
 fastdeploy/backends/tensorrt/trt_backend.cc | 40 +++++++++++++--------
 fastdeploy/backends/tensorrt/trt_backend.h  |  5 +--
 fastdeploy/backends/tensorrt/utils.h        |  2 ++
 fastdeploy/core/allocate.cc                 |  6 ++++
 fastdeploy/core/allocate.h                  | 10 ++++++
 fastdeploy/core/fd_tensor.cc                | 36 +++++++++++++++++--
 fastdeploy/core/fd_tensor.h                 |  4 +++
 7 files changed, 84 insertions(+), 19 deletions(-)

diff --git a/fastdeploy/backends/tensorrt/trt_backend.cc b/fastdeploy/backends/tensorrt/trt_backend.cc
index 395215db030..f7e05f8680e 100644
--- a/fastdeploy/backends/tensorrt/trt_backend.cc
+++ b/fastdeploy/backends/tensorrt/trt_backend.cc
@@ -306,17 +306,21 @@ bool TrtBackend::Infer(std::vector<FDTensor>& inputs,
   SetInputs(inputs);
   AllocateOutputsBuffer(outputs);
+
   if (!context_->enqueueV2(bindings_.data(), stream_, nullptr)) {
     FDERROR << "Failed to Infer with TensorRT." << std::endl;
     return false;
   }
   for (size_t i = 0; i < outputs->size(); ++i) {
-    FDASSERT(cudaMemcpyAsync((*outputs)[i].Data(),
-                             outputs_buffer_[(*outputs)[i].name].data(),
+    FDASSERT(cudaMemcpyAsync(outputs_pinned_buffer_[(*outputs)[i].name].data(),
+                             outputs_device_buffer_[(*outputs)[i].name].data(),
                              (*outputs)[i].Nbytes(), cudaMemcpyDeviceToHost,
                              stream_) == 0,
              "[ERROR] Error occurs while copying memory from GPU to CPU.");
   }
+  FDASSERT(cudaStreamSynchronize(stream_) == cudaSuccess,
+           "[ERROR] Error occurs while synchronizing the CUDA stream.");
+
   return true;
 }

@@ -332,10 +336,11 @@ void TrtBackend::GetInputOutputInfo() {
     auto dtype = engine_->getBindingDataType(i);
     if (engine_->bindingIsInput(i)) {
       inputs_desc_.emplace_back(TrtValueInfo{name, shape, dtype});
-      inputs_buffer_[name] = FDDeviceBuffer(dtype);
+      inputs_device_buffer_[name] = FDDeviceBuffer(dtype);
     } else {
       outputs_desc_.emplace_back(TrtValueInfo{name, shape, dtype});
-      outputs_buffer_[name] = FDDeviceBuffer(dtype);
+      outputs_device_buffer_[name] = FDDeviceBuffer(dtype);
+      outputs_pinned_buffer_[name] = FDDeviceHostBuffer(dtype);
     }
   }
   bindings_.resize(num_binds);
@@ -357,30 +362,31 @@ void TrtBackend::SetInputs(const std::vector<FDTensor>& inputs) {
                  "please use INT32 input");
       } else {
         // no copy
-        inputs_buffer_[item.name].SetExternalData(dims, item.Data());
+        inputs_device_buffer_[item.name].SetExternalData(dims, item.Data());
       }
     } else {
       // Allocate input buffer memory
-      inputs_buffer_[item.name].resize(dims);
+      inputs_device_buffer_[item.name].resize(dims);
       // copy from cpu to gpu
       if (item.dtype == FDDataType::INT64) {
         int64_t* data = static_cast<int64_t*>(const_cast<void*>(item.Data()));
         std::vector<int32_t> casted_data(data, data + item.Numel());
-        FDASSERT(cudaMemcpyAsync(inputs_buffer_[item.name].data(),
+        FDASSERT(cudaMemcpyAsync(inputs_device_buffer_[item.name].data(),
                                  static_cast<void*>(casted_data.data()),
                                  item.Nbytes() / 2, cudaMemcpyHostToDevice,
                                  stream_) == 0,
                  "Error occurs while copying memory from CPU to GPU.");
       } else {
-        FDASSERT(cudaMemcpyAsync(inputs_buffer_[item.name].data(), item.Data(),
+        FDASSERT(cudaMemcpyAsync(inputs_device_buffer_[item.name].data(),
+                                 item.Data(),
                                  item.Nbytes(), cudaMemcpyHostToDevice,
                                  stream_) == 0,
                  "Error occurs while copying memory from CPU to GPU.");
       }
     }
     // binding input buffer
-    bindings_[idx] = inputs_buffer_[item.name].data();
+    bindings_[idx] = inputs_device_buffer_[item.name].data();
   }
 }

@@ -399,15 +405,21 @@ void TrtBackend::AllocateOutputsBuffer(std::vector<FDTensor>* outputs) {
              "Cannot find output: %s of tensorrt network from the original model.",
              outputs_desc_[i].name.c_str());
     auto ori_idx = iter->second;
+
+    // Allocate output buffer memory
+    outputs_device_buffer_[outputs_desc_[i].name].resize(output_dims);
+    outputs_pinned_buffer_[outputs_desc_[i].name].resize(output_dims);
+
     // set user's outputs info
     std::vector<int64_t> shape(output_dims.d,
                                output_dims.d + output_dims.nbDims);
-    (*outputs)[ori_idx].Resize(shape, GetFDDataType(outputs_desc_[i].dtype),
-                               outputs_desc_[i].name);
-    // Allocate output buffer memory
-    outputs_buffer_[outputs_desc_[i].name].resize(output_dims);
+    (*outputs)[ori_idx].SetExternalData(
+        shape, GetFDDataType(outputs_desc_[i].dtype),
+        outputs_pinned_buffer_[outputs_desc_[i].name].data());
+    (*outputs)[ori_idx].name = outputs_desc_[i].name;
+    (*outputs)[ori_idx].is_pinned_memory = true;
+
     // binding output buffer
-    bindings_[idx] = outputs_buffer_[outputs_desc_[i].name].data();
+    bindings_[idx] = outputs_device_buffer_[outputs_desc_[i].name].data();
   }
 }
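For context on why this hunk also adds a cudaStreamSynchronize(): cudaMemcpyAsync() into ordinary pageable host memory silently degrades to a synchronous copy, while a page-locked destination lets the copy run truly asynchronously on the stream, so an explicit synchronization point is then needed before the host reads the outputs. A minimal standalone CUDA sketch of the pattern these hunks adopt (illustrative names, not FastDeploy code):

    // Standalone CUDA/C++ sketch; error handling trimmed.
    #include <cuda_runtime.h>
    #include <cstddef>

    int main() {
      const size_t nbytes = 1 << 20;
      void* device_buf = nullptr;
      void* pinned_host_buf = nullptr;
      cudaStream_t stream;
      cudaStreamCreate(&stream);
      cudaMalloc(&device_buf, nbytes);
      // cudaMallocHost() returns page-locked memory, so the async copy below
      // can actually run asynchronously on the stream.
      cudaMallocHost(&pinned_host_buf, nbytes);
      cudaMemcpyAsync(pinned_host_buf, device_buf, nbytes,
                      cudaMemcpyDeviceToHost, stream);
      // The copy is only guaranteed complete after the stream synchronizes,
      // which is why the patch adds cudaStreamSynchronize() after the loop.
      cudaStreamSynchronize(stream);
      cudaFreeHost(pinned_host_buf);
      cudaFree(device_buf);
      cudaStreamDestroy(stream);
      return 0;
    }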
diff --git a/fastdeploy/backends/tensorrt/trt_backend.h b/fastdeploy/backends/tensorrt/trt_backend.h
index ad3ace6a438..17dc119d9d6 100755
--- a/fastdeploy/backends/tensorrt/trt_backend.h
+++ b/fastdeploy/backends/tensorrt/trt_backend.h
@@ -118,8 +118,9 @@ class TrtBackend : public BaseBackend {
   std::vector<void*> bindings_;
   std::vector<TrtValueInfo> inputs_desc_;
   std::vector<TrtValueInfo> outputs_desc_;
-  std::map<std::string, FDDeviceBuffer> inputs_buffer_;
-  std::map<std::string, FDDeviceBuffer> outputs_buffer_;
+  std::map<std::string, FDDeviceBuffer> inputs_device_buffer_;
+  std::map<std::string, FDDeviceBuffer> outputs_device_buffer_;
+  std::map<std::string, FDDeviceHostBuffer> outputs_pinned_buffer_;

   std::string calibration_str_;

diff --git a/fastdeploy/backends/tensorrt/utils.h b/fastdeploy/backends/tensorrt/utils.h
index f7623052607..7f2e7344bad 100644
--- a/fastdeploy/backends/tensorrt/utils.h
+++ b/fastdeploy/backends/tensorrt/utils.h
@@ -206,6 +206,8 @@ class FDGenericBuffer {
 };

 using FDDeviceBuffer = FDGenericBuffer<FDDeviceAllocator, FDDeviceFree>;
+using FDDeviceHostBuffer =
+    FDGenericBuffer<FDDeviceHostAllocator, FDDeviceHostFree>;

 class FDTrtLogger : public nvinfer1::ILogger {
  public:

diff --git a/fastdeploy/core/allocate.cc b/fastdeploy/core/allocate.cc
index 285642d5ce3..e71cd34434a 100644
--- a/fastdeploy/core/allocate.cc
+++ b/fastdeploy/core/allocate.cc
@@ -34,6 +34,12 @@ bool FDDeviceAllocator::operator()(void** ptr, size_t size) const {

 void FDDeviceFree::operator()(void* ptr) const { cudaFree(ptr); }

+bool FDDeviceHostAllocator::operator()(void** ptr, size_t size) const {
+  return cudaMallocHost(ptr, size) == cudaSuccess;
+}
+
+void FDDeviceHostFree::operator()(void* ptr) const { cudaFreeHost(ptr); }
+
 #endif

 }  // namespace fastdeploy

diff --git a/fastdeploy/core/allocate.h b/fastdeploy/core/allocate.h
index c48bb7cee3c..1e88787f4d6 100644
--- a/fastdeploy/core/allocate.h
+++ b/fastdeploy/core/allocate.h
@@ -45,6 +45,16 @@ class FASTDEPLOY_DECL FDDeviceFree {
   void operator()(void* ptr) const;
 };

+class FASTDEPLOY_DECL FDDeviceHostAllocator {
+ public:
+  bool operator()(void** ptr, size_t size) const;
+};
+
+class FASTDEPLOY_DECL FDDeviceHostFree {
+ public:
+  void operator()(void* ptr) const;
+};
+
 #endif

 }  // namespace fastdeploy
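The new FDDeviceHostAllocator/FDDeviceHostFree pair plugs into FDGenericBuffer exactly like the existing device pair. A self-contained sketch of how such stateless functor pairs compose into an RAII buffer; the wrapper below is a simplification for illustration, not the FDGenericBuffer implementation:

    #include <cuda_runtime.h>
    #include <cstddef>

    struct PinnedAllocator {
      bool operator()(void** ptr, size_t size) const {
        return cudaMallocHost(ptr, size) == cudaSuccess;
      }
    };

    struct PinnedFree {
      void operator()(void* ptr) const { cudaFreeHost(ptr); }
    };

    // Allocation policy is injected via template parameters, so one buffer
    // class serves device, pinned-host, and any future memory kind.
    template <typename Alloc, typename Free>
    class GenericBuffer {
     public:
      explicit GenericBuffer(size_t size = 0) : size_(size), buf_(nullptr) {
        if (size_ > 0 && !Alloc()(&buf_, size_)) {
          buf_ = nullptr;  // allocation failed
        }
      }
      ~GenericBuffer() { Free()(buf_); }  // cudaFreeHost(nullptr) is a no-op
      void* data() const { return buf_; }
      size_t size() const { return size_; }

     private:
      size_t size_;
      void* buf_;
    };

    using PinnedBuffer = GenericBuffer<PinnedAllocator, PinnedFree>;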
diff --git a/fastdeploy/core/fd_tensor.cc b/fastdeploy/core/fd_tensor.cc
index 1161d2b0e2d..14e25a440bf 100644
--- a/fastdeploy/core/fd_tensor.cc
+++ b/fastdeploy/core/fd_tensor.cc
@@ -191,6 +191,23 @@ void FDTensor::PrintInfo(const std::string& prefix) {
 }

 bool FDTensor::ReallocFn(size_t nbytes) {
+  if (is_pinned_memory) {
+#ifdef WITH_GPU
+    size_t original_nbytes = Nbytes();
+    if (nbytes > original_nbytes) {
+      if (buffer_ != nullptr) {
+        FDDeviceHostFree()(buffer_);
+      }
+      FDDeviceHostAllocator()(&buffer_, nbytes);
+    }
+    return buffer_ != nullptr;
+#else
+    FDASSERT(false,
+             "The FastDeploy FDTensor allocator was not compiled with "
+             "-DWITH_GPU=ON, so this is an unexpected problem.");
+#endif
+  }
   if (device == Device::GPU) {
 #ifdef WITH_GPU
     size_t original_nbytes = Nbytes();
@@ -215,7 +232,11 @@ bool FDTensor::ReallocFn(size_t nbytes) {
 void FDTensor::FreeFn() {
   if (external_data_ptr != nullptr) external_data_ptr = nullptr;
   if (buffer_ != nullptr) {
-    if (device == Device::GPU) {
+    if (is_pinned_memory) {
+#ifdef WITH_GPU
+      FDDeviceHostFree()(buffer_);
+#endif
+    } else if (device == Device::GPU) {
 #ifdef WITH_GPU
       FDDeviceFree()(buffer_);
 #endif
@@ -227,11 +248,20 @@ void FDTensor::FreeFn() {
 }

 void FDTensor::CopyBuffer(void* dst, const void* src, size_t nbytes) {
-  if (device == Device::GPU) {
+  if (is_pinned_memory) {
+#ifdef WITH_GPU
+    FDASSERT(cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToHost) == 0,
+             "[ERROR] Error occurs while copying memory from host to host");
+#else
+    FDASSERT(false,
+             "FastDeploy was not compiled with -DWITH_GPU=ON, so copying "
+             "a pinned buffer is an unexpected problem.");
+#endif
+  } else if (device == Device::GPU) {
 #ifdef WITH_GPU
     FDASSERT(cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToDevice) == 0,
              "[ERROR] Error occurs while copying memory from GPU to GPU");
-
 #else
     FDASSERT(false,
              "FastDeploy was not compiled with -DWITH_GPU=ON, so copying "

diff --git a/fastdeploy/core/fd_tensor.h b/fastdeploy/core/fd_tensor.h
index 7e8bb785199..1619fe27118 100644
--- a/fastdeploy/core/fd_tensor.h
+++ b/fastdeploy/core/fd_tensor.h
@@ -40,6 +40,10 @@ struct FASTDEPLOY_DECL FDTensor {
   // so we can skip data transfer, which may improve the efficiency
   Device device = Device::CPU;

+  // Whether the data buffer is in pinned memory, which is allocated
+  // with cudaMallocHost()
+  bool is_pinned_memory = false;
+
   // if the external data is not on CPU, we use this temporary buffer
   // to transfer data to CPU at some cases we need to visit the
   // other devices' data
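The pinned branch added to ReallocFn() is grow-only: it reallocates only when the requested size exceeds the current capacity, and it discards the old contents rather than copying them (acceptable here, since the buffer is overwritten by the next device-to-host copy anyway). A condensed sketch of that policy, as an illustrative helper rather than FastDeploy API:

    #include <cuda_runtime.h>
    #include <cstddef>

    // Grow-only reallocation of a pinned host buffer. Returns true if the
    // buffer is usable afterwards.
    bool GrowPinned(void** buffer, size_t current_nbytes, size_t nbytes) {
      if (nbytes <= current_nbytes) return *buffer != nullptr;
      if (*buffer != nullptr) {
        cudaFreeHost(*buffer);  // old contents are discarded, not copied
      }
      return cudaMallocHost(buffer, nbytes) == cudaSuccess;
    }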
From 73db80b4f151dca67178a70e17e61323bedd1686 Mon Sep 17 00:00:00 2001
From: wang-xinyu
Date: Thu, 20 Oct 2022 20:52:51 +0800
Subject: [PATCH 2/5] refine fd tensor pinned memory logic

---
 fastdeploy/core/fd_tensor.cc | 77 +++++++++++++++++++-----------------
 1 file changed, 41 insertions(+), 36 deletions(-)

diff --git a/fastdeploy/core/fd_tensor.cc b/fastdeploy/core/fd_tensor.cc
index 14e25a440bf..e98a81e1b78 100644
--- a/fastdeploy/core/fd_tensor.cc
+++ b/fastdeploy/core/fd_tensor.cc
@@ -191,14 +191,14 @@ void FDTensor::PrintInfo(const std::string& prefix) {
 }

 bool FDTensor::ReallocFn(size_t nbytes) {
-  if (is_pinned_memory) {
+  if (device == Device::GPU) {
 #ifdef WITH_GPU
     size_t original_nbytes = Nbytes();
     if (nbytes > original_nbytes) {
       if (buffer_ != nullptr) {
-        FDDeviceHostFree()(buffer_);
+        FDDeviceFree()(buffer_);
       }
-      FDDeviceHostAllocator()(&buffer_, nbytes);
+      FDDeviceAllocator()(&buffer_, nbytes);
     }
     return buffer_ != nullptr;
 #else
@@ -207,58 +207,51 @@ bool FDTensor::ReallocFn(size_t nbytes) {
              "-DWITH_GPU=ON, so this is an unexpected problem.");
 #endif
-  }
-  if (device == Device::GPU) {
+  } else {
+    if (is_pinned_memory) {
 #ifdef WITH_GPU
-    size_t original_nbytes = Nbytes();
-    if (nbytes > original_nbytes) {
-      if (buffer_ != nullptr) {
-        FDDeviceFree()(buffer_);
+      size_t original_nbytes = Nbytes();
+      if (nbytes > original_nbytes) {
+        if (buffer_ != nullptr) {
+          FDDeviceHostFree()(buffer_);
+        }
+        FDDeviceHostAllocator()(&buffer_, nbytes);
       }
-      FDDeviceAllocator()(&buffer_, nbytes);
-    }
-    return buffer_ != nullptr;
+      return buffer_ != nullptr;
 #else
-    FDASSERT(false,
-             "The FastDeploy FDTensor allocator was not compiled with "
-             "-DWITH_GPU=ON, so this is an unexpected problem.");
+      FDASSERT(false,
+               "The FastDeploy FDTensor allocator was not compiled with "
+               "-DWITH_GPU=ON, so this is an unexpected problem.");
 #endif
+    }
+    buffer_ = realloc(buffer_, nbytes);
+    return buffer_ != nullptr;
   }
-  buffer_ = realloc(buffer_, nbytes);
-  return buffer_ != nullptr;
 }

 void FDTensor::FreeFn() {
   if (external_data_ptr != nullptr) external_data_ptr = nullptr;
   if (buffer_ != nullptr) {
-    if (is_pinned_memory) {
-#ifdef WITH_GPU
-      FDDeviceHostFree()(buffer_);
-#endif
-    } else if (device == Device::GPU) {
+    if (device == Device::GPU) {
 #ifdef WITH_GPU
       FDDeviceFree()(buffer_);
 #endif
     } else {
-      FDHostFree()(buffer_);
+      if (is_pinned_memory) {
+#ifdef WITH_GPU
+        FDDeviceHostFree()(buffer_);
+#endif
+      } else {
+        FDHostFree()(buffer_);
+      }
     }
     buffer_ = nullptr;
   }
 }

 void FDTensor::CopyBuffer(void* dst, const void* src, size_t nbytes) {
-  if (is_pinned_memory) {
-#ifdef WITH_GPU
-    FDASSERT(cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToHost) == 0,
-             "[ERROR] Error occurs while copying memory from host to host");
-#else
-    FDASSERT(false,
-             "FastDeploy was not compiled with -DWITH_GPU=ON, so copying "
-             "a pinned buffer is an unexpected problem.");
-#endif
-  } else if (device == Device::GPU) {
+  if (device == Device::GPU) {
 #ifdef WITH_GPU
     FDASSERT(cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToDevice) == 0,
              "[ERROR] Error occurs while copying memory from GPU to GPU");
@@ -269,7 +262,19 @@ void FDTensor::CopyBuffer(void* dst, const void* src, size_t nbytes) {
              "a GPU buffer is an unexpected problem.");
 #endif
   } else {
-    std::memcpy(dst, src, nbytes);
+    if (is_pinned_memory) {
+#ifdef WITH_GPU
+      FDASSERT(cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToHost) == 0,
+               "[ERROR] Error occurs while copying memory from host to host");
+#else
+      FDASSERT(false,
+               "FastDeploy was not compiled with -DWITH_GPU=ON, so copying "
+               "a pinned buffer is an unexpected problem.");
+#endif
+    } else {
+      std::memcpy(dst, src, nbytes);
+    }
   }
 }
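After this refactor the dispatch order is: device placement first, pinned-ness second, so is_pinned_memory only matters for host tensors. A compact sketch of the resulting allocation policy (mock types; FDASSERT and the WITH_GPU compile guards omitted):

    #include <cuda_runtime.h>
    #include <cstdlib>
    #include <cstddef>

    enum class Device { CPU, GPU };

    // Dispatch sketch of FDTensor::ReallocFn after patch 2.
    bool Realloc(void** buf, size_t old_nbytes, size_t nbytes, Device device,
                 bool is_pinned_memory) {
      if (device == Device::GPU) {          // device memory, grow-only
        if (nbytes > old_nbytes) {
          if (*buf) cudaFree(*buf);
          cudaMalloc(buf, nbytes);
        }
        return *buf != nullptr;
      }
      if (is_pinned_memory) {               // page-locked host memory, grow-only
        if (nbytes > old_nbytes) {
          if (*buf) cudaFreeHost(*buf);
          cudaMallocHost(buf, nbytes);
        }
        return *buf != nullptr;
      }
      *buf = std::realloc(*buf, nbytes);    // ordinary pageable host memory
      return *buf != nullptr;
    }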
From ba054b7f580053620122c14e4101a5991202a602 Mon Sep 17 00:00:00 2001
From: wang-xinyu
Date: Thu, 20 Oct 2022 21:47:21 +0800
Subject: [PATCH 3/5] TRT enable pinned memory configurable

---
 fastdeploy/backends/tensorrt/trt_backend.cc | 17 +++++++----------
 fastdeploy/backends/tensorrt/trt_backend.h  |  2 +-
 fastdeploy/runtime.cc                       |  5 +++++
 fastdeploy/runtime.h                        | 10 ++++++++++
 4 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/fastdeploy/backends/tensorrt/trt_backend.cc b/fastdeploy/backends/tensorrt/trt_backend.cc
index f7e05f8680e..363a9d1ce42 100644
--- a/fastdeploy/backends/tensorrt/trt_backend.cc
+++ b/fastdeploy/backends/tensorrt/trt_backend.cc
@@ -312,7 +312,7 @@ bool TrtBackend::Infer(std::vector<FDTensor>& inputs,
     return false;
   }
   for (size_t i = 0; i < outputs->size(); ++i) {
-    FDASSERT(cudaMemcpyAsync(outputs_pinned_buffer_[(*outputs)[i].name].data(),
+    FDASSERT(cudaMemcpyAsync((*outputs)[i].Data(),
                              outputs_device_buffer_[(*outputs)[i].name].data(),
                              (*outputs)[i].Nbytes(), cudaMemcpyDeviceToHost,
                              stream_) == 0,
@@ -340,7 +340,6 @@ void TrtBackend::GetInputOutputInfo() {
     } else {
       outputs_desc_.emplace_back(TrtValueInfo{name, shape, dtype});
       outputs_device_buffer_[name] = FDDeviceBuffer(dtype);
-      outputs_pinned_buffer_[name] = FDDeviceHostBuffer(dtype);
     }
   }
   bindings_.resize(num_binds);
@@ -406,17 +405,15 @@ void TrtBackend::AllocateOutputsBuffer(std::vector<FDTensor>* outputs) {
              outputs_desc_[i].name.c_str());
     auto ori_idx = iter->second;
-
-    // Allocate output buffer memory
-    outputs_device_buffer_[outputs_desc_[i].name].resize(output_dims);
-    outputs_pinned_buffer_[outputs_desc_[i].name].resize(output_dims);
-
     // set user's outputs info
     std::vector<int64_t> shape(output_dims.d,
                                output_dims.d + output_dims.nbDims);
-    (*outputs)[ori_idx].SetExternalData(
-        shape, GetFDDataType(outputs_desc_[i].dtype),
-        outputs_pinned_buffer_[outputs_desc_[i].name].data());
-    (*outputs)[ori_idx].name = outputs_desc_[i].name;
-    (*outputs)[ori_idx].is_pinned_memory = true;
+    (*outputs)[ori_idx].is_pinned_memory = option_.enable_pinned_memory;
+    (*outputs)[ori_idx].Resize(shape, GetFDDataType(outputs_desc_[i].dtype),
+                               outputs_desc_[i].name);
+
+    // Allocate output buffer memory
+    outputs_device_buffer_[outputs_desc_[i].name].resize(output_dims);
     // binding output buffer
     bindings_[idx] = outputs_device_buffer_[outputs_desc_[i].name].data();

diff --git a/fastdeploy/backends/tensorrt/trt_backend.h b/fastdeploy/backends/tensorrt/trt_backend.h
index 17dc119d9d6..09f18b2dff7 100755
--- a/fastdeploy/backends/tensorrt/trt_backend.h
+++ b/fastdeploy/backends/tensorrt/trt_backend.h
@@ -70,6 +70,7 @@ struct TrtBackendOption {
   std::map<std::string, std::vector<int32_t>> min_shape;
   std::map<std::string, std::vector<int32_t>> opt_shape;
   std::string serialize_file = "";
+  bool enable_pinned_memory = false;

   // inside parameter, maybe remove next version
   bool remove_multiclass_nms_ = false;
@@ -120,7 +121,6 @@ class TrtBackend : public BaseBackend {
   std::vector<TrtValueInfo> outputs_desc_;
   std::map<std::string, FDDeviceBuffer> inputs_device_buffer_;
   std::map<std::string, FDDeviceBuffer> outputs_device_buffer_;
-  std::map<std::string, FDDeviceHostBuffer> outputs_pinned_buffer_;

   std::string calibration_str_;

diff --git a/fastdeploy/runtime.cc b/fastdeploy/runtime.cc
index 0877402d727..561bab97352 100755
--- a/fastdeploy/runtime.cc
+++ b/fastdeploy/runtime.cc
@@ -356,6 +356,10 @@ void RuntimeOption::EnableTrtFP16() { trt_enable_fp16 = true; }

 void RuntimeOption::DisableTrtFP16() { trt_enable_fp16 = false; }

+void RuntimeOption::EnableTrtPinnedMemory() { trt_enable_pinned_memory = true; }
+
+void RuntimeOption::DisableTrtPinnedMemory() { trt_enable_pinned_memory = false; }
+
 void RuntimeOption::SetTrtCacheFile(const std::string& cache_file_path) {
   trt_serialize_file = cache_file_path;
 }
@@ -606,6 +610,7 @@ void Runtime::CreateTrtBackend() {
   trt_option.min_shape = option.trt_min_shape;
   trt_option.opt_shape = option.trt_opt_shape;
   trt_option.serialize_file = option.trt_serialize_file;
+  trt_option.enable_pinned_memory = option.trt_enable_pinned_memory;

   // TODO(jiangjiajun): inside usage, maybe remove this later
   trt_option.remove_multiclass_nms_ = option.remove_multiclass_nms_;

diff --git a/fastdeploy/runtime.h b/fastdeploy/runtime.h
index 32ad1615c76..7a3024eadf6 100755
--- a/fastdeploy/runtime.h
+++ b/fastdeploy/runtime.h
@@ -204,6 +204,15 @@ struct FASTDEPLOY_DECL RuntimeOption {
    */
   void SetTrtCacheFile(const std::string& cache_file_path);

+  /**
+   * @brief Enable pinned memory in the TRT backend
+   */
+  void EnableTrtPinnedMemory();
+
+  /**
+   * @brief Disable pinned memory in the TRT backend
+   */
+  void DisableTrtPinnedMemory();

   /**
    * @brief Enable to collect shape in paddle trt backend
@@ -259,6 +268,7 @@ struct FASTDEPLOY_DECL RuntimeOption {
   bool trt_enable_int8 = false;
   size_t trt_max_batch_size = 32;
   size_t trt_max_workspace_size = 1 << 30;
+  bool trt_enable_pinned_memory = false;

   // ======Only for Poros Backend=======
   bool is_dynamic = false;
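Hypothetical usage of the option introduced by this patch, via the C++ API; model setup is elided, and note that patch 4 below renames the method to EnablePinnedMemory():

    #include "fastdeploy/runtime.h"

    int main() {
      fastdeploy::RuntimeOption option;
      option.UseGpu(0);
      option.UseTrtBackend();
      // Per this patch, pinned output memory is opt-in and off by default:
      option.EnableTrtPinnedMemory();
      // ... set model files and build a fastdeploy::Runtime as usual ...
      return 0;
    }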
From 11dae75c9ae26702ffeffc7e66868ffd8200ebd2 Mon Sep 17 00:00:00 2001
From: wang-xinyu
Date: Fri, 21 Oct 2022 13:27:30 +0800
Subject: [PATCH 4/5] paddle inference support pinned memory

---
 fastdeploy/backends/paddle/paddle_backend.cc | 2 ++
 fastdeploy/backends/paddle/paddle_backend.h  | 2 ++
 fastdeploy/backends/paddle/util.cc           | 2 +-
 fastdeploy/runtime.cc                        | 8 +++++---
 fastdeploy/runtime.h                         | 11 ++++++-----
 5 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/fastdeploy/backends/paddle/paddle_backend.cc b/fastdeploy/backends/paddle/paddle_backend.cc
index 674a3795481..25951dae550 100644
--- a/fastdeploy/backends/paddle/paddle_backend.cc
+++ b/fastdeploy/backends/paddle/paddle_backend.cc
@@ -19,6 +19,7 @@ namespace fastdeploy {

 void PaddleBackend::BuildOption(const PaddleBackendOption& option) {
+  option_ = option;
   if (option.use_gpu) {
     config_.EnableUseGpu(option.gpu_mem_init_size, option.gpu_id);
     if (option.enable_trt) {
@@ -190,6 +191,7 @@ bool PaddleBackend::Infer(std::vector<FDTensor>& inputs,
   outputs->resize(outputs_desc_.size());
   for (size_t i = 0; i < outputs_desc_.size(); ++i) {
     auto handle = predictor_->GetOutputHandle(outputs_desc_[i].name);
+    (*outputs)[i].is_pinned_memory = option_.enable_pinned_memory;
     CopyTensorToCpu(handle, &((*outputs)[i]));
   }
   return true;

diff --git a/fastdeploy/backends/paddle/paddle_backend.h b/fastdeploy/backends/paddle/paddle_backend.h
index 78b939feaea..1d4f53db3d8 100755
--- a/fastdeploy/backends/paddle/paddle_backend.h
+++ b/fastdeploy/backends/paddle/paddle_backend.h
@@ -53,6 +53,7 @@ struct PaddleBackendOption {
   int gpu_mem_init_size = 100;
   // gpu device id
   int gpu_id = 0;
+  bool enable_pinned_memory = false;

   std::vector<std::string> delete_pass_names = {};
 };
@@ -105,6 +106,7 @@ class PaddleBackend : public BaseBackend {
       std::map<std::string, std::vector<int>>* opt_shape) const;
   void SetTRTDynamicShapeToConfig(const PaddleBackendOption& option);
 #endif
+  PaddleBackendOption option_;
   paddle_infer::Config config_;
   std::shared_ptr<paddle_infer::Predictor> predictor_;
   std::vector<TensorInfo> inputs_desc_;

diff --git a/fastdeploy/backends/paddle/util.cc b/fastdeploy/backends/paddle/util.cc
index 216c788b293..924e1af4e31 100644
--- a/fastdeploy/backends/paddle/util.cc
+++ b/fastdeploy/backends/paddle/util.cc
@@ -63,7 +63,7 @@ void CopyTensorToCpu(std::unique_ptr<paddle_infer::Tensor>& tensor,
   std::vector<int64_t> shape;
   auto tmp_shape = tensor->shape();
   shape.assign(tmp_shape.begin(), tmp_shape.end());
-  fd_tensor->Allocate(shape, fd_dtype, tensor->name());
+  fd_tensor->Resize(shape, fd_dtype, tensor->name());
   if (fd_tensor->dtype == FDDataType::FP32) {
     tensor->CopyToCpu(static_cast<float*>(fd_tensor->MutableData()));
     return;
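The ordering in PaddleBackend::Infer() matters: is_pinned_memory is set on the output tensor before CopyTensorToCpu() resizes it, so the reallocation inside Resize() already sees the flag and picks cudaMallocHost() over plain malloc/realloc. A mock of that ordering (simplified types, not FastDeploy code):

    #include <cuda_runtime.h>
    #include <cstdlib>
    #include <cstdio>
    #include <cstddef>

    struct Tensor {
      bool is_pinned_memory = false;
      void* buffer = nullptr;

      // The allocation strategy is chosen from the flag at (re)allocation
      // time, mirroring FDTensor::ReallocFn().
      void Allocate(size_t nbytes) {
        if (is_pinned_memory) {
          cudaMallocHost(&buffer, nbytes);  // page-locked host memory
        } else {
          buffer = std::malloc(nbytes);     // pageable host memory
        }
      }
    };

    int main() {
      Tensor out;
      out.is_pinned_memory = true;  // must be set before allocation
      out.Allocate(1024);
      std::printf("pinned=%d buffer=%p\n", out.is_pinned_memory, out.buffer);
      if (out.is_pinned_memory) cudaFreeHost(out.buffer);
      else std::free(out.buffer);
      return 0;
    }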
diff --git a/fastdeploy/runtime.cc b/fastdeploy/runtime.cc
index 561bab97352..5037dc1206d 100755
--- a/fastdeploy/runtime.cc
+++ b/fastdeploy/runtime.cc
@@ -356,9 +356,9 @@ void RuntimeOption::EnableTrtFP16() { trt_enable_fp16 = true; }

 void RuntimeOption::DisableTrtFP16() { trt_enable_fp16 = false; }

-void RuntimeOption::EnableTrtPinnedMemory() { trt_enable_pinned_memory = true; }
+void RuntimeOption::EnablePinnedMemory() { enable_pinned_memory = true; }

-void RuntimeOption::DisableTrtPinnedMemory() { trt_enable_pinned_memory = false; }
+void RuntimeOption::DisablePinnedMemory() { enable_pinned_memory = false; }

 void RuntimeOption::SetTrtCacheFile(const std::string& cache_file_path) {
   trt_serialize_file = cache_file_path;
@@ -507,6 +507,7 @@ void Runtime::CreatePaddleBackend() {
   pd_option.gpu_id = option.device_id;
   pd_option.delete_pass_names = option.pd_delete_pass_names;
   pd_option.cpu_thread_num = option.cpu_thread_num;
+  pd_option.enable_pinned_memory = option.enable_pinned_memory;
 #ifdef ENABLE_TRT_BACKEND
   if (pd_option.use_gpu && option.pd_enable_trt) {
     pd_option.enable_trt = true;
@@ -520,6 +521,7 @@ void Runtime::CreatePaddleBackend() {
     trt_option.min_shape = option.trt_min_shape;
     trt_option.opt_shape = option.trt_opt_shape;
     trt_option.serialize_file = option.trt_serialize_file;
+    trt_option.enable_pinned_memory = option.enable_pinned_memory;
     pd_option.trt_option = trt_option;
   }
 #endif
@@ -610,7 +612,7 @@ void Runtime::CreateTrtBackend() {
   trt_option.min_shape = option.trt_min_shape;
   trt_option.opt_shape = option.trt_opt_shape;
   trt_option.serialize_file = option.trt_serialize_file;
-  trt_option.enable_pinned_memory = option.trt_enable_pinned_memory;
+  trt_option.enable_pinned_memory = option.enable_pinned_memory;

   // TODO(jiangjiajun): inside usage, maybe remove this later
   trt_option.remove_multiclass_nms_ = option.remove_multiclass_nms_;

diff --git a/fastdeploy/runtime.h b/fastdeploy/runtime.h
index 7a3024eadf6..021103cb21f 100755
--- a/fastdeploy/runtime.h
+++ b/fastdeploy/runtime.h
@@ -205,14 +205,14 @@ struct FASTDEPLOY_DECL RuntimeOption {
   void SetTrtCacheFile(const std::string& cache_file_path);

   /**
-   * @brief Enable pinned memory in the TRT backend
+   * @brief Enable pinned memory. Pinned memory can be utilized to speed up data transfer between CPU and GPU. Currently it is only supported in the TRT backend and the Paddle Inference backend.
    */
-  void EnableTrtPinnedMemory();
+  void EnablePinnedMemory();

   /**
-   * @brief Disable pinned memory in the TRT backend
+   * @brief Disable pinned memory
    */
-  void DisableTrtPinnedMemory();
+  void DisablePinnedMemory();

   /**
    * @brief Enable to collect shape in paddle trt backend
@@ -232,6 +232,8 @@ struct FASTDEPLOY_DECL RuntimeOption {

   Device device = Device::CPU;

+  bool enable_pinned_memory = false;
+
   // ======Only for ORT Backend========
   // -1 means use default value by ort
   // 0: ORT_DISABLE_ALL 1: ORT_ENABLE_BASIC 2: ORT_ENABLE_EXTENDED 3:
@@ -268,7 +270,6 @@ struct FASTDEPLOY_DECL RuntimeOption {
   bool trt_enable_int8 = false;
   size_t trt_max_batch_size = 32;
   size_t trt_max_workspace_size = 1 << 30;
-  bool trt_enable_pinned_memory = false;

   // ======Only for Poros Backend=======
   bool is_dynamic = false;
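With this rename, a single backend-agnostic RuntimeOption flag fans out to each backend's option struct at backend-creation time. A stripped-down sketch of that propagation, using simplified structs rather than the real ones:

    // Illustrative only: real structs carry many more fields.
    struct TrtBackendOption { bool enable_pinned_memory = false; };
    struct PaddleBackendOption { bool enable_pinned_memory = false; };
    struct RuntimeOption { bool enable_pinned_memory = false; };

    void CreateBackends(const RuntimeOption& option) {
      TrtBackendOption trt_option;
      PaddleBackendOption pd_option;
      // One user-facing switch drives both backends:
      trt_option.enable_pinned_memory = option.enable_pinned_memory;
      pd_option.enable_pinned_memory = option.enable_pinned_memory;
      // ... hand the option structs to the backends ...
    }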
From 446a7d6cab8d886773929bd0685eb6b6879e768d Mon Sep 17 00:00:00 2001
From: wang-xinyu
Date: Fri, 21 Oct 2022 14:00:46 +0800
Subject: [PATCH 5/5] pinned memory pybindings

---
 fastdeploy/pybind/runtime.cc |  3 +++
 python/fastdeploy/runtime.py | 10 ++++++++++
 2 files changed, 13 insertions(+)

diff --git a/fastdeploy/pybind/runtime.cc b/fastdeploy/pybind/runtime.cc
index 6d8eb78048a..70f9a5917b1 100755
--- a/fastdeploy/pybind/runtime.cc
+++ b/fastdeploy/pybind/runtime.cc
@@ -44,6 +44,8 @@ void BindRuntime(pybind11::module& m) {
       .def("enable_trt_fp16", &RuntimeOption::EnableTrtFP16)
       .def("disable_trt_fp16", &RuntimeOption::DisableTrtFP16)
       .def("set_trt_cache_file", &RuntimeOption::SetTrtCacheFile)
+      .def("enable_pinned_memory", &RuntimeOption::EnablePinnedMemory)
+      .def("disable_pinned_memory", &RuntimeOption::DisablePinnedMemory)
       .def("enable_paddle_trt_collect_shape",
            &RuntimeOption::EnablePaddleTrtCollectShape)
       .def("disable_paddle_trt_collect_shape",
            &RuntimeOption::DisablePaddleTrtCollectShape)
       .def_readwrite("model_file", &RuntimeOption::model_file)
@@ -200,6 +202,7 @@ void BindRuntime(pybind11::module& m) {
       .def("numel", &FDTensor::Numel)
      .def("nbytes", &FDTensor::Nbytes)
       .def_readwrite("name", &FDTensor::name)
+      .def_readwrite("is_pinned_memory", &FDTensor::is_pinned_memory)
       .def_readonly("shape", &FDTensor::shape)
       .def_readonly("dtype", &FDTensor::dtype)
       .def_readonly("device", &FDTensor::device);

diff --git a/python/fastdeploy/runtime.py b/python/fastdeploy/runtime.py
index 90e64d4005c..61d1039318b 100755
--- a/python/fastdeploy/runtime.py
+++ b/python/fastdeploy/runtime.py
@@ -319,6 +319,16 @@ def disable_trt_fp16(self):
         """
         return self._option.disable_trt_fp16()

+    def enable_pinned_memory(self):
+        """Enable pinned memory. Pinned memory can be utilized to speed up data transfer between CPU and GPU. Currently it is only supported in the TRT backend and the Paddle Inference backend.
+        """
+        return self._option.enable_pinned_memory()
+
+    def disable_pinned_memory(self):
+        """Disable pinned memory.
+        """
+        return self._option.disable_pinned_memory()
+
     def enable_paddle_to_trt(self):
         """While using the TensorRT backend, enable_paddle_to_trt() will switch to the Paddle Inference backend and use its integrated TensorRT instead.
         """
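A hypothetical Python session using the new bindings; model loading and inference are elided, and the method names follow the bindings added above:

    # Sketch only: assumes a GPU build of FastDeploy is installed.
    import fastdeploy as fd

    option = fd.RuntimeOption()
    option.use_gpu(0)
    option.use_trt_backend()
    option.enable_pinned_memory()  # speeds up the GPU -> CPU output copy
    # build a Runtime or model with this option as usual, e.g.:
    # runtime = fd.Runtime(option)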