diff --git a/paddle/fluid/inference/anakin/engine.cc b/paddle/fluid/inference/anakin/engine.cc
index 543ac9d638594a..39be7865149c70 100644
--- a/paddle/fluid/inference/anakin/engine.cc
+++ b/paddle/fluid/inference/anakin/engine.cc
@@ -71,6 +71,9 @@ void AnakinEngine::Execute(
     const std::map &inputs,
     const std::map &outputs,
     cudaStream_t stream) {
+  // NOTE(review): this blocks on the whole device before the inputs are read;
+  // confirm whether synchronizing only `stream` would be sufficient here.
+  cudaDeviceSynchronize();
   for (const auto &input : inputs) {
     auto *tensor = input.second;
     auto *data = tensor->data();
diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
index cf02901d963858..f1bf4caed5d0be 100644
--- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -74,6 +74,22 @@ T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const {
   return res;
 }
 
+// Maps the underlying tensor's element type to the public PaddleDType enum.
+// Only FP32 and INT64 are recognised. Any other type is logged and reported
+// as FLOAT32, so that result alone does not prove the tensor holds floats.
+PaddleDType ZeroCopyTensor::type() const {
+  EAGER_GET_TENSOR;
+  auto type = tensor->type();
+  if (type == framework::proto::VarType::FP32) {
+    return PaddleDType::FLOAT32;
+  }
+  if (type == framework::proto::VarType::INT64) {
+    return PaddleDType::INT64;
+  }
+  LOG(ERROR) << "unknown type, only support float32 and int64 now.";
+  return PaddleDType::FLOAT32;
+}
+
 template
 void ZeroCopyTensor::copy_from_cpu(const T *data) {
   EAGER_GET_TENSOR;
@@ -119,6 +135,9 @@ void ZeroCopyTensor::copy_to_cpu(T *data) {
         static_cast(pool.Get(gpu_place));
     memory::Copy(platform::CPUPlace(), static_cast(data), gpu_place,
                  t_data, ele_num * sizeof(T), dev_ctx->stream());
+    // The copy was queued on dev_ctx->stream(), so waiting on that stream is
+    // enough; a device-wide sync would also stall unrelated streams.
+    cudaStreamSynchronize(dev_ctx->stream());
 #else
     PADDLE_THROW("Not compile with CUDA, should not reach here.");
 #endif
diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h
index f807289f6aee06..382dee79bb6f6d 100644
--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -176,6 +176,10 @@ class ZeroCopyTensor {
     device_ = device;
   }
 
+  // Element type of the underlying tensor. Only float32 and int64 are
+  // mapped; any other type is reported as FLOAT32.
+  PaddleDType type() const;
+
  protected:
   explicit ZeroCopyTensor(void* scope) : scope_{scope} {}
   void SetName(const std::string& name) { name_ = name; }
@@ -190,6 +194,9 @@ class ZeroCopyTensor {
   // performance.
   mutable void* tensor_{nullptr};
   PaddlePlace place_;
+  // Not read or written by any code in this patch; initialised so it never
+  // holds an indeterminate value.
+  PaddleDType dtype_{PaddleDType::FLOAT32};
   int device_;
 };
 