@@ -30,16 +30,24 @@ void TensorRTEngine::Build(const DescType& paddle_model) {
 }
 
 void TensorRTEngine::Execute(int batch_size) {
-  infer_context_->enqueue(batch_size, buffers_.data(), *stream_, nullptr);
+  std::vector<void*> buffers;
+  for (auto& buf : buffers_) {
+    PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated");
+    PADDLE_ENFORCE_GT(buf.max_size, 0);
+    PADDLE_ENFORCE(buf.device == DeviceType::GPU);
+    buffers.push_back(buf.buffer);
+  }
+  infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr);
   cudaStreamSynchronize(*stream_);
 }
 
 TensorRTEngine::~TensorRTEngine() {
   // clean buffer
-  for (auto& buffer : buffers_) {
-    if (buffer != nullptr) {
-      PADDLE_ENFORCE_EQ(0, cudaFree(buffer));
-      buffer = nullptr;
+  for (auto& buf : buffers_) {
+    if (buf.buffer != nullptr) {
+      PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer));
+      buf.buffer = nullptr;
+      buf.max_size = 0;
     }
   }
 }
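The `buf.buffer` / `buf.max_size` / `buf.device` members used above belong to a `Buffer` struct that replaces the raw `void*` slots in `buffers_`. Its real declaration lives in the engine header; inferred from the field usage in this diff, it is roughly (a sketch, not the verbatim definition):

```cpp
// Shape inferred from usage in this diff; member order, defaults, and the
// exact DeviceType enumerators are assumptions.
enum class DeviceType { UNK, CPU, GPU };

struct Buffer {
  void* buffer{nullptr};  // payload pointer; device memory when device == GPU
  size_t size{0};         // bytes currently in use
  size_t max_size{0};     // bytes allocated by cudaMalloc
  DeviceType device{DeviceType::UNK};
};
```

Carrying `max_size` and `device` alongside the pointer is what lets `Execute` and the copy helpers below validate each binding before touching it.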
@@ -59,15 +67,19 @@ void TensorRTEngine::FreezeNetwork() {
   infer_context_.reset(infer_engine_->createExecutionContext());
 
   // allocate GPU buffers.
-  buffers_.resize(buffer_sizes_.size(), nullptr);
+  buffers_.resize(buffer_sizes_.size());
   for (auto& item : buffer_sizes_) {
     if (item.second == 0) {
       auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str());
       item.second = kDataTypeSize[static_cast<int>(
                         infer_engine_->getBindingDataType(slot_offset))] *
                     AccumDims(infer_engine_->getBindingDimensions(slot_offset));
     }
-    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buffer(item.first), item.second));
+    auto& buf = buffer(item.first);
+    CHECK(buf.buffer == nullptr);  // buffer should be allocated only once.
+    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second));
+    buf.size = buf.max_size = item.second;
+    buf.device = DeviceType::GPU;
   }
 }
 
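Each binding's byte size here is element width times element count: `kDataTypeSize` maps an `nvinfer1::DataType` to its width in bytes, and `AccumDims` folds the binding's dimensions into an element count. Neither helper appears in this hunk; `AccumDims` is presumably along these lines (a sketch under that assumption):

```cpp
// Assumed helper: product of all extents in a TensorRT Dims.
inline int AccumDims(nvinfer1::Dims dims) {
  int num = 1;
  for (int i = 0; i < dims.nbDims; ++i) {
    num *= dims.d[i];
  }
  return num;
}
```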
@@ -113,7 +125,7 @@ void TensorRTEngine::DeclareOutput(const std::string& name) {
 }
 
 void* TensorRTEngine::GetOutputInGPU(const std::string& name) {
-  return buffer(name);
+  return buffer(name).buffer;
 }
 
 void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst,
@@ -123,11 +135,13 @@ void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst,
   PADDLE_ENFORCE(it != buffer_sizes_.end());
   PADDLE_ENFORCE_GT(it->second, 0);
   PADDLE_ENFORCE_GE(max_size, it->second);
-  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buffer(name), it->second,
+  auto& buf = buffer(name);
+  PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
+  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, it->second,
                                        cudaMemcpyDeviceToHost, *stream_));
 }
 
-void*& TensorRTEngine::buffer(const std::string& name) {
+Buffer& TensorRTEngine::buffer(const std::string& name) {
   PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
   auto it = buffer_sizes_.find(name);
   PADDLE_ENFORCE(it != buffer_sizes_.end());
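`GetOutputInGPU` hands back the raw device pointer, while `GetOutputInCPU` performs the bounds-checked device-to-host copy. A caller that only needs the result on the host can use the latter directly; a minimal sketch, assuming the engine has already executed and `out_elems` matches the output binding (names here are illustrative):

```cpp
// Hypothetical caller-side readback; the size argument is in bytes.
std::vector<float> prob(out_elems);
engine.GetOutputInCPU("prob", prob.data(), prob.size() * sizeof(float));
```

Note that the copy is issued with `cudaMemcpyAsync` on the engine's stream and `GetOutputInCPU` does not synchronize, so the caller must synchronize that stream before reading `prob`.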
@@ -137,10 +151,12 @@ void*& TensorRTEngine::buffer(const std::string& name) {
 
 void TensorRTEngine::SetInputFromCPU(const std::string& name, void* data,
                                      size_t size) {
-  void* buf = buffer(name);
-  cudaMemcpyAsync(buf, data, size, cudaMemcpyHostToDevice, *stream_);
-  PADDLE_ENFORCE_EQ(
-      0, cudaMemcpyAsync(buf, data, size, cudaMemcpyHostToDevice, *stream_));
+  auto& buf = buffer(name);
+  PADDLE_ENFORCE_NOT_NULL(buf.buffer);
+  PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
+  PADDLE_ENFORCE(buf.device == DeviceType::GPU);
+  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
+                                       cudaMemcpyHostToDevice, *stream_));
 }
 
 void TensorRTEngine::SetITensor(const std::string& name,
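Putting the pieces together, a round trip through the engine after `FreezeNetwork()` looks roughly like this (engine construction, network declaration, element counts, and the stream handle are elided or illustrative):

```cpp
// Sketch of the intended call sequence; binding names are hypothetical.
std::vector<float> input(in_elems, 1.f);
engine.SetInputFromCPU("input", input.data(),
                       input.size() * sizeof(float));  // checked H2D copy
engine.Execute(batch_size);  // validates every Buffer, enqueues, then syncs

std::vector<float> output(out_elems);
engine.GetOutputInCPU("prob", output.data(),
                      output.size() * sizeof(float));  // checked D2H copy
cudaStreamSynchronize(stream);  // the D2H copy is async; sync before reading
```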