8 changes: 7 additions & 1 deletion paddle/fluid/inference/api/analysis_predictor.cc
@@ -591,7 +591,6 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
gflags.push_back("--allocator_strategy=thread_local");
process_level_allocator_enabled = false;
} else {
gflags.push_back("--allocator_strategy=naive_best_fit");
process_level_allocator_enabled = true;
}

@@ -890,6 +889,11 @@ bool AnalysisPredictor::LoadParameters() {
return true;
}

void AnalysisPredictor::ShrinkMemory() {
ClearIntermediateTensor();
paddle::memory::Release(place_);
}

void AnalysisPredictor::ClearIntermediateTensor() {
PADDLE_ENFORCE_NOT_NULL(inference_program_.get(),
platform::errors::PreconditionNotMet(
@@ -1142,6 +1146,8 @@ void Predictor::ClearIntermediateTensor() {
predictor_->ClearIntermediateTensor();
}

void Predictor::ShrinkMemory() { predictor_->ShrinkMemory(); }

int GetNumBytesOfDataType(DataType dtype) {
switch (dtype) {
case DataType::FLOAT32:
6 changes: 6 additions & 0 deletions paddle/fluid/inference/api/analysis_predictor.h
@@ -193,6 +193,12 @@ class AnalysisPredictor : public PaddlePredictor {
///
void ClearIntermediateTensor();

///
/// \brief Release all temporary tensors to shrink the size of the memory pool.
/// After this operation, the memory usage of the process is reduced.
///
void ShrinkMemory() override;

///
/// \brief Get the argument used by predictor
///
43 changes: 43 additions & 0 deletions paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -135,6 +135,7 @@ TEST(AnalysisPredictor, ZeroCopy) {
auto* out_data = out->data<float>(&place, &size);
LOG(INFO) << "output size: " << size / sizeof(float);
LOG(INFO) << "output_data: " << out_data;
predictor->ShrinkMemory();
}

TEST(AnalysisPredictor, Clone) {
@@ -507,3 +508,45 @@ TEST(AnalysisPredictor, bf16_pass_strategy) {
}

} // namespace paddle

namespace paddle_infer {

TEST(Predictor, Run) {
Config config;
config.SetModel(FLAGS_dirname);

auto predictor = CreatePredictor(config);

auto w0 = predictor->GetInputHandle("firstw");
auto w1 = predictor->GetInputHandle("secondw");
auto w2 = predictor->GetInputHandle("thirdw");
auto w3 = predictor->GetInputHandle("forthw");

w0->Reshape({4, 1});
w1->Reshape({4, 1});
w2->Reshape({4, 1});
w3->Reshape({4, 1});

auto* w0_data = w0->mutable_data<int64_t>(PlaceType::kCPU);
auto* w1_data = w1->mutable_data<int64_t>(PlaceType::kCPU);
auto* w2_data = w2->mutable_data<int64_t>(PlaceType::kCPU);
auto* w3_data = w3->mutable_data<int64_t>(PlaceType::kCPU);

for (int i = 0; i < 4; i++) {
w0_data[i] = i;
w1_data[i] = i;
w2_data[i] = i;
w3_data[i] = i;
}

predictor->Run();

auto out = predictor->GetOutputHandle("fc_1.tmp_2");
PlaceType place;
int size = 0;
out->data<float>(&place, &size);
LOG(INFO) << "output size: " << size / sizeof(float);
predictor->ShrinkMemory();
}

} // namespace paddle_infer
1 change: 1 addition & 0 deletions paddle/fluid/inference/api/api_tester.cc
@@ -60,6 +60,7 @@ TEST(paddle_inference_api, demo) {
auto predictor = CreatePaddlePredictor(config);
std::vector<PaddleTensor> outputs;
predictor->Run({}, &outputs);
predictor->ShrinkMemory();
}

TEST(paddle_inference_api, get_version) {
6 changes: 6 additions & 0 deletions paddle/fluid/inference/api/paddle_api.h
@@ -319,6 +319,12 @@ class PD_INFER_DECL PaddlePredictor {
///
virtual void ClearIntermediateTensor() {}

///
/// \brief Release all temporary tensors to shrink the size of the memory pool.
/// After this operation, the memory usage of the process is reduced.
///
virtual void ShrinkMemory() {}

/// \brief Clone an existing predictor
/// When using clone, the same network will be created,
/// and the parameters between them are shared.
6 changes: 6 additions & 0 deletions paddle/fluid/inference/api/paddle_inference_api.h
@@ -224,6 +224,12 @@ class PD_INFER_DECL Predictor {
/// \brief Clear the intermediate tensors of the predictor
void ClearIntermediateTensor();

///
/// \brief Release all temporary tensors to shrink the size of the memory pool.
/// After this operation, the memory usage of the process is reduced.
///
void ShrinkMemory();

private:
std::unique_ptr<paddle::PaddlePredictor> predictor_;
};
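For context, a minimal sketch of how the new C++ API is intended to be driven once this change lands; the include path, model directory, and the assumption that outputs are copied out before shrinking are placeholders mirroring the test above, not a prescribed setup:

#include "paddle_inference_api.h"

int main() {
  paddle_infer::Config config;
  config.SetModel("./word2vec_model");  // placeholder model directory

  auto predictor = paddle_infer::CreatePredictor(config);

  // ... fill inputs via GetInputHandle() and run inference ...
  predictor->Run();

  // Once the outputs have been copied out, drop intermediate tensors and
  // return the freed memory-pool chunks to the system.
  predictor->ShrinkMemory();
  return 0;
}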
2 changes: 2 additions & 0 deletions paddle/fluid/pybind/inference_api.cc
@@ -566,6 +566,7 @@ void BindAnalysisPredictor(py::module *m) {
.def("zero_copy_run", &AnalysisPredictor::ZeroCopyRun)
.def("clear_intermediate_tensor",
&AnalysisPredictor::ClearIntermediateTensor)
.def("shrink_memory", &AnalysisPredictor::ShrinkMemory)
.def("create_feed_fetch_var", &AnalysisPredictor::CreateFeedFetchVar)
.def("prepare_feed_fetch", &AnalysisPredictor::PrepareFeedFetch)
.def("prepare_argument", &AnalysisPredictor::PrepareArgument)
@@ -593,6 +594,7 @@ void BindPaddleInferPredictor(py::module *m) {
.def("get_output_handle", &paddle_infer::Predictor::GetOutputHandle)
.def("run", &paddle_infer::Predictor::Run)
.def("clone", &paddle_infer::Predictor::Clone)
.def("shrink_memory", &paddle_infer::Predictor::ShrinkMemory)
.def("clear_intermediate_tensor",
&paddle_infer::Predictor::ClearIntermediateTensor);
}