Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 34 additions & 1 deletion paddle/fluid/inference/api/analysis_predictor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -590,7 +590,6 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
gflags.push_back("--allocator_strategy=thread_local");
process_level_allocator_enabled = false;
} else {
gflags.push_back("--allocator_strategy=naive_best_fit");
process_level_allocator_enabled = true;
}

Expand Down Expand Up @@ -889,6 +888,38 @@ bool AnalysisPredictor::LoadParameters() {
return true;
}

void AnalysisPredictor::ShrinkMemory() {
  ClearIntermediateTensor();
  // Serialize against Clone(): cloned predictors share this scope.
  std::lock_guard<std::mutex> lk(clone_mutex_);

  // Drop the payload of every local variable so the pool becomes idle and
  // can actually be returned by memory::Release below.
  for (const auto &name : scope_->LocalVarNames()) {
    auto *variable = scope_->FindVar(name);
    PADDLE_ENFORCE_NOT_NULL(variable,
                            platform::errors::PreconditionNotMet(
                                "Not found variable %s in scope.", name));
    // PADDLE_ENFORCE_NOT_NULL above guarantees variable is non-null, so no
    // further null checks are needed on the branches below.
    if (variable->IsType<framework::LoDTensor>()) {
      VLOG(3) << "Clear Intermediate Tensor: " << name;
      auto *t = variable->GetMutable<framework::LoDTensor>();
      t->clear();
    } else if (variable->IsType<framework::LoDTensorArray>()) {
      VLOG(3) << "Clear Intermediate TensorArray: " << name;
      auto *tensor_array = variable->GetMutable<framework::LoDTensorArray>();
      // NOTE: tensor_array is a pointer; it must be dereferenced before
      // indexing. The previous `tr[i].clear()` indexed the pointer itself,
      // reading past the single pointed-to object for i > 0.
      for (auto &tensor : *tensor_array) {
        tensor.clear();
      }
    } else {
      VLOG(3) << "Not supported type: " << variable->Type()
              << " in ShrinkMemory";
    }
  }
  scope_->EraseVars(scope_->LocalVarNames());
  // Release-operation release all weights and tmp tensor, so we need to init
  // predictor again.
  paddle::memory::Release(place_);
  Init(nullptr);
}

void AnalysisPredictor::ClearIntermediateTensor() {
PADDLE_ENFORCE_NOT_NULL(inference_program_.get(),
platform::errors::PreconditionNotMet(
Expand Down Expand Up @@ -1141,6 +1172,8 @@ void Predictor::ClearIntermediateTensor() {
predictor_->ClearIntermediateTensor();
}

void Predictor::ShrinkMemory() { predictor_->ShrinkMemory(); }

int GetNumBytesOfDataType(DataType dtype) {
switch (dtype) {
case DataType::FLOAT32:
Expand Down
7 changes: 7 additions & 0 deletions paddle/fluid/inference/api/analysis_predictor.h
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,13 @@ class AnalysisPredictor : public PaddlePredictor {
///
void ClearIntermediateTensor();

///
/// \brief Shrink memory usage. Releases all weights and temporary tensors
/// back to the allocator, then re-initializes the predictor so that a fresh
/// allocator is constructed. After this call the predictor is still usable
/// but its cached memory pool has been returned to the system.
///
void ShrinkMemory() override;

///
/// \brief Get the argument used by predictor
///
Expand Down
7 changes: 7 additions & 0 deletions paddle/fluid/inference/api/paddle_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,13 @@ class PD_INFER_DECL PaddlePredictor {
///
virtual void ClearIntermediateTensor() {}

///
/// \brief Shrink memory usage. Implementations release all weights and
/// temporary tensors back to the allocator and re-initialize the predictor
/// so a fresh allocator is constructed, reducing resident memory. The base
/// implementation is a no-op; subclasses may override.
///
virtual void ShrinkMemory() {}

/// \brief Clone an existing predictor
/// When using clone, the same network will be created,
/// and the parameters between them are shared.
Expand Down
7 changes: 7 additions & 0 deletions paddle/fluid/inference/api/paddle_inference_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,13 @@ class PD_INFER_DECL Predictor {
/// \brief Clear the intermediate tensors of the predictor
void ClearIntermediateTensor();

///
/// \brief Shrink memory usage. Releases all weights and temporary tensors
/// back to the allocator and re-initializes the underlying predictor so a
/// fresh allocator is constructed. After this call the memory usage is
/// reduced; the predictor remains usable.
///
void ShrinkMemory();

private:
std::unique_ptr<paddle::PaddlePredictor> predictor_;
};
Expand Down
3 changes: 3 additions & 0 deletions paddle/fluid/memory/allocation/allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -178,12 +178,15 @@ class Allocator {
FreeImpl(allocation);
}

inline void Release(const platform::Place& place) { ReleaseImpl(place); }

// True if the `Allocate` is thread safe.
virtual bool IsAllocThreadSafe() const;

protected:
virtual Allocation* AllocateImpl(size_t size) = 0;
virtual void FreeImpl(Allocation* allocation);
virtual void ReleaseImpl(const platform::Place& place) {}
};

using AllocationDeleter = Allocator::AllocationDeleter;
Expand Down
5 changes: 5 additions & 0 deletions paddle/fluid/memory/allocation/allocator_facade.cc
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,11 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
return m_->GetAllocator(place, size)->Allocate(size);
}

// Release the unused pooled memory held by the allocator for `place`.
void AllocatorFacade::Release(const platform::Place& place) {
  // A non-zero request size is required so GetAllocator picks the real
  // allocator_ rather than the one used for zero-size requests.
  constexpr size_t kNonZeroSize = 1;
  m_->GetAllocator(place, kNonZeroSize)->Release(place);
}

} // namespace allocation
} // namespace memory
} // namespace paddle
3 changes: 3 additions & 0 deletions paddle/fluid/memory/allocation/allocator_facade.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ class AllocatorFacade {
// Allocate a unique allocation.
AllocationPtr Alloc(const platform::Place& place, size_t size);

// Release unused memory pool.
void Release(const platform::Place& place);

// TODO(yy): Allocate a Copy-On-Write allocation?
private:
AllocatorFacade();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ class AutoGrowthBestFitAllocator : public Allocator {

void FreeImpl(Allocation *allocation) override;

// Release the memory block which is not used in pool.
void ReleaseImpl(const platform::Place &place) override { FreeIdleChunks(); }

private:
void FreeIdleChunks();

Expand Down
51 changes: 51 additions & 0 deletions paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ void *Alloc(const Place &place, size_t size);
template <typename Place>
void Free(const Place &place, void *p, size_t size);

template <typename Place>
void Release(const Place &place);

template <typename Place>
size_t Used(const Place &place);

Expand Down Expand Up @@ -99,6 +102,11 @@ void Free<platform::CPUPlace>(const platform::CPUPlace &place, void *p,
GetCPUBuddyAllocator()->Free(p);
}

// Hand the CPU buddy allocator's idle chunks back to the system.
template <>
void Release<platform::CPUPlace>(const platform::CPUPlace &place) {
  GetCPUBuddyAllocator()->Release();
}

template <>
size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
return GetCPUBuddyAllocator()->Used();
Expand Down Expand Up @@ -186,6 +194,17 @@ void Free<platform::XPUPlace>(const platform::XPUPlace &place, void *p,
#endif
}

template <>
void Release<platform::XPUPlace>(const platform::XPUPlace &place) {
#ifdef PADDLE_WITH_XPU
  // Releasing the XPU memory pool is intentionally unsupported here;
  // fail loudly rather than silently doing nothing.
  PADDLE_THROW(
      platform::errors::PermissionDenied("Release XPU pool is not supported."));
#else
  PADDLE_THROW(
      platform::errors::PermissionDenied("'XPUPlace' is not supported."));
#endif
}

template <>
size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) {
#ifdef PADDLE_WITH_XPU
Expand Down Expand Up @@ -313,6 +332,16 @@ void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
#endif
}

// Hand the per-device GPU buddy allocator's idle chunks back to the system.
template <>
void Release<platform::CUDAPlace>(const platform::CUDAPlace &place) {
#ifdef PADDLE_WITH_CUDA
  GetGPUBuddyAllocator(place.device)->Release();
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'CUDAPlace' is not supported in CPU only device."));
#endif
}

#ifdef PADDLE_WITH_CUDA
BuddyAllocator *GetCUDAPinnedBuddyAllocator() {
static std::once_flag init_flag;
Expand Down Expand Up @@ -371,6 +400,17 @@ void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
#endif
}

// Hand the CUDA-pinned-host buddy allocator's idle chunks back to the system.
template <>
void Release<platform::CUDAPinnedPlace>(
    const platform::CUDAPinnedPlace &place) {
#ifdef PADDLE_WITH_CUDA
  GetCUDAPinnedBuddyAllocator()->Release();
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'CUDAPinnedPlace' is not supported in CPU only device."));
#endif
}

struct AllocVisitor : public boost::static_visitor<void *> {
inline explicit AllocVisitor(size_t size) : size_(size) {}

Expand All @@ -397,6 +437,13 @@ struct FreeVisitor : public boost::static_visitor<void> {
size_t size_;
};

// Visitor used with boost::apply_visitor to dispatch to the Release<Place>
// specialization matching the concrete place held in the Place variant.
struct ReleaseVisitor : public boost::static_visitor<void> {
  template <typename Place>
  inline void operator()(const Place &place) const {
    Release<Place>(place);
  }
};

size_t Usage::operator()(const platform::CPUPlace &cpu) const {
return Used(cpu);
}
Expand Down Expand Up @@ -439,6 +486,10 @@ void NaiveBestFitAllocator::FreeImpl(Allocation *allocation) {
delete allocation;
}

// Dispatch the release to the legacy allocator matching place's dynamic type.
void NaiveBestFitAllocator::ReleaseImpl(const platform::Place &place) {
  boost::apply_visitor(legacy::ReleaseVisitor(), place);
}

} // namespace allocation
} // namespace memory
} // namespace paddle
1 change: 1 addition & 0 deletions paddle/fluid/memory/allocation/naive_best_fit_allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ class NaiveBestFitAllocator : public Allocator {
protected:
Allocation *AllocateImpl(size_t size) override;
void FreeImpl(Allocation *allocation) override;
void ReleaseImpl(const platform::Place &place) override;

private:
platform::Place place_;
Expand Down
3 changes: 3 additions & 0 deletions paddle/fluid/memory/allocation/retry_allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ class RetryAllocator : public Allocator {
protected:
void FreeImpl(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size) override;
// Forward pool release to the wrapped allocator; no retry is involved.
void ReleaseImpl(const platform::Place& place) override {
  underlying_allocator_->Release(place);
}

private:
std::shared_ptr<Allocator> underlying_allocator_;
Expand Down
2 changes: 2 additions & 0 deletions paddle/fluid/memory/allocation/thread_local_allocator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ void ThreadLocalAllocatorImpl::FreeImpl(ThreadLocalAllocation* allocation) {
delete allocation;
}

void ThreadLocalAllocatorImpl::ReleaseImpl() { buddy_allocator_->Release(); }

} // namespace allocation
} // namespace memory
} // namespace paddle
4 changes: 4 additions & 0 deletions paddle/fluid/memory/allocation/thread_local_allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ class ThreadLocalAllocatorImpl
explicit ThreadLocalAllocatorImpl(const platform::Place& p);
ThreadLocalAllocation* AllocateImpl(size_t size);
void FreeImpl(ThreadLocalAllocation* allocation);
void ReleaseImpl();

private:
std::unique_ptr<memory::detail::BuddyAllocator> buddy_allocator_;
Expand Down Expand Up @@ -91,6 +92,9 @@ class ThreadLocalCUDAAllocator : public Allocator {
auto allocator_impl = tl_allocation->GetAllocator();
allocator_impl->FreeImpl(tl_allocation);
}
void ReleaseImpl(const platform::Place& p) override {
  // Fetch the allocator bound to this GPU for the current thread and
  // release its idle pool. (No value to return: ReleaseImpl is void.)
  auto allocator_impl = ThreadLocalCUDAAllocatorPool::Instance().Get(gpu_id_);
  allocator_impl->ReleaseImpl();
}

private:
int gpu_id_;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ TEST(ThreadLocalAllocator, cross_scope_release) {
auto tl_allocator_impl =
ThreadLocalCUDAAllocatorPool::Instance().Get(devices[j]);
allocator_addresses[j][i] = tl_allocator_impl.get();
memory::Release(platform::CUDAPlace(devices[j]));
}
});
}
Expand Down
30 changes: 28 additions & 2 deletions paddle/fluid/memory/detail/buddy_allocator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,10 @@ BuddyAllocator::~BuddyAllocator() {
while (!pool_.empty()) {
auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
auto desc = cache_.LoadDesc(block);
VLOG(10) << "Free from block (" << block << ", " << desc->get_size() << ")";
VLOG(10) << "Free from block (" << block << ", " << desc->get_total_size()
<< ")";

system_allocator_->Free(block, desc->get_size(), desc->get_index());
system_allocator_->Free(block, desc->get_total_size(), desc->get_index());
cache_.Invalidate(block);
pool_.erase(pool_.begin());
}
Expand Down Expand Up @@ -161,6 +162,31 @@ void BuddyAllocator::Free(void* p) {
IndexSizeAddress(desc->get_index(), desc->get_total_size(), block));
}

// Free every chunk currently sitting in the idle pool back to the system
// allocator. Only performed when no allocation is outstanding, because a
// live allocation could reside inside a chunk that would be freed.
void BuddyAllocator::Release() {
  // Acquire the allocator lock
  std::lock_guard<std::mutex> lock(mutex_);

  if (total_used_ == 0) {
    while (!pool_.empty()) {
      auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
      auto desc = cache_.LoadDesc(block);

      VLOG(10) << "Release from block (" << block << ", "
               << desc->get_total_size() << ")";
      // Bookkeeping: the chunk no longer counts toward the free pool.
      total_free_ -= desc->get_total_size();
      system_allocator_->Free(static_cast<void*>(block), desc->get_total_size(),
                              desc->get_index());
      // Invalidate the cached descriptor before dropping the pool entry.
      cache_.Invalidate(block);
      pool_.erase(pool_.begin());
    }
  } else {
    LOG(WARNING) << "The memory pool is not ready to release, please release "
                    "all variables that occupy the allocator memory."
                 << " If you are in multi-thread mode, please use "
                    "thread_local_allocator.";
  }
}

size_t BuddyAllocator::Used() { return total_used_; }
size_t BuddyAllocator::GetMinChunkSize() { return min_chunk_size_; }
size_t BuddyAllocator::GetMaxChunkSize() { return max_chunk_size_; }
Expand Down
2 changes: 2 additions & 0 deletions paddle/fluid/memory/detail/buddy_allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ class BuddyAllocator {
public:
void* Alloc(size_t unaligned_size);
void Free(void* ptr);
// Release the unused memory pool, a real free operation for the OS.
void Release();
size_t Used();
size_t GetMinChunkSize();
size_t GetMaxChunkSize();
Expand Down
17 changes: 17 additions & 0 deletions paddle/fluid/memory/detail/buddy_allocator_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,23 @@ TEST(BuddyAllocator, SpeedAna) {
std::cerr << "time cost " << diff.count() << std::endl;
}

// Exercises Release(): after a round of allocations the pool should be
// idle and returnable to the system without crashing.
TEST(BuddyAllocator, Release) {
  // In a 8 GB machine, the pool size will be about 800 MB
  FLAGS_fraction_of_gpu_memory_to_use = 0.1;
  FLAGS_initial_gpu_memory_in_mb = 0;
  FLAGS_reallocate_gpu_memory_in_mb = 0;

  BuddyAllocator buddy_allocator(
      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
      platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());

  // Less than pool size
  TestBuddyAllocator(&buddy_allocator, 10);
  TestBuddyAllocator(&buddy_allocator, 10 << 10);
  TestBuddyAllocator(&buddy_allocator, 50 << 20);

  // NOTE(review): presumably TestBuddyAllocator frees everything it
  // allocates, leaving the pool idle so Release() can drain it — confirm.
  buddy_allocator.Release();
}
#endif

} // namespace detail
Expand Down
4 changes: 4 additions & 0 deletions paddle/fluid/memory/malloc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,9 @@ AllocationPtr Alloc(const platform::Place &place, size_t size) {
return allocation::AllocatorFacade::Instance().Alloc(place, size);
}

// Return the idle pooled memory for `place` back to the operating system.
void Release(const platform::Place &place) {
  allocation::AllocatorFacade::Instance().Release(place);
}

} // namespace memory
} // namespace paddle
2 changes: 2 additions & 0 deletions paddle/fluid/memory/malloc.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,5 +38,7 @@ extern AllocationPtr Alloc(const platform::Place& place, size_t size);

extern AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size);

extern void Release(const platform::Place& place);

} // namespace memory
} // namespace paddle