diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 5dae7368a8e7df..01162bd7c1066e 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -590,7 +590,6 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< gflags.push_back("--allocator_strategy=thread_local"); process_level_allocator_enabled = false; } else { - gflags.push_back("--allocator_strategy=naive_best_fit"); process_level_allocator_enabled = true; } @@ -889,6 +888,38 @@ bool AnalysisPredictor::LoadParameters() { return true; } +void AnalysisPredictor::ShrinkMemory() { + ClearIntermediateTensor(); + std::lock_guard<std::mutex> lk(clone_mutex_); + + for (auto name : scope_->LocalVarNames()) { + auto *variable = scope_->FindVar(name); + PADDLE_ENFORCE_NOT_NULL(variable, + platform::errors::PreconditionNotMet( + "Not found variable %s in scope.", name)); + if (variable != nullptr && variable->IsType<framework::LoDTensor>()) { + VLOG(3) << "Clear Intermediate Tensor: " << name; + auto *t = variable->GetMutable<framework::LoDTensor>(); + t->clear(); + } else if (variable != nullptr && + variable->IsType<framework::LoDTensorArray>()) { + VLOG(3) << "Clear Intermediate TensorArray: " << name; + auto *tr = variable->GetMutable<framework::LoDTensorArray>(); + for (size_t i = 0; i < tr->size(); ++i) { + (*tr)[i].clear(); + } + } else { + VLOG(3) << "Not supported type: " << variable->Type() + << " in ShrinkMemory"; + } + } + scope_->EraseVars(scope_->LocalVarNames()); + // Release-operation releases all weights and tmp tensors, so we need to init + // predictor again. 
+ paddle::memory::Release(place_); + Init(nullptr); +} + void AnalysisPredictor::ClearIntermediateTensor() { PADDLE_ENFORCE_NOT_NULL(inference_program_.get(), platform::errors::PreconditionNotMet( @@ -1141,6 +1172,8 @@ void Predictor::ClearIntermediateTensor() { predictor_->ClearIntermediateTensor(); } +void Predictor::ShrinkMemory() { predictor_->ShrinkMemory(); } + int GetNumBytesOfDataType(DataType dtype) { switch (dtype) { case DataType::FLOAT32: diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 269f2fd80bb47d..dee5da835c18b6 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -193,6 +193,13 @@ class AnalysisPredictor : public PaddlePredictor { /// void ClearIntermediateTensor(); + /// + /// \brief Shrink memory. Release all weights and tmp tensor to destructor + /// Allocator. And reinit predictor to reconstruct an allocator. After + /// this operation, we reduced the memory usage. + /// + void ShrinkMemory() override; + /// /// \brief Get the argument used by predictor /// diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 064f63542683a0..5983594ebee5ec 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -319,6 +319,13 @@ class PD_INFER_DECL PaddlePredictor { /// virtual void ClearIntermediateTensor() {} + /// + /// \brief Shrink memory. Release all weights and tmp tensor to destructor + /// Allocator. And reinit predictor to reconstruct an Allocator. After + /// this operation, we reduced the memory usage. + /// + virtual void ShrinkMemory() {} + /// \brief Clone an existing predictor /// When using clone, the same network will be created, /// and the parameters between them are shared. 
diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 5dc4430fde4715..c2ee5a8a43e85d 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -224,6 +224,13 @@ class PD_INFER_DECL Predictor { /// \brief Clear the intermediate tensors of the predictor void ClearIntermediateTensor(); + /// + /// \brief Shrink memory. Release all weights and tmp tensor to destructor + /// Allocator. And reinit predictor to reconstruct an Allocator. After + /// this operation, we reduced the memory usage. + /// + void ShrinkMemory(); + private: std::unique_ptr predictor_; }; diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index e54748a53679d1..b83d3efb72b719 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -178,12 +178,15 @@ class Allocator { FreeImpl(allocation); } + inline void Release(const platform::Place& place) { ReleaseImpl(place); } + // True if the `Allocate` is thread safe. 
virtual bool IsAllocThreadSafe() const; protected: virtual Allocation* AllocateImpl(size_t size) = 0; virtual void FreeImpl(Allocation* allocation); + virtual void ReleaseImpl(const platform::Place& place) {} }; using AllocationDeleter = Allocator::AllocationDeleter; diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 3213684c140b02..59b06d082872c1 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -287,6 +287,11 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, return m_->GetAllocator(place, size)->Allocate(size); } +void AllocatorFacade::Release(const platform::Place& place) { + m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) + ->Release(place); +} + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 64b6fe25c352e8..2f2f222f6c74a5 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -44,6 +44,9 @@ class AllocatorFacade { // Allocate a unique allocation. AllocationPtr Alloc(const platform::Place& place, size_t size); + // Release unused memory pool. + void Release(const platform::Place& place); + // TODO(yy): Allocate a Copy-On-Write allocation? private: AllocatorFacade(); diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h index cbc126264ac2c0..b55ebf18934f2b 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h @@ -39,6 +39,9 @@ class AutoGrowthBestFitAllocator : public Allocator { void FreeImpl(Allocation *allocation) override; + // Release the memory block which is not used in pool. 
+ void ReleaseImpl(const platform::Place &place) override { FreeIdleChunks(); } + private: void FreeIdleChunks(); diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index c661c9f9c37509..842ebd16cf8afe 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -53,6 +53,9 @@ void *Alloc(const Place &place, size_t size); template <typename Place> void Free(const Place &place, void *p, size_t size); +template <typename Place> +void Release(const Place &place); + template <typename Place> size_t Used(const Place &place); @@ -99,6 +102,11 @@ void Free<platform::CPUPlace>(const platform::CPUPlace &place, void *p, GetCPUBuddyAllocator()->Free(p); } +template <> +void Release<platform::CPUPlace>(const platform::CPUPlace &place) { + GetCPUBuddyAllocator()->Release(); +} + template <> size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) { return GetCPUBuddyAllocator()->Used(); @@ -186,6 +194,17 @@ void Free<platform::XPUPlace>(const platform::XPUPlace &place, void *p, #endif } +template <> +void Release<platform::XPUPlace>(const platform::XPUPlace &place) { +#ifdef PADDLE_WITH_XPU + PADDLE_THROW( platform::errors::PermissionDenied("Release XPU pool is not supported.")); +#else + PADDLE_THROW( platform::errors::PermissionDenied("'XPUPlace' is not supported.")); +#endif +} + template <> size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) { #ifdef PADDLE_WITH_XPU @@ -313,6 +332,16 @@ void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p, #endif } +template <> +void Release<platform::CUDAPlace>(const platform::CUDAPlace &place) { +#ifdef PADDLE_WITH_CUDA + GetGPUBuddyAllocator(place.device)->Release(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( "'CUDAPlace' is not supported in CPU only device.")); +#endif +} + #ifdef PADDLE_WITH_CUDA BuddyAllocator *GetCUDAPinnedBuddyAllocator() { static std::once_flag init_flag; @@ -371,6 +400,17 @@ void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place, #endif } +template <> +void Release<platform::CUDAPinnedPlace>( + const platform::CUDAPinnedPlace &place) { 
+#ifdef PADDLE_WITH_CUDA + GetCUDAPinnedBuddyAllocator()->Release(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( "'CUDAPinnedPlace' is not supported in CPU only device.")); +#endif +} + struct AllocVisitor : public boost::static_visitor<void *> { inline explicit AllocVisitor(size_t size) : size_(size) {} @@ -397,6 +437,13 @@ struct FreeVisitor : public boost::static_visitor<void> { size_t size_; }; +struct ReleaseVisitor : public boost::static_visitor<void> { + template <typename Place> + inline void operator()(const Place &place) const { + Release<Place>(place); + } +}; + size_t Usage::operator()(const platform::CPUPlace &cpu) const { return Used(cpu); } @@ -439,6 +486,10 @@ void NaiveBestFitAllocator::FreeImpl(Allocation *allocation) { delete allocation; } +void NaiveBestFitAllocator::ReleaseImpl(const platform::Place &place) { + boost::apply_visitor(legacy::ReleaseVisitor(), place); +} + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h index 4cf1bd6123e5fb..ba4c4ca226b1e0 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h @@ -35,6 +35,7 @@ class NaiveBestFitAllocator : public Allocator { protected: Allocation *AllocateImpl(size_t size) override; void FreeImpl(Allocation *allocation) override; + void ReleaseImpl(const platform::Place &place) override; private: platform::Place place_; diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h index 4a787ff2d7b384..74828a0ede3f43 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.h +++ b/paddle/fluid/memory/allocation/retry_allocator.h @@ -47,6 +47,9 @@ class RetryAllocator : public Allocator { protected: void FreeImpl(Allocation* allocation) override; Allocation* AllocateImpl(size_t size) override; + void ReleaseImpl(const platform::Place& place) 
override { + underlying_allocator_->Release(place); + } private: std::shared_ptr underlying_allocator_; diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.cc b/paddle/fluid/memory/allocation/thread_local_allocator.cc index 50fe9c9b752494..d2a8250d3db58c 100644 --- a/paddle/fluid/memory/allocation/thread_local_allocator.cc +++ b/paddle/fluid/memory/allocation/thread_local_allocator.cc @@ -72,6 +72,8 @@ void ThreadLocalAllocatorImpl::FreeImpl(ThreadLocalAllocation* allocation) { delete allocation; } +void ThreadLocalAllocatorImpl::ReleaseImpl() { buddy_allocator_->Release(); } + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.h b/paddle/fluid/memory/allocation/thread_local_allocator.h index 10ca4b828a4bb5..764509e75ba23a 100644 --- a/paddle/fluid/memory/allocation/thread_local_allocator.h +++ b/paddle/fluid/memory/allocation/thread_local_allocator.h @@ -52,6 +52,7 @@ class ThreadLocalAllocatorImpl explicit ThreadLocalAllocatorImpl(const platform::Place& p); ThreadLocalAllocation* AllocateImpl(size_t size); void FreeImpl(ThreadLocalAllocation* allocation); + void ReleaseImpl(); private: std::unique_ptr buddy_allocator_; @@ -91,6 +92,9 @@ class ThreadLocalCUDAAllocator : public Allocator { auto allocator_impl = tl_allocation->GetAllocator(); allocator_impl->FreeImpl(tl_allocation); } + void ReleaseImpl(const platform::Place& p) override { + return ThreadLocalCUDAAllocatorPool::Instance().Get(gpu_id_)->ReleaseImpl(); + } private: int gpu_id_; diff --git a/paddle/fluid/memory/allocation/thread_local_allocator_test.cc b/paddle/fluid/memory/allocation/thread_local_allocator_test.cc index f9e2ea8c27a74c..70fd3a48d7861e 100644 --- a/paddle/fluid/memory/allocation/thread_local_allocator_test.cc +++ b/paddle/fluid/memory/allocation/thread_local_allocator_test.cc @@ -62,6 +62,7 @@ TEST(ThreadLocalAllocator, cross_scope_release) { auto tl_allocator_impl = 
ThreadLocalCUDAAllocatorPool::Instance().Get(devices[j]); allocator_addresses[j][i] = tl_allocator_impl.get(); + memory::Release(platform::CUDAPlace(devices[j])); } }); } diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index 6ac99744d79380..3c391d76f4c9cf 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -39,9 +39,10 @@ BuddyAllocator::~BuddyAllocator() { while (!pool_.empty()) { auto block = static_cast<MemoryBlock *>(std::get<2>(*pool_.begin())); auto desc = cache_.LoadDesc(block); - VLOG(10) << "Free from block (" << block << ", " << desc->get_size() << ")"; + VLOG(10) << "Free from block (" << block << ", " << desc->get_total_size() + << ")"; - system_allocator_->Free(block, desc->get_size(), desc->get_index()); + system_allocator_->Free(block, desc->get_total_size(), desc->get_index()); cache_.Invalidate(block); pool_.erase(pool_.begin()); } @@ -161,6 +162,31 @@ void BuddyAllocator::Free(void* p) { IndexSizeAddress(desc->get_index(), desc->get_total_size(), block)); } +void BuddyAllocator::Release() { + // Acquire the allocator lock + std::lock_guard<std::mutex> lock(mutex_); + + if (total_used_ == 0) { + while (!pool_.empty()) { + auto block = static_cast<MemoryBlock *>(std::get<2>(*pool_.begin())); + auto desc = cache_.LoadDesc(block); + + VLOG(10) << "Release from block (" << block << ", " + << desc->get_total_size() << ")"; + total_free_ -= desc->get_total_size(); + system_allocator_->Free(static_cast<void *>(block), desc->get_total_size(), + desc->get_index()); + cache_.Invalidate(block); + pool_.erase(pool_.begin()); + } + } else { + LOG(WARNING) << "The memory pool is not ready to release, please release " "all variables that occupy the allocator memory." 
+ << " If you are in multi-thread mode, please use " "thread_local_allocator."; + } +} + size_t BuddyAllocator::Used() { return total_used_; } size_t BuddyAllocator::GetMinChunkSize() { return min_chunk_size_; } size_t BuddyAllocator::GetMaxChunkSize() { return max_chunk_size_; } diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h index 791f8b56277723..f3f84cc7dcf14d 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.h +++ b/paddle/fluid/memory/detail/buddy_allocator.h @@ -40,6 +40,8 @@ class BuddyAllocator { public: void* Alloc(size_t unaligned_size); void Free(void* ptr); + // Release the unused memory pool, a real free operation for the OS. + void Release(); size_t Used(); size_t GetMinChunkSize(); size_t GetMaxChunkSize(); diff --git a/paddle/fluid/memory/detail/buddy_allocator_test.cc b/paddle/fluid/memory/detail/buddy_allocator_test.cc index 1722acd10aa38e..90f7e33eb3540f 100644 --- a/paddle/fluid/memory/detail/buddy_allocator_test.cc +++ b/paddle/fluid/memory/detail/buddy_allocator_test.cc @@ -305,6 +305,23 @@ TEST(BuddyAllocator, SpeedAna) { std::cerr << "time cost " << diff.count() << std::endl; } +TEST(BuddyAllocator, Release) { + // In an 8 GB machine, the pool size will be about 800 MB + FLAGS_fraction_of_gpu_memory_to_use = 0.1; + FLAGS_initial_gpu_memory_in_mb = 0; + FLAGS_reallocate_gpu_memory_in_mb = 0; + + BuddyAllocator buddy_allocator( + std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)), + platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); + + // Less than pool size + TestBuddyAllocator(&buddy_allocator, 10); + TestBuddyAllocator(&buddy_allocator, 10 << 10); + TestBuddyAllocator(&buddy_allocator, 50 << 20); + + buddy_allocator.Release(); +} #endif } // namespace detail diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index e01f030585a833..2fbde03b42bcc0 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -31,5 +31,9 @@ 
AllocationPtr Alloc(const platform::Place &place, size_t size) { return allocation::AllocatorFacade::Instance().Alloc(place, size); } +void Release(const platform::Place &place) { + return allocation::AllocatorFacade::Instance().Release(place); +} + } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 73487795f752ea..3d6836e1d255b4 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -38,5 +38,7 @@ extern AllocationPtr Alloc(const platform::Place& place, size_t size); extern AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size); +extern void Release(const platform::Place& place); + } // namespace memory } // namespace paddle