Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 34 additions & 1 deletion paddle/fluid/inference/api/analysis_predictor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -590,7 +590,6 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
gflags.push_back("--allocator_strategy=thread_local");
process_level_allocator_enabled = false;
} else {
gflags.push_back("--allocator_strategy=naive_best_fit");
process_level_allocator_enabled = true;
}

Expand Down Expand Up @@ -889,6 +888,38 @@ bool AnalysisPredictor::LoadParameters() {
return true;
}

void AnalysisPredictor::ShrinkMemory() {
  ClearIntermediateTensor();
  // Serialize against Clone(): cloned predictors share this scope.
  std::lock_guard<std::mutex> lk(clone_mutex_);

  // Drop the payload of every local variable so the pool becomes idle and
  // can actually be returned by memory::Release below.
  for (const auto &name : scope_->LocalVarNames()) {
    auto *variable = scope_->FindVar(name);
    PADDLE_ENFORCE_NOT_NULL(variable,
                            platform::errors::PreconditionNotMet(
                                "Not found variable %s in scope.", name));
    // PADDLE_ENFORCE_NOT_NULL above guarantees variable is non-null, so no
    // further null checks are needed on the branches below.
    if (variable->IsType<framework::LoDTensor>()) {
      VLOG(3) << "Clear Intermediate Tensor: " << name;
      auto *t = variable->GetMutable<framework::LoDTensor>();
      t->clear();
    } else if (variable->IsType<framework::LoDTensorArray>()) {
      VLOG(3) << "Clear Intermediate TensorArray: " << name;
      auto *tensor_array = variable->GetMutable<framework::LoDTensorArray>();
      // NOTE: tensor_array is a pointer; it must be dereferenced before
      // indexing. The previous `tr[i].clear()` indexed the pointer itself,
      // reading past the single pointed-to object for i > 0.
      for (auto &tensor : *tensor_array) {
        tensor.clear();
      }
    } else {
      VLOG(3) << "Not supported type: " << variable->Type()
              << " in ShrinkMemory";
    }
  }
  scope_->EraseVars(scope_->LocalVarNames());
  // Release-operation release all weights and tmp tensor, so we need to init
  // predictor again.
  paddle::memory::Release(place_);
  Init(nullptr);
}

void AnalysisPredictor::ClearIntermediateTensor() {
PADDLE_ENFORCE_NOT_NULL(inference_program_.get(),
platform::errors::PreconditionNotMet(
Expand Down Expand Up @@ -1141,6 +1172,8 @@ void Predictor::ClearIntermediateTensor() {
predictor_->ClearIntermediateTensor();
}

void Predictor::ShrinkMemory() { predictor_->ShrinkMemory(); }

int GetNumBytesOfDataType(DataType dtype) {
switch (dtype) {
case DataType::FLOAT32:
Expand Down
7 changes: 7 additions & 0 deletions paddle/fluid/inference/api/analysis_predictor.h
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,13 @@ class AnalysisPredictor : public PaddlePredictor {
///
void ClearIntermediateTensor();

///
/// \brief Shrink memory usage. Releases all weights and temporary tensors
/// back to the allocator, then re-initializes the predictor so that a fresh
/// allocator is constructed. After this call the predictor is still usable
/// but its cached memory pool has been returned to the system.
///
void ShrinkMemory() override;

///
/// \brief Get the argument used by predictor
///
Expand Down
7 changes: 7 additions & 0 deletions paddle/fluid/inference/api/paddle_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,13 @@ class PD_INFER_DECL PaddlePredictor {
///
virtual void ClearIntermediateTensor() {}

///
/// \brief Shrink memory usage. Implementations release all weights and
/// temporary tensors back to the allocator and re-initialize the predictor
/// so a fresh allocator is constructed, reducing resident memory. The base
/// implementation is a no-op; subclasses may override.
///
virtual void ShrinkMemory() {}

/// \brief Clone an existing predictor
/// When using clone, the same network will be created,
/// and the parameters between them are shared.
Expand Down
7 changes: 7 additions & 0 deletions paddle/fluid/inference/api/paddle_inference_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,13 @@ class PD_INFER_DECL Predictor {
/// \brief Clear the intermediate tensors of the predictor
void ClearIntermediateTensor();

///
/// \brief Shrink memory usage. Releases all weights and temporary tensors
/// back to the allocator and re-initializes the underlying predictor so a
/// fresh allocator is constructed. After this call the memory usage is
/// reduced; the predictor remains usable.
///
void ShrinkMemory();

private:
std::unique_ptr<paddle::PaddlePredictor> predictor_;
};
Expand Down
3 changes: 3 additions & 0 deletions paddle/fluid/memory/allocation/allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -178,12 +178,15 @@ class Allocator {
FreeImpl(allocation);
}

inline void Release(const platform::Place& place) { ReleaseImpl(place); }

// True if the `Allocate` is thread safe.
virtual bool IsAllocThreadSafe() const;

protected:
virtual Allocation* AllocateImpl(size_t size) = 0;
virtual void FreeImpl(Allocation* allocation);
virtual void ReleaseImpl(const platform::Place& place) {}
};

using AllocationDeleter = Allocator::AllocationDeleter;
Expand Down
5 changes: 5 additions & 0 deletions paddle/fluid/memory/allocation/allocator_facade.cc
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,11 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
return m_->GetAllocator(place, size)->Allocate(size);
}

// Release the unused pooled memory held by the allocator for `place`.
void AllocatorFacade::Release(const platform::Place& place) {
  // A non-zero request size is required so GetAllocator picks the real
  // allocator_ rather than the one used for zero-size requests.
  constexpr size_t kNonZeroSize = 1;
  m_->GetAllocator(place, kNonZeroSize)->Release(place);
}

} // namespace allocation
} // namespace memory
} // namespace paddle
3 changes: 3 additions & 0 deletions paddle/fluid/memory/allocation/allocator_facade.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ class AllocatorFacade {
// Allocate a unique allocation.
AllocationPtr Alloc(const platform::Place& place, size_t size);

// Release unused memory pool.
void Release(const platform::Place& place);

// TODO(yy): Allocate a Copy-On-Write allocation?
private:
AllocatorFacade();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ class AutoGrowthBestFitAllocator : public Allocator {

void FreeImpl(Allocation *allocation) override;

// Release the memory block which is not used in pool.
void ReleaseImpl(const platform::Place &place) override { FreeIdleChunks(); }

private:
void FreeIdleChunks();

Expand Down
51 changes: 51 additions & 0 deletions paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ void *Alloc(const Place &place, size_t size);
template <typename Place>
void Free(const Place &place, void *p, size_t size);

template <typename Place>
void Release(const Place &place);

template <typename Place>
size_t Used(const Place &place);

Expand Down Expand Up @@ -99,6 +102,11 @@ void Free<platform::CPUPlace>(const platform::CPUPlace &place, void *p,
GetCPUBuddyAllocator()->Free(p);
}

// Hand the CPU buddy allocator's idle chunks back to the system.
template <>
void Release<platform::CPUPlace>(const platform::CPUPlace &place) {
  GetCPUBuddyAllocator()->Release();
}

template <>
size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
return GetCPUBuddyAllocator()->Used();
Expand Down Expand Up @@ -186,6 +194,17 @@ void Free<platform::XPUPlace>(const platform::XPUPlace &place, void *p,
#endif
}

template <>
void Release<platform::XPUPlace>(const platform::XPUPlace &place) {
#ifdef PADDLE_WITH_XPU
  // Releasing the XPU memory pool is intentionally unsupported here;
  // fail loudly rather than silently doing nothing.
  PADDLE_THROW(
      platform::errors::PermissionDenied("Release XPU pool is not supported."));
#else
  PADDLE_THROW(
      platform::errors::PermissionDenied("'XPUPlace' is not supported."));
#endif
}

template <>
size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) {
#ifdef PADDLE_WITH_XPU
Expand Down Expand Up @@ -313,6 +332,16 @@ void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
#endif
}

// Hand the per-device GPU buddy allocator's idle chunks back to the system.
template <>
void Release<platform::CUDAPlace>(const platform::CUDAPlace &place) {
#ifdef PADDLE_WITH_CUDA
  GetGPUBuddyAllocator(place.device)->Release();
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'CUDAPlace' is not supported in CPU only device."));
#endif
}

#ifdef PADDLE_WITH_CUDA
BuddyAllocator *GetCUDAPinnedBuddyAllocator() {
static std::once_flag init_flag;
Expand Down Expand Up @@ -371,6 +400,17 @@ void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
#endif
}

// Hand the CUDA-pinned-host buddy allocator's idle chunks back to the system.
template <>
void Release<platform::CUDAPinnedPlace>(
    const platform::CUDAPinnedPlace &place) {
#ifdef PADDLE_WITH_CUDA
  GetCUDAPinnedBuddyAllocator()->Release();
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'CUDAPinnedPlace' is not supported in CPU only device."));
#endif
}

struct AllocVisitor : public boost::static_visitor<void *> {
inline explicit AllocVisitor(size_t size) : size_(size) {}

Expand All @@ -397,6 +437,13 @@ struct FreeVisitor : public boost::static_visitor<void> {
size_t size_;
};

// Visitor used with boost::apply_visitor to dispatch to the Release<Place>
// specialization matching the concrete place held in the Place variant.
struct ReleaseVisitor : public boost::static_visitor<void> {
  template <typename Place>
  inline void operator()(const Place &place) const {
    Release<Place>(place);
  }
};

size_t Usage::operator()(const platform::CPUPlace &cpu) const {
return Used(cpu);
}
Expand Down Expand Up @@ -439,6 +486,10 @@ void NaiveBestFitAllocator::FreeImpl(Allocation *allocation) {
delete allocation;
}

// Dispatch the release to the legacy allocator matching place's dynamic type.
void NaiveBestFitAllocator::ReleaseImpl(const platform::Place &place) {
  boost::apply_visitor(legacy::ReleaseVisitor(), place);
}

} // namespace allocation
} // namespace memory
} // namespace paddle
1 change: 1 addition & 0 deletions paddle/fluid/memory/allocation/naive_best_fit_allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ class NaiveBestFitAllocator : public Allocator {
protected:
Allocation *AllocateImpl(size_t size) override;
void FreeImpl(Allocation *allocation) override;
void ReleaseImpl(const platform::Place &place) override;

private:
platform::Place place_;
Expand Down
3 changes: 3 additions & 0 deletions paddle/fluid/memory/allocation/retry_allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ class RetryAllocator : public Allocator {
protected:
void FreeImpl(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size) override;
// Forward pool release to the wrapped allocator; no retry is involved.
void ReleaseImpl(const platform::Place& place) override {
  underlying_allocator_->Release(place);
}

private:
std::shared_ptr<Allocator> underlying_allocator_;
Expand Down
2 changes: 2 additions & 0 deletions paddle/fluid/memory/allocation/thread_local_allocator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ void ThreadLocalAllocatorImpl::FreeImpl(ThreadLocalAllocation* allocation) {
delete allocation;
}

void ThreadLocalAllocatorImpl::ReleaseImpl() { buddy_allocator_->Release(); }

} // namespace allocation
} // namespace memory
} // namespace paddle
4 changes: 4 additions & 0 deletions paddle/fluid/memory/allocation/thread_local_allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ class ThreadLocalAllocatorImpl
explicit ThreadLocalAllocatorImpl(const platform::Place& p);
ThreadLocalAllocation* AllocateImpl(size_t size);
void FreeImpl(ThreadLocalAllocation* allocation);
void ReleaseImpl();

private:
std::unique_ptr<memory::detail::BuddyAllocator> buddy_allocator_;
Expand Down Expand Up @@ -91,6 +92,9 @@ class ThreadLocalCUDAAllocator : public Allocator {
auto allocator_impl = tl_allocation->GetAllocator();
allocator_impl->FreeImpl(tl_allocation);
}
void ReleaseImpl(const platform::Place& p) override {
  // Fetch the allocator bound to this GPU for the current thread and
  // release its idle pool. (No value to return: ReleaseImpl is void.)
  auto allocator_impl = ThreadLocalCUDAAllocatorPool::Instance().Get(gpu_id_);
  allocator_impl->ReleaseImpl();
}

private:
int gpu_id_;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ TEST(ThreadLocalAllocator, cross_scope_release) {
auto tl_allocator_impl =
ThreadLocalCUDAAllocatorPool::Instance().Get(devices[j]);
allocator_addresses[j][i] = tl_allocator_impl.get();
memory::Release(platform::CUDAPlace(devices[j]));
}
});
}
Expand Down
30 changes: 28 additions & 2 deletions paddle/fluid/memory/detail/buddy_allocator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,10 @@ BuddyAllocator::~BuddyAllocator() {
while (!pool_.empty()) {
auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
auto desc = cache_.LoadDesc(block);
VLOG(10) << "Free from block (" << block << ", " << desc->get_size() << ")";
VLOG(10) << "Free from block (" << block << ", " << desc->get_total_size()
<< ")";

system_allocator_->Free(block, desc->get_size(), desc->get_index());
system_allocator_->Free(block, desc->get_total_size(), desc->get_index());
cache_.Invalidate(block);
pool_.erase(pool_.begin());
}
Expand Down Expand Up @@ -161,6 +162,31 @@ void BuddyAllocator::Free(void* p) {
IndexSizeAddress(desc->get_index(), desc->get_total_size(), block));
}

// Free every chunk currently sitting in the idle pool back to the system
// allocator. Only performed when no allocation is outstanding, because a
// live allocation could reside inside a chunk that would be freed.
void BuddyAllocator::Release() {
  // Acquire the allocator lock
  std::lock_guard<std::mutex> lock(mutex_);

  if (total_used_ == 0) {
    while (!pool_.empty()) {
      auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
      auto desc = cache_.LoadDesc(block);

      VLOG(10) << "Release from block (" << block << ", "
               << desc->get_total_size() << ")";
      // Bookkeeping: the chunk no longer counts toward the free pool.
      total_free_ -= desc->get_total_size();
      system_allocator_->Free(static_cast<void*>(block), desc->get_total_size(),
                              desc->get_index());
      // Invalidate the cached descriptor before dropping the pool entry.
      cache_.Invalidate(block);
      pool_.erase(pool_.begin());
    }
  } else {
    LOG(WARNING) << "The memory pool is not ready to release, please release "
                    "all variables that occupy the allocator memory."
                 << " If you are in multi-thread mode, please use "
                    "thread_local_allocator.";
  }
}

size_t BuddyAllocator::Used() { return total_used_; }
size_t BuddyAllocator::GetMinChunkSize() { return min_chunk_size_; }
size_t BuddyAllocator::GetMaxChunkSize() { return max_chunk_size_; }
Expand Down
2 changes: 2 additions & 0 deletions paddle/fluid/memory/detail/buddy_allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ class BuddyAllocator {
public:
void* Alloc(size_t unaligned_size);
void Free(void* ptr);
// Release the unused memory pool, a real free operation for the OS.
void Release();
size_t Used();
size_t GetMinChunkSize();
size_t GetMaxChunkSize();
Expand Down
17 changes: 17 additions & 0 deletions paddle/fluid/memory/detail/buddy_allocator_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,23 @@ TEST(BuddyAllocator, SpeedAna) {
std::cerr << "time cost " << diff.count() << std::endl;
}

// Exercises Release(): after a round of allocations the pool should be
// idle and returnable to the system without crashing.
TEST(BuddyAllocator, Release) {
  // In a 8 GB machine, the pool size will be about 800 MB
  FLAGS_fraction_of_gpu_memory_to_use = 0.1;
  FLAGS_initial_gpu_memory_in_mb = 0;
  FLAGS_reallocate_gpu_memory_in_mb = 0;

  BuddyAllocator buddy_allocator(
      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
      platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());

  // Less than pool size
  TestBuddyAllocator(&buddy_allocator, 10);
  TestBuddyAllocator(&buddy_allocator, 10 << 10);
  TestBuddyAllocator(&buddy_allocator, 50 << 20);

  // NOTE(review): presumably TestBuddyAllocator frees everything it
  // allocates, leaving the pool idle so Release() can drain it — confirm.
  buddy_allocator.Release();
}
#endif

} // namespace detail
Expand Down
4 changes: 4 additions & 0 deletions paddle/fluid/memory/malloc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,9 @@ AllocationPtr Alloc(const platform::Place &place, size_t size) {
return allocation::AllocatorFacade::Instance().Alloc(place, size);
}

// Return the idle pooled memory for `place` back to the operating system.
void Release(const platform::Place &place) {
  allocation::AllocatorFacade::Instance().Release(place);
}

} // namespace memory
} // namespace paddle
2 changes: 2 additions & 0 deletions paddle/fluid/memory/malloc.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,5 +38,7 @@ extern AllocationPtr Alloc(const platform::Place& place, size_t size);

extern AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size);

extern void Release(const platform::Place& place);

} // namespace memory
} // namespace paddle