Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions paddle/phi/core/allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ class Allocator {

virtual ~Allocator() = default;
virtual AllocationPtr Allocate(size_t bytes_size) = 0;
virtual void PreAlloc() {}

virtual bool IsAllocThreadSafe() const { return false; }
};
Expand Down
43 changes: 37 additions & 6 deletions paddle/phi/core/memory/allocation/allocator_facade.cc
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,8 @@ PHI_DEFINE_EXPORTED_bool(

COMMON_DECLARE_string(allocator_strategy);
COMMON_DECLARE_uint64(auto_growth_chunk_size_in_mb);
COMMON_DECLARE_uint64(alignment_size);
COMMON_DECLARE_uint64(small_pool_size_in_mb);
COMMON_DECLARE_bool(use_auto_growth_pinned_allocator);
COMMON_DECLARE_bool(use_cuda_malloc_async_allocator);
COMMON_DECLARE_bool(auto_free_cudagraph_allocations_on_launch);
Expand Down Expand Up @@ -252,6 +254,7 @@ class AllocatorFacadePrivate {
for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
InitAutoGrowthCUDAAllocator(phi::GPUPlace(dev_id),
allow_free_idle_chunk_);
PreAllocCUDAAllocator(phi::GPUPlace(dev_id));
}
auto_growth_allocators_ = allocators_;

Expand Down Expand Up @@ -932,6 +935,33 @@ class AllocatorFacadePrivate {
}
}

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// Pre-reserves device memory for the allocator of place `p` when the
// small/large dual-pool mode is enabled (FLAGS_small_pool_size_in_mb > 0).
// Only effective for AutoGrowthBestFitAllocator and only when `p` matches
// the currently active CUDA device.
void PreAllocCUDAAllocator(phi::GPUPlace p) {
  // fallback to single pool: 0 disables the small pool, nothing to reserve.
  if (FLAGS_small_pool_size_in_mb <= 0) {
    return;
  }
  // Other allocator strategies do not implement PreAlloc().
  if (FLAGS_use_auto_growth_v2 || FLAGS_use_cuda_malloc_async_allocator ||
      FLAGS_use_virtual_memory_auto_growth) {
    VLOG(6) << "PreAlloc is not implemented for "
               "AutoGrowthBestFitAllocatorV2, CUDAMallocAsyncAllocator or "
               "VirtualMemoryAutoGrowthBestFitAllocator.";
    return;
  }
  const auto current_device_id = phi::backends::gpu::GetCurrentDeviceId();
  auto it = allocators_.find(p);
  PADDLE_ENFORCE_NE(it,
                    allocators_.end(),
                    common::errors::NotFound("No allocator for %s", p));
  if (current_device_id == p.GetDeviceId()) {
    auto allocator =
        std::dynamic_pointer_cast<AutoGrowthBestFitAllocator>(it->second);
    // The stored allocator may be a different or wrapped type, in which
    // case the cast yields nullptr; skip instead of dereferencing null.
    if (allocator == nullptr) {
      VLOG(6) << "Allocator for dev_id=" << p.GetDeviceId()
              << " is not an AutoGrowthBestFitAllocator; skip PreAlloc.";
      return;
    }
    VLOG(8) << "PreAlloc for dev_id=" << p.GetDeviceId();
    allocator->PreAlloc();
  }
}
#endif

void InitCUDAMallocAsyncAllocator(phi::GPUPlace p, gpuStream_t stream) {
#ifdef PADDLE_WITH_CUDA
std::shared_ptr<Allocator>& allocator = cuda_allocators_[p][stream];
Expand All @@ -945,8 +975,10 @@ class AllocatorFacadePrivate {

void InitAutoGrowthCUDAAllocator(phi::GPUPlace p, gpuStream_t stream) {
auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20;
auto alignment_size = FLAGS_alignment_size;
VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is "
<< FLAGS_auto_growth_chunk_size_in_mb;
<< FLAGS_auto_growth_chunk_size_in_mb << ", alignment_size is "
<< alignment_size;
#if defined(PADDLE_WITH_HIP)
auto cuda_allocator = CreateCUDAAllocator(p);
if (FLAGS_use_auto_growth_v2) {
Expand All @@ -959,11 +991,10 @@ class AllocatorFacadePrivate {
allow_free_idle_chunk_);
} else {
cuda_allocators_[p][stream] =
std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator,
platform::GpuMinChunkSize(),
chunk_size,
allow_free_idle_chunk_);
std::make_shared<AutoGrowthBestFitAllocator>(cuda_allocator,
alignment_size,
chunk_size,
allow_free_idle_chunk_);
}
#endif

Expand Down
161 changes: 138 additions & 23 deletions paddle/phi/core/memory/allocation/auto_growth_best_fit_allocator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,39 @@ PHI_DEFINE_EXPORTED_READONLY_bool(print_allocator_trace_info,
"print trace memory info");

PHI_DEFINE_EXPORTED_READONLY_bool(dump_chunk_info, false, "dump chunk info");
// Alignment applied to every allocation request before pool lookup.
PHI_DEFINE_EXPORTED_uint64(
alignment_size,
256,
"All sizes are rounded up to a multiple of this value. Default: 256.");
// Size threshold that routes requests into the small pool vs. the large
// pool; 0 keeps the legacy single-pool behavior.
PHI_DEFINE_EXPORTED_uint64(
small_pool_size_in_mb,
0,
"Threshold (MiB) separating the small and large pools. "
"0 disables the small pool and enables single-pool mode "
"(all requests go to the large pool). When > 0, requests "
"<= threshold use the small pool; larger requests use the "
"large pool. Default: 0.");
// Per-pool growth chunk sizes, only consulted in dual-pool mode.
PHI_DEFINE_EXPORTED_uint64(small_pool_auto_growth_chunk_size_in_mb,
0,
"The minimal chunk size for the small pool in MiB. "
"If small_pool_size_in_mb > 0, this overrides "
"the constructor-provided global growth size "
"(FLAGS_auto_growth_chunk_size_in_mb).");
PHI_DEFINE_EXPORTED_uint64(large_pool_auto_growth_chunk_size_in_mb,
0,
"The minimal chunk size for the large pool in MiB. "
"If small_pool_size_in_mb > 0, this overrides "
"the constructor-provided global growth size "
"(FLAGS_auto_growth_chunk_size_in_mb).");
// Optional up-front reservations made by PreAlloc(); 0 disables each.
PHI_DEFINE_EXPORTED_uint64(
large_pool_pre_alloc_in_mb,
0,
"Pre-reserve this many MiB in the large pool. 0 disables pre-allocation.");
PHI_DEFINE_EXPORTED_uint64(
small_pool_pre_alloc_in_mb,
0,
"Pre-reserve this many MiB in the small pool. 0 disables pre-allocation.");

namespace paddle::memory::allocation {

AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator(
Expand Down Expand Up @@ -85,6 +118,66 @@ void AutoGrowthBestFitAllocator::DumpInfo() const {
<< std::endl;
}
}

// Returns true when a request of `size` bytes belongs to the small pool,
// i.e. it does not exceed the configured threshold
// (FLAGS_small_pool_size_in_mb converted to bytes). With the flag at 0
// (single-pool mode) only a zero-byte size classifies as "small".
bool AutoGrowthBestFitAllocator::is_small_free_block(size_t size) {
  const size_t threshold_bytes = FLAGS_small_pool_size_in_mb << 20;
  return size <= threshold_bytes;
}

size_t AutoGrowthBestFitAllocator::auto_growth_size(bool is_small,
size_t chunk_size) {
// fallback to single pool and use constructor-provided chunk_size.
if (FLAGS_small_pool_size_in_mb == 0) {
return chunk_size;
}

const uint64_t pool_auto_growth_chunk_size_mb =
is_small ? FLAGS_small_pool_auto_growth_chunk_size_in_mb
: FLAGS_large_pool_auto_growth_chunk_size_in_mb;
const size_t auto_growth_size =
pool_auto_growth_chunk_size_mb
? (static_cast<size_t>(pool_auto_growth_chunk_size_mb) << 20)
: 0;

return AlignedSize(auto_growth_size, alignment_);
}

void AutoGrowthBestFitAllocator::PreAlloc() {
auto small_pool_pre_alloc = FLAGS_small_pool_pre_alloc_in_mb << 20;
auto large_pool_pre_alloc = FLAGS_large_pool_pre_alloc_in_mb << 20;
if (small_pool_pre_alloc > 0) {
VLOG(10) << "PreAlloc small_pool_pre_alloc_in_mb = "
<< FLAGS_small_pool_pre_alloc_in_mb;
chunks_.emplace_back(static_unique_ptr_cast<Allocation>(
underlying_allocator_->Allocate(small_pool_pre_alloc)));
auto *chunk = &(*chunks_.rbegin());
uint8_t *p = reinterpret_cast<uint8_t *>(chunk->allocation_->ptr());
auto &blocks = chunk->blocks_;
blocks.emplace_back(
p, small_pool_pre_alloc, /*is_free=*/true, /*is_small=*/true, chunk);
small_free_blocks_.emplace(std::make_pair(small_pool_pre_alloc, p),
--(blocks.end()));
}

if (large_pool_pre_alloc > 0) {
VLOG(10) << "PreAlloc large_pool_pre_alloc_in_mb = "
<< FLAGS_large_pool_pre_alloc_in_mb;
chunks_.emplace_back(static_unique_ptr_cast<Allocation>(
underlying_allocator_->Allocate(large_pool_pre_alloc)));
auto *chunk = &(*chunks_.rbegin());
uint8_t *p = reinterpret_cast<uint8_t *>(chunk->allocation_->ptr());
auto &blocks = chunk->blocks_;
blocks.emplace_back(
p, large_pool_pre_alloc, /*is_free=*/true, /*is_small=*/false, chunk);
large_free_blocks_.emplace(std::make_pair(large_pool_pre_alloc, p),
--(blocks.end()));
}
}

phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl(
size_t unaligned_size) {
phi::RecordEvent record("AutoGrowthBestFitAllocator::Allocate",
Expand All @@ -97,26 +190,31 @@ phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl(
<< ", extra size " << extra_padding_size_;

std::lock_guard<SpinLock> guard(spinlock_);
auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr));
bool is_small = is_small_free_block(size);
auto &free_blocks = is_small ? small_free_blocks_ : large_free_blocks_;
auto iter = free_blocks.lower_bound(std::make_pair(size, nullptr));
BlockIt block_it;
if (iter != free_blocks_.end()) {
if (iter != free_blocks.end()) {
block_it = iter->second;
free_blocks_.erase(iter);
free_blocks.erase(iter);
auto *chunk = block_it->chunk_;
size_t remaining_size = block_it->size_ - size;
VLOG(10) << "Allocate " << size << " bytes from chunk size "
<< block_it->size_ << ", remaining " << remaining_size;
if (remaining_size == 0) {
block_it->is_free_ = false;
block_it->is_small_ = is_small;
} else {
auto remaining_free_block = chunk->blocks_.insert(
block_it, Block(block_it->ptr_, remaining_size, true, chunk));
free_blocks_.emplace(std::make_pair(remaining_size, block_it->ptr_),
remaining_free_block);
block_it,
Block(block_it->ptr_, remaining_size, true, is_small, chunk));
free_blocks.emplace(std::make_pair(remaining_size, block_it->ptr_),
remaining_free_block);
block_it->ptr_ =
reinterpret_cast<uint8_t *>(block_it->ptr_) + remaining_size;
block_it->size_ = size;
block_it->is_free_ = false;
block_it->is_small_ = is_small;
}
} else {
if (FLAGS_dump_chunk_info) {
Expand All @@ -128,7 +226,8 @@ phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl(
if (FLAGS_free_when_no_cache_hit) {
FreeIdleChunks();
}
size_t realloc_size = std::max(size, chunk_size_);
size_t realloc_size =
std::max(size, auto_growth_size(is_small, chunk_size_));

try {
chunks_.emplace_back(static_unique_ptr_cast<Allocation>(
Expand All @@ -151,10 +250,10 @@ phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl(

size_t remaining_size = realloc_size - size;
if (remaining_size > 0) {
blocks.emplace_back(p, remaining_size, true, chunk);
free_blocks_.emplace(std::make_pair(remaining_size, p), --(blocks.end()));
blocks.emplace_back(p, remaining_size, true, is_small, chunk);
free_blocks.emplace(std::make_pair(remaining_size, p), --(blocks.end()));
}
blocks.emplace_back(p + remaining_size, size, false, chunk);
blocks.emplace_back(p + remaining_size, size, false, is_small, chunk);
block_it = --(blocks.end());
VLOG(2) << "Not found and reallocate " << realloc_size << "("
<< static_cast<void *>(p) << "), and remaining " << remaining_size;
Expand All @@ -167,7 +266,8 @@ phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl(
++total_alloc_times_;
total_alloc_size_ += size;
VLOG(10) << "Alloc " << block_it->size_ << " bytes, ptr = " << block_it->ptr_;
return new BlockAllocation(block_it);
auto block_t = new BlockAllocation(block_it);
return block_t;
}

void AutoGrowthBestFitAllocator::FreeImpl(phi::Allocation *allocation) {
Expand All @@ -179,6 +279,8 @@ void AutoGrowthBestFitAllocator::FreeImpl(phi::Allocation *allocation) {
std::lock_guard<SpinLock> guard(spinlock_);
auto block_it = static_cast<BlockAllocation *>(allocation)->block_it_;
auto &blocks = block_it->chunk_->blocks_;
bool is_small = block_it->is_small_;
auto &free_blocks = is_small ? small_free_blocks_ : large_free_blocks_;

total_free_times_ += 1;
total_free_size_ += block_it->size_;
Expand All @@ -190,7 +292,7 @@ void AutoGrowthBestFitAllocator::FreeImpl(phi::Allocation *allocation) {
--prev_it;

if (prev_it->is_free_) {
free_blocks_.erase(std::make_pair(prev_it->size_, prev_it->ptr_));
free_blocks.erase(std::make_pair(prev_it->size_, prev_it->ptr_));
prev_it->size_ += block_it->size_;
blocks.erase(block_it);
block_it = prev_it;
Expand All @@ -202,19 +304,22 @@ void AutoGrowthBestFitAllocator::FreeImpl(phi::Allocation *allocation) {

// It's weird that using `next_it == blocks.end()` will cause a judgment fail.
if (block_it != (--blocks.end()) && next_it->is_free_) {
free_blocks_.erase(std::make_pair(next_it->size_, next_it->ptr_));
free_blocks.erase(std::make_pair(next_it->size_, next_it->ptr_));
block_it->size_ += next_it->size_;
blocks.erase(next_it);
}

free_blocks_.emplace(std::make_pair(block_it->size_, block_it->ptr_),
block_it);
free_blocks.emplace(std::make_pair(block_it->size_, block_it->ptr_),
block_it);

delete allocation;

if (FLAGS_free_idle_chunk) {
FreeIdleChunks();
}
if (FLAGS_dump_chunk_info) {
DumpInfo();
}
}

uint64_t AutoGrowthBestFitAllocator::FreeIdleChunks() {
Expand All @@ -229,13 +334,15 @@ uint64_t AutoGrowthBestFitAllocator::FreeIdleChunks() {
auto &blocks = chunk_it->blocks_;
if (blocks.size() == 1 && blocks.begin()->is_free_) {
auto &block = *blocks.begin();
bool is_small = block.is_small_;
auto &free_blocks = is_small ? small_free_blocks_ : large_free_blocks_;
VLOG(2) << "Free chunk with size " << block.size_;
if (FLAGS_dump_chunk_info) {
std::cout << "FreeIdleChunks chunk is " << block.size_ << ", "
<< block.ptr_ << std::endl;
}
bytes += block.size_;
free_blocks_.erase(std::make_pair(block.size_, block.ptr_));
free_blocks.erase(std::make_pair(block.size_, block.ptr_));
chunk_it = chunks_.erase(chunk_it);
} else {
++chunk_it;
Expand All @@ -249,10 +356,15 @@ uint64_t AutoGrowthBestFitAllocator::FreeIdleChunks() {
}

void AutoGrowthBestFitAllocator::Trace() const {
size_t cur_idle_bytes = 0;
auto it = free_blocks_.begin();
for (; it != free_blocks_.end(); ++it) {
cur_idle_bytes += it->second->size_;
size_t small_cur_idle_bytes = 0;
auto small_it = small_free_blocks_.begin();
for (; small_it != small_free_blocks_.end(); ++small_it) {
small_cur_idle_bytes += small_it->second->size_;
}
size_t large_cur_idle_bytes = 0;
auto large_it = large_free_blocks_.begin();
for (; large_it != large_free_blocks_.end(); ++large_it) {
large_cur_idle_bytes += large_it->second->size_;
}

VLOG(1) << "alloc:"
Expand All @@ -262,11 +374,14 @@ void AutoGrowthBestFitAllocator::Trace() const {
<< "m busy:"
<< (total_alloc_size_ - total_free_size_) / // NOLINT
static_cast<double>(1024 * 1024)
<< "m idle:"
<< cur_idle_bytes / static_cast<double>(1024 * 1024) // NOLINT
<< "m small idle:"
<< small_cur_idle_bytes / static_cast<double>(1024 * 1024) // NOLINT
<< "m large idle:"
<< large_cur_idle_bytes / static_cast<double>(1024 * 1024) // NOLINT
<< "m alloc_times:" << total_alloc_times_
<< " free_times:" << total_free_times_
<< " free_blocks_num:" << free_blocks_.size()
<< " small free_blocks_num:" << small_free_blocks_.size()
<< " large free_blocks_num:" << large_free_blocks_.size()
<< " curr_chunks_num:" << chunks_.size();
}

Expand Down
Loading