1 change: 1 addition & 0 deletions paddle/phi/core/allocator.h
@@ -102,6 +102,7 @@ class Allocator {

virtual ~Allocator() = default;
virtual AllocationPtr Allocate(size_t bytes_size) = 0;
virtual void PreAlloc() {}

virtual bool IsAllocThreadSafe() const { return false; }
};
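The new hook defaults to a no-op, so existing `Allocator` subclasses compile and behave unchanged; only allocators that opt in override it. A minimal sketch of the pattern, assuming only the `phi::Allocator` interface above — `WarmupAllocator`, its internals, and `WarmUp` are illustrative, not part of the PR:

#include "paddle/phi/core/allocator.h"

namespace {

// Hypothetical subclass that reserves capacity up front when asked.
class WarmupAllocator : public phi::Allocator {
 public:
  AllocationPtr Allocate(size_t bytes_size) override {
    // ... serve the request from an internal pool (omitted) ...
    return nullptr;
  }

  // Opt-in warm-up: acquire an initial chunk before the first Allocate().
  void PreAlloc() override {
    // ... grab memory from the underlying device here ...
  }
};

}  // namespace

// Callers holding the base pointer can invoke the hook unconditionally;
// allocators that do not override PreAlloc() hit the empty default.
void WarmUp(phi::Allocator *allocator) { allocator->PreAlloc(); }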
42 changes: 36 additions & 6 deletions paddle/phi/core/memory/allocation/allocator_facade.cc
@@ -115,6 +115,8 @@ PHI_DEFINE_EXPORTED_bool(

COMMON_DECLARE_string(allocator_strategy);
COMMON_DECLARE_uint64(auto_growth_chunk_size_in_mb);
COMMON_DECLARE_uint64(alignment_size);
COMMON_DECLARE_uint64(small_pool_size_in_mb);
COMMON_DECLARE_bool(use_auto_growth_pinned_allocator);
COMMON_DECLARE_bool(use_cuda_malloc_async_allocator);
COMMON_DECLARE_bool(auto_free_cudagraph_allocations_on_launch);
@@ -252,6 +254,7 @@ class AllocatorFacadePrivate {
for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
InitAutoGrowthCUDAAllocator(phi::GPUPlace(dev_id),
allow_free_idle_chunk_);
PreAllocCUDAAllocator(phi::GPUPlace(dev_id));
}
auto_growth_allocators_ = allocators_;

@@ -932,6 +935,32 @@ class AllocatorFacadePrivate {
}
}

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void PreAllocCUDAAllocator(phi::GPUPlace p) {
// Small pool disabled; fall back to the single-pool path.
if (FLAGS_small_pool_size_in_mb <= 0) {
return;
}
const auto current_device_id = phi::backends::gpu::GetCurrentDeviceId();
if (FLAGS_use_auto_growth_v2 || FLAGS_use_cuda_malloc_async_allocator ||
FLAGS_use_virtual_memory_auto_growth) {
VLOG(6) << "PreAlloc is not implemented for "
"AutoGrowthBestFitAllocatorV2, CUDAMallocAsyncAllocator or "
"VirtualMemoryAutoGrowthBestFitAllocator.";
return;
}
Contributor
Should a `return` be added inside this if branch?

Contributor (Author)
done

auto it = allocators_.find(p);
PADDLE_ENFORCE_NE(it,
allocators_.end(),
common::errors::NotFound("No allocator for %s", p));
if (current_device_id == p.GetDeviceId()) {
auto allocator =
std::dynamic_pointer_cast<AutoGrowthBestFitAllocator>(it->second);
VLOG(8) << "PreAlloc for dev_id=" << p.GetDeviceId();
allocator->PreAlloc();
}
}
#endif

void InitCUDAMallocAsyncAllocator(phi::GPUPlace p, gpuStream_t stream) {
#ifdef PADDLE_WITH_CUDA
std::shared_ptr<Allocator>& allocator = cuda_allocators_[p][stream];
@@ -945,8 +974,10 @@

void InitAutoGrowthCUDAAllocator(phi::GPUPlace p, gpuStream_t stream) {
auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20;
auto alignment_size = FLAGS_alignment_size;
VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is "
<< FLAGS_auto_growth_chunk_size_in_mb;
<< FLAGS_auto_growth_chunk_size_in_mb << ", alignment_size is "
<< alignment_size;
#if defined(PADDLE_WITH_HIP)
auto cuda_allocator = CreateCUDAAllocator(p);
if (FLAGS_use_auto_growth_v2) {
@@ -959,11 +990,10 @@
allow_free_idle_chunk_);
} else {
cuda_allocators_[p][stream] =
std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator,
platform::GpuMinChunkSize(),
chunk_size,
allow_free_idle_chunk_);
std::make_shared<AutoGrowthBestFitAllocator>(cuda_allocator,
alignment_size,
chunk_size,
allow_free_idle_chunk_);
}
#endif

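With the facade wiring above, the two-pool behavior is driven entirely by flags read at allocator construction. A sketch of how a test might enable it programmatically — the header path, flag values, and function name are assumptions for illustration, not part of this change:

#include "paddle/common/flags.h"

COMMON_DECLARE_uint64(small_pool_size_in_mb);
COMMON_DECLARE_uint64(small_pool_pre_alloc_in_mb);
COMMON_DECLARE_uint64(large_pool_pre_alloc_in_mb);

// Call before the AllocatorFacade singleton is first touched: the flags are
// consumed when the CUDA allocators are initialized.
void EnableTwoPoolMode() {
  FLAGS_small_pool_size_in_mb = 1;         // requests <= 1 MiB -> small pool
  FLAGS_small_pool_pre_alloc_in_mb = 64;   // seed small pool with 64 MiB
  FLAGS_large_pool_pre_alloc_in_mb = 512;  // seed large pool with 512 MiB
}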
161 changes: 138 additions & 23 deletions paddle/phi/core/memory/allocation/auto_growth_best_fit_allocator.cc
@@ -44,6 +44,39 @@ PHI_DEFINE_EXPORTED_READONLY_bool(print_allocator_trace_info,
"print trace memory info");

PHI_DEFINE_EXPORTED_READONLY_bool(dump_chunk_info, false, "dump chunk info");
PHI_DEFINE_EXPORTED_uint64(
alignment_size,
256,
"All sizes are rounded up to a multiple of this value. Default: 256.");
PHI_DEFINE_EXPORTED_uint64(
small_pool_size_in_mb,
0,
"Threshold (MiB) separating the small and large pools. "
"0 disables the small pool and enables single-pool mode "
"(all requests go to the large pool). When > 0, requests "
"<= threshold use the small pool; larger requests use the "
"large pool. Default: 0.");
PHI_DEFINE_EXPORTED_uint64(small_pool_auto_growth_chunk_size_in_mb,
0,
"The minimal chunk size for the small pool in MiB. "
"If small_pool_size_in_mb > 0, this overrides "
"the constructor-provided global growth size "
"(FLAGS_auto_growth_chunk_size_in_mb).");
PHI_DEFINE_EXPORTED_uint64(large_pool_auto_growth_chunk_size_in_mb,
0,
"The minimal chunk size for the large pool in MiB. "
"If small_pool_size_in_mb > 0, this overrides "
"the constructor-provided global growth size "
"(FLAGS_auto_growth_chunk_size_in_mb).");
PHI_DEFINE_EXPORTED_uint64(
large_pool_pre_alloc_in_mb,
0,
"Pre-reserve this many MiB in the large pool. 0 disables pre-allocation.");
PHI_DEFINE_EXPORTED_uint64(
small_pool_pre_alloc_in_mb,
0,
"Pre-reserve this many MiB in the small pool. 0 disables pre-allocation.");

namespace paddle::memory::allocation {

AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator(
@@ -85,6 +118,66 @@ void AutoGrowthBestFitAllocator::DumpInfo() const {
<< std::endl;
}
}

bool AutoGrowthBestFitAllocator::is_small_free_block(size_t size) {
const size_t small_pool_size = FLAGS_small_pool_size_in_mb << 20;
return size <= small_pool_size;
}

size_t AutoGrowthBestFitAllocator::auto_growth_size(bool is_small,
size_t chunk_size) {
// Small pool disabled; fall back to single-pool mode with the
// constructor-provided chunk_size.
if (FLAGS_small_pool_size_in_mb == 0) {
return chunk_size;
}

const uint64_t pool_auto_growth_chunk_size_mb =
is_small ? FLAGS_small_pool_auto_growth_chunk_size_in_mb
: FLAGS_large_pool_auto_growth_chunk_size_in_mb;
const size_t auto_growth_size =
pool_auto_growth_chunk_size_mb
? (static_cast<size_t>(pool_auto_growth_chunk_size_mb) << 20)
: 0;

return AlignedSize(auto_growth_size, alignment_);
}
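// Worked example (flag values assumed): with FLAGS_small_pool_size_in_mb = 1
// and FLAGS_large_pool_auto_growth_chunk_size_in_mb = 256, a large-pool miss
// grows the pool by AlignedSize(256 MiB, alignment_). If the per-pool flag is
// left at 0, this function returns 0, so the caller's
// std::max(size, auto_growth_size(...)) degenerates to the request size and
// the new chunk is sized exactly to the allocation.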

void AutoGrowthBestFitAllocator::PreAlloc() {
auto small_pool_pre_alloc = FLAGS_small_pool_pre_alloc_in_mb << 20;
auto large_pool_pre_alloc = FLAGS_large_pool_pre_alloc_in_mb << 20;
if (small_pool_pre_alloc > 0) {
VLOG(10) << "PreAlloc small_pool_pre_alloc_in_mb = "
<< FLAGS_small_pool_pre_alloc_in_mb;
chunks_.emplace_back(static_unique_ptr_cast<Allocation>(
underlying_allocator_->Allocate(small_pool_pre_alloc)));
auto *chunk = &(*chunks_.rbegin());
uint8_t *p = reinterpret_cast<uint8_t *>(chunk->allocation_->ptr());
auto &blocks = chunk->blocks_;
blocks.emplace_back(
p, small_pool_pre_alloc, /*is_free=*/true, /*is_small=*/true, chunk);
small_free_blocks_.emplace(std::make_pair(small_pool_pre_alloc, p),
--(blocks.end()));
}

if (large_pool_pre_alloc > 0) {
VLOG(10) << "PreAlloc large_pool_pre_alloc_in_mb = "
<< FLAGS_large_pool_pre_alloc_in_mb;
chunks_.emplace_back(static_unique_ptr_cast<Allocation>(
underlying_allocator_->Allocate(large_pool_pre_alloc)));
auto *chunk = &(*chunks_.rbegin());
uint8_t *p = reinterpret_cast<uint8_t *>(chunk->allocation_->ptr());
auto &blocks = chunk->blocks_;
blocks.emplace_back(
p, large_pool_pre_alloc, /*is_free=*/true, /*is_small=*/false, chunk);
large_free_blocks_.emplace(std::make_pair(large_pool_pre_alloc, p),
--(blocks.end()));
}
}
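// Worked example (flag value assumed): with
// FLAGS_small_pool_pre_alloc_in_mb = 64, PreAlloc() pushes one 64 MiB chunk
// onto chunks_ whose single free block is indexed in small_free_blocks_
// under the key (64 MiB, ptr); the first small-pool allocation is then
// carved out of this block instead of triggering a fresh device allocation.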

phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl(
size_t unaligned_size) {
phi::RecordEvent record("AutoGrowthBestFitAllocator::Allocate",
@@ -97,26 +190,31 @@
<< ", extra size " << extra_padding_size_;

std::lock_guard<SpinLock> guard(spinlock_);
auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr));
bool is_small = is_small_free_block(size);
auto &free_blocks = is_small ? small_free_blocks_ : large_free_blocks_;
auto iter = free_blocks.lower_bound(std::make_pair(size, nullptr));
BlockIt block_it;
if (iter != free_blocks_.end()) {
if (iter != free_blocks.end()) {
block_it = iter->second;
free_blocks_.erase(iter);
free_blocks.erase(iter);
auto *chunk = block_it->chunk_;
size_t remaining_size = block_it->size_ - size;
VLOG(10) << "Allocate " << size << " bytes from chunk size "
<< block_it->size_ << ", remaining " << remaining_size;
if (remaining_size == 0) {
block_it->is_free_ = false;
block_it->is_small_ = is_small;
} else {
auto remaining_free_block = chunk->blocks_.insert(
block_it, Block(block_it->ptr_, remaining_size, true, chunk));
free_blocks_.emplace(std::make_pair(remaining_size, block_it->ptr_),
remaining_free_block);
block_it,
Block(block_it->ptr_, remaining_size, true, is_small, chunk));
free_blocks.emplace(std::make_pair(remaining_size, block_it->ptr_),
remaining_free_block);
block_it->ptr_ =
reinterpret_cast<uint8_t *>(block_it->ptr_) + remaining_size;
block_it->size_ = size;
block_it->is_free_ = false;
block_it->is_small_ = is_small;
}
} else {
if (FLAGS_dump_chunk_info) {
@@ -128,7 +226,8 @@
if (FLAGS_free_when_no_cache_hit) {
FreeIdleChunks();
}
size_t realloc_size = std::max(size, chunk_size_);
size_t realloc_size =
std::max(size, auto_growth_size(is_small, chunk_size_));

try {
chunks_.emplace_back(static_unique_ptr_cast<Allocation>(
@@ -151,10 +250,10 @@

size_t remaining_size = realloc_size - size;
if (remaining_size > 0) {
blocks.emplace_back(p, remaining_size, true, chunk);
free_blocks_.emplace(std::make_pair(remaining_size, p), --(blocks.end()));
blocks.emplace_back(p, remaining_size, true, is_small, chunk);
free_blocks.emplace(std::make_pair(remaining_size, p), --(blocks.end()));
}
blocks.emplace_back(p + remaining_size, size, false, chunk);
blocks.emplace_back(p + remaining_size, size, false, is_small, chunk);
block_it = --(blocks.end());
VLOG(2) << "Not found and reallocate " << realloc_size << "("
<< static_cast<void *>(p) << "), and remaining " << remaining_size;
@@ -167,7 +266,8 @@
++total_alloc_times_;
total_alloc_size_ += size;
VLOG(10) << "Alloc " << block_it->size_ << " bytes, ptr = " << block_it->ptr_;
return new BlockAllocation(block_it);
auto block_t = new BlockAllocation(block_it);
return block_t;
}

void AutoGrowthBestFitAllocator::FreeImpl(phi::Allocation *allocation) {
@@ -179,6 +279,8 @@
std::lock_guard<SpinLock> guard(spinlock_);
auto block_it = static_cast<BlockAllocation *>(allocation)->block_it_;
auto &blocks = block_it->chunk_->blocks_;
bool is_small = block_it->is_small_;
auto &free_blocks = is_small ? small_free_blocks_ : large_free_blocks_;

total_free_times_ += 1;
total_free_size_ += block_it->size_;
@@ -190,7 +292,7 @@
--prev_it;

if (prev_it->is_free_) {
free_blocks_.erase(std::make_pair(prev_it->size_, prev_it->ptr_));
free_blocks.erase(std::make_pair(prev_it->size_, prev_it->ptr_));
prev_it->size_ += block_it->size_;
blocks.erase(block_it);
block_it = prev_it;
@@ -202,19 +304,22 @@

// It's weird that using `next_it == blocks.end()` will cause a judgment fail.
if (block_it != (--blocks.end()) && next_it->is_free_) {
free_blocks_.erase(std::make_pair(next_it->size_, next_it->ptr_));
free_blocks.erase(std::make_pair(next_it->size_, next_it->ptr_));
block_it->size_ += next_it->size_;
blocks.erase(next_it);
}

free_blocks_.emplace(std::make_pair(block_it->size_, block_it->ptr_),
block_it);
free_blocks.emplace(std::make_pair(block_it->size_, block_it->ptr_),
block_it);

delete allocation;

if (FLAGS_free_idle_chunk) {
FreeIdleChunks();
}
if (FLAGS_dump_chunk_info) {
DumpInfo();
}
}

uint64_t AutoGrowthBestFitAllocator::FreeIdleChunks() {
@@ -229,13 +334,15 @@
auto &blocks = chunk_it->blocks_;
if (blocks.size() == 1 && blocks.begin()->is_free_) {
auto &block = *blocks.begin();
bool is_small = block.is_small_;
auto &free_blocks = is_small ? small_free_blocks_ : large_free_blocks_;
VLOG(2) << "Free chunk with size " << block.size_;
if (FLAGS_dump_chunk_info) {
std::cout << "FreeIdleChunks chunk is " << block.size_ << ", "
<< block.ptr_ << std::endl;
}
bytes += block.size_;
free_blocks_.erase(std::make_pair(block.size_, block.ptr_));
free_blocks.erase(std::make_pair(block.size_, block.ptr_));
chunk_it = chunks_.erase(chunk_it);
} else {
++chunk_it;
@@ -249,10 +356,15 @@
}

void AutoGrowthBestFitAllocator::Trace() const {
size_t cur_idle_bytes = 0;
auto it = free_blocks_.begin();
for (; it != free_blocks_.end(); ++it) {
cur_idle_bytes += it->second->size_;
size_t small_cur_idle_bytes = 0;
auto small_it = small_free_blocks_.begin();
for (; small_it != small_free_blocks_.end(); ++small_it) {
small_cur_idle_bytes += small_it->second->size_;
}
size_t large_cur_idle_bytes = 0;
auto large_it = large_free_blocks_.begin();
for (; large_it != large_free_blocks_.end(); ++large_it) {
large_cur_idle_bytes += large_it->second->size_;
}

VLOG(1) << "alloc:"
@@ -262,11 +374,14 @@
<< "m busy:"
<< (total_alloc_size_ - total_free_size_) / // NOLINT
static_cast<double>(1024 * 1024)
<< "m idle:"
<< cur_idle_bytes / static_cast<double>(1024 * 1024) // NOLINT
<< "m small idle:"
<< small_cur_idle_bytes / static_cast<double>(1024 * 1024) // NOLINT
<< "m large idle:"
<< large_cur_idle_bytes / static_cast<double>(1024 * 1024) // NOLINT
<< "m alloc_times:" << total_alloc_times_
<< " free_times:" << total_free_times_
<< " free_blocks_num:" << free_blocks_.size()
<< " small free_blocks_num:" << small_free_blocks_.size()
<< " large free_blocks_num:" << large_free_blocks_.size()
<< " curr_chunks_num:" << chunks_.size();
}

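The per-pool free lists shown throughout keep the best-fit lookup cheap: each is a std::map keyed by (size, ptr), so lower_bound(std::make_pair(size, nullptr)) lands on the smallest free block large enough for the request. A self-contained sketch of that index, with block contents invented for illustration:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <map>
#include <utility>

int main() {
  // Mirrors small_free_blocks_ / large_free_blocks_: ordered by size first,
  // then by address; the mapped int stands in for the real block iterator.
  std::map<std::pair<size_t, uint8_t *>, int> free_blocks;
  static uint8_t arena[1 << 20];
  free_blocks[{256, arena}] = 0;         // a 256-byte free block
  free_blocks[{4096, arena + 256}] = 1;  // a 4 KiB free block

  // Best fit for a 1 KiB request: smallest key with size >= 1024.
  auto it = free_blocks.lower_bound({1024, nullptr});
  if (it != free_blocks.end()) {
    std::cout << "picked free block of size " << it->first.first << "\n";
  }
  return 0;
}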