1 change: 1 addition & 0 deletions paddle/phi/core/allocator.h
@@ -102,6 +102,7 @@ class Allocator {

virtual ~Allocator() = default;
virtual AllocationPtr Allocate(size_t bytes_size) = 0;
virtual void PreAlloc() {}

virtual bool IsAllocThreadSafe() const { return false; }
};
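The new hook defaults to a no-op, so existing `Allocator` subclasses compile and behave unchanged; only allocators that opt in override it. A minimal sketch of the pattern, assuming only the `phi::Allocator` interface above — `WarmupAllocator`, its internals, and `WarmUp` are illustrative, not part of the PR:

#include "paddle/phi/core/allocator.h"

namespace {

// Hypothetical subclass that reserves capacity up front when asked.
class WarmupAllocator : public phi::Allocator {
 public:
  AllocationPtr Allocate(size_t bytes_size) override {
    // ... serve the request from an internal pool (omitted) ...
    return nullptr;
  }

  // Opt-in warm-up: acquire an initial chunk before the first Allocate().
  void PreAlloc() override {
    // ... grab memory from the underlying device here ...
  }
};

}  // namespace

// Callers holding the base pointer can invoke the hook unconditionally;
// allocators that do not override PreAlloc() hit the empty default.
void WarmUp(phi::Allocator *allocator) { allocator->PreAlloc(); }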
42 changes: 36 additions & 6 deletions paddle/phi/core/memory/allocation/allocator_facade.cc
@@ -115,6 +115,8 @@ PHI_DEFINE_EXPORTED_bool(

COMMON_DECLARE_string(allocator_strategy);
COMMON_DECLARE_uint64(auto_growth_chunk_size_in_mb);
COMMON_DECLARE_uint64(alignment_size);
COMMON_DECLARE_uint64(small_pool_size_in_mb);
COMMON_DECLARE_bool(use_auto_growth_pinned_allocator);
COMMON_DECLARE_bool(use_cuda_malloc_async_allocator);
COMMON_DECLARE_bool(auto_free_cudagraph_allocations_on_launch);
@@ -252,6 +254,7 @@ class AllocatorFacadePrivate {
for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
InitAutoGrowthCUDAAllocator(phi::GPUPlace(dev_id),
allow_free_idle_chunk_);
PreAllocCUDAAllocator(phi::GPUPlace(dev_id));
}
auto_growth_allocators_ = allocators_;

@@ -932,6 +935,32 @@ class AllocatorFacadePrivate {
}
}

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void PreAllocCUDAAllocator(phi::GPUPlace p) {
// Small pool disabled; fall back to the single-pool path.
if (FLAGS_small_pool_size_in_mb <= 0) {
return;
}
const auto current_device_id = phi::backends::gpu::GetCurrentDeviceId();
if (FLAGS_use_auto_growth_v2 || FLAGS_use_cuda_malloc_async_allocator ||
FLAGS_use_virtual_memory_auto_growth) {
VLOG(6) << "PreAlloc is not implemented for "
"AutoGrowthBestFitAllocatorV2, CUDAMallocAsyncAllocator or "
"VirtualMemoryAutoGrowthBestFitAllocator.";
return;
}
Contributor
Should a `return` be added inside this if branch?

Contributor (Author)
done

auto it = allocators_.find(p);
PADDLE_ENFORCE_NE(it,
allocators_.end(),
common::errors::NotFound("No allocator for %s", p));
if (current_device_id == p.GetDeviceId()) {
auto allocator =
std::dynamic_pointer_cast<AutoGrowthBestFitAllocator>(it->second);
VLOG(8) << "PreAlloc for dev_id=" << p.GetDeviceId();
allocator->PreAlloc();
}
}
#endif

void InitCUDAMallocAsyncAllocator(phi::GPUPlace p, gpuStream_t stream) {
#ifdef PADDLE_WITH_CUDA
std::shared_ptr<Allocator>& allocator = cuda_allocators_[p][stream];
@@ -945,8 +974,10 @@

void InitAutoGrowthCUDAAllocator(phi::GPUPlace p, gpuStream_t stream) {
auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20;
auto alignment_size = FLAGS_alignment_size;
VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is "
<< FLAGS_auto_growth_chunk_size_in_mb;
<< FLAGS_auto_growth_chunk_size_in_mb << ", alignment_size is "
<< alignment_size;
#if defined(PADDLE_WITH_HIP)
auto cuda_allocator = CreateCUDAAllocator(p);
if (FLAGS_use_auto_growth_v2) {
@@ -959,11 +990,10 @@
allow_free_idle_chunk_);
} else {
cuda_allocators_[p][stream] =
std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator,
platform::GpuMinChunkSize(),
chunk_size,
allow_free_idle_chunk_);
std::make_shared<AutoGrowthBestFitAllocator>(cuda_allocator,
alignment_size,
chunk_size,
allow_free_idle_chunk_);
}
#endif

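With the facade wiring above, the two-pool behavior is driven entirely by flags read at allocator construction. A sketch of how a test might enable it programmatically — the header path, flag values, and function name are assumptions for illustration, not part of this change:

#include "paddle/common/flags.h"

COMMON_DECLARE_uint64(small_pool_size_in_mb);
COMMON_DECLARE_uint64(small_pool_pre_alloc_in_mb);
COMMON_DECLARE_uint64(large_pool_pre_alloc_in_mb);

// Call before the AllocatorFacade singleton is first touched: the flags are
// consumed when the CUDA allocators are initialized.
void EnableTwoPoolMode() {
  FLAGS_small_pool_size_in_mb = 1;         // requests <= 1 MiB -> small pool
  FLAGS_small_pool_pre_alloc_in_mb = 64;   // seed small pool with 64 MiB
  FLAGS_large_pool_pre_alloc_in_mb = 512;  // seed large pool with 512 MiB
}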
161 changes: 138 additions & 23 deletions paddle/phi/core/memory/allocation/auto_growth_best_fit_allocator.cc
@@ -44,6 +44,39 @@ PHI_DEFINE_EXPORTED_READONLY_bool(print_allocator_trace_info,
"print trace memory info");

PHI_DEFINE_EXPORTED_READONLY_bool(dump_chunk_info, false, "dump chunk info");
PHI_DEFINE_EXPORTED_uint64(
alignment_size,
256,
"All sizes are rounded up to a multiple of this value. Default: 256.");
PHI_DEFINE_EXPORTED_uint64(
small_pool_size_in_mb,
0,
"Threshold (MiB) separating the small and large pools. "
"0 disables the small pool and enables single-pool mode "
"(all requests go to the large pool). When > 0, requests "
"<= threshold use the small pool; larger requests use the "
"large pool. Default: 0.");
PHI_DEFINE_EXPORTED_uint64(small_pool_auto_growth_chunk_size_in_mb,
0,
"The minimal chunk size for the small pool in MiB. "
"If small_pool_size_in_mb > 0, this overrides "
"the constructor-provided global growth size "
"(FLAGS_auto_growth_chunk_size_in_mb).");
PHI_DEFINE_EXPORTED_uint64(large_pool_auto_growth_chunk_size_in_mb,
0,
"The minimal chunk size for the large pool in MiB. "
"If small_pool_size_in_mb > 0, this overrides "
"the constructor-provided global growth size "
"(FLAGS_auto_growth_chunk_size_in_mb).");
PHI_DEFINE_EXPORTED_uint64(
large_pool_pre_alloc_in_mb,
0,
"Pre-reserve this many MiB in the large pool. 0 disables pre-allocation.");
PHI_DEFINE_EXPORTED_uint64(
small_pool_pre_alloc_in_mb,
0,
"Pre-reserve this many MiB in the small pool. 0 disables pre-allocation.");

namespace paddle::memory::allocation {

AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator(
@@ -85,6 +118,66 @@ void AutoGrowthBestFitAllocator::DumpInfo() const {
<< std::endl;
}
}

bool AutoGrowthBestFitAllocator::is_small_free_block(size_t size) {
const size_t small_pool_size = FLAGS_small_pool_size_in_mb << 20;
return size <= small_pool_size;
}

size_t AutoGrowthBestFitAllocator::auto_growth_size(bool is_small,
size_t chunk_size) {
// Small pool disabled; fall back to single-pool mode with the
// constructor-provided chunk_size.
if (FLAGS_small_pool_size_in_mb == 0) {
return chunk_size;
}

const uint64_t pool_auto_growth_chunk_size_mb =
is_small ? FLAGS_small_pool_auto_growth_chunk_size_in_mb
: FLAGS_large_pool_auto_growth_chunk_size_in_mb;
const size_t auto_growth_size =
pool_auto_growth_chunk_size_mb
? (static_cast<size_t>(pool_auto_growth_chunk_size_mb) << 20)
: 0;

return AlignedSize(auto_growth_size, alignment_);
}
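// Worked example (flag values assumed): with FLAGS_small_pool_size_in_mb = 1
// and FLAGS_large_pool_auto_growth_chunk_size_in_mb = 256, a large-pool miss
// grows the pool by AlignedSize(256 MiB, alignment_). If the per-pool flag is
// left at 0, this function returns 0, so the caller's
// std::max(size, auto_growth_size(...)) degenerates to the request size and
// the new chunk is sized exactly to the allocation.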

void AutoGrowthBestFitAllocator::PreAlloc() {
auto small_pool_pre_alloc = FLAGS_small_pool_pre_alloc_in_mb << 20;
auto large_pool_pre_alloc = FLAGS_large_pool_pre_alloc_in_mb << 20;
if (small_pool_pre_alloc > 0) {
VLOG(10) << "PreAlloc small_pool_pre_alloc_in_mb = "
<< FLAGS_small_pool_pre_alloc_in_mb;
chunks_.emplace_back(static_unique_ptr_cast<Allocation>(
underlying_allocator_->Allocate(small_pool_pre_alloc)));
auto *chunk = &(*chunks_.rbegin());
uint8_t *p = reinterpret_cast<uint8_t *>(chunk->allocation_->ptr());
auto &blocks = chunk->blocks_;
blocks.emplace_back(
p, small_pool_pre_alloc, /*is_free=*/true, /*is_small=*/true, chunk);
small_free_blocks_.emplace(std::make_pair(small_pool_pre_alloc, p),
--(blocks.end()));
}

if (large_pool_pre_alloc > 0) {
VLOG(10) << "PreAlloc large_pool_pre_alloc_in_mb = "
<< FLAGS_large_pool_pre_alloc_in_mb;
chunks_.emplace_back(static_unique_ptr_cast<Allocation>(
underlying_allocator_->Allocate(large_pool_pre_alloc)));
auto *chunk = &(*chunks_.rbegin());
uint8_t *p = reinterpret_cast<uint8_t *>(chunk->allocation_->ptr());
auto &blocks = chunk->blocks_;
blocks.emplace_back(
p, large_pool_pre_alloc, /*is_free=*/true, /*is_small=*/false, chunk);
large_free_blocks_.emplace(std::make_pair(large_pool_pre_alloc, p),
--(blocks.end()));
}
}
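// Worked example (flag value assumed): with
// FLAGS_small_pool_pre_alloc_in_mb = 64, PreAlloc() pushes one 64 MiB chunk
// onto chunks_ whose single free block is indexed in small_free_blocks_
// under the key (64 MiB, ptr); the first small-pool allocation is then
// carved out of this block instead of triggering a fresh device allocation.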

phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl(
size_t unaligned_size) {
phi::RecordEvent record("AutoGrowthBestFitAllocator::Allocate",
@@ -97,26 +190,31 @@
<< ", extra size " << extra_padding_size_;

std::lock_guard<SpinLock> guard(spinlock_);
auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr));
bool is_small = is_small_free_block(size);
auto &free_blocks = is_small ? small_free_blocks_ : large_free_blocks_;
auto iter = free_blocks.lower_bound(std::make_pair(size, nullptr));
BlockIt block_it;
if (iter != free_blocks_.end()) {
if (iter != free_blocks.end()) {
block_it = iter->second;
free_blocks_.erase(iter);
free_blocks.erase(iter);
auto *chunk = block_it->chunk_;
size_t remaining_size = block_it->size_ - size;
VLOG(10) << "Allocate " << size << " bytes from chunk size "
<< block_it->size_ << ", remaining " << remaining_size;
if (remaining_size == 0) {
block_it->is_free_ = false;
block_it->is_small_ = is_small;
} else {
auto remaining_free_block = chunk->blocks_.insert(
block_it, Block(block_it->ptr_, remaining_size, true, chunk));
free_blocks_.emplace(std::make_pair(remaining_size, block_it->ptr_),
remaining_free_block);
block_it,
Block(block_it->ptr_, remaining_size, true, is_small, chunk));
free_blocks.emplace(std::make_pair(remaining_size, block_it->ptr_),
remaining_free_block);
block_it->ptr_ =
reinterpret_cast<uint8_t *>(block_it->ptr_) + remaining_size;
block_it->size_ = size;
block_it->is_free_ = false;
block_it->is_small_ = is_small;
}
} else {
if (FLAGS_dump_chunk_info) {
@@ -128,7 +226,8 @@
if (FLAGS_free_when_no_cache_hit) {
FreeIdleChunks();
}
size_t realloc_size = std::max(size, chunk_size_);
size_t realloc_size =
std::max(size, auto_growth_size(is_small, chunk_size_));

try {
chunks_.emplace_back(static_unique_ptr_cast<Allocation>(
@@ -151,10 +250,10 @@

size_t remaining_size = realloc_size - size;
if (remaining_size > 0) {
blocks.emplace_back(p, remaining_size, true, chunk);
free_blocks_.emplace(std::make_pair(remaining_size, p), --(blocks.end()));
blocks.emplace_back(p, remaining_size, true, is_small, chunk);
free_blocks.emplace(std::make_pair(remaining_size, p), --(blocks.end()));
}
blocks.emplace_back(p + remaining_size, size, false, chunk);
blocks.emplace_back(p + remaining_size, size, false, is_small, chunk);
block_it = --(blocks.end());
VLOG(2) << "Not found and reallocate " << realloc_size << "("
<< static_cast<void *>(p) << "), and remaining " << remaining_size;
@@ -167,7 +266,8 @@
++total_alloc_times_;
total_alloc_size_ += size;
VLOG(10) << "Alloc " << block_it->size_ << " bytes, ptr = " << block_it->ptr_;
return new BlockAllocation(block_it);
auto block_t = new BlockAllocation(block_it);
return block_t;
}

void AutoGrowthBestFitAllocator::FreeImpl(phi::Allocation *allocation) {
@@ -179,6 +279,8 @@
std::lock_guard<SpinLock> guard(spinlock_);
auto block_it = static_cast<BlockAllocation *>(allocation)->block_it_;
auto &blocks = block_it->chunk_->blocks_;
bool is_small = block_it->is_small_;
auto &free_blocks = is_small ? small_free_blocks_ : large_free_blocks_;

total_free_times_ += 1;
total_free_size_ += block_it->size_;
@@ -190,7 +292,7 @@
--prev_it;

if (prev_it->is_free_) {
free_blocks_.erase(std::make_pair(prev_it->size_, prev_it->ptr_));
free_blocks.erase(std::make_pair(prev_it->size_, prev_it->ptr_));
prev_it->size_ += block_it->size_;
blocks.erase(block_it);
block_it = prev_it;
@@ -202,19 +304,22 @@

// It's weird that using `next_it == blocks.end()` will cause a judgment fail.
if (block_it != (--blocks.end()) && next_it->is_free_) {
free_blocks_.erase(std::make_pair(next_it->size_, next_it->ptr_));
free_blocks.erase(std::make_pair(next_it->size_, next_it->ptr_));
block_it->size_ += next_it->size_;
blocks.erase(next_it);
}

free_blocks_.emplace(std::make_pair(block_it->size_, block_it->ptr_),
block_it);
free_blocks.emplace(std::make_pair(block_it->size_, block_it->ptr_),
block_it);

delete allocation;

if (FLAGS_free_idle_chunk) {
FreeIdleChunks();
}
if (FLAGS_dump_chunk_info) {
DumpInfo();
}
}

uint64_t AutoGrowthBestFitAllocator::FreeIdleChunks() {
@@ -229,13 +334,15 @@
auto &blocks = chunk_it->blocks_;
if (blocks.size() == 1 && blocks.begin()->is_free_) {
auto &block = *blocks.begin();
bool is_small = block.is_small_;
auto &free_blocks = is_small ? small_free_blocks_ : large_free_blocks_;
VLOG(2) << "Free chunk with size " << block.size_;
if (FLAGS_dump_chunk_info) {
std::cout << "FreeIdleChunks chunk is " << block.size_ << ", "
<< block.ptr_ << std::endl;
}
bytes += block.size_;
free_blocks_.erase(std::make_pair(block.size_, block.ptr_));
free_blocks.erase(std::make_pair(block.size_, block.ptr_));
chunk_it = chunks_.erase(chunk_it);
} else {
++chunk_it;
@@ -249,10 +356,15 @@
}

void AutoGrowthBestFitAllocator::Trace() const {
size_t cur_idle_bytes = 0;
auto it = free_blocks_.begin();
for (; it != free_blocks_.end(); ++it) {
cur_idle_bytes += it->second->size_;
size_t small_cur_idle_bytes = 0;
auto small_it = small_free_blocks_.begin();
for (; small_it != small_free_blocks_.end(); ++small_it) {
small_cur_idle_bytes += small_it->second->size_;
}
size_t large_cur_idle_bytes = 0;
auto large_it = large_free_blocks_.begin();
for (; large_it != large_free_blocks_.end(); ++large_it) {
large_cur_idle_bytes += large_it->second->size_;
}

VLOG(1) << "alloc:"
@@ -262,11 +374,14 @@
<< "m busy:"
<< (total_alloc_size_ - total_free_size_) / // NOLINT
static_cast<double>(1024 * 1024)
<< "m idle:"
<< cur_idle_bytes / static_cast<double>(1024 * 1024) // NOLINT
<< "m small idle:"
<< small_cur_idle_bytes / static_cast<double>(1024 * 1024) // NOLINT
<< "m large idle:"
<< large_cur_idle_bytes / static_cast<double>(1024 * 1024) // NOLINT
<< "m alloc_times:" << total_alloc_times_
<< " free_times:" << total_free_times_
<< " free_blocks_num:" << free_blocks_.size()
<< " small free_blocks_num:" << small_free_blocks_.size()
<< " large free_blocks_num:" << large_free_blocks_.size()
<< " curr_chunks_num:" << chunks_.size();
}

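The per-pool free lists shown throughout keep the best-fit lookup cheap: each is a std::map keyed by (size, ptr), so lower_bound(std::make_pair(size, nullptr)) lands on the smallest free block large enough for the request. A self-contained sketch of that index, with block contents invented for illustration:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <map>
#include <utility>

int main() {
  // Mirrors small_free_blocks_ / large_free_blocks_: ordered by size first,
  // then by address; the mapped int stands in for the real block iterator.
  std::map<std::pair<size_t, uint8_t *>, int> free_blocks;
  static uint8_t arena[1 << 20];
  free_blocks[{256, arena}] = 0;         // a 256-byte free block
  free_blocks[{4096, arena + 256}] = 1;  // a 4 KiB free block

  // Best fit for a 1 KiB request: smallest key with size >= 1024.
  auto it = free_blocks.lower_bound({1024, nullptr});
  if (it != free_blocks.end()) {
    std::cout << "picked free block of size " << it->first.first << "\n";
  }
  return 0;
}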