
Commit 263c0ec

polish code

1 parent b3a2b2f commit 263c0ec

4 files changed: +79 −47 lines changed

paddle/phi/core/memory/allocation/allocator_facade.cc

Lines changed: 19 additions & 5 deletions
@@ -253,6 +253,7 @@ class AllocatorFacadePrivate {
     for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
       InitAutoGrowthCUDAAllocator(phi::GPUPlace(dev_id),
                                   allow_free_idle_chunk_);
+      PreAllocCUDAAllocator(phi::GPUPlace(dev_id));
     }
     auto_growth_allocators_ = allocators_;

@@ -933,6 +934,24 @@ class AllocatorFacadePrivate {
     }
   }

+  void PreAllocCUDAAllocator(phi::GPUPlace p) {
+    const auto current_device_id = phi::backends::gpu::GetCurrentDeviceId();
+    if (FLAGS_use_auto_growth_v2) {
+      PADDLE_THROW(common::errors::Unavailable(
+          "PreAlloc is not implemented for AutoGrowthBestFitAllocatorV2."));
+    }
+    auto it = allocators_.find(p);
+    PADDLE_ENFORCE_NE(it,
+                      allocators_.end(),
+                      common::errors::NotFound("No allocator for %s", p));
+    if (current_device_id == p.GetDeviceId()) {
+      auto allocator =
+          std::dynamic_pointer_cast<AutoGrowthBestFitAllocator>(it->second);
+      VLOG(8) << "PreAlloc for dev_id=" << p.GetDeviceId();
+      allocator->PreAlloc();
+    }
+  }
+
   void InitCUDAMallocAsyncAllocator(phi::GPUPlace p, gpuStream_t stream) {
 #ifdef PADDLE_WITH_CUDA
     std::shared_ptr<Allocator>& allocator = cuda_allocators_[p][stream];

@@ -1188,7 +1207,6 @@ class AllocatorFacadePrivate {
   }

   void WrapStreamSafeCUDAAllocatorForDefault() {
-    const auto current_device_id = phi::backends::gpu::GetCurrentDeviceId();
     for (auto& pair : allocators_) {
       auto& place = pair.first;
       if (phi::is_gpu_place(place)) {

@@ -1198,10 +1216,6 @@ class AllocatorFacadePrivate {
             place,
             /* default_stream = */ nullptr,
             /* in_cuda_graph_capturing = */ !allow_free_idle_chunk_);
-        if (place.GetDeviceId() == current_device_id) {
-          VLOG(8) << "PreAlloc for current_device_id=" << current_device_id;
-          allocator->PreAlloc();
-        }
         pair.second = allocator;

         // NOTE(Ruibiao): A tricky implement to give StreamSafeCUDAAllocator an
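Taken together, these hunks move pre-allocation out of WrapStreamSafeCUDAAllocatorForDefault and into allocator setup: each device's auto-growth allocator is built and then immediately offered a chance to pre-allocate, with PreAllocCUDAAllocator acting only when the place matches the current device. A minimal standalone sketch of that pattern, with all Paddle types replaced by hypothetical stubs:

// Standalone sketch of the "pre-alloc only on the current device" guard above.
// GPUPlace, Allocator, and GetCurrentDeviceId are stand-ins, not Paddle APIs.
#include <iostream>
#include <map>
#include <memory>

struct GPUPlace {
  int dev_id;
  int GetDeviceId() const { return dev_id; }
  bool operator<(const GPUPlace &other) const { return dev_id < other.dev_id; }
};

struct Allocator {
  void PreAlloc() { std::cout << "reserving pre-alloc chunks\n"; }
};

int GetCurrentDeviceId() { return 0; }  // pretend device 0 is active

int main() {
  std::map<GPUPlace, std::shared_ptr<Allocator>> allocators;
  const int device_count = 2;  // stand-in for platform::GetGPUDeviceCount()
  for (int dev_id = 0; dev_id < device_count; ++dev_id) {
    GPUPlace p{dev_id};
    allocators[p] = std::make_shared<Allocator>();  // init analog
    // PreAllocCUDAAllocator analog: look up the allocator, then let only the
    // entry for the active device actually reserve memory.
    auto it = allocators.find(p);
    if (GetCurrentDeviceId() == it->first.GetDeviceId()) {
      std::cout << "PreAlloc for dev_id=" << dev_id << ": ";
      it->second->PreAlloc();
    }
  }
  return 0;
}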

paddle/phi/core/memory/allocation/auto_growth_best_fit_allocator.cc

Lines changed: 60 additions & 36 deletions
@@ -44,20 +44,40 @@ PHI_DEFINE_EXPORTED_READONLY_bool(print_allocator_trace_info,
                                   "print trace memory info");

 PHI_DEFINE_EXPORTED_READONLY_bool(dump_chunk_info, false, "dump chunk info");
-PHI_DEFINE_EXPORTED_uint64(alignment_size, 256, "alignment_size");
-PHI_DEFINE_EXPORTED_uint64(small_pool_size_in_mb, 1, "small_pool_size_in_mb");
-PHI_DEFINE_EXPORTED_uint64(small_pool_auto_growth_chunk_size_in_mb,
-                           0,
-                           "small_pool_auto_growth_chunk_size_in_mb");
-PHI_DEFINE_EXPORTED_uint64(large_pool_auto_growth_chunk_size_in_mb,
-                           0,
-                           "large_pool_auto_growth_chunk_size_in_mb");
-PHI_DEFINE_EXPORTED_uint64(large_pool_pre_alloc_in_mb,
-                           0,
-                           "large_pool_pre_alloc_in_mb");
-PHI_DEFINE_EXPORTED_uint64(small_pool_pre_alloc_in_mb,
-                           0,
-                           "small_pool_pre_alloc_in_mb");
+PHI_DEFINE_EXPORTED_uint64(
+    alignment_size,
+    256,
+    "All sizes are rounded up to a multiple of this value. Default: 256.");
+PHI_DEFINE_EXPORTED_uint64(
+    small_pool_size_in_mb,
+    0,
+    "Threshold (MiB) separating the small and large pools. "
+    "0 disables the small pool and enables single-pool mode "
+    "(all requests go to the large pool). When > 0, requests "
+    "<= threshold use the small pool; larger requests use the "
+    "large pool. Default: 0.");
+PHI_DEFINE_EXPORTED_uint64(
+    small_pool_auto_growth_chunk_size_in_mb,
+    0,
+    "The minimal chunk size for the small pool in MiB. If > 0, this overrides "
+    "the constructor-provided global growth size "
+    "(FLAGS_auto_growth_chunk_size_in_mb). "
+    "If 0, falls back to the global growth size.");
+PHI_DEFINE_EXPORTED_uint64(
+    large_pool_auto_growth_chunk_size_in_mb,
+    0,
+    "The minimal chunk size for the large pool in MiB. If > 0, this overrides "
+    "the constructor-provided global growth size "
+    "(FLAGS_auto_growth_chunk_size_in_mb). "
+    "If 0, falls back to the global growth size.");
+PHI_DEFINE_EXPORTED_uint64(
+    large_pool_pre_alloc_in_mb,
+    0,
+    "Pre-reserve this many MiB in the large pool. 0 disables pre-allocation.");
+PHI_DEFINE_EXPORTED_uint64(
+    small_pool_pre_alloc_in_mb,
+    0,
+    "Pre-reserve this many MiB in the small pool. 0 disables pre-allocation.");

 namespace paddle::memory::allocation {
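For orientation, the new help text for small_pool_size_in_mb implies a simple routing rule for choosing a pool. A hedged sketch of that rule (route_to_small_pool is an illustrative name, not part of the patch):

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Illustrative routing rule implied by the small_pool_size_in_mb help text:
// 0 -> single-pool mode; otherwise requests <= threshold use the small pool.
bool route_to_small_pool(uint64_t small_pool_size_in_mb, size_t request_bytes) {
  if (small_pool_size_in_mb == 0) return false;  // small pool disabled
  return request_bytes <= (small_pool_size_in_mb << 20);  // MiB -> bytes
}

int main() {
  std::printf("%d\n", route_to_small_pool(1, 512u << 10));  // 1: 512 KiB <= 1 MiB
  std::printf("%d\n", route_to_small_pool(1, 4u << 20));    // 0: 4 MiB > 1 MiB
  std::printf("%d\n", route_to_small_pool(0, 1));           // 0: single-pool mode
}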

@@ -112,28 +132,34 @@ bool AutoGrowthBestFitAllocator::is_small_free_block(size_t size) {

 size_t AutoGrowthBestFitAllocator::auto_growth_size(bool is_small,
                                                     size_t chunk_size) {
-  size_t auto_growth_chunk_size = 0;
-  if (chunk_size > 0) {
-    auto_growth_chunk_size = chunk_size;
-  }
-
-  if (is_small) {
-    auto_growth_chunk_size = FLAGS_small_pool_auto_growth_chunk_size_in_mb
-                             << 20;
+  // Priority: pool-specific flag (>0) > constructor-provided chunk_size (>0) >
+  // member chunk_size_. Return value is aligned to alignment_ and at least
+  // alignment_.
+  const uint64_t pool_auto_growth_chunk_size_mb =
+      is_small ? FLAGS_small_pool_auto_growth_chunk_size_in_mb
+               : FLAGS_large_pool_auto_growth_chunk_size_in_mb;
+  const size_t pool_auto_growth_chunk_size_bytes =
+      pool_auto_growth_chunk_size_mb
+          ? (static_cast<size_t>(pool_auto_growth_chunk_size_mb) << 20)
+          : 0;
+
+  size_t auto_growth_size = 0;
+  if (pool_auto_growth_chunk_size_bytes) {
+    auto_growth_size = pool_auto_growth_chunk_size_bytes;  // 1) pool-specific
+                                                           //    flag (MB -> bytes)
+  } else if (chunk_size > 0) {
+    auto_growth_size = chunk_size;  // 2) value provided at construction (bytes)
   } else {
-    auto_growth_chunk_size = FLAGS_large_pool_auto_growth_chunk_size_in_mb
-                             << 20;
+    auto_growth_size =
+        chunk_size_;  // 3) member fallback (already aligned in constructor)
   }

-  if (FLAGS_dump_chunk_info) {
-    std::cout << "is_small = " << is_small
-              << "auto_growth_size = " << auto_growth_chunk_size << std::endl;
-  }
-  return auto_growth_chunk_size;
+  auto_growth_size = AlignedSize(auto_growth_size, alignment_);
+
+  return auto_growth_size;
 }

 void AutoGrowthBestFitAllocator::PreAlloc() {
-  VLOG(10) << "AutoGrowthBestFitAllocator start PreAlloc ";
   auto small_pool_pre_alloc = FLAGS_small_pool_pre_alloc_in_mb << 20;
   auto large_pool_pre_alloc = FLAGS_large_pool_pre_alloc_in_mb << 20;
   if (small_pool_pre_alloc > 0) {
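The rewritten function reduces to a three-way priority plus a final alignment pass. A self-contained sketch under that reading (AlignedSize here is a local stand-in for the allocator's helper, and the member chunk_size_ becomes an ordinary parameter):

#include <cstddef>
#include <cstdint>
#include <iostream>

// Local stand-in for the allocator's AlignedSize helper: round size up to the
// next multiple of alignment.
size_t AlignedSize(size_t size, size_t alignment) {
  return ((size + alignment - 1) / alignment) * alignment;
}

// Same priority chain as the diff: pool flag (MiB) > constructor chunk_size
// (bytes) > member chunk_size_ (bytes), then align the result.
size_t auto_growth_size(uint64_t pool_flag_mb, size_t chunk_size,
                        size_t member_chunk_size, size_t alignment) {
  size_t result;
  if (pool_flag_mb > 0) {
    result = static_cast<size_t>(pool_flag_mb) << 20;  // 1) pool-specific flag
  } else if (chunk_size > 0) {
    result = chunk_size;                               // 2) constructor value
  } else {
    result = member_chunk_size;                        // 3) member fallback
  }
  return AlignedSize(result, alignment);
}

int main() {
  // A 2 MiB pool flag overrides a 1 MiB constructor value.
  std::cout << auto_growth_size(2, 1 << 20, 0, 256) << "\n";  // 2097152
  // No flag: the constructor value wins and is rounded up to the alignment.
  std::cout << auto_growth_size(0, 1000, 0, 256) << "\n";     // 1024
}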
@@ -144,7 +170,8 @@ void AutoGrowthBestFitAllocator::PreAlloc() {
     auto *chunk = &(*chunks_.rbegin());
     uint8_t *p = reinterpret_cast<uint8_t *>(chunk->allocation_->ptr());
     auto &blocks = chunk->blocks_;
-    blocks.emplace_back(p, small_pool_pre_alloc, true, true, chunk);
+    blocks.emplace_back(
+        p, small_pool_pre_alloc, /*is_free=*/true, /*is_small=*/true, chunk);
     small_free_blocks_.emplace(std::make_pair(small_pool_pre_alloc, p),
                                --(blocks.end()));
   }
@@ -157,7 +184,8 @@ void AutoGrowthBestFitAllocator::PreAlloc() {
     auto *chunk = &(*chunks_.rbegin());
     uint8_t *p = reinterpret_cast<uint8_t *>(chunk->allocation_->ptr());
     auto &blocks = chunk->blocks_;
-    blocks.emplace_back(p, large_pool_pre_alloc, true, true, chunk);
+    blocks.emplace_back(
+        p, large_pool_pre_alloc, /*is_free=*/true, /*is_small=*/false, chunk);
     large_free_blocks_.emplace(std::make_pair(large_pool_pre_alloc, p),
                                --(blocks.end()));
   }
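Note that the second emplace_back does more than add parameter comments: the old call passed true for the is_small argument in the large-pool branch as well, so the new /*is_small=*/false also changes how large-pool pre-allocated blocks are tagged.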
@@ -252,10 +280,6 @@ phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl(
   total_alloc_size_ += size;
   VLOG(10) << "Alloc " << block_it->size_ << " bytes, ptr = " << block_it->ptr_;
   auto block_t = new BlockAllocation(block_it);
-  if (FLAGS_dump_chunk_info) {
-    DumpInfo();
-  }
-  Trace();
   return block_t;
 }

paddle/phi/core/memory/allocation/stream_safe_cuda_allocator.cc

Lines changed: 0 additions & 5 deletions
@@ -193,11 +193,6 @@ void StreamSafeCUDAAllocator::SetDefaultStream(gpuStream_t stream) {
   default_stream_ = stream;
 }

-void StreamSafeCUDAAllocator::PreAlloc() {
-  std::lock_guard<SpinLock> lock_guard(unfreed_allocation_lock_);
-  underlying_allocator_->PreAlloc();
-}
-
 phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
   phi::RecordEvent record("StreamSafeCUDAAllocator::Allocate",
                           phi::TracerEventType::UserDefined,

paddle/phi/core/memory/allocation/stream_safe_cuda_allocator.h

Lines changed: 0 additions & 1 deletion
@@ -72,7 +72,6 @@ class StreamSafeCUDAAllocator
   bool IsAllocThreadSafe() const override;
   gpuStream_t GetDefaultStream() const;
   void SetDefaultStream(gpuStream_t stream);
-  void PreAlloc() override;

 protected:
   phi::Allocation *AllocateImpl(size_t size) override;
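Together with the facade change above, these two deletions retire the PreAlloc forwarding path: the facade now calls PreAlloc directly on the underlying AutoGrowthBestFitAllocator before it is wrapped, so StreamSafeCUDAAllocator no longer needs the override or its spin-lock guard.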
