Skip to content

Commit 40ea050

Browse files
qxy11 authored and meta-codesync[bot] committed
Option to exclude UVM weights from core dump (#5784)
Summary: Pull Request resolved: #5784 X-link: https://github.com/facebookresearch/FBGEMM/pull/2714 Add custom allocator/deallocator functions to host-mapped tensor allocators. Reviewed By: q10 Differential Revision: D102202275 fbshipit-source-id: cfaa02bb24d41ec64333c2c6f74ee9e42f4bbc53
1 parent 5eae523 commit 40ea050

3 files changed

Lines changed: 124 additions & 7 deletions

File tree

fbgemm_gpu/include/fbgemm_gpu/cumem_utils.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,23 @@ Tensor new_host_mapped_tensor(
5555
const Tensor& self,
5656
const std::vector<std::int64_t>& sizes);
5757

58+
/// @ingroup cumem-utils
///
/// Allocate the `at::Tensor` with host-mapped memory using custom allocation
59+
/// hooks. The hooks replace the malloc/free pair used for the host buffer.
60+
///
61+
/// @param self The input tensor
62+
/// @param sizes The target tensor dimensions
63+
/// @param alloc_fn Custom allocator, called with the buffer size in bytes.
/// Must be provided together with
64+
/// `dealloc_fn`. If both are nullptr, malloc/free is used.
65+
/// @param dealloc_fn Custom deallocator, called with the pointer obtained
/// from the allocator when the buffer is freed. Must be provided together with
66+
/// `alloc_fn`. If both are nullptr, malloc/free is used.
67+
///
68+
/// @return A new tensor backed by host-mapped memory
///
/// @throw c10::Error if exactly one of `alloc_fn` / `dealloc_fn` is nullptr
69+
Tensor new_host_mapped_tensor_with_allocator(
70+
const Tensor& self,
71+
const std::vector<std::int64_t>& sizes,
72+
void* (*alloc_fn)(size_t),
73+
void (*dealloc_fn)(void*));
74+
5875
/// @ingroup cumem-utils
5976
///
6077
/// Allocate the `at::Tensor` with either unified managed memory (UVM) or

fbgemm_gpu/src/memory_utils/memory_utils.cu

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,20 @@ namespace {
2929
struct CUDAHostMappedContext {
3030
void* ptr_;
3131
int cuda_device_;
32+
void (*dealloc_fn_)(void*);
3233

33-
CUDAHostMappedContext(void* ptr, int cuda_device)
34-
: ptr_(ptr), cuda_device_(cuda_device) {};
34+
CUDAHostMappedContext(
35+
void* ptr,
36+
int cuda_device,
37+
void (*dealloc_fn)(void*) = nullptr)
38+
: ptr_(ptr),
39+
cuda_device_(cuda_device),
40+
dealloc_fn_(dealloc_fn ? dealloc_fn : &free) {}
3541

3642
~CUDAHostMappedContext() {
3743
at::cuda::OptionalCUDAGuard device_guard(cuda_device_);
3844
AT_CUDA_CHECK(cudaHostUnregister(ptr_));
39-
free(ptr_);
45+
dealloc_fn_(ptr_);
4046
}
4147

4248
static void release(void* ptr) {
@@ -247,10 +253,16 @@ Tensor new_vanilla_managed_tensor(
247253
return new_managed_tensor_internal(self, sizes);
248254
}
249255

250-
Tensor new_host_mapped_tensor(
256+
Tensor new_host_mapped_tensor_with_allocator(
251257
const Tensor& self,
252-
const std::vector<std::int64_t>& sizes) {
258+
const std::vector<std::int64_t>& sizes,
259+
void* (*alloc_fn)(size_t),
260+
void (*dealloc_fn)(void*)) {
253261
CUDA_DEVICE_GUARD(self);
262+
TORCH_CHECK(
263+
(alloc_fn == nullptr) == (dealloc_fn == nullptr),
264+
"new_host_mapped_tensor_with_allocator requires alloc_fn and dealloc_fn to both be "
265+
"null or both be non-null");
254266

255267
auto strides = defaultStrides(sizes);
256268
size_t size_bytes =
@@ -262,7 +274,7 @@ Tensor new_host_mapped_tensor(
262274
// of using this cuda API, we can do regular malloc, pre-fault the pages, and
263275
// then do cudaHostRegister with GPU mapping flags to lock the pages, so we
264276
// can minimize the cost while holding this global lock.
265-
void* const ptr = malloc(size_bytes);
277+
void* const ptr = alloc_fn ? alloc_fn(size_bytes) : malloc(size_bytes);
266278

267279
// Pre-fault/map the pages by setting the first byte of the page
268280
// TODO: parallelize the mapping of pages with a threadpool executor
@@ -283,7 +295,7 @@ Tensor new_host_mapped_tensor(
283295
size_bytes,
284296
at::DataPtr(
285297
dev_ptr,
286-
new CUDAHostMappedContext(ptr, self.get_device()),
298+
new CUDAHostMappedContext(ptr, self.get_device(), dealloc_fn),
287299
&CUDAHostMappedContext::release,
288300
{at::DeviceType::CUDA, self.device().index()}),
289301
nullptr, /* allocator */
@@ -292,6 +304,12 @@ Tensor new_host_mapped_tensor(
292304
.set_(std::move(storage), 0, sizes, strides);
293305
}
294306

307+
// Allocate a host-mapped tensor without custom allocation hooks.
//
// Thin convenience wrapper: forwards to
// new_host_mapped_tensor_with_allocator with both hooks null, which selects
// the malloc/free path there.
Tensor new_host_mapped_tensor(
    const Tensor& self,
    const std::vector<std::int64_t>& sizes) {
  void* (*const no_alloc_hook)(size_t) = nullptr;
  void (*const no_dealloc_hook)(void*) = nullptr;
  return new_host_mapped_tensor_with_allocator(
      self, sizes, no_alloc_hook, no_dealloc_hook);
}
312+
295313
Tensor new_unified_tensor(
296314
const Tensor& self,
297315
const std::vector<std::int64_t>& sizes,
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <gtest/gtest.h>
10+
11+
#include <atomic>
12+
#include <cstdlib>
13+
14+
#include "fbgemm_gpu/cumem_utils.h"
15+
16+
using namespace ::testing;
17+
18+
namespace {

// Number of successful trackingAllocate() calls since the last reset.
std::atomic<int> g_alloc_count{0};
// Number of trackingDeallocate() calls since the last reset.
std::atomic<int> g_dealloc_count{0};

// Allocation hook handed to new_host_mapped_tensor_with_allocator.
// Forwards to std::malloc and records the call. Only successful allocations
// are counted, so the counter never reports a buffer that does not exist.
void* trackingAllocate(std::size_t size) {
  void* const ptr = std::malloc(size); // NOLINT(cppcoreguidelines-no-malloc)
  if (ptr != nullptr) {
    g_alloc_count.fetch_add(1);
  }
  return ptr;
}

// Deallocation hook paired with trackingAllocate.
// Records the call, then releases the buffer with std::free.
void trackingDeallocate(void* ptr) {
  g_dealloc_count.fetch_add(1);
  std::free(ptr); // NOLINT(cppcoreguidelines-no-malloc)
}

// Zeroes both counters so each test starts from a known state.
void resetCounters() {
  g_alloc_count.store(0);
  g_dealloc_count.store(0);
}

} // namespace
40+
41+
// Verifies that the caller-supplied hooks are the ones actually used: the
// allocator runs once during construction, the deallocator does not run while
// the tensor is alive, and it runs once when the tensor goes out of scope.
TEST(HostMappedTensorTest, CustomAllocatorIsUsed) {
  resetCounters();

  auto self = at::empty({0}, at::device(at::kCUDA).dtype(at::kByte));
  const std::vector<int64_t> sizes = {1024};

  {
    auto tensor = fbgemm_gpu::new_host_mapped_tensor_with_allocator(
        self, sizes, &trackingAllocate, &trackingDeallocate);

    EXPECT_EQ(g_alloc_count.load(), 1)
        << "Custom allocator should have been called exactly once";
    // Guards against the buffer being released while the tensor still owns it.
    EXPECT_EQ(g_dealloc_count.load(), 0)
        << "Custom deallocator should not run while the tensor is alive";
    EXPECT_EQ(tensor.numel(), 1024);
  }

  EXPECT_EQ(g_dealloc_count.load(), 1)
      << "Custom deallocator should have been called on tensor destruction";
}
59+
60+
// With both hooks null the allocator entry point must still produce a tensor
// of the requested size (the malloc/free fallback path).
TEST(HostMappedTensorTest, DefaultAllocatorWhenNull) {
  constexpr int64_t kNumElements = 512;

  const auto prototype =
      at::empty({0}, at::device(at::kCUDA).dtype(at::kByte));
  const std::vector<int64_t> dims = {kNumElements};

  const auto result = fbgemm_gpu::new_host_mapped_tensor_with_allocator(
      prototype, dims, nullptr, nullptr);

  EXPECT_EQ(result.numel(), kNumElements);
}
69+
70+
// Supplying only one of the two hooks must be rejected with c10::Error, and
// the rejection must happen before either hook is invoked: an allocation made
// with no matching deallocator would leak.
TEST(HostMappedTensorTest, RejectsMismatchedAllocatorPair) {
  resetCounters();

  auto self = at::empty({0}, at::device(at::kCUDA).dtype(at::kByte));
  const std::vector<int64_t> sizes = {512};

  // Allocator without deallocator.
  EXPECT_THROW(
      fbgemm_gpu::new_host_mapped_tensor_with_allocator(
          self, sizes, &trackingAllocate, nullptr),
      c10::Error);
  // Deallocator without allocator.
  EXPECT_THROW(
      fbgemm_gpu::new_host_mapped_tensor_with_allocator(
          self, sizes, nullptr, &trackingDeallocate),
      c10::Error);

  EXPECT_EQ(g_alloc_count.load(), 0)
      << "Rejected calls must not invoke the custom allocator";
  EXPECT_EQ(g_dealloc_count.load(), 0)
      << "Rejected calls must not invoke the custom deallocator";
}

0 commit comments

Comments
 (0)