Option to exclude UVM weights from core dump (#5784)

qxy11 · meta-codesync[bot] · commit 40ea050b0a14 · 2026-05-27T21:44:06.000-07:00
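Summary: Pull Request resolved: #5784 X-link: https://github.com/facebookresearch/FBGEMM/pull/2714 Add `new_host_mapped_tensor_with_allocator`, which lets callers supply custom allocator/deallocator function pointers for host-mapped tensors (malloc/free is used when both are null); `new_host_mapped_tensor` now delegates to it. Reviewed By: q10 Differential Revision: D102202275 fbshipit-source-id: cfaa02bb24d41ec64333c2c6f74ee9e42f4bbc53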
diff --git a/fbgemm_gpu/include/fbgemm_gpu/cumem_utils.h b/fbgemm_gpu/include/fbgemm_gpu/cumem_utils.h
@@ -55,6 +55,23 @@ Tensor new_host_mapped_tensor(
     const Tensor& self,
     const std::vector<std::int64_t>& sizes);
 
+/// Allocate the `at::Tensor` with host-mapped memory using custom allocation
+/// hooks.
+///
+/// @param self The input tensor
+/// @param sizes The target tensor dimensions
+/// @param alloc_fn Custom allocator, called with the size in bytes. Must be
+///                 null or non-null together with `dealloc_fn` (else throws).
+/// @param dealloc_fn Custom deallocator for the pointer from `alloc_fn`.
+///                   If both are nullptr, malloc/free is used.
+///
+/// @return A new tensor backed by host-mapped memory
+Tensor new_host_mapped_tensor_with_allocator(
+    const Tensor& self,
+    const std::vector<std::int64_t>& sizes,
+    void* (*alloc_fn)(size_t),
+    void (*dealloc_fn)(void*));
+
 /// @ingroup cumem-utils
 ///
 /// Allocate the `at::Tensor` with either unified managed memory (UVM) or
diff --git a/fbgemm_gpu/src/memory_utils/memory_utils.cu b/fbgemm_gpu/src/memory_utils/memory_utils.cu
@@ -29,14 +29,20 @@ namespace {
 struct CUDAHostMappedContext {
   void* ptr_;
   int cuda_device_;
+  void (*dealloc_fn_)(void*);
 
-  CUDAHostMappedContext(void* ptr, int cuda_device)
-      : ptr_(ptr), cuda_device_(cuda_device) {};
+  CUDAHostMappedContext(
+      void* ptr,
+      int cuda_device,
+      void (*dealloc_fn)(void*) = nullptr)
+      : ptr_(ptr),
+        cuda_device_(cuda_device),
+        dealloc_fn_(dealloc_fn ? dealloc_fn : &free) {}
 
   ~CUDAHostMappedContext() {
     at::cuda::OptionalCUDAGuard device_guard(cuda_device_);
     AT_CUDA_CHECK(cudaHostUnregister(ptr_));
-    free(ptr_);
+    dealloc_fn_(ptr_);
   }
 
   static void release(void* ptr) {
@@ -247,10 +253,16 @@ Tensor new_vanilla_managed_tensor(
   return new_managed_tensor_internal(self, sizes);
 }
 
-Tensor new_host_mapped_tensor(
+Tensor new_host_mapped_tensor_with_allocator(
     const Tensor& self,
-    const std::vector<std::int64_t>& sizes) {
+    const std::vector<std::int64_t>& sizes,
+    void* (*alloc_fn)(size_t),
+    void (*dealloc_fn)(void*)) {
   CUDA_DEVICE_GUARD(self);
+  TORCH_CHECK(
+      (alloc_fn == nullptr) == (dealloc_fn == nullptr),
+      "new_host_mapped_tensor_with_allocator requires alloc_fn and dealloc_fn to both be "
+      "null or both be non-null");
 
   auto strides = defaultStrides(sizes);
   size_t size_bytes =
@@ -262,7 +274,7 @@ Tensor new_host_mapped_tensor(
   // of using this cuda API, we can do regular malloc, pre-fault the pages, and
   // then do cudaHostRegister with GPU mapping flags to lock the pages, so we
   // can minimize the cost while holding this global lock.
-  void* const ptr = malloc(size_bytes);
+  void* const ptr = alloc_fn ? alloc_fn(size_bytes) : malloc(size_bytes);
 
   // Pre-fault/map the pages by setting the first byte of the page
   // TODO: parallelize the mapping of pages with a threadpool executor
@@ -283,7 +295,7 @@ Tensor new_host_mapped_tensor(
       size_bytes,
       at::DataPtr(
           dev_ptr,
-          new CUDAHostMappedContext(ptr, self.get_device()),
+          new CUDAHostMappedContext(ptr, self.get_device(), dealloc_fn),
           &CUDAHostMappedContext::release,
           {at::DeviceType::CUDA, self.device().index()}),
       nullptr, /* allocator */
@@ -292,6 +304,12 @@ Tensor new_host_mapped_tensor(
       .set_(std::move(storage), 0, sizes, strides);
 }
 
+Tensor new_host_mapped_tensor(
+    const Tensor& self,
+    const std::vector<std::int64_t>& sizes) {
+  return new_host_mapped_tensor_with_allocator(self, sizes, nullptr, nullptr);
+}
+
 Tensor new_unified_tensor(
     const Tensor& self,
     const std::vector<std::int64_t>& sizes,
diff --git a/fbgemm_gpu/test/tbe/cache/host_mapped_tensor_test.cpp b/fbgemm_gpu/test/tbe/cache/host_mapped_tensor_test.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstdlib>
+
+#include "fbgemm_gpu/cumem_utils.h"
+
+using namespace ::testing;
+
+namespace {
+
+std::atomic<int> g_alloc_count{0};
+std::atomic<int> g_dealloc_count{0};
+
+void* trackingAllocate(size_t size) {
+  void* ptr = std::malloc(size); // NOLINT(cppcoreguidelines-no-malloc)
+  g_alloc_count.fetch_add(1);
+  return ptr;
+}
+
+void trackingDeallocate(void* ptr) {
+  g_dealloc_count.fetch_add(1);
+  std::free(ptr); // NOLINT(cppcoreguidelines-no-malloc)
+}
+
+void resetCounters() {
+  g_alloc_count.store(0);
+  g_dealloc_count.store(0);
+}
+
+} // namespace
+
+TEST(HostMappedTensorTest, CustomAllocatorIsUsed) {
+  resetCounters();
+
+  auto self = at::empty({0}, at::device(at::kCUDA).dtype(at::kByte));
+  std::vector<int64_t> sizes = {1024};
+
+  {
+    auto tensor = fbgemm_gpu::new_host_mapped_tensor_with_allocator(
+        self, sizes, &trackingAllocate, &trackingDeallocate);
+
+    EXPECT_EQ(g_alloc_count.load(), 1)
+        << "Custom allocator should have been called exactly once";
+    EXPECT_EQ(tensor.numel(), 1024);
+  }
+
+  EXPECT_EQ(g_dealloc_count.load(), 1)
+      << "Custom deallocator should have been called on tensor destruction";
+}
+
+TEST(HostMappedTensorTest, DefaultAllocatorWhenNull) {
+  auto self = at::empty({0}, at::device(at::kCUDA).dtype(at::kByte));
+  std::vector<int64_t> sizes = {512};
+
+  auto tensor = fbgemm_gpu::new_host_mapped_tensor_with_allocator(
+      self, sizes, nullptr, nullptr);
+
+  EXPECT_EQ(tensor.numel(), 512);
+}
+
+TEST(HostMappedTensorTest, RejectsMismatchedAllocatorPair) {
+  auto self = at::empty({0}, at::device(at::kCUDA).dtype(at::kByte));
+  std::vector<int64_t> sizes = {512};
+
+  EXPECT_THROW(
+      fbgemm_gpu::new_host_mapped_tensor_with_allocator(
+          self, sizes, &trackingAllocate, nullptr),
+      c10::Error);
+  EXPECT_THROW(
+      fbgemm_gpu::new_host_mapped_tensor_with_allocator(
+          self, sizes, nullptr, &trackingDeallocate),
+      c10::Error);
+}