Skip to content

Commit e99ed6e

Browse files
committed
Add experimental cuda_async_pinned_memory_resource
Adds a new cuda_async_pinned_memory_resource that provides stream-ordered pinned (page-locked) host memory allocation using CUDA 13.0's cudaMemGetDefaultMemPool API with cudaMemAllocationTypePinned. This parallels the cuda_async_managed_memory_resource added in rapidsai#2056 and addresses part of rapidsai#2054.

Key features:
- Uses the default pinned memory pool for stream-ordered allocation
- Accessible from both host and device
- Requires CUDA 13.0+ (matches the managed version for consistency)

Implementation includes:
- C++ header and implementation in cuda_async_pinned_memory_resource.hpp
- Runtime capability check in runtime_capabilities.hpp
- C++ tests in cuda_async_pinned_mr_tests.cpp
- Python bindings in the experimental module
- Python tests in test_cuda_async_pinned_memory_resource.py
1 parent d0b50ed commit e99ed6e

10 files changed

Lines changed: 419 additions & 1 deletion

File tree

cpp/include/rmm/detail/runtime_capabilities.hpp

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,11 @@ namespace detail {
2525
*/
2626
#define RMM_MIN_ASYNC_MANAGED_ALLOC_CUDA_VERSION 13000
2727

28+
/**
29+
* @brief Minimum CUDA driver version for stream-ordered pinned memory allocator support
30+
*/
31+
#define RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION 13000
32+
2833
/**
2934
* @brief Determine at runtime if the CUDA driver supports the stream-ordered
3035
* memory allocator functions.
@@ -146,5 +151,31 @@ struct runtime_async_managed_alloc {
146151
}
147152
};
148153

154+
/**
155+
* @brief Determine at runtime if the CUDA driver/runtime supports the stream-ordered
156+
* pinned memory allocator functions.
157+
*
158+
* Stream-ordered pinned memory pools were introduced in CUDA 12.6 but our
159+
* implementation requires features from CUDA 13.0 or higher.
160+
*/
161+
struct runtime_async_pinned_alloc {
162+
static bool is_supported()
163+
{
164+
static auto supports_async_pinned_pool{[] {
165+
// Basic pool support required
166+
if (not runtime_async_alloc::is_supported()) { return false; }
167+
// CUDA 13.0 or higher is required for async pinned memory pools
168+
int cuda_driver_version{};
169+
auto driver_result = cudaDriverGetVersion(&cuda_driver_version);
170+
int cuda_runtime_version{};
171+
auto runtime_result = cudaRuntimeGetVersion(&cuda_runtime_version);
172+
return driver_result == cudaSuccess and runtime_result == cudaSuccess and
173+
cuda_driver_version >= RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION and
174+
cuda_runtime_version >= RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION;
175+
}()};
176+
return supports_async_pinned_pool;
177+
}
178+
};
179+
149180
} // namespace detail
150181
} // namespace RMM_NAMESPACE
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
#pragma once
6+
7+
#include <rmm/cuda_device.hpp>
8+
#include <rmm/cuda_stream_view.hpp>
9+
#include <rmm/detail/error.hpp>
10+
#include <rmm/detail/export.hpp>
11+
#include <rmm/detail/runtime_capabilities.hpp>
12+
#include <rmm/detail/thrust_namespace.h>
13+
#include <rmm/mr/cuda_async_view_memory_resource.hpp>
14+
#include <rmm/mr/device_memory_resource.hpp>
15+
16+
#include <cuda/std/type_traits>
17+
#include <cuda_runtime_api.h>
18+
19+
#include <cstddef>
20+
#include <cstdint>
21+
#include <optional>
22+
23+
namespace RMM_NAMESPACE {
24+
namespace mr {
25+
/**
26+
* @addtogroup memory_resources
27+
* @{
28+
* @file
29+
*/
30+
31+
/**
32+
* @brief `device_memory_resource` derived class that uses
33+
* `cudaMallocFromPoolAsync`/`cudaFreeFromPoolAsync` with a pinned memory pool
34+
* for allocation/deallocation.
35+
*/
36+
class cuda_async_pinned_memory_resource final : public device_memory_resource {
37+
public:
38+
/**
39+
* @brief Constructs a cuda_async_pinned_memory_resource with the default pinned memory pool for
40+
* the current device.
41+
*
42+
* The default pinned memory pool is the pool that is created when the device is created.
43+
* Pool properties such as the release threshold are not modified.
44+
*
45+
* @throws rmm::logic_error if the CUDA version does not support `cudaMallocFromPoolAsync` with
46+
* pinned memory pool
47+
*/
48+
cuda_async_pinned_memory_resource()
49+
{
50+
// Check if pinned memory pools are supported
51+
RMM_EXPECTS(rmm::detail::runtime_async_pinned_alloc::is_supported(),
52+
"cuda_async_pinned_memory_resource requires CUDA 13.0 or higher");
53+
54+
#if defined(CUDA_VERSION) && CUDA_VERSION >= RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION
55+
cudaMemPool_t pinned_pool_handle{};
56+
cudaMemLocation location{.type = cudaMemLocationTypeDevice,
57+
.id = rmm::get_current_cuda_device().value()};
58+
RMM_CUDA_TRY(
59+
cudaMemGetDefaultMemPool(&pinned_pool_handle, &location, cudaMemAllocationTypePinned));
60+
pool_ = cuda_async_view_memory_resource{pinned_pool_handle};
61+
#endif
62+
}
63+
64+
/**
65+
* @brief Returns the underlying native handle to the CUDA pool
66+
*
67+
* @return cudaMemPool_t Handle to the underlying CUDA pool
68+
*/
69+
[[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return pool_.pool_handle(); }
70+
71+
~cuda_async_pinned_memory_resource() override {}
72+
cuda_async_pinned_memory_resource(cuda_async_pinned_memory_resource const&) = delete;
73+
cuda_async_pinned_memory_resource(cuda_async_pinned_memory_resource&&) = delete;
74+
cuda_async_pinned_memory_resource& operator=(cuda_async_pinned_memory_resource const&) = delete;
75+
cuda_async_pinned_memory_resource& operator=(cuda_async_pinned_memory_resource&&) = delete;
76+
77+
private:
78+
cuda_async_view_memory_resource pool_{};
79+
80+
/**
81+
* @brief Allocates memory of size at least \p bytes.
82+
*
83+
* The returned pointer will have at minimum 256 byte alignment.
84+
*
85+
* @param bytes The size of the allocation
86+
* @param stream Stream on which to perform allocation
87+
* @return void* Pointer to the newly allocated memory
88+
*/
89+
void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override
90+
{
91+
return pool_.allocate(stream, bytes);
92+
}
93+
94+
/**
95+
* @brief Deallocate memory pointed to by \p p.
96+
*
97+
* @param ptr Pointer to be deallocated
98+
* @param bytes The size in bytes of the allocation. This must be equal to the
99+
* value of `bytes` that was passed to the `allocate` call that returned `p`.
100+
* @param stream Stream on which to perform deallocation
101+
*/
102+
void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) noexcept override
103+
{
104+
pool_.deallocate(stream, ptr, bytes);
105+
}
106+
107+
/**
108+
* @brief Compare this resource to another.
109+
*
110+
* @param other The other resource to compare to
111+
* @return true If the two resources are equivalent
112+
* @return false If the two resources are not equal
113+
*/
114+
[[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override
115+
{
116+
auto const* async_mr = dynamic_cast<cuda_async_pinned_memory_resource const*>(&other);
117+
return (async_mr != nullptr) && (this->pool_handle() == async_mr->pool_handle());
118+
}
119+
120+
friend auto get_property(cuda_async_pinned_memory_resource const&,
121+
cuda::mr::device_accessible) noexcept
122+
{
123+
return cuda::mr::device_accessible{};
124+
}
125+
friend auto get_property(cuda_async_pinned_memory_resource const&,
126+
cuda::mr::host_accessible) noexcept
127+
{
128+
return cuda::mr::host_accessible{};
129+
}
130+
};
131+
132+
// static property checks: compile-time verification that the resource models the
// (polyfilled) cuda::mr resource concepts and advertises both host and device
// accessibility, matching the get_property friends above.
static_assert(rmm::detail::polyfill::resource<cuda_async_pinned_memory_resource>);
static_assert(rmm::detail::polyfill::async_resource<cuda_async_pinned_memory_resource>);
static_assert(rmm::detail::polyfill::resource_with<cuda_async_pinned_memory_resource,
                                                   cuda::mr::host_accessible,
                                                   cuda::mr::device_accessible>);
static_assert(rmm::detail::polyfill::async_resource_with<cuda_async_pinned_memory_resource,
                                                         cuda::mr::host_accessible,
                                                         cuda::mr::device_accessible>);
141+
/** @} */ // end of group
142+
} // namespace mr
143+
} // namespace RMM_NAMESPACE
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
#include <rmm/detail/error.hpp>
7+
#include <rmm/detail/runtime_capabilities.hpp>
8+
#include <rmm/mr/cuda_async_pinned_memory_resource.hpp>
9+
10+
#include <cuda_runtime_api.h>
11+
12+
#include <gtest/gtest.h>
13+
14+
namespace rmm::test {
15+
namespace {
16+
17+
using cuda_async_pinned_mr = rmm::mr::cuda_async_pinned_memory_resource;
18+
19+
// Fixture for cuda_async_pinned_memory_resource tests. Skips every test in the
// suite when the driver/runtime lacks pinned-pool support, so the tests can run
// unmodified on older CUDA installations.
class AsyncPinnedMRTest : public ::testing::Test {
 protected:
  void SetUp() override
  {
    // runtime_async_pinned_alloc checks both driver and runtime versions (>= 13.0)
    // plus basic stream-ordered allocation support.
    if (!rmm::detail::runtime_async_pinned_alloc::is_supported()) {
      GTEST_SKIP() << "Skipping tests because cuda_async_pinned_memory_resource "
                   << "requires CUDA 13.0 or higher and memory pool support.";
    }
  }
};
29+
30+
// Smoke test: a synchronous allocate must yield a non-null pointer that can be freed.
TEST_F(AsyncPinnedMRTest, BasicAllocateDeallocate)
{
  const auto num_bytes{100};
  cuda_async_pinned_mr resource{};
  void* allocation{resource.allocate_sync(num_bytes)};
  ASSERT_NE(nullptr, allocation);
  resource.deallocate_sync(allocation, num_bytes);
}
38+
39+
// Two default-constructed instances wrap the same default pinned pool, so their
// pool handles match and is_equal must report equivalence.
TEST_F(AsyncPinnedMRTest, EqualityWithSamePool)
{
  cuda_async_pinned_mr first{};
  cuda_async_pinned_mr second{};
  EXPECT_TRUE(first.is_equal(second));
}
47+
48+
// Pinned allocations are page-locked *host* memory: the CPU must be able to write
// and read them directly, with every element round-tripping intact.
TEST_F(AsyncPinnedMRTest, AllocatedPointerIsAccessibleFromHost)
{
  constexpr int num_elements{100};
  const auto alloc_size{sizeof(int) * num_elements};
  cuda_async_pinned_mr mr{};
  auto* ptr = static_cast<int*>(mr.allocate_sync(alloc_size));
  ASSERT_NE(nullptr, ptr);

  // Write from host. (No EXPECT_NO_THROW: raw pointer stores cannot throw C++
  // exceptions — an invalid pointer would fault, not throw — so wrapping them
  // in EXPECT_NO_THROW only obscured the intent.)
  for (int i = 0; i < num_elements; ++i) {
    ptr[i] = i;
  }

  // Verify every element reads back, not just a few spot checks.
  for (int i = 0; i < num_elements; ++i) {
    EXPECT_EQ(ptr[i], i);
  }

  mr.deallocate_sync(ptr, alloc_size);
}
70+
71+
// Several live allocations of differing sizes must each be independently
// host-accessible without interfering with one another.
TEST_F(AsyncPinnedMRTest, MultipleAllocationsAccessible)
{
  const auto alloc_size{512};
  cuda_async_pinned_mr resource{};

  // Same allocation order and sizes as a single-buffer test would not exercise:
  // base, double, and half sized buffers held simultaneously.
  auto* first  = static_cast<char*>(resource.allocate_sync(alloc_size));
  auto* second = static_cast<char*>(resource.allocate_sync(alloc_size * 2));
  auto* third  = static_cast<char*>(resource.allocate_sync(alloc_size / 2));

  ASSERT_NE(nullptr, first);
  ASSERT_NE(nullptr, second);
  ASSERT_NE(nullptr, third);

  // Each buffer must accept a host-side store...
  EXPECT_NO_THROW({
    first[0]  = 'a';
    second[0] = 'b';
    third[0]  = 'c';
  });

  // ...and read the same value back.
  EXPECT_EQ(first[0], 'a');
  EXPECT_EQ(second[0], 'b');
  EXPECT_EQ(third[0], 'c');

  resource.deallocate_sync(first, alloc_size);
  resource.deallocate_sync(second, alloc_size * 2);
  resource.deallocate_sync(third, alloc_size / 2);
}
103+
104+
// The resource must expose a non-null handle to its underlying CUDA memory pool.
TEST_F(AsyncPinnedMRTest, PoolHandleIsValid)
{
  cuda_async_pinned_mr resource{};
  EXPECT_NE(resource.pool_handle(), nullptr);
}
110+
111+
} // namespace
112+
} // namespace rmm::test

python/rmm/rmm/librmm/memory_resource.pxd

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,13 @@ cdef extern from "rmm/mr/cuda_async_managed_memory_resource.hpp" \
134134
cuda_async_managed_memory_resource() except +
135135
cudaMemPool_t pool_handle() const
136136

137+
# Declaration of the experimental C++ pinned-pool resource so Cython code can
# construct it and query its pool handle. Requires CUDA 13.0+ at runtime
# (construction throws, surfaced via `except +`).
cdef extern from "rmm/mr/cuda_async_pinned_memory_resource.hpp" \
        namespace "rmm::mr" nogil:

    cdef cppclass cuda_async_pinned_memory_resource(device_memory_resource):
        cuda_async_pinned_memory_resource() except +
        cudaMemPool_t pool_handle() const
143+
137144
cdef extern from "rmm/mr/cuda_async_memory_resource.hpp" \
138145
namespace \
139146
"rmm::mr::cuda_async_memory_resource" \

python/rmm/rmm/mr/experimental.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@
55

66
# Public re-exports of the experimental memory resources; the implementations
# live in the Cython module ``rmm.pylibrmm.memory_resource.experimental``.
from rmm.pylibrmm.memory_resource.experimental import (
    CudaAsyncManagedMemoryResource,
    CudaAsyncPinnedMemoryResource,
)

__all__ = [
    "CudaAsyncManagedMemoryResource",
    "CudaAsyncPinnedMemoryResource",
]

python/rmm/rmm/pylibrmm/memory_resource/experimental.pxd

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,6 @@ from rmm.pylibrmm.memory_resource._memory_resource cimport DeviceMemoryResource
77

88
cdef class CudaAsyncManagedMemoryResource(DeviceMemoryResource):
99
pass
10+
11+
cdef class CudaAsyncPinnedMemoryResource(DeviceMemoryResource):
    # Declaration only; the constructor and pool_handle() live in experimental.pyx.
    pass

python/rmm/rmm/pylibrmm/memory_resource/experimental.pyi

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,7 @@ from rmm.pylibrmm.memory_resource._memory_resource import DeviceMemoryResource
66
class CudaAsyncManagedMemoryResource(DeviceMemoryResource):
77
def __init__(self) -> None: ...
88
def pool_handle(self) -> int: ...
9+
10+
class CudaAsyncPinnedMemoryResource(DeviceMemoryResource):
    # Stream-ordered pinned (page-locked host) memory resource; requires CUDA 13.0+.
    def __init__(self) -> None: ...
    # Returns the underlying cudaMemPool_t handle as an integer address.
    def pool_handle(self) -> int: ...

python/rmm/rmm/pylibrmm/memory_resource/experimental.pyx

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,10 @@
55

66
from libc.stdint cimport uintptr_t
77

8-
from rmm.librmm.memory_resource cimport cuda_async_managed_memory_resource
8+
from rmm.librmm.memory_resource cimport (
9+
cuda_async_managed_memory_resource,
10+
cuda_async_pinned_memory_resource,
11+
)
912
# import from the private _memory_resource to avoid a circular import
1013
from rmm.pylibrmm.memory_resource._memory_resource cimport DeviceMemoryResource
1114

@@ -38,3 +41,33 @@ cdef class CudaAsyncManagedMemoryResource(DeviceMemoryResource):
3841
cdef cuda_async_managed_memory_resource* c_mr = \
3942
<cuda_async_managed_memory_resource*>self.c_obj.get()
4043
return <uintptr_t>c_mr.pool_handle()
44+
45+
46+
cdef class CudaAsyncPinnedMemoryResource(DeviceMemoryResource):
    """
    Memory resource that uses ``cudaMallocFromPoolAsync``/``cudaFreeAsync`` for
    allocation/deallocation with a pinned memory pool.

    This resource uses the default pinned memory pool for the current device.
    Pinned memory is page-locked host memory that can be accessed from both
    the host and device. This provides fast host-device transfers.

    Requires CUDA 13.0 or higher.
    """
    def __cinit__(self):
        # NOTE(review): the C++ constructor throws rmm::logic_error when the
        # driver/runtime is older than CUDA 13.0; confirm what Python exception
        # Cython's ``except +`` translation raises here (expected: RuntimeError).
        self.c_obj.reset(
            new cuda_async_pinned_memory_resource()
        )

    def pool_handle(self):
        """
        Returns the underlying CUDA memory pool handle.

        Returns
        -------
        int
            Handle (pointer value) of the underlying CUDA memory pool
        """
        # Downcast the stored device_memory_resource pointer to the concrete
        # pinned resource so we can reach its pool_handle() accessor.
        cdef cuda_async_pinned_memory_resource* c_mr = \
            <cuda_async_pinned_memory_resource*>self.c_obj.get()
        return <uintptr_t>c_mr.pool_handle()

0 commit comments

Comments
 (0)