Skip to content

Commit e99ed6e

Browse files
committed
Add experimental cuda_async_pinned_memory_resource
Adds a new cuda_async_pinned_memory_resource that provides stream-ordered pinned (page-locked) host memory allocation using CUDA 13.0's cudaMemGetDefaultMemPool API with cudaMemAllocationTypePinned. This parallels the cuda_async_managed_memory_resource added in rapidsai#2056 and addresses part of rapidsai#2054.

Key features:
- Uses the default pinned memory pool for stream-ordered allocation
- Accessible from both host and device
- Requires CUDA 13.0+ (matches the managed version for consistency)

Implementation includes:
- C++ header and implementation in cuda_async_pinned_memory_resource.hpp
- Runtime capability check in runtime_capabilities.hpp
- C++ tests in cuda_async_pinned_mr_tests.cpp
- Python bindings in the experimental module
- Python tests in test_cuda_async_pinned_memory_resource.py
1 parent d0b50ed commit e99ed6e

10 files changed

Lines changed: 419 additions & 1 deletion

File tree

cpp/include/rmm/detail/runtime_capabilities.hpp

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,11 @@ namespace detail {
2525
*/
2626
#define RMM_MIN_ASYNC_MANAGED_ALLOC_CUDA_VERSION 13000
2727

28+
/**
29+
* @brief Minimum CUDA driver version for stream-ordered pinned memory allocator support
30+
*/
31+
#define RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION 13000
32+
2833
/**
2934
* @brief Determine at runtime if the CUDA driver supports the stream-ordered
3035
* memory allocator functions.
@@ -146,5 +151,31 @@ struct runtime_async_managed_alloc {
146151
}
147152
};
148153

154+
/**
155+
* @brief Determine at runtime if the CUDA driver/runtime supports the stream-ordered
156+
* pinned memory allocator functions.
157+
*
158+
* Stream-ordered pinned memory pools were introduced in CUDA 12.6 but our
159+
* implementation requires features from CUDA 13.0 or higher.
160+
*/
161+
struct runtime_async_pinned_alloc {
162+
static bool is_supported()
163+
{
164+
static auto supports_async_pinned_pool{[] {
165+
// Basic pool support required
166+
if (not runtime_async_alloc::is_supported()) { return false; }
167+
// CUDA 13.0 or higher is required for async pinned memory pools
168+
int cuda_driver_version{};
169+
auto driver_result = cudaDriverGetVersion(&cuda_driver_version);
170+
int cuda_runtime_version{};
171+
auto runtime_result = cudaRuntimeGetVersion(&cuda_runtime_version);
172+
return driver_result == cudaSuccess and runtime_result == cudaSuccess and
173+
cuda_driver_version >= RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION and
174+
cuda_runtime_version >= RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION;
175+
}()};
176+
return supports_async_pinned_pool;
177+
}
178+
};
179+
149180
} // namespace detail
150181
} // namespace RMM_NAMESPACE
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
#pragma once
6+
7+
#include <rmm/cuda_device.hpp>
8+
#include <rmm/cuda_stream_view.hpp>
9+
#include <rmm/detail/error.hpp>
10+
#include <rmm/detail/export.hpp>
11+
#include <rmm/detail/runtime_capabilities.hpp>
12+
#include <rmm/detail/thrust_namespace.h>
13+
#include <rmm/mr/cuda_async_view_memory_resource.hpp>
14+
#include <rmm/mr/device_memory_resource.hpp>
15+
16+
#include <cuda/std/type_traits>
17+
#include <cuda_runtime_api.h>
18+
19+
#include <cstddef>
20+
#include <cstdint>
21+
#include <optional>
22+
23+
namespace RMM_NAMESPACE {
24+
namespace mr {
25+
/**
26+
* @addtogroup memory_resources
27+
* @{
28+
* @file
29+
*/
30+
31+
/**
32+
* @brief `device_memory_resource` derived class that uses
33+
* `cudaMallocFromPoolAsync`/`cudaFreeFromPoolAsync` with a pinned memory pool
34+
* for allocation/deallocation.
35+
*/
36+
class cuda_async_pinned_memory_resource final : public device_memory_resource {
37+
public:
38+
/**
39+
* @brief Constructs a cuda_async_pinned_memory_resource with the default pinned memory pool for
40+
* the current device.
41+
*
42+
* The default pinned memory pool is the pool that is created when the device is created.
43+
* Pool properties such as the release threshold are not modified.
44+
*
45+
* @throws rmm::logic_error if the CUDA version does not support `cudaMallocFromPoolAsync` with
46+
* pinned memory pool
47+
*/
48+
cuda_async_pinned_memory_resource()
49+
{
50+
// Check if pinned memory pools are supported
51+
RMM_EXPECTS(rmm::detail::runtime_async_pinned_alloc::is_supported(),
52+
"cuda_async_pinned_memory_resource requires CUDA 13.0 or higher");
53+
54+
#if defined(CUDA_VERSION) && CUDA_VERSION >= RMM_MIN_ASYNC_PINNED_ALLOC_CUDA_VERSION
55+
cudaMemPool_t pinned_pool_handle{};
56+
cudaMemLocation location{.type = cudaMemLocationTypeDevice,
57+
.id = rmm::get_current_cuda_device().value()};
58+
RMM_CUDA_TRY(
59+
cudaMemGetDefaultMemPool(&pinned_pool_handle, &location, cudaMemAllocationTypePinned));
60+
pool_ = cuda_async_view_memory_resource{pinned_pool_handle};
61+
#endif
62+
}
63+
64+
/**
65+
* @brief Returns the underlying native handle to the CUDA pool
66+
*
67+
* @return cudaMemPool_t Handle to the underlying CUDA pool
68+
*/
69+
[[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return pool_.pool_handle(); }
70+
71+
~cuda_async_pinned_memory_resource() override {}
72+
cuda_async_pinned_memory_resource(cuda_async_pinned_memory_resource const&) = delete;
73+
cuda_async_pinned_memory_resource(cuda_async_pinned_memory_resource&&) = delete;
74+
cuda_async_pinned_memory_resource& operator=(cuda_async_pinned_memory_resource const&) = delete;
75+
cuda_async_pinned_memory_resource& operator=(cuda_async_pinned_memory_resource&&) = delete;
76+
77+
private:
78+
cuda_async_view_memory_resource pool_{};
79+
80+
/**
81+
* @brief Allocates memory of size at least \p bytes.
82+
*
83+
* The returned pointer will have at minimum 256 byte alignment.
84+
*
85+
* @param bytes The size of the allocation
86+
* @param stream Stream on which to perform allocation
87+
* @return void* Pointer to the newly allocated memory
88+
*/
89+
void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override
90+
{
91+
return pool_.allocate(stream, bytes);
92+
}
93+
94+
/**
95+
* @brief Deallocate memory pointed to by \p p.
96+
*
97+
* @param ptr Pointer to be deallocated
98+
* @param bytes The size in bytes of the allocation. This must be equal to the
99+
* value of `bytes` that was passed to the `allocate` call that returned `p`.
100+
* @param stream Stream on which to perform deallocation
101+
*/
102+
void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) noexcept override
103+
{
104+
pool_.deallocate(stream, ptr, bytes);
105+
}
106+
107+
/**
108+
* @brief Compare this resource to another.
109+
*
110+
* @param other The other resource to compare to
111+
* @return true If the two resources are equivalent
112+
* @return false If the two resources are not equal
113+
*/
114+
[[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override
115+
{
116+
auto const* async_mr = dynamic_cast<cuda_async_pinned_memory_resource const*>(&other);
117+
return (async_mr != nullptr) && (this->pool_handle() == async_mr->pool_handle());
118+
}
119+
120+
friend auto get_property(cuda_async_pinned_memory_resource const&,
121+
cuda::mr::device_accessible) noexcept
122+
{
123+
return cuda::mr::device_accessible{};
124+
}
125+
friend auto get_property(cuda_async_pinned_memory_resource const&,
126+
cuda::mr::host_accessible) noexcept
127+
{
128+
return cuda::mr::host_accessible{};
129+
}
130+
};
131+
132+
// static property checks: compile-time verification that the resource models the
// (polyfilled) cuda::mr resource concepts and advertises both host and device
// accessibility, matching the get_property friends above.
static_assert(rmm::detail::polyfill::resource<cuda_async_pinned_memory_resource>);
static_assert(rmm::detail::polyfill::async_resource<cuda_async_pinned_memory_resource>);
static_assert(rmm::detail::polyfill::resource_with<cuda_async_pinned_memory_resource,
                                                   cuda::mr::host_accessible,
                                                   cuda::mr::device_accessible>);
static_assert(rmm::detail::polyfill::async_resource_with<cuda_async_pinned_memory_resource,
                                                         cuda::mr::host_accessible,
                                                         cuda::mr::device_accessible>);
141+
/** @} */ // end of group
142+
} // namespace mr
143+
} // namespace RMM_NAMESPACE
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
#include <rmm/detail/error.hpp>
7+
#include <rmm/detail/runtime_capabilities.hpp>
8+
#include <rmm/mr/cuda_async_pinned_memory_resource.hpp>
9+
10+
#include <cuda_runtime_api.h>
11+
12+
#include <gtest/gtest.h>
13+
14+
namespace rmm::test {
15+
namespace {
16+
17+
using cuda_async_pinned_mr = rmm::mr::cuda_async_pinned_memory_resource;
18+
19+
// Fixture for cuda_async_pinned_memory_resource tests. Skips every test in the
// suite when the driver/runtime lacks pinned-pool support, so the tests can run
// unmodified on older CUDA installations.
class AsyncPinnedMRTest : public ::testing::Test {
 protected:
  void SetUp() override
  {
    // runtime_async_pinned_alloc checks both driver and runtime versions (>= 13.0)
    // plus basic stream-ordered allocation support.
    if (!rmm::detail::runtime_async_pinned_alloc::is_supported()) {
      GTEST_SKIP() << "Skipping tests because cuda_async_pinned_memory_resource "
                   << "requires CUDA 13.0 or higher and memory pool support.";
    }
  }
};
29+
30+
// Smoke test: a synchronous allocate must yield a non-null pointer that can be freed.
TEST_F(AsyncPinnedMRTest, BasicAllocateDeallocate)
{
  const auto num_bytes{100};
  cuda_async_pinned_mr resource{};
  void* allocation{resource.allocate_sync(num_bytes)};
  ASSERT_NE(nullptr, allocation);
  resource.deallocate_sync(allocation, num_bytes);
}
38+
39+
// Two default-constructed instances wrap the same default pinned pool, so their
// pool handles match and is_equal must report equivalence.
TEST_F(AsyncPinnedMRTest, EqualityWithSamePool)
{
  cuda_async_pinned_mr first{};
  cuda_async_pinned_mr second{};
  EXPECT_TRUE(first.is_equal(second));
}
47+
48+
// Pinned allocations are page-locked *host* memory: the CPU must be able to write
// and read them directly, with every element round-tripping intact.
TEST_F(AsyncPinnedMRTest, AllocatedPointerIsAccessibleFromHost)
{
  constexpr int num_elements{100};
  const auto alloc_size{sizeof(int) * num_elements};
  cuda_async_pinned_mr mr{};
  auto* ptr = static_cast<int*>(mr.allocate_sync(alloc_size));
  ASSERT_NE(nullptr, ptr);

  // Write from host. (No EXPECT_NO_THROW: raw pointer stores cannot throw C++
  // exceptions — an invalid pointer would fault, not throw — so wrapping them
  // in EXPECT_NO_THROW only obscured the intent.)
  for (int i = 0; i < num_elements; ++i) {
    ptr[i] = i;
  }

  // Verify every element reads back, not just a few spot checks.
  for (int i = 0; i < num_elements; ++i) {
    EXPECT_EQ(ptr[i], i);
  }

  mr.deallocate_sync(ptr, alloc_size);
}
70+
71+
// Several live allocations of differing sizes must each be independently
// host-accessible without interfering with one another.
TEST_F(AsyncPinnedMRTest, MultipleAllocationsAccessible)
{
  const auto alloc_size{512};
  cuda_async_pinned_mr resource{};

  // Same allocation order and sizes as a single-buffer test would not exercise:
  // base, double, and half sized buffers held simultaneously.
  auto* first  = static_cast<char*>(resource.allocate_sync(alloc_size));
  auto* second = static_cast<char*>(resource.allocate_sync(alloc_size * 2));
  auto* third  = static_cast<char*>(resource.allocate_sync(alloc_size / 2));

  ASSERT_NE(nullptr, first);
  ASSERT_NE(nullptr, second);
  ASSERT_NE(nullptr, third);

  // Each buffer must accept a host-side store...
  EXPECT_NO_THROW({
    first[0]  = 'a';
    second[0] = 'b';
    third[0]  = 'c';
  });

  // ...and read the same value back.
  EXPECT_EQ(first[0], 'a');
  EXPECT_EQ(second[0], 'b');
  EXPECT_EQ(third[0], 'c');

  resource.deallocate_sync(first, alloc_size);
  resource.deallocate_sync(second, alloc_size * 2);
  resource.deallocate_sync(third, alloc_size / 2);
}
103+
104+
// The resource must expose a non-null handle to its underlying CUDA memory pool.
TEST_F(AsyncPinnedMRTest, PoolHandleIsValid)
{
  cuda_async_pinned_mr resource{};
  EXPECT_NE(resource.pool_handle(), nullptr);
}
110+
111+
} // namespace
112+
} // namespace rmm::test

python/rmm/rmm/librmm/memory_resource.pxd

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,13 @@ cdef extern from "rmm/mr/cuda_async_managed_memory_resource.hpp" \
134134
cuda_async_managed_memory_resource() except +
135135
cudaMemPool_t pool_handle() const
136136

137+
# Declaration of the experimental C++ pinned-pool resource so Cython code can
# construct it and query its pool handle. Requires CUDA 13.0+ at runtime
# (construction throws, surfaced via `except +`).
cdef extern from "rmm/mr/cuda_async_pinned_memory_resource.hpp" \
        namespace "rmm::mr" nogil:

    cdef cppclass cuda_async_pinned_memory_resource(device_memory_resource):
        cuda_async_pinned_memory_resource() except +
        cudaMemPool_t pool_handle() const
143+
137144
cdef extern from "rmm/mr/cuda_async_memory_resource.hpp" \
138145
namespace \
139146
"rmm::mr::cuda_async_memory_resource" \

python/rmm/rmm/mr/experimental.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@
55

66
# Public re-exports of the experimental memory resources; the implementations
# live in the Cython module ``rmm.pylibrmm.memory_resource.experimental``.
from rmm.pylibrmm.memory_resource.experimental import (
    CudaAsyncManagedMemoryResource,
    CudaAsyncPinnedMemoryResource,
)

__all__ = [
    "CudaAsyncManagedMemoryResource",
    "CudaAsyncPinnedMemoryResource",
]

python/rmm/rmm/pylibrmm/memory_resource/experimental.pxd

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,6 @@ from rmm.pylibrmm.memory_resource._memory_resource cimport DeviceMemoryResource
77

88
cdef class CudaAsyncManagedMemoryResource(DeviceMemoryResource):
99
pass
10+
11+
cdef class CudaAsyncPinnedMemoryResource(DeviceMemoryResource):
    # Declaration only; the constructor and pool_handle() live in experimental.pyx.
    pass

python/rmm/rmm/pylibrmm/memory_resource/experimental.pyi

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,7 @@ from rmm.pylibrmm.memory_resource._memory_resource import DeviceMemoryResource
66
class CudaAsyncManagedMemoryResource(DeviceMemoryResource):
77
def __init__(self) -> None: ...
88
def pool_handle(self) -> int: ...
9+
10+
class CudaAsyncPinnedMemoryResource(DeviceMemoryResource):
    # Stream-ordered pinned (page-locked host) memory resource; requires CUDA 13.0+.
    def __init__(self) -> None: ...
    # Returns the underlying cudaMemPool_t handle as an integer address.
    def pool_handle(self) -> int: ...

python/rmm/rmm/pylibrmm/memory_resource/experimental.pyx

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,10 @@
55

66
from libc.stdint cimport uintptr_t
77

8-
from rmm.librmm.memory_resource cimport cuda_async_managed_memory_resource
8+
from rmm.librmm.memory_resource cimport (
9+
cuda_async_managed_memory_resource,
10+
cuda_async_pinned_memory_resource,
11+
)
912
# import from the private _memory_resource to avoid a circular import
1013
from rmm.pylibrmm.memory_resource._memory_resource cimport DeviceMemoryResource
1114

@@ -38,3 +41,33 @@ cdef class CudaAsyncManagedMemoryResource(DeviceMemoryResource):
3841
cdef cuda_async_managed_memory_resource* c_mr = \
3942
<cuda_async_managed_memory_resource*>self.c_obj.get()
4043
return <uintptr_t>c_mr.pool_handle()
44+
45+
46+
cdef class CudaAsyncPinnedMemoryResource(DeviceMemoryResource):
    """
    Memory resource that uses ``cudaMallocFromPoolAsync``/``cudaFreeAsync`` for
    allocation/deallocation with a pinned memory pool.

    This resource uses the default pinned memory pool for the current device.
    Pinned memory is page-locked host memory that can be accessed from both
    the host and device. This provides fast host-device transfers.

    Requires CUDA 13.0 or higher.
    """
    def __cinit__(self):
        # NOTE(review): the C++ constructor throws rmm::logic_error when the
        # driver/runtime is older than CUDA 13.0; confirm what Python exception
        # Cython's ``except +`` translation raises here (expected: RuntimeError).
        self.c_obj.reset(
            new cuda_async_pinned_memory_resource()
        )

    def pool_handle(self):
        """
        Returns the underlying CUDA memory pool handle.

        Returns
        -------
        int
            Handle (pointer value) of the underlying CUDA memory pool
        """
        # Downcast the stored device_memory_resource pointer to the concrete
        # pinned resource so we can reach its pool_handle() accessor.
        cdef cuda_async_pinned_memory_resource* c_mr = \
            <cuda_async_pinned_memory_resource*>self.c_obj.get()
        return <uintptr_t>c_mr.pool_handle()

0 commit comments

Comments
 (0)