[libcu++] Automatically bump up the release threshold of default mempools (#6718)

pciolkosz · web-flow · commit 5bd0f5bf7a20 · 2025-11-22T09:29:26.000Z
diff --git a/libcudacxx/include/cuda/__driver/driver_api.h b/libcudacxx/include/cuda/__driver/driver_api.h
@@ -413,7 +413,15 @@ __getDefaultMemPool(CUmemLocation __location, CUmemAllocationType_enum __allocat
     __driver_fn, "Failed to get default memory pool", &__result, &__location, __allocation_type);
   return __result;
 }
-#  endif // _CCCL_CTK_AT_LEAST(13, 0)
+#  else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv
+_CCCL_HOST_API inline ::CUmemoryPool __deviceGetDefaultMemPool(::CUdevice __device)
+{
+  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuDeviceGetDefaultMemPool); // initialized on first call only
+  ::CUmemoryPool __result = nullptr;
+  ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get default memory pool", &__result, __device);
+  return __result;
+}
+#  endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^
 
 _CCCL_HOST_API inline ::CUdeviceptr __mallocManaged(::cuda::std::size_t __bytes, unsigned int __flags)
 {
diff --git a/libcudacxx/include/cuda/__memory_resource/device_memory_pool.h b/libcudacxx/include/cuda/__memory_resource/device_memory_pool.h
@@ -27,16 +27,16 @@
 #endif // _CCCL_CUDA_COMPILER(CLANG)
 
 #include <cuda/__memory_resource/get_property.h>
-#include <cuda/__memory_resource/memory_resource_base.h>
+#include <cuda/__memory_resource/memory_pool_base.h>
 #include <cuda/__memory_resource/properties.h>
 #include <cuda/__runtime/api_wrapper.h>
 #include <cuda/std/__concepts/concept_macros.h>
 
 #include <cuda/std/__cccl/prologue.h>
 
 //! @file
-//! The \c device_memory_pool class provides an asynchronous memory resource that allocates device memory in stream
-//! order.
+//! The \c device_memory_pool class provides an asynchronous memory resource
+//! that allocates device memory in stream order.
 _CCCL_BEGIN_NAMESPACE_CUDA
 
 //! @rst
@@ -45,30 +45,34 @@ _CCCL_BEGIN_NAMESPACE_CUDA
 //! Stream ordered memory pool
 //! ------------------------------
 //!
-//! ``device_memory_pool_ref`` allocates device memory using `cudaMallocFromPoolAsync / cudaFreeAsync
-//! <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html>`__ for allocation/deallocation. A
-//! ``device_memory_pool_ref`` is a thin wrapper around a \c cudaMemPool_t with the location type set to \c
-//! cudaMemLocationTypeDevice.
+//! ``device_memory_pool_ref`` allocates device memory using
+//! `cudaMallocFromPoolAsync / cudaFreeAsync
+//! <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html>`__
+//! for allocation/deallocation. A
+//! ``device_memory_pool_ref`` is a thin wrapper around a \c cudaMemPool_t with
+//! the location type set to \c cudaMemLocationTypeDevice.
 //!
 //! .. warning::
 //!
-//!    ``device_memory_pool_ref`` does not own the pool and it is the responsibility of the user to ensure that the
-//!    lifetime of the pool exceeds the lifetime of the ``device_memory_pool_ref``.
+//!    ``device_memory_pool_ref`` does not own the pool and it is the
+//!    responsibility of the user to ensure that the lifetime of the pool
+//!    exceeds the lifetime of the ``device_memory_pool_ref``.
 //!
 //! @endrst
-class device_memory_pool_ref : public __memory_resource_base
+class device_memory_pool_ref : public __memory_pool_base
 {
 public:
   //! @brief  Constructs the device_memory_pool_ref from a \c cudaMemPool_t.
   //! @param __pool The \c cudaMemPool_t used to allocate memory.
   _CCCL_HOST_API explicit device_memory_pool_ref(::cudaMemPool_t __pool) noexcept
-      : __memory_resource_base(__pool)
+      : __memory_pool_base(__pool)
   {}
 
   device_memory_pool_ref(int)                    = delete;
   device_memory_pool_ref(::cuda::std::nullptr_t) = delete;
 
-  //! @brief Enables the \c device_accessible property for \c device_memory_pool_ref.
+  //! @brief Enables the \c device_accessible property for \c
+  //! device_memory_pool_ref.
   //! @relates device_memory_pool_ref
   _CCCL_HOST_API friend constexpr void
   get_property(device_memory_pool_ref const&, ::cuda::mr::device_accessible) noexcept
@@ -82,12 +86,9 @@ class device_memory_pool_ref : public __memory_resource_base
 //! @returns The default memory pool of the specified device.
 [[nodiscard]] inline device_memory_pool_ref device_default_memory_pool(::cuda::device_ref __device)
 {
-  ::cuda::__verify_device_supports_stream_ordered_allocations(__device.get());
-
-  ::cudaMemPool_t __pool;
-  _CCCL_TRY_CUDA_API(
-    ::cudaDeviceGetDefaultMemPool, "Failed to call cudaDeviceGetDefaultMemPool", &__pool, __device.get());
-  return device_memory_pool_ref{__pool};
+  // Not static: the pool depends on __device, and a static would pin the first device's pool for all later calls.
+  return device_memory_pool_ref(::cuda::__get_default_memory_pool(
+    ::CUmemLocation{::CU_MEM_LOCATION_TYPE_DEVICE, __device.get()}, ::CU_MEM_ALLOCATION_TYPE_PINNED));
 }
 
 //! @rst
@@ -96,22 +97,28 @@ class device_memory_pool_ref : public __memory_resource_base
 //! Stream ordered memory resource
 //! ------------------------------
 //!
-//! ``device_memory_pool`` allocates device memory using `cudaMallocFromPoolAsync / cudaFreeAsync
-//! <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html>`__ for allocation/deallocation. A
-//! When constructed it creates an underlying \c cudaMemPool_t with the location type set to \c
-//! cudaMemLocationTypeDevice and owns it.
+//! ``device_memory_pool`` allocates device memory using
+//! `cudaMallocFromPoolAsync / cudaFreeAsync
+//! <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html>`__
+//! for allocation/deallocation. When constructed it creates an underlying \c
+//! cudaMemPool_t with the location type set to \c cudaMemLocationTypeDevice and
+//! owns it.
 //!
 //! @endrst
 struct device_memory_pool : device_memory_pool_ref
 {
   using reference_type = device_memory_pool_ref;
 
-  //! @brief Constructs a \c device_memory_pool with the optionally specified initial pool size and release
-  //! threshold. If the pool size grows beyond the release threshold, unused memory held by the pool will be released at
-  //! the next synchronization event.
-  //! @throws cuda_error if the CUDA version does not support ``cudaMallocAsync``.
-  //! @param __device_id The device id of the device the stream pool is constructed on.
-  //! @param __pool_properties Optional, additional properties of the pool to be created.
+  //! @brief Constructs a \c device_memory_pool with the optionally specified
+  //! initial pool size and release threshold. If the pool size grows beyond the
+  //! release threshold, unused memory held by the pool will be released at the
+  //! next synchronization event.
+  //! @throws cuda_error if the CUDA version does not support
+  //! ``cudaMallocAsync``.
+  //! @param __device_id The device id of the device the stream pool is
+  //! constructed on.
+  //! @param __properties Optional, additional properties of the pool to be
+  //! created.
   _CCCL_HOST_API device_memory_pool(::cuda::device_ref __device_id, memory_pool_properties __properties = {})
       : device_memory_pool_ref(__create_cuda_mempool(
           __properties,
diff --git a/libcudacxx/include/cuda/__memory_resource/legacy_pinned_memory_resource.h b/libcudacxx/include/cuda/__memory_resource/legacy_pinned_memory_resource.h
@@ -26,9 +26,11 @@
 #  include <cuda_runtime_api.h>
 #endif // _CCCL_CUDA_COMPILER(CLANG)
 
-#include <cuda/__memory_resource/memory_resource_base.h>
+#include <cuda/__device/device_ref.h>
 #include <cuda/__memory_resource/properties.h>
+#include <cuda/__memory_resource/resource.h>
 #include <cuda/__runtime/api_wrapper.h>
+#include <cuda/__runtime/ensure_current_context.h>
 #include <cuda/std/__concepts/concept_macros.h>
 #include <cuda/std/__exception/throw_error.h>
 
diff --git a/libcudacxx/include/cuda/__memory_resource/managed_memory_pool.h b/libcudacxx/include/cuda/__memory_resource/managed_memory_pool.h
@@ -23,47 +23,45 @@
 
 #if _CCCL_CTK_AT_LEAST(13, 0)
 
-#  include <cuda/__memory_resource/memory_resource_base.h>
+#  include <cuda/__memory_resource/memory_pool_base.h>
 #  include <cuda/__memory_resource/properties.h>
 #  include <cuda/std/__concepts/concept_macros.h>
 #  include <cuda/std/__exception/throw_error.h>
 
 #  include <cuda/std/__cccl/prologue.h>
 
 //! @file
-//! The \c managed_memory_resource class provides a memory resource that allocates managed memory.
+//! The \c managed_memory_pool class provides a memory resource that
+//! allocates managed memory.
 _CCCL_BEGIN_NAMESPACE_CUDA
 
-[[nodiscard]] static ::cudaMemPool_t __get_default_managed_pool()
-{
-  return ::cuda::__driver::__getDefaultMemPool(
-    ::CUmemLocation{::CU_MEM_LOCATION_TYPE_NONE, 0}, ::CU_MEM_ALLOCATION_TYPE_MANAGED);
-}
-
 //! @rst
 //! .. _cudax-memory-resource-async:
 //!
 //! Stream ordered memory resource
 //! ------------------------------
 //!
-//! ``managed_memory_pool_ref`` allocates managed memory using `cudaMallocFromPoolAsync / cudaFreeAsync
-//! <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html>`__ for allocation/deallocation. A
-//! ``managed_memory_pool_ref`` is a thin wrapper around a \c cudaMemPool_t with the allocation type set to \c
-//! cudaMemAllocationTypeManaged.
+//! ``managed_memory_pool_ref`` allocates managed memory using
+//! `cudaMallocFromPoolAsync / cudaFreeAsync
+//! <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html>`__
+//! for allocation/deallocation. A
+//! ``managed_memory_pool_ref`` is a thin wrapper around a \c cudaMemPool_t with
+//! the allocation type set to \c cudaMemAllocationTypeManaged.
 //!
 //! .. warning::
 //!
-//!    ``managed_memory_pool_ref`` does not own the pool and it is the responsibility of the user to ensure that the
-//!    lifetime of the pool exceeds the lifetime of the ``managed_memory_pool_ref``.
+//!    ``managed_memory_pool_ref`` does not own the pool and it is the
+//!    responsibility of the user to ensure that the lifetime of the pool
+//!    exceeds the lifetime of the ``managed_memory_pool_ref``.
 //!
 //! @endrst
-class managed_memory_pool_ref : public __memory_resource_base
+class managed_memory_pool_ref : public __memory_pool_base
 {
 public:
   //! @brief  Constructs the managed_memory_pool_ref from a \c cudaMemPool_t.
   //! @param __pool The \c cudaMemPool_t used to allocate memory.
   _CCCL_HOST_API explicit managed_memory_pool_ref(::cudaMemPool_t __pool) noexcept
-      : __memory_resource_base(__pool)
+      : __memory_pool_base(__pool)
   {}
 
   //! @brief Enables the \c device_accessible property
@@ -82,7 +80,9 @@ class managed_memory_pool_ref : public __memory_resource_base
 //! @returns The default managed memory pool.
 [[nodiscard]] inline managed_memory_pool_ref managed_default_memory_pool()
 {
-  return managed_memory_pool_ref{::cuda::__get_default_managed_pool()};
+  static ::cudaMemPool_t __pool = ::cuda::__get_default_memory_pool( // cached: the arguments never vary
+    ::CUmemLocation{::CU_MEM_LOCATION_TYPE_NONE, 0}, ::CU_MEM_ALLOCATION_TYPE_MANAGED);
+  return managed_memory_pool_ref(__pool);
 }
 
 //! @rst
@@ -91,9 +91,11 @@ class managed_memory_pool_ref : public __memory_resource_base
 //! Stream ordered memory resource
 //! ------------------------------
 //!
-//! ``managed_memory_pool`` allocates managed memory using `cudaMallocFromPoolAsync / cudaFreeAsync
-//! <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html>`__ for allocation/deallocation. A
-//! When constructed it creates an underlying \c cudaMemPool_t with the allocation type set to \c
+//! ``managed_memory_pool`` allocates managed memory using
+//! `cudaMallocFromPoolAsync / cudaFreeAsync
+//! <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html>`__
+//! for allocation/deallocation. When constructed it creates an underlying \c
+//! cudaMemPool_t with the allocation type set to \c
 //! cudaMemAllocationTypeManaged and owns it.
 //!
 //! @endrst
@@ -102,15 +104,18 @@ struct managed_memory_pool : managed_memory_pool_ref
   using reference_type = managed_memory_pool_ref;
 
   //! @brief Constructs a \c managed_memory_pool with optional properties.
-  //! Properties include the initial pool size and the release threshold. If the pool size grows beyond the release
-  //! threshold, unused memory held by the pool will be released at the next synchronization event.
-  //! @param __properties Optional, additional properties of the pool to be created.
+  //! Properties include the initial pool size and the release threshold. If the
+  //! pool size grows beyond the release threshold, unused memory held by the
+  //! pool will be released at the next synchronization event.
+  //! @param __properties Optional, additional properties of the pool to be
+  //! created.
   _CCCL_HOST_API managed_memory_pool(memory_pool_properties __properties = {})
       : managed_memory_pool_ref(__create_cuda_mempool(
           __properties, ::CUmemLocation{::CU_MEM_LOCATION_TYPE_NONE, 0}, ::CU_MEM_ALLOCATION_TYPE_MANAGED))
   {}
 
-  // TODO add a constructor that accepts memory location one a type for it is added
+  // TODO add a constructor that accepts memory location once a type for it is
+  // added
 
   ~managed_memory_pool() noexcept
   {
diff --git a/libcudacxx/include/cuda/__memory_resource/memory_pool_base.h b/libcudacxx/include/cuda/__memory_resource/memory_pool_base.h
diff --git a/libcudacxx/include/cuda/__memory_resource/pinned_memory_pool.h b/libcudacxx/include/cuda/__memory_resource/pinned_memory_pool.h