rapidsai · bdice · May 1, 2026 · May 1, 2026 · May 3, 2026 · coderabbitai
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -18,7 +18,7 @@ static void BM_StreamPoolGetStream(benchmark::State& state)
 
   for (auto _ : state) {  // NOLINT(clang-analyzer-deadcode.DeadStores)
     auto stream = stream_pool.get_stream();
-    cudaStreamQuery(stream.value());
+    cudaStreamQuery(cuda::stream_ref{stream}.get());
   }
 
   state.SetItemsProcessed(static_cast<int64_t>(state.iterations()));
@@ -29,7 +29,7 @@ static void BM_CudaStreamClass(benchmark::State& state)
 {
   for (auto _ : state) {  // NOLINT(clang-analyzer-deadcode.DeadStores)
     auto stream = rmm::cuda_stream{};
-    cudaStreamQuery(stream.view().value());
+    cudaStreamQuery(cuda::stream_ref{stream}.get());
   }
 
   state.SetItemsProcessed(static_cast<int64_t>(state.iterations()));

@@ -15,6 +15,7 @@
 #include <rmm/mr/per_device_resource.hpp>
 #include <rmm/mr/pool_memory_resource.hpp>
 
+#include <cuda/stream_ref>
 #include <cuda_runtime_api.h>
 #include <thrust/device_vector.h>
 #include <thrust/memory.h>
@@ -31,7 +32,7 @@ void BM_UvectorSizeConstruction(benchmark::State& state)
 
   for (auto _ : state) {  // NOLINT(clang-analyzer-deadcode.DeadStores)
     rmm::device_uvector<std::int32_t> vec(static_cast<std::size_t>(state.range(0)),
-                                          rmm::cuda_stream_view{});
+                                          cuda::stream_ref{cudaStream_t{nullptr}});
     cudaDeviceSynchronize();
   }
 
@@ -78,7 +79,7 @@ using rmm_vector    = rmm::device_vector<int32_t>;
 using rmm_uvector   = rmm::device_uvector<int32_t>;
 
 template <typename Vector>
-Vector make_vector(std::size_t num_elements, rmm::cuda_stream_view stream, bool zero_init = false)
+Vector make_vector(std::size_t num_elements, cuda::stream_ref stream, bool zero_init = false)
 {
   static_assert(std::is_same_v<Vector, thrust_vector> or std::is_same_v<Vector, rmm_vector> or
                   std::is_same_v<Vector, rmm_uvector>,
@@ -90,7 +91,7 @@ Vector make_vector(std::size_t num_elements, rmm::cuda_stream_view stream, bool
   } else if constexpr (std::is_same_v<Vector, rmm_uvector>) {
     auto vec = Vector(num_elements, stream);
     if (zero_init) {
-      cudaMemsetAsync(vec.data(), 0, num_elements * sizeof(std::int32_t), stream.value());
+      cudaMemsetAsync(vec.data(), 0, num_elements * sizeof(std::int32_t), stream.get());
-      cudaMemsetAsync(vec.data(), 0, num_elements * sizeof(std::int32_t), stream.get());
+      RMM_CUDA_TRY(
+        cudaMemsetAsync(vec.data(), 0, num_elements * sizeof(std::int32_t), stream.get()));
     }
     return vec;
   }
@@ -111,14 +112,14 @@ void vector_workflow(std::size_t num_elements,
 {
   auto input = make_vector<Vector>(num_elements, input_stream, true);
   input_stream.synchronize();
-  for (rmm::cuda_stream_view stream : streams) {
+  for (cuda::stream_ref stream : streams) {
     auto output = make_vector<Vector>(num_elements, stream);
-    kernel<<<num_blocks, block_size, 0, stream.value()>>>(
+    kernel<<<num_blocks, block_size, 0, stream.get()>>>(
       vector_data(input), vector_data(output), num_elements);
   }
 
-  for (rmm::cuda_stream_view stream : streams) {
-    stream.synchronize();
+  for (cuda::stream_ref stream : streams) {
+    RMM_CUDA_TRY(cudaStreamSynchronize(stream.get()));
   }
 }
 

@@ -15,6 +15,7 @@
 #include <rmm/mr/pool_memory_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
+#include <cuda/stream_ref>
 #include <cuda_runtime_api.h>
 
 #include <benchmark/benchmark.h>
@@ -54,9 +55,9 @@ static void run_test(std::size_t num_kernels,
                      rmm::device_async_resource_ref mr)
 {
   for (std::size_t i = 0; i < num_kernels; i++) {
-    auto stream = stream_pool.get_stream(i);
+    auto stream = cuda::stream_ref{stream_pool.get_stream(i)};
     auto buffer = rmm::device_uvector<int64_t>(1, stream, mr);
-    compute_bound_kernel<<<1, 1, 0, stream.value()>>>(buffer.data());
+    compute_bound_kernel<<<1, 1, 0, stream.get()>>>(buffer.data());
   }
 }
 

@@ -13,6 +13,9 @@
 #include <rmm/mr/pool_memory_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
+#include <cuda/stream_ref>
+#include <cuda_runtime_api.h>
+
 #include <benchmark/benchmark.h>
 #include <benchmarks/utilities/cxxopts.hpp>
 
@@ -54,7 +57,7 @@ void random_allocation_free(rmm::device_async_resource_ref mr,
                             SizeDistribution size_distribution,
                             std::size_t num_allocations,
                             std::size_t max_usage,  // in MiB
-                            rmm::cuda_stream_view stream = {})
+                            cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}})
 {
   std::default_random_engine generator;
 
@@ -132,7 +135,7 @@ void uniform_random_allocations(
   std::size_t num_allocations,      // NOLINT(bugprone-easily-swappable-parameters)
   std::size_t max_allocation_size,  // size in MiB
   std::size_t max_usage,
-  rmm::cuda_stream_view stream = {})
+  cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}})
 {
   std::uniform_int_distribution<std::size_t> size_distribution(1, max_allocation_size * size_mb);
   random_allocation_free(mr, size_distribution, num_allocations, max_usage, stream);
@@ -144,7 +147,7 @@ void uniform_random_allocations(
                                 std::size_t mean_allocation_size = 500, // in MiB
                                 std::size_t stddev_allocation_size = 500, // in MiB
                                 std::size_t max_usage = 8 << 20,
-                                cuda_stream_view stream) {
+                                cuda::stream_ref stream) {
   std::normal_distribution<std::size_t> size_distribution(, max_allocation_size * size_mb);
 }*/
 

@@ -4,7 +4,6 @@
  */
 
 #include <rmm/aligned.hpp>
-#include <rmm/cuda_stream_view.hpp>
 #include <rmm/detail/error.hpp>
 #include <rmm/logger.hpp>
 #include <rmm/mr/arena_memory_resource.hpp>
@@ -16,6 +15,7 @@
 #include <rmm/resource_ref.hpp>
 
 #include <cuda/iterator>
+#include <cuda/stream_ref>
 #include <thrust/execution_policy.h>
 #include <thrust/reduce.h>
 
@@ -253,8 +253,14 @@ std::vector<std::vector<rmm::detail::event>> parse_per_thread_events(std::string
                           [](auto const& event) {
                             cudaStream_t custream;
                             memcpy(&custream, &event.stream, sizeof(cudaStream_t));
-                            auto stream = rmm::cuda_stream_view{custream};
-                            return stream.is_default() or stream.is_per_thread_default();
+                            auto stream = cuda::stream_ref{custream};
+#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM
+                            return stream.get() == cudaStreamLegacy or
+                                   stream.get() == cudaStreamPerThread or stream.get() == nullptr;
+#else
+                            return stream.get() == cudaStreamLegacy or stream.get() == nullptr or
+                                   stream.get() == cudaStreamPerThread;
+#endif
                           }),
               "Non-default streams not currently supported.");
 

@@ -19,7 +19,7 @@
 
 cuda_event_timer::cuda_event_timer(benchmark::State& state,
                                    bool flush_l2_cache,
-                                   rmm::cuda_stream_view stream)
+                                   cuda::stream_ref stream)
   : stream(stream), p_state(&state)
 {
   // flush all of L2$
@@ -36,18 +36,18 @@ cuda_event_timer::cuda_event_timer(benchmark::State& state,
       RMM_CUDA_TRY(cudaMemsetAsync(l2_cache_buffer.data(),
                                    memset_value,
                                    static_cast<std::size_t>(l2_cache_bytes),
-                                   stream.value()));
+                                   stream.get()));
     }
   }
 
   RMM_CUDA_TRY(cudaEventCreate(&start));
   RMM_CUDA_TRY(cudaEventCreate(&stop));
-  RMM_CUDA_TRY(cudaEventRecord(start, stream.value()));
+  RMM_CUDA_TRY(cudaEventRecord(start, stream.get()));
 }
 
 cuda_event_timer::~cuda_event_timer()
 {
-  RMM_CUDA_ASSERT_OK(cudaEventRecord(stop, stream.value()));
+  RMM_CUDA_ASSERT_OK(cudaEventRecord(stop, stream.get()));
   RMM_CUDA_ASSERT_OK(cudaEventSynchronize(stop));
 
   float milliseconds = 0.0F;

@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -47,9 +47,7 @@
 
 #pragma once
 
-#include <rmm/cuda_stream_view.hpp>
-
-// Google Benchmark library
+#include <cuda/stream_ref>
 #include <cuda_runtime_api.h>
 
 #include <benchmark/benchmark.h>
@@ -68,7 +66,7 @@ class cuda_event_timer {
    */
   cuda_event_timer(benchmark::State& state,
                    bool flush_l2_cache,
-                   rmm::cuda_stream_view stream = rmm::cuda_stream_default);
+                   cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}});
 
   // The user will HAVE to provide a benchmark::State object to set
   // the timer so we disable the default c'tor.
@@ -88,6 +86,6 @@ class cuda_event_timer {
  private:
   cudaEvent_t start{};
   cudaEvent_t stop{};
-  rmm::cuda_stream_view stream{};
+  cuda::stream_ref stream{cudaStream_t{nullptr}};
   benchmark::State* p_state{};
 };
@@ -5,7 +5,7 @@
 
 #pragma once
 
-#include <rmm/cuda_stream_view.hpp>
+#include <cuda/stream_ref>
 
 #include <array>
 #include <cstdio>
@@ -31,10 +31,10 @@ inline std::string format_bytes(std::size_t value)
 }
 
 // Stringify a stream ID
-inline std::string format_stream(rmm::cuda_stream_view stream)
+inline std::string format_stream(cuda::stream_ref stream)
 {
   std::stringstream sstr{};
-  sstr << std::hex << stream.value();
+  sstr << std::hex << stream.get();
   return sstr.str();
 }
 

@@ -13,6 +13,7 @@
 #include <rmm/resource_ref.hpp>
 
 #include <cuda/memory_resource>
+#include <cuda/stream_ref>
 #include <cuda_runtime_api.h>
 
 #include <cassert>
@@ -379,8 +380,9 @@ class device_buffer {
   void* _data{nullptr};  ///< Pointer to device memory allocation
   std::size_t _size{};   ///< Requested size of the device memory allocation
   std::size_t _alignment{rmm::CUDA_ALLOCATION_ALIGNMENT};  ///< The alignment of the allocation
-  std::size_t _capacity{};     ///< The actual size of the device memory allocation
-  cuda_stream_view _stream{};  ///< Stream to use for device memory deallocation
+  std::size_t _capacity{};  ///< The actual size of the device memory allocation
+  cuda::stream_ref _stream{
+    cudaStream_t{nullptr}};  ///< Stream to use for device memory deallocation
 
   cuda::mr::any_resource<cuda::mr::device_accessible> _mr;  ///< The memory resource used to
                                                             ///< allocate/deallocate device memory

@@ -11,6 +11,8 @@
 #include <rmm/mr/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
+#include <cuda/stream_ref>
+
 #include <type_traits>
 
 namespace RMM_NAMESPACE {
@@ -84,7 +86,7 @@ class device_scalar {
    * @param mr Optional, resource with which to allocate.
    */
   explicit device_scalar(
-    cuda_stream_view stream,
+    cuda::stream_ref stream,
     cuda::mr::any_resource<cuda::mr::device_accessible> mr = mr::get_current_device_resource_ref())
     : _storage{1, stream, std::move(mr)}
   {
@@ -110,7 +112,7 @@ class device_scalar {
    */
   explicit device_scalar(
     value_type const& initial_value,
-    cuda_stream_view stream,
+    cuda::stream_ref stream,
     cuda::mr::any_resource<cuda::mr::device_accessible> mr = mr::get_current_device_resource_ref())
     : _storage{1, stream, std::move(mr)}
   {
@@ -131,7 +133,7 @@ class device_scalar {
    */
   device_scalar(
     device_scalar const& other,
-    cuda_stream_view stream,
+    cuda::stream_ref stream,
     cuda::mr::any_resource<cuda::mr::device_accessible> mr = mr::get_current_device_resource_ref())
     : _storage{other._storage, stream, std::move(mr)}
   {
@@ -153,7 +155,7 @@ class device_scalar {
    * @return T The value of the scalar.
    * @param stream CUDA stream on which to perform the copy and synchronize.
    */
-  [[nodiscard]] value_type value(cuda_stream_view stream) const
+  [[nodiscard]] value_type value(cuda::stream_ref stream) const
   {
     return _storage.front_element(stream);
   }
@@ -191,14 +193,14 @@ class device_scalar {
    * @param value The host value which will be copied to device
    * @param stream CUDA stream on which to perform the copy
    */
-  void set_value_async(value_type const& value, cuda_stream_view stream)
+  void set_value_async(value_type const& value, cuda::stream_ref stream)
   {
     _storage.set_element_async(0, value, stream);
   }
 
   // Disallow passing literals to set_value to avoid race conditions where the memory holding the
   // literal can be freed before the async memcpy / memset executes.
-  void set_value_async(value_type&&, cuda_stream_view) = delete;
+  void set_value_async(value_type&&, cuda::stream_ref) = delete;
 
   /**
    * @brief Sets the value of the `device_scalar` to zero on the specified stream.
@@ -214,7 +216,7 @@ class device_scalar {
    *
    * @param stream CUDA stream on which to perform the copy
    */
-  void set_value_to_zero_async(cuda_stream_view stream)
+  void set_value_to_zero_async(cuda::stream_ref stream)
   {
     _storage.set_element_to_zero_async(value_type{0}, stream);
-  void set_value_to_zero_async(cuda::stream_ref stream)
-  {
-    _storage.set_element_to_zero_async(value_type{0}, stream);
+  void set_value_to_zero_async(cuda::stream_ref stream)
+  {
+    _storage.set_element_to_zero_async(size_type{0}, stream);
+  }
   }
@@ -261,7 +263,7 @@ class device_scalar {
    *
    * @param stream Stream to be used for deallocation
    */
-  void set_stream(cuda_stream_view stream) noexcept { _storage.set_stream(stream); }
+  void set_stream(cuda::stream_ref stream) noexcept { _storage.set_stream(stream); }
 
  private:
   rmm::device_uvector<T> _storage;