Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions cpp/benchmarks/cuda_stream_pool/cuda_stream_pool_bench.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2020-2021, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

Expand All @@ -18,7 +18,7 @@ static void BM_StreamPoolGetStream(benchmark::State& state)

for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores)
auto stream = stream_pool.get_stream();
cudaStreamQuery(stream.value());
cudaStreamQuery(cuda::stream_ref{stream}.get());
}

state.SetItemsProcessed(static_cast<int64_t>(state.iterations()));
Expand All @@ -29,7 +29,7 @@ static void BM_CudaStreamClass(benchmark::State& state)
{
for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores)
auto stream = rmm::cuda_stream{};
cudaStreamQuery(stream.view().value());
cudaStreamQuery(cuda::stream_ref{stream}.get());
}

state.SetItemsProcessed(static_cast<int64_t>(state.iterations()));
Expand Down
15 changes: 8 additions & 7 deletions cpp/benchmarks/device_uvector/device_uvector_bench.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <rmm/mr/per_device_resource.hpp>
#include <rmm/mr/pool_memory_resource.hpp>

#include <cuda/stream_ref>
#include <cuda_runtime_api.h>
#include <thrust/device_vector.h>
#include <thrust/memory.h>
Expand All @@ -31,7 +32,7 @@ void BM_UvectorSizeConstruction(benchmark::State& state)

for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores)
rmm::device_uvector<std::int32_t> vec(static_cast<std::size_t>(state.range(0)),
rmm::cuda_stream_view{});
cuda::stream_ref{cudaStream_t{nullptr}});
cudaDeviceSynchronize();
}

Expand Down Expand Up @@ -78,7 +79,7 @@ using rmm_vector = rmm::device_vector<int32_t>;
using rmm_uvector = rmm::device_uvector<int32_t>;

template <typename Vector>
Vector make_vector(std::size_t num_elements, rmm::cuda_stream_view stream, bool zero_init = false)
Vector make_vector(std::size_t num_elements, cuda::stream_ref stream, bool zero_init = false)
{
static_assert(std::is_same_v<Vector, thrust_vector> or std::is_same_v<Vector, rmm_vector> or
std::is_same_v<Vector, rmm_uvector>,
Expand All @@ -90,7 +91,7 @@ Vector make_vector(std::size_t num_elements, rmm::cuda_stream_view stream, bool
} else if constexpr (std::is_same_v<Vector, rmm_uvector>) {
auto vec = Vector(num_elements, stream);
if (zero_init) {
cudaMemsetAsync(vec.data(), 0, num_elements * sizeof(std::int32_t), stream.value());
cudaMemsetAsync(vec.data(), 0, num_elements * sizeof(std::int32_t), stream.get());
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Verify async memset call sites and wrapper usage in this file
rg -n -C2 'cudaMemsetAsync\(' cpp/benchmarks/device_uvector/device_uvector_bench.cu
rg -n 'RMM_CUDA_TRY\s*\(\s*cudaMemsetAsync' cpp/benchmarks/device_uvector/device_uvector_bench.cu

Repository: rapidsai/rmm

Length of output: 248


Wrap cudaMemsetAsync with RMM_CUDA_TRY to detect CUDA errors.

Line 94 has an unchecked cudaMemsetAsync call that could silently fail. Per coding guidelines, all CUDA API calls must be wrapped with RMM_CUDA_TRY to detect errors early.

Suggested patch
-      cudaMemsetAsync(vec.data(), 0, num_elements * sizeof(std::int32_t), stream.get());
+      RMM_CUDA_TRY(
+        cudaMemsetAsync(vec.data(), 0, num_elements * sizeof(std::int32_t), stream.get()));
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
cudaMemsetAsync(vec.data(), 0, num_elements * sizeof(std::int32_t), stream.get());
RMM_CUDA_TRY(
cudaMemsetAsync(vec.data(), 0, num_elements * sizeof(std::int32_t), stream.get()));
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@cpp/benchmarks/device_uvector/device_uvector_bench.cu` at line 94, The
cudaMemsetAsync call using vec.data(), num_elements and stream.get() must be
wrapped with the RMM_CUDA_TRY macro to surface CUDA errors; replace the
unchecked call cudaMemsetAsync(vec.data(), 0, num_elements *
sizeof(std::int32_t), stream.get()) with
RMM_CUDA_TRY(cudaMemsetAsync(vec.data(), 0, num_elements * sizeof(std::int32_t),
stream.get())) so failures are reported (ensure RMM_CUDA_TRY is available in the
translation unit).

}
return vec;
}
Expand All @@ -111,14 +112,14 @@ void vector_workflow(std::size_t num_elements,
{
auto input = make_vector<Vector>(num_elements, input_stream, true);
input_stream.synchronize();
for (rmm::cuda_stream_view stream : streams) {
for (cuda::stream_ref stream : streams) {
auto output = make_vector<Vector>(num_elements, stream);
kernel<<<num_blocks, block_size, 0, stream.value()>>>(
kernel<<<num_blocks, block_size, 0, stream.get()>>>(
vector_data(input), vector_data(output), num_elements);
}

for (rmm::cuda_stream_view stream : streams) {
stream.synchronize();
for (cuda::stream_ref stream : streams) {
RMM_CUDA_TRY(cudaStreamSynchronize(stream.get()));
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <rmm/mr/pool_memory_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <cuda/stream_ref>
#include <cuda_runtime_api.h>

#include <benchmark/benchmark.h>
Expand Down Expand Up @@ -54,9 +55,9 @@ static void run_test(std::size_t num_kernels,
rmm::device_async_resource_ref mr)
{
for (std::size_t i = 0; i < num_kernels; i++) {
auto stream = stream_pool.get_stream(i);
auto stream = cuda::stream_ref{stream_pool.get_stream(i)};
auto buffer = rmm::device_uvector<int64_t>(1, stream, mr);
compute_bound_kernel<<<1, 1, 0, stream.value()>>>(buffer.data());
compute_bound_kernel<<<1, 1, 0, stream.get()>>>(buffer.data());
}
}

Expand Down
9 changes: 6 additions & 3 deletions cpp/benchmarks/random_allocations/random_allocations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
#include <rmm/mr/pool_memory_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <cuda/stream_ref>
#include <cuda_runtime_api.h>

#include <benchmark/benchmark.h>
#include <benchmarks/utilities/cxxopts.hpp>

Expand Down Expand Up @@ -54,7 +57,7 @@ void random_allocation_free(rmm::device_async_resource_ref mr,
SizeDistribution size_distribution,
std::size_t num_allocations,
std::size_t max_usage, // in MiB
rmm::cuda_stream_view stream = {})
cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}})
{
std::default_random_engine generator;

Expand Down Expand Up @@ -132,7 +135,7 @@ void uniform_random_allocations(
std::size_t num_allocations, // NOLINT(bugprone-easily-swappable-parameters)
std::size_t max_allocation_size, // size in MiB
std::size_t max_usage,
rmm::cuda_stream_view stream = {})
cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}})
{
std::uniform_int_distribution<std::size_t> size_distribution(1, max_allocation_size * size_mb);
random_allocation_free(mr, size_distribution, num_allocations, max_usage, stream);
Expand All @@ -144,7 +147,7 @@ void uniform_random_allocations(
std::size_t mean_allocation_size = 500, // in MiB
std::size_t stddev_allocation_size = 500, // in MiB
std::size_t max_usage = 8 << 20,
cuda_stream_view stream) {
cuda::stream_ref stream) {
std::normal_distribution<std::size_t> size_distribution(, max_allocation_size * size_mb);
}*/

Expand Down
12 changes: 9 additions & 3 deletions cpp/benchmarks/replay/replay.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
*/

#include <rmm/aligned.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/logger.hpp>
#include <rmm/mr/arena_memory_resource.hpp>
Expand All @@ -16,6 +15,7 @@
#include <rmm/resource_ref.hpp>

#include <cuda/iterator>
#include <cuda/stream_ref>
#include <thrust/execution_policy.h>
#include <thrust/reduce.h>

Expand Down Expand Up @@ -253,8 +253,14 @@ std::vector<std::vector<rmm::detail::event>> parse_per_thread_events(std::string
[](auto const& event) {
cudaStream_t custream;
memcpy(&custream, &event.stream, sizeof(cudaStream_t));
auto stream = rmm::cuda_stream_view{custream};
return stream.is_default() or stream.is_per_thread_default();
auto stream = cuda::stream_ref{custream};
#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM
return stream.get() == cudaStreamLegacy or
stream.get() == cudaStreamPerThread or stream.get() == nullptr;
#else
return stream.get() == cudaStreamLegacy or stream.get() == nullptr or
stream.get() == cudaStreamPerThread;
#endif
}),
"Non-default streams not currently supported.");

Expand Down
8 changes: 4 additions & 4 deletions cpp/benchmarks/synchronization/synchronization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

cuda_event_timer::cuda_event_timer(benchmark::State& state,
bool flush_l2_cache,
rmm::cuda_stream_view stream)
cuda::stream_ref stream)
: stream(stream), p_state(&state)
{
// flush all of L2$
Expand All @@ -36,18 +36,18 @@ cuda_event_timer::cuda_event_timer(benchmark::State& state,
RMM_CUDA_TRY(cudaMemsetAsync(l2_cache_buffer.data(),
memset_value,
static_cast<std::size_t>(l2_cache_bytes),
stream.value()));
stream.get()));
}
}

RMM_CUDA_TRY(cudaEventCreate(&start));
RMM_CUDA_TRY(cudaEventCreate(&stop));
RMM_CUDA_TRY(cudaEventRecord(start, stream.value()));
RMM_CUDA_TRY(cudaEventRecord(start, stream.get()));
}

cuda_event_timer::~cuda_event_timer()
{
RMM_CUDA_ASSERT_OK(cudaEventRecord(stop, stream.value()));
RMM_CUDA_ASSERT_OK(cudaEventRecord(stop, stream.get()));
RMM_CUDA_ASSERT_OK(cudaEventSynchronize(stop));

float milliseconds = 0.0F;
Expand Down
10 changes: 4 additions & 6 deletions cpp/benchmarks/synchronization/synchronization.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2019-2021, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

Expand Down Expand Up @@ -47,9 +47,7 @@

#pragma once

#include <rmm/cuda_stream_view.hpp>

// Google Benchmark library
#include <cuda/stream_ref>
#include <cuda_runtime_api.h>

#include <benchmark/benchmark.h>
Expand All @@ -68,7 +66,7 @@ class cuda_event_timer {
*/
cuda_event_timer(benchmark::State& state,
bool flush_l2_cache,
rmm::cuda_stream_view stream = rmm::cuda_stream_default);
cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}});

// The user will HAVE to provide a benchmark::State object to set
// the timer so we disable the default c'tor.
Expand All @@ -88,6 +86,6 @@ class cuda_event_timer {
private:
cudaEvent_t start{};
cudaEvent_t stop{};
rmm::cuda_stream_view stream{};
cuda::stream_ref stream{cudaStream_t{nullptr}};
benchmark::State* p_state{};
};
6 changes: 3 additions & 3 deletions cpp/include/rmm/detail/format.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

#pragma once

#include <rmm/cuda_stream_view.hpp>
#include <cuda/stream_ref>

#include <array>
#include <cstdio>
Expand All @@ -31,10 +31,10 @@ inline std::string format_bytes(std::size_t value)
}

// Stringify a stream ID
inline std::string format_stream(rmm::cuda_stream_view stream)
inline std::string format_stream(cuda::stream_ref stream)
{
std::stringstream sstr{};
sstr << std::hex << stream.value();
sstr << std::hex << stream.get();
return sstr.str();
}

Expand Down
6 changes: 4 additions & 2 deletions cpp/include/rmm/device_buffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <rmm/resource_ref.hpp>

#include <cuda/memory_resource>
#include <cuda/stream_ref>
#include <cuda_runtime_api.h>

#include <cassert>
Expand Down Expand Up @@ -379,8 +380,9 @@ class device_buffer {
void* _data{nullptr}; ///< Pointer to device memory allocation
std::size_t _size{}; ///< Requested size of the device memory allocation
std::size_t _alignment{rmm::CUDA_ALLOCATION_ALIGNMENT}; ///< The alignment of the allocation
std::size_t _capacity{}; ///< The actual size of the device memory allocation
cuda_stream_view _stream{}; ///< Stream to use for device memory deallocation
std::size_t _capacity{}; ///< The actual size of the device memory allocation
cuda::stream_ref _stream{
cudaStream_t{nullptr}}; ///< Stream to use for device memory deallocation

cuda::mr::any_resource<cuda::mr::device_accessible> _mr; ///< The memory resource used to
///< allocate/deallocate device memory
Expand Down
18 changes: 10 additions & 8 deletions cpp/include/rmm/device_scalar.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
#include <rmm/mr/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <cuda/stream_ref>

#include <type_traits>

namespace RMM_NAMESPACE {
Expand Down Expand Up @@ -84,7 +86,7 @@ class device_scalar {
* @param mr Optional, resource with which to allocate.
*/
explicit device_scalar(
cuda_stream_view stream,
cuda::stream_ref stream,
cuda::mr::any_resource<cuda::mr::device_accessible> mr = mr::get_current_device_resource_ref())
: _storage{1, stream, std::move(mr)}
{
Expand All @@ -110,7 +112,7 @@ class device_scalar {
*/
explicit device_scalar(
value_type const& initial_value,
cuda_stream_view stream,
cuda::stream_ref stream,
cuda::mr::any_resource<cuda::mr::device_accessible> mr = mr::get_current_device_resource_ref())
: _storage{1, stream, std::move(mr)}
{
Expand All @@ -131,7 +133,7 @@ class device_scalar {
*/
device_scalar(
device_scalar const& other,
cuda_stream_view stream,
cuda::stream_ref stream,
cuda::mr::any_resource<cuda::mr::device_accessible> mr = mr::get_current_device_resource_ref())
: _storage{other._storage, stream, std::move(mr)}
{
Expand All @@ -153,7 +155,7 @@ class device_scalar {
* @return T The value of the scalar.
* @param stream CUDA stream on which to perform the copy and synchronize.
*/
[[nodiscard]] value_type value(cuda_stream_view stream) const
[[nodiscard]] value_type value(cuda::stream_ref stream) const
{
return _storage.front_element(stream);
}
Expand Down Expand Up @@ -191,14 +193,14 @@ class device_scalar {
* @param value The host value which will be copied to device
* @param stream CUDA stream on which to perform the copy
*/
void set_value_async(value_type const& value, cuda_stream_view stream)
void set_value_async(value_type const& value, cuda::stream_ref stream)
{
_storage.set_element_async(0, value, stream);
}

// Disallow passing literals to set_value to avoid race conditions where the memory holding the
// literal can be freed before the async memcpy / memset executes.
void set_value_async(value_type&&, cuda_stream_view) = delete;
void set_value_async(value_type&&, cuda::stream_ref) = delete;

/**
* @brief Sets the value of the `device_scalar` to zero on the specified stream.
Expand All @@ -214,7 +216,7 @@ class device_scalar {
*
* @param stream CUDA stream on which to perform the copy
*/
void set_value_to_zero_async(cuda_stream_view stream)
void set_value_to_zero_async(cuda::stream_ref stream)
{
_storage.set_element_to_zero_async(value_type{0}, stream);
Comment on lines +219 to 221
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Verify the callee signature and this call site side-by-side.
rg -n -C2 'set_element_to_zero_async\s*\(' cpp/include/rmm/device_uvector.hpp cpp/include/rmm/device_scalar.hpp

Repository: rapidsai/rmm

Length of output: 1085


Use element index 0 in zeroing call (Line 221).

set_element_to_zero_async expects an element index as its first argument, not a value. Passing value_type{0} is incorrect—the function signature requires size_type element_index (see line 248 in device_uvector.hpp). This will fail for trivially copyable types that are not implicitly convertible to the index type.

Proposed fix
-    _storage.set_element_to_zero_async(value_type{0}, stream);
+    _storage.set_element_to_zero_async(size_type{0}, stream);
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
void set_value_to_zero_async(cuda::stream_ref stream)
{
_storage.set_element_to_zero_async(value_type{0}, stream);
void set_value_to_zero_async(cuda::stream_ref stream)
{
_storage.set_element_to_zero_async(size_type{0}, stream);
}
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@cpp/include/rmm/device_scalar.hpp` around lines 219 - 221, The call in
set_value_to_zero_async is passing value_type{0} to
_storage.set_element_to_zero_async but that helper expects an element index
(size_type) not a value; update the call in set_value_to_zero_async to pass
element index 0 (i.e. use 0 as the first argument) so
_storage.set_element_to_zero_async(0, stream) is invoked; reference
functions/fields: set_value_to_zero_async, _storage.set_element_to_zero_async,
and value_type to locate and correct the call.

}
Expand Down Expand Up @@ -261,7 +263,7 @@ class device_scalar {
*
* @param stream Stream to be used for deallocation
*/
void set_stream(cuda_stream_view stream) noexcept { _storage.set_stream(stream); }
void set_stream(cuda::stream_ref stream) noexcept { _storage.set_stream(stream); }

private:
rmm::device_uvector<T> _storage;
Expand Down
Loading
Loading