Skip to content

Commit 73ad4eb

Browse files
committed
Use cuda::stream_ref in benchmarks
1 parent a8b279c commit 73ad4eb

7 files changed

Lines changed: 37 additions & 28 deletions

File tree

cpp/benchmarks/cuda_stream_pool/cuda_stream_pool_bench.cpp

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2020-2021, NVIDIA CORPORATION.
2+
* SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION.
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55

@@ -18,7 +18,7 @@ static void BM_StreamPoolGetStream(benchmark::State& state)
1818

1919
for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores)
2020
auto stream = stream_pool.get_stream();
21-
cudaStreamQuery(stream.value());
21+
cudaStreamQuery(cuda::stream_ref{stream}.get());
2222
}
2323

2424
state.SetItemsProcessed(static_cast<int64_t>(state.iterations()));
@@ -29,7 +29,7 @@ static void BM_CudaStreamClass(benchmark::State& state)
2929
{
3030
for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores)
3131
auto stream = rmm::cuda_stream{};
32-
cudaStreamQuery(stream.view().value());
32+
cudaStreamQuery(cuda::stream_ref{stream}.get());
3333
}
3434

3535
state.SetItemsProcessed(static_cast<int64_t>(state.iterations()));

cpp/benchmarks/device_uvector/device_uvector_bench.cu

Lines changed: 8 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@
1515
#include <rmm/mr/per_device_resource.hpp>
1616
#include <rmm/mr/pool_memory_resource.hpp>
1717

18+
#include <cuda/stream_ref>
1819
#include <cuda_runtime_api.h>
1920
#include <thrust/device_vector.h>
2021
#include <thrust/memory.h>
@@ -31,7 +32,7 @@ void BM_UvectorSizeConstruction(benchmark::State& state)
3132

3233
for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores)
3334
rmm::device_uvector<std::int32_t> vec(static_cast<std::size_t>(state.range(0)),
34-
rmm::cuda_stream_view{});
35+
cuda::stream_ref{cudaStream_t{nullptr}});
3536
cudaDeviceSynchronize();
3637
}
3738

@@ -78,7 +79,7 @@ using rmm_vector = rmm::device_vector<int32_t>;
7879
using rmm_uvector = rmm::device_uvector<int32_t>;
7980

8081
template <typename Vector>
81-
Vector make_vector(std::size_t num_elements, rmm::cuda_stream_view stream, bool zero_init = false)
82+
Vector make_vector(std::size_t num_elements, cuda::stream_ref stream, bool zero_init = false)
8283
{
8384
static_assert(std::is_same_v<Vector, thrust_vector> or std::is_same_v<Vector, rmm_vector> or
8485
std::is_same_v<Vector, rmm_uvector>,
@@ -90,7 +91,7 @@ Vector make_vector(std::size_t num_elements, rmm::cuda_stream_view stream, bool
9091
} else if constexpr (std::is_same_v<Vector, rmm_uvector>) {
9192
auto vec = Vector(num_elements, stream);
9293
if (zero_init) {
93-
cudaMemsetAsync(vec.data(), 0, num_elements * sizeof(std::int32_t), stream.value());
94+
cudaMemsetAsync(vec.data(), 0, num_elements * sizeof(std::int32_t), stream.get());
9495
}
9596
return vec;
9697
}
@@ -111,14 +112,14 @@ void vector_workflow(std::size_t num_elements,
111112
{
112113
auto input = make_vector<Vector>(num_elements, input_stream, true);
113114
input_stream.synchronize();
114-
for (rmm::cuda_stream_view stream : streams) {
115+
for (cuda::stream_ref stream : streams) {
115116
auto output = make_vector<Vector>(num_elements, stream);
116-
kernel<<<num_blocks, block_size, 0, stream.value()>>>(
117+
kernel<<<num_blocks, block_size, 0, stream.get()>>>(
117118
vector_data(input), vector_data(output), num_elements);
118119
}
119120

120-
for (rmm::cuda_stream_view stream : streams) {
121-
stream.synchronize();
121+
for (cuda::stream_ref stream : streams) {
122+
RMM_CUDA_TRY(cudaStreamSynchronize(stream.get()));
122123
}
123124
}
124125

cpp/benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@
1515
#include <rmm/mr/pool_memory_resource.hpp>
1616
#include <rmm/resource_ref.hpp>
1717

18+
#include <cuda/stream_ref>
1819
#include <cuda_runtime_api.h>
1920

2021
#include <benchmark/benchmark.h>
@@ -54,9 +55,9 @@ static void run_test(std::size_t num_kernels,
5455
rmm::device_async_resource_ref mr)
5556
{
5657
for (std::size_t i = 0; i < num_kernels; i++) {
57-
auto stream = stream_pool.get_stream(i);
58+
auto stream = cuda::stream_ref{stream_pool.get_stream(i)};
5859
auto buffer = rmm::device_uvector<int64_t>(1, stream, mr);
59-
compute_bound_kernel<<<1, 1, 0, stream.value()>>>(buffer.data());
60+
compute_bound_kernel<<<1, 1, 0, stream.get()>>>(buffer.data());
6061
}
6162
}
6263

cpp/benchmarks/random_allocations/random_allocations.cpp

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,9 @@
1313
#include <rmm/mr/pool_memory_resource.hpp>
1414
#include <rmm/resource_ref.hpp>
1515

16+
#include <cuda/stream_ref>
17+
#include <cuda_runtime_api.h>
18+
1619
#include <benchmark/benchmark.h>
1720
#include <benchmarks/utilities/cxxopts.hpp>
1821

@@ -54,7 +57,7 @@ void random_allocation_free(rmm::device_async_resource_ref mr,
5457
SizeDistribution size_distribution,
5558
std::size_t num_allocations,
5659
std::size_t max_usage, // in MiB
57-
rmm::cuda_stream_view stream = {})
60+
cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}})
5861
{
5962
std::default_random_engine generator;
6063

@@ -132,7 +135,7 @@ void uniform_random_allocations(
132135
std::size_t num_allocations, // NOLINT(bugprone-easily-swappable-parameters)
133136
std::size_t max_allocation_size, // size in MiB
134137
std::size_t max_usage,
135-
rmm::cuda_stream_view stream = {})
138+
cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}})
136139
{
137140
std::uniform_int_distribution<std::size_t> size_distribution(1, max_allocation_size * size_mb);
138141
random_allocation_free(mr, size_distribution, num_allocations, max_usage, stream);
@@ -144,7 +147,7 @@ void uniform_random_allocations(
144147
std::size_t mean_allocation_size = 500, // in MiB
145148
std::size_t stddev_allocation_size = 500, // in MiB
146149
std::size_t max_usage = 8 << 20,
147-
cuda_stream_view stream) {
150+
cuda::stream_ref stream) {
148151
std::normal_distribution<std::size_t> size_distribution(, max_allocation_size * size_mb);
149152
}*/
150153

cpp/benchmarks/replay/replay.cpp

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,6 @@
44
*/
55

66
#include <rmm/aligned.hpp>
7-
#include <rmm/cuda_stream_view.hpp>
87
#include <rmm/detail/error.hpp>
98
#include <rmm/logger.hpp>
109
#include <rmm/mr/arena_memory_resource.hpp>
@@ -16,6 +15,7 @@
1615
#include <rmm/resource_ref.hpp>
1716

1817
#include <cuda/iterator>
18+
#include <cuda/stream_ref>
1919
#include <thrust/execution_policy.h>
2020
#include <thrust/reduce.h>
2121

@@ -253,8 +253,14 @@ std::vector<std::vector<rmm::detail::event>> parse_per_thread_events(std::string
253253
[](auto const& event) {
254254
cudaStream_t custream;
255255
memcpy(&custream, &event.stream, sizeof(cudaStream_t));
256-
auto stream = rmm::cuda_stream_view{custream};
257-
return stream.is_default() or stream.is_per_thread_default();
256+
auto stream = cuda::stream_ref{custream};
257+
#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM
258+
return stream.get() == cudaStreamLegacy or
259+
stream.get() == cudaStreamPerThread or stream.get() == nullptr;
260+
#else
261+
return stream.get() == cudaStreamLegacy or stream.get() == nullptr or
262+
stream.get() == cudaStreamPerThread;
263+
#endif
258264
}),
259265
"Non-default streams not currently supported.");
260266

cpp/benchmarks/synchronization/synchronization.cpp

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@
1919

2020
cuda_event_timer::cuda_event_timer(benchmark::State& state,
2121
bool flush_l2_cache,
22-
rmm::cuda_stream_view stream)
22+
cuda::stream_ref stream)
2323
: stream(stream), p_state(&state)
2424
{
2525
// flush all of L2$
@@ -36,18 +36,18 @@ cuda_event_timer::cuda_event_timer(benchmark::State& state,
3636
RMM_CUDA_TRY(cudaMemsetAsync(l2_cache_buffer.data(),
3737
memset_value,
3838
static_cast<std::size_t>(l2_cache_bytes),
39-
stream.value()));
39+
stream.get()));
4040
}
4141
}
4242

4343
RMM_CUDA_TRY(cudaEventCreate(&start));
4444
RMM_CUDA_TRY(cudaEventCreate(&stop));
45-
RMM_CUDA_TRY(cudaEventRecord(start, stream.value()));
45+
RMM_CUDA_TRY(cudaEventRecord(start, stream.get()));
4646
}
4747

4848
cuda_event_timer::~cuda_event_timer()
4949
{
50-
RMM_CUDA_ASSERT_OK(cudaEventRecord(stop, stream.value()));
50+
RMM_CUDA_ASSERT_OK(cudaEventRecord(stop, stream.get()));
5151
RMM_CUDA_ASSERT_OK(cudaEventSynchronize(stop));
5252

5353
float milliseconds = 0.0F;

cpp/benchmarks/synchronization/synchronization.hpp

Lines changed: 4 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2019-2021, NVIDIA CORPORATION.
2+
* SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION.
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55

@@ -47,9 +47,7 @@
4747

4848
#pragma once
4949

50-
#include <rmm/cuda_stream_view.hpp>
51-
52-
// Google Benchmark library
50+
#include <cuda/stream_ref>
5351
#include <cuda_runtime_api.h>
5452

5553
#include <benchmark/benchmark.h>
@@ -68,7 +66,7 @@ class cuda_event_timer {
6866
*/
6967
cuda_event_timer(benchmark::State& state,
7068
bool flush_l2_cache,
71-
rmm::cuda_stream_view stream = rmm::cuda_stream_default);
69+
cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}});
7270

7371
// The user will HAVE to provide a benchmark::State object to set
7472
// the timer so we disable the default c'tor.
@@ -88,6 +86,6 @@ class cuda_event_timer {
8886
private:
8987
cudaEvent_t start{};
9088
cudaEvent_t stop{};
91-
rmm::cuda_stream_view stream{};
89+
cuda::stream_ref stream{cudaStream_t{nullptr}};
9290
benchmark::State* p_state{};
9391
};

0 commit comments

Comments
 (0)