Skip to content
Merged
178 changes: 178 additions & 0 deletions cub/cub/device/device_segmented_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,184 @@ struct DeviceSegmentedReduce
_CCCL_UNREACHABLE();
}

//! @rst
//! Computes a device-wide segmented sum using the addition (``+``) operator.
//!
//! - Uses ``0`` as the initial value of the reduction for each segment.
//! - When input a contiguous sequence of segments, a single sequence
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
//! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
//! the latter is specified as ``segment_offsets + 1``).
//! - Does not support ``+`` operators that are non-commutative.
//! - Let ``s`` be in ``[0, num_segments)``. The range
//! ``[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])`` shall not
//! overlap ``[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])``,
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
//! ``[d_end_offsets, d_end_offsets + num_segments)``.
//! - Can use a specific stream or cuda memory resource through the `env` parameter
//! - @devicestorage
//!
//! Snippet
//! +++++++++++++++++++++++++++++++++++++++++++++
//! TODO (a new snippet)
//! The code snippet below illustrates the sum reduction of a device vector of ``int`` data elements.
//!
//! .. literalinclude:: ../../../cub/test/catch2_test_device_segmented_reduce_api.cu
//! :language: c++
//! :dedent:
//! :start-after: example-begin segmented-reduce-sum-env-stream
//! :end-before: example-end segmented-reduce-sum-env-stream
//!
//! @endrst
//!
//! @tparam InputIteratorT
//! **[inferred]** Random-access input iterator type for reading input items @iterator
//!
//! @tparam OutputIteratorT
//! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
//!
//! @tparam BeginOffsetIteratorT
//! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator
//!
//! @tparam EndOffsetIteratorT
//! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator
//!
//! @tparam EnvT
//! **[inferred]** Execution environment type. Default is `cuda::std::execution::env<>`.
//!
//! @param[in] d_in
//! Pointer to the input sequence of data items
//!
//! @param[out] d_out
//! Pointer to the output aggregate
//!
//! @param[in] num_segments
//! The number of segments that comprise the segmented reduction data
//!
//! @param[in] d_begin_offsets
//! @rst
//! Random-access input iterator to the sequence of beginning offsets of
//! length ``num_segments`, such that ``d_begin_offsets[i]`` is the first
//! element of the *i*\ :sup:`th` data segment in ``d_in``
//! @endrst
//!
//! @param[in] d_end_offsets
//! @rst
//! Random-access input iterator to the sequence of ending offsets of length
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
//! the *i*\ :sup:`th` data segment in ``d_in``.
//! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty.
//! @endrst
//!
//! @param[in] env
//! @rst
//! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
//! @endrst
template <typename InputIteratorT,
typename OutputIteratorT,
typename BeginOffsetIteratorT,
typename EndOffsetIteratorT,
typename = ::cuda::std::void_t<typename ::cuda::std::iterator_traits<BeginOffsetIteratorT>::value_type,
typename ::cuda::std::iterator_traits<EndOffsetIteratorT>::value_type>,
typename EnvT = ::cuda::std::execution::env<>>
CUB_RUNTIME_FUNCTION static cudaError_t
Sum(InputIteratorT d_in,
OutputIteratorT d_out,
::cuda::std::int64_t num_segments,
BeginOffsetIteratorT d_begin_offsets,
EndOffsetIteratorT d_end_offsets,
EnvT env = {})
{
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceSegmentedReduce::Sum");

using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
using OutputT = detail::non_void_value_t<OutputIteratorT, detail::it_value_t<InputIteratorT>>;
using init_t = OutputT;

using requirements_t = ::cuda::std::execution::
__query_result_or_t<EnvT, ::cuda::execution::__get_requirements_t, ::cuda::std::execution::env<>>;
using requested_determinism_t =
::cuda::std::execution::__query_result_or_t<requirements_t, //
::cuda::execution::determinism::__get_determinism_t,
::cuda::execution::determinism::run_to_run_t>;

// Static assert to reject gpu_to_gpu determinism since it's not properly implemented
static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>,
"gpu_to_gpu determinism is not supported for device segmented reductions ");

static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
if constexpr (::cuda::std::is_integral_v<OffsetT>)
{
auto stream = ::cuda::std::execution::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}});
auto mr =
::cuda::std::execution::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{});

void* d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;

// Query the required temporary storage size
cudaError_t error = DispatchSegmentedReduce<
InputIteratorT,
OutputIteratorT,
BeginOffsetIteratorT,
EndOffsetIteratorT,
OffsetT,
::cuda::std::plus<>,
init_t>::Dispatch(d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_segments,
d_begin_offsets,
d_end_offsets,
::cuda::std::plus<>{},
init_t{}, // zero-initialize
stream.get());
if (error != cudaSuccess)
{
return error;
}

// TODO(gevtushenko): use uninitialized buffer whenit's available
error = CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr));
if (error != cudaSuccess)
{
return error;
}

// Run the algorithm
error = DispatchSegmentedReduce<
InputIteratorT,
OutputIteratorT,
BeginOffsetIteratorT,
EndOffsetIteratorT,
OffsetT,
::cuda::std::plus<>,
init_t>::Dispatch(d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_segments,
d_begin_offsets,
d_end_offsets,
::cuda::std::plus<>{},
init_t{}, // zero-initialize
stream.get());

// Try to deallocate regardless of the error to avoid memory leaks
cudaError_t deallocate_error =
CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr));

if (error != cudaSuccess)
{
// Reduction error takes precedence over deallocation error since it happens first
return error;
}
return deallocate_error;
}
_CCCL_UNREACHABLE();
}

//! @rst
//! Computes a device-wide segmented sum using the addition (``+``) operator.
//!
Expand Down
77 changes: 77 additions & 0 deletions cub/test/catch2_test_device_segmented_reduce_env_api.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

#include "insert_nested_NVTX_range_guard.h"
// above header needs to be included first

#include <cub/device/device_segmented_reduce.cuh>

#include <thrust/device_vector.h>

#include <cuda/__execution/determinism.h>
#include <cuda/__execution/require.h>

#include <cuda/experimental/memory_resource.cuh>

#include <c2h/catch2_test_helper.h>

C2H_TEST("cub::DeviceSegmentedReduce::Sum accepts stream", "[segmented_reduce][env]")
{
// example-begin segmented-reduce-sum-env-stream
int num_segments = 3;
c2h::device_vector<int> d_offsets = {0, 3, 3, 7};
auto d_offsets_it = thrust::raw_pointer_cast(d_offsets.data());
c2h::device_vector<int> d_in{8, 6, 7, 5, 3, 0, 9};
c2h::device_vector<int> d_out(3);

cudaStream_t legacy_stream = 0;
cuda::stream_ref stream_ref{legacy_stream};

auto error = cub::DeviceSegmentedReduce::Sum(
d_in.begin(), d_out.begin(), num_segments, d_offsets_it, d_offsets_it + 1, stream_ref);
c2h::device_vector<int> expected{21, 0, 17};

// example-end segmented-reduce-reduce-sum-env-stream
REQUIRE(d_out == expected);
REQUIRE(error == cudaSuccess);
}

C2H_TEST("cub::DeviceSegmentedReduce::Sum accepts not_guaranteed determinism requirements", "[segmented_reduce][env]")
{
// example-begin segmented-reduce-sum-env-determinism
int num_segments = 3;
c2h::device_vector<int> d_offsets = {0, 3, 3, 7};
auto d_offsets_it = thrust::raw_pointer_cast(d_offsets.data());
c2h::device_vector<int> d_in{8, 6, 7, 5, 3, 0, 9};
c2h::device_vector<int> d_out(3);

auto env = cuda::execution::require(cuda::execution::determinism::run_to_run);

auto error =
cub::DeviceSegmentedReduce::Sum(d_in.begin(), d_out.begin(), num_segments, d_offsets_it, d_offsets_it + 1, env);
c2h::device_vector<int> expected{21, 0, 17};

// example-end segmented-reduce-reduce-sum-env-determinism
REQUIRE(d_out == expected);
REQUIRE(error == cudaSuccess);
}

C2H_TEST("cub::DeviceSegmentedReduce::Sum accepts not_guaranteed determinism requirements", "[segmented_reduce][env]")
{
// example-begin segmented-reduce-sum-env-non-determinism
int num_segments = 3;
c2h::device_vector<int> d_offsets = {0, 3, 3, 7};
auto d_offsets_it = thrust::raw_pointer_cast(d_offsets.data());
c2h::device_vector<int> d_in{8, 6, 7, 5, 3, 0, 9};
c2h::device_vector<int> d_out(3);

auto env = cuda::execution::require(cuda::execution::determinism::not_guaranteed);

auto error =
cub::DeviceSegmentedReduce::Sum(d_in.begin(), d_out.begin(), num_segments, d_offsets_it, d_offsets_it + 1, env);
c2h::device_vector<int> expected{21, 0, 17};

// example-end segmented-reduce-reduce-sum-env-non-determinism
REQUIRE(d_out == expected);
REQUIRE(error == cudaSuccess);
}