Skip to content

Commit e2737a8

Browse files
[CUB] Add a env-based overload of the device segmented reductions primitives (#6674)
* [CUB][device_segmented_reduce] add a env-based overload of device_segmented_reduce * [DO NOT MERGE] sample code for env-based device reduce * static assert determinism * Update cub/cub/device/device_segmented_reduce.cuh Co-authored-by: Srinivas Yadav <[email protected]> * Add tests for the env-based segmented sum * Adress suggestions * Applied pre-commit * 1st attempt at extracting tuning from env * add unit test for tuning * Add proper error handling to the new tests --------- Co-authored-by: Srinivas Yadav <[email protected]>
1 parent 0e4f648 commit e2737a8

3 files changed

Lines changed: 414 additions & 0 deletions

File tree

cub/cub/device/device_segmented_reduce.cuh

Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
#endif // no system header
2020

2121
#include <cub/detail/choose_offset.cuh>
22+
#include <cub/detail/device_memory_resource.cuh>
23+
#include <cub/detail/temporary_storage.cuh>
2224
#include <cub/device/dispatch/dispatch_fixed_size_segmented_reduce.cuh>
2325
#include <cub/device/dispatch/dispatch_segmented_reduce.cuh>
2426
#include <cub/iterator/arg_index_input_iterator.cuh>
@@ -27,19 +29,51 @@
2729
#include <thrust/iterator/counting_iterator.h>
2830
#include <thrust/iterator/transform_iterator.h>
2931

32+
#include <cuda/__execution/determinism.h>
33+
#include <cuda/__execution/require.h>
3034
#include <cuda/__functional/maximum.h>
3135
#include <cuda/__functional/minimum.h>
36+
#include <cuda/__memory_resource/get_memory_resource.h>
37+
#include <cuda/__stream/get_stream.h>
38+
#include <cuda/__stream/stream_ref.h>
39+
#include <cuda/std/__execution/env.h>
3240
#include <cuda/std/__functional/operations.h>
3341
#include <cuda/std/__iterator/iterator_traits.h>
42+
#include <cuda/std/__type_traits/conditional.h>
3443
#include <cuda/std/__type_traits/integral_constant.h>
3544
#include <cuda/std/__type_traits/is_integral.h>
45+
#include <cuda/std/__type_traits/is_same.h>
3646
#include <cuda/std/__type_traits/void_t.h>
3747
#include <cuda/std/__utility/pair.h>
3848
#include <cuda/std/cstdint>
3949
#include <cuda/std/limits>
4050

4151
CUB_NAMESPACE_BEGIN
4252

53+
namespace detail
{
namespace segmented_reduce
{
//! Environment query tag: asking an execution environment for this yields the
//! user-provided segmented-reduce tuning, if one was attached via `__tune`.
struct get_tuning_query_t
{};

//! CRTP base for segmented-reduce tunings. Any tuning deriving from
//! `tuning<Derived>` answers `get_tuning_query_t` by returning itself,
//! which is how `Sum` extracts it from the environment.
template <class Derived>
struct tuning
{
  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(const get_tuning_query_t&) const noexcept -> Derived
  {
    return static_cast<const Derived&>(*this);
  }
};

//! Tuning selected when the environment carries none; defers to the
//! generic reduce policy hub.
struct default_tuning : tuning<default_tuning>
{
  template <class AccumT, class Offset, class OpT>
  using fn = detail::reduce::policy_hub<AccumT, Offset, OpT>;
};
} // namespace segmented_reduce
} // namespace detail
76+
4377
//! @rst
4478
//! DeviceSegmentedReduce provides device-wide, parallel operations for
4579
//! computing a reduction across multiple sequences of data items
@@ -408,6 +442,190 @@ struct DeviceSegmentedReduce
408442
_CCCL_UNREACHABLE();
409443
}
410444

445+
//! @rst
//! Computes a device-wide segmented sum using the addition (``+``) operator.
//!
//! - Uses ``0`` as the initial value of the reduction for each segment.
//! - When input a contiguous sequence of segments, a single sequence
//!   ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
//!   for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
//!   the latter is specified as ``segment_offsets + 1``).
//! - Does not support ``+`` operators that are non-commutative.
//! - Let ``s`` be in ``[0, num_segments)``. The range
//!   ``[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])`` shall not
//!   overlap ``[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])``,
//!   ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
//!   ``[d_end_offsets, d_end_offsets + num_segments)``.
//! - Can use a specific stream or cuda memory resource through the `env` parameter
//! - @devicestorage
//!
//! Snippet
//! +++++++++++++++++++++++++++++++++++++++++++++
//!
//! The code snippet below illustrates the sum reduction of a device vector of ``int`` data elements.
//!
//! .. literalinclude:: ../../../cub/test/catch2_test_device_segmented_reduce_api.cu
//!     :language: c++
//!     :dedent:
//!     :start-after: example-begin segmented-reduce-sum-env
//!     :end-before: example-end segmented-reduce-sum-env
//!
//! @endrst
//!
//! @tparam InputIteratorT
//!   **[inferred]** Random-access input iterator type for reading input items @iterator
//!
//! @tparam OutputIteratorT
//!   **[inferred]** Output iterator type for recording the reduced aggregate @iterator
//!
//! @tparam BeginOffsetIteratorT
//!   **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator
//!
//! @tparam EndOffsetIteratorT
//!   **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator
//!
//! @tparam EnvT
//!   **[inferred]** Execution environment type. Default is `cuda::std::execution::env<>`.
//!
//! @param[in] d_in
//!   Pointer to the input sequence of data items
//!
//! @param[out] d_out
//!   Pointer to the output aggregate
//!
//! @param[in] num_segments
//!   The number of segments that comprise the segmented reduction data
//!
//! @param[in] d_begin_offsets
//!   @rst
//!   Random-access input iterator to the sequence of beginning offsets of
//!   length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
//!   element of the *i*\ :sup:`th` data segment in ``d_in``
//!   @endrst
//!
//! @param[in] d_end_offsets
//!   @rst
//!   Random-access input iterator to the sequence of ending offsets of length
//!   ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
//!   the *i*\ :sup:`th` data segment in ``d_in``.
//!   If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty.
//!   @endrst
//!
//! @param[in] env
//!   @rst
//!   **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
//!   @endrst
template <typename InputIteratorT,
          typename OutputIteratorT,
          typename BeginOffsetIteratorT,
          typename EndOffsetIteratorT,
          typename = ::cuda::std::void_t<typename ::cuda::std::iterator_traits<BeginOffsetIteratorT>::value_type,
                                         typename ::cuda::std::iterator_traits<EndOffsetIteratorT>::value_type>,
          typename EnvT = ::cuda::std::execution::env<>>
CUB_RUNTIME_FUNCTION static cudaError_t
Sum(InputIteratorT d_in,
    OutputIteratorT d_out,
    ::cuda::std::int64_t num_segments,
    BeginOffsetIteratorT d_begin_offsets,
    EndOffsetIteratorT d_end_offsets,
    EnvT env = {})
{
  _CCCL_NVTX_RANGE_SCOPE("cub::DeviceSegmentedReduce::Sum");

  using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
  using OutputT = detail::non_void_value_t<OutputIteratorT, detail::it_value_t<InputIteratorT>>;
  using init_t  = OutputT;
  using AccumT  = ::cuda::std::__accumulator_t<::cuda::std::plus<>, cub::detail::it_value_t<InputIteratorT>, init_t>;

  // Pull a user-provided tuning out of the environment, falling back to the default policy hub
  using segmented_reduce_tuning_t = ::cuda::std::execution::
    __query_result_or_t<EnvT, detail::segmented_reduce::get_tuning_query_t, detail::segmented_reduce::default_tuning>;

  using policy_t = typename segmented_reduce_tuning_t::template fn<AccumT, OffsetT, ::cuda::std::plus<>>;

  using requirements_t = ::cuda::std::execution::
    __query_result_or_t<EnvT, ::cuda::execution::__get_requirements_t, ::cuda::std::execution::env<>>;

  using requested_determinism_t =
    ::cuda::std::execution::__query_result_or_t<requirements_t, //
                                                ::cuda::execution::determinism::__get_determinism_t,
                                                ::cuda::execution::determinism::run_to_run_t>;

  using dispatch_t = DispatchSegmentedReduce<
    InputIteratorT,
    OutputIteratorT,
    BeginOffsetIteratorT,
    EndOffsetIteratorT,
    OffsetT,
    ::cuda::std::plus<>,
    init_t,
    AccumT,
    policy_t>;

  // Static assert to reject gpu_to_gpu determinism since it's not properly implemented atm
  static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>,
                "gpu_to_gpu determinism is not supported for device segmented reductions ");

  static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
  if constexpr (::cuda::std::is_integral_v<OffsetT>)
  {
    // Stream and memory resource come from the environment when present
    auto stream = ::cuda::std::execution::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}});
    auto mr =
      ::cuda::std::execution::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{});

    void* d_temp_storage      = nullptr;
    size_t temp_storage_bytes = 0;

    // Query the required temporary storage size
    cudaError_t error = dispatch_t::Dispatch(
      d_temp_storage,
      temp_storage_bytes,
      d_in,
      d_out,
      num_segments,
      d_begin_offsets,
      d_end_offsets,
      ::cuda::std::plus<>{},
      init_t{}, // zero-initialize
      stream.get());
    if (error != cudaSuccess)
    {
      return error;
    }

    // TODO(gevtushenko): use uninitialized buffer when it's available
    error = CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr));
    if (error != cudaSuccess)
    {
      return error;
    }

    // Run the algorithm
    error = dispatch_t::Dispatch(
      d_temp_storage,
      temp_storage_bytes,
      d_in,
      d_out,
      num_segments,
      d_begin_offsets,
      d_end_offsets,
      ::cuda::std::plus<>{},
      init_t{}, // zero-initialize
      stream.get());

    // Try to deallocate regardless of the error to avoid memory leaks
    cudaError_t deallocate_error =
      CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr));

    if (error != cudaSuccess)
    {
      // Reduction error takes precedence over deallocation error since it happens first
      return error;
    }
    return deallocate_error;
  }
  _CCCL_UNREACHABLE();
}
628+
411629
//! @rst
412630
//! Computes a device-wide segmented sum using the addition (``+``) operator.
413631
//!
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
// SPDX-License-Identifier: BSD-3-Clause
3+
4+
#include "insert_nested_NVTX_range_guard.h"
5+
6+
#include <cub/device/device_segmented_reduce.cuh>
7+
8+
#include <thrust/device_vector.h>
9+
10+
#include <c2h/catch2_test_helper.h>
11+
12+
//! Test tuning that pins the reduce block size to `BlockThreads`, so the test
//! can verify that a tuning attached to the environment is actually consumed.
template <int BlockThreads>
struct reduce_tuning : cub::detail::reduce::tuning<reduce_tuning<BlockThreads>>
{
  template <class /* AccumT */, class /* Offset */, class /* OpT */>
  struct fn
  {
    struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500>
    {
      struct ReducePolicy
      {
        static constexpr int VECTOR_LOAD_LENGTH = 1;

        static constexpr cub::BlockReduceAlgorithm BLOCK_ALGORITHM = cub::BLOCK_REDUCE_WARP_REDUCTIONS;

        static constexpr cub::CacheLoadModifier LOAD_MODIFIER = cub::LOAD_DEFAULT;

        static constexpr int ITEMS_PER_THREAD = 1;
        static constexpr int BLOCK_THREADS    = BlockThreads;
      };

      using SingleTilePolicy      = ReducePolicy;
      using SegmentedReducePolicy = ReducePolicy;
    };

    using MaxPolicy = Policy500;
  };
};
39+
40+
struct get_scan_tuning_query_t
41+
{};
42+
43+
struct scan_tuning
44+
{
45+
[[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(const get_scan_tuning_query_t&) const noexcept
46+
{
47+
return *this;
48+
}
49+
50+
// Make sure this is not used
51+
template <class /* AccumT */, class /* Offset */, class /* OpT */>
52+
struct fn
53+
{};
54+
};
55+
56+
using block_sizes = c2h::type_list<cuda::std::integral_constant<int, 32>, cuda::std::integral_constant<int, 64>>;

C2H_TEST("Device segmented sum can be tuned", "[reduce][device]", block_sizes)
{
  constexpr int target_block_size = c2h::get<0, TestType>::value;

  // Segments: [0, 3) -> 21, [3, 3) -> 0 (empty), [3, 7) -> 17
  int num_segments                      = 3;
  thrust::device_vector<int> d_offsets  = {0, 3, 3, 7};
  auto d_offsets_it                     = thrust::raw_pointer_cast(d_offsets.data());
  thrust::device_vector<int> d_in{8, 6, 7, 5, 3, 0, 9};
  thrust::device_vector<int> d_out(3);

  // We are expecting that `scan_tuning` is ignored
  auto env = cuda::execution::__tune(reduce_tuning<target_block_size>{}, scan_tuning{});

  auto error =
    cub::DeviceSegmentedReduce::Sum(d_in.begin(), d_out.begin(), num_segments, d_offsets_it, d_offsets_it + 1, env);

  // Check the error code first: if the launch failed, report that directly
  // instead of a confusing output-mismatch failure.
  REQUIRE(error == cudaSuccess);

  thrust::device_vector<int> expected{21, 0, 17};
  REQUIRE(d_out == expected);
}

0 commit comments

Comments
 (0)