Skip to content

Commit e2737a8

Browse files
[CUB] Add a env-based overload of the device segmented reductions primitives (#6674)
* [CUB][device_segmented_reduce] add a env-based overload of device_segmented_reduce * [DO NOT MERGE] sample code for env-based device reduce * static assert determinism * Update cub/cub/device/device_segmented_reduce.cuh Co-authored-by: Srinivas Yadav <[email protected]> * Add tests for the env-based segmented sum * Adress suggestions * Applied pre-commit * 1st attempt at extracting tuning from env * add unit test for tuning * Add proper error handling to the new tests --------- Co-authored-by: Srinivas Yadav <[email protected]>
1 parent 0e4f648 commit e2737a8

3 files changed

Lines changed: 414 additions & 0 deletions

File tree

cub/cub/device/device_segmented_reduce.cuh

Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
#endif // no system header
2020

2121
#include <cub/detail/choose_offset.cuh>
22+
#include <cub/detail/device_memory_resource.cuh>
23+
#include <cub/detail/temporary_storage.cuh>
2224
#include <cub/device/dispatch/dispatch_fixed_size_segmented_reduce.cuh>
2325
#include <cub/device/dispatch/dispatch_segmented_reduce.cuh>
2426
#include <cub/iterator/arg_index_input_iterator.cuh>
@@ -27,19 +29,51 @@
2729
#include <thrust/iterator/counting_iterator.h>
2830
#include <thrust/iterator/transform_iterator.h>
2931

32+
#include <cuda/__execution/determinism.h>
33+
#include <cuda/__execution/require.h>
3034
#include <cuda/__functional/maximum.h>
3135
#include <cuda/__functional/minimum.h>
36+
#include <cuda/__memory_resource/get_memory_resource.h>
37+
#include <cuda/__stream/get_stream.h>
38+
#include <cuda/__stream/stream_ref.h>
39+
#include <cuda/std/__execution/env.h>
3240
#include <cuda/std/__functional/operations.h>
3341
#include <cuda/std/__iterator/iterator_traits.h>
42+
#include <cuda/std/__type_traits/conditional.h>
3443
#include <cuda/std/__type_traits/integral_constant.h>
3544
#include <cuda/std/__type_traits/is_integral.h>
45+
#include <cuda/std/__type_traits/is_same.h>
3646
#include <cuda/std/__type_traits/void_t.h>
3747
#include <cuda/std/__utility/pair.h>
3848
#include <cuda/std/cstdint>
3949
#include <cuda/std/limits>
4050

4151
CUB_NAMESPACE_BEGIN
4252

53+
namespace detail
{
namespace segmented_reduce
{
//! Environment query tag: asking an execution environment for this yields the
//! user-provided segmented-reduce tuning, if one was attached via `__tune`.
struct get_tuning_query_t
{};

//! CRTP base for segmented-reduce tunings. Any tuning deriving from
//! `tuning<Derived>` answers `get_tuning_query_t` by returning itself,
//! which is how `Sum` extracts it from the environment.
template <class Derived>
struct tuning
{
  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(const get_tuning_query_t&) const noexcept -> Derived
  {
    return static_cast<const Derived&>(*this);
  }
};

//! Tuning selected when the environment carries none; defers to the
//! generic reduce policy hub.
struct default_tuning : tuning<default_tuning>
{
  template <class AccumT, class Offset, class OpT>
  using fn = detail::reduce::policy_hub<AccumT, Offset, OpT>;
};
} // namespace segmented_reduce
} // namespace detail
76+
4377
//! @rst
4478
//! DeviceSegmentedReduce provides device-wide, parallel operations for
4579
//! computing a reduction across multiple sequences of data items
@@ -408,6 +442,190 @@ struct DeviceSegmentedReduce
408442
_CCCL_UNREACHABLE();
409443
}
410444

445+
//! @rst
//! Computes a device-wide segmented sum using the addition (``+``) operator.
//!
//! - Uses ``0`` as the initial value of the reduction for each segment.
//! - When input a contiguous sequence of segments, a single sequence
//!   ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
//!   for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
//!   the latter is specified as ``segment_offsets + 1``).
//! - Does not support ``+`` operators that are non-commutative.
//! - Let ``s`` be in ``[0, num_segments)``. The range
//!   ``[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])`` shall not
//!   overlap ``[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])``,
//!   ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
//!   ``[d_end_offsets, d_end_offsets + num_segments)``.
//! - Can use a specific stream or cuda memory resource through the `env` parameter
//! - @devicestorage
//!
//! Snippet
//! +++++++++++++++++++++++++++++++++++++++++++++
//!
//! The code snippet below illustrates the sum reduction of a device vector of ``int`` data elements.
//!
//! .. literalinclude:: ../../../cub/test/catch2_test_device_segmented_reduce_api.cu
//!     :language: c++
//!     :dedent:
//!     :start-after: example-begin segmented-reduce-sum-env
//!     :end-before: example-end segmented-reduce-sum-env
//!
//! @endrst
//!
//! @tparam InputIteratorT
//!   **[inferred]** Random-access input iterator type for reading input items @iterator
//!
//! @tparam OutputIteratorT
//!   **[inferred]** Output iterator type for recording the reduced aggregate @iterator
//!
//! @tparam BeginOffsetIteratorT
//!   **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator
//!
//! @tparam EndOffsetIteratorT
//!   **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator
//!
//! @tparam EnvT
//!   **[inferred]** Execution environment type. Default is `cuda::std::execution::env<>`.
//!
//! @param[in] d_in
//!   Pointer to the input sequence of data items
//!
//! @param[out] d_out
//!   Pointer to the output aggregate
//!
//! @param[in] num_segments
//!   The number of segments that comprise the segmented reduction data
//!
//! @param[in] d_begin_offsets
//!   @rst
//!   Random-access input iterator to the sequence of beginning offsets of
//!   length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
//!   element of the *i*\ :sup:`th` data segment in ``d_in``
//!   @endrst
//!
//! @param[in] d_end_offsets
//!   @rst
//!   Random-access input iterator to the sequence of ending offsets of length
//!   ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
//!   the *i*\ :sup:`th` data segment in ``d_in``.
//!   If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty.
//!   @endrst
//!
//! @param[in] env
//!   @rst
//!   **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
//!   @endrst
template <typename InputIteratorT,
          typename OutputIteratorT,
          typename BeginOffsetIteratorT,
          typename EndOffsetIteratorT,
          typename = ::cuda::std::void_t<typename ::cuda::std::iterator_traits<BeginOffsetIteratorT>::value_type,
                                         typename ::cuda::std::iterator_traits<EndOffsetIteratorT>::value_type>,
          typename EnvT = ::cuda::std::execution::env<>>
CUB_RUNTIME_FUNCTION static cudaError_t
Sum(InputIteratorT d_in,
    OutputIteratorT d_out,
    ::cuda::std::int64_t num_segments,
    BeginOffsetIteratorT d_begin_offsets,
    EndOffsetIteratorT d_end_offsets,
    EnvT env = {})
{
  _CCCL_NVTX_RANGE_SCOPE("cub::DeviceSegmentedReduce::Sum");

  using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
  using OutputT = detail::non_void_value_t<OutputIteratorT, detail::it_value_t<InputIteratorT>>;
  using init_t  = OutputT;
  using AccumT  = ::cuda::std::__accumulator_t<::cuda::std::plus<>, cub::detail::it_value_t<InputIteratorT>, init_t>;

  // Pull a user-provided tuning out of the environment, falling back to the default policy hub
  using segmented_reduce_tuning_t = ::cuda::std::execution::
    __query_result_or_t<EnvT, detail::segmented_reduce::get_tuning_query_t, detail::segmented_reduce::default_tuning>;

  using policy_t = typename segmented_reduce_tuning_t::template fn<AccumT, OffsetT, ::cuda::std::plus<>>;

  using requirements_t = ::cuda::std::execution::
    __query_result_or_t<EnvT, ::cuda::execution::__get_requirements_t, ::cuda::std::execution::env<>>;

  using requested_determinism_t =
    ::cuda::std::execution::__query_result_or_t<requirements_t, //
                                                ::cuda::execution::determinism::__get_determinism_t,
                                                ::cuda::execution::determinism::run_to_run_t>;

  using dispatch_t = DispatchSegmentedReduce<
    InputIteratorT,
    OutputIteratorT,
    BeginOffsetIteratorT,
    EndOffsetIteratorT,
    OffsetT,
    ::cuda::std::plus<>,
    init_t,
    AccumT,
    policy_t>;

  // Static assert to reject gpu_to_gpu determinism since it's not properly implemented atm
  static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>,
                "gpu_to_gpu determinism is not supported for device segmented reductions ");

  static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
  if constexpr (::cuda::std::is_integral_v<OffsetT>)
  {
    // Stream and memory resource come from the environment when present
    auto stream = ::cuda::std::execution::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}});
    auto mr =
      ::cuda::std::execution::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{});

    void* d_temp_storage      = nullptr;
    size_t temp_storage_bytes = 0;

    // Query the required temporary storage size
    cudaError_t error = dispatch_t::Dispatch(
      d_temp_storage,
      temp_storage_bytes,
      d_in,
      d_out,
      num_segments,
      d_begin_offsets,
      d_end_offsets,
      ::cuda::std::plus<>{},
      init_t{}, // zero-initialize
      stream.get());
    if (error != cudaSuccess)
    {
      return error;
    }

    // TODO(gevtushenko): use uninitialized buffer when it's available
    error = CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr));
    if (error != cudaSuccess)
    {
      return error;
    }

    // Run the algorithm
    error = dispatch_t::Dispatch(
      d_temp_storage,
      temp_storage_bytes,
      d_in,
      d_out,
      num_segments,
      d_begin_offsets,
      d_end_offsets,
      ::cuda::std::plus<>{},
      init_t{}, // zero-initialize
      stream.get());

    // Try to deallocate regardless of the error to avoid memory leaks
    cudaError_t deallocate_error =
      CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr));

    if (error != cudaSuccess)
    {
      // Reduction error takes precedence over deallocation error since it happens first
      return error;
    }
    return deallocate_error;
  }
  _CCCL_UNREACHABLE();
}
628+
411629
//! @rst
412630
//! Computes a device-wide segmented sum using the addition (``+``) operator.
413631
//!
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
// SPDX-License-Identifier: BSD-3-Clause
3+
4+
#include "insert_nested_NVTX_range_guard.h"
5+
6+
#include <cub/device/device_segmented_reduce.cuh>
7+
8+
#include <thrust/device_vector.h>
9+
10+
#include <c2h/catch2_test_helper.h>
11+
12+
//! Test tuning that pins the reduce block size to `BlockThreads`, so the test
//! can verify that a tuning attached to the environment is actually consumed.
template <int BlockThreads>
struct reduce_tuning : cub::detail::reduce::tuning<reduce_tuning<BlockThreads>>
{
  template <class /* AccumT */, class /* Offset */, class /* OpT */>
  struct fn
  {
    struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500>
    {
      struct ReducePolicy
      {
        static constexpr int VECTOR_LOAD_LENGTH = 1;

        static constexpr cub::BlockReduceAlgorithm BLOCK_ALGORITHM = cub::BLOCK_REDUCE_WARP_REDUCTIONS;

        static constexpr cub::CacheLoadModifier LOAD_MODIFIER = cub::LOAD_DEFAULT;

        static constexpr int ITEMS_PER_THREAD = 1;
        static constexpr int BLOCK_THREADS    = BlockThreads;
      };

      using SingleTilePolicy      = ReducePolicy;
      using SegmentedReducePolicy = ReducePolicy;
    };

    using MaxPolicy = Policy500;
  };
};
39+
40+
struct get_scan_tuning_query_t
41+
{};
42+
43+
struct scan_tuning
44+
{
45+
[[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(const get_scan_tuning_query_t&) const noexcept
46+
{
47+
return *this;
48+
}
49+
50+
// Make sure this is not used
51+
template <class /* AccumT */, class /* Offset */, class /* OpT */>
52+
struct fn
53+
{};
54+
};
55+
56+
using block_sizes = c2h::type_list<cuda::std::integral_constant<int, 32>, cuda::std::integral_constant<int, 64>>;

C2H_TEST("Device segmented sum can be tuned", "[reduce][device]", block_sizes)
{
  constexpr int target_block_size = c2h::get<0, TestType>::value;

  // Segments: [0, 3) -> 21, [3, 3) -> 0 (empty), [3, 7) -> 17
  int num_segments                      = 3;
  thrust::device_vector<int> d_offsets  = {0, 3, 3, 7};
  auto d_offsets_it                     = thrust::raw_pointer_cast(d_offsets.data());
  thrust::device_vector<int> d_in{8, 6, 7, 5, 3, 0, 9};
  thrust::device_vector<int> d_out(3);

  // We are expecting that `scan_tuning` is ignored
  auto env = cuda::execution::__tune(reduce_tuning<target_block_size>{}, scan_tuning{});

  auto error =
    cub::DeviceSegmentedReduce::Sum(d_in.begin(), d_out.begin(), num_segments, d_offsets_it, d_offsets_it + 1, env);

  // Check the error code first: if the launch failed, report that directly
  // instead of a confusing output-mismatch failure.
  REQUIRE(error == cudaSuccess);

  thrust::device_vector<int> expected{21, 0, 17};
  REQUIRE(d_out == expected);
}

0 commit comments

Comments
 (0)