|
19 | 19 | #endif // no system header |
20 | 20 |
|
21 | 21 | #include <cub/detail/choose_offset.cuh> |
| 22 | +#include <cub/detail/device_memory_resource.cuh> |
| 23 | +#include <cub/detail/temporary_storage.cuh> |
22 | 24 | #include <cub/device/dispatch/dispatch_fixed_size_segmented_reduce.cuh> |
23 | 25 | #include <cub/device/dispatch/dispatch_segmented_reduce.cuh> |
24 | 26 | #include <cub/iterator/arg_index_input_iterator.cuh> |
|
27 | 29 | #include <thrust/iterator/counting_iterator.h> |
28 | 30 | #include <thrust/iterator/transform_iterator.h> |
29 | 31 |
|
| 32 | +#include <cuda/__execution/determinism.h> |
| 33 | +#include <cuda/__execution/require.h> |
30 | 34 | #include <cuda/__functional/maximum.h> |
31 | 35 | #include <cuda/__functional/minimum.h> |
| 36 | +#include <cuda/__memory_resource/get_memory_resource.h> |
| 37 | +#include <cuda/__stream/get_stream.h> |
| 38 | +#include <cuda/__stream/stream_ref.h> |
| 39 | +#include <cuda/std/__execution/env.h> |
32 | 40 | #include <cuda/std/__functional/operations.h> |
33 | 41 | #include <cuda/std/__iterator/iterator_traits.h> |
| 42 | +#include <cuda/std/__type_traits/conditional.h> |
34 | 43 | #include <cuda/std/__type_traits/integral_constant.h> |
35 | 44 | #include <cuda/std/__type_traits/is_integral.h> |
| 45 | +#include <cuda/std/__type_traits/is_same.h> |
36 | 46 | #include <cuda/std/__type_traits/void_t.h> |
37 | 47 | #include <cuda/std/__utility/pair.h> |
38 | 48 | #include <cuda/std/cstdint> |
39 | 49 | #include <cuda/std/limits> |
40 | 50 |
|
41 | 51 | CUB_NAMESPACE_BEGIN |
42 | 52 |
|
| 53 | +namespace detail |
| 54 | +{ |
| 55 | +namespace segmented_reduce |
| 56 | +{ |
| 57 | +struct get_tuning_query_t |
| 58 | +{}; |
| 59 | + |
| 60 | +template <class Derived> |
| 61 | +struct tuning |
| 62 | +{ |
| 63 | + [[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(const get_tuning_query_t&) const noexcept -> Derived |
| 64 | + { |
| 65 | + return static_cast<const Derived&>(*this); |
| 66 | + } |
| 67 | +}; |
| 68 | + |
| 69 | +struct default_tuning : tuning<default_tuning> |
| 70 | +{ |
| 71 | + template <class AccumT, class Offset, class OpT> |
| 72 | + using fn = detail::reduce::policy_hub<AccumT, Offset, OpT>; |
| 73 | +}; |
| 74 | +} // namespace segmented_reduce |
| 75 | +} // namespace detail |
| 76 | + |
43 | 77 | //! @rst |
44 | 78 | //! DeviceSegmentedReduce provides device-wide, parallel operations for |
45 | 79 | //! computing a reduction across multiple sequences of data items |
@@ -408,6 +442,190 @@ struct DeviceSegmentedReduce |
408 | 442 | _CCCL_UNREACHABLE(); |
409 | 443 | } |
410 | 444 |
|
| 445 | + //! @rst |
| 446 | + //! Computes a device-wide segmented sum using the addition (``+``) operator. |
| 447 | + //! |
| 448 | + //! - Uses ``0`` as the initial value of the reduction for each segment. |
| 449 | + //! - When input a contiguous sequence of segments, a single sequence |
| 450 | + //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased |
| 451 | + //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where |
| 452 | + //! the latter is specified as ``segment_offsets + 1``). |
| 453 | + //! - Does not support ``+`` operators that are non-commutative. |
| 454 | + //! - Let ``s`` be in ``[0, num_segments)``. The range |
| 455 | + //! ``[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])`` shall not |
| 456 | + //! overlap ``[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])``, |
| 457 | + //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor |
| 458 | + //! ``[d_end_offsets, d_end_offsets + num_segments)``. |
| 459 | + //! - Can use a specific stream or cuda memory resource through the `env` parameter |
| 460 | + //! - @devicestorage |
| 461 | + //! |
| 462 | + //! Snippet |
| 463 | + //! +++++++++++++++++++++++++++++++++++++++++++++ |
| 464 | + //! |
| 465 | + //! The code snippet below illustrates the sum reduction of a device vector of ``int`` data elements. |
| 466 | + //! |
| 467 | + //! .. literalinclude:: ../../../cub/test/catch2_test_device_segmented_reduce_api.cu |
| 468 | + //! :language: c++ |
| 469 | + //! :dedent: |
| 470 | + //! :start-after: example-begin segmented-reduce-sum-env |
| 471 | + //! :end-before: example-end segmented-reduce-sum-env |
| 472 | + //! |
| 473 | + //! @endrst |
| 474 | + //! |
| 475 | + //! @tparam InputIteratorT |
| 476 | + //! **[inferred]** Random-access input iterator type for reading input items @iterator |
| 477 | + //! |
| 478 | + //! @tparam OutputIteratorT |
| 479 | + //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator |
| 480 | + //! |
| 481 | + //! @tparam BeginOffsetIteratorT |
| 482 | + //! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator |
| 483 | + //! |
| 484 | + //! @tparam EndOffsetIteratorT |
| 485 | + //! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator |
| 486 | + //! |
| 487 | + //! @tparam EnvT |
| 488 | + //! **[inferred]** Execution environment type. Default is `cuda::std::execution::env<>`. |
| 489 | + //! |
| 490 | + //! @param[in] d_in |
| 491 | + //! Pointer to the input sequence of data items |
| 492 | + //! |
| 493 | + //! @param[out] d_out |
| 494 | + //! Pointer to the output aggregate |
| 495 | + //! |
| 496 | + //! @param[in] num_segments |
| 497 | + //! The number of segments that comprise the segmented reduction data |
| 498 | + //! |
| 499 | + //! @param[in] d_begin_offsets |
| 500 | + //! @rst |
| 501 | + //! Random-access input iterator to the sequence of beginning offsets of |
| 502 | + //! length ``num_segments`, such that ``d_begin_offsets[i]`` is the first |
| 503 | + //! element of the *i*\ :sup:`th` data segment in ``d_in`` |
| 504 | + //! @endrst |
| 505 | + //! |
| 506 | + //! @param[in] d_end_offsets |
| 507 | + //! @rst |
| 508 | + //! Random-access input iterator to the sequence of ending offsets of length |
| 509 | + //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of |
| 510 | + //! the *i*\ :sup:`th` data segment in ``d_in``. |
| 511 | + //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. |
| 512 | + //! @endrst |
| 513 | + //! |
| 514 | + //! @param[in] env |
| 515 | + //! @rst |
| 516 | + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. |
| 517 | + //! @endrst |
| 518 | + template <typename InputIteratorT, |
| 519 | + typename OutputIteratorT, |
| 520 | + typename BeginOffsetIteratorT, |
| 521 | + typename EndOffsetIteratorT, |
| 522 | + typename = ::cuda::std::void_t<typename ::cuda::std::iterator_traits<BeginOffsetIteratorT>::value_type, |
| 523 | + typename ::cuda::std::iterator_traits<EndOffsetIteratorT>::value_type>, |
| 524 | + typename EnvT = ::cuda::std::execution::env<>> |
| 525 | + CUB_RUNTIME_FUNCTION static cudaError_t |
| 526 | + Sum(InputIteratorT d_in, |
| 527 | + OutputIteratorT d_out, |
| 528 | + ::cuda::std::int64_t num_segments, |
| 529 | + BeginOffsetIteratorT d_begin_offsets, |
| 530 | + EndOffsetIteratorT d_end_offsets, |
| 531 | + EnvT env = {}) |
| 532 | + { |
| 533 | + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceSegmentedReduce::Sum"); |
| 534 | + |
| 535 | + using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>; |
| 536 | + using OutputT = detail::non_void_value_t<OutputIteratorT, detail::it_value_t<InputIteratorT>>; |
| 537 | + using init_t = OutputT; |
| 538 | + using AccumT = ::cuda::std::__accumulator_t<::cuda::std::plus<>, cub::detail::it_value_t<InputIteratorT>, init_t>; |
| 539 | + |
| 540 | + using segmented_reduce_tuning_t = ::cuda::std::execution:: |
| 541 | + __query_result_or_t<EnvT, detail::segmented_reduce::get_tuning_query_t, detail::segmented_reduce::default_tuning>; |
| 542 | + |
| 543 | + using policy_t = typename segmented_reduce_tuning_t::template fn<AccumT, OffsetT, ::cuda::std::plus<>>; |
| 544 | + |
| 545 | + using requirements_t = ::cuda::std::execution:: |
| 546 | + __query_result_or_t<EnvT, ::cuda::execution::__get_requirements_t, ::cuda::std::execution::env<>>; |
| 547 | + |
| 548 | + using requested_determinism_t = |
| 549 | + ::cuda::std::execution::__query_result_or_t<requirements_t, // |
| 550 | + ::cuda::execution::determinism::__get_determinism_t, |
| 551 | + ::cuda::execution::determinism::run_to_run_t>; |
| 552 | + |
| 553 | + using dispatch_t = DispatchSegmentedReduce< |
| 554 | + InputIteratorT, |
| 555 | + OutputIteratorT, |
| 556 | + BeginOffsetIteratorT, |
| 557 | + EndOffsetIteratorT, |
| 558 | + OffsetT, |
| 559 | + ::cuda::std::plus<>, |
| 560 | + init_t, |
| 561 | + AccumT, |
| 562 | + policy_t>; |
| 563 | + |
| 564 | + // Static assert to reject gpu_to_gpu determinism since it's not properly implemented atm |
| 565 | + static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>, |
| 566 | + "gpu_to_gpu determinism is not supported for device segmented reductions "); |
| 567 | + |
| 568 | + static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral."); |
| 569 | + if constexpr (::cuda::std::is_integral_v<OffsetT>) |
| 570 | + { |
| 571 | + auto stream = ::cuda::std::execution::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}}); |
| 572 | + auto mr = |
| 573 | + ::cuda::std::execution::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{}); |
| 574 | + |
| 575 | + void* d_temp_storage = nullptr; |
| 576 | + size_t temp_storage_bytes = 0; |
| 577 | + |
| 578 | + // Query the required temporary storage size |
| 579 | + cudaError_t error = dispatch_t::Dispatch( |
| 580 | + d_temp_storage, |
| 581 | + temp_storage_bytes, |
| 582 | + d_in, |
| 583 | + d_out, |
| 584 | + num_segments, |
| 585 | + d_begin_offsets, |
| 586 | + d_end_offsets, |
| 587 | + ::cuda::std::plus<>{}, |
| 588 | + init_t{}, // zero-initialize |
| 589 | + stream.get()); |
| 590 | + if (error != cudaSuccess) |
| 591 | + { |
| 592 | + return error; |
| 593 | + } |
| 594 | + |
| 595 | + // TODO(gevtushenko): use uninitialized buffer whenit's available |
| 596 | + error = CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr)); |
| 597 | + if (error != cudaSuccess) |
| 598 | + { |
| 599 | + return error; |
| 600 | + } |
| 601 | + |
| 602 | + // Run the algorithm |
| 603 | + error = dispatch_t::Dispatch( |
| 604 | + d_temp_storage, |
| 605 | + temp_storage_bytes, |
| 606 | + d_in, |
| 607 | + d_out, |
| 608 | + num_segments, |
| 609 | + d_begin_offsets, |
| 610 | + d_end_offsets, |
| 611 | + ::cuda::std::plus<>{}, |
| 612 | + init_t{}, // zero-initialize |
| 613 | + stream.get()); |
| 614 | + |
| 615 | + // Try to deallocate regardless of the error to avoid memory leaks |
| 616 | + cudaError_t deallocate_error = |
| 617 | + CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr)); |
| 618 | + |
| 619 | + if (error != cudaSuccess) |
| 620 | + { |
| 621 | + // Reduction error takes precedence over deallocation error since it happens first |
| 622 | + return error; |
| 623 | + } |
| 624 | + return deallocate_error; |
| 625 | + } |
| 626 | + _CCCL_UNREACHABLE(); |
| 627 | + } |
| 628 | + |
411 | 629 | //! @rst |
412 | 630 | //! Computes a device-wide segmented sum using the addition (``+``) operator. |
413 | 631 | //! |
|
0 commit comments