Skip to content

Commit 123fc37

Browse files
Milestone: test file using segmented scan compiles
1 parent 409ba1d commit 123fc37

File tree

6 files changed

+576
-12
lines changed

6 files changed

+576
-12
lines changed

cub/cub/agent/agent_segmented_scan.cuh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@ struct AgentSegmentedScan
230230
AccumT exclusive_prefix{};
231231
OffsetT n_chunks = cuda::ceil_div(inp_idx_end - inp_idx_begin, TILE_ITEMS);
232232

233-
InputT thread_values[ITEMS_PER_THREAD] = {};
233+
AccumT thread_values[ITEMS_PER_THREAD] = {};
234234

235235
for (OffsetT chunk_id = 0; chunk_id < n_chunks; ++chunk_id)
236236
{
@@ -245,7 +245,7 @@ struct AgentSegmentedScan
245245
// execute BlockScan
246246
AccumT block_aggregate;
247247
BlockScanT block_scan_algo(temp_storage.scan);
248-
if (IS_INCLUSIVE)
248+
if constexpr (IS_INCLUSIVE)
249249
{
250250
block_scan_algo.InclusiveScan(thread_values, thread_values, scan_op, block_aggregate);
251251
}
@@ -264,9 +264,9 @@ struct AgentSegmentedScan
264264
{
265265
constexpr auto loop_size = static_cast<int>(ITEMS_PER_THREAD);
266266
cuda::static_for<loop_size>([&](int i) {
267-
thread_values[i] += exclusive_prefix;
267+
thread_values[i] = thread_values[i] + exclusive_prefix;
268268
});
269-
exclusive_prefix += block_aggregate;
269+
exclusive_prefix = exclusive_prefix + block_aggregate;
270270
}
271271

272272
// write out scan values using BlockStore
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
// SPDX-License-Identifier: BSD-3-Clause
3+
4+
#pragma once
5+
6+
#include <cub/config.cuh>
7+
8+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
9+
# pragma GCC system_header
10+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
11+
# pragma clang system_header
12+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
13+
# pragma system_header
14+
#endif // no system header
15+
16+
#include <cub/device/dispatch/dispatch_segmented_scan.cuh>
17+
18+
#include <cuda/std/cstdint>
19+
20+
CUB_NAMESPACE_BEGIN
21+
22+
struct DeviceSegmentedScan
23+
{
24+
template <typename InputIteratorT,
25+
typename OutputIteratorT,
26+
typename BeginOffsetIteratorInputT,
27+
typename EndOffsetIteratorInputT,
28+
typename BeginOffsetIteratorOutputT,
29+
typename ScanOpT>
30+
CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSegmentedScan(
31+
void* d_temp_storage,
32+
size_t& temp_storage_bytes,
33+
InputIteratorT d_in,
34+
OutputIteratorT d_out,
35+
::cuda::std::int64_t num_segments,
36+
BeginOffsetIteratorInputT d_in_begin_offsets,
37+
EndOffsetIteratorInputT d_in_end_offsets,
38+
BeginOffsetIteratorOutputT d_out_begin_offsets,
39+
ScanOpT scan_op,
40+
cudaStream_t stream = 0)
41+
{
42+
// defined in cub/config.cuh
43+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedScan::InclusiveSegmentedScan");
44+
45+
using OffsetT =
46+
detail::common_iterator_value_t<BeginOffsetIteratorInputT, EndOffsetIteratorInputT, BeginOffsetIteratorOutputT>;
47+
using integral_offset_check = ::cuda::std::is_integral<OffsetT>;
48+
49+
static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
50+
51+
return cub::DispatchSegmentedScan<
52+
InputIteratorT,
53+
OutputIteratorT,
54+
BeginOffsetIteratorInputT,
55+
EndOffsetIteratorInputT,
56+
BeginOffsetIteratorOutputT,
57+
ScanOpT,
58+
NullType>::Dispatch(d_temp_storage,
59+
temp_storage_bytes,
60+
d_in,
61+
d_out,
62+
num_segments,
63+
d_in_begin_offsets,
64+
d_in_end_offsets,
65+
d_out_begin_offsets,
66+
scan_op,
67+
NullType(),
68+
stream);
69+
}
70+
};
71+
72+
CUB_NAMESPACE_END

0 commit comments

Comments
 (0)