Skip to content

Commit 1d78b59

Browse files
committed
Minor clean-up.
1 parent 12d9b8d commit 1d78b59

1 file changed

Lines changed: 16 additions & 18 deletions

File tree

src/mapreduce.jl

Lines changed: 16 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
## COV_EXCL_START
22

33
# TODO
4-
# - serial version for lower latency
54
# - block-stride loop to delay need for second kernel launch
65

76
# Reduce a value across a warp
@@ -134,7 +133,7 @@ function partial_mapreduce_grid(f, op, neutral, Rreduce, Rother, shuffle, R::Abs
134133
return
135134
end
136135

137-
function big_mapreduce_kernel(f, op, neutral, Rreduce, Rother, R, As)
136+
function serial_mapreduce_kernel(f, op, neutral, Rreduce, Rother, R, As)
138137
grid_idx = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x
139138
@inbounds if grid_idx <= length(Rother)
140139
Iother = Rother[grid_idx]
@@ -160,7 +159,7 @@ end
160159
## COV_EXCL_STOP
161160

162161
# factored out for use in tests
163-
function big_mapreduce_threshold(dev)
162+
function serial_mapreduce_threshold(dev)
164163
max_concurrency = attribute(dev, DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK) *
165164
attribute(dev, DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)
166165
return max_concurrency
@@ -197,9 +196,9 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
197196
@assert length(Rother) > 0
198197

199198
# If `Rother` is large enough, then a naive loop is more efficient than partial reductions.
200-
if length(Rother) >= big_mapreduce_threshold(dev)
199+
if length(Rother) >= serial_mapreduce_threshold(dev)
201200
args = (f, op, init, Rreduce, Rother, R, A)
202-
kernel = @cuda launch=false big_mapreduce_kernel(args...)
201+
kernel = @cuda launch=false serial_mapreduce_kernel(args...)
203202
kernel_config = launch_configuration(kernel.fun)
204203
threads = kernel_config.threads
205204
blocks = cld(length(Rother), threads)
@@ -255,39 +254,38 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
255254
# we can cover the dimensions to reduce using a single block
256255
kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; threads, blocks, shmem)
257256
else
258-
# Temporary empty array of the same type for kernel definition
259-
partial = similar(R, ntuple(_ -> 0, Val(ndims(R)+1)))
257+
# TODO: provide a version that atomically reduces from different blocks
260258

261-
# NOTE: we can't use the previously-compiled kernel, since the type of `partial`
262-
# might not match the original output container (e.g. if that was a view).
263-
# recalculate kernel configuration for the partial array
259+
# temporary empty array whose type will match the final partial array
260+
partial = similar(R, ntuple(_ -> 0, Val(ndims(R)+1)))
261+
262+
# NOTE: we can't use the previously-compiled kernel, or its launch configuration,
263+
# since the type of `partial` might not match the original output container
264+
# (e.g. if that was a view).
264265
partial_kernel = @cuda launch=false partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), partial, A)
265266
partial_kernel_config = launch_configuration(partial_kernel.fun; shmem=compute_shmem∘compute_threads)
266267
partial_reduce_threads = compute_threads(partial_kernel_config.threads)
267268
partial_reduce_shmem = compute_shmem(partial_reduce_threads)
268-
269-
# recalculate blocks based on the new thread count
270269
partial_reduce_blocks = if other_blocks >= partial_kernel_config.blocks
271270
1
272271
else
273-
min(cld(length(Rreduce), partial_reduce_threads), # how many we need at most
274-
cld(partial_kernel_config.blocks, other_blocks)) # maximize occupancy
272+
min(cld(length(Rreduce), partial_reduce_threads),
273+
cld(partial_kernel_config.blocks, other_blocks))
275274
end
276-
277275
partial_threads = partial_reduce_threads
278276
partial_shmem = partial_reduce_shmem
279277
partial_blocks = partial_reduce_blocks*other_blocks
280278

281279
partial = similar(R, (size(R)..., partial_blocks))
282-
283280
if init === nothing
284281
# without an explicit initializer we need to copy from the output container
285282
partial .= R
286283
end
287284

288-
partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A; threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem)
285+
partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A;
286+
threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem)
289287

290-
GPUArrays.mapreducedim!(identity, op, R, partial; init=init)
288+
GPUArrays.mapreducedim!(identity, op, R, partial; init)
291289
end
292290

293291
return R

0 commit comments

Comments
 (0)