|
1 | 1 | ## COV_EXCL_START |
2 | 2 |
|
3 | 3 | # TODO |
4 | | -# - serial version for lower latency |
5 | 4 | # - block-stride loop to delay need for second kernel launch |
6 | 5 |
|
7 | 6 | # Reduce a value across a warp |
@@ -134,7 +133,7 @@ function partial_mapreduce_grid(f, op, neutral, Rreduce, Rother, shuffle, R::Abs |
134 | 133 | return |
135 | 134 | end |
136 | 135 |
|
137 | | -function big_mapreduce_kernel(f, op, neutral, Rreduce, Rother, R, As) |
| 136 | +function serial_mapreduce_kernel(f, op, neutral, Rreduce, Rother, R, As) |
138 | 137 | grid_idx = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x |
139 | 138 | @inbounds if grid_idx <= length(Rother) |
140 | 139 | Iother = Rother[grid_idx] |
|
160 | 159 | ## COV_EXCL_STOP |
161 | 160 |
|
162 | 161 | # factored out for use in tests |
163 | | -function big_mapreduce_threshold(dev) |
| 162 | +function serial_mapreduce_threshold(dev) |
164 | 163 | max_concurrency = attribute(dev, DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK) * |
165 | 164 | attribute(dev, DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT) |
166 | 165 | return max_concurrency |
@@ -197,9 +196,9 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, |
197 | 196 | @assert length(Rother) > 0 |
198 | 197 |
|
199 | 198 | # If `Rother` is large enough, then a naive loop is more efficient than partial reductions. |
200 | | - if length(Rother) >= big_mapreduce_threshold(dev) |
| 199 | + if length(Rother) >= serial_mapreduce_threshold(dev) |
201 | 200 | args = (f, op, init, Rreduce, Rother, R, A) |
202 | | - kernel = @cuda launch=false big_mapreduce_kernel(args...) |
| 201 | + kernel = @cuda launch=false serial_mapreduce_kernel(args...) |
203 | 202 | kernel_config = launch_configuration(kernel.fun) |
204 | 203 | threads = kernel_config.threads |
205 | 204 | blocks = cld(length(Rother), threads) |
@@ -255,39 +254,38 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, |
255 | 254 | # we can cover the dimensions to reduce using a single block |
256 | 255 | kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; threads, blocks, shmem) |
257 | 256 | else |
258 | | - # Temporary empty array of the same type for kernel definition |
259 | | - partial = similar(R, ntuple(_ -> 0, Val(ndims(R)+1))) |
| 257 | + # TODO: provide a version that atomically reduces from different blocks |
260 | 258 |
|
261 | | - # NOTE: we can't use the previously-compiled kernel, since the type of `partial` |
262 | | - # might not match the original output container (e.g. if that was a view). |
263 | | - # recalculate kernel configuration for the partial array |
| 259 | + # temporary empty array whose type will match the final partial array |
| 260 | + partial = similar(R, ntuple(_ -> 0, Val(ndims(R)+1))) |
| 261 | + |
| 262 | + # NOTE: we can't use the previously-compiled kernel, or its launch configuration, |
| 263 | + # since the type of `partial` might not match the original output container |
| 264 | + # (e.g. if that was a view). |
264 | 265 | partial_kernel = @cuda launch=false partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), partial, A) |
265 | 266 | partial_kernel_config = launch_configuration(partial_kernel.fun; shmem=compute_shmem∘compute_threads) |
266 | 267 | partial_reduce_threads = compute_threads(partial_kernel_config.threads) |
267 | 268 | partial_reduce_shmem = compute_shmem(partial_reduce_threads) |
268 | | - |
269 | | - # recalculate blocks based on the new thread count |
270 | 269 | partial_reduce_blocks = if other_blocks >= partial_kernel_config.blocks |
271 | 270 | 1 |
272 | 271 | else |
273 | | - min(cld(length(Rreduce), partial_reduce_threads), # how many we need at most |
274 | | - cld(partial_kernel_config.blocks, other_blocks)) # maximize occupancy |
| 272 | + min(cld(length(Rreduce), partial_reduce_threads), |
| 273 | + cld(partial_kernel_config.blocks, other_blocks)) |
275 | 274 | end |
276 | | - |
277 | 275 | partial_threads = partial_reduce_threads |
278 | 276 | partial_shmem = partial_reduce_shmem |
279 | 277 | partial_blocks = partial_reduce_blocks*other_blocks |
280 | 278 |
|
281 | 279 | partial = similar(R, (size(R)..., partial_blocks)) |
282 | | - |
283 | 280 | if init === nothing |
284 | 281 | # without an explicit initializer we need to copy from the output container |
285 | 282 | partial .= R |
286 | 283 | end |
287 | 284 |
|
288 | | - partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A; threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem) |
| 285 | + partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A; |
| 286 | + threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem) |
289 | 287 |
|
290 | | - GPUArrays.mapreducedim!(identity, op, R, partial; init=init) |
| 288 | + GPUArrays.mapreducedim!(identity, op, R, partial; init) |
291 | 289 | end |
292 | 290 |
|
293 | 291 | return R |
|
0 commit comments