Observed with RX 6700 XT.
MWE:
using KernelAbstractions
using ROCKernels
using AMDGPU
# Uncomment the three lines below to disable queue pooling (always use the default queue):
# function ROCKernels.next_queue()
#     AMDGPU.default_queue()
# end
# Fill kernel: each work-item stores `v` into the element of `x` at its own global index.
@kernel function memset!(x, v)
    idx = @index(Global)
    x[idx] = v
end
# Benchmark driver: times one `memset!` launch on its own, then 10_000 back-to-back
# launches, waiting on every launch before issuing the next one.
function main()
    dev = ROCDevice()
    buf = ROCArray{Float32}(undef, 1024^2)
    fill_value = one(Float32)

    # Single launch, timed separately from the loop below.
    @time wait(memset!(dev)(buf, fill_value; ndrange=length(buf)))

    # Repeated launches; the kernel is re-instantiated and waited on each iteration.
    launches = 10_000
    @time for _ in 1:launches
        wait(memset!(dev)(buf, fill_value; ndrange=length(buf)))
    end

    return nothing
end

main()
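- With queue pooling (the default); the first line is the initial launch, the second is the 10,000-launch loop:
9.458379 seconds (14.68 M allocations: 967.041 MiB, 3.37% gc time, 3.64% compilation time)
13.647801 seconds (2.31 M allocations: 81.017 MiB, 0.36% gc time)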
- Without queue pooling, using only the single default queue:
9.409779 seconds (14.68 M allocations: 967.038 MiB, 3.32% gc time, 3.48% compilation time)
3.316558 seconds (2.30 M allocations: 80.710 MiB, 1.47% gc time)
Observed with RX 6700 XT.
MWE: