diff --git a/Project.toml b/Project.toml
index 551c9edc1..1582449f2 100644
--- a/Project.toml
+++ b/Project.toml
@@ -11,6 +11,7 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
+ScopedValues = "7e506255-f358-4e82-b7e4-beb19740aa63"
 Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

@@ -23,6 +24,7 @@ LinearAlgebra = "1"
 Printf = "1"
 Random = "1"
 Reexport = "1"
+ScopedValues = "1"
 Serialization = "1"
 Statistics = "1"
 julia = "1.10"
diff --git a/docs/make.jl b/docs/make.jl
index 72828e3bb..a37b0cd9b 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -20,6 +20,7 @@ function main()
             "Test suite" => "testsuite.md",
         ],
         doctest = true,
+        warnonly = [:missing_docs],
     )

     deploydocs(
diff --git a/docs/src/interface.md b/docs/src/interface.md
index 239bef879..9e4864ada 100644
--- a/docs/src/interface.md
+++ b/docs/src/interface.md
@@ -10,7 +10,7 @@ Device functionality is then handled by [KernelAbstractions.jl](https://github.c

 You should provide an array type that builds on the `AbstractGPUArray` supertype, such as:

-```
+```julia
 mutable struct CustomArray{T, N} <: AbstractGPUArray{T, N}
     data::DataRef{Vector{UInt8}}
     offset::Int
@@ -23,10 +23,17 @@ end
 This will allow your defined type (in this case `JLArray`) to use the GPUArrays interface where available.
 To be able to actually use the functionality that is defined for `AbstractGPUArray`s, you need to define the backend, like so:

-```
+```julia
 import KernelAbstractions: Backend
 struct CustomBackend <: KernelAbstractions.GPU
 KernelAbstractions.get_backend(a::CA) where CA <: CustomArray = CustomBackend()
 ```

 There are numerous examples of potential interfaces for GPUArrays, such as with [JLArrays](https://github.com/JuliaGPU/GPUArrays.jl/blob/master/lib/JLArrays/src/JLArrays.jl), [CuArrays](https://github.com/JuliaGPU/CUDA.jl/blob/master/src/gpuarrays.jl), and [ROCArrays](https://github.com/JuliaGPU/AMDGPU.jl/blob/master/src/gpuarrays.jl).
+
+## Caching Allocator
+
+```@docs
+GPUArrays.@cached
+GPUArrays.@uncached
+```
diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl
index dbec3d259..d36e9af2a 100644
--- a/lib/JLArrays/src/JLArrays.jl
+++ b/lib/JLArrays/src/JLArrays.jl
@@ -88,12 +88,16 @@ mutable struct JLArray{T, N} <: AbstractGPUArray{T, N}
     function JLArray{T,N}(::UndefInitializer, dims::Dims{N}) where {T,N}
         check_eltype(T)
         maxsize = prod(dims) * sizeof(T)
-        data = Vector{UInt8}(undef, maxsize)
-        ref = DataRef(data) do data
-            resize!(data, 0)
-        end
-        obj = new{T,N}(ref, 0, dims)
-        finalizer(unsafe_free!, obj)
+
+        return GPUArrays.cached_alloc((JLArray, T, dims)) do
+            data = Vector{UInt8}(undef, maxsize)
+            ref = DataRef(data) do data
+                resize!(data, 0)
+            end
+            obj = new{T, N}(ref, 0, dims)
+            finalizer(unsafe_free!, obj)
+            return obj
+        end::JLArray{T, N}
     end

     # low-level constructor for wrapping existing data
@@ -102,6 +106,7 @@ mutable struct JLArray{T, N} <: AbstractGPUArray{T, N}
         check_eltype(T)
         obj = new{T,N}(ref, offset, dims)
         finalizer(unsafe_free!, obj)
+        return obj
     end
 end
diff --git a/src/GPUArrays.jl b/src/GPUArrays.jl
index 418b87b55..8c1fc14ee 100644
--- a/src/GPUArrays.jl
+++ b/src/GPUArrays.jl
@@ -34,6 +34,7 @@ include("host/random.jl")
 include("host/quirks.jl")
 include("host/uniformscaling.jl")
 include("host/statistics.jl")
+include("host/alloc_cache.jl")


 end # module
diff --git a/src/host/alloc_cache.jl b/src/host/alloc_cache.jl
new file mode 100644
index 000000000..7ed435de4
--- /dev/null
+++ b/src/host/alloc_cache.jl
@@ -0,0 +1,164 @@
+using ..GPUArrays
+
+@static if VERSION < v"1.11"
+    using ScopedValues
+else
+    using Base.ScopedValues
+end
+
+mutable struct AllocCache
+    lock::ReentrantLock
+    busy::Dict{UInt64, Vector{Any}} # hash(key) => GPUArray[]
+    free::Dict{UInt64, Vector{Any}}
+
+    function AllocCache()
+        cache = new(
+            ReentrantLock(),
+            Dict{UInt64, Vector{Any}}(),
+            Dict{UInt64, Vector{Any}}()
+        )
+        return finalizer(unsafe_free!, cache)
+    end
+end
+
+function get_pool!(cache::AllocCache, pool::Symbol, uid::UInt64)
+    pool = getproperty(cache, pool)
+    uid_pool = get(pool, uid, nothing)
+    if uid_pool ≡ nothing
+        uid_pool = Base.@lock cache.lock pool[uid] = Any[]
+    end
+    return uid_pool
+end
+
+function cached_alloc(f, key)
+    cache = ALLOC_CACHE[]
+    if cache === nothing
+        return f()
+    end
+
+    x = nothing
+    uid = hash(key)
+
+    busy_pool = get_pool!(cache, :busy, uid)
+    free_pool = get_pool!(cache, :free, uid)
+    isempty(free_pool) && (x = f())
+
+    while !isempty(free_pool) && x ≡ nothing
+        tmp = Base.@lock cache.lock pop!(free_pool)
+        # Array was manually freed via `unsafe_free!`.
+        GPUArrays.storage(tmp).freed && continue
+        x = tmp
+    end
+
+    x ≡ nothing && (x = f())
+    Base.@lock cache.lock push!(busy_pool, x)
+    return x
+end
+
+function free_busy!(cache::AllocCache)
+    for uid in keys(cache.busy)
+        busy_pool = get_pool!(cache, :busy, uid)
+        isempty(busy_pool) && continue
+
+        Base.@lock cache.lock begin
+            free_pool = get_pool!(cache, :free, uid)
+            append!(free_pool, busy_pool)
+            empty!(busy_pool)
+        end
+    end
+    return
+end
+
+function unsafe_free!(cache::AllocCache)
+    Base.@lock cache.lock begin
+        for (_, pool) in cache.busy
+            isempty(pool) || error(
+                "Invalidating allocations cache that's currently in use. " *
+                "Invalidating inside `@cached` is not allowed."
+            )
+        end
+        for (_, pool) in cache.free
+            map(unsafe_free!, pool)
+        end
+        empty!(cache.free)
+    end
+    return
+end
+
+function Base.sizeof(cache::AllocCache)
+    sz = UInt64(0)
+    Base.@lock cache.lock begin
+        for kind in (cache.free, cache.busy), (_, pool) in kind
+            sz += sum(sizeof, pool; init = UInt64(0))
+        end
+    end
+    return sz
+end
+
+function Base.show(io::IO, cache::AllocCache)
+    sz, n_free, n_busy = Base.@lock cache.lock begin
+        sz = sizeof(cache)
+        n_free = sum(p -> length(p[2]), cache.free; init = 0)
+        n_busy = sum(p -> length(p[2]), cache.busy; init = 0)
+        sz, n_free, n_busy
+    end
+    return print(io, "AllocCache(n_free=$n_free, n_busy=$n_busy, sizeof=$(Base.format_bytes(sz)))")
+end
+
+const ALLOC_CACHE = ScopedValue{Union{Nothing, AllocCache}}(nothing)
+
+"""
+    @cached(cache, expr)
+
+Evaluate `expr` using the allocations cache `cache`.
+
+When GPU memory is allocated during the execution of `expr`, `cache` will first be checked.
+If no memory is available in the cache, a new allocation will be requested.
+
+After the execution of `expr`, all allocations made under the scope of `@cached` will be
+cached within `cache` for future use. This is useful to avoid relying on the GC to free GPU
+memory in time.
+
+Once `cache` goes out of scope, or when the user calls `unsafe_free!` on it, all cached
+allocations will be freed.
+
+# Example
+
+In the following example, each iteration of the for-loop requires 8 GiB of GPU memory.
+Without caching those allocations, significant pressure would be put on the GC, resulting
+in high memory usage and latency. By using the allocations cache, the memory usage is stable:
+
+```julia
+cache = GPUArrays.AllocCache()
+for i in 1:1000
+    GPUArrays.@cached cache begin
+        sin.(CUDA.rand(Float32, 1024^3))
+    end
+end
+
+# optionally: free the memory now, instead of waiting for the GC to collect `cache`
+GPUArrays.unsafe_free!(cache)
+```
+
+See [`@uncached`](@ref).
+"""
+macro cached(cache, expr)
+    return quote
+        res = @with $(esc(ALLOC_CACHE)) => $(esc(cache)) $(esc(expr))
+        free_busy!($(esc(cache)))
+        res
+    end
+end
+
+"""
+    @uncached(expr)
+
+Evaluate expression `expr` without using the allocation cache. This is useful to call from
+within `@cached` to avoid caching some allocations, e.g., because they can be returned out
+of the `@cached` scope.
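+
+# Example
+
+A minimal sketch combining both macros (as in the [`@cached`](@ref) example, `CUDA.rand`
+stands in for any GPU allocation): the temporary array is cached for reuse, while the
+result that escapes the `@cached` block is not:
+
+```julia
+cache = GPUArrays.AllocCache()
+result = GPUArrays.@cached cache begin
+    tmp = sin.(CUDA.rand(Float32, 1024^2))  # this allocation is cached for reuse
+    GPUArrays.@uncached copy(tmp)           # not cached, so it can safely outlive the block
+end
+```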
+"""
+macro uncached(expr)
+    return quote
+        @with $(esc(ALLOC_CACHE)) => nothing $(esc(expr))
+    end
+end
diff --git a/test/testsuite.jl b/test/testsuite.jl
index 179c824b1..e138dabe6 100644
--- a/test/testsuite.jl
+++ b/test/testsuite.jl
@@ -93,6 +93,7 @@ include("testsuite/math.jl")
 include("testsuite/random.jl")
 include("testsuite/uniformscaling.jl")
 include("testsuite/statistics.jl")
+include("testsuite/alloc_cache.jl")

 """
 Runs the entire GPUArrays test suite on array type `AT`
diff --git a/test/testsuite/alloc_cache.jl b/test/testsuite/alloc_cache.jl
new file mode 100644
index 000000000..b032c8bda
--- /dev/null
+++ b/test/testsuite/alloc_cache.jl
@@ -0,0 +1,43 @@
+@testsuite "alloc cache" (AT, eltypes) -> begin
+    if AT <: AbstractGPUArray
+        cache = GPUArrays.AllocCache()
+
+        T, dims = Float32, (1, 2, 3)
+        GPUArrays.@cached cache begin
+            x1 = AT(zeros(T, dims))
+        end
+        @test sizeof(cache) == sizeof(T) * prod(dims)
+        key = first(keys(cache.free))
+        @test length(cache.free[key]) == 1
+        @test length(cache.busy[key]) == 0
+        @test x1 === cache.free[key][1]
+
+        # Second allocation hits cache.
+        GPUArrays.@cached cache begin
+            x2 = AT(zeros(T, dims))
+            # Does not hit the cache.
+            GPUArrays.@uncached x_free = AT(zeros(T, dims))
+        end
+        @test sizeof(cache) == sizeof(T) * prod(dims)
+        key = first(keys(cache.free))
+        @test length(cache.free[key]) == 1
+        @test length(cache.busy[key]) == 0
+        @test x2 === cache.free[key][1]
+        @test x_free !== x2
+
+        # Third allocation is of different shape - allocates.
+        dims = (2, 2)
+        GPUArrays.@cached cache begin
+            x3 = AT(zeros(T, dims))
+        end
+        _keys = collect(keys(cache.free))
+        key2 = _keys[findfirst(i -> i != key, _keys)]
+        @test length(cache.free[key]) == 1
+        @test length(cache.free[key2]) == 1
+        @test x3 === cache.free[key2][1]
+
+        # Freeing all memory held by cache.
+        GPUArrays.unsafe_free!(cache)
+        @test sizeof(cache) == 0
+    end
+end