From a4ef3ab3cd961bdcedfdd9e4aebb91487a69ef13 Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Sat, 14 Dec 2024 23:54:38 +0200 Subject: [PATCH 1/9] Use GPUArrays cache alloc --- src/CUDA.jl | 4 ++++ src/array.jl | 20 +++++++++++++++++--- src/cache_allocator.jl | 13 +++++++++++++ 3 files changed, 34 insertions(+), 3 deletions(-) create mode 100644 src/cache_allocator.jl diff --git a/src/CUDA.jl b/src/CUDA.jl index e22d65a806..dc015302ac 100644 --- a/src/CUDA.jl +++ b/src/CUDA.jl @@ -1,5 +1,7 @@ module CUDA +using Base.ScopedValues + using GPUCompiler using GPUArrays @@ -123,6 +125,8 @@ include("CUDAKernels.jl") import .CUDAKernels: CUDABackend export CUDABackend +include("cache_allocator.jl") + # StaticArrays is still a direct dependency, so directly include the extension include("../ext/StaticArraysExt.jl") diff --git a/src/array.jl b/src/array.jl index 30657625b2..b74e64aed7 100644 --- a/src/array.jl +++ b/src/array.jl @@ -71,9 +71,23 @@ mutable struct CuArray{T,N,M} <: AbstractGPUArray{T,N} else maxsize end - data = DataRef(pool_free, pool_alloc(M, bufsize)) - obj = new{T,N,M}(data, maxsize, 0, dims) - finalizer(unsafe_free!, obj) + + name = CacheAllocatorName[] + # Do not use caching allocator if it is not set or + # the buffer is not a device memory. + obj = if !(M <: DeviceMemory) || name == :none + data = DataRef(pool_free, pool_alloc(M, bufsize)) + obj = new{T,N,M}(data, maxsize, 0, dims) + finalizer(unsafe_free!, obj) + else + cache = GPUArrays.named_cache_allocator!(CuCacheAllocator, CUDA.device(), name) + x = GPUArrays.alloc!(cache, T, dims) do + data = DataRef(pool_free, pool_alloc(M, bufsize)) + obj = new{T,N,M}(data, maxsize, 0, dims) + return finalizer(unsafe_free!, obj) + end + end + return obj end function CuArray{T,N}(data::DataRef{Managed{M}}, dims::Dims{N}; diff --git a/src/cache_allocator.jl b/src/cache_allocator.jl new file mode 100644 index 0000000000..edd6cc8146 --- /dev/null +++ b/src/cache_allocator.jl @@ -0,0 +1,13 @@ +const CacheAllocatorName = ScopedValue(:none) + +const CuCacheAllocator = GPUArrays.PerDeviceCacheAllocator(CuArray) + +GPUArrays.cache_alloc_scope(::CUDABackend) = CacheAllocatorName + +GPUArrays.cache_allocator(::CUDABackend) = CuCacheAllocator + +GPUArrays.free_busy_cache_alloc!(pdcache::GPUArrays.PerDeviceCacheAllocator{CuArray}, name::Symbol) = + GPUArrays.free_busy!(GPUArrays.named_cache_allocator!(pdcache, CUDA.device(), name)) + +GPUArrays.invalidate_cache_allocator!(pdcache::GPUArrays.PerDeviceCacheAllocator{CuArray}, name::Symbol) = + GPUArrays.invalidate_cache_allocator!(pdcache, CUDA.device(), name) From b8194cd0b9c0c08f69dec7a9a89fae3217120180 Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Tue, 17 Dec 2024 14:52:18 +0200 Subject: [PATCH 2/9] upd --- src/array.jl | 20 +++++++++----------- src/cache_allocator.jl | 12 ++---------- 2 files changed, 11 insertions(+), 21 deletions(-) diff --git a/src/array.jl b/src/array.jl index b74e64aed7..fde4f385e3 100644 --- a/src/array.jl +++ b/src/array.jl @@ -72,22 +72,20 @@ mutable struct CuArray{T,N,M} <: AbstractGPUArray{T,N} maxsize end - name = CacheAllocatorName[] - # Do not use caching allocator if it is not set or - # the buffer is not a device memory. - obj = if !(M <: DeviceMemory) || name == :none + function _alloc_f() data = DataRef(pool_free, pool_alloc(M, bufsize)) obj = new{T,N,M}(data, maxsize, 0, dims) finalizer(unsafe_free!, obj) + end + + name = GPUArrays.CacheAllocatorName[] + # Do not use caching allocator if it is not set or + # the buffer is not a device memory. + return if !(M <: DeviceMemory) || name == :none + _alloc_f() else - cache = GPUArrays.named_cache_allocator!(CuCacheAllocator, CUDA.device(), name) - x = GPUArrays.alloc!(cache, T, dims) do - data = DataRef(pool_free, pool_alloc(M, bufsize)) - obj = new{T,N,M}(data, maxsize, 0, dims) - return finalizer(unsafe_free!, obj) - end + GPUArrays.alloc!(_alloc_f, CUDABackend(), name, T, dims)::CuArray{T, N, M} end - return obj end function CuArray{T,N}(data::DataRef{Managed{M}}, dims::Dims{N}; diff --git a/src/cache_allocator.jl b/src/cache_allocator.jl index edd6cc8146..a9256cefe8 100644 --- a/src/cache_allocator.jl +++ b/src/cache_allocator.jl @@ -1,13 +1,5 @@ -const CacheAllocatorName = ScopedValue(:none) - -const CuCacheAllocator = GPUArrays.PerDeviceCacheAllocator(CuArray) - -GPUArrays.cache_alloc_scope(::CUDABackend) = CacheAllocatorName +const CuCacheAllocator = GPUArrays.PerDeviceCacheAllocator(CuArray; free_immediately=true) GPUArrays.cache_allocator(::CUDABackend) = CuCacheAllocator -GPUArrays.free_busy_cache_alloc!(pdcache::GPUArrays.PerDeviceCacheAllocator{CuArray}, name::Symbol) = - GPUArrays.free_busy!(GPUArrays.named_cache_allocator!(pdcache, CUDA.device(), name)) - -GPUArrays.invalidate_cache_allocator!(pdcache::GPUArrays.PerDeviceCacheAllocator{CuArray}, name::Symbol) = - GPUArrays.invalidate_cache_allocator!(pdcache, CUDA.device(), name) +GPUArrays.device(::CUDABackend) = CUDA.device() From 6a275a7fd4f16b31eb6019ec5bf206baaa2e84a1 Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Tue, 17 Dec 2024 23:26:50 +0200 Subject: [PATCH 3/9] cleanup --- src/CUDA.jl | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/CUDA.jl b/src/CUDA.jl index dc015302ac..89c1e7841f 100644 --- a/src/CUDA.jl +++ b/src/CUDA.jl @@ -1,7 +1,5 @@ module CUDA -using Base.ScopedValues - using GPUCompiler using GPUArrays From 6c35632d3e6e5ca5aa2be3032fe160d3b99e8090 Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Mon, 6 Jan 2025 21:49:46 +0200 Subject: [PATCH 4/9] Update --- src/array.jl | 8 +++----- src/cache_allocator.jl | 6 +++--- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/array.jl b/src/array.jl index fde4f385e3..14fe9e0168 100644 --- a/src/array.jl +++ b/src/array.jl @@ -78,13 +78,11 @@ mutable struct CuArray{T,N,M} <: AbstractGPUArray{T,N} finalizer(unsafe_free!, obj) end - name = GPUArrays.CacheAllocatorName[] - # Do not use caching allocator if it is not set or - # the buffer is not a device memory. - return if !(M <: DeviceMemory) || name == :none + name = GPUArrays.AllocCache.CacheAllocatorName[] + return if name ≡ nothing _alloc_f() else - GPUArrays.alloc!(_alloc_f, CUDABackend(), name, T, dims)::CuArray{T, N, M} + GPUArrays.AllocCache.alloc!(_alloc_f, CuArray, name, (M, T, dims))::CuArray{T, N, M} end end diff --git a/src/cache_allocator.jl b/src/cache_allocator.jl index a9256cefe8..4b01f95bbc 100644 --- a/src/cache_allocator.jl +++ b/src/cache_allocator.jl @@ -1,5 +1,5 @@ -const CuCacheAllocator = GPUArrays.PerDeviceCacheAllocator(CuArray; free_immediately=true) +const CuCacheAllocator = GPUArrays.AllocCache.PerDeviceCacheAllocator(CuArray; free_immediately=true) -GPUArrays.cache_allocator(::CUDABackend) = CuCacheAllocator +GPUArrays.AllocCache.cache_allocator(::Type{<: CuArray}) = CuCacheAllocator -GPUArrays.device(::CUDABackend) = CUDA.device() +GPUArrays.AllocCache.device(::Type{<: CuArray}) = CUDA.device() From 12497f5ac24a4efc31d2c1abc734a95fed5d3362 Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Tue, 7 Jan 2025 13:47:41 +0200 Subject: [PATCH 5/9] Update --- src/cache_allocator.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cache_allocator.jl b/src/cache_allocator.jl index 4b01f95bbc..4464f270ac 100644 --- a/src/cache_allocator.jl +++ b/src/cache_allocator.jl @@ -1,4 +1,4 @@ -const CuCacheAllocator = GPUArrays.AllocCache.PerDeviceCacheAllocator(CuArray; free_immediately=true) +const CuCacheAllocator = GPUArrays.AllocCache.PerDeviceCacheAllocator(CuArray) GPUArrays.AllocCache.cache_allocator(::Type{<: CuArray}) = CuCacheAllocator From f65176ae929e1c0bf2177569b1f64efbc40427bf Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Thu, 9 Jan 2025 11:31:58 +0200 Subject: [PATCH 6/9] Update to latest alloc cache --- src/CUDA.jl | 2 -- src/array.jl | 7 ++++--- src/cache_allocator.jl | 5 ----- 3 files changed, 4 insertions(+), 10 deletions(-) delete mode 100644 src/cache_allocator.jl diff --git a/src/CUDA.jl b/src/CUDA.jl index 89c1e7841f..e22d65a806 100644 --- a/src/CUDA.jl +++ b/src/CUDA.jl @@ -123,8 +123,6 @@ include("CUDAKernels.jl") import .CUDAKernels: CUDABackend export CUDABackend -include("cache_allocator.jl") - # StaticArrays is still a direct dependency, so directly include the extension include("../ext/StaticArraysExt.jl") diff --git a/src/array.jl b/src/array.jl index 14fe9e0168..e201db155b 100644 --- a/src/array.jl +++ b/src/array.jl @@ -78,11 +78,12 @@ mutable struct CuArray{T,N,M} <: AbstractGPUArray{T,N} finalizer(unsafe_free!, obj) end - name = GPUArrays.AllocCache.CacheAllocatorName[] - return if name ≡ nothing + cache = GPUArrays.ALLOC_CACHE[] + return if cache ≡ nothing _alloc_f() else - GPUArrays.AllocCache.alloc!(_alloc_f, CuArray, name, (M, T, dims))::CuArray{T, N, M} + cache_key = (CuArray, CUDA.device(), M, T, dims) + GPUArrays.alloc!(_alloc_f, cache, cache_key)::CuArray{T, N, M} end end diff --git a/src/cache_allocator.jl b/src/cache_allocator.jl deleted file mode 100644 index 4464f270ac..0000000000 --- a/src/cache_allocator.jl +++ /dev/null @@ -1,5 +0,0 @@ -const CuCacheAllocator = GPUArrays.AllocCache.PerDeviceCacheAllocator(CuArray) - -GPUArrays.AllocCache.cache_allocator(::Type{<: CuArray}) = CuCacheAllocator - -GPUArrays.AllocCache.device(::Type{<: CuArray}) = CUDA.device() From de1177579a03ee1f86f08b82b030c8a7646fe4e9 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 9 Jan 2025 11:58:47 +0100 Subject: [PATCH 7/9] Update --- src/array.jl | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/array.jl b/src/array.jl index e201db155b..3e734f8865 100644 --- a/src/array.jl +++ b/src/array.jl @@ -72,19 +72,12 @@ mutable struct CuArray{T,N,M} <: AbstractGPUArray{T,N} maxsize end - function _alloc_f() + GPUArrays.cached_alloc((CuArray, CUDA.device(), T, dims, M)) do data = DataRef(pool_free, pool_alloc(M, bufsize)) obj = new{T,N,M}(data, maxsize, 0, dims) finalizer(unsafe_free!, obj) - end - - cache = GPUArrays.ALLOC_CACHE[] - return if cache ≡ nothing - _alloc_f() - else - cache_key = (CuArray, CUDA.device(), M, T, dims) - GPUArrays.alloc!(_alloc_f, cache, cache_key)::CuArray{T, N, M} - end + return obj + end::CuArray{T, N, M} end function CuArray{T,N}(data::DataRef{Managed{M}}, dims::Dims{N}; From 56d9fcc06efd3c2b0f5372cf03070a8cb92540b7 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 9 Jan 2025 12:59:25 +0100 Subject: [PATCH 8/9] Bump GPUArrays. --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 92e45311c8..09e7c05dce 100644 --- a/Project.toml +++ b/Project.toml @@ -59,7 +59,7 @@ Crayons = "4" DataFrames = "1" EnzymeCore = "0.8.2" ExprTools = "0.1" -GPUArrays = "11.1" +GPUArrays = "11.2" GPUCompiler = "0.24, 0.25, 0.26, 0.27, 1" KernelAbstractions = "0.9.2" LLVM = "9.1" From c4c363931178b828ed5a0bbfaf207c0495f08cc6 Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Thu, 9 Jan 2025 14:32:54 +0200 Subject: [PATCH 9/9] Use bufsize in cache key --- src/array.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/array.jl b/src/array.jl index 3e734f8865..3213a91652 100644 --- a/src/array.jl +++ b/src/array.jl @@ -72,7 +72,7 @@ mutable struct CuArray{T,N,M} <: AbstractGPUArray{T,N} maxsize end - GPUArrays.cached_alloc((CuArray, CUDA.device(), T, dims, M)) do + GPUArrays.cached_alloc((CuArray, CUDA.device(), T, bufsize, M)) do data = DataRef(pool_free, pool_alloc(M, bufsize)) obj = new{T,N,M}(data, maxsize, 0, dims) finalizer(unsafe_free!, obj)