From 51817de51c3ddfd50ecabde5e479f1030c0ccf2b Mon Sep 17 00:00:00 2001 From: Simeon David Schaub Date: Mon, 28 Jul 2025 11:27:03 +0200 Subject: [PATCH 1/6] WIP: Int64 atomics --- lib/intrinsics/src/atomic.jl | 25 +++++++++++++------------ src/compiler/execution.jl | 2 +- test/atomics.jl | 14 ++++++++++++++ 3 files changed, 28 insertions(+), 13 deletions(-) create mode 100644 test/atomics.jl diff --git a/lib/intrinsics/src/atomic.jl b/lib/intrinsics/src/atomic.jl index a1c6007d..e90c44ea 100644 --- a/lib/intrinsics/src/atomic.jl +++ b/lib/intrinsics/src/atomic.jl @@ -5,7 +5,7 @@ # "atomic operations on 32-bit signed, unsigned integers and single precision # floating-point to locations in __global or __local memory" -const atomic_integer_types = [UInt32, Int32] +const atomic_integer_types = [UInt32, Int32, UInt64, Int64] # TODO: 64-bit atomics with ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS # TODO: additional floating-point atomics with ZE_extension_float_atomics const atomic_memory_types = [AS.Workgroup, AS.CrossWorkgroup] @@ -14,48 +14,49 @@ const atomic_memory_types = [AS.Workgroup, AS.CrossWorkgroup] # generically typed for gentype in atomic_integer_types, as in atomic_memory_types + atomic = sizeof(gentype) == 4 ? "atomic" : "atom" @eval begin @device_function atomic_add!(p::LLVMPtr{$gentype,$as}, val::$gentype) = - @builtin_ccall("atomic_add", $gentype, + @builtin_ccall($"$(atomic)_add", $gentype, (LLVMPtr{$gentype,$as}, $gentype), p, val) @device_function atomic_sub!(p::LLVMPtr{$gentype,$as}, val::$gentype) = - @builtin_ccall("atomic_sub", $gentype, + @builtin_ccall($"$(atomic)_sub", $gentype, (LLVMPtr{$gentype,$as}, $gentype), p, val) @device_function atomic_inc!(p::LLVMPtr{$gentype,$as}) = - @builtin_ccall("atomic_inc", $gentype, (LLVMPtr{$gentype,$as},), p) + @builtin_ccall($"$(atomic)_inc", $gentype, (LLVMPtr{$gentype,$as},), p) @device_function atomic_dec!(p::LLVMPtr{$gentype,$as}) = - @builtin_ccall("atomic_dec", $gentype, (LLVMPtr{$gentype,$as},), p) + @builtin_ccall($"$(atomic)_dec", $gentype, (LLVMPtr{$gentype,$as},), p) @device_function atomic_min!(p::LLVMPtr{$gentype,$as}, val::$gentype) = - @builtin_ccall("atomic_min", $gentype, + @builtin_ccall($"$(atomic)_min", $gentype, (LLVMPtr{$gentype,$as}, $gentype), p, val) @device_function atomic_max!(p::LLVMPtr{$gentype,$as}, val::$gentype) = - @builtin_ccall("atomic_max", $gentype, + @builtin_ccall($"$(atomic)_max", $gentype, (LLVMPtr{$gentype,$as}, $gentype), p, val) @device_function atomic_and!(p::LLVMPtr{$gentype,$as}, val::$gentype) = - @builtin_ccall("atomic_and", $gentype, + @builtin_ccall($"$(atomic)_and", $gentype, (LLVMPtr{$gentype,$as}, $gentype), p, val) @device_function atomic_or!(p::LLVMPtr{$gentype,$as}, val::$gentype) = - @builtin_ccall("atomic_or", $gentype, + @builtin_ccall($"$(atomic)_or", $gentype, (LLVMPtr{$gentype,$as}, $gentype), p, val) @device_function atomic_xor!(p::LLVMPtr{$gentype,$as}, val::$gentype) = - @builtin_ccall("atomic_xor", $gentype, + @builtin_ccall($"$(atomic)_xor", $gentype, (LLVMPtr{$gentype,$as}, $gentype), p, val) @device_function atomic_xchg!(p::LLVMPtr{$gentype,$as}, val::$gentype) = - @builtin_ccall("atomic_xchg", $gentype, + @builtin_ccall($"$(atomic)_xchg", $gentype, (LLVMPtr{$gentype,$as}, $gentype), p, val) @device_function atomic_cmpxchg!(p::LLVMPtr{$gentype,$as}, cmp::$gentype, val::$gentype) = - @builtin_ccall("atomic_cmpxchg", $gentype, + @builtin_ccall($"$(atomic)_cmpxchg", $gentype, (LLVMPtr{$gentype,$as}, $gentype, $gentype), p, cmp, val) end diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl index 881ea906..ad761bea 100644 --- a/src/compiler/execution.jl +++ b/src/compiler/execution.jl @@ -4,7 +4,7 @@ export @opencl, clfunction ## high-level @opencl interface const MACRO_KWARGS = [:launch] -const COMPILER_KWARGS = [:kernel, :name, :always_inline] +const COMPILER_KWARGS = [:kernel, :name, :always_inline, :extensions] const LAUNCH_KWARGS = [:global_size, :local_size, :queue] macro opencl(ex...) diff --git a/test/atomics.jl b/test/atomics.jl new file mode 100644 index 00000000..990bd53f --- /dev/null +++ b/test/atomics.jl @@ -0,0 +1,14 @@ +@testset "atomics" begin + +function atomic_count(counter) + OpenCL.@atomic counter[] += 1 + return +end + +@testset "atomic_add! ($T)" for T in [Int32, UInt32, Int64, UInt64] + a = OpenCL.zeros(T) + @opencl global_size=1000 atomic_count(a) + @test OpenCL.@allowscalar a[] == 1000 +end + +end From 2b4cdcaa82de8308897031ec57be7891659821d7 Mon Sep 17 00:00:00 2001 From: Simeon David Schaub Date: Mon, 28 Jul 2025 13:01:59 +0200 Subject: [PATCH 2/6] it works! --- lib/intrinsics/src/atomic.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/intrinsics/src/atomic.jl b/lib/intrinsics/src/atomic.jl index e90c44ea..7c238d0f 100644 --- a/lib/intrinsics/src/atomic.jl +++ b/lib/intrinsics/src/atomic.jl @@ -248,7 +248,7 @@ for (op,impl) in [(+) => atomic_add!, Base.max => atomic_max!, Base.min => atomic_min!] @eval @inline atomic_arrayset(A::AbstractArray{T}, I::Integer, ::typeof($op), - val::T) where {T <: Union{Int32,UInt32}} = + val::T) where {T <: Union{atomic_integer_types...}} = $impl(pointer(A, I), val) end From 824b6135a4e3fa2e17979345dbee74ae6aaef471 Mon Sep 17 00:00:00 2001 From: Simeon David Schaub Date: Mon, 28 Jul 2025 13:06:30 +0200 Subject: [PATCH 3/6] atomic_op vs atom_op distinction is unnecessary --- lib/intrinsics/src/atomic.jl | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/lib/intrinsics/src/atomic.jl b/lib/intrinsics/src/atomic.jl index 7c238d0f..83f9451c 100644 --- a/lib/intrinsics/src/atomic.jl +++ b/lib/intrinsics/src/atomic.jl @@ -14,49 +14,48 @@ const atomic_memory_types = [AS.Workgroup, AS.CrossWorkgroup] # generically typed for gentype in atomic_integer_types, as in atomic_memory_types - atomic = sizeof(gentype) == 4 ? "atomic" : "atom" @eval begin @device_function atomic_add!(p::LLVMPtr{$gentype,$as}, val::$gentype) = - @builtin_ccall($"$(atomic)_add", $gentype, + @builtin_ccall("atomic_add", $gentype, (LLVMPtr{$gentype,$as}, $gentype), p, val) @device_function atomic_sub!(p::LLVMPtr{$gentype,$as}, val::$gentype) = - @builtin_ccall($"$(atomic)_sub", $gentype, + @builtin_ccall("atomic_sub", $gentype, (LLVMPtr{$gentype,$as}, $gentype), p, val) @device_function atomic_inc!(p::LLVMPtr{$gentype,$as}) = - @builtin_ccall($"$(atomic)_inc", $gentype, (LLVMPtr{$gentype,$as},), p) + @builtin_ccall("atomic_inc", $gentype, (LLVMPtr{$gentype,$as},), p) @device_function atomic_dec!(p::LLVMPtr{$gentype,$as}) = - @builtin_ccall($"$(atomic)_dec", $gentype, (LLVMPtr{$gentype,$as},), p) + @builtin_ccall("atomic_dec", $gentype, (LLVMPtr{$gentype,$as},), p) @device_function atomic_min!(p::LLVMPtr{$gentype,$as}, val::$gentype) = - @builtin_ccall($"$(atomic)_min", $gentype, + @builtin_ccall("atomic_min", $gentype, (LLVMPtr{$gentype,$as}, $gentype), p, val) @device_function atomic_max!(p::LLVMPtr{$gentype,$as}, val::$gentype) = - @builtin_ccall($"$(atomic)_max", $gentype, + @builtin_ccall("atomic_max", $gentype, (LLVMPtr{$gentype,$as}, $gentype), p, val) @device_function atomic_and!(p::LLVMPtr{$gentype,$as}, val::$gentype) = - @builtin_ccall($"$(atomic)_and", $gentype, + @builtin_ccall("atomic_and", $gentype, (LLVMPtr{$gentype,$as}, $gentype), p, val) @device_function atomic_or!(p::LLVMPtr{$gentype,$as}, val::$gentype) = - @builtin_ccall($"$(atomic)_or", $gentype, + @builtin_ccall("atomic_or", $gentype, (LLVMPtr{$gentype,$as}, $gentype), p, val) @device_function atomic_xor!(p::LLVMPtr{$gentype,$as}, val::$gentype) = - @builtin_ccall($"$(atomic)_xor", $gentype, + @builtin_ccall("atomic_xor", $gentype, (LLVMPtr{$gentype,$as}, $gentype), p, val) @device_function atomic_xchg!(p::LLVMPtr{$gentype,$as}, val::$gentype) = - @builtin_ccall($"$(atomic)_xchg", $gentype, + @builtin_ccall("atomic_xchg", $gentype, (LLVMPtr{$gentype,$as}, $gentype), p, val) @device_function atomic_cmpxchg!(p::LLVMPtr{$gentype,$as}, cmp::$gentype, val::$gentype) = - @builtin_ccall($"$(atomic)_cmpxchg", $gentype, + @builtin_ccall("atomic_cmpxchg", $gentype, (LLVMPtr{$gentype,$as}, $gentype, $gentype), p, cmp, val) end From ee5de61f9d49ca90644545a1e0bb93119a53d781 Mon Sep 17 00:00:00 2001 From: Simeon David Schaub Date: Mon, 28 Jul 2025 13:44:56 +0200 Subject: [PATCH 4/6] mark tests as requiring il --- test/setup.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/setup.jl b/test/setup.jl index e826d0d5..066424c6 100644 --- a/test/setup.jl +++ b/test/setup.jl @@ -90,7 +90,7 @@ function runtests(f, name, platform_filter) end # some tests require native execution capabilities - requires_il = name in ["execution", "intrinsics", "kernelabstractions"] || + requires_il = name in ["atomics", "execution", "intrinsics", "kernelabstractions"] || startswith(name, "gpuarrays/") ex = quote From 20a1dc7db27b8552372cca3e8b7f2de24973eb92 Mon Sep 17 00:00:00 2001 From: Simeon David Schaub Date: Tue, 29 Jul 2025 17:32:10 +0200 Subject: [PATCH 5/6] check for int64 atomic support --- test/atomics.jl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test/atomics.jl b/test/atomics.jl index 990bd53f..068e8cd7 100644 --- a/test/atomics.jl +++ b/test/atomics.jl @@ -6,9 +6,11 @@ function atomic_count(counter) end @testset "atomic_add! ($T)" for T in [Int32, UInt32, Int64, UInt64] - a = OpenCL.zeros(T) - @opencl global_size=1000 atomic_count(a) - @test OpenCL.@allowscalar a[] == 1000 + if sizeof(T) == 4 || "cl_khr_int64_extended_atomics" in cl.device().extensions + a = OpenCL.zeros(T) + @opencl global_size=1000 atomic_count(a) + @test OpenCL.@allowscalar a[] == 1000 + end end end From af1720390bbedd9e237ad3c14f92c39dc189e4cc Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 30 Jul 2025 10:02:45 +0200 Subject: [PATCH 6/6] Bump version + clean-ups. --- Project.toml | 2 +- lib/intrinsics/Project.toml | 2 +- lib/intrinsics/src/atomic.jl | 28 +++++++++++++++++++--------- src/compiler/execution.jl | 2 +- 4 files changed, 22 insertions(+), 12 deletions(-) diff --git a/Project.toml b/Project.toml index 2121c75a..c76fb007 100644 --- a/Project.toml +++ b/Project.toml @@ -31,7 +31,7 @@ Preferences = "1" Printf = "1" Random = "1" Reexport = "1" -SPIRVIntrinsics = "0.4" +SPIRVIntrinsics = "0.5" SPIRV_LLVM_Backend_jll = "20" SPIRV_Tools_jll = "2025.1" StaticArrays = "1" diff --git a/lib/intrinsics/Project.toml b/lib/intrinsics/Project.toml index daa9f09b..8142aabc 100644 --- a/lib/intrinsics/Project.toml +++ b/lib/intrinsics/Project.toml @@ -1,7 +1,7 @@ name = "SPIRVIntrinsics" uuid = "71d1d633-e7e8-4a92-83a1-de8814b09ba8" authors = ["Tim Besard "] -version = "0.4.0" +version = "0.5.0" [deps] ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04" diff --git a/lib/intrinsics/src/atomic.jl b/lib/intrinsics/src/atomic.jl index 83f9451c..9bbbdbe6 100644 --- a/lib/intrinsics/src/atomic.jl +++ b/lib/intrinsics/src/atomic.jl @@ -1,13 +1,9 @@ # Atomic Functions -# TODO: support for 64-bit atomics via atom_cmpxchg (from cl_khr_int64_base_atomics) - -# "atomic operations on 32-bit signed, unsigned integers and single precision -# floating-point to locations in __global or __local memory" +# provides atomic functions that rely on the OpenCL base atomics, as well as the +# cl_khr_int64_base_atomics and cl_khr_int64_extended_atomics extensions. const atomic_integer_types = [UInt32, Int32, UInt64, Int64] -# TODO: 64-bit atomics with ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS -# TODO: additional floating-point atomics with ZE_extension_float_atomics const atomic_memory_types = [AS.Workgroup, AS.CrossWorkgroup] @@ -67,15 +63,23 @@ end for as in atomic_memory_types @eval begin +# There is native support for atomic_xchg on Float32, but not for Float64, +# so we always reinterpret for consistency. @device_function atomic_xchg!(p::LLVMPtr{Float32,$as}, val::Float32) = - @builtin_ccall("atomic_xchg", Float32, (LLVMPtr{Float32,$as}, Float32,), p, val) + reinterpret(Float32, atomic_xchg!(reinterpret(LLVMPtr{UInt32,$as}, p), + reinterpret(UInt32, val))) +@device_function atomic_xchg!(p::LLVMPtr{Float64,$as}, val::Float64) = + reinterpret(Float64, atomic_xchg!(reinterpret(LLVMPtr{UInt64,$as}, p), + reinterpret(UInt64, val))) -# XXX: why is only xchg supported on floats? isn't it safe for cmpxchg too, -# which should only perform bitwise comparisons? @device_function atomic_cmpxchg!(p::LLVMPtr{Float32,$as}, cmp::Float32, val::Float32) = reinterpret(Float32, atomic_cmpxchg!(reinterpret(LLVMPtr{UInt32,$as}, p), reinterpret(UInt32, cmp), reinterpret(UInt32, val))) +@device_function atomic_cmpxchg!(p::LLVMPtr{Float64,$as}, cmp::Float64, val::Float64) = + reinterpret(Float64, atomic_cmpxchg!(reinterpret(LLVMPtr{UInt64,$as}, p), + reinterpret(UInt64, cmp), + reinterpret(UInt64, val))) end end @@ -239,6 +243,11 @@ end atomic_arrayset(A, Base._to_linear_index(A, Is...), op, convert(T, val)) # native atomics +# TODO: support inc/dec +# TODO: this depends on available extensions +# - UInt64: requires cl_khr_int64_base_atomics for add/sub/inc/dec, +# requires cl_khr_int64_extended_atomics for min/max/and/or/xor +# - Float64: always should hit the fallback for (op,impl) in [(+) => atomic_add!, (-) => atomic_sub!, (&) => atomic_and!, @@ -252,6 +261,7 @@ for (op,impl) in [(+) => atomic_add!, end # fallback using compare-and-swap +# TODO: for 64-bit types, this depends on cl_khr_int64_base_atomics function atomic_arrayset(A::AbstractArray{T}, I::Integer, op::Function, val) where {T} ptr = pointer(A, I) old = Base.unsafe_load(ptr, 1) diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl index ad761bea..881ea906 100644 --- a/src/compiler/execution.jl +++ b/src/compiler/execution.jl @@ -4,7 +4,7 @@ export @opencl, clfunction ## high-level @opencl interface const MACRO_KWARGS = [:launch] -const COMPILER_KWARGS = [:kernel, :name, :always_inline, :extensions] +const COMPILER_KWARGS = [:kernel, :name, :always_inline] const LAUNCH_KWARGS = [:global_size, :local_size, :queue] macro opencl(ex...)