From 51817de51c3ddfd50ecabde5e479f1030c0ccf2b Mon Sep 17 00:00:00 2001
From: Simeon David Schaub <simeon@schaub.rocks>
Date: Mon, 28 Jul 2025 11:27:03 +0200
Subject: [PATCH 1/6] WIP: Int64 atomics

---
 lib/intrinsics/src/atomic.jl | 25 +++++++++++++------------
 src/compiler/execution.jl    |  2 +-
 test/atomics.jl              | 14 ++++++++++++++
 3 files changed, 28 insertions(+), 13 deletions(-)
 create mode 100644 test/atomics.jl

diff --git a/lib/intrinsics/src/atomic.jl b/lib/intrinsics/src/atomic.jl
index a1c6007d..e90c44ea 100644
--- a/lib/intrinsics/src/atomic.jl
+++ b/lib/intrinsics/src/atomic.jl
@@ -5,7 +5,7 @@
 # "atomic operations on 32-bit signed, unsigned integers and single precision
 #  floating-point to locations in __global or __local memory"
 
-const atomic_integer_types = [UInt32, Int32]
+const atomic_integer_types = [UInt32, Int32, UInt64, Int64]
 # TODO: 64-bit atomics with ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS
 # TODO: additional floating-point atomics with ZE_extension_float_atomics
 const atomic_memory_types = [AS.Workgroup, AS.CrossWorkgroup]
@@ -14,48 +14,49 @@ const atomic_memory_types = [AS.Workgroup, AS.CrossWorkgroup]
 # generically typed
 
 for gentype in atomic_integer_types, as in atomic_memory_types
+    atomic = sizeof(gentype) == 4 ? "atomic" : "atom"
 @eval begin
 
 @device_function atomic_add!(p::LLVMPtr{$gentype,$as}, val::$gentype) =
-    @builtin_ccall("atomic_add", $gentype,
+    @builtin_ccall($"$(atomic)_add", $gentype,
                    (LLVMPtr{$gentype,$as}, $gentype), p, val)
 
 @device_function atomic_sub!(p::LLVMPtr{$gentype,$as}, val::$gentype) =
-    @builtin_ccall("atomic_sub", $gentype,
+    @builtin_ccall($"$(atomic)_sub", $gentype,
                    (LLVMPtr{$gentype,$as}, $gentype), p, val)
 
 @device_function atomic_inc!(p::LLVMPtr{$gentype,$as}) =
-    @builtin_ccall("atomic_inc", $gentype, (LLVMPtr{$gentype,$as},), p)
+    @builtin_ccall($"$(atomic)_inc", $gentype, (LLVMPtr{$gentype,$as},), p)
 
 @device_function atomic_dec!(p::LLVMPtr{$gentype,$as}) =
-    @builtin_ccall("atomic_dec", $gentype, (LLVMPtr{$gentype,$as},), p)
+    @builtin_ccall($"$(atomic)_dec", $gentype, (LLVMPtr{$gentype,$as},), p)
 
 @device_function atomic_min!(p::LLVMPtr{$gentype,$as}, val::$gentype) =
-    @builtin_ccall("atomic_min", $gentype,
+    @builtin_ccall($"$(atomic)_min", $gentype,
                    (LLVMPtr{$gentype,$as}, $gentype), p, val)
 
 @device_function atomic_max!(p::LLVMPtr{$gentype,$as}, val::$gentype) =
-    @builtin_ccall("atomic_max", $gentype,
+    @builtin_ccall($"$(atomic)_max", $gentype,
                    (LLVMPtr{$gentype,$as}, $gentype), p, val)
 
 @device_function atomic_and!(p::LLVMPtr{$gentype,$as}, val::$gentype) =
-    @builtin_ccall("atomic_and", $gentype,
+    @builtin_ccall($"$(atomic)_and", $gentype,
                    (LLVMPtr{$gentype,$as}, $gentype), p, val)
 
 @device_function atomic_or!(p::LLVMPtr{$gentype,$as}, val::$gentype) =
-    @builtin_ccall("atomic_or", $gentype,
+    @builtin_ccall($"$(atomic)_or", $gentype,
                    (LLVMPtr{$gentype,$as}, $gentype), p, val)
 
 @device_function atomic_xor!(p::LLVMPtr{$gentype,$as}, val::$gentype) =
-    @builtin_ccall("atomic_xor", $gentype,
+    @builtin_ccall($"$(atomic)_xor", $gentype,
                    (LLVMPtr{$gentype,$as}, $gentype), p, val)
 
 @device_function atomic_xchg!(p::LLVMPtr{$gentype,$as}, val::$gentype) =
-    @builtin_ccall("atomic_xchg", $gentype,
+    @builtin_ccall($"$(atomic)_xchg", $gentype,
                    (LLVMPtr{$gentype,$as}, $gentype), p, val)
 
 @device_function atomic_cmpxchg!(p::LLVMPtr{$gentype,$as}, cmp::$gentype, val::$gentype) =
-    @builtin_ccall("atomic_cmpxchg", $gentype,
+    @builtin_ccall($"$(atomic)_cmpxchg", $gentype,
                    (LLVMPtr{$gentype,$as}, $gentype, $gentype), p, cmp, val)
 
 end
diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl
index 881ea906..ad761bea 100644
--- a/src/compiler/execution.jl
+++ b/src/compiler/execution.jl
@@ -4,7 +4,7 @@ export @opencl, clfunction
 ## high-level @opencl interface
 
 const MACRO_KWARGS = [:launch]
-const COMPILER_KWARGS = [:kernel, :name, :always_inline]
+const COMPILER_KWARGS = [:kernel, :name, :always_inline, :extensions]
 const LAUNCH_KWARGS = [:global_size, :local_size, :queue]
 
 macro opencl(ex...)
diff --git a/test/atomics.jl b/test/atomics.jl
new file mode 100644
index 00000000..990bd53f
--- /dev/null
+++ b/test/atomics.jl
@@ -0,0 +1,14 @@
+@testset "atomics" begin
+
+function atomic_count(counter)
+    OpenCL.@atomic counter[] += 1
+    return
+end
+
+@testset "atomic_add! ($T)" for T in [Int32, UInt32, Int64, UInt64]
+    a = OpenCL.zeros(T)
+    @opencl global_size=1000 atomic_count(a)
+    @test OpenCL.@allowscalar a[] == 1000
+end
+
+end

From 2b4cdcaa82de8308897031ec57be7891659821d7 Mon Sep 17 00:00:00 2001
From: Simeon David Schaub <simeon@schaub.rocks>
Date: Mon, 28 Jul 2025 13:01:59 +0200
Subject: [PATCH 2/6] it works!

---
 lib/intrinsics/src/atomic.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/intrinsics/src/atomic.jl b/lib/intrinsics/src/atomic.jl
index e90c44ea..7c238d0f 100644
--- a/lib/intrinsics/src/atomic.jl
+++ b/lib/intrinsics/src/atomic.jl
@@ -248,7 +248,7 @@ for (op,impl) in [(+)      => atomic_add!,
                   Base.max => atomic_max!,
                   Base.min => atomic_min!]
     @eval @inline atomic_arrayset(A::AbstractArray{T}, I::Integer, ::typeof($op),
-                                  val::T) where {T <: Union{Int32,UInt32}} =
+                                  val::T) where {T <: Union{atomic_integer_types...}} =
         $impl(pointer(A, I), val)
 end
 

From 824b6135a4e3fa2e17979345dbee74ae6aaef471 Mon Sep 17 00:00:00 2001
From: Simeon David Schaub <simeon@schaub.rocks>
Date: Mon, 28 Jul 2025 13:06:30 +0200
Subject: [PATCH 3/6] atomic_op vs atom_op distinction is unnecessary

---
 lib/intrinsics/src/atomic.jl | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/lib/intrinsics/src/atomic.jl b/lib/intrinsics/src/atomic.jl
index 7c238d0f..83f9451c 100644
--- a/lib/intrinsics/src/atomic.jl
+++ b/lib/intrinsics/src/atomic.jl
@@ -14,49 +14,48 @@ const atomic_memory_types = [AS.Workgroup, AS.CrossWorkgroup]
 # generically typed
 
 for gentype in atomic_integer_types, as in atomic_memory_types
-    atomic = sizeof(gentype) == 4 ? "atomic" : "atom"
 @eval begin
 
 @device_function atomic_add!(p::LLVMPtr{$gentype,$as}, val::$gentype) =
-    @builtin_ccall($"$(atomic)_add", $gentype,
+    @builtin_ccall("atomic_add", $gentype,
                    (LLVMPtr{$gentype,$as}, $gentype), p, val)
 
 @device_function atomic_sub!(p::LLVMPtr{$gentype,$as}, val::$gentype) =
-    @builtin_ccall($"$(atomic)_sub", $gentype,
+    @builtin_ccall("atomic_sub", $gentype,
                    (LLVMPtr{$gentype,$as}, $gentype), p, val)
 
 @device_function atomic_inc!(p::LLVMPtr{$gentype,$as}) =
-    @builtin_ccall($"$(atomic)_inc", $gentype, (LLVMPtr{$gentype,$as},), p)
+    @builtin_ccall("atomic_inc", $gentype, (LLVMPtr{$gentype,$as},), p)
 
 @device_function atomic_dec!(p::LLVMPtr{$gentype,$as}) =
-    @builtin_ccall($"$(atomic)_dec", $gentype, (LLVMPtr{$gentype,$as},), p)
+    @builtin_ccall("atomic_dec", $gentype, (LLVMPtr{$gentype,$as},), p)
 
 @device_function atomic_min!(p::LLVMPtr{$gentype,$as}, val::$gentype) =
-    @builtin_ccall($"$(atomic)_min", $gentype,
+    @builtin_ccall("atomic_min", $gentype,
                    (LLVMPtr{$gentype,$as}, $gentype), p, val)
 
 @device_function atomic_max!(p::LLVMPtr{$gentype,$as}, val::$gentype) =
-    @builtin_ccall($"$(atomic)_max", $gentype,
+    @builtin_ccall("atomic_max", $gentype,
                    (LLVMPtr{$gentype,$as}, $gentype), p, val)
 
 @device_function atomic_and!(p::LLVMPtr{$gentype,$as}, val::$gentype) =
-    @builtin_ccall($"$(atomic)_and", $gentype,
+    @builtin_ccall("atomic_and", $gentype,
                    (LLVMPtr{$gentype,$as}, $gentype), p, val)
 
 @device_function atomic_or!(p::LLVMPtr{$gentype,$as}, val::$gentype) =
-    @builtin_ccall($"$(atomic)_or", $gentype,
+    @builtin_ccall("atomic_or", $gentype,
                    (LLVMPtr{$gentype,$as}, $gentype), p, val)
 
 @device_function atomic_xor!(p::LLVMPtr{$gentype,$as}, val::$gentype) =
-    @builtin_ccall($"$(atomic)_xor", $gentype,
+    @builtin_ccall("atomic_xor", $gentype,
                    (LLVMPtr{$gentype,$as}, $gentype), p, val)
 
 @device_function atomic_xchg!(p::LLVMPtr{$gentype,$as}, val::$gentype) =
-    @builtin_ccall($"$(atomic)_xchg", $gentype,
+    @builtin_ccall("atomic_xchg", $gentype,
                    (LLVMPtr{$gentype,$as}, $gentype), p, val)
 
 @device_function atomic_cmpxchg!(p::LLVMPtr{$gentype,$as}, cmp::$gentype, val::$gentype) =
-    @builtin_ccall($"$(atomic)_cmpxchg", $gentype,
+    @builtin_ccall("atomic_cmpxchg", $gentype,
                    (LLVMPtr{$gentype,$as}, $gentype, $gentype), p, cmp, val)
 
 end

From ee5de61f9d49ca90644545a1e0bb93119a53d781 Mon Sep 17 00:00:00 2001
From: Simeon David Schaub <simeon@schaub.rocks>
Date: Mon, 28 Jul 2025 13:44:56 +0200
Subject: [PATCH 4/6] mark tests as requiring il

---
 test/setup.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/setup.jl b/test/setup.jl
index e826d0d5..066424c6 100644
--- a/test/setup.jl
+++ b/test/setup.jl
@@ -90,7 +90,7 @@ function runtests(f, name, platform_filter)
         end
 
         # some tests require native execution capabilities
-        requires_il = name in ["execution", "intrinsics", "kernelabstractions"] ||
+        requires_il = name in ["atomics", "execution", "intrinsics", "kernelabstractions"] ||
                       startswith(name, "gpuarrays/")
 
         ex = quote

From 20a1dc7db27b8552372cca3e8b7f2de24973eb92 Mon Sep 17 00:00:00 2001
From: Simeon David Schaub <simeon@schaub.rocks>
Date: Tue, 29 Jul 2025 17:32:10 +0200
Subject: [PATCH 5/6] check for int64 atomic support

---
 test/atomics.jl | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/test/atomics.jl b/test/atomics.jl
index 990bd53f..068e8cd7 100644
--- a/test/atomics.jl
+++ b/test/atomics.jl
@@ -6,9 +6,11 @@ function atomic_count(counter)
 end
 
 @testset "atomic_add! ($T)" for T in [Int32, UInt32, Int64, UInt64]
-    a = OpenCL.zeros(T)
-    @opencl global_size=1000 atomic_count(a)
-    @test OpenCL.@allowscalar a[] == 1000
+    if sizeof(T) == 4 || "cl_khr_int64_base_atomics" in cl.device().extensions
+        a = OpenCL.zeros(T)  # 64-bit add only needs the base atomics extension
+        @opencl global_size=1000 atomic_count(a)
+        @test OpenCL.@allowscalar a[] == 1000
+    end
 end
 
 end

From af1720390bbedd9e237ad3c14f92c39dc189e4cc Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 30 Jul 2025 10:02:45 +0200
Subject: [PATCH 6/6] Bump version + clean-ups.

---
 Project.toml                 |  2 +-
 lib/intrinsics/Project.toml  |  2 +-
 lib/intrinsics/src/atomic.jl | 28 +++++++++++++++++++---------
 src/compiler/execution.jl    |  2 +-
 4 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/Project.toml b/Project.toml
index 2121c75a..c76fb007 100644
--- a/Project.toml
+++ b/Project.toml
@@ -31,7 +31,7 @@ Preferences = "1"
 Printf = "1"
 Random = "1"
 Reexport = "1"
-SPIRVIntrinsics = "0.4"
+SPIRVIntrinsics = "0.5"
 SPIRV_LLVM_Backend_jll = "20"
 SPIRV_Tools_jll = "2025.1"
 StaticArrays = "1"
diff --git a/lib/intrinsics/Project.toml b/lib/intrinsics/Project.toml
index daa9f09b..8142aabc 100644
--- a/lib/intrinsics/Project.toml
+++ b/lib/intrinsics/Project.toml
@@ -1,7 +1,7 @@
 name = "SPIRVIntrinsics"
 uuid = "71d1d633-e7e8-4a92-83a1-de8814b09ba8"
 authors = ["Tim Besard <tim.besard@gmail.com>"]
-version = "0.4.0"
+version = "0.5.0"
 
 [deps]
 ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
diff --git a/lib/intrinsics/src/atomic.jl b/lib/intrinsics/src/atomic.jl
index 83f9451c..9bbbdbe6 100644
--- a/lib/intrinsics/src/atomic.jl
+++ b/lib/intrinsics/src/atomic.jl
@@ -1,13 +1,9 @@
 # Atomic Functions
 
-# TODO: support for 64-bit atomics via atom_cmpxchg (from cl_khr_int64_base_atomics)
-
-# "atomic operations on 32-bit signed, unsigned integers and single precision
-#  floating-point to locations in __global or __local memory"
+# Provides atomic functions that rely on the OpenCL base atomics, as well as the
+# cl_khr_int64_base_atomics and cl_khr_int64_extended_atomics extensions.
 
 const atomic_integer_types = [UInt32, Int32, UInt64, Int64]
-# TODO: 64-bit atomics with ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS
-# TODO: additional floating-point atomics with ZE_extension_float_atomics
 const atomic_memory_types = [AS.Workgroup, AS.CrossWorkgroup]
 
 
@@ -67,15 +63,23 @@ end
 for as in atomic_memory_types
 @eval begin
 
+# There is native support for atomic_xchg on Float32, but not for Float64,
+# so we always reinterpret for consistency.
 @device_function atomic_xchg!(p::LLVMPtr{Float32,$as}, val::Float32) =
-    @builtin_ccall("atomic_xchg", Float32, (LLVMPtr{Float32,$as}, Float32,), p, val)
+    reinterpret(Float32, atomic_xchg!(reinterpret(LLVMPtr{UInt32,$as}, p),
+                                      reinterpret(UInt32, val)))
+@device_function atomic_xchg!(p::LLVMPtr{Float64,$as}, val::Float64) =
+    reinterpret(Float64, atomic_xchg!(reinterpret(LLVMPtr{UInt64,$as}, p),
+                                      reinterpret(UInt64, val)))
 
-# XXX: why is only xchg supported on floats? isn't it safe for cmpxchg too,
-#      which should only perform bitwise comparisons?
 @device_function atomic_cmpxchg!(p::LLVMPtr{Float32,$as}, cmp::Float32, val::Float32) =
     reinterpret(Float32, atomic_cmpxchg!(reinterpret(LLVMPtr{UInt32,$as}, p),
                                          reinterpret(UInt32, cmp),
                                          reinterpret(UInt32, val)))
+@device_function atomic_cmpxchg!(p::LLVMPtr{Float64,$as}, cmp::Float64, val::Float64) =
+    reinterpret(Float64, atomic_cmpxchg!(reinterpret(LLVMPtr{UInt64,$as}, p),
+                                         reinterpret(UInt64, cmp),
+                                         reinterpret(UInt64, val)))
 
 end
 end
@@ -239,6 +243,11 @@ end
     atomic_arrayset(A, Base._to_linear_index(A, Is...), op, convert(T, val))
 
 # native atomics
+# TODO: support inc/dec
+# TODO: this depends on available extensions
+#       - Int64/UInt64: require cl_khr_int64_base_atomics for add/sub/inc/dec,
+#                       require cl_khr_int64_extended_atomics for min/max/and/or/xor
+#       - Float64: should always hit the fallback
 for (op,impl) in [(+)      => atomic_add!,
                   (-)      => atomic_sub!,
                   (&)      => atomic_and!,
@@ -252,6 +261,7 @@ for (op,impl) in [(+)      => atomic_add!,
 end
 
 # fallback using compare-and-swap
+# TODO: for 64-bit types, this depends on cl_khr_int64_base_atomics
 function atomic_arrayset(A::AbstractArray{T}, I::Integer, op::Function, val) where {T}
     ptr = pointer(A, I)
     old = Base.unsafe_load(ptr, 1)
diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl
index ad761bea..881ea906 100644
--- a/src/compiler/execution.jl
+++ b/src/compiler/execution.jl
@@ -4,7 +4,7 @@ export @opencl, clfunction
 ## high-level @opencl interface
 
 const MACRO_KWARGS = [:launch]
-const COMPILER_KWARGS = [:kernel, :name, :always_inline, :extensions]
+const COMPILER_KWARGS = [:kernel, :name, :always_inline]
 const LAUNCH_KWARGS = [:global_size, :local_size, :queue]
 
 macro opencl(ex...)