diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 02476bb5..91cf3583 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -21,3 +21,40 @@ steps: env: OCL_ICD_FILENAMES: "libnvidia-opencl.so.1" timeout_in_minutes: 60 + + - group: "macos" + key: "julia" + steps: + - label: "{{matrix.julia}} macos {{matrix.arch}}" + plugins: + - JuliaCI/julia#v1: + version: "{{matrix.julia}}" + - JuliaCI/julia-coverage#v1: + codecov: true + commands: | + POCL_DEBUG=err julia --project -e ' + using Pkg + + println("--- :julia: Instantiating project") + Pkg.add("pocl_jll") + Pkg.add("InteractiveUtils") + Pkg.develop(path="lib/intrinsics") + + println("+++ :julia: Running tests") + using InteractiveUtils + InteractiveUtils.versioninfo() + Pkg.test(; coverage=true, test_args=`--platform=pocl --verbose --jobs=2`)' + agents: + queue: "juliaecosystem" + os: "macos" + arch: "{{matrix.arch}}" + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 120 + matrix: + setup: + julia: + - "1.10" + - "1.12" + arch: + - "x86_64" + - "aarch64" diff --git a/.github/workflows/Test.yml b/.github/workflows/Test.yml index c54588de..77d1350a 100644 --- a/.github/workflows/Test.yml +++ b/.github/workflows/Test.yml @@ -15,7 +15,7 @@ jobs: test: name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ matrix.memory_backend }} - PoCL ${{ matrix.pocl }} runs-on: ${{ matrix.os }} - timeout-minutes: 100 + timeout-minutes: 180 permissions: # needed to allow julia-actions/cache to proactively delete old caches that it has created actions: write contents: read @@ -23,39 +23,55 @@ jobs: fail-fast: false matrix: version: ['1.10', '1.12'] - os: [ubuntu-24.04, ubuntu-24.04-arm, macOS-15, macOS-15-intel, windows-2025] + os: [ubuntu-24.04-arm, macOS-14, macOS-15, macOS-26, macOS-15-intel, windows-2022] arch: [x64, arm64] - pocl: [jll, local] + pocl: [jll] memory_backend: [usm, svm, buffer] exclude: # unsupported combinations - os: ubuntu-24.04 arch: arm64 - - os: windows-2025 + - os: windows-2022 arch: arm64 - os: ubuntu-24.04-arm arch: x64 # macOS 13 is Intel-only, while macOS 14+ only support Apple Silicon + - os: macOS-14 + arch: x64 - os: macOS-15 arch: x64 + - os: macOS-26 + arch: x64 - os: macOS-15-intel arch: arm64 # we only test building PoCL on Linux - os: macOS-15-intel pocl: local + - os: macOS-14 + pocl: local - os: macOS-15 pocl: local - - os: windows-2025 + - os: macOS-26 + pocl: local + - os: windows-2022 pocl: local # Remove these exclusions once macOS CI hangs are fixed - os: macOS-15-intel memory_backend: svm - os: macOS-15-intel memory_backend: buffer + - os: macOS-14 + memory_backend: svm + - os: macOS-14 + memory_backend: buffer - os: macOS-15 memory_backend: svm - os: macOS-15 memory_backend: buffer + - os: macOS-26 + memory_backend: svm + - os: macOS-26 + memory_backend: buffer steps: - name: Checkout OpenCL.jl uses: actions/checkout@v5 @@ -150,7 +166,9 @@ jobs: uses: julia-actions/julia-runtest@v1 if: runner.os != 'Windows' with: - test_args: '--quickfail --platform=pocl' + test_args: '--quickfail --platform=pocl --verbose --jobs=2' + env: + POCL_DEBUG: err - name: Setup BusyBox if: runner.os == 'Windows' diff --git a/lib/cl/event.jl b/lib/cl/event.jl index 42a914cf..ca1edf2a 100644 --- a/lib/cl/event.jl +++ b/lib/cl/event.jl @@ -158,46 +158,46 @@ function Base.wait(evts::Vector{AbstractEvent}) return evts end -function enqueue_marker_with_wait_list(wait_for::Vector{AbstractEvent}) +function enqueue_marker_with_wait_list(wait_for::Vector{AbstractEvent}; queue::CmdQueue = queue()) n_wait_events = cl_uint(length(wait_for)) wait_evt_ids = [evt.id for evt in wait_for] ret_evt = Ref{cl_event}() - clEnqueueMarkerWithWaitList(queue(), n_wait_events, + clEnqueueMarkerWithWaitList(queue, n_wait_events, isempty(wait_evt_ids) ? C_NULL : wait_evt_ids, ret_evt) @return_event ret_evt[] end -function enqueue_barrier_with_wait_list(wait_for::Vector{AbstractEvent}) +function enqueue_barrier_with_wait_list(wait_for::Vector{AbstractEvent}; queue::CmdQueue = queue()) n_wait_events = cl_uint(length(wait_for)) wait_evt_ids = [evt.id for evt in wait_for] ret_evt = Ref{cl_event}() - clEnqueueBarrierWithWaitList(queue(), n_wait_events, + clEnqueueBarrierWithWaitList(queue, n_wait_events, isempty(wait_evt_ids) ? C_NULL : wait_evt_ids, ret_evt) @return_event ret_evt[] end -function enqueue_marker() +function enqueue_marker(; queue::CmdQueue = queue()) evt = Ref{cl_event}() - clEnqueueMarker(queue(), evt) + clEnqueueMarker(queue, evt) @return_event evt[] end @deprecate enqueue_marker enqueue_marker_with_wait_list -function enqueue_wait_for_events(wait_for::Vector{T}) where {T<:AbstractEvent} +function enqueue_wait_for_events(wait_for::Vector{T}; queue::CmdQueue = queue()) where {T<:AbstractEvent} wait_evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for] GC.@preserve wait_for begin - clEnqueueWaitForEvents(queue(), length(wait_for), wait_evt_ids) + clEnqueueWaitForEvents(queue, length(wait_for), wait_evt_ids) end end -function enqueue_wait_for_events(wait_for::AbstractEvent) - enqueue_wait_for_events([wait_for]) +function enqueue_wait_for_events(wait_for::AbstractEvent; queue::CmdQueue = queue()) + enqueue_wait_for_events([wait_for]; queue) end -function enqueue_barrier() - clEnqueueBarrier(queue()) +function enqueue_barrier(; queue::CmdQueue = queue()) + clEnqueueBarrier(queue) return end @deprecate enqueue_barrier enqueue_barrier_with_wait_list diff --git a/lib/cl/kernel.jl b/lib/cl/kernel.jl index d0243130..a7a44b54 100644 --- a/lib/cl/kernel.jl +++ b/lib/cl/kernel.jl @@ -158,7 +158,7 @@ end function enqueue_kernel(k::Kernel, global_work_size, local_work_size=nothing; global_work_offset=nothing, wait_on::Vector{Event}=Event[], - rng_state=false, nargs=nothing) + rng_state=false, nargs=nothing, queue::CmdQueue = queue()) max_work_dim = device().max_work_item_dims work_dim = length(global_work_size) if work_dim > max_work_dim @@ -224,12 +224,12 @@ function enqueue_kernel(k::Kernel, global_work_size, local_work_size=nothing; end ret_event = Ref{cl_event}() - clEnqueueNDRangeKernel(queue(), k, work_dim, goffset, gsize, lsize, + clEnqueueNDRangeKernel(queue, k, work_dim, goffset, gsize, lsize, n_events, wait_event_ids, ret_event) return Event(ret_event[], retain=false) end -function enqueue_task(k::Kernel; wait_for=nothing) +function enqueue_task(k::Kernel; wait_for=nothing, queue::CmdQueue = queue()) n_evts = 0 evt_ids = C_NULL #TODO: this should be split out into its own function @@ -244,7 +244,7 @@ function enqueue_task(k::Kernel; wait_for=nothing) end end ret_event = Ref{cl_event}() - clEnqueueTask(queue(), k, n_evts, evt_ids, ret_event) + clEnqueueTask(queue, k, n_evts, evt_ids, ret_event) return ret_event[] end @@ -252,7 +252,7 @@ function call( k::Kernel, args...; global_size = (1,), local_size = nothing, global_work_offset = nothing, wait_on::Vector{Event} = Event[], indirect_memory::Vector{AbstractMemory} = AbstractMemory[], - rng_state=false, + rng_state=false, queue::CmdQueue = queue() ) set_args!(k, args...) if !isempty(indirect_memory) @@ -306,7 +306,7 @@ function call( clSetKernelExecInfo(k, CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL, sizeof(usm_pointers), usm_pointers) end end - enqueue_kernel(k, global_size, local_size; global_work_offset, wait_on, rng_state, nargs=length(args)) + enqueue_kernel(k, global_size, local_size; global_work_offset, wait_on, rng_state, nargs=length(args), queue) end # From `julia/base/reflection.jl`, adjusted to add specialization on `t`. diff --git a/lib/cl/memory/buffer.jl b/lib/cl/memory/buffer.jl index dfd198d8..b9e265af 100644 --- a/lib/cl/memory/buffer.jl +++ b/lib/cl/memory/buffer.jl @@ -84,12 +84,13 @@ end # reading from buffer to host array, return an event function enqueue_read(dst::Ptr, src::Buffer, src_off::Int, nbytes::Int; - blocking::Bool=false, wait_for::Vector{Event}=Event[]) + blocking::Bool=false, wait_for::Vector{Event}=Event[], + queue::CmdQueue = queue()) n_evts = length(wait_for) evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for] GC.@preserve wait_for begin ret_evt = Ref{cl_event}() - clEnqueueReadBuffer(queue(), src, blocking, src_off, nbytes, dst, + clEnqueueReadBuffer(queue, src, blocking, src_off, nbytes, dst, n_evts, evt_ids, ret_evt) @return_nanny_event(ret_evt[], dst) end @@ -99,12 +100,13 @@ enqueue_read(dst::Ptr, src::Buffer, nbytes; kwargs...) = # writing from host array to buffer, return an event function enqueue_write(dst::Buffer, dst_off::Int, src::Ptr, nbytes::Int; - blocking::Bool=false, wait_for::Vector{Event}=Event[]) + blocking::Bool=false, wait_for::Vector{Event}=Event[], + queue::CmdQueue = queue()) n_evts = length(wait_for) evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for] GC.@preserve wait_for begin ret_evt = Ref{cl_event}() - clEnqueueWriteBuffer(queue(), dst, blocking, dst_off, nbytes, src, + clEnqueueWriteBuffer(queue, dst, blocking, dst_off, nbytes, src, n_evts, evt_ids, ret_evt) @return_nanny_event(ret_evt[], dst) end @@ -115,12 +117,13 @@ enqueue_write(dst::Buffer, src::Ptr, nbytes; kwargs...) = # copying between two buffers, return an event function enqueue_copy(dst::Buffer, dst_off::Int, src::Buffer, src_off::Int, nbytes::Int; blocking::Bool=false, - wait_for::Vector{Event}=Event[]) + wait_for::Vector{Event}=Event[], + queue::CmdQueue = queue()) n_evts = length(wait_for) evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for] GC.@preserve wait_for begin ret_evt = Ref{cl_event}() - clEnqueueCopyBuffer(queue(), src, dst, src_off, dst_off, nbytes, + clEnqueueCopyBuffer(queue, src, dst, src_off, dst_off, nbytes, n_evts, evt_ids, ret_evt) @return_event ret_evt[] end @@ -130,7 +133,8 @@ enqueue_copy(dst::Buffer, src::Buffer, N; kwargs...) = # map a buffer into the host address space, returning a pointer and an event function enqueue_map(buf::Buffer, offset::Integer, nbytes::Int, flags=:rw; - blocking::Bool=false, wait_for::Vector{Event}=Event[]) + blocking::Bool=false, wait_for::Vector{Event}=Event[], + queue::CmdQueue = queue()) flags = if flags == :rw CL_MAP_READ | CL_MAP_WRITE elseif flags == :r @@ -146,7 +150,7 @@ function enqueue_map(buf::Buffer, offset::Integer, nbytes::Int, flags=:rw; evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for] GC.@preserve wait_for begin status = Ref{Cint}() - ptr = clEnqueueMapBuffer(queue(), buf, blocking, flags, offset, nbytes, + ptr = clEnqueueMapBuffer(queue, buf, blocking, flags, offset, nbytes, n_evts, evt_ids, ret_evt, status) if status[] != CL_SUCCESS throw(CLError(status[])) @@ -159,19 +163,21 @@ enqueue_map(buf::Buffer, nbytes::Int, flags=:rw; kwargs...) = enqueue_map(buf, 0, nbytes, flags; kwargs...) # unmap a buffer, return an event -function enqueue_unmap(buf::Buffer, ptr::Ptr; wait_for::Vector{Event}=Event[]) +function enqueue_unmap(buf::Buffer, ptr::Ptr; wait_for::Vector{Event}=Event[], + queue::CmdQueue = queue()) n_evts = length(wait_for) evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for] GC.@preserve wait_for begin ret_evt = Ref{cl_event}() - clEnqueueUnmapMemObject(queue(), buf, ptr, n_evts, evt_ids, ret_evt) + clEnqueueUnmapMemObject(queue, buf, ptr, n_evts, evt_ids, ret_evt) return Event(ret_evt[]) end end # fill a buffer with a pattern, returning an event function enqueue_fill(buf::Buffer, offset::Integer, pattern::T, N::Integer; - wait_for::Vector{Event}=Event[]) where {T} + wait_for::Vector{Event}=Event[], + queue::CmdQueue = queue()) where {T} nbytes = N * sizeof(T) nbytes_pattern = sizeof(T) @assert nbytes_pattern > 0 @@ -179,7 +185,7 @@ function enqueue_fill(buf::Buffer, offset::Integer, pattern::T, N::Integer; evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for] GC.@preserve begin ret_evt = Ref{cl_event}() - clEnqueueFillBuffer(queue(), buf, [pattern], + clEnqueueFillBuffer(queue, buf, [pattern], nbytes_pattern, offset, nbytes, n_evts, evt_ids, ret_evt) @return_event ret_evt[] diff --git a/lib/cl/memory/usm.jl b/lib/cl/memory/usm.jl index a12bc52f..a182cd35 100644 --- a/lib/cl/memory/usm.jl +++ b/lib/cl/memory/usm.jl @@ -176,6 +176,7 @@ end # fill a buffer with a pattern, returning an event function enqueue_usm_fill(ptr::Union{Ptr, CLPtr}, pattern::T, N::Integer; + queue::CmdQueue = queue(), wait_for::Vector{Event}=Event[]) where {T} nbytes = N * sizeof(T) pattern_size = sizeof(T) @@ -183,7 +184,7 @@ function enqueue_usm_fill(ptr::Union{Ptr, CLPtr}, pattern::T, N::Integer; evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for] GC.@preserve wait_for begin ret_evt = Ref{cl_event}() - clEnqueueMemFillINTEL(queue(), ptr, Ref(pattern), + clEnqueueMemFillINTEL(queue, ptr, Ref(pattern), pattern_size, nbytes, n_evts, evt_ids, ret_evt) @return_event ret_evt[] diff --git a/test/intrinsics.jl b/test/intrinsics.jl index 69150e5e..9efd9ded 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -18,28 +18,28 @@ const simd_ns = [2, 3, 4, 8, 16] @testset "intrinsics" begin @testset "barrier" begin +@time "Barrier Local Mem fence" @on_device barrier(OpenCL.LOCAL_MEM_FENCE) +@time "Barrier global Mem fence" @on_device barrier(OpenCL.GLOBAL_MEM_FENCE) +@time "Barrier both Mem fence" @on_device barrier(OpenCL.LOCAL_MEM_FENCE | OpenCL.GLOBAL_MEM_FENCE) -@on_device barrier(OpenCL.LOCAL_MEM_FENCE) -@on_device barrier(OpenCL.GLOBAL_MEM_FENCE) -@on_device barrier(OpenCL.LOCAL_MEM_FENCE | OpenCL.GLOBAL_MEM_FENCE) +@time "WorkGroup Barrier Local Mem fence" @on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE) +@time "WorkGroup Barrier global Mem fence" @on_device work_group_barrier(OpenCL.GLOBAL_MEM_FENCE) +@time "WorkGroup Barrier image Mem fence" @on_device work_group_barrier(OpenCL.IMAGE_MEM_FENCE) -@on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE) -@on_device work_group_barrier(OpenCL.GLOBAL_MEM_FENCE) -@on_device work_group_barrier(OpenCL.IMAGE_MEM_FENCE) +@time "WorkGroup Barrier L/G Mem fence" @on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE | OpenCL.GLOBAL_MEM_FENCE) +@time "WorkGroup Barrier L/I Mem fence" @on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE | OpenCL.IMAGE_MEM_FENCE) +@time "WorkGroup Barrier L/G/I Mem fence" @on_device work_group_barrier(OpenCL.GLOBAL_MEM_FENCE | OpenCL.LOCAL_MEM_FENCE | OpenCL.IMAGE_MEM_FENCE) -@on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE | OpenCL.GLOBAL_MEM_FENCE) -@on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE | OpenCL.IMAGE_MEM_FENCE) -@on_device work_group_barrier(OpenCL.GLOBAL_MEM_FENCE | OpenCL.LOCAL_MEM_FENCE | OpenCL.IMAGE_MEM_FENCE) - -@on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE, OpenCL.memory_scope_work_item) -@on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE, OpenCL.memory_scope_work_group) -@on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE, OpenCL.memory_scope_device) -cl.memory_backend() isa cl.SVMBackend && @on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE, OpenCL.memory_scope_all_svm_devices) -@on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE, OpenCL.memory_scope_sub_group) +@time "WorkGroup Barrier Local Mem fence, work item scope" @on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE, OpenCL.memory_scope_work_item) +@time "WorkGroup Barrier Local Mem fence, workgroup scope" @on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE, OpenCL.memory_scope_work_group) +@time "WorkGroup Barrier Local Mem fence, device scope" @on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE, OpenCL.memory_scope_device) +@time "Skipped" cl.memory_backend() isa cl.SVMBackend && @on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE, OpenCL.memory_scope_all_svm_devices) +@time "WorkGroup Barrier Local Mem fence, subgroup scope" @on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE, OpenCL.memory_scope_sub_group) end @testset "mem_fence" begin +@warn "mem_fence" @on_device mem_fence(OpenCL.LOCAL_MEM_FENCE) @on_device mem_fence(OpenCL.GLOBAL_MEM_FENCE) @@ -56,6 +56,7 @@ end end @testset "atomic_work_item_fence" begin +@warn "atomic_work_item_fence" @on_device atomic_work_item_fence(OpenCL.LOCAL_MEM_FENCE, OpenCL.memory_order_relaxed, OpenCL.memory_scope_work_item) @on_device atomic_work_item_fence(OpenCL.GLOBAL_MEM_FENCE, OpenCL.memory_order_acquire, OpenCL.memory_scope_work_group) @@ -67,6 +68,7 @@ cl.memory_backend() isa cl.SVMBackend && @on_device atomic_work_item_fence(OpenC end @testset "math" begin +@warn "math" @testset "unary - $T" for T in float_types @testset "$f" for f in [ @@ -104,6 +106,7 @@ end hypot, (^), ] + @warn "binary - $T, $f" x = rand(T) y = rand(T) broken = ispocl && T == Float16 && f == atan @@ -115,6 +118,7 @@ end @testset "$f" for f in [ fma, ] + @warn "ternary - $T, $f" x = rand(T) y = rand(T) z = rand(T) @@ -131,6 +135,8 @@ end OpenCL.rint, OpenCL.rsqrt, ] + @warn "OpenCL-specific unary - $T, $f" + x = rand(T) broken = ispocl && T == Float16 && !(f in [OpenCL.rint, OpenCL.rsqrt]) @test call_on_device(f, x) isa Real broken = broken # Just check it doesn't error @@ -149,6 +155,8 @@ end OpenCL.nextafter, OpenCL.powr, ] + @warn "OpenCL-specific binary - $T, $f" + x = rand(T) y = rand(T) broken = ispocl && T == Float16 && !(f in [OpenCL.maxmag, OpenCL.minmag]) @@ -162,6 +170,8 @@ end x = rand(T) y = rand(T) z = rand(T) + @warn "OpenCL-specific ternary - $T" + @test call_on_device(OpenCL.mad, x, y, z) ≈ x * y + z end diff --git a/test/runtests.jl b/test/runtests.jl index 6fd437f2..96e8764f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -46,7 +46,16 @@ do_verbose, _ = extract_flag!(ARGS, "--verbose") do_quickfail, _ = extract_flag!(ARGS, "--quickfail") include("setup.jl") # make sure everything is precompiled + @info "System information:\n" * sprint(io->OpenCL.versioninfo(io)) +Sys.isapple() && @info read(`launchctl limit maxfiles`, String) +Sys.isapple() && @info read(`ulimit -a`, String) +Sys.isapple() && @info read(`df -YIh`, String) +Sys.isapple() && @info read(`vm_stat`, String) + +if Sys.isapple() + run(`sysctl hw.optional`) +end @info "Running $jobs tests in parallel. If this is too many, specify the `--jobs` argument to the tests, or set the JULIA_CPU_THREADS environment variable." diff --git a/test/setup.jl b/test/setup.jl index 90337d36..1d5062e0 100644 --- a/test/setup.jl +++ b/test/setup.jl @@ -155,9 +155,13 @@ macro on_device(ex...) $code return end - - @opencl $(kwargs...) $kernel() - cl.finish(cl.queue()) + @info "Pre-queue" + queue = cl.queue() + @info "Post-queue $(queue)" + @opencl queue $(kwargs...) $kernel() + @info "Post-launch" + cl.finish(queue) + @info "Post-sync" end end) end