diff --git a/src/ROCKernels.jl b/src/ROCKernels.jl index 946e2cdff..7b3548644 100644 --- a/src/ROCKernels.jl +++ b/src/ROCKernels.jl @@ -4,6 +4,7 @@ export ROCBackend import AMDGPU import AMDGPU.Device: @device_override +using AMDGPU: GPUArrays, rocSPARSE import Adapt import KernelAbstractions as KA @@ -22,15 +23,20 @@ function KA.device!(kab::ROCBackend, id::Int) end Adapt.adapt_storage(::ROCBackend, a::Array) = Adapt.adapt(AMDGPU.ROCArray, a) -Adapt.adapt_storage(::ROCBackend, a::AMDGPU.ROCArray) = a -Adapt.adapt_storage(::KA.CPU, a::AMDGPU.ROCArray) = convert(Array, a) +Adapt.adapt_storage(::ROCBackend, a::Union{AMDGPU.ROCArray, GPUArrays.AbstractGPUSparseArray}) = a +Adapt.adapt_storage(::KA.CPU, a::Union{AMDGPU.ROCArray, GPUArrays.AbstractGPUSparseArray}) = + Adapt.adapt(Array, a) function Adapt.adapt_storage(::KA.ConstAdaptor, a::AMDGPU.ROCDeviceArray{T}) where T ptr = LLVM.Interop.addrspacecast(Core.LLVMPtr{T,AMDGPU.Device.AS.Constant}, a.ptr) AMDGPU.ROCDeviceArray(a.dims, ptr) end -KA.argconvert(::KA.Kernel{ROCBackend}, arg) = AMDGPU.rocconvert(arg) KA.get_backend(::AMDGPU.ROCArray) = ROCBackend() +KA.get_backend(::AMDGPU.rocSPARSE.ROCSparseVector) = ROCBackend() +KA.get_backend(::AMDGPU.rocSPARSE.ROCSparseMatrixCSC) = ROCBackend() +KA.get_backend(::AMDGPU.rocSPARSE.ROCSparseMatrixCSR) = ROCBackend() + +KA.argconvert(::KA.Kernel{ROCBackend}, arg) = AMDGPU.rocconvert(arg) KA.synchronize(::ROCBackend) = AMDGPU.synchronize() KA.unsafe_free!(x::AMDGPU.ROCArray) = AMDGPU.unsafe_free!(x) diff --git a/src/sparse/array.jl b/src/sparse/array.jl index 2aba15a87..cfe7874c1 100644 --- a/src/sparse/array.jl +++ b/src/sparse/array.jl @@ -1,14 +1,8 @@ -# custom extension of ROCArray in ROCM for sparse vectors/matrices -# using CSC format for interop with Julia's native sparse functionality - export ROCSparseMatrixCSC, ROCSparseMatrixCSR, ROCSparseMatrixBSR, ROCSparseMatrixCOO export ROCSparseMatrix, AbstractROCSparseMatrix, ROCSparseVector, ROCSparseVecOrMat -abstract type AbstractROCSparseArray{Tv, Ti, N} <: AbstractSparseArray{Tv, Ti, N} end -const AbstractROCSparseVector{Tv, Ti} = AbstractROCSparseArray{Tv, Ti, 1} -const AbstractROCSparseMatrix{Tv, Ti} = AbstractROCSparseArray{Tv, Ti, 2} - -Base.convert(T::Type{<:AbstractROCSparseArray}, m::AbstractArray) = m isa T ? 
m : T(m) +abstract type AbstractROCSparseVector{Tv, Ti} <: GPUArrays.AbstractGPUSparseArray{Tv, Ti, 1} end +abstract type AbstractROCSparseMatrix{Tv, Ti} <: GPUArrays.AbstractGPUSparseArray{Tv, Ti, 2} end mutable struct ROCSparseVector{Tv, Ti} <: AbstractROCSparseVector{Tv, Ti} iPtr::ROCVector{Ti} @@ -29,7 +23,7 @@ function AMDGPU.unsafe_free!(xs::ROCSparseVector) return end -mutable struct ROCSparseMatrixCSC{Tv, Ti} <: AbstractROCSparseMatrix{Tv, Ti} +mutable struct ROCSparseMatrixCSC{Tv, Ti} <: GPUArrays.AbstractGPUSparseMatrixCSC{Tv, Ti} colPtr::ROCVector{Ti} rowVal::ROCVector{Ti} nzVal::ROCVector{Tv} @@ -43,13 +37,17 @@ mutable struct ROCSparseMatrixCSC{Tv, Ti} <: AbstractROCSparseMatrix{Tv, Ti} new{Tv, Ti}(colPtr, rowVal, nzVal, dims, length(nzVal)) end end +ROCSparseMatrixCSC{Tv, Ti}(x::ROCSparseMatrixCSC{Tv, Ti}) where {Tv, Ti} = x +ROCSparseMatrixCSC(x::ROCSparseMatrixCSC) = x -ROCSparseMatrixCSC(A::ROCSparseMatrixCSC) = A +SparseArrays.rowvals(x::T) where T <: ROCSparseVector = nonzeroinds(x) +SparseArrays.rowvals(x::ROCSparseMatrixCSC) = x.rowVal +SparseArrays.getcolptr(x::ROCSparseMatrixCSC) = x.colPtr -function AMDGPU.unsafe_free!(xs::ROCSparseMatrixCSC) - unsafe_free!(xs.colPtr) - unsafe_free!(rowvals(xs)) - unsafe_free!(nonzeros(xs)) +function AMDGPU.unsafe_free!(x::ROCSparseMatrixCSC) + unsafe_free!(x.colPtr) + unsafe_free!(rowvals(x)) + unsafe_free!(nonzeros(x)) return end @@ -63,7 +61,7 @@ GPU. Most ROCSPARSE operations work with CSR formatted matrices, rather than CSC. """ -mutable struct ROCSparseMatrixCSR{Tv, Ti} <: AbstractROCSparseMatrix{Tv, Ti} +mutable struct ROCSparseMatrixCSR{Tv, Ti} <: GPUArrays.AbstractGPUSparseMatrixCSR{Tv, Ti} rowPtr::ROCVector{Ti} colVal::ROCVector{Ti} nzVal::ROCVector{Tv} @@ -78,7 +76,8 @@ mutable struct ROCSparseMatrixCSR{Tv, Ti} <: AbstractROCSparseMatrix{Tv, Ti} end end -ROCSparseMatrixCSR(A::ROCSparseMatrixCSR) = A +ROCSparseMatrixCSR{Tv, Ti}(x::ROCSparseMatrixCSR{Tv, Ti}) where {Tv, Ti} = x +ROCSparseMatrixCSR(x::ROCSparseMatrixCSR) = x function AMDGPU.unsafe_free!(xs::ROCSparseMatrixCSR) unsafe_free!(xs.rowPtr) @@ -87,6 +86,27 @@ function AMDGPU.unsafe_free!(xs::ROCSparseMatrixCSR) return end +GPUArrays.sparse_array_type(::Type{<:ROCSparseVector}) = ROCSparseVector +GPUArrays.sparse_array_type(::Type{<:ROCSparseMatrixCSC}) = ROCSparseMatrixCSC +GPUArrays.sparse_array_type(::Type{<:ROCSparseMatrixCSR}) = ROCSparseMatrixCSR +GPUArrays.sparse_array_type(::ROCSparseVector) = ROCSparseVector +GPUArrays.sparse_array_type(::ROCSparseMatrixCSC) = ROCSparseMatrixCSC +GPUArrays.sparse_array_type(::ROCSparseMatrixCSR) = ROCSparseMatrixCSR + +GPUArrays.dense_array_type(::Type{<:ROCSparseVector}) = ROCArray +GPUArrays.dense_array_type(::Type{<:ROCSparseMatrixCSC}) = ROCArray +GPUArrays.dense_array_type(::Type{<:ROCSparseMatrixCSR}) = ROCArray +GPUArrays.dense_array_type(::ROCSparseVector) = ROCArray +GPUArrays.dense_array_type(::ROCSparseMatrixCSC) = ROCArray +GPUArrays.dense_array_type(::ROCSparseMatrixCSR) = ROCArray + +GPUArrays.csc_type(::ROCSparseMatrixCSR) = ROCSparseMatrixCSC +GPUArrays.csr_type(::ROCSparseMatrixCSC) = ROCSparseMatrixCSR +GPUArrays.coo_type(::Union{ROCSparseMatrixCSR, Transpose{<:Any,<:ROCSparseMatrixCSR}, Adjoint{<:Any,<:ROCSparseMatrixCSR}}) = ROCSparseMatrixCOO +GPUArrays.coo_type(::Union{ROCSparseMatrixCSC, Transpose{<:Any,<:ROCSparseMatrixCSC}, Adjoint{<:Any,<:ROCSparseMatrixCSC}}) = ROCSparseMatrixCOO +GPUArrays.coo_type(::Type{T}) where {T<:Union{ROCSparseMatrixCSR, Transpose{<:Any,<:ROCSparseMatrixCSR}, 
Adjoint{<:Any,<:ROCSparseMatrixCSR}}} = ROCSparseMatrixCOO +GPUArrays.coo_type(::Type{T}) where {T<:Union{ROCSparseMatrixCSC, Transpose{<:Any,<:ROCSparseMatrixCSC}, Adjoint{<:Any,<:ROCSparseMatrixCSC}}} = ROCSparseMatrixCOO + """ Container to hold sparse matrices in block compressed sparse row (BSR) format on the GPU. BSR format is also used in Intel MKL, and is suited to matrices that are @@ -280,27 +300,6 @@ function SparseArrays.findnz(S::T) where {T <: AbstractROCSparseMatrix} return (I, J, V) end -SparseArrays.nnz(g::AbstractROCSparseArray) = g.nnz -SparseArrays.nonzeros(g::AbstractROCSparseArray) = g.nzVal - -SparseArrays.nonzeroinds(g::AbstractROCSparseVector) = g.iPtr -SparseArrays.rowvals(g::AbstractROCSparseVector) = nonzeroinds(g) - -SparseArrays.rowvals(g::ROCSparseMatrixCSC) = g.rowVal -SparseArrays.getcolptr(g::ROCSparseMatrixCSC) = g.colPtr - -LinearAlgebra.issymmetric(M::Union{ROCSparseMatrixCSC,ROCSparseMatrixCSR}) = false -LinearAlgebra.ishermitian(M::Union{ROCSparseMatrixCSC,ROCSparseMatrixCSR}) = false -LinearAlgebra.issymmetric(M::Symmetric{ROCSparseMatrixCSC}) = true -LinearAlgebra.ishermitian(M::Hermitian{ROCSparseMatrixCSC}) = true - -LinearAlgebra.istriu(M::UpperTriangular{T,S}) where {T<:BlasFloat, S<:Union{<:AbstractROCSparseMatrix, Adjoint{<:Any, <:AbstractROCSparseMatrix}, Transpose{<:Any, <:AbstractROCSparseMatrix}}} = true -LinearAlgebra.istril(M::UpperTriangular{T,S}) where {T<:BlasFloat, S<:Union{<:AbstractROCSparseMatrix, Adjoint{<:Any, <:AbstractROCSparseMatrix}, Transpose{<:Any, <:AbstractROCSparseMatrix}}} = false -LinearAlgebra.istriu(M::LowerTriangular{T,S}) where {T<:BlasFloat, S<:Union{<:AbstractROCSparseMatrix, Adjoint{<:Any, <:AbstractROCSparseMatrix}, Transpose{<:Any, <:AbstractROCSparseMatrix}}} = false -LinearAlgebra.istril(M::LowerTriangular{T,S}) where {T<:BlasFloat, S<:Union{<:AbstractROCSparseMatrix, Adjoint{<:Any, <:AbstractROCSparseMatrix}, Transpose{<:Any, <:AbstractROCSparseMatrix}}} = true - -Hermitian{T}(Mat::ROCSparseMatrix{T}) where T = Hermitian{T,typeof(Mat)}(Mat,'U') - SparseArrays.nnz(g::ROCSparseMatrixBSR) = g.nnzb * g.blockDim * g.blockDim ## indexing @@ -422,20 +421,15 @@ ROCSparseMatrixCSR(x::Adjoint{T}) where {T} = ROCSparseMatrixCSR{T}(x) ROCSparseMatrixCSC(x::Transpose{T}) where {T} = ROCSparseMatrixCSC{T}(x) ROCSparseMatrixCSC(x::Adjoint{T}) where {T} = ROCSparseMatrixCSC{T}(x) +# TODO adjoint / transpose: GPUArrays._sptranspose + # gpu to cpu -SparseVector(x::ROCSparseVector) = SparseVector(length(x), Array(nonzeroinds(x)), Array(nonzeros(x))) -SparseMatrixCSC(x::ROCSparseMatrixCSC) = SparseMatrixCSC(size(x)..., Array(x.colPtr), Array(rowvals(x)), Array(nonzeros(x))) +SparseVector(x::ROCSparseVector) = SparseVector(length(x), Array(SparseArrays.nonzeroinds(x)), Array(SparseArrays.nonzeros(x))) +SparseMatrixCSC(x::ROCSparseMatrixCSC) = SparseMatrixCSC(size(x)..., Array(x.colPtr), Array(SparseArrays.rowvals(x)), Array(SparseArrays.nonzeros(x))) SparseMatrixCSC(x::ROCSparseMatrixCSR) = SparseMatrixCSC(ROCSparseMatrixCSC(x)) # no direct conversion SparseMatrixCSC(x::ROCSparseMatrixBSR) = SparseMatrixCSC(ROCSparseMatrixCSR(x)) # no direct conversion SparseMatrixCSC(x::ROCSparseMatrixCOO) = SparseMatrixCSC(ROCSparseMatrixCSR(x)) # no direct conversion -# collect to Array -Base.collect(x::ROCSparseVector) = collect(SparseVector(x)) -Base.collect(x::ROCSparseMatrixCSC) = collect(SparseMatrixCSC(x)) -Base.collect(x::ROCSparseMatrixCSR) = collect(SparseMatrixCSC(x)) -Base.collect(x::ROCSparseMatrixBSR) = 
collect(ROCSparseMatrixCSR(x)) # no direct conversion -Base.collect(x::ROCSparseMatrixCOO) = collect(ROCSparseMatrixCSR(x)) # no direct conversion - Adapt.adapt_storage(::Type{ROCArray}, xs::SparseVector) = ROCSparseVector(xs) Adapt.adapt_storage(::Type{ROCArray}, xs::SparseMatrixCSC) = ROCSparseMatrixCSC(xs) Adapt.adapt_storage(::Type{ROCArray{T}}, xs::SparseVector) where {T} = ROCSparseVector{T}(xs) @@ -451,27 +445,6 @@ Adapt.adapt_storage(::Type{Array}, xs::ROCSparseMatrixCSC) = SparseMatrixCSC(xs) ## copying between sparse GPU arrays -function Base.copyto!(dst::ROCSparseVector, src::ROCSparseVector) - if length(dst) != length(src) - throw(ArgumentError("Inconsistent Sparse Vector size")) - end - copyto!(nonzeroinds(dst), nonzeroinds(src)) - copyto!(nonzeros(dst), nonzeros(src)) - dst.nnz = src.nnz - dst -end - -function Base.copyto!(dst::ROCSparseMatrixCSC, src::ROCSparseMatrixCSC) - if size(dst) != size(src) - throw(ArgumentError("Inconsistent Sparse Matrix size")) - end - copyto!(dst.colPtr, src.colPtr) - copyto!(rowvals(dst), rowvals(src)) - copyto!(nonzeros(dst), nonzeros(src)) - dst.nnz = src.nnz - dst -end - function Base.copyto!(dst::ROCSparseMatrixCSR, src::ROCSparseMatrixCSR) if size(dst) != size(src) throw(ArgumentError("Inconsistent Sparse Matrix size")) @@ -553,24 +526,57 @@ end # interop with device arrays Adapt.adapt_structure(to::AMDGPU.Runtime.Adaptor, x::ROCSparseVector) = - ROCSparseDeviceVector(adapt(to, x.iPtr), adapt(to, x.nzVal), length(x), x.nnz) + GPUArrays.GPUSparseDeviceVector(adapt(to, x.iPtr), adapt(to, x.nzVal), length(x), x.nnz) Adapt.adapt_structure(to::AMDGPU.Runtime.Adaptor, x::ROCSparseMatrixCSR) = - ROCSparseDeviceMatrixCSR( - adapt(to, x.rowPtr), adapt(to, x.colVal), adapt(to, x.nzVal), - size(x), x.nnz) + GPUArrays.GPUSparseDeviceMatrixCSR( + adapt(to, x.rowPtr), adapt(to, x.colVal), adapt(to, x.nzVal), size(x), x.nnz) Adapt.adapt_structure(to::AMDGPU.Runtime.Adaptor, x::ROCSparseMatrixCSC) = - ROCSparseDeviceMatrixCSC( - adapt(to, x.colPtr), adapt(to, x.rowVal), adapt(to, x.nzVal), - size(x), x.nnz) + GPUArrays.GPUSparseDeviceMatrixCSC( + adapt(to, x.colPtr), adapt(to, x.rowVal), adapt(to, x.nzVal), size(x), x.nnz) Adapt.adapt_structure(to::AMDGPU.Runtime.Adaptor, x::ROCSparseMatrixBSR) = - ROCSparseDeviceMatrixBSR( + GPUArrays.GPUSparseDeviceMatrixBSR( adapt(to, x.rowPtr), adapt(to, x.colVal), adapt(to, x.nzVal), size(x), x.blockDim, x.dir, x.nnzb) Adapt.adapt_structure(to::AMDGPU.Runtime.Adaptor, x::ROCSparseMatrixCOO) = - ROCSparseDeviceMatrixCOO( - adapt(to, x.rowInd), adapt(to, x.colInd), adapt(to, x.nzVal), - size(x), x.nnz) + GPUArrays.GPUSparseDeviceMatrixCOO( + adapt(to, x.rowInd), adapt(to, x.colInd), adapt(to, x.nzVal), size(x), x.nnz) + +# device array ctors + +GPUArrays.GPUSparseDeviceVector( + iPtr::ROCDeviceVector{Ti, A}, nzVal::ROCDeviceVector{Tv, A}, len::Int, nnz::Ti, +) where {Ti, Tv, A} = GPUArrays.GPUSparseDeviceVector{ + Tv, Ti, ROCDeviceVector{Ti, A}, ROCDeviceVector{Tv, A}, A, +}(iPtr, nzVal, len, nnz) + +GPUArrays.GPUSparseDeviceMatrixCSR( + rowPtr::ROCDeviceVector{Ti, A}, colVal::ROCDeviceVector{Ti, A}, nzVal::ROCDeviceVector{Tv, A}, + dims::NTuple{2, Int}, nnz::Ti, +) where {Ti, Tv, A} = GPUArrays.GPUSparseDeviceMatrixCSR{ + Tv, Ti, ROCDeviceVector{Ti, A}, ROCDeviceVector{Tv, A}, A, +}(rowPtr, colVal, nzVal, dims, nnz) + +GPUArrays.GPUSparseDeviceMatrixCSC( + colPtr::ROCDeviceVector{Ti, A}, rowVal::ROCDeviceVector{Ti, A}, nzVal::ROCDeviceVector{Tv, A}, + dims::NTuple{2, Int}, nnz::Ti, +) where {Ti, Tv, A} = 
GPUArrays.GPUSparseDeviceMatrixCSC{ + Tv, Ti, ROCDeviceVector{Ti, A}, ROCDeviceVector{Tv, A}, A, +}(colPtr, rowVal, nzVal, dims, nnz) + +GPUArrays.GPUSparseDeviceMatrixBSR( + rowPtr::ROCDeviceVector{Ti, A}, colVal::ROCDeviceVector{Ti, A}, nzVal::ROCDeviceVector{Tv, A}, + dims::NTuple{2, Int}, blockDim::Ti, dir::Char, nnz::Ti, +) where {Ti, Tv, A} = GPUArrays.GPUSparseDeviceMatrixBSR{ + Tv, Ti, ROCDeviceVector{Ti, A}, ROCDeviceVector{Tv, A}, A, +}(rowPtr, colVal, nzVal, dims, blockDim, dir, nnz) + +GPUArrays.GPUSparseDeviceMatrixCOO( + rowInd::ROCDeviceVector{Ti, A}, colInd::ROCDeviceVector{Ti, A}, nzVal::ROCDeviceVector{Tv, A}, + dims::NTuple{2, Int}, nnz::Ti, +) where {Ti, Tv, A} = GPUArrays.GPUSparseDeviceMatrixCOO{ + Tv, Ti, ROCDeviceVector{Ti, A}, ROCDeviceVector{Tv, A}, A, +}(rowInd, colInd, nzVal, dims, nnz) diff --git a/src/sparse/broadcast.jl b/src/sparse/broadcast.jl deleted file mode 100644 index c002ca471..000000000 --- a/src/sparse/broadcast.jl +++ /dev/null @@ -1,591 +0,0 @@ -# TODO: support more types (SparseVector, SparseMatrixCSC, COO, BSR) - -## sparse broadcast style - -# broadcast container type promotion for combinations of sparse arrays and other types -struct ROCSparseVecStyle <: Broadcast.AbstractArrayStyle{1} end -struct ROCSparseMatStyle <: Broadcast.AbstractArrayStyle{2} end -Broadcast.BroadcastStyle(::Type{<:ROCSparseVector}) = ROCSparseVecStyle() -Broadcast.BroadcastStyle(::Type{<:ROCSparseMatrix}) = ROCSparseMatStyle() -const SPVM = Union{ROCSparseVecStyle,ROCSparseMatStyle} - -# ROCSparseVecStyle handles 0-1 dimensions, ROCSparseMatStyle 0-2 dimensions. -# ROCSparseVecStyle promotes to ROCSparseMatStyle for 2 dimensions. -# Fall back to DefaultArrayStyle for higher dimensionality. -ROCSparseVecStyle(::Val{0}) = ROCSparseVecStyle() -ROCSparseVecStyle(::Val{1}) = ROCSparseVecStyle() -ROCSparseVecStyle(::Val{2}) = ROCSparseMatStyle() -ROCSparseVecStyle(::Val{N}) where N = Broadcast.DefaultArrayStyle{N}() -ROCSparseMatStyle(::Val{0}) = ROCSparseMatStyle() -ROCSparseMatStyle(::Val{1}) = ROCSparseMatStyle() -ROCSparseMatStyle(::Val{2}) = ROCSparseMatStyle() -ROCSparseMatStyle(::Val{N}) where N = Broadcast.DefaultArrayStyle{N}() - -Broadcast.BroadcastStyle(::ROCSparseVecStyle, ::ROCArrayStyle{1}) = ROCSparseVecStyle() -Broadcast.BroadcastStyle(::ROCSparseVecStyle, ::ROCArrayStyle{2}) = ROCSparseMatStyle() -Broadcast.BroadcastStyle(::ROCSparseMatStyle, ::ROCArrayStyle{2}) = ROCSparseMatStyle() - -# don't wrap sparse arrays with Extruded -Broadcast.extrude(x::ROCSparseVecOrMat) = x - -## detection of zero-preserving functions - -# modified from SparseArrays.jl - -# capturescalars takes a function (f) and a tuple of broadcast arguments, and returns a -# partially-evaluated function and a reduced argument tuple where all scalar operations have -# been applied already. -@inline function capturescalars(f, mixedargs) - let (passedsrcargstup, makeargs) = _capturescalars(mixedargs...) - parevalf = (passed...) -> f(makeargs(passed...)...) - return (parevalf, passedsrcargstup) - end -end -# Work around losing Type{T}s as DataTypes within the tuple that makeargs creates -@inline capturescalars(f, mixedargs::Tuple{Ref{Type{T}}, Vararg{Any}}) where {T} = - capturescalars((args...)->f(T, args...), Base.tail(mixedargs)) -@inline capturescalars(f, mixedargs::Tuple{Ref{Type{T}}, Ref{Type{S}}, Vararg{Any}}) where {T, S} = - # This definition is identical to the one above and necessary only for - # avoiding method ambiguity. 
- capturescalars((args...)->f(T, args...), Base.tail(mixedargs)) -@inline capturescalars(f, mixedargs::Tuple{ROCSparseVecOrMat, Ref{Type{T}}, Vararg{Any}}) where {T} = - capturescalars((a1, args...)->f(a1, T, args...), (mixedargs[1], Base.tail(Base.tail(mixedargs))...)) -@inline capturescalars(f, mixedargs::Tuple{Union{Ref,AbstractArray{<:Any,0}}, Ref{Type{T}}, Vararg{Any}}) where {T} = - capturescalars((args...)->f(mixedargs[1], T, args...), Base.tail(Base.tail(mixedargs))) - -scalararg(::Number) = true -scalararg(::Any) = false -scalarwrappedarg(::Union{AbstractArray{<:Any,0},Ref}) = true -scalarwrappedarg(::Any) = false - -@inline function _capturescalars() - return (), () -> () -end -@inline function _capturescalars(arg, mixedargs...) - let (rest, f) = _capturescalars(mixedargs...) - if scalararg(arg) - return rest, @inline function(tail...) - (arg, f(tail...)...) - end # add back scalararg after (in makeargs) - elseif scalarwrappedarg(arg) - return rest, @inline function(tail...) - (arg[], f(tail...)...) # TODO: This can put a Type{T} in a tuple - end # unwrap and add back scalararg after (in makeargs) - else - return (arg, rest...), @inline function(head, tail...) - (head, f(tail...)...) - end # pass-through to broadcast - end - end -end -@inline function _capturescalars(arg) # this definition is just an optimization (to bottom out the recursion slightly sooner) - if scalararg(arg) - return (), () -> (arg,) # add scalararg - elseif scalarwrappedarg(arg) - return (), () -> (arg[],) # unwrap - else - return (arg,), (head,) -> (head,) # pass-through - end -end - -@inline _iszero(x) = x == 0 -@inline _iszero(x::Number) = Base.iszero(x) -@inline _iszero(x::AbstractArray) = Base.iszero(x) -@inline _zeros_eltypes(A) = (zero(eltype(A)),) -@inline _zeros_eltypes(A, Bs...) = (zero(eltype(A)), _zeros_eltypes(Bs...)...) - -## iteration helpers - -""" - CSRIterator{Ti}(row, args...) - -A GPU-compatible iterator for accessing the elements of a single row `row` of several CSR -matrices `args` in one go. The row should be in-bounds for every sparse argument. Each -iteration returns a 2-element tuple: The current column, and each arguments' pointer index -(or 0 if that input didn't have an element at that column). The pointers can then be used to -access the elements themselves. - -For convenience, this iterator can be passed non-sparse arguments as well, which will be -ignored (with the returned `col`/`ptr` values set to 0). 
-""" -struct CSRIterator{Ti,N,ATs} - row::Ti - col_ends::NTuple{N, Ti} - args::ATs -end - -function CSRIterator{Ti}(row, args::Vararg{Any, N}) where {Ti,N} - # check that `row` is valid for all arguments - @boundscheck begin - ntuple(Val(N)) do i - arg = @inbounds args[i] - arg isa ROCSparseDeviceMatrixCSR && checkbounds(arg, row, 1) - end - end - - col_ends = ntuple(Val(N)) do i - arg = @inbounds args[i] - if arg isa ROCSparseDeviceMatrixCSR - @inbounds(arg.rowPtr[row+1]) - else - zero(Ti) - end - end - - CSRIterator{Ti, N, typeof(args)}(row, col_ends, args) -end - -@inline function Base.iterate(iter::CSRIterator{Ti,N}, state=nothing) where {Ti,N} - # helper function to get the column of a sparse array at a specific pointer - @inline function get_col(i, ptr) - arg = @inbounds iter.args[i] - if arg isa ROCSparseDeviceMatrixCSR - col_end = @inbounds iter.col_ends[i] - if ptr < col_end - return @inbounds arg.colVal[ptr] % Ti - end - end - typemax(Ti) - end - - # initialize the state - # - ptr: the current index into the colVal/nzVal arrays - # - col: the current column index (cached so that we don't have to re-read each time) - state = something(state, - ntuple(Val(N)) do i - arg = @inbounds iter.args[i] - if arg isa ROCSparseDeviceMatrixCSR - ptr = @inbounds iter.args[i].rowPtr[iter.row] % Ti - col = @inbounds get_col(i, ptr) - else - ptr = typemax(Ti) - col = typemax(Ti) - end - (; ptr, col) - end - ) - - # determine the column we're currently processing - cols = ntuple(i -> @inbounds(state[i].col), Val(N)) - cur_col = min(cols...) - cur_col == typemax(Ti) && return - - # fetch the pointers (we don't look up the values, as the caller might want to index - # the sparse array directly, e.g., to mutate it). we don't return `ptrs` from the state - # directly, but first convert the `typemax(Ti)` to a more convenient zero value. - # NOTE: these values may end up unused by the caller (e.g. in the count_nnzs kernels), - # but LLVM appears smart enough to filter them away. - ptrs = ntuple(Val(N)) do i - ptr, col = @inbounds state[i] - col == cur_col ? 
ptr : zero(Ti) - end - - # advance the state - new_state = ntuple(Val(N)) do i - ptr, col = @inbounds state[i] - if col == cur_col - ptr += one(Ti) - col = get_col(i, ptr) - end - (; ptr, col) - end - - return (cur_col, ptrs), new_state -end - -struct CSCIterator{Ti,N,ATs} - col::Ti - row_ends::NTuple{N, Ti} - args::ATs -end - -function CSCIterator{Ti}(col, args::Vararg{Any, N}) where {Ti,N} - # check that `col` is valid for all arguments - @boundscheck begin - ntuple(Val(N)) do i - arg = @inbounds args[i] - arg isa ROCSparseDeviceMatrixCSR && checkbounds(arg, 1, col) - end - end - - row_ends = ntuple(Val(N)) do i - arg = @inbounds args[i] - x = if arg isa ROCSparseDeviceMatrixCSC - @inbounds(arg.colPtr[col+1]) - else - zero(Ti) - end - x - end - - CSCIterator{Ti, N, typeof(args)}(col, row_ends, args) -end - -@inline function Base.iterate(iter::CSCIterator{Ti,N}, state=nothing) where {Ti,N} - # helper function to get the column of a sparse array at a specific pointer - @inline function get_col(i, ptr) - arg = @inbounds iter.args[i] - if arg isa ROCSparseDeviceMatrixCSC - col_end = @inbounds iter.row_ends[i] - if ptr < col_end - return @inbounds arg.rowVal[ptr] % Ti - end - end - typemax(Ti) - end - - # initialize the state - # - ptr: the current index into the rowVal/nzVal arrays - # - row: the current row index (cached so that we don't have to re-read each time) - state = something(state, - ntuple(Val(N)) do i - arg = @inbounds iter.args[i] - if arg isa ROCSparseDeviceMatrixCSC - ptr = @inbounds iter.args[i].colPtr[iter.col] % Ti - row = @inbounds get_col(i, ptr) - else - ptr = typemax(Ti) - row = typemax(Ti) - end - (; ptr, row) - end - ) - - # determine the row we're currently processing - rows = ntuple(i -> @inbounds(state[i].row), Val(N)) - cur_row = min(rows...) - cur_row == typemax(Ti) && return - - # fetch the pointers (we don't look up the values, as the caller might want to index - # the sparse array directly, e.g., to mutate it). we don't return `ptrs` from the state - # directly, but first convert the `typemax(Ti)` to a more convenient zero value. - # NOTE: these values may end up unused by the caller (e.g. in the count_nnzs kernels), - # but LLVM appears smart enough to filter them away. - ptrs = ntuple(Val(N)) do i - ptr, row = @inbounds state[i] - row == cur_row ? ptr : zero(Ti) - end - - # advance the state - new_state = ntuple(Val(N)) do i - ptr, row = @inbounds state[i] - if row == cur_row - ptr += one(Ti) - row = get_col(i, ptr) - end - (; ptr, row) - end - - return (cur_row, ptrs), new_state -end - -# helpers to index a sparse or dense array -function _getindex(arg::Union{ROCSparseDeviceMatrixCSR,ROCSparseDeviceMatrixCSC}, I, ptr) - if ptr == 0 - zero(eltype(arg)) - else - @inbounds arg.nzVal[ptr] - end -end -_getindex(arg, I, ptr) = Broadcast._broadcast_getindex(arg, I) - - -## sparse broadcast implementation - -# TODO: unify CSC/CSR kernels - -# kernel to count the number of non-zeros in a row, to determine the row offsets -function compute_offsets_kernel( - ::Type{<:ROCSparseMatrixCSR}, offsets::ROCDeviceVector{Ti}, args..., -) where Ti - # every thread processes an entire row - row::UInt32 = threadIdx().x + (blockIdx().x - 1) * blockDim().x - row > length(offsets) - 1 && return - iter = @inbounds CSRIterator{Ti}(row, args...) 
- - # count the nonzero columns of all inputs - accum = zero(Ti) - for (col, vals) in iter - accum += one(Ti) - end - - # the way we write the nnz counts is a bit strange, but done so that the result - # after accumulation can be directly used as the rowPtr array of a CSR matrix. - @inbounds begin - if row == 1 - offsets[1] = 1 - end - offsets[row+1] = accum - end - return -end - -function compute_offsets_kernel( - ::Type{<:ROCSparseMatrixCSC}, offsets::ROCDeviceVector{Ti}, args..., -) where Ti - # every thread processes an entire columm - col::UInt32 = threadIdx().x + (blockIdx().x - 1) * blockDim().x - col > length(offsets)-1 && return - iter = @inbounds CSCIterator{Ti}(col, args...) - - # count the nonzero columns of all inputs - accum = zero(Ti) - for (col, vals) in iter - accum += one(Ti) - end - - # the way we write the nnz counts is a bit strange, but done so that the result - # after accumulation can be directly used as the colPtr array of a CSC matrix. - @inbounds begin - if col == 1 - offsets[1] = 1 - end - offsets[col+1] = accum - end - return -end - -# broadcast kernels that iterate the elements of sparse arrays -function sparse_to_sparse_broadcast_kernel( - f, output::ROCSparseDeviceMatrixCSR{<:Any,Ti}, - offsets::Union{ROCDeviceVector,Nothing}, args..., -) where Ti - # every thread processes an entire row - row::UInt32 = threadIdx().x + (blockIdx().x - 1) * blockDim().x - row > size(output, 1) && return - iter = @inbounds CSRIterator{Ti}(row, args...) - - # fetch the row offset, and write it to the output - @inbounds begin - output_ptr = output.rowPtr[row] = offsets[row] - if row == size(output, 1) - output.rowPtr[row+1] = offsets[row+1] - end - end - - # set the values for this row - for (col, ptrs) in iter - I = CartesianIndex(row, col) - vals = ntuple(Val(length(args))) do i - arg = @inbounds args[i] - ptr = @inbounds ptrs[i] - _getindex(arg, I, ptr) - end - - @inbounds output.colVal[output_ptr] = col - @inbounds output.nzVal[output_ptr] = f(vals...) - output_ptr += one(Ti) - end - return -end - -function sparse_to_sparse_broadcast_kernel( - f, output::ROCSparseDeviceMatrixCSC{<:Any,Ti}, - offsets::Union{ROCDeviceVector,Nothing}, args..., -) where Ti - # every thread processes an entire column - col::UInt32 = threadIdx().x + (blockIdx().x - 1) * blockDim().x - col > size(output, 2) && return - iter = @inbounds CSCIterator{Ti}(col, args...) - - # fetch the column offset, and write it to the output - @inbounds begin - output_ptr = output.colPtr[col] = offsets[col] - if col == size(output, 2) - output.colPtr[col+1] = offsets[col+1] - end - end - - # set the values for this col - for (row, ptrs) in iter - I = CartesianIndex(col, row) - vals = ntuple(Val(length(args))) do i - arg = @inbounds args[i] - ptr = @inbounds ptrs[i] - _getindex(arg, I, ptr) - end - - @inbounds output.rowVal[output_ptr] = row - @inbounds output.nzVal[output_ptr] = f(vals...) - output_ptr += one(Ti) - end - return -end - -function sparse_to_dense_broadcast_kernel( - ::Type{<:ROCSparseMatrixCSR}, f, output::ROCDeviceArray, args..., -) - # every thread processes an entire row - row::UInt32 = threadIdx().x + (blockIdx().x - 1) * blockDim().x - row > size(output, 1) && return - iter = @inbounds CSRIterator{UInt32}(row, args...) - - # set the values for this row - for (col, ptrs) in iter - I = CartesianIndex(row, col) - vals = ntuple(Val(length(args))) do i - arg = @inbounds args[i] - ptr = @inbounds ptrs[i] - _getindex(arg, I, ptr) - end - - @inbounds output[I] = f(vals...) 
- end - return -end - -function sparse_to_dense_broadcast_kernel( - ::Type{<:ROCSparseMatrixCSC}, f, output::ROCDeviceArray, args..., -) - # every thread processes an entire column - col::UInt32 = threadIdx().x + (blockIdx().x - 1) * blockDim().x - col > size(output, 2) && return - iter = @inbounds CSCIterator{UInt32}(col, args...) - - # set the values for this col - for (row, ptrs) in iter - I = CartesianIndex(row, col) - vals = ntuple(Val(length(args))) do i - arg = @inbounds args[i] - ptr = @inbounds ptrs[i] - _getindex(arg, I, ptr) - end - - @inbounds output[I] = f(vals...) - end - return -end - -function Broadcast.copy(bc::Broadcasted{<:Union{ROCSparseVecStyle,ROCSparseMatStyle}}) - # find the sparse inputs - bc = Broadcast.flatten(bc) - sparse_args = findall(arg -> arg isa AbstractROCSparseArray, bc.args) - - sparse_types = unique(map(i -> nameof(typeof(bc.args[i])), sparse_args)) - length(sparse_types) > 1 && error( - "broadcast with multiple types of sparse arrays ($(join(sparse_types, ", "))) is not supported") - - sparse_typ = typeof(bc.args[first(sparse_args)]) - sparse_typ <: Union{ROCSparseMatrixCSR,ROCSparseMatrixCSC} || error( - "broadcast with sparse arrays is currently only implemented for CSR and CSC matrices") - - Ti = if sparse_typ <: ROCSparseMatrixCSR - reduce(promote_type, map(i -> eltype(bc.args[i].rowPtr), sparse_args)) - elseif sparse_typ <: ROCSparseMatrixCSC - reduce(promote_type, map(i -> eltype(bc.args[i].colPtr), sparse_args)) - end - - # determine the output type - Tv = Broadcast.combine_eltypes(bc.f, eltype.(bc.args)) - Base.isconcretetype(Tv) || error(""" - GPU sparse broadcast resulted in non-concrete element type $Tv. - This probably means that the function you are broadcasting contains an error or type instability. - """) - - # partially-evaluate the function, removing scalars. - parevalf, passedsrcargstup = capturescalars(bc.f, bc.args) - # check if the partially-evaluated function preserves zeros. if so, we'll only need to - # apply it to the sparse input arguments, preserving the sparse structure. - if all(arg -> isa(arg, AbstractSparseArray), passedsrcargstup) - fofzeros = parevalf(_zeros_eltypes(passedsrcargstup...)...) - fpreszeros = _iszero(fofzeros) - else - fpreszeros = false - end - - # the kernels below parallelize across rows or cols, not elements, so it's unlikely - # we'll launch many threads. to maximize utilization, parallelize across blocks first. - rows, cols = size(bc) - function compute_launch_config() - # config = launch_configuration(kernel.fun) - if sparse_typ <: ROCSparseMatrixCSR - threads = rows - blocks = cld(rows, threads) - threads = cld(rows, blocks) - elseif sparse_typ <: ROCSparseMatrixCSC - threads = cols - blocks = cld(cols, threads) - threads = cld(cols, blocks) - end - (; threads, blocks) - end - - # allocate the output container - if !fpreszeros - # either we have dense inputs, or the function isn't preserving zeros, - # so use a dense output to broadcast into. - output = ROCArray{Tv}(undef, size(bc)) - - # since we'll be iterating the sparse inputs, we need to pre-fill the dense output - # with appropriate values (while setting the sparse inputs to zero). we do this by - # re-using the dense broadcast implementation. - nonsparse_args = map(bc.args) do arg - # NOTE: this assumes the broadcst is flattened, but not yet preprocessed - if arg isa AbstractROCSparseArray - zero(eltype(arg)) - else - arg - end - end - broadcast!(bc.f, output, nonsparse_args...) 
- elseif length(sparse_args) == 1 - # we only have a single sparse input, so we can reuse its structure for the output. - # this avoids a kernel launch and costly synchronization. - sparse_arg = bc.args[first(sparse_args)] - if sparse_typ <: ROCSparseMatrixCSR - offsets = rowPtr = sparse_arg.rowPtr - colVal = similar(sparse_arg.colVal) - nzVal = similar(sparse_arg.nzVal, Tv) - output = ROCSparseMatrixCSR(rowPtr, colVal, nzVal, size(bc)) - elseif sparse_typ <: ROCSparseMatrixCSC - offsets = colPtr = sparse_arg.colPtr - rowVal = similar(sparse_arg.rowVal) - nzVal = similar(sparse_arg.nzVal, Tv) - output = ROCSparseMatrixCSC(colPtr, rowVal, nzVal, size(bc)) - end - # NOTE: we don't use CUSPARSE's similar, because that copies the structure arrays, - # while we do that in our kernel (for consistency with other code paths) - else - # determine the number of non-zero elements per row so that we can create an - # appropriately-structured output container - offsets = if sparse_typ <: ROCSparseMatrixCSR - ROCArray{Ti}(undef, rows+1) - elseif sparse_typ <: ROCSparseMatrixCSC - ROCArray{Ti}(undef, cols+1) - end - let - args = (sparse_typ, offsets, bc.args...) - groupsize, gridsize = compute_launch_config() - @roc gridsize=gridsize groupsize=groupsize compute_offsets_kernel(args...) - end - - # accumulate these values so that we can use them directly as row pointer offsets, - # as well as to get the total nnz count to allocate the sparse output array. - # cusparseXcsrgeam2Nnz computes this in one go, but it doesn't seem worth the effort - accumulate!(Base.add_sum, offsets, offsets) # TODO implement accumulate - total_nnz = @allowscalar last(offsets[end]) - 1 - - output = if sparse_typ <: ROCSparseMatrixCSR - colVal = ROCArray{Ti}(undef, total_nnz) - nzVal = ROCArray{Tv}(undef, total_nnz) - ROCSparseMatrixCSR(offsets, colVal, nzVal, size(bc)) - elseif sparse_typ <: ROCSparseMatrixCSC - rowVal = ROCArray{Ti}(undef, total_nnz) - nzVal = ROCArray{Tv}(undef, total_nnz) - ROCSparseMatrixCSC(offsets, rowVal, nzVal, size(bc)) - end - end - - # perform the actual broadcast - if output isa AbstractROCSparseArray - args = (bc.f, output, offsets, bc.args...) - groupsize, gridsize = compute_launch_config() - @roc gridsize=gridsize groupsize=groupsize sparse_to_sparse_broadcast_kernel(args...) - else - args = (sparse_typ, bc.f, output, bc.args...) - groupsize, gridsize = compute_launch_config() - @roc gridsize=gridsize groupsize=groupsize sparse_to_dense_broadcast_kernel(args...) 
- end - - return output -end diff --git a/src/sparse/device.jl b/src/sparse/device.jl deleted file mode 100644 index 70a739601..000000000 --- a/src/sparse/device.jl +++ /dev/null @@ -1,121 +0,0 @@ -# on-device sparse array functionality - -# NOTE: this functionality is currently very bare-bones, only defining the array types -# without any device-compatible sparse array functionality - -# core types - -export ROCSparseDeviceVector, ROCSparseDeviceMatrixCSC, ROCSparseDeviceMatrixCSR -export ROCSparseDeviceMatrixBSR, ROCSparseDeviceMatrixCOO - -struct ROCSparseDeviceVector{Tv,Ti, A} <: AbstractSparseVector{Tv,Ti} - iPtr::ROCDeviceVector{Ti, A} - nzVal::ROCDeviceVector{Tv, A} - len::Int - nnz::Ti -end - -Base.length(g::ROCSparseDeviceVector) = prod(g.dims) -Base.size(g::ROCSparseDeviceVector) = (g.len,) -Base.ndims(g::ROCSparseDeviceVector) = 1 -Base.hash(x::T, h::UInt) where T <: ROCSparseDeviceVector = - foldr(hash, (x.iPtr, x.nzVal, x.len, x.nnz, T, h)) -SparseArrays.nnz(g::ROCSparseDeviceVector) = g.nnz - -struct ROCSparseDeviceMatrixCSC{Tv,Ti,A} <: AbstractSparseMatrix{Tv,Ti} - colPtr::ROCDeviceVector{Ti, A} - rowVal::ROCDeviceVector{Ti, A} - nzVal::ROCDeviceVector{Tv, A} - dims::NTuple{2,Int} - nnz::Ti -end - -Base.length(g::ROCSparseDeviceMatrixCSC) = prod(g.dims) -Base.size(g::ROCSparseDeviceMatrixCSC) = g.dims -Base.ndims(g::ROCSparseDeviceMatrixCSC) = 2 -Base.hash(x::T, h::UInt) where T <: ROCSparseDeviceMatrixCSC = - foldr(hash, (x.colPtr, x.rowVal, x.nzVal, x.dims, x.nnz, T, h)) -SparseArrays.nnz(g::ROCSparseDeviceMatrixCSC) = g.nnz - -struct ROCSparseDeviceMatrixCSR{Tv,Ti,A} <: AbstractSparseMatrix{Tv,Ti} - rowPtr::ROCDeviceVector{Ti, A} - colVal::ROCDeviceVector{Ti, A} - nzVal::ROCDeviceVector{Tv, A} - dims::NTuple{2, Int} - nnz::Ti -end - -Base.length(g::ROCSparseDeviceMatrixCSR) = prod(g.dims) -Base.size(g::ROCSparseDeviceMatrixCSR) = g.dims -Base.ndims(g::ROCSparseDeviceMatrixCSR) = 2 -Base.hash(x::T, h::UInt) where T <: ROCSparseDeviceMatrixCSR = - foldr(hash, (x.rowPtr, x.colVal, x.nzVal, x.dims, x.nnz, T, h)) -SparseArrays.nnz(g::ROCSparseDeviceMatrixCSR) = g.nnz - -struct ROCSparseDeviceMatrixBSR{Tv,Ti,A} <: AbstractSparseMatrix{Tv,Ti} - rowPtr::ROCDeviceVector{Ti, A} - colVal::ROCDeviceVector{Ti, A} - nzVal::ROCDeviceVector{Tv, A} - dims::NTuple{2,Int} - blockDim::Ti - dir::Char - nnz::Ti -end - -Base.length(g::ROCSparseDeviceMatrixBSR) = prod(g.dims) -Base.size(g::ROCSparseDeviceMatrixBSR) = g.dims -Base.ndims(g::ROCSparseDeviceMatrixBSR) = 2 -Base.hash(x::T, h::UInt) where T <: ROCSparseDeviceMatrixBSR = - foldr(hash, (x.rowPtr, x.colVal, x.nzVal, x.dims, x.blockDim, x.dir, x.nnz, T, h)) -SparseArrays.nnz(g::ROCSparseDeviceMatrixBSR) = g.nnz - -struct ROCSparseDeviceMatrixCOO{Tv,Ti,A} <: AbstractSparseMatrix{Tv,Ti} - rowInd::ROCDeviceVector{Ti, A} - colInd::ROCDeviceVector{Ti, A} - nzVal::ROCDeviceVector{Tv, A} - dims::NTuple{2,Int} - nnz::Ti -end - -Base.length(g::ROCSparseDeviceMatrixCOO) = prod(g.dims) -Base.size(g::ROCSparseDeviceMatrixCOO) = g.dims -Base.ndims(g::ROCSparseDeviceMatrixCOO) = 2 -Base.hash(x::T, h::UInt) where T <: ROCSparseDeviceMatrixCOO = - foldr(hash, (x.rowInd, x.colInd, x.nzVal, x.dims, x.nnz, T, h)) -SparseArrays.nnz(g::ROCSparseDeviceMatrixCOO) = g.nnz - -# input/output - -function Base.show(io::IO, ::MIME"text/plain", A::ROCSparseDeviceVector) - println(io, "$(length(A))-element device sparse vector at:") - println(io, " iPtr: $(A.iPtr)") - print(io, " nzVal: $(A.nzVal)") -end - -function Base.show(io::IO, ::MIME"text/plain", 
A::ROCSparseDeviceMatrixCSR) - println(io, "$(length(A))-element device sparse matrix CSR at:") - println(io, " rowPtr: $(A.rowPtr)") - println(io, " colVal: $(A.colVal)") - print(io, " nzVal: $(A.nzVal)") -end - -function Base.show(io::IO, ::MIME"text/plain", A::ROCSparseDeviceMatrixCSC) - println(io, "$(length(A))-element device sparse matrix CSC at:") - println(io, " colPtr: $(A.colPtr)") - println(io, " rowVal: $(A.rowVal)") - print(io, " nzVal: $(A.nzVal)") -end - -function Base.show(io::IO, ::MIME"text/plain", A::ROCSparseDeviceMatrixBSR) - println(io, "$(length(A))-element device sparse matrix BSR at:") - println(io, " rowPtr: $(A.rowPtr)") - println(io, " colVal: $(A.colVal)") - print(io, " nzVal: $(A.nzVal)") -end - -function Base.show(io::IO, ::MIME"text/plain", A::ROCSparseDeviceMatrixCOO) - println(io, "$(length(A))-element device sparse matrix COO at:") - println(io, " rowPtr: $(A.rowPtr)") - println(io, " colInd: $(A.colInd)") - print(io, " nzVal: $(A.nzVal)") -end diff --git a/src/sparse/interfaces.jl b/src/sparse/interfaces.jl index 4299bd35e..fe77ab835 100644 --- a/src/sparse/interfaces.jl +++ b/src/sparse/interfaces.jl @@ -242,3 +242,5 @@ for SparseMatrixType in [:ROCSparseMatrixCSC, :ROCSparseMatrixCSR], op in [:(+), end end end + +# TODO _sptranspose / _spadjoint diff --git a/src/sparse/linalg.jl b/src/sparse/linalg.jl index 934d29c3b..83374ef3e 100644 --- a/src/sparse/linalg.jl +++ b/src/sparse/linalg.jl @@ -1,43 +1 @@ -function sum_dim1(A::ROCSparseMatrixCSR) - function kernel(Tnorm, out, dA) - idx::UInt32 = (blockIdx().x-1) * blockDim().x + threadIdx().x - idx < length(dA.rowPtr) || return - s = zero(Tnorm) - for k in dA.rowPtr[idx]:dA.rowPtr[idx+1]-1 - s += abs(dA.nzVal[k]) - end - out[idx] = s - return - end - - m, n = size(A) - Tnorm = typeof(float(real(zero(eltype(A))))) - Tsum = promote_type(Float64,Tnorm) - rowsum = AMDGPU.ROCArray{Tsum}(undef, m) - groupsize = n - gridsize = cld(n, groupsize) - @roc gridsize=gridsize groupsize=groupsize kernel(Tnorm, rowsum, A) - return rowsum -end - -function LinearAlgebra.opnorm(A::ROCSparseMatrixCSR, p::Real=2) - if p == Inf - return maximum(sum_dim1(A)) - else - error("p=$p is not supported") - end -end - -LinearAlgebra.opnorm(A::ROCSparseMatrixCSC, p::Real=2) = opnorm(ROCSparseMatrixCSR(A), p) - -# work around upstream breakage from JuliaLang/julia#55547 -@static if VERSION >= v"1.11.2" - const ROCSparseUpperOrUnitUpperTriangular = LinearAlgebra.UpperOrUnitUpperTriangular{ - <:Any,<:Union{<:AbstractROCSparseMatrix, Adjoint{<:Any, <:AbstractROCSparseMatrix}, Transpose{<:Any, <:AbstractROCSparseMatrix}}} - const ROCSparseLowerOrUnitLowerTriangular = LinearAlgebra.LowerOrUnitLowerTriangular{ - <:Any,<:Union{<:AbstractROCSparseMatrix, Adjoint{<:Any, <:AbstractROCSparseMatrix}, Transpose{<:Any, <:AbstractROCSparseMatrix}}} - LinearAlgebra.istriu(::ROCSparseUpperOrUnitUpperTriangular) = true - LinearAlgebra.istril(::ROCSparseUpperOrUnitUpperTriangular) = false - LinearAlgebra.istriu(::ROCSparseLowerOrUnitLowerTriangular) = false - LinearAlgebra.istril(::ROCSparseLowerOrUnitLowerTriangular) = true -end +# TODO triu, kron diff --git a/src/sparse/rocSPARSE.jl b/src/sparse/rocSPARSE.jl index 8ff305066..1ca043101 100644 --- a/src/sparse/rocSPARSE.jl +++ b/src/sparse/rocSPARSE.jl @@ -6,17 +6,17 @@ using CEnum: @cenum using LinearAlgebra using LinearAlgebra: HermOrSym, BlasComplex, BlasFloat, BlasReal, MulAddMul, AdjOrTrans using SparseArrays -using SparseArrays: nonzeroinds, dimlub +using SparseArrays: nonzeroinds, nonzeros, rowvals, 
getcolptr, dimlub +using GPUArrays using ..AMDGPU using ..AMDGPU: @allowscalar using ..AMDGPU: ROCArrayStyle, threadIdx, blockIdx, blockDim +import SparseArrays: SparseVector, SparseMatrixCSC import AMDGPU: librocsparse, HandleCache, HIP, library_state, ROCVector import AMDGPU.Device: ROCDeviceVector import .HIP: HIPContext, HIPStream, hipStream_t -import SparseArrays: SparseVector, SparseMatrixCSC - const SparseChar = Char # core library @@ -54,10 +54,6 @@ include("array.jl") include("util.jl") include("types.jl") -# native functionality -include("device.jl") -include("broadcast.jl") - # low-level wrappers include("helpers.jl") include("level1.jl") diff --git a/test/rocsparse/device.jl b/test/rocsparse/device.jl index 9c08b313b..368b19f55 100644 --- a/test/rocsparse/device.jl +++ b/test/rocsparse/device.jl @@ -1,24 +1,54 @@ @testset "rocconvert" begin - @test isbitstype(ROCSparseDeviceVector{Float32, Cint, AMDGPU.Device.AS.Global}) - @test isbitstype(ROCSparseDeviceMatrixCSC{Float32, Cint, AMDGPU.Device.AS.Global}) - @test isbitstype(ROCSparseDeviceMatrixCSR{Float32, Cint, AMDGPU.Device.AS.Global}) - @test isbitstype(ROCSparseDeviceMatrixBSR{Float32, Cint, AMDGPU.Device.AS.Global}) - @test isbitstype(ROCSparseDeviceMatrixCOO{Float32, Cint, AMDGPU.Device.AS.Global}) + @test isbitstype(GPUSparseDeviceVector{ + Float32, Cint, + ROCDeviceVector{Cint, AS.Global}, + ROCDeviceVector{Float32, AS.Global}, AS.Global}) + @test isbitstype(GPUSparseDeviceMatrixCSC{ + Float32, Cint, + ROCDeviceVector{Cint, AS.Global}, + ROCDeviceVector{Float32, AS.Global}, AS.Global}) + @test isbitstype(GPUSparseDeviceMatrixCSR{ + Float32, Cint, + ROCDeviceVector{Cint, AS.Global}, + ROCDeviceVector{Float32, AS.Global}, AS.Global}) + @test isbitstype(GPUSparseDeviceMatrixBSR{ + Float32, Cint, + ROCDeviceVector{Cint, AS.Global}, + ROCDeviceVector{Float32, AS.Global}, AS.Global}) + @test isbitstype(GPUSparseDeviceMatrixCOO{ + Float32, Cint, + ROCDeviceVector{Cint, AS.Global}, + ROCDeviceVector{Float32, AS.Global}, AS.Global}) V = sprand(10, 0.5) rocV = ROCSparseVector(V) - @test rocconvert(rocV) isa ROCSparseDeviceVector{Float64, Cint, 1} + @test rocconvert(rocV) isa GPUSparseDeviceVector{ + Float64, Cint, + ROCDeviceVector{Cint, AS.Global}, + ROCDeviceVector{Float64, AS.Global}, AS.Global} A = sprand(10, 10, 0.5) rocA = ROCSparseMatrixCSC(A) - @test rocconvert(rocA) isa ROCSparseDeviceMatrixCSC{Float64, Cint, 1} + @test rocconvert(rocA) isa GPUSparseDeviceMatrixCSC{ + Float64, Cint, + ROCDeviceVector{Cint, AS.Global}, + ROCDeviceVector{Float64, AS.Global}, AS.Global} rocA = ROCSparseMatrixCSR(A) - @test rocconvert(rocA) isa ROCSparseDeviceMatrixCSR{Float64, Cint, 1} + @test rocconvert(rocA) isa GPUSparseDeviceMatrixCSR{ + Float64, Cint, + ROCDeviceVector{Cint, AS.Global}, + ROCDeviceVector{Float64, AS.Global}, AS.Global} rocA = ROCSparseMatrixCOO(A) - @test rocconvert(rocA) isa ROCSparseDeviceMatrixCOO{Float64, Cint, 1} + @test rocconvert(rocA) isa GPUSparseDeviceMatrixCOO{ + Float64, Cint, + ROCDeviceVector{Cint, AS.Global}, + ROCDeviceVector{Float64, AS.Global}, AS.Global} rocA = ROCSparseMatrixBSR(A, 2) - @test rocconvert(rocA) isa ROCSparseDeviceMatrixBSR + @test rocconvert(rocA) isa GPUSparseDeviceMatrixBSR{ + Float64, Cint, + ROCDeviceVector{Cint, AS.Global}, + ROCDeviceVector{Float64, AS.Global}, AS.Global} end diff --git a/test/rocsparse/rocsparse.jl b/test/rocsparse/rocsparse.jl index 52c8e7c32..cdc2bc6d6 100644 --- a/test/rocsparse/rocsparse.jl +++ b/test/rocsparse/rocsparse.jl @@ -4,9 +4,11 @@ using 
SparseArrays using AMDGPU.rocSPARSE - using .rocSPARSE: ROCSparseDeviceVector, ROCSparseDeviceMatrixCSC - using .rocSPARSE: ROCSparseDeviceMatrixCSR, ROCSparseDeviceMatrixBSR - using .rocSPARSE: ROCSparseDeviceMatrixCOO + using AMDGPU.Device.AS + using AMDGPU.Device: ROCDeviceVector + using AMDGPU.GPUArrays: + GPUSparseDeviceVector, GPUSparseDeviceMatrixCSC, GPUSparseDeviceMatrixCSR, + GPUSparseDeviceMatrixBSR, GPUSparseDeviceMatrixCOO include("broadcast.jl") include("conversions.jl")
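
A minimal usage sketch (not part of the patch) of what the changes above enable, assuming the GPUArrays sparse interface referenced in the diff; matrix sizes, density, and variable names are arbitrary. It exercises the new `KA.get_backend` methods for rocSPARSE arrays, the CPU adaptor that now lowers sparse GPU arrays back to SparseArrays types, and `rocconvert`, which now produces the generic `GPUArrays.GPUSparseDeviceMatrixCSR` in place of the removed `ROCSparseDeviceMatrixCSR`.

    using AMDGPU, Adapt, SparseArrays
    using AMDGPU.rocSPARSE: ROCSparseMatrixCSR, ROCSparseMatrixCSC
    import KernelAbstractions as KA

    A  = sprand(Float32, 100, 100, 0.1)   # host CSC matrix (arbitrary size/density)
    dA = ROCSparseMatrixCSR(A)            # upload in CSR format, as in the tests above

    KA.get_backend(dA)                    # ROCBackend(), via the new get_backend methods

    # Sparse GPU arrays now adapt through the CPU backend instead of erroring,
    # thanks to the widened adapt_storage(::KA.CPU, ...) method.
    hA = Adapt.adapt(KA.CPU(), ROCSparseMatrixCSC(A))   # -> SparseMatrixCSC

    # Kernel-side representation: the generic GPUArrays device type replaces
    # the backend-specific ROCSparseDeviceMatrixCSR deleted in this patch.
    AMDGPU.rocconvert(dA)                 # -> GPUArrays.GPUSparseDeviceMatrixCSR{...}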