diff --git a/src/ROCKernels.jl b/src/ROCKernels.jl index 946e2cdff..7b3548644 100644 --- a/src/ROCKernels.jl +++ b/src/ROCKernels.jl @@ -4,6 +4,7 @@ export ROCBackend import AMDGPU import AMDGPU.Device: @device_override +using AMDGPU: GPUArrays, rocSPARSE import Adapt import KernelAbstractions as KA @@ -22,15 +23,20 @@ function KA.device!(kab::ROCBackend, id::Int) end Adapt.adapt_storage(::ROCBackend, a::Array) = Adapt.adapt(AMDGPU.ROCArray, a) -Adapt.adapt_storage(::ROCBackend, a::AMDGPU.ROCArray) = a -Adapt.adapt_storage(::KA.CPU, a::AMDGPU.ROCArray) = convert(Array, a) +Adapt.adapt_storage(::ROCBackend, a::Union{AMDGPU.ROCArray, GPUArrays.AbstractGPUSparseArray}) = a +Adapt.adapt_storage(::KA.CPU, a::Union{AMDGPU.ROCArray, GPUArrays.AbstractGPUSparseArray}) = + Adapt.adapt(Array, a) function Adapt.adapt_storage(::KA.ConstAdaptor, a::AMDGPU.ROCDeviceArray{T}) where T ptr = LLVM.Interop.addrspacecast(Core.LLVMPtr{T,AMDGPU.Device.AS.Constant}, a.ptr) AMDGPU.ROCDeviceArray(a.dims, ptr) end -KA.argconvert(::KA.Kernel{ROCBackend}, arg) = AMDGPU.rocconvert(arg) KA.get_backend(::AMDGPU.ROCArray) = ROCBackend() +KA.get_backend(::AMDGPU.rocSPARSE.ROCSparseVector) = ROCBackend() +KA.get_backend(::AMDGPU.rocSPARSE.ROCSparseMatrixCSC) = ROCBackend() +KA.get_backend(::AMDGPU.rocSPARSE.ROCSparseMatrixCSR) = ROCBackend() + +KA.argconvert(::KA.Kernel{ROCBackend}, arg) = AMDGPU.rocconvert(arg) KA.synchronize(::ROCBackend) = AMDGPU.synchronize() KA.unsafe_free!(x::AMDGPU.ROCArray) = AMDGPU.unsafe_free!(x) diff --git a/src/sparse/array.jl b/src/sparse/array.jl index 2aba15a87..cfe7874c1 100644 --- a/src/sparse/array.jl +++ b/src/sparse/array.jl @@ -1,14 +1,8 @@ -# custom extension of ROCArray in ROCM for sparse vectors/matrices -# using CSC format for interop with Julia's native sparse functionality - export ROCSparseMatrixCSC, ROCSparseMatrixCSR, ROCSparseMatrixBSR, ROCSparseMatrixCOO export ROCSparseMatrix, AbstractROCSparseMatrix, ROCSparseVector, ROCSparseVecOrMat -abstract type AbstractROCSparseArray{Tv, Ti, N} <: AbstractSparseArray{Tv, Ti, N} end -const AbstractROCSparseVector{Tv, Ti} = AbstractROCSparseArray{Tv, Ti, 1} -const AbstractROCSparseMatrix{Tv, Ti} = AbstractROCSparseArray{Tv, Ti, 2} - -Base.convert(T::Type{<:AbstractROCSparseArray}, m::AbstractArray) = m isa T ? 
m : T(m) +abstract type AbstractROCSparseVector{Tv, Ti} <: GPUArrays.AbstractGPUSparseArray{Tv, Ti, 1} end +abstract type AbstractROCSparseMatrix{Tv, Ti} <: GPUArrays.AbstractGPUSparseArray{Tv, Ti, 2} end mutable struct ROCSparseVector{Tv, Ti} <: AbstractROCSparseVector{Tv, Ti} iPtr::ROCVector{Ti} @@ -29,7 +23,7 @@ function AMDGPU.unsafe_free!(xs::ROCSparseVector) return end -mutable struct ROCSparseMatrixCSC{Tv, Ti} <: AbstractROCSparseMatrix{Tv, Ti} +mutable struct ROCSparseMatrixCSC{Tv, Ti} <: GPUArrays.AbstractGPUSparseMatrixCSC{Tv, Ti} colPtr::ROCVector{Ti} rowVal::ROCVector{Ti} nzVal::ROCVector{Tv} @@ -43,13 +37,17 @@ mutable struct ROCSparseMatrixCSC{Tv, Ti} <: AbstractROCSparseMatrix{Tv, Ti} new{Tv, Ti}(colPtr, rowVal, nzVal, dims, length(nzVal)) end end +ROCSparseMatrixCSC{Tv, Ti}(x::ROCSparseMatrixCSC{Tv, Ti}) where {Tv, Ti} = x +ROCSparseMatrixCSC(x::ROCSparseMatrixCSC) = x -ROCSparseMatrixCSC(A::ROCSparseMatrixCSC) = A +SparseArrays.rowvals(x::T) where T <: ROCSparseVector = nonzeroinds(x) +SparseArrays.rowvals(x::ROCSparseMatrixCSC) = x.rowVal +SparseArrays.getcolptr(x::ROCSparseMatrixCSC) = x.colPtr -function AMDGPU.unsafe_free!(xs::ROCSparseMatrixCSC) - unsafe_free!(xs.colPtr) - unsafe_free!(rowvals(xs)) - unsafe_free!(nonzeros(xs)) +function AMDGPU.unsafe_free!(x::ROCSparseMatrixCSC) + unsafe_free!(x.colPtr) + unsafe_free!(rowvals(x)) + unsafe_free!(nonzeros(x)) return end @@ -63,7 +61,7 @@ GPU. Most ROCSPARSE operations work with CSR formatted matrices, rather than CSC. """ -mutable struct ROCSparseMatrixCSR{Tv, Ti} <: AbstractROCSparseMatrix{Tv, Ti} +mutable struct ROCSparseMatrixCSR{Tv, Ti} <: GPUArrays.AbstractGPUSparseMatrixCSR{Tv, Ti} rowPtr::ROCVector{Ti} colVal::ROCVector{Ti} nzVal::ROCVector{Tv} @@ -78,7 +76,8 @@ mutable struct ROCSparseMatrixCSR{Tv, Ti} <: AbstractROCSparseMatrix{Tv, Ti} end end -ROCSparseMatrixCSR(A::ROCSparseMatrixCSR) = A +ROCSparseMatrixCSR{Tv, Ti}(x::ROCSparseMatrixCSR{Tv, Ti}) where {Tv, Ti} = x +ROCSparseMatrixCSR(x::ROCSparseMatrixCSR) = x function AMDGPU.unsafe_free!(xs::ROCSparseMatrixCSR) unsafe_free!(xs.rowPtr) @@ -87,6 +86,27 @@ function AMDGPU.unsafe_free!(xs::ROCSparseMatrixCSR) return end +GPUArrays.sparse_array_type(::Type{<:ROCSparseVector}) = ROCSparseVector +GPUArrays.sparse_array_type(::Type{<:ROCSparseMatrixCSC}) = ROCSparseMatrixCSC +GPUArrays.sparse_array_type(::Type{<:ROCSparseMatrixCSR}) = ROCSparseMatrixCSR +GPUArrays.sparse_array_type(::ROCSparseVector) = ROCSparseVector +GPUArrays.sparse_array_type(::ROCSparseMatrixCSC) = ROCSparseMatrixCSC +GPUArrays.sparse_array_type(::ROCSparseMatrixCSR) = ROCSparseMatrixCSR + +GPUArrays.dense_array_type(::Type{<:ROCSparseVector}) = ROCArray +GPUArrays.dense_array_type(::Type{<:ROCSparseMatrixCSC}) = ROCArray +GPUArrays.dense_array_type(::Type{<:ROCSparseMatrixCSR}) = ROCArray +GPUArrays.dense_array_type(::ROCSparseVector) = ROCArray +GPUArrays.dense_array_type(::ROCSparseMatrixCSC) = ROCArray +GPUArrays.dense_array_type(::ROCSparseMatrixCSR) = ROCArray + +GPUArrays.csc_type(::ROCSparseMatrixCSR) = ROCSparseMatrixCSC +GPUArrays.csr_type(::ROCSparseMatrixCSC) = ROCSparseMatrixCSR +GPUArrays.coo_type(::Union{ROCSparseMatrixCSR, Transpose{<:Any,<:ROCSparseMatrixCSR}, Adjoint{<:Any,<:ROCSparseMatrixCSR}}) = ROCSparseMatrixCOO +GPUArrays.coo_type(::Union{ROCSparseMatrixCSC, Transpose{<:Any,<:ROCSparseMatrixCSC}, Adjoint{<:Any,<:ROCSparseMatrixCSC}}) = ROCSparseMatrixCOO +GPUArrays.coo_type(::Type{T}) where {T<:Union{ROCSparseMatrixCSR, Transpose{<:Any,<:ROCSparseMatrixCSR}, 
Adjoint{<:Any,<:ROCSparseMatrixCSR}}} = ROCSparseMatrixCOO +GPUArrays.coo_type(::Type{T}) where {T<:Union{ROCSparseMatrixCSC, Transpose{<:Any,<:ROCSparseMatrixCSC}, Adjoint{<:Any,<:ROCSparseMatrixCSC}}} = ROCSparseMatrixCOO + """ Container to hold sparse matrices in block compressed sparse row (BSR) format on the GPU. BSR format is also used in Intel MKL, and is suited to matrices that are @@ -280,27 +300,6 @@ function SparseArrays.findnz(S::T) where {T <: AbstractROCSparseMatrix} return (I, J, V) end -SparseArrays.nnz(g::AbstractROCSparseArray) = g.nnz -SparseArrays.nonzeros(g::AbstractROCSparseArray) = g.nzVal - -SparseArrays.nonzeroinds(g::AbstractROCSparseVector) = g.iPtr -SparseArrays.rowvals(g::AbstractROCSparseVector) = nonzeroinds(g) - -SparseArrays.rowvals(g::ROCSparseMatrixCSC) = g.rowVal -SparseArrays.getcolptr(g::ROCSparseMatrixCSC) = g.colPtr - -LinearAlgebra.issymmetric(M::Union{ROCSparseMatrixCSC,ROCSparseMatrixCSR}) = false -LinearAlgebra.ishermitian(M::Union{ROCSparseMatrixCSC,ROCSparseMatrixCSR}) = false -LinearAlgebra.issymmetric(M::Symmetric{ROCSparseMatrixCSC}) = true -LinearAlgebra.ishermitian(M::Hermitian{ROCSparseMatrixCSC}) = true - -LinearAlgebra.istriu(M::UpperTriangular{T,S}) where {T<:BlasFloat, S<:Union{<:AbstractROCSparseMatrix, Adjoint{<:Any, <:AbstractROCSparseMatrix}, Transpose{<:Any, <:AbstractROCSparseMatrix}}} = true -LinearAlgebra.istril(M::UpperTriangular{T,S}) where {T<:BlasFloat, S<:Union{<:AbstractROCSparseMatrix, Adjoint{<:Any, <:AbstractROCSparseMatrix}, Transpose{<:Any, <:AbstractROCSparseMatrix}}} = false -LinearAlgebra.istriu(M::LowerTriangular{T,S}) where {T<:BlasFloat, S<:Union{<:AbstractROCSparseMatrix, Adjoint{<:Any, <:AbstractROCSparseMatrix}, Transpose{<:Any, <:AbstractROCSparseMatrix}}} = false -LinearAlgebra.istril(M::LowerTriangular{T,S}) where {T<:BlasFloat, S<:Union{<:AbstractROCSparseMatrix, Adjoint{<:Any, <:AbstractROCSparseMatrix}, Transpose{<:Any, <:AbstractROCSparseMatrix}}} = true - -Hermitian{T}(Mat::ROCSparseMatrix{T}) where T = Hermitian{T,typeof(Mat)}(Mat,'U') - SparseArrays.nnz(g::ROCSparseMatrixBSR) = g.nnzb * g.blockDim * g.blockDim ## indexing @@ -422,20 +421,15 @@ ROCSparseMatrixCSR(x::Adjoint{T}) where {T} = ROCSparseMatrixCSR{T}(x) ROCSparseMatrixCSC(x::Transpose{T}) where {T} = ROCSparseMatrixCSC{T}(x) ROCSparseMatrixCSC(x::Adjoint{T}) where {T} = ROCSparseMatrixCSC{T}(x) +# TODO adjoint / transpose: GPUArrays._sptranspose + # gpu to cpu -SparseVector(x::ROCSparseVector) = SparseVector(length(x), Array(nonzeroinds(x)), Array(nonzeros(x))) -SparseMatrixCSC(x::ROCSparseMatrixCSC) = SparseMatrixCSC(size(x)..., Array(x.colPtr), Array(rowvals(x)), Array(nonzeros(x))) +SparseVector(x::ROCSparseVector) = SparseVector(length(x), Array(SparseArrays.nonzeroinds(x)), Array(SparseArrays.nonzeros(x))) +SparseMatrixCSC(x::ROCSparseMatrixCSC) = SparseMatrixCSC(size(x)..., Array(x.colPtr), Array(SparseArrays.rowvals(x)), Array(SparseArrays.nonzeros(x))) SparseMatrixCSC(x::ROCSparseMatrixCSR) = SparseMatrixCSC(ROCSparseMatrixCSC(x)) # no direct conversion SparseMatrixCSC(x::ROCSparseMatrixBSR) = SparseMatrixCSC(ROCSparseMatrixCSR(x)) # no direct conversion SparseMatrixCSC(x::ROCSparseMatrixCOO) = SparseMatrixCSC(ROCSparseMatrixCSR(x)) # no direct conversion -# collect to Array -Base.collect(x::ROCSparseVector) = collect(SparseVector(x)) -Base.collect(x::ROCSparseMatrixCSC) = collect(SparseMatrixCSC(x)) -Base.collect(x::ROCSparseMatrixCSR) = collect(SparseMatrixCSC(x)) -Base.collect(x::ROCSparseMatrixBSR) = 
collect(ROCSparseMatrixCSR(x)) # no direct conversion -Base.collect(x::ROCSparseMatrixCOO) = collect(ROCSparseMatrixCSR(x)) # no direct conversion - Adapt.adapt_storage(::Type{ROCArray}, xs::SparseVector) = ROCSparseVector(xs) Adapt.adapt_storage(::Type{ROCArray}, xs::SparseMatrixCSC) = ROCSparseMatrixCSC(xs) Adapt.adapt_storage(::Type{ROCArray{T}}, xs::SparseVector) where {T} = ROCSparseVector{T}(xs) @@ -451,27 +445,6 @@ Adapt.adapt_storage(::Type{Array}, xs::ROCSparseMatrixCSC) = SparseMatrixCSC(xs) ## copying between sparse GPU arrays -function Base.copyto!(dst::ROCSparseVector, src::ROCSparseVector) - if length(dst) != length(src) - throw(ArgumentError("Inconsistent Sparse Vector size")) - end - copyto!(nonzeroinds(dst), nonzeroinds(src)) - copyto!(nonzeros(dst), nonzeros(src)) - dst.nnz = src.nnz - dst -end - -function Base.copyto!(dst::ROCSparseMatrixCSC, src::ROCSparseMatrixCSC) - if size(dst) != size(src) - throw(ArgumentError("Inconsistent Sparse Matrix size")) - end - copyto!(dst.colPtr, src.colPtr) - copyto!(rowvals(dst), rowvals(src)) - copyto!(nonzeros(dst), nonzeros(src)) - dst.nnz = src.nnz - dst -end - function Base.copyto!(dst::ROCSparseMatrixCSR, src::ROCSparseMatrixCSR) if size(dst) != size(src) throw(ArgumentError("Inconsistent Sparse Matrix size")) @@ -553,24 +526,57 @@ end # interop with device arrays Adapt.adapt_structure(to::AMDGPU.Runtime.Adaptor, x::ROCSparseVector) = - ROCSparseDeviceVector(adapt(to, x.iPtr), adapt(to, x.nzVal), length(x), x.nnz) + GPUArrays.GPUSparseDeviceVector(adapt(to, x.iPtr), adapt(to, x.nzVal), length(x), x.nnz) Adapt.adapt_structure(to::AMDGPU.Runtime.Adaptor, x::ROCSparseMatrixCSR) = - ROCSparseDeviceMatrixCSR( - adapt(to, x.rowPtr), adapt(to, x.colVal), adapt(to, x.nzVal), - size(x), x.nnz) + GPUArrays.GPUSparseDeviceMatrixCSR( + adapt(to, x.rowPtr), adapt(to, x.colVal), adapt(to, x.nzVal), size(x), x.nnz) Adapt.adapt_structure(to::AMDGPU.Runtime.Adaptor, x::ROCSparseMatrixCSC) = - ROCSparseDeviceMatrixCSC( - adapt(to, x.colPtr), adapt(to, x.rowVal), adapt(to, x.nzVal), - size(x), x.nnz) + GPUArrays.GPUSparseDeviceMatrixCSC( + adapt(to, x.colPtr), adapt(to, x.rowVal), adapt(to, x.nzVal), size(x), x.nnz) Adapt.adapt_structure(to::AMDGPU.Runtime.Adaptor, x::ROCSparseMatrixBSR) = - ROCSparseDeviceMatrixBSR( + GPUArrays.GPUSparseDeviceMatrixBSR( adapt(to, x.rowPtr), adapt(to, x.colVal), adapt(to, x.nzVal), size(x), x.blockDim, x.dir, x.nnzb) Adapt.adapt_structure(to::AMDGPU.Runtime.Adaptor, x::ROCSparseMatrixCOO) = - ROCSparseDeviceMatrixCOO( - adapt(to, x.rowInd), adapt(to, x.colInd), adapt(to, x.nzVal), - size(x), x.nnz) + GPUArrays.GPUSparseDeviceMatrixCOO( + adapt(to, x.rowInd), adapt(to, x.colInd), adapt(to, x.nzVal), size(x), x.nnz) + +# device array ctors + +GPUArrays.GPUSparseDeviceVector( + iPtr::ROCDeviceVector{Ti, A}, nzVal::ROCDeviceVector{Tv, A}, len::Int, nnz::Ti, +) where {Ti, Tv, A} = GPUArrays.GPUSparseDeviceVector{ + Tv, Ti, ROCDeviceVector{Ti, A}, ROCDeviceVector{Tv, A}, A, +}(iPtr, nzVal, len, nnz) + +GPUArrays.GPUSparseDeviceMatrixCSR( + rowPtr::ROCDeviceVector{Ti, A}, colVal::ROCDeviceVector{Ti, A}, nzVal::ROCDeviceVector{Tv, A}, + dims::NTuple{2, Int}, nnz::Ti, +) where {Ti, Tv, A} = GPUArrays.GPUSparseDeviceMatrixCSR{ + Tv, Ti, ROCDeviceVector{Ti, A}, ROCDeviceVector{Tv, A}, A, +}(rowPtr, colVal, nzVal, dims, nnz) + +GPUArrays.GPUSparseDeviceMatrixCSC( + colPtr::ROCDeviceVector{Ti, A}, rowVal::ROCDeviceVector{Ti, A}, nzVal::ROCDeviceVector{Tv, A}, + dims::NTuple{2, Int}, nnz::Ti, +) where {Ti, Tv, A} = 
GPUArrays.GPUSparseDeviceMatrixCSC{ + Tv, Ti, ROCDeviceVector{Ti, A}, ROCDeviceVector{Tv, A}, A, +}(colPtr, rowVal, nzVal, dims, nnz) + +GPUArrays.GPUSparseDeviceMatrixBSR( + rowPtr::ROCDeviceVector{Ti, A}, colVal::ROCDeviceVector{Ti, A}, nzVal::ROCDeviceVector{Tv, A}, + dims::NTuple{2, Int}, blockDim::Ti, dir::Char, nnz::Ti, +) where {Ti, Tv, A} = GPUArrays.GPUSparseDeviceMatrixBSR{ + Tv, Ti, ROCDeviceVector{Ti, A}, ROCDeviceVector{Tv, A}, A, +}(rowPtr, colVal, nzVal, dims, blockDim, dir, nnz) + +GPUArrays.GPUSparseDeviceMatrixCOO( + rowInd::ROCDeviceVector{Ti, A}, colInd::ROCDeviceVector{Ti, A}, nzVal::ROCDeviceVector{Tv, A}, + dims::NTuple{2, Int}, nnz::Ti, +) where {Ti, Tv, A} = GPUArrays.GPUSparseDeviceMatrixCOO{ + Tv, Ti, ROCDeviceVector{Ti, A}, ROCDeviceVector{Tv, A}, A, +}(rowInd, colInd, nzVal, dims, nnz) diff --git a/src/sparse/broadcast.jl b/src/sparse/broadcast.jl deleted file mode 100644 index c002ca471..000000000 --- a/src/sparse/broadcast.jl +++ /dev/null @@ -1,591 +0,0 @@ -# TODO: support more types (SparseVector, SparseMatrixCSC, COO, BSR) - -## sparse broadcast style - -# broadcast container type promotion for combinations of sparse arrays and other types -struct ROCSparseVecStyle <: Broadcast.AbstractArrayStyle{1} end -struct ROCSparseMatStyle <: Broadcast.AbstractArrayStyle{2} end -Broadcast.BroadcastStyle(::Type{<:ROCSparseVector}) = ROCSparseVecStyle() -Broadcast.BroadcastStyle(::Type{<:ROCSparseMatrix}) = ROCSparseMatStyle() -const SPVM = Union{ROCSparseVecStyle,ROCSparseMatStyle} - -# ROCSparseVecStyle handles 0-1 dimensions, ROCSparseMatStyle 0-2 dimensions. -# ROCSparseVecStyle promotes to ROCSparseMatStyle for 2 dimensions. -# Fall back to DefaultArrayStyle for higher dimensionality. -ROCSparseVecStyle(::Val{0}) = ROCSparseVecStyle() -ROCSparseVecStyle(::Val{1}) = ROCSparseVecStyle() -ROCSparseVecStyle(::Val{2}) = ROCSparseMatStyle() -ROCSparseVecStyle(::Val{N}) where N = Broadcast.DefaultArrayStyle{N}() -ROCSparseMatStyle(::Val{0}) = ROCSparseMatStyle() -ROCSparseMatStyle(::Val{1}) = ROCSparseMatStyle() -ROCSparseMatStyle(::Val{2}) = ROCSparseMatStyle() -ROCSparseMatStyle(::Val{N}) where N = Broadcast.DefaultArrayStyle{N}() - -Broadcast.BroadcastStyle(::ROCSparseVecStyle, ::ROCArrayStyle{1}) = ROCSparseVecStyle() -Broadcast.BroadcastStyle(::ROCSparseVecStyle, ::ROCArrayStyle{2}) = ROCSparseMatStyle() -Broadcast.BroadcastStyle(::ROCSparseMatStyle, ::ROCArrayStyle{2}) = ROCSparseMatStyle() - -# don't wrap sparse arrays with Extruded -Broadcast.extrude(x::ROCSparseVecOrMat) = x - -## detection of zero-preserving functions - -# modified from SparseArrays.jl - -# capturescalars takes a function (f) and a tuple of broadcast arguments, and returns a -# partially-evaluated function and a reduced argument tuple where all scalar operations have -# been applied already. -@inline function capturescalars(f, mixedargs) - let (passedsrcargstup, makeargs) = _capturescalars(mixedargs...) - parevalf = (passed...) -> f(makeargs(passed...)...) - return (parevalf, passedsrcargstup) - end -end -# Work around losing Type{T}s as DataTypes within the tuple that makeargs creates -@inline capturescalars(f, mixedargs::Tuple{Ref{Type{T}}, Vararg{Any}}) where {T} = - capturescalars((args...)->f(T, args...), Base.tail(mixedargs)) -@inline capturescalars(f, mixedargs::Tuple{Ref{Type{T}}, Ref{Type{S}}, Vararg{Any}}) where {T, S} = - # This definition is identical to the one above and necessary only for - # avoiding method ambiguity. 
- capturescalars((args...)->f(T, args...), Base.tail(mixedargs)) -@inline capturescalars(f, mixedargs::Tuple{ROCSparseVecOrMat, Ref{Type{T}}, Vararg{Any}}) where {T} = - capturescalars((a1, args...)->f(a1, T, args...), (mixedargs[1], Base.tail(Base.tail(mixedargs))...)) -@inline capturescalars(f, mixedargs::Tuple{Union{Ref,AbstractArray{<:Any,0}}, Ref{Type{T}}, Vararg{Any}}) where {T} = - capturescalars((args...)->f(mixedargs[1], T, args...), Base.tail(Base.tail(mixedargs))) - -scalararg(::Number) = true -scalararg(::Any) = false -scalarwrappedarg(::Union{AbstractArray{<:Any,0},Ref}) = true -scalarwrappedarg(::Any) = false - -@inline function _capturescalars() - return (), () -> () -end -@inline function _capturescalars(arg, mixedargs...) - let (rest, f) = _capturescalars(mixedargs...) - if scalararg(arg) - return rest, @inline function(tail...) - (arg, f(tail...)...) - end # add back scalararg after (in makeargs) - elseif scalarwrappedarg(arg) - return rest, @inline function(tail...) - (arg[], f(tail...)...) # TODO: This can put a Type{T} in a tuple - end # unwrap and add back scalararg after (in makeargs) - else - return (arg, rest...), @inline function(head, tail...) - (head, f(tail...)...) - end # pass-through to broadcast - end - end -end -@inline function _capturescalars(arg) # this definition is just an optimization (to bottom out the recursion slightly sooner) - if scalararg(arg) - return (), () -> (arg,) # add scalararg - elseif scalarwrappedarg(arg) - return (), () -> (arg[],) # unwrap - else - return (arg,), (head,) -> (head,) # pass-through - end -end - -@inline _iszero(x) = x == 0 -@inline _iszero(x::Number) = Base.iszero(x) -@inline _iszero(x::AbstractArray) = Base.iszero(x) -@inline _zeros_eltypes(A) = (zero(eltype(A)),) -@inline _zeros_eltypes(A, Bs...) = (zero(eltype(A)), _zeros_eltypes(Bs...)...) - -## iteration helpers - -""" - CSRIterator{Ti}(row, args...) - -A GPU-compatible iterator for accessing the elements of a single row `row` of several CSR -matrices `args` in one go. The row should be in-bounds for every sparse argument. Each -iteration returns a 2-element tuple: The current column, and each arguments' pointer index -(or 0 if that input didn't have an element at that column). The pointers can then be used to -access the elements themselves. - -For convenience, this iterator can be passed non-sparse arguments as well, which will be -ignored (with the returned `col`/`ptr` values set to 0). 
-""" -struct CSRIterator{Ti,N,ATs} - row::Ti - col_ends::NTuple{N, Ti} - args::ATs -end - -function CSRIterator{Ti}(row, args::Vararg{Any, N}) where {Ti,N} - # check that `row` is valid for all arguments - @boundscheck begin - ntuple(Val(N)) do i - arg = @inbounds args[i] - arg isa ROCSparseDeviceMatrixCSR && checkbounds(arg, row, 1) - end - end - - col_ends = ntuple(Val(N)) do i - arg = @inbounds args[i] - if arg isa ROCSparseDeviceMatrixCSR - @inbounds(arg.rowPtr[row+1]) - else - zero(Ti) - end - end - - CSRIterator{Ti, N, typeof(args)}(row, col_ends, args) -end - -@inline function Base.iterate(iter::CSRIterator{Ti,N}, state=nothing) where {Ti,N} - # helper function to get the column of a sparse array at a specific pointer - @inline function get_col(i, ptr) - arg = @inbounds iter.args[i] - if arg isa ROCSparseDeviceMatrixCSR - col_end = @inbounds iter.col_ends[i] - if ptr < col_end - return @inbounds arg.colVal[ptr] % Ti - end - end - typemax(Ti) - end - - # initialize the state - # - ptr: the current index into the colVal/nzVal arrays - # - col: the current column index (cached so that we don't have to re-read each time) - state = something(state, - ntuple(Val(N)) do i - arg = @inbounds iter.args[i] - if arg isa ROCSparseDeviceMatrixCSR - ptr = @inbounds iter.args[i].rowPtr[iter.row] % Ti - col = @inbounds get_col(i, ptr) - else - ptr = typemax(Ti) - col = typemax(Ti) - end - (; ptr, col) - end - ) - - # determine the column we're currently processing - cols = ntuple(i -> @inbounds(state[i].col), Val(N)) - cur_col = min(cols...) - cur_col == typemax(Ti) && return - - # fetch the pointers (we don't look up the values, as the caller might want to index - # the sparse array directly, e.g., to mutate it). we don't return `ptrs` from the state - # directly, but first convert the `typemax(Ti)` to a more convenient zero value. - # NOTE: these values may end up unused by the caller (e.g. in the count_nnzs kernels), - # but LLVM appears smart enough to filter them away. - ptrs = ntuple(Val(N)) do i - ptr, col = @inbounds state[i] - col == cur_col ? 
ptr : zero(Ti) - end - - # advance the state - new_state = ntuple(Val(N)) do i - ptr, col = @inbounds state[i] - if col == cur_col - ptr += one(Ti) - col = get_col(i, ptr) - end - (; ptr, col) - end - - return (cur_col, ptrs), new_state -end - -struct CSCIterator{Ti,N,ATs} - col::Ti - row_ends::NTuple{N, Ti} - args::ATs -end - -function CSCIterator{Ti}(col, args::Vararg{Any, N}) where {Ti,N} - # check that `col` is valid for all arguments - @boundscheck begin - ntuple(Val(N)) do i - arg = @inbounds args[i] - arg isa ROCSparseDeviceMatrixCSR && checkbounds(arg, 1, col) - end - end - - row_ends = ntuple(Val(N)) do i - arg = @inbounds args[i] - x = if arg isa ROCSparseDeviceMatrixCSC - @inbounds(arg.colPtr[col+1]) - else - zero(Ti) - end - x - end - - CSCIterator{Ti, N, typeof(args)}(col, row_ends, args) -end - -@inline function Base.iterate(iter::CSCIterator{Ti,N}, state=nothing) where {Ti,N} - # helper function to get the column of a sparse array at a specific pointer - @inline function get_col(i, ptr) - arg = @inbounds iter.args[i] - if arg isa ROCSparseDeviceMatrixCSC - col_end = @inbounds iter.row_ends[i] - if ptr < col_end - return @inbounds arg.rowVal[ptr] % Ti - end - end - typemax(Ti) - end - - # initialize the state - # - ptr: the current index into the rowVal/nzVal arrays - # - row: the current row index (cached so that we don't have to re-read each time) - state = something(state, - ntuple(Val(N)) do i - arg = @inbounds iter.args[i] - if arg isa ROCSparseDeviceMatrixCSC - ptr = @inbounds iter.args[i].colPtr[iter.col] % Ti - row = @inbounds get_col(i, ptr) - else - ptr = typemax(Ti) - row = typemax(Ti) - end - (; ptr, row) - end - ) - - # determine the row we're currently processing - rows = ntuple(i -> @inbounds(state[i].row), Val(N)) - cur_row = min(rows...) - cur_row == typemax(Ti) && return - - # fetch the pointers (we don't look up the values, as the caller might want to index - # the sparse array directly, e.g., to mutate it). we don't return `ptrs` from the state - # directly, but first convert the `typemax(Ti)` to a more convenient zero value. - # NOTE: these values may end up unused by the caller (e.g. in the count_nnzs kernels), - # but LLVM appears smart enough to filter them away. - ptrs = ntuple(Val(N)) do i - ptr, row = @inbounds state[i] - row == cur_row ? ptr : zero(Ti) - end - - # advance the state - new_state = ntuple(Val(N)) do i - ptr, row = @inbounds state[i] - if row == cur_row - ptr += one(Ti) - row = get_col(i, ptr) - end - (; ptr, row) - end - - return (cur_row, ptrs), new_state -end - -# helpers to index a sparse or dense array -function _getindex(arg::Union{ROCSparseDeviceMatrixCSR,ROCSparseDeviceMatrixCSC}, I, ptr) - if ptr == 0 - zero(eltype(arg)) - else - @inbounds arg.nzVal[ptr] - end -end -_getindex(arg, I, ptr) = Broadcast._broadcast_getindex(arg, I) - - -## sparse broadcast implementation - -# TODO: unify CSC/CSR kernels - -# kernel to count the number of non-zeros in a row, to determine the row offsets -function compute_offsets_kernel( - ::Type{<:ROCSparseMatrixCSR}, offsets::ROCDeviceVector{Ti}, args..., -) where Ti - # every thread processes an entire row - row::UInt32 = threadIdx().x + (blockIdx().x - 1) * blockDim().x - row > length(offsets) - 1 && return - iter = @inbounds CSRIterator{Ti}(row, args...) 
- - # count the nonzero columns of all inputs - accum = zero(Ti) - for (col, vals) in iter - accum += one(Ti) - end - - # the way we write the nnz counts is a bit strange, but done so that the result - # after accumulation can be directly used as the rowPtr array of a CSR matrix. - @inbounds begin - if row == 1 - offsets[1] = 1 - end - offsets[row+1] = accum - end - return -end - -function compute_offsets_kernel( - ::Type{<:ROCSparseMatrixCSC}, offsets::ROCDeviceVector{Ti}, args..., -) where Ti - # every thread processes an entire columm - col::UInt32 = threadIdx().x + (blockIdx().x - 1) * blockDim().x - col > length(offsets)-1 && return - iter = @inbounds CSCIterator{Ti}(col, args...) - - # count the nonzero columns of all inputs - accum = zero(Ti) - for (col, vals) in iter - accum += one(Ti) - end - - # the way we write the nnz counts is a bit strange, but done so that the result - # after accumulation can be directly used as the colPtr array of a CSC matrix. - @inbounds begin - if col == 1 - offsets[1] = 1 - end - offsets[col+1] = accum - end - return -end - -# broadcast kernels that iterate the elements of sparse arrays -function sparse_to_sparse_broadcast_kernel( - f, output::ROCSparseDeviceMatrixCSR{<:Any,Ti}, - offsets::Union{ROCDeviceVector,Nothing}, args..., -) where Ti - # every thread processes an entire row - row::UInt32 = threadIdx().x + (blockIdx().x - 1) * blockDim().x - row > size(output, 1) && return - iter = @inbounds CSRIterator{Ti}(row, args...) - - # fetch the row offset, and write it to the output - @inbounds begin - output_ptr = output.rowPtr[row] = offsets[row] - if row == size(output, 1) - output.rowPtr[row+1] = offsets[row+1] - end - end - - # set the values for this row - for (col, ptrs) in iter - I = CartesianIndex(row, col) - vals = ntuple(Val(length(args))) do i - arg = @inbounds args[i] - ptr = @inbounds ptrs[i] - _getindex(arg, I, ptr) - end - - @inbounds output.colVal[output_ptr] = col - @inbounds output.nzVal[output_ptr] = f(vals...) - output_ptr += one(Ti) - end - return -end - -function sparse_to_sparse_broadcast_kernel( - f, output::ROCSparseDeviceMatrixCSC{<:Any,Ti}, - offsets::Union{ROCDeviceVector,Nothing}, args..., -) where Ti - # every thread processes an entire column - col::UInt32 = threadIdx().x + (blockIdx().x - 1) * blockDim().x - col > size(output, 2) && return - iter = @inbounds CSCIterator{Ti}(col, args...) - - # fetch the column offset, and write it to the output - @inbounds begin - output_ptr = output.colPtr[col] = offsets[col] - if col == size(output, 2) - output.colPtr[col+1] = offsets[col+1] - end - end - - # set the values for this col - for (row, ptrs) in iter - I = CartesianIndex(col, row) - vals = ntuple(Val(length(args))) do i - arg = @inbounds args[i] - ptr = @inbounds ptrs[i] - _getindex(arg, I, ptr) - end - - @inbounds output.rowVal[output_ptr] = row - @inbounds output.nzVal[output_ptr] = f(vals...) - output_ptr += one(Ti) - end - return -end - -function sparse_to_dense_broadcast_kernel( - ::Type{<:ROCSparseMatrixCSR}, f, output::ROCDeviceArray, args..., -) - # every thread processes an entire row - row::UInt32 = threadIdx().x + (blockIdx().x - 1) * blockDim().x - row > size(output, 1) && return - iter = @inbounds CSRIterator{UInt32}(row, args...) - - # set the values for this row - for (col, ptrs) in iter - I = CartesianIndex(row, col) - vals = ntuple(Val(length(args))) do i - arg = @inbounds args[i] - ptr = @inbounds ptrs[i] - _getindex(arg, I, ptr) - end - - @inbounds output[I] = f(vals...) 
- end - return -end - -function sparse_to_dense_broadcast_kernel( - ::Type{<:ROCSparseMatrixCSC}, f, output::ROCDeviceArray, args..., -) - # every thread processes an entire column - col::UInt32 = threadIdx().x + (blockIdx().x - 1) * blockDim().x - col > size(output, 2) && return - iter = @inbounds CSCIterator{UInt32}(col, args...) - - # set the values for this col - for (row, ptrs) in iter - I = CartesianIndex(row, col) - vals = ntuple(Val(length(args))) do i - arg = @inbounds args[i] - ptr = @inbounds ptrs[i] - _getindex(arg, I, ptr) - end - - @inbounds output[I] = f(vals...) - end - return -end - -function Broadcast.copy(bc::Broadcasted{<:Union{ROCSparseVecStyle,ROCSparseMatStyle}}) - # find the sparse inputs - bc = Broadcast.flatten(bc) - sparse_args = findall(arg -> arg isa AbstractROCSparseArray, bc.args) - - sparse_types = unique(map(i -> nameof(typeof(bc.args[i])), sparse_args)) - length(sparse_types) > 1 && error( - "broadcast with multiple types of sparse arrays ($(join(sparse_types, ", "))) is not supported") - - sparse_typ = typeof(bc.args[first(sparse_args)]) - sparse_typ <: Union{ROCSparseMatrixCSR,ROCSparseMatrixCSC} || error( - "broadcast with sparse arrays is currently only implemented for CSR and CSC matrices") - - Ti = if sparse_typ <: ROCSparseMatrixCSR - reduce(promote_type, map(i -> eltype(bc.args[i].rowPtr), sparse_args)) - elseif sparse_typ <: ROCSparseMatrixCSC - reduce(promote_type, map(i -> eltype(bc.args[i].colPtr), sparse_args)) - end - - # determine the output type - Tv = Broadcast.combine_eltypes(bc.f, eltype.(bc.args)) - Base.isconcretetype(Tv) || error(""" - GPU sparse broadcast resulted in non-concrete element type $Tv. - This probably means that the function you are broadcasting contains an error or type instability. - """) - - # partially-evaluate the function, removing scalars. - parevalf, passedsrcargstup = capturescalars(bc.f, bc.args) - # check if the partially-evaluated function preserves zeros. if so, we'll only need to - # apply it to the sparse input arguments, preserving the sparse structure. - if all(arg -> isa(arg, AbstractSparseArray), passedsrcargstup) - fofzeros = parevalf(_zeros_eltypes(passedsrcargstup...)...) - fpreszeros = _iszero(fofzeros) - else - fpreszeros = false - end - - # the kernels below parallelize across rows or cols, not elements, so it's unlikely - # we'll launch many threads. to maximize utilization, parallelize across blocks first. - rows, cols = size(bc) - function compute_launch_config() - # config = launch_configuration(kernel.fun) - if sparse_typ <: ROCSparseMatrixCSR - threads = rows - blocks = cld(rows, threads) - threads = cld(rows, blocks) - elseif sparse_typ <: ROCSparseMatrixCSC - threads = cols - blocks = cld(cols, threads) - threads = cld(cols, blocks) - end - (; threads, blocks) - end - - # allocate the output container - if !fpreszeros - # either we have dense inputs, or the function isn't preserving zeros, - # so use a dense output to broadcast into. - output = ROCArray{Tv}(undef, size(bc)) - - # since we'll be iterating the sparse inputs, we need to pre-fill the dense output - # with appropriate values (while setting the sparse inputs to zero). we do this by - # re-using the dense broadcast implementation. - nonsparse_args = map(bc.args) do arg - # NOTE: this assumes the broadcst is flattened, but not yet preprocessed - if arg isa AbstractROCSparseArray - zero(eltype(arg)) - else - arg - end - end - broadcast!(bc.f, output, nonsparse_args...) 
- elseif length(sparse_args) == 1 - # we only have a single sparse input, so we can reuse its structure for the output. - # this avoids a kernel launch and costly synchronization. - sparse_arg = bc.args[first(sparse_args)] - if sparse_typ <: ROCSparseMatrixCSR - offsets = rowPtr = sparse_arg.rowPtr - colVal = similar(sparse_arg.colVal) - nzVal = similar(sparse_arg.nzVal, Tv) - output = ROCSparseMatrixCSR(rowPtr, colVal, nzVal, size(bc)) - elseif sparse_typ <: ROCSparseMatrixCSC - offsets = colPtr = sparse_arg.colPtr - rowVal = similar(sparse_arg.rowVal) - nzVal = similar(sparse_arg.nzVal, Tv) - output = ROCSparseMatrixCSC(colPtr, rowVal, nzVal, size(bc)) - end - # NOTE: we don't use CUSPARSE's similar, because that copies the structure arrays, - # while we do that in our kernel (for consistency with other code paths) - else - # determine the number of non-zero elements per row so that we can create an - # appropriately-structured output container - offsets = if sparse_typ <: ROCSparseMatrixCSR - ROCArray{Ti}(undef, rows+1) - elseif sparse_typ <: ROCSparseMatrixCSC - ROCArray{Ti}(undef, cols+1) - end - let - args = (sparse_typ, offsets, bc.args...) - groupsize, gridsize = compute_launch_config() - @roc gridsize=gridsize groupsize=groupsize compute_offsets_kernel(args...) - end - - # accumulate these values so that we can use them directly as row pointer offsets, - # as well as to get the total nnz count to allocate the sparse output array. - # cusparseXcsrgeam2Nnz computes this in one go, but it doesn't seem worth the effort - accumulate!(Base.add_sum, offsets, offsets) # TODO implement accumulate - total_nnz = @allowscalar last(offsets[end]) - 1 - - output = if sparse_typ <: ROCSparseMatrixCSR - colVal = ROCArray{Ti}(undef, total_nnz) - nzVal = ROCArray{Tv}(undef, total_nnz) - ROCSparseMatrixCSR(offsets, colVal, nzVal, size(bc)) - elseif sparse_typ <: ROCSparseMatrixCSC - rowVal = ROCArray{Ti}(undef, total_nnz) - nzVal = ROCArray{Tv}(undef, total_nnz) - ROCSparseMatrixCSC(offsets, rowVal, nzVal, size(bc)) - end - end - - # perform the actual broadcast - if output isa AbstractROCSparseArray - args = (bc.f, output, offsets, bc.args...) - groupsize, gridsize = compute_launch_config() - @roc gridsize=gridsize groupsize=groupsize sparse_to_sparse_broadcast_kernel(args...) - else - args = (sparse_typ, bc.f, output, bc.args...) - groupsize, gridsize = compute_launch_config() - @roc gridsize=gridsize groupsize=groupsize sparse_to_dense_broadcast_kernel(args...) 
- end - - return output -end diff --git a/src/sparse/device.jl b/src/sparse/device.jl deleted file mode 100644 index 70a739601..000000000 --- a/src/sparse/device.jl +++ /dev/null @@ -1,121 +0,0 @@ -# on-device sparse array functionality - -# NOTE: this functionality is currently very bare-bones, only defining the array types -# without any device-compatible sparse array functionality - -# core types - -export ROCSparseDeviceVector, ROCSparseDeviceMatrixCSC, ROCSparseDeviceMatrixCSR -export ROCSparseDeviceMatrixBSR, ROCSparseDeviceMatrixCOO - -struct ROCSparseDeviceVector{Tv,Ti, A} <: AbstractSparseVector{Tv,Ti} - iPtr::ROCDeviceVector{Ti, A} - nzVal::ROCDeviceVector{Tv, A} - len::Int - nnz::Ti -end - -Base.length(g::ROCSparseDeviceVector) = prod(g.dims) -Base.size(g::ROCSparseDeviceVector) = (g.len,) -Base.ndims(g::ROCSparseDeviceVector) = 1 -Base.hash(x::T, h::UInt) where T <: ROCSparseDeviceVector = - foldr(hash, (x.iPtr, x.nzVal, x.len, x.nnz, T, h)) -SparseArrays.nnz(g::ROCSparseDeviceVector) = g.nnz - -struct ROCSparseDeviceMatrixCSC{Tv,Ti,A} <: AbstractSparseMatrix{Tv,Ti} - colPtr::ROCDeviceVector{Ti, A} - rowVal::ROCDeviceVector{Ti, A} - nzVal::ROCDeviceVector{Tv, A} - dims::NTuple{2,Int} - nnz::Ti -end - -Base.length(g::ROCSparseDeviceMatrixCSC) = prod(g.dims) -Base.size(g::ROCSparseDeviceMatrixCSC) = g.dims -Base.ndims(g::ROCSparseDeviceMatrixCSC) = 2 -Base.hash(x::T, h::UInt) where T <: ROCSparseDeviceMatrixCSC = - foldr(hash, (x.colPtr, x.rowVal, x.nzVal, x.dims, x.nnz, T, h)) -SparseArrays.nnz(g::ROCSparseDeviceMatrixCSC) = g.nnz - -struct ROCSparseDeviceMatrixCSR{Tv,Ti,A} <: AbstractSparseMatrix{Tv,Ti} - rowPtr::ROCDeviceVector{Ti, A} - colVal::ROCDeviceVector{Ti, A} - nzVal::ROCDeviceVector{Tv, A} - dims::NTuple{2, Int} - nnz::Ti -end - -Base.length(g::ROCSparseDeviceMatrixCSR) = prod(g.dims) -Base.size(g::ROCSparseDeviceMatrixCSR) = g.dims -Base.ndims(g::ROCSparseDeviceMatrixCSR) = 2 -Base.hash(x::T, h::UInt) where T <: ROCSparseDeviceMatrixCSR = - foldr(hash, (x.rowPtr, x.colVal, x.nzVal, x.dims, x.nnz, T, h)) -SparseArrays.nnz(g::ROCSparseDeviceMatrixCSR) = g.nnz - -struct ROCSparseDeviceMatrixBSR{Tv,Ti,A} <: AbstractSparseMatrix{Tv,Ti} - rowPtr::ROCDeviceVector{Ti, A} - colVal::ROCDeviceVector{Ti, A} - nzVal::ROCDeviceVector{Tv, A} - dims::NTuple{2,Int} - blockDim::Ti - dir::Char - nnz::Ti -end - -Base.length(g::ROCSparseDeviceMatrixBSR) = prod(g.dims) -Base.size(g::ROCSparseDeviceMatrixBSR) = g.dims -Base.ndims(g::ROCSparseDeviceMatrixBSR) = 2 -Base.hash(x::T, h::UInt) where T <: ROCSparseDeviceMatrixBSR = - foldr(hash, (x.rowPtr, x.colVal, x.nzVal, x.dims, x.blockDim, x.dir, x.nnz, T, h)) -SparseArrays.nnz(g::ROCSparseDeviceMatrixBSR) = g.nnz - -struct ROCSparseDeviceMatrixCOO{Tv,Ti,A} <: AbstractSparseMatrix{Tv,Ti} - rowInd::ROCDeviceVector{Ti, A} - colInd::ROCDeviceVector{Ti, A} - nzVal::ROCDeviceVector{Tv, A} - dims::NTuple{2,Int} - nnz::Ti -end - -Base.length(g::ROCSparseDeviceMatrixCOO) = prod(g.dims) -Base.size(g::ROCSparseDeviceMatrixCOO) = g.dims -Base.ndims(g::ROCSparseDeviceMatrixCOO) = 2 -Base.hash(x::T, h::UInt) where T <: ROCSparseDeviceMatrixCOO = - foldr(hash, (x.rowInd, x.colInd, x.nzVal, x.dims, x.nnz, T, h)) -SparseArrays.nnz(g::ROCSparseDeviceMatrixCOO) = g.nnz - -# input/output - -function Base.show(io::IO, ::MIME"text/plain", A::ROCSparseDeviceVector) - println(io, "$(length(A))-element device sparse vector at:") - println(io, " iPtr: $(A.iPtr)") - print(io, " nzVal: $(A.nzVal)") -end - -function Base.show(io::IO, ::MIME"text/plain", 
A::ROCSparseDeviceMatrixCSR) - println(io, "$(length(A))-element device sparse matrix CSR at:") - println(io, " rowPtr: $(A.rowPtr)") - println(io, " colVal: $(A.colVal)") - print(io, " nzVal: $(A.nzVal)") -end - -function Base.show(io::IO, ::MIME"text/plain", A::ROCSparseDeviceMatrixCSC) - println(io, "$(length(A))-element device sparse matrix CSC at:") - println(io, " colPtr: $(A.colPtr)") - println(io, " rowVal: $(A.rowVal)") - print(io, " nzVal: $(A.nzVal)") -end - -function Base.show(io::IO, ::MIME"text/plain", A::ROCSparseDeviceMatrixBSR) - println(io, "$(length(A))-element device sparse matrix BSR at:") - println(io, " rowPtr: $(A.rowPtr)") - println(io, " colVal: $(A.colVal)") - print(io, " nzVal: $(A.nzVal)") -end - -function Base.show(io::IO, ::MIME"text/plain", A::ROCSparseDeviceMatrixCOO) - println(io, "$(length(A))-element device sparse matrix COO at:") - println(io, " rowPtr: $(A.rowPtr)") - println(io, " colInd: $(A.colInd)") - print(io, " nzVal: $(A.nzVal)") -end diff --git a/src/sparse/interfaces.jl b/src/sparse/interfaces.jl index 4299bd35e..fe77ab835 100644 --- a/src/sparse/interfaces.jl +++ b/src/sparse/interfaces.jl @@ -242,3 +242,5 @@ for SparseMatrixType in [:ROCSparseMatrixCSC, :ROCSparseMatrixCSR], op in [:(+), end end end + +# TODO _sptranspose / _spadjoint diff --git a/src/sparse/linalg.jl b/src/sparse/linalg.jl index 934d29c3b..83374ef3e 100644 --- a/src/sparse/linalg.jl +++ b/src/sparse/linalg.jl @@ -1,43 +1 @@ -function sum_dim1(A::ROCSparseMatrixCSR) - function kernel(Tnorm, out, dA) - idx::UInt32 = (blockIdx().x-1) * blockDim().x + threadIdx().x - idx < length(dA.rowPtr) || return - s = zero(Tnorm) - for k in dA.rowPtr[idx]:dA.rowPtr[idx+1]-1 - s += abs(dA.nzVal[k]) - end - out[idx] = s - return - end - - m, n = size(A) - Tnorm = typeof(float(real(zero(eltype(A))))) - Tsum = promote_type(Float64,Tnorm) - rowsum = AMDGPU.ROCArray{Tsum}(undef, m) - groupsize = n - gridsize = cld(n, groupsize) - @roc gridsize=gridsize groupsize=groupsize kernel(Tnorm, rowsum, A) - return rowsum -end - -function LinearAlgebra.opnorm(A::ROCSparseMatrixCSR, p::Real=2) - if p == Inf - return maximum(sum_dim1(A)) - else - error("p=$p is not supported") - end -end - -LinearAlgebra.opnorm(A::ROCSparseMatrixCSC, p::Real=2) = opnorm(ROCSparseMatrixCSR(A), p) - -# work around upstream breakage from JuliaLang/julia#55547 -@static if VERSION >= v"1.11.2" - const ROCSparseUpperOrUnitUpperTriangular = LinearAlgebra.UpperOrUnitUpperTriangular{ - <:Any,<:Union{<:AbstractROCSparseMatrix, Adjoint{<:Any, <:AbstractROCSparseMatrix}, Transpose{<:Any, <:AbstractROCSparseMatrix}}} - const ROCSparseLowerOrUnitLowerTriangular = LinearAlgebra.LowerOrUnitLowerTriangular{ - <:Any,<:Union{<:AbstractROCSparseMatrix, Adjoint{<:Any, <:AbstractROCSparseMatrix}, Transpose{<:Any, <:AbstractROCSparseMatrix}}} - LinearAlgebra.istriu(::ROCSparseUpperOrUnitUpperTriangular) = true - LinearAlgebra.istril(::ROCSparseUpperOrUnitUpperTriangular) = false - LinearAlgebra.istriu(::ROCSparseLowerOrUnitLowerTriangular) = false - LinearAlgebra.istril(::ROCSparseLowerOrUnitLowerTriangular) = true -end +# TODO triu, kron diff --git a/src/sparse/rocSPARSE.jl b/src/sparse/rocSPARSE.jl index 8ff305066..1ca043101 100644 --- a/src/sparse/rocSPARSE.jl +++ b/src/sparse/rocSPARSE.jl @@ -6,17 +6,17 @@ using CEnum: @cenum using LinearAlgebra using LinearAlgebra: HermOrSym, BlasComplex, BlasFloat, BlasReal, MulAddMul, AdjOrTrans using SparseArrays -using SparseArrays: nonzeroinds, dimlub +using SparseArrays: nonzeroinds, nonzeros, rowvals, 
getcolptr, dimlub +using GPUArrays using ..AMDGPU using ..AMDGPU: @allowscalar using ..AMDGPU: ROCArrayStyle, threadIdx, blockIdx, blockDim +import SparseArrays: SparseVector, SparseMatrixCSC import AMDGPU: librocsparse, HandleCache, HIP, library_state, ROCVector import AMDGPU.Device: ROCDeviceVector import .HIP: HIPContext, HIPStream, hipStream_t -import SparseArrays: SparseVector, SparseMatrixCSC - const SparseChar = Char # core library @@ -54,10 +54,6 @@ include("array.jl") include("util.jl") include("types.jl") -# native functionality -include("device.jl") -include("broadcast.jl") - # low-level wrappers include("helpers.jl") include("level1.jl") diff --git a/test/rocsparse/device.jl b/test/rocsparse/device.jl index 9c08b313b..368b19f55 100644 --- a/test/rocsparse/device.jl +++ b/test/rocsparse/device.jl @@ -1,24 +1,54 @@ @testset "rocconvert" begin - @test isbitstype(ROCSparseDeviceVector{Float32, Cint, AMDGPU.Device.AS.Global}) - @test isbitstype(ROCSparseDeviceMatrixCSC{Float32, Cint, AMDGPU.Device.AS.Global}) - @test isbitstype(ROCSparseDeviceMatrixCSR{Float32, Cint, AMDGPU.Device.AS.Global}) - @test isbitstype(ROCSparseDeviceMatrixBSR{Float32, Cint, AMDGPU.Device.AS.Global}) - @test isbitstype(ROCSparseDeviceMatrixCOO{Float32, Cint, AMDGPU.Device.AS.Global}) + @test isbitstype(GPUSparseDeviceVector{ + Float32, Cint, + ROCDeviceVector{Cint, AS.Global}, + ROCDeviceVector{Float32, AS.Global}, AS.Global}) + @test isbitstype(GPUSparseDeviceMatrixCSC{ + Float32, Cint, + ROCDeviceVector{Cint, AS.Global}, + ROCDeviceVector{Float32, AS.Global}, AS.Global}) + @test isbitstype(GPUSparseDeviceMatrixCSR{ + Float32, Cint, + ROCDeviceVector{Cint, AS.Global}, + ROCDeviceVector{Float32, AS.Global}, AS.Global}) + @test isbitstype(GPUSparseDeviceMatrixBSR{ + Float32, Cint, + ROCDeviceVector{Cint, AS.Global}, + ROCDeviceVector{Float32, AS.Global}, AS.Global}) + @test isbitstype(GPUSparseDeviceMatrixCOO{ + Float32, Cint, + ROCDeviceVector{Cint, AS.Global}, + ROCDeviceVector{Float32, AS.Global}, AS.Global}) V = sprand(10, 0.5) rocV = ROCSparseVector(V) - @test rocconvert(rocV) isa ROCSparseDeviceVector{Float64, Cint, 1} + @test rocconvert(rocV) isa GPUSparseDeviceVector{ + Float64, Cint, + ROCDeviceVector{Cint, AS.Global}, + ROCDeviceVector{Float64, AS.Global}, AS.Global} A = sprand(10, 10, 0.5) rocA = ROCSparseMatrixCSC(A) - @test rocconvert(rocA) isa ROCSparseDeviceMatrixCSC{Float64, Cint, 1} + @test rocconvert(rocA) isa GPUSparseDeviceMatrixCSC{ + Float64, Cint, + ROCDeviceVector{Cint, AS.Global}, + ROCDeviceVector{Float64, AS.Global}, AS.Global} rocA = ROCSparseMatrixCSR(A) - @test rocconvert(rocA) isa ROCSparseDeviceMatrixCSR{Float64, Cint, 1} + @test rocconvert(rocA) isa GPUSparseDeviceMatrixCSR{ + Float64, Cint, + ROCDeviceVector{Cint, AS.Global}, + ROCDeviceVector{Float64, AS.Global}, AS.Global} rocA = ROCSparseMatrixCOO(A) - @test rocconvert(rocA) isa ROCSparseDeviceMatrixCOO{Float64, Cint, 1} + @test rocconvert(rocA) isa GPUSparseDeviceMatrixCOO{ + Float64, Cint, + ROCDeviceVector{Cint, AS.Global}, + ROCDeviceVector{Float64, AS.Global}, AS.Global} rocA = ROCSparseMatrixBSR(A, 2) - @test rocconvert(rocA) isa ROCSparseDeviceMatrixBSR + @test rocconvert(rocA) isa GPUSparseDeviceMatrixBSR{ + Float64, Cint, + ROCDeviceVector{Cint, AS.Global}, + ROCDeviceVector{Float64, AS.Global}, AS.Global} end diff --git a/test/rocsparse/rocsparse.jl b/test/rocsparse/rocsparse.jl index 52c8e7c32..cdc2bc6d6 100644 --- a/test/rocsparse/rocsparse.jl +++ b/test/rocsparse/rocsparse.jl @@ -4,9 +4,11 @@ using 
SparseArrays using AMDGPU.rocSPARSE - using .rocSPARSE: ROCSparseDeviceVector, ROCSparseDeviceMatrixCSC - using .rocSPARSE: ROCSparseDeviceMatrixCSR, ROCSparseDeviceMatrixBSR - using .rocSPARSE: ROCSparseDeviceMatrixCOO + using AMDGPU.Device.AS + using AMDGPU.Device: ROCDeviceVector + using AMDGPU.GPUArrays: + GPUSparseDeviceVector, GPUSparseDeviceMatrixCSC, GPUSparseDeviceMatrixCSR, + GPUSparseDeviceMatrixBSR, GPUSparseDeviceMatrixCOO include("broadcast.jl") include("conversions.jl")
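
A minimal usage sketch (not part of the patch) of what the changes above enable, assuming the GPUArrays sparse interface referenced in the diff; matrix sizes, density, and variable names are arbitrary. It exercises the new `KA.get_backend` methods for rocSPARSE arrays, the CPU adaptor that now lowers sparse GPU arrays back to SparseArrays types, and `rocconvert`, which now produces the generic `GPUArrays.GPUSparseDeviceMatrixCSR` in place of the removed `ROCSparseDeviceMatrixCSR`.

    using AMDGPU, Adapt, SparseArrays
    using AMDGPU.rocSPARSE: ROCSparseMatrixCSR, ROCSparseMatrixCSC
    import KernelAbstractions as KA

    A  = sprand(Float32, 100, 100, 0.1)   # host CSC matrix (arbitrary size/density)
    dA = ROCSparseMatrixCSR(A)            # upload in CSR format, as in the tests above

    KA.get_backend(dA)                    # ROCBackend(), via the new get_backend methods

    # Sparse GPU arrays now adapt through the CPU backend instead of erroring,
    # thanks to the widened adapt_storage(::KA.CPU, ...) method.
    hA = Adapt.adapt(KA.CPU(), ROCSparseMatrixCSC(A))   # -> SparseMatrixCSC

    # Kernel-side representation: the generic GPUArrays device type replaces
    # the backend-specific ROCSparseDeviceMatrixCSR deleted in this patch.
    AMDGPU.rocconvert(dA)                 # -> GPUArrays.GPUSparseDeviceMatrixCSR{...}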