diff --git a/Project.toml b/Project.toml index 586a097..9054165 100644 --- a/Project.toml +++ b/Project.toml @@ -1,11 +1,12 @@ name = "Octavian" uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4" authors = ["Chris Elrod", "Dilum Aluthge", "Mason Protter", "contributors"] -version = "0.3.22" +version = "0.3.23" [deps] CPUSummary = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" +HyperDualNumbers = "50ceba7f-c3ee-5a84-a6e8-3ad40456ec97" IfElse = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" ManualMemory = "d125e4d3-2237-4719-b19c-fa641b8a4667" @@ -16,9 +17,16 @@ StaticArrayInterface = "0d7ed370-da01-4f52-bd93-41d350b8b718" ThreadingUtilities = "8290d209-cae3-49c0-8002-c8c24d57dab5" VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f" +[weakdeps] +ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" + +[extensions] +ForwardDiffExt = "ForwardDiff" + [compat] CPUSummary = "0.1.26, 0.2.1" ForwardDiff = "0.10" +HyperDualNumbers = "4" IfElse = "0.1" LoopVectorization = "0.12.86" ManualMemory = "0.1.1" @@ -30,13 +38,11 @@ ThreadingUtilities = "0.5" VectorizationBase = "0.21.15" julia = "1.6" -[extensions] -ForwardDiffExt = "ForwardDiff" - [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" +HyperDualNumbers = "50ceba7f-c3ee-5a84-a6e8-3ad40456ec97" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" @@ -45,7 +51,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f" [targets] -test = ["Aqua", "BenchmarkTools", "ForwardDiff", "InteractiveUtils", "LinearAlgebra", "LoopVectorization", "Random", "VectorizationBase", "Test"] - -[weakdeps] -ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" +test = ["Aqua", "BenchmarkTools", "ForwardDiff", "HyperDualNumbers", "InteractiveUtils", "LinearAlgebra", "LoopVectorization", "Random", "VectorizationBase", "Test"] diff --git a/ext/HyperDualNumbersExt.jl b/ext/HyperDualNumbersExt.jl new file mode 100644 index 0000000..0af5650 --- /dev/null +++ b/ext/HyperDualNumbersExt.jl @@ -0,0 +1,249 @@ +module HyperDualNumbersExt + +using HyperDualNumbers: Hyper +using Octavian: ArrayInterface, + @turbo, @tturbo, + One, Zero, + indices, static +import Octavian: real_rep, _matmul!, _matmul_serial! + +real_rep(a::AbstractArray{DualT}) where {T,DualT<:Hyper{T}} = + reinterpret(reshape, T, a) +_view1(B::AbstractMatrix) = @view(B[1, :]) +_view1(B::AbstractArray{<:Any,3}) = @view(B[1, :, :]) + +for AbstractVectorOrMatrix in (:AbstractVector, :AbstractMatrix) + # multiplication of dual vector/matrix by standard matrix from the left + @eval function _matmul!( + _C::$(AbstractVectorOrMatrix){DualT}, + A::AbstractMatrix, + _B::$(AbstractVectorOrMatrix){DualT}, + α, + β = Zero(), + nthread::Nothing = nothing, + MKN = nothing, + contig_axis = nothing + ) where {T, DualT<:Hyper{T}} + B = real_rep(_B) + C = real_rep(_C) + + @tturbo for n ∈ indices((C, B), 3), + m ∈ indices((C, A), (2, 1)), + l in indices((C, B), 1) + + Cₗₘₙ = zero(eltype(C)) + for k ∈ indices((A, B), 2) + Cₗₘₙ += A[m, k] * B[l, k, n] + end + C[l, m, n] = α * Cₗₘₙ + β * C[l, m, n] + end + + _C + end + + # multiplication of dual matrix by standard vector/matrix from the right + @eval @inline function _matmul!( + _C::$(AbstractVectorOrMatrix){DualT}, + _A::AbstractMatrix{DualT}, + B::$(AbstractVectorOrMatrix), + α = One(), + β = Zero(), + nthread::Nothing = nothing, + MKN = nothing + ) where {T,DualT<:Hyper{T}} + if Bool(ArrayInterface.is_dense(_C)) && + Bool(ArrayInterface.is_column_major(_C)) && + Bool(ArrayInterface.is_dense(_A)) && + Bool(ArrayInterface.is_column_major(_A)) + # we can avoid the reshape and call the standard method + A = reinterpret(T, _A) + C = reinterpret(T, _C) + _matmul!(C, A, B, α, β, nthread, nothing) + else + # we cannot use the standard method directly + A = real_rep(_A) + C = real_rep(_C) + + @tturbo for n ∈ indices((C, B), (3, 2)), + m ∈ indices((C, A), 2), + l in indices((C, A), 1) + + Cₗₘₙ = zero(eltype(C)) + for k ∈ indices((A, B), (3, 1)) + Cₗₘₙ += A[l, m, k] * B[k, n] + end + C[l, m, n] = α * Cₗₘₙ + β * C[l, m, n] + end + end + + _C + end + + @eval @inline function _matmul!( + _C::$(AbstractVectorOrMatrix){DualT}, + _A::AbstractMatrix{DualT}, + _B::$(AbstractVectorOrMatrix){DualT}, + α = One(), + β = Zero(), + nthread::Nothing = nothing, + MKN = nothing, + contig = nothing + ) where {T,DualT<:Hyper{T}} + A = real_rep(_A) + C = real_rep(_C) + B = real_rep(_B) + if Bool(ArrayInterface.is_dense(_C)) && + Bool(ArrayInterface.is_column_major(_C)) && + Bool(ArrayInterface.is_dense(_A)) && + Bool(ArrayInterface.is_column_major(_A)) + # we can avoid the reshape and call the standard method + Ar = reinterpret(T, _A) + Cr = reinterpret(T, _C) + _matmul!(Cr, Ar, _view1(B), α, β, nthread, nothing) + else + # we cannot use the standard method directly + @tturbo for n ∈ indices((C, B), 3), + m ∈ indices((C, A), 2), + l in indices((C, A), 1) + + Cₗₘₙ = zero(eltype(C)) + for k ∈ indices((A, B), (3, 2)) + Cₗₘₙ += A[l, m, k] * B[1, k, n] + end + C[l, m, n] = α * Cₗₘₙ + β * C[l, m, n] + end + end + @tturbo for n ∈ indices((B, C), 3), m ∈ indices((A, C), 2), p ∈ 1:3 + Cₚₘₙ = zero(eltype(C)) + for k ∈ indices((A, B), (3, 2)) + Cₚₘₙ += A[1, m, k] * B[p+1, k, n] + end + C[p+1, m, n] = C[p+1, m, n] + α * Cₚₘₙ + end + + @tturbo for n ∈ indices((B, C), 3), m ∈ indices((A, C), 2) + Cₘₙ = zero(eltype(C)) + for k ∈ indices((A, B), (3, 2)) + Cₘₙ += A[2, m, k] * B[3, k, n] + A[3, m, k] * B[2, k, n] + end + C[4, m, n] = C[4, m, n] + α * Cₘₙ + end + _C + end + + # multiplication of dual vector/matrix by standard matrix from the left + @eval function _matmul_serial!( + _C::$(AbstractVectorOrMatrix){DualT}, + A::AbstractMatrix, + _B::$(AbstractVectorOrMatrix){DualT}, + α, + β, + MKN + ) where {T, DualT<:Hyper{T}} + B = real_rep(_B) + C = real_rep(_C) + + @turbo for n ∈ indices((C, B), 3), + m ∈ indices((C, A), (2, 1)), + l in indices((C, B), 1) + + Cₗₘₙ = zero(eltype(C)) + for k ∈ indices((A, B), 2) + Cₗₘₙ += A[m, k] * B[l, k, n] + end + C[l, m, n] = α * Cₗₘₙ + β * C[l, m, n] + end + + _C + end + +# multiplication of dual matrix by standard vector/matrix from the right + @eval @inline function _matmul_serial!( + _C::$(AbstractVectorOrMatrix){DualT}, + _A::AbstractMatrix{DualT}, + B::$(AbstractVectorOrMatrix), + α, + β, + MKN + ) where {T,DualT<:Hyper{T}} + if Bool(ArrayInterface.is_dense(_C)) && + Bool(ArrayInterface.is_column_major(_C)) && + Bool(ArrayInterface.is_dense(_A)) && + Bool(ArrayInterface.is_column_major(_A)) + # we can avoid the reshape and call the standard method + A = reinterpret(T, _A) + C = reinterpret(T, _C) + _matmul_serial!(C, A, B, α, β, nothing) + else + # we cannot use the standard method directly + A = real_rep(_A) + C = real_rep(_C) + + @turbo for n ∈ indices((C, B), (3, 2)), + m ∈ indices((C, A), 2), + l in indices((C, A), 1) + + Cₗₘₙ = zero(eltype(C)) + for k ∈ indices((A, B), (3, 1)) + Cₗₘₙ += A[l, m, k] * B[k, n] + end + C[l, m, n] = α * Cₗₘₙ + β * C[l, m, n] + end + end + + _C + end + + @eval @inline function _matmul_serial!( + _C::$(AbstractVectorOrMatrix){DualT}, + _A::AbstractMatrix{DualT}, + _B::$(AbstractVectorOrMatrix){DualT}, + α, + β, + MKN + ) where {T, DualT<:Hyper{T}} + A = real_rep(_A) + C = real_rep(_C) + B = real_rep(_B) + if Bool(ArrayInterface.is_dense(_C)) && + Bool(ArrayInterface.is_column_major(_C)) && + Bool(ArrayInterface.is_dense(_A)) && + Bool(ArrayInterface.is_column_major(_A)) + # we can avoid the reshape and call the standard method + Ar = reinterpret(T, _A) + Cr = reinterpret(T, _C) + _matmul_serial!(Cr, Ar, _view1(B), α, β, nothing) + else + # we cannot use the standard method directly + @turbo for n ∈ indices((C, B), 3), + m ∈ indices((C, A), 2), + l in indices((C, A), 1) + + Cₗₘₙ = zero(eltype(C)) + for k ∈ indices((A, B), (3, 2)) + Cₗₘₙ += A[l, m, k] * B[1, k, n] + end + C[l, m, n] = α * Cₗₘₙ + β * C[l, m, n] + end + end + + @turbo for n ∈ indices((B, C), 3), m ∈ indices((A, C), 2), p ∈ 1:3 + Cₚₘₙ = zero(eltype(C)) + for k ∈ indices((A, B), (3, 2)) + Cₚₘₙ += A[1, m, k] * B[p+1, k, n] + end + C[p+1, m, n] = C[p+1, m, n] + α * Cₚₘₙ + end + + @tturbo for n ∈ indices((B, C), 3), m ∈ indices((A, C), 2) + Cₘₙ = zero(eltype(C)) + for k ∈ indices((A, B), (3, 2)) + Cₘₙ += A[2, m, k] * B[3, k, n] + A[3, m, k] * B[2, k, n] + end + C[4, m, n] = C[4, m, n] + α * Cₘₙ + end + _C + end +end # for + +end # module \ No newline at end of file diff --git a/src/Octavian.jl b/src/Octavian.jl index 105633e..6539178 100644 --- a/src/Octavian.jl +++ b/src/Octavian.jl @@ -73,6 +73,9 @@ if !isdefined(Base, :get_extension) include("../ext/ForwardDiffExt.jl") end +# TODO: confirm when we need this extension +include("../ext/HyperDualNumbersExt.jl") + @static if VERSION >= v"1.8.0-beta1" @setup_workload begin # Putting some things in `setup` can reduce the size of the diff --git a/test/hyperduals.jl b/test/hyperduals.jl new file mode 100644 index 0000000..0891b4a --- /dev/null +++ b/test/hyperduals.jl @@ -0,0 +1,92 @@ +function randdual(x) + _x = zeros(HyperDualNumbers.Hyper{Float64}, size(x)...) + for i in eachindex(x) + _x[i] = HyperDualNumbers.Hyper(x[i], randn(), randn(), randn()) + end + return _x +end + +function reinterpretHD(T, A) + tmp = reinterpret(T, A) + return tmp[1:4:end, :] +end + +@time @testset "HyperDualNumbers.jl" begin + m = 53 + n = 63 + k = 73 + + A1 = rand(Float64, m, k) + B1 = rand(Float64, k, n) + C1 = rand(Float64, m, n) + + α = Float64(2.0) + β = Float64(2.0) + + @testset "real array from the right" begin + A1dual = randdual(A1) + C1dual = randdual(C1) + + A2dual = deepcopy(A1dual) + B2 = deepcopy(B1) + C2dual = deepcopy(C1dual) + + Octavian.matmul!(C1dual, A1dual, B1, α, β) + LinearAlgebra.mul!(C2dual, A2dual, B2, α, β) + @test reinterpret(Float64, C1dual) ≈ reinterpret(Float64, C2dual) + end + + @testset "real array from the left" begin + B1dual = randdual(B1) + C1dual = randdual(C1) + + A2 = deepcopy(A1) + B2dual = deepcopy(B1dual) + C2dual = deepcopy(C1dual) + + Octavian.matmul!(C1dual, A1, B1dual, α, β) + LinearAlgebra.mul!(C2dual, A2, B2dual, α, β) + @test reinterpret(Float64, C1dual) ≈ reinterpret(Float64, C2dual) + end + + + @testset "transposed arrays" begin + A1dual = randdual(A1') + C1dual = randdual(C1) + + A2dual = deepcopy(A1dual) + B2 = deepcopy(B1) + C2dual = deepcopy(C1dual) + + C3dual = similar(C1dual) + C4dual = similar(C2dual) + Octavian.matmul!(C1dual, A1dual', B1) + Octavian.matmul!(C2dual, A2dual', B2) + Octavian.matmul_serial!(C3dual, A1dual', B1) + Octavian.matmul_serial!(C4dual, A2dual', B2) + + Cref = zeros(Float64, size(C1)...) + LinearAlgebra.mul!(Cref, A1, B1) + @test (reinterpretHD(Float64, C1dual) ≈ reinterpretHD(Float64, C2dual) ≈ + reinterpretHD(Float64, C3dual) ≈ reinterpretHD(Float64, C4dual) ≈ Cref) && + (reinterpret(Float64, C1dual) ≈ reinterpret(Float64, C2dual) ≈ + reinterpret(Float64, C3dual) ≈ reinterpret(Float64, C4dual) ) + end + + @testset "two dual arrays" begin + A1d = randdual(A1) + B1d = randdual(B1) + @test reinterpret(Float64, Octavian.matmul(A1d, B1d, 1.3)) ≈ + reinterpret(Float64, Octavian.matmul_serial(A1d, B1d, 1.3)) ≈ + reinterpret(Float64, (A1d * B1d) .* 1.3) + @test reinterpret( + Float64, + Octavian.matmul(@view(A1d[begin:end-1, :]), B1d) + ) ≈ + reinterpret( + Float64, + Octavian.matmul_serial(@view(A1d[begin:end-1, :]), B1d) + ) ≈ + reinterpret(Float64, @view(A1d[begin:end-1, :]) * B1d) + end +end diff --git a/test/runtests.jl b/test/runtests.jl index 05baff2..5bb3e1e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -9,6 +9,7 @@ import Octavian import Aqua import BenchmarkTools import ForwardDiff +import HyperDualNumbers import InteractiveUtils import LinearAlgebra import LoopVectorization @@ -37,5 +38,5 @@ include("utils.jl") if sizeof(Int) >= 8 || !Sys.iswindows() include("forward_diff.jl") end - +include("hyperduals.jl") include("aqua.jl") # run the Aqua.jl tests last