diff --git a/Project.toml b/Project.toml
index 586a097..9054165 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,11 +1,12 @@
 name = "Octavian"
 uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
 authors = ["Chris Elrod", "Dilum Aluthge", "Mason Protter", "contributors"]
-version = "0.3.22"
+version = "0.3.23"
 
 [deps]
 CPUSummary = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+HyperDualNumbers = "50ceba7f-c3ee-5a84-a6e8-3ad40456ec97"
 IfElse = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173"
 LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 ManualMemory = "d125e4d3-2237-4719-b19c-fa641b8a4667"
@@ -16,9 +17,16 @@ StaticArrayInterface = "0d7ed370-da01-4f52-bd93-41d350b8b718"
 ThreadingUtilities = "8290d209-cae3-49c0-8002-c8c24d57dab5"
 VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 
+[weakdeps]
+ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+
+[extensions]
+ForwardDiffExt = "ForwardDiff"
+
 [compat]
 CPUSummary = "0.1.26, 0.2.1"
 ForwardDiff = "0.10"
+HyperDualNumbers = "4"
 IfElse = "0.1"
 LoopVectorization = "0.12.86"
 ManualMemory = "0.1.1"
@@ -30,13 +38,11 @@ ThreadingUtilities = "0.5"
 VectorizationBase = "0.21.15"
 julia = "1.6"
 
-[extensions]
-ForwardDiffExt = "ForwardDiff"
-
 [extras]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+HyperDualNumbers = "50ceba7f-c3ee-5a84-a6e8-3ad40456ec97"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
@@ -45,7 +51,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 
 [targets]
-test = ["Aqua", "BenchmarkTools", "ForwardDiff", "InteractiveUtils", "LinearAlgebra", "LoopVectorization", "Random", "VectorizationBase", "Test"]
-
-[weakdeps]
-ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+test = ["Aqua", "BenchmarkTools", "ForwardDiff", "HyperDualNumbers", "InteractiveUtils", "LinearAlgebra", "LoopVectorization", "Random", "VectorizationBase", "Test"]
diff --git a/ext/HyperDualNumbersExt.jl b/ext/HyperDualNumbersExt.jl
new file mode 100644
index 0000000..0af5650
--- /dev/null
+++ b/ext/HyperDualNumbersExt.jl
@@ -0,0 +1,249 @@
+module HyperDualNumbersExt
+
+using HyperDualNumbers: Hyper
+using Octavian: ArrayInterface,
+                 @turbo, @tturbo,
+                 One, Zero,
+                 indices, static
+import Octavian: real_rep, _matmul!, _matmul_serial!
+
+real_rep(a::AbstractArray{DualT}) where {T,DualT<:Hyper{T}} =
+  reinterpret(reshape, T, a)
+_view1(B::AbstractMatrix) = @view(B[1, :])
+_view1(B::AbstractArray{<:Any,3}) = @view(B[1, :, :])
+
+for AbstractVectorOrMatrix in (:AbstractVector, :AbstractMatrix)
+  # multiplication of dual vector/matrix by standard matrix from the left
+  @eval function _matmul!(
+    _C::$(AbstractVectorOrMatrix){DualT},
+    A::AbstractMatrix,
+    _B::$(AbstractVectorOrMatrix){DualT},
+    α,
+    β = Zero(),
+    nthread::Nothing = nothing,
+    MKN = nothing,
+    contig_axis = nothing
+  ) where {T, DualT<:Hyper{T}}
+    B = real_rep(_B)
+    C = real_rep(_C)
+
+    @tturbo for n ∈ indices((C, B), 3),
+      m ∈ indices((C, A), (2, 1)),
+      l in indices((C, B), 1)
+
+      Cₗₘₙ = zero(eltype(C))
+      for k ∈ indices((A, B), 2)
+        Cₗₘₙ += A[m, k] * B[l, k, n]
+      end
+      C[l, m, n] = α * Cₗₘₙ + β * C[l, m, n]
+    end
+
+    _C
+  end
+
+  # multiplication of dual matrix by standard vector/matrix from the right
+  @eval @inline function _matmul!(
+    _C::$(AbstractVectorOrMatrix){DualT},
+    _A::AbstractMatrix{DualT},
+    B::$(AbstractVectorOrMatrix),
+    α = One(),
+    β = Zero(),
+    nthread::Nothing = nothing,
+    MKN = nothing
+  ) where {T,DualT<:Hyper{T}}
+    if Bool(ArrayInterface.is_dense(_C)) &&
+      Bool(ArrayInterface.is_column_major(_C)) &&
+      Bool(ArrayInterface.is_dense(_A)) &&
+      Bool(ArrayInterface.is_column_major(_A))
+      # we can avoid the reshape and call the standard method
+      A = reinterpret(T, _A)
+      C = reinterpret(T, _C)
+      _matmul!(C, A, B, α, β, nthread, nothing)
+    else
+      # we cannot use the standard method directly
+      A = real_rep(_A)
+      C = real_rep(_C)
+
+      @tturbo for n ∈ indices((C, B), (3, 2)),
+        m ∈ indices((C, A), 2),
+        l in indices((C, A), 1)
+
+        Cₗₘₙ = zero(eltype(C))
+        for k ∈ indices((A, B), (3, 1))
+          Cₗₘₙ += A[l, m, k] * B[k, n]
+        end
+        C[l, m, n] = α * Cₗₘₙ + β * C[l, m, n]
+      end
+    end
+
+    _C
+  end
+
+  @eval @inline function _matmul!(
+    _C::$(AbstractVectorOrMatrix){DualT},
+    _A::AbstractMatrix{DualT},
+    _B::$(AbstractVectorOrMatrix){DualT},
+    α = One(),
+    β = Zero(),
+    nthread::Nothing = nothing,
+    MKN = nothing,
+    contig = nothing
+  ) where {T,DualT<:Hyper{T}}
+    A = real_rep(_A)
+    C = real_rep(_C)
+    B = real_rep(_B)
+    if Bool(ArrayInterface.is_dense(_C)) &&
+      Bool(ArrayInterface.is_column_major(_C)) &&
+      Bool(ArrayInterface.is_dense(_A)) &&
+      Bool(ArrayInterface.is_column_major(_A))
+      # we can avoid the reshape and call the standard method
+      Ar = reinterpret(T, _A)
+      Cr = reinterpret(T, _C)
+      _matmul!(Cr, Ar, _view1(B), α, β, nthread, nothing)
+    else
+      # we cannot use the standard method directly
+      @tturbo for n ∈ indices((C, B), 3),
+        m ∈ indices((C, A), 2),
+        l in indices((C, A), 1)
+
+        Cₗₘₙ = zero(eltype(C))
+        for k ∈ indices((A, B), (3, 2))
+          Cₗₘₙ += A[l, m, k] * B[1, k, n]
+        end
+        C[l, m, n] = α * Cₗₘₙ + β * C[l, m, n]
+      end
+    end
+    @tturbo for n ∈ indices((B, C), 3), m ∈ indices((A, C), 2), p ∈ 1:3
+      Cₚₘₙ = zero(eltype(C))
+      for k ∈ indices((A, B), (3, 2))
+        Cₚₘₙ += A[1, m, k] * B[p+1, k, n]
+      end
+      C[p+1, m, n] = C[p+1, m, n] + α * Cₚₘₙ
+    end
+
+    @tturbo for n ∈ indices((B, C), 3), m ∈ indices((A, C), 2)
+      Cₘₙ = zero(eltype(C))
+      for k ∈ indices((A, B), (3, 2))
+        Cₘₙ += A[2, m, k] * B[3, k, n] + A[3, m, k] * B[2, k, n]
+      end
+      C[4, m, n] = C[4, m, n] + α * Cₘₙ
+    end
+    _C
+  end
+
+  # multiplication of dual vector/matrix by standard matrix from the left
+  @eval function _matmul_serial!(
+    _C::$(AbstractVectorOrMatrix){DualT},
+    A::AbstractMatrix,
+    _B::$(AbstractVectorOrMatrix){DualT},
+    α,
+    β,
+    MKN
+  ) where {T, DualT<:Hyper{T}}
+    B = real_rep(_B)
+    C = real_rep(_C)
+
+    @turbo for n ∈ indices((C, B), 3),
+      m ∈ indices((C, A), (2, 1)),
+      l in indices((C, B), 1)
+
+      Cₗₘₙ = zero(eltype(C))
+      for k ∈ indices((A, B), 2)
+        Cₗₘₙ += A[m, k] * B[l, k, n]
+      end
+      C[l, m, n] = α * Cₗₘₙ + β * C[l, m, n]
+    end
+
+    _C
+  end
+
+# multiplication of dual matrix by standard vector/matrix from the right
+  @eval @inline function _matmul_serial!(
+    _C::$(AbstractVectorOrMatrix){DualT},
+    _A::AbstractMatrix{DualT},
+    B::$(AbstractVectorOrMatrix),
+    α,
+    β,
+    MKN
+  ) where {T,DualT<:Hyper{T}}
+    if Bool(ArrayInterface.is_dense(_C)) &&
+      Bool(ArrayInterface.is_column_major(_C)) &&
+      Bool(ArrayInterface.is_dense(_A)) &&
+      Bool(ArrayInterface.is_column_major(_A))
+      # we can avoid the reshape and call the standard method
+      A = reinterpret(T, _A)
+      C = reinterpret(T, _C)
+      _matmul_serial!(C, A, B, α, β, nothing)
+    else
+      # we cannot use the standard method directly
+      A = real_rep(_A)
+      C = real_rep(_C)
+
+      @turbo for n ∈ indices((C, B), (3, 2)),
+        m ∈ indices((C, A), 2),
+        l in indices((C, A), 1)
+
+        Cₗₘₙ = zero(eltype(C))
+        for k ∈ indices((A, B), (3, 1))
+          Cₗₘₙ += A[l, m, k] * B[k, n]
+        end
+        C[l, m, n] = α * Cₗₘₙ + β * C[l, m, n]
+      end
+    end
+
+    _C
+  end
+
+  @eval @inline function _matmul_serial!(
+    _C::$(AbstractVectorOrMatrix){DualT},
+    _A::AbstractMatrix{DualT},
+    _B::$(AbstractVectorOrMatrix){DualT},
+    α,
+    β,
+    MKN
+  ) where {T, DualT<:Hyper{T}}
+    A = real_rep(_A)
+    C = real_rep(_C)
+    B = real_rep(_B)
+    if Bool(ArrayInterface.is_dense(_C)) &&
+      Bool(ArrayInterface.is_column_major(_C)) &&
+      Bool(ArrayInterface.is_dense(_A)) &&
+      Bool(ArrayInterface.is_column_major(_A))
+      # we can avoid the reshape and call the standard method
+      Ar = reinterpret(T, _A)
+      Cr = reinterpret(T, _C)
+      _matmul_serial!(Cr, Ar, _view1(B), α, β, nothing)
+    else
+      # we cannot use the standard method directly
+      @turbo for n ∈ indices((C, B), 3),
+        m ∈ indices((C, A), 2),
+        l in indices((C, A), 1)
+
+        Cₗₘₙ = zero(eltype(C))
+        for k ∈ indices((A, B), (3, 2))
+          Cₗₘₙ += A[l, m, k] * B[1, k, n]
+        end
+        C[l, m, n] = α * Cₗₘₙ + β * C[l, m, n]
+      end
+    end
+
+    @turbo for n ∈ indices((B, C), 3), m ∈ indices((A, C), 2), p ∈ 1:3
+      Cₚₘₙ = zero(eltype(C))
+      for k ∈ indices((A, B), (3, 2))
+        Cₚₘₙ += A[1, m, k] * B[p+1, k, n]
+      end
+      C[p+1, m, n] = C[p+1, m, n] + α * Cₚₘₙ
+    end
+
+    @tturbo for n ∈ indices((B, C), 3), m ∈ indices((A, C), 2)
+      Cₘₙ = zero(eltype(C))
+      for k ∈ indices((A, B), (3, 2))
+        Cₘₙ += A[2, m, k] * B[3, k, n] + A[3, m, k] * B[2, k, n]
+      end
+      C[4, m, n] = C[4, m, n] + α * Cₘₙ
+    end
+    _C
+  end
+end # for
+
+end # module
\ No newline at end of file
diff --git a/src/Octavian.jl b/src/Octavian.jl
index 105633e..6539178 100644
--- a/src/Octavian.jl
+++ b/src/Octavian.jl
@@ -73,6 +73,9 @@ if !isdefined(Base, :get_extension)
   include("../ext/ForwardDiffExt.jl")
 end
 
+# TODO: confirm when we need this extension
+include("../ext/HyperDualNumbersExt.jl")
+
 @static if VERSION >= v"1.8.0-beta1"
   @setup_workload begin
     # Putting some things in `setup` can reduce the size of the
diff --git a/test/hyperduals.jl b/test/hyperduals.jl
new file mode 100644
index 0000000..0891b4a
--- /dev/null
+++ b/test/hyperduals.jl
@@ -0,0 +1,92 @@
+function randdual(x)
+  _x = zeros(HyperDualNumbers.Hyper{Float64}, size(x)...)
+  for i in eachindex(x)
+    _x[i] = HyperDualNumbers.Hyper(x[i], randn(), randn(), randn())
+  end
+  return _x
+end
+
+function reinterpretHD(T, A)
+  tmp = reinterpret(T, A)
+  return tmp[1:4:end, :]
+end
+
+@time @testset "HyperDualNumbers.jl" begin
+  m = 53
+  n = 63
+  k = 73
+
+  A1 = rand(Float64, m, k)
+  B1 = rand(Float64, k, n)
+  C1 = rand(Float64, m, n)
+
+  α = Float64(2.0)
+  β = Float64(2.0)
+
+  @testset "real array from the right" begin
+    A1dual = randdual(A1)
+    C1dual = randdual(C1)
+    
+    A2dual = deepcopy(A1dual)
+    B2 = deepcopy(B1)
+    C2dual = deepcopy(C1dual)
+
+    Octavian.matmul!(C1dual, A1dual, B1, α, β)
+    LinearAlgebra.mul!(C2dual, A2dual, B2, α, β)
+    @test reinterpret(Float64, C1dual) ≈ reinterpret(Float64, C2dual)
+  end
+
+  @testset "real array from the left" begin
+    B1dual = randdual(B1)
+    C1dual = randdual(C1)
+
+    A2 = deepcopy(A1)
+    B2dual = deepcopy(B1dual)
+    C2dual = deepcopy(C1dual)
+
+    Octavian.matmul!(C1dual, A1, B1dual, α, β)
+    LinearAlgebra.mul!(C2dual, A2, B2dual, α, β)
+    @test reinterpret(Float64, C1dual) ≈ reinterpret(Float64, C2dual)
+  end
+
+  
+  @testset "transposed arrays" begin
+    A1dual = randdual(A1')
+    C1dual = randdual(C1)
+
+    A2dual = deepcopy(A1dual)
+    B2 = deepcopy(B1)
+    C2dual = deepcopy(C1dual)
+
+    C3dual = similar(C1dual)
+    C4dual = similar(C2dual)
+    Octavian.matmul!(C1dual, A1dual', B1)
+    Octavian.matmul!(C2dual, A2dual', B2)
+    Octavian.matmul_serial!(C3dual, A1dual', B1)
+    Octavian.matmul_serial!(C4dual, A2dual', B2)
+
+    Cref = zeros(Float64, size(C1)...)
+    LinearAlgebra.mul!(Cref, A1, B1)
+    @test (reinterpretHD(Float64, C1dual) ≈ reinterpretHD(Float64, C2dual) ≈ 
+          reinterpretHD(Float64, C3dual) ≈ reinterpretHD(Float64, C4dual) ≈ Cref) && 
+          (reinterpret(Float64, C1dual) ≈ reinterpret(Float64, C2dual) ≈ 
+          reinterpret(Float64, C3dual) ≈ reinterpret(Float64, C4dual) )
+  end
+
+  @testset "two dual arrays" begin 
+    A1d = randdual(A1)
+    B1d = randdual(B1)
+    @test reinterpret(Float64, Octavian.matmul(A1d, B1d, 1.3)) ≈
+          reinterpret(Float64, Octavian.matmul_serial(A1d, B1d, 1.3)) ≈
+          reinterpret(Float64, (A1d * B1d) .* 1.3)
+    @test reinterpret(
+            Float64,
+            Octavian.matmul(@view(A1d[begin:end-1, :]), B1d)
+          ) ≈
+          reinterpret(
+            Float64,
+            Octavian.matmul_serial(@view(A1d[begin:end-1, :]), B1d)
+          ) ≈
+          reinterpret(Float64, @view(A1d[begin:end-1, :]) * B1d)
+  end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 05baff2..5bb3e1e 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -9,6 +9,7 @@ import Octavian
 import Aqua
 import BenchmarkTools
 import ForwardDiff
+import HyperDualNumbers
 import InteractiveUtils
 import LinearAlgebra
 import LoopVectorization
@@ -37,5 +38,5 @@ include("utils.jl")
 if sizeof(Int) >= 8 || !Sys.iswindows()
   include("forward_diff.jl")
 end
-
+include("hyperduals.jl")
 include("aqua.jl") # run the Aqua.jl tests last