diff --git a/.gitignore b/.gitignore index 20a8981..9930b45 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,7 @@ /docs/build/ Manifest.toml *~ +*# +*#* +*#*# diff --git a/Project.toml b/Project.toml index 9cd614c..14573f8 100644 --- a/Project.toml +++ b/Project.toml @@ -1,20 +1,23 @@ name = "Octavian" uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4" authors = ["Mason Protter", "Chris Elrod", "Dilum Aluthge", "contributors"] -version = "0.1.0" +version = "0.2.0-DEV" [deps] ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" +ThreadingUtilities = "8290d209-cae3-49c0-8002-c8c24d57dab5" VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f" [compat] ArrayInterface = "2.14" LoopVectorization = "0.9.18" +ThreadingUtilities = "0.2" VectorizationBase = "0.15.2" julia = "1.5" [extras] +Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -24,4 +27,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f" [targets] -test = ["BenchmarkTools", "InteractiveUtils", "LinearAlgebra", "LoopVectorization", "Random", "VectorizationBase", "Test"] +test = ["Aqua", "BenchmarkTools", "InteractiveUtils", "LinearAlgebra", "LoopVectorization", "Random", "VectorizationBase", "Test"] diff --git a/src/Octavian.jl b/src/Octavian.jl index ee420dc..0196b36 100644 --- a/src/Octavian.jl +++ b/src/Octavian.jl @@ -1,23 +1,33 @@ module Octavian -import ArrayInterface -import LoopVectorization -import VectorizationBase +using VectorizationBase, ArrayInterface, LoopVectorization -using VectorizationBase: StaticInt +using VectorizationBase: align, gep, AbstractStridedPointer, AbstractSIMDVector, vnoaliasstore!, staticm1, + static_sizeof, lazymul, vmul_fast, StridedPointer, gesp, zero_offsets, pause, + CACHE_COUNT, NUM_CORES, CACHE_INCLUSIVITY, zstridedpointer +using LoopVectorization: maybestaticsize, mᵣ, nᵣ, preserve_buffer, CloseOpen +using ArrayInterface: StaticInt, Zero, One, OptionallyStaticUnitRange, size, strides, offsets, indices, + static_length, static_first, static_last, axes, + dense_dims, DenseDims, stride_rank, StrideRank -export matmul -export matmul! +using ThreadingUtilities: + _atomic_add!, _atomic_umax!, _atomic_umin!, + _atomic_load, _atomic_store!, _atomic_cas_cmp!, + SPIN, WAIT, TASK, LOCK, STUP, taskpointer, + wake_thread!, __wait, load, store! 
+ +export matmul!, matmul_serial!, matmul, matmul_serial, StaticInt include("global_constants.jl") include("types.jl") - +include("staticfloats.jl") +include("integerdivision.jl") +include("memory_buffer.jl") include("block_sizes.jl") +include("funcptrs.jl") include("macrokernels.jl") -include("matmul.jl") -include("memory_buffer.jl") -include("pointer_matrix.jl") include("utils.jl") +include("matmul.jl") include("init.jl") # `Octavian.__init__()` is defined in this file diff --git a/src/block_sizes.jl b/src/block_sizes.jl index 80dbd96..7f7e502 100644 --- a/src/block_sizes.jl +++ b/src/block_sizes.jl @@ -1,70 +1,233 @@ + +first_effective_cache(::Type{T}) where {T} = StaticInt{FIRST__CACHE_SIZE}() ÷ static_sizeof(T) +second_effective_cache(::Type{T}) where {T} = StaticInt{SECOND_CACHE_SIZE}() ÷ static_sizeof(T) + +function block_sizes(::Type{T}, _α, _β, R₁, R₂) where {T} + W = VectorizationBase.pick_vector_width_val(T) + α = _α * W + β = _β * W + L₁ₑ = first_effective_cache(T) * R₁ + L₂ₑ = second_effective_cache(T) * R₂ + block_sizes(W, α, β, L₁ₑ, L₂ₑ) +end +function block_sizes(W, α, β, L₁ₑ, L₂ₑ) + MᵣW = StaticInt{mᵣ}() * W + + Mc = floortostaticint(√(L₁ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₂ₑ) / MᵣW) * MᵣW + Kc = roundtostaticint(√(L₁ₑ)*√(L₂ₑ)/√(L₁ₑ*β + L₂ₑ*α)) + Nc = floortostaticint(√(L₂ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₁ₑ) / StaticInt{nᵣ}()) * StaticInt{nᵣ}() + + Mc, Kc, Nc +end +function block_sizes(::Type{T}) where {T} + block_sizes(T, StaticFloat{W₁Default}(), StaticFloat{W₂Default}(), StaticFloat{R₁Default}(), StaticFloat{R₂Default}()) +end + """ - block_sizes(::Type{T}) -> (Mc, Kc, Nc) - -Returns the dimensions of our macrokernel, which iterates over the microkernel. That is, in -calculating `C = A * B`, our macrokernel will be called on `Mc × Kc` blocks of `A`, multiplying - them with `Kc × Nc` blocks of `B`, to update `Mc × Nc` blocks of `C`. - -We want these blocks to fit nicely in the cache. There is a lot of room for improvement here, but this -initial implementation should work reasonably well. - - -The constants `LoopVectorization.mᵣ` and `LoopVectorization.nᵣ` are the factors by which LoopVectorization -wants to unroll the rows and columns of the microkernel, respectively. `LoopVectorization` defines these -constants by running it's analysis on a gemm kernel; they're there for convenience/to make it easier to -implement matrix-multiply. -It also wants to vectorize the rows by `W = VectorizationBase.pick_vector_width_val(T)`. -Thus, the microkernel's dimensions are `(W * mᵣ) × nᵣ`; that is, the microkernel updates a `(W * mᵣ) × nᵣ` -block of `C`. - -Because the macrokernel iterates over tiles and repeatedly applies the microkernel, we would prefer the -macrokernel's dimensions to be an integer multiple of the microkernel's. -That is, we want `Mc` to be an integer multiple of `W * mᵣ` and `Nc` to be an integer multiple of `nᵣ`. - -Additionally, we want our blocks of `A` to fit in the core-local L2 cache. -Empirically, I found that when using `Float64` arrays, 72 rows works well on Haswell (where `W * mᵣ = 8`) -and 96 works well for Cascadelake (where `W * mᵣ = 24`). -So I kind of heuristically multiply `W * mᵣ` by `4` given 32 vector register (as in Cascadelake), which -would yield `96`, and multiply by `9` otherwise, which would give `72` on Haswell. -Ideally, we'd have a better means of picking. -I suspect relatively small numbers work well because I'm currently using a column-major memory layout for -the internal packing arrays. 
A column-major memory layout means that if our macro-kernel had a lot of rows,
-moving across columns would involve reading memory far apart, moving across memory pages more rapidly,
-hitting the TLB harder. This is why libraries like OpenBLAS and BLIS don't use a column-major layout, but
-reshape into a 3-d array, e.g. `A` will be reshaped into a `Mᵣ × Kc × (Mc ÷ Mᵣ)` array (also sometimes
-referred to as a tile-major matrix), so that all memory reads happen in consecutive memory locations.
-
-
-Now that we have `Mc`, we use it and the `L2` cache size to calculate `Kc`, but shave off a percent to
-leave room in the cache for some other things.
-
-We want out blocks of `B` to fir in the `L3` cache, so we can use the `L3` cache-size and `Kc` to
-similarly calculate `Nc`, with the additional note that we also divide and multiply by `nᵣ` to ensure
-that `Nc` is an integer multiple of `nᵣ`.
+    split_m(M, Miters_base, W)
+
+Splits `M` into at most `Miters_base` iterations.
+For example, if we wish to divide `517` iterations into roughly 7 blocks using multiples of `8`:
+
+```julia
+julia> split_m(517, 7, 8)
+(72, 2, 69, 7)
+```
+
+This suggests a base block size of `72`, with two of the iterations incremented by `8` (`= W`) to `80`,
+and a final block of `69` to handle the remainder. It also tells us that there are `7` total iterations, as requested.
+```julia
+julia> 80*2 + 72*(7-2-1) + 69
+517
+```
+This is meant to produce roughly the requested number of blocks, with relatively even sizes.
+
+This method is used fairly generally.
"""
-function block_sizes(::Type{T}) where {T}
-    _L1, _L2, __L3, _L4 = VectorizationBase.CACHE_SIZE
-    L1c, L2c, L3c, L4c = VectorizationBase.CACHE_COUNT
-    # TODO: something better than treating it as 4 MiB if there is no L3 cache
-    # one possibility is to focus on the L1 and L2 caches instead of the L2 and L3.
-    _L3 = something(__L3, 4194304)
-    @assert L1c == L2c == VectorizationBase.NUM_CORES
+@inline function split_m(M, _Mblocks, W)
+    Miters = cld_fast(M, W)
+    Mblocks = min(_Mblocks, Miters)
+    Miter_per_block, Mrem = divrem_fast(Miters, Mblocks)
+    Mbsize = Miter_per_block * W
+    Mremfinal = M - Mbsize*(Mblocks-1) - Mrem * W
+    Mbsize, Mrem, Mremfinal, Mblocks
+end
+
+"""
+    solve_block_sizes(::Type{T}, M, K, N, α, β, R₂, R₃, Wfactor)
+
+This function returns iteration/blocking descriptions `Mc`, `Kc`, and `Nc` for use when packing both `A` and `B`.
+
+It tries to roughly minimize the cost
+```julia
+MKN/(Kc*W) + α * MKN/Mc + β * MKN/Nc
+```
+subject to constraints
+```julia
+Mc - M ≤ 0
+Kc - K ≤ 0
+Nc - N ≤ 0
+Mc*Kc - L₁ₑ ≤ 0
+Kc*Nc - L₂ₑ ≤ 0
+```
+That is, our constraints say that our block sizes shouldn't be bigger than the actual dimensions, and also that
+our packed `A` (`Mc × Kc`) should fit into the first packing cache (generally, this is actually the `L₂`), and our
+packed `B` (`Kc × Nc`) should fit into the second packing cache (generally the `L₃`).
-    st = VectorizationBase.static_sizeof(T)
+Our cost model consists of three components:
+1. Cost of moving data in and out of registers. This is done `(M/Mᵣ * K/Kc * N/Nᵣ)` times and the cost per iteration is `(Mᵣ/W * Nᵣ)`.
+2. Cost of moving strips of the packed `B` from the lower cache levels to the highest cache levels when multiplying `Aₚ * Bₚ`.
+   This is done `(M / Mc * K / Kc * N / Nc)` times, and the cost per iteration is proportional to `(Kc * Nᵣ)`.
+   `α` is the proportionality-constant parameter.
+3. Cost of packing `A`. This is done `(M / Mc * K / Kc * N / Nc)` times, and the cost per iteration is proportional to
+   `(Mc * Kc)`.
+   `β` is the proportionality-constant parameter.
-    L2 = (StaticInt{_L2}() - StaticInt{_L1}()) ÷ st
-    L3 = _calculate_L3(StaticInt{_L2}(), StaticInt{_L3}(), st, Val{VectorizationBase.CACHE_INCLUSIVITY[3]}())
+As `W` is a constant, we multiply the cost by `W` and absorb it into `α` and `β`. We drop it from the description
+from here on out.
+In the full problem, we would have the Lagrangian, with μᵢ < 0:
+```julia
+f((Mc,Kc,Nc),(μ₁,μ₂,μ₃,μ₄,μ₅)) = MKN/Kc + α * MKN/Mc + β * MKN/Nc -
+    μ₁*(Mc - M) - μ₂*(Kc - K) - μ₃*(Nc - N) - μ₄*(Mc*Kc - L₁ₑ) - μ₅*(Kc*Nc - L₂ₑ)
+```
+```julia
+0 = ∂L/∂Mc = - α * MKN / Mc² - μ₁ - μ₄*Kc
+0 = ∂L/∂Kc = - MKN / Kc² - μ₂ - μ₄*Mc - μ₅*Nc
+0 = ∂L/∂Nc = - β * MKN / Nc² - μ₃ - μ₅*Kc
+0 = ∂L/∂μ₁ = M - Mc
+0 = ∂L/∂μ₂ = K - Kc
+0 = ∂L/∂μ₃ = N - Nc
+0 = ∂L/∂μ₄ = L₁ₑ - Mc*Kc
+0 = ∂L/∂μ₅ = L₂ₑ - Kc*Nc
+```
+The first 3 constraints complicate things, because they're trivially solved by setting `M = Mc`, `K = Kc`, and `N = Nc`.
+But this will violate the last two constraints in general; normally we will be on the interior of the inequalities,
+meaning we'd be dropping those constraints. So first, let's just solve the cost without constraints 1-3:
+```julia
+0 = ∂L/∂Mc = - α * MKN / Mc² - μ₄*Kc
+0 = ∂L/∂Kc = - MKN / Kc² - μ₄*Mc - μ₅*Nc
+0 = ∂L/∂Nc = - β * MKN / Nc² - μ₅*Kc
+0 = ∂L/∂μ₄ = L₁ₑ - Mc*Kc
+0 = ∂L/∂μ₅ = L₂ₑ - Kc*Nc
+```
+Solving:
+```julia
+Mc = √(L₁ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₂ₑ)
+Kc = √(L₁ₑ)*√(L₂ₑ)/√(L₁ₑ*β + L₂ₑ*α)
+Nc = √(L₂ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₁ₑ)
+μ₄ = -K*√(L₂ₑ)*M*N*α/(L₁ₑ^(3/2)*√(L₁ₑ*β + L₂ₑ*α))
+μ₅ = -K*√(L₁ₑ)*M*N*β/(L₂ₑ^(3/2)*√(L₁ₑ*β + L₂ₑ*α))
+```
+These solutions are independent of matrix size.
+The approach we'll take here is solving for `Nc`, `Kc`, and then finally `Mc` one after the other, incorporating sizes.
+
+Starting with `N`, we check how many iterations would be implied by `Nc`, and then choose the smallest value that would
+yield that number of iterations. This also ensures that `Nc ≤ N`.
+```julia
+Niter = cld(N, √(L₂ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₁ₑ))
+Nblock, Nrem = divrem(N, Niter)
+Nblock_Nrem = Nblock + (Nrem > 0)
+```
+We have `Nrem` iterations of size `Nblock_Nrem`, and `Niter - Nrem` iterations of size `Nblock`.
+
+We can now make `Nc = Nblock_Nrem` a constant, drop the `μ₅` constraint (we reapply `Kc * Nc ≤ L₂ₑ` below),
+and solve the remaining equations again:
+```julia
+0 = ∂L/∂Mc = - α * MKN / Mc² - μ₄*Kc
+0 = ∂L/∂Kc = - MKN / Kc² - μ₄*Mc
+0 = ∂L/∂μ₄ = L₁ₑ - Mc*Kc
+```
+yielding
+```julia
+Mc = √(L₁ₑ)*√(α)
+Kc = √(L₁ₑ)/√(α)
+μ₄ = -K*M*N*√(α)/L₁ₑ^(3/2)
+```
+We proceed in the same fashion as for `Nc`, being sure to reapply the `Kc * Nc ≤ L₂ₑ` constraint:
+```julia
+Kiter = cld(K, min(√(L₁ₑ)/√(α), L₂ₑ/Nc))
+Kblock, Krem = divrem(K, Kiter)
+Kblock_Krem = Kblock + (Krem > 0)
+```
+This leaves the `Mc` partitioning, for which we use the constraint `Mc * Kc ≤ L₁ₑ` to set
+the initial number of proposed iterations to `cld(M, L₁ₑ / Kblock_Krem)` when calling `split_m`:
+```julia
+Mbsize, Mrem, Mremfinal, Mblocks = split_m(M, cld(M, L₁ₑ / Kblock_Krem), StaticInt{W}())
+```
+
+Note that for synchronization on `B`, all threads must have the same values for `Kc` and `Nc`.
+`K` and `N` will be equal between threads, but `M` may differ. By calculating `Kc` and `Nc`
+independently of `M`, this algorithm guarantees all threads are on the same page.
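+
+As a quick numeric sanity check of the closed-form optimum (with made-up values for `α`, `β`, `L₁ₑ`, and `L₂ₑ`;
+the real values derive from the tuned `W₁`/`W₂`/`R₁`/`R₂` constants and the measured cache sizes), the solution
+is tight on both cache constraints, and perturbing it along those constraints only increases the cost:
+```julia
+α, β, L₁ₑ, L₂ₑ = 0.05, 0.75, 2^15, 2^18   # hypothetical values, for illustration only
+cost(Mc, Kc, Nc) = 1/Kc + α/Mc + β/Nc     # the objective, with MKN (and W) factored out
+Mc = √(L₁ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₂ₑ)
+Kc = √(L₁ₑ)*√(L₂ₑ)/√(L₁ₑ*β + L₂ₑ*α)
+Nc = √(L₂ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₁ₑ)
+Mc*Kc ≈ L₁ₑ && Kc*Nc ≈ L₂ₑ                # true: both cache constraints are tight
+onconstraints(m) = (m, L₁ₑ/m, L₂ₑ*m/L₁ₑ)  # points keeping both constraints tight
+cost(onconstraints(Mc)...) ≤
+    min(cost(onconstraints(0.9Mc)...), cost(onconstraints(1.1Mc)...)) # true
+```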
+""" +@inline function solve_block_sizes(::Type{T}, M, K, N, _α, _β, R₂, R₃, Wfactor) where {T} W = VectorizationBase.pick_vector_width_val(T) - Mr = StaticInt{LoopVectorization.mᵣ}() - Nr = StaticInt{LoopVectorization.nᵣ}() + α = _α * W + β = _β * W + L₁ₑ = first_effective_cache(T) * R₂ + L₂ₑ = second_effective_cache(T) * R₃ - Mc = (VectorizationBase.REGISTER_COUNT == 32 ? StaticInt{4}() : StaticInt{9}()) * Mr * W - Kc = (StaticInt{5}() * L2) ÷ (StaticInt{7}() * Mc) - Nc = ((StaticInt{5}() * L3) ÷ (StaticInt{7}() * Kc * Nr)) * Nr + # Nc_init = round(Int, √(L₂ₑ)*√(α * L₂ₑ + β * L₁ₑ)/√(L₁ₑ)) + Nc_init⁻¹ = √(L₁ₑ) / (√(L₂ₑ)*√(α * L₂ₑ + β * L₁ₑ)) + + Niter = cldapproxi(N, Nc_init⁻¹) # approximate `ceil` + Nblock, Nrem = divrem_fast(N, Niter) + Nblock_Nrem = Nblock + One()#(Nrem > 0) - Mc, Kc, Nc + ((Mblock, Mblock_Mrem, Mremfinal, Mrem, Miter), (Kblock, Kblock_Krem, Krem, Kiter)) = solve_McKc(T, M, K, Nblock_Nrem, _α, _β, R₂, R₃, Wfactor) + + (Mblock, Mblock_Mrem, Mremfinal, Mrem, Miter), (Kblock, Kblock_Krem, Krem, Kiter), promote(Nblock, Nblock_Nrem, Nrem, Niter) +end +# Takes Nc, calcs Mc and Kc +@inline function solve_McKc(::Type{T}, M, K, Nc, _α, _β, R₂, R₃, Wfactor) where {T} + W = VectorizationBase.pick_vector_width_val(T) + α = _α * W + β = _β * W + L₁ₑ = first_effective_cache(T) * R₂ + L₂ₑ = second_effective_cache(T) * R₃ + + Kc_init⁻¹ = Base.FastMath.max_fast(√(α/L₁ₑ), Nc*inv(L₂ₑ)) + Kiter = cldapproxi(K, Kc_init⁻¹) # approximate `ceil` + Kblock, Krem = divrem_fast(K, Kiter) + Kblock_Krem = Kblock + One() + + Miter_init = cldapproxi(M * inv(L₁ₑ), Kblock_Krem) # Miter = M * Kc / L₁ₑ + Mbsize, Mrem, Mremfinal, Miter = split_m(M, Miter_init, W * Wfactor) + Mblock_Mrem = Mbsize + W * Wfactor + + promote(Mbsize, Mblock_Mrem, Mremfinal, Mrem, Miter), promote(Kblock, Kblock_Krem, Krem, Kiter) +end + +@inline cldapproxi(n, d⁻¹) = Base.fptosi(Int, Base.FastMath.add_fast(Base.FastMath.mul_fast(n, d⁻¹), 0.9999999999999432)) # approximate `ceil` + +""" + find_first_acceptable(M, W) + +Finds first combination of `Miter` and `Niter` that doesn't make `M` too small while producing `Miter * Niter = NUM_CORES`. +This would be awkard if there are computers with prime numbers of cores. I should probably consider that possibility at some point. +""" +@inline function find_first_acceptable(M, W) + Mᵣ = StaticInt{mᵣ}() * W + for (miter,niter) ∈ CORE_FACTORS + if miter * ((MᵣW_mul_factor - One()) * Mᵣ) ≤ M + (W + W) + return miter, niter + end + end + last(CORE_FACTORS) +end +""" + divide_blocks(M, Ntotal, _nspawn, W) + +Splits both `M` and `N` into blocks when trying to spawn a large number of threads relative to the size of the matrices. 
+""" +@inline function divide_blocks(M, Ntotal, _nspawn, W) + _nspawn == NUM_CORES && return find_first_acceptable(M, W) + + Miter = clamp(div_fast(M, W*StaticInt{mᵣ}() * MᵣW_mul_factor), 1, _nspawn) + nspawn = div_fast(_nspawn, Miter) + Niter = if (nspawn ≤ 1) & (Miter < _nspawn) + # rebalance Miter + Miter = cld_fast(_nspawn, cld_fast(_nspawn, Miter)) + nspawn = div_fast(_nspawn, Miter) + end + Miter, cld_fast(Ntotal, max(2, cld_fast(Ntotal, nspawn))) end -_calculate_L3(_L2, _L3, st, ::Val{true}) = (_L3 - _L2) ÷ st -_calculate_L3(_L2, _L3, st, ::Val{false}) = _L3 ÷ st diff --git a/src/funcptrs.jl b/src/funcptrs.jl new file mode 100644 index 0000000..a021484 --- /dev/null +++ b/src/funcptrs.jl @@ -0,0 +1,129 @@ + + +struct LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd} <: Function end +function (::LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd})(p::Ptr{UInt}) where {P,TC,TA,TB,Α,Β,Md,Kd,Nd} + offset, C = load(p, TC, 1) + offset, A = load(p, TA, offset) + offset, B = load(p, TB, offset) + offset, α = load(p, Α, offset) + offset, β = load(p, Β, offset) + offset, M = load(p, Md, offset) + offset, K = load(p, Kd, offset) + offset, N = load(p, Nd, offset) + _call_loopmul!(C, A, B, α, β, M, K, N, Val{P}()) + nothing +end +@inline _call_loopmul!(C, A, B, α, β, M, K, N, ::Val{false}) = loopmul!(C, A, B, α, β, M, K, N) +@inline function _call_loopmul!(C::StridedPointer{T}, A, B, α, β, M, K, N, ::Val{true}) where {T} + if M*K < first_effective_cache(T) * R₂Default + packaloopmul!(C, A, B, α, β, M, K, N) + return + else + matmul_st_only_pack_A!(C, A, B, α, β, M, K, N, StaticFloat{W₁Default}(), StaticFloat{W₂Default}(), StaticFloat{R₁Default}(), StaticFloat{R₂Default}()) + return + end +end +call_loopmul!(C, A, B, α, β, M, K, N, ::Val{P}) where {P} = _call_loopmul!(C, A, B, α, β, M, K, N, Val{P}()) + +struct SyncMulFunc{TC,TA,TB,Α,Β,Md,Kd,Nd,AP,BCP,ID,TT,W₁,W₂,R₁,R₂} <: Function end +function (::SyncMulFunc{TC,TA,TB,Α,Β,Md,Kd,Nd,AP,BCP,ID,TT,W₁,W₂,R₁,R₂})(p::Ptr{UInt}) where {TC,TA,TB,Α,Β,Md,Kd,Nd,AP,BCP,ID,TT,W₁,W₂,R₁,R₂} + offset, C = load(p, TC, 1) + offset, A = load(p, TA, offset) + offset, B = load(p, TB, offset) + offset, α = load(p, Α, offset) + offset, β = load(p, Β, offset) + offset, M = load(p, Md, offset) + offset, K = load(p, Kd, offset) + offset, N = load(p, Nd, offset) + offset, atomicp = load(p, AP, offset) + offset, bcachep = load(p, BCP, offset) + offset, id = load(p, ID, offset) + offset, total_ids = load(p, TT, offset) + sync_mul!(C, A, B, α, β, M, K, N, atomicp, bcachep, id, total_ids, StaticFloat{W₁}(), StaticFloat{W₂}(), StaticFloat{R₁}(), StaticFloat{R₂}()) + nothing +end + +@generated function cfuncpointer(::T) where {T} + precompile(T(), (Ptr{UInt},)) + quote + $(Expr(:meta,:inline)) + @cfunction($(T()), Cvoid, (Ptr{UInt},)) + end +end + +@inline function setup_matmul!(p::Ptr{UInt}, C::TC, A::TA, B::TB, α::Α, β::Β, M::Md, K::Kd, N::Nd, ::Val{P}) where {P,TC,TA,TB,Α,Β,Md,Kd,Nd} + offset = store!(p, cfuncpointer(LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd}()), 0) + offset = store!(p, C, offset) + offset = store!(p, A, offset) + offset = store!(p, B, offset) + offset = store!(p, α, offset) + offset = store!(p, β, offset) + offset = store!(p, M, offset) + offset = store!(p, K, offset) + offset = store!(p, N, offset) + nothing +end + +@inline function setup_syncmul!( + p::Ptr{UInt}, C::TC, A::TA, B::TB, α::Α, β::Β, M::Md, K::Kd, N::Nd, + ap::AP,bcp::BCP,id::ID,tt::TT,::StaticFloat{W₁},::StaticFloat{W₂},::StaticFloat{R₁},::StaticFloat{R₂} +) where {TC,TA,TB,Α,Β,Md,Kd,Nd,AP,BCP,ID,TT,W₁,W₂,R₁,R₂} + offset = 
store!(p, cfuncpointer(SyncMulFunc{TC,TA,TB,Α,Β,Md,Kd,Nd,AP,BCP,ID,TT,W₁,W₂,R₁,R₂}()), 0) + offset = store!(p, C, offset) + offset = store!(p, A, offset) + offset = store!(p, B, offset) + offset = store!(p, α, offset) + offset = store!(p, β, offset) + offset = store!(p, M, offset) + offset = store!(p, K, offset) + offset = store!(p, N, offset) + offset = store!(p, ap, offset) + offset = store!(p, bcp, offset) + offset = store!(p, id, offset) + offset = store!(p, tt, offset) + nothing +end + +function launch_thread_mul!(C, A, B, α, β, M, K, N, tid::Int, ::Val{P}) where {P} + p = taskpointer(tid) + while true + if _atomic_cas_cmp!(p, SPIN, STUP) + setup_matmul!(p, C, A, B, α, β, M, K, N, Val{P}()) + @assert _atomic_cas_cmp!(p, STUP, TASK) + return + elseif _atomic_cas_cmp!(p, WAIT, STUP) + setup_matmul!(p, C, A, B, α, β, M, K, N, Val{P}()) + @assert _atomic_cas_cmp!(p, STUP, LOCK) + wake_thread!(tid % UInt) + return + end + pause() + end +end +function launch_thread_mul!( + C, A, B, α, β, M, K, N, ap, bcp, tid, tt,::StaticFloat{W₁},::StaticFloat{W₂},::StaticFloat{R₁},::StaticFloat{R₂} +) where {W₁,W₂,R₁,R₂} + p = taskpointer(tid+one(UInt)) + while true + if _atomic_cas_cmp!(p, SPIN, STUP) + setup_syncmul!( + p, C, A, B, α, β, M, K, N, ap, bcp, tid, tt, + StaticFloat{W₁}(),StaticFloat{W₂}(),StaticFloat{R₁}(),StaticFloat{R₂}() + ) + @assert _atomic_cas_cmp!(p, STUP, TASK) + return + elseif _atomic_cas_cmp!(p, WAIT, STUP) + # we immediately write the `atomicp, bc, tid, total_tids` part, so we can dispatch to the same code as the other methods + setup_syncmul!( + p, C, A, B, α, β, M, K, N, ap, bcp, tid, tt, + StaticFloat{W₁}(),StaticFloat{W₂}(),StaticFloat{R₁}(),StaticFloat{R₂}() + ) + @assert _atomic_cas_cmp!(p, STUP, LOCK) + wake_thread!(tid+one(UInt)) + return + end + pause() + end +end + + diff --git a/src/global_constants.jl b/src/global_constants.jl index 5cf2485..b13155e 100644 --- a/src/global_constants.jl +++ b/src/global_constants.jl @@ -1,3 +1,47 @@ const BCACHE = UInt8[] const OCTAVIAN_NUM_TASKS = Ref(1) +_nthreads() = OCTAVIAN_NUM_TASKS[] + +@generated function calc_factors(::Val{nc} = Val{NUM_CORES}()) where {nc} + t = Expr(:tuple) + for i ∈ nc:-1:1 + d, r = divrem(nc, i) + iszero(r) && push!(t.args, (i, d)) + end + t +end +const CORE_FACTORS = calc_factors() + +const MᵣW_mul_factor = VectorizationBase.REGISTER_SIZE === 64 ? StaticInt{4}() : StaticInt{9}() + +if VectorizationBase.AVX512F + const W₁Default = 0.006089395198610773 + const W₂Default = 0.7979822724696168 + const R₁Default = 0.5900561503730485 + const R₂Default = 0.762152930709678 +else + const W₁Default = 0.1 # TODO: relax bounds; this was the upper bound set for the optimizer. + const W₂Default = 0.15989396641218157 + const R₁Default = 0.4203583148344484 + const R₂Default = 0.6344856142604789 +end + +const FIRST__CACHE = 1 + (VectorizationBase.CACHE_SIZE[3] !== nothing) +const SECOND_CACHE = 2 + (VectorizationBase.CACHE_SIZE[3] !== nothing) +const FIRST__CACHE_SIZE = VectorizationBase.CACHE_SIZE[FIRST__CACHE] === nothing ? 262144 : + (((FIRST__CACHE == 2) & CACHE_INCLUSIVITY[2]) ? (VectorizationBase.CACHE_SIZE[2] - VectorizationBase.CACHE_SIZE[1]) : + VectorizationBase.CACHE_SIZE[FIRST__CACHE]) +const SECOND_CACHE_SIZE = (VectorizationBase.CACHE_SIZE[SECOND_CACHE] === nothing ? 3145728 : + (CACHE_INCLUSIVITY[SECOND_CACHE] ? 
(VectorizationBase.CACHE_SIZE[SECOND_CACHE] - VectorizationBase.CACHE_SIZE[FIRST__CACHE]) :
+    VectorizationBase.CACHE_SIZE[SECOND_CACHE])) * something(VectorizationBase.CACHE_COUNT[SECOND_CACHE], 1)
+
+const CACHELINESIZE = something(VectorizationBase.L₁CACHE.linesize, 64) % UInt
+const BCACHE_COUNT = something(VectorizationBase.CACHE_COUNT[3], 1);
+const BCACHE_LOCK = Threads.Atomic{UInt}(zero(UInt))
+
+if Sys.WORD_SIZE ≤ 32
+    const ACACHE = UInt8[]
+    const ACACHEPTR = Ref{Ptr{UInt8}}(C_NULL)
+end
+
diff --git a/src/init.jl b/src/init.jl
index 93a2fa3..f7172c0 100644
--- a/src/init.jl
+++ b/src/init.jl
@@ -1,11 +1,28 @@
 function __init__()
+    init_acache()
     init_bcache()
-    init_num_tasks()
-    return nothing
+    nt = init_num_tasks()
+    if nt < NUM_CORES && ("SUPPRESS_OCTAVIAN_WARNING" ∉ keys(ENV))
+        msg = string(
+            "Your system has $NUM_CORES physical cores, but `Octavian.jl` only has ",
+            "$(nt > 1 ? "$(nt) threads" : "1 thread") available. ",
+            "For the best performance, you should start Julia with at least $(NUM_CORES) threads.",
+        )
+        @warn msg
+    end
+    reset_bcache_lock!()
 end

 function init_bcache()
-    resize!(BCACHE, VectorizationBase.CACHE_SIZE[3] * VectorizationBase.CACHE_COUNT[3])
+    resize!(BCACHE, SECOND_CACHE_SIZE * BCACHE_COUNT); nothing
+end
+
+function init_acache()
+    Sys.WORD_SIZE ≤ 32 || return
+    resize!(ACACHE, FIRST__CACHE_SIZE * init_num_tasks() + 4095)
+    ACACHEPTR[] = align(pointer(ACACHE), 4096)
+    nothing
 end

 function init_num_tasks()
@@ -15,9 +32,10 @@ end

 function _read_environment_num_tasks()
     environment_variable = get(ENV, "OCTAVIAN_NUM_TASKS", "")::String
+    nt = min(Threads.nthreads(), VectorizationBase.NUM_CORES)::Int
     if isempty(environment_variable)
-        return min(Threads.nthreads(), VectorizationBase.NUM_CORES)::Int
+        return nt
     else
-        return parse(Int, environment_variable)::Int
+        return min(parse(Int, environment_variable)::Int, nt)
     end
 end
diff --git a/src/integerdivision.jl b/src/integerdivision.jl
new file mode 100644
index 0000000..c2b3925
--- /dev/null
+++ b/src/integerdivision.jl
@@ -0,0 +1,26 @@
+@inline cld_fast(x, y) = cld(x, y)
+@inline function cld_fast(x::I, y) where {I <: Integer}
+    d = div_fast(x, y)
+    (d + (d * unsigned(y) != unsigned(x))) % I
+end
+cld_fast(::StaticInt{N}, ::StaticInt{M}) where {N,M} = (StaticInt{N}() + StaticInt{M}() - One()) ÷ StaticInt{M}()
+@inline function divrem_fast(x::I, y) where {I <: Integer}
+    ux = unsigned(x); uy = unsigned(y)
+    d = Base.udiv_int(ux, uy)
+    r = ux - d * uy
+    d % I, r % I
+end
+@inline divrem_fast(::StaticInt{x}, y::I) where {x, I <: Integer} = divrem_fast(x % I, y)
+@inline div_fast(x::I, y::Integer) where {I <: Integer} = Base.udiv_int(unsigned(x), unsigned(y)) % I
+@inline div_fast(::StaticInt{x}, y::I) where {x, I <: Integer} = Base.udiv_int(unsigned(x), unsigned(y)) % I
+divrem_fast(::StaticInt{N}, ::StaticInt{M}) where {N,M} = divrem(StaticInt{N}(), StaticInt{M}())
+div_fast(::StaticInt{N}, ::StaticInt{M}) where {N,M} = StaticInt{N}() ÷ StaticInt{M}()
+@generated function div_fast(x::I, ::StaticInt{M}) where {I<:Integer,M}
+    if VectorizationBase.ispow2(M)
+        lm = VectorizationBase.intlog2(M)
+        Expr(:block, Expr(:meta,:inline), :(x >>> $lm))
+    else
+        Expr(:block, Expr(:meta,:inline), :(div_fast(x, $(I(M)))))
+    end
+end
+
diff --git a/src/macrokernels.jl b/src/macrokernels.jl
index 886dd5e..5b4f8a6 100644
--- a/src/macrokernels.jl
+++ b/src/macrokernels.jl
@@ -1,24 +1,347 @@
-"""
-The macrokernel. It iterates over our tiles, and applies the microkernel.
-""" -function macrokernel!(C, A, B, α, β) - iszero(β) && return macrokernel!(C, A, B, α, StaticInt{0}()) - LoopVectorization.@avx for n ∈ axes(C,2), m ∈ axes(C,1) - Cmn = zero(eltype(C)) - for k ∈ axes(B,1) - Cmn += A[m,k] * B[k,n] +@inline function loopmul!(C, A, B, ::StaticInt{1}, ::StaticInt{0}, M, K, N) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ += A[m,k] * B[k,n] end - # C[m,n] may be `NaN`, but if `β == 0` we still want to zero it - C[m,n] = (α * Cmn) + β * C[m,n] + C[m,n] = Cₘₙ end + nothing end -function macrokernel!(C, A, B, α, ::StaticInt{0}) - LoopVectorization.@avx for n ∈ axes(C,2), m ∈ axes(C,1) - Cmn = zero(eltype(C)) - for k ∈ axes(B,1) - Cmn += A[m,k] * B[k,n] +@inline function loopmul!(C, A, B, ::StaticInt{1}, ::StaticInt{1}, M, K, N) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ += A[m,k] * B[k,n] end - # C[m,n] may be `NaN`, but if `β == 0` we still want to zero it - C[m,n] = (α * Cmn) + C[m,n] += Cₘₙ end + nothing end +@inline function loopmul!(C, A, B, ::StaticInt{1}, β, M, K, N) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ += A[m,k] * B[k,n] + end + C[m,n] = Cₘₙ + β * C[m,n] + end + nothing +end +@inline function loopmul!(C, A, B, ::StaticInt{-1}, ::StaticInt{0}, M, K, N) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ -= A[m,k] * B[k,n] + end + C[m,n] = Cₘₙ + end + nothing +end +@inline function loopmul!(C, A, B, ::StaticInt{-1}, ::StaticInt{1}, M, K, N) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ -= A[m,k] * B[k,n] + end + C[m,n] += Cₘₙ + end + nothing +end +@inline function loopmul!(C, A, B, ::StaticInt{-1}, β, M, K, N) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ -= A[m,k] * B[k,n] + end + C[m,n] = Cₘₙ + β * C[m,n] + end + nothing +end +@inline function loopmul!(C, A, B, α, ::StaticInt{0}, M, K, N) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ += A[m,k] * B[k,n] + end + C[m,n] = α * Cₘₙ + end + nothing +end + +@inline function loopmul!(C, A, B, α, ::StaticInt{1}, M, K, N) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ += A[m,k] * B[k,n] + end + C[m,n] += α * Cₘₙ + end + nothing +end +@inline function loopmul!(C, A, B, α, β, M, K, N) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ += A[m,k] * B[k,n] + end + C[m,n] = α * Cₘₙ + β * C[m,n] + end + nothing +end + + +@inline function packamul!( + C, Ãₚ, A, B, + ::StaticInt{1}, ::StaticInt{0}, + M, K, N +) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Aₘₖ = A[m,k] + Cₘₙ += Aₘₖ * B[k,n] + Ãₚ[m,k] = Aₘₖ + end + C[m,n] = Cₘₙ + end +end +@inline function packamul!( + C, Ãₚ, A, B, + ::StaticInt{1}, ::StaticInt{1}, + M, K, N +) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Aₘₖ = A[m,k] + Cₘₙ += Aₘₖ * B[k,n] + Ãₚ[m,k] = Aₘₖ + end + C[m,n] += Cₘₙ + end +end +@inline function packamul!( + C, Ãₚ, A, B, + ::StaticInt{1}, β, + M, K, N +) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Aₘₖ = A[m,k] + Cₘₙ += Aₘₖ * B[k,n] + Ãₚ[m,k] = Aₘₖ + end + C[m,n] = Cₘₙ + β * C[m,n] + end +end +@inline function packamul!( + C, Ãₚ, A, 
B, + ::StaticInt{-1}, ::StaticInt{0}, + M, K, N +) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Aₘₖ = A[m,k] + Cₘₙ -= Aₘₖ * B[k,n] + Ãₚ[m,k] = Aₘₖ + end + C[m,n] = Cₘₙ + end +end +@inline function packamul!( + C, Ãₚ, A, B, + ::StaticInt{-1}, ::StaticInt{1}, + M, K, N +) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Aₘₖ = A[m,k] + Cₘₙ -= Aₘₖ * B[k,n] + Ãₚ[m,k] = Aₘₖ + end + C[m,n] += Cₘₙ + end +end +@inline function packamul!( + C, Ãₚ, A, B, + ::StaticInt{-1}, β, + M, K, N +) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Aₘₖ = A[m,k] + Cₘₙ -= Aₘₖ * B[k,n] + Ãₚ[m,k] = Aₘₖ + end + C[m,n] = Cₘₙ + β * C[m,n] + end +end +@inline function packamul!( + C, Ãₚ, A, B, + α, ::StaticInt{0}, + M, K, N +) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Aₘₖ = A[m,k] + Cₘₙ += Aₘₖ * B[k,n] + Ãₚ[m,k] = Aₘₖ + end + C[m,n] = α * Cₘₙ + end +end +@inline function packamul!( + C, Ãₚ, A, B, + α, ::StaticInt{1}, + M, K, N +) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Aₘₖ = A[m,k] + Cₘₙ += Aₘₖ * B[k,n] + Ãₚ[m,k] = Aₘₖ + end + C[m,n] += α * Cₘₙ + end +end +@inline function packamul!( + C, Ãₚ, A, B, + α, β, M, K, N +) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Aₘₖ = A[m,k] + Cₘₙ += Aₘₖ * B[k,n] + Ãₚ[m,k] = Aₘₖ + end + C[m,n] = α * Cₘₙ + β * C[m,n] + end +end +@inline function alloc_a_pack(A, M, ::Type{T}) where {T} + # buffer = Vector{T}(undef, FIRST__CACHE_SIZE ÷ sizeof(T)) + buffer = first_cache_buffer(T) + bufferptr = if Sys.WORD_SIZE == 32 + reinterpret(Ptr{T}, buffer) # Base.unsafe_convert(Ptr{T}, buffer) + else + align(pointer(buffer)) + end + Apack = default_zerobased_stridedpointer(bufferptr, (One(), align(M, T))) + Apack, buffer +end +@inline function packaloopmul!( + C::AbstractStridedPointer{T}, + A::AbstractStridedPointer, + B::AbstractStridedPointer, + α, β, M, K, N +) where {T} + Ãₚ, buffer = alloc_a_pack(A, M, T) + GC.@preserve buffer begin + Nᵣ = StaticInt{nᵣ}() + packamul!(C, Ãₚ, A, B, α, β, M, K, Nᵣ) + loopmul!(gesp(C, (Zero(), Nᵣ)), Ãₚ, gesp(B, (Zero(), Nᵣ)), α, β, M, K, N - Nᵣ) + end + nothing +end + + + +@inline function inlineloopmul!(C, A, B, ::StaticInt{1}, ::StaticInt{0}, M, K, N) + @avx inline=true for m ∈ CloseOpen(M), n ∈ CloseOpen(N) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ += A[m,k] * B[k,n] + end + C[m,n] = Cₘₙ + end + C +end +@inline function inlineloopmul!(C, A, B, ::StaticInt{1}, ::StaticInt{1}, M, K, N) + @avx inline=true for m ∈ CloseOpen(M), n ∈ CloseOpen(N) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ += A[m,k] * B[k,n] + end + C[m,n] += Cₘₙ + end + C +end +@inline function inlineloopmul!(C, A, B, ::StaticInt{1}, β, M, K, N) + @avx inline=true for m ∈ CloseOpen(M), n ∈ CloseOpen(N) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ += A[m,k] * B[k,n] + end + C[m,n] = Cₘₙ + β * C[m,n] + end + C +end +@inline function inlineloopmul!(C, A, B, ::StaticInt{-1}, ::StaticInt{0}, M, K, N) + @avx inline=true for m ∈ CloseOpen(M), n ∈ CloseOpen(N) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ -= A[m,k] * B[k,n] + end + C[m,n] = Cₘₙ + end + C +end +@inline function inlineloopmul!(C, A, B, ::StaticInt{-1}, ::StaticInt{1}, M, K, N) + @avx inline=true for m ∈ CloseOpen(M), n ∈ CloseOpen(N) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ -= A[m,k] * B[k,n] + end + 
C[m,n] += Cₘₙ + end + C +end +@inline function inlineloopmul!(C, A, B, ::StaticInt{-1}, β, M, K, N) + @avx inline=true for m ∈ CloseOpen(M), n ∈ CloseOpen(N) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ -= A[m,k] * B[k,n] + end + C[m,n] = Cₘₙ + β * C[m,n] + end + C +end +@inline function inlineloopmul!(C, A, B, α, ::StaticInt{0}, M, K, N) + @avx inline=true for m ∈ CloseOpen(M), n ∈ CloseOpen(N) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ += A[m,k] * B[k,n] + end + C[m,n] = α * Cₘₙ + end + C +end +@inline function inlineloopmul!(C, A, B, α, ::StaticInt{1}, M, K, N) + @avx inline=true for m ∈ CloseOpen(M), n ∈ CloseOpen(N) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ += A[m,k] * B[k,n] + end + C[m,n] += α * Cₘₙ + end + C +end +@inline function inlineloopmul!(C, A, B, α, β, M, K, N) + @avx inline=true for m ∈ CloseOpen(M), n ∈ CloseOpen(N) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ += A[m,k] * B[k,n] + end + C[m,n] = α * Cₘₙ + β * C[m,n] + end + C +end + + diff --git a/src/matmul.jl b/src/matmul.jl index 658024d..470810b 100644 --- a/src/matmul.jl +++ b/src/matmul.jl @@ -1,90 +1,476 @@ -evenly_divide(x, y) = cld(x, cld(x, y)) -evenly_divide(x, y, z) = cld(evenly_divide(x, y), z) * z +""" +Only packs `A`. Primitively does column-major packing: it packs blocks of `A` into a column-major temporary. +""" +function matmul_st_only_pack_A!( + C::AbstractStridedPointer{T}, A::AbstractStridedPointer, B::AbstractStridedPointer, + α, β, M, K, N, ::StaticFloat{W₁}, ::StaticFloat{W₂}, ::StaticFloat{R₁}, ::StaticFloat{R₂} +) where {T, W₁, W₂, R₁, R₂} + + ((Mblock, Mblock_Mrem, Mremfinal, Mrem, Miter), (Kblock, Kblock_Krem, Krem, Kiter)) = + solve_McKc(T, M, K, N, StaticFloat{W₁}(), StaticFloat{W₂}(), StaticFloat{R₁}(), StaticFloat{R₂}(), StaticInt{mᵣ}()) + for ko ∈ CloseOpen(Kiter) + ksize = ifelse(ko < Krem, Kblock_Krem, Kblock) + let A = A, C = C + for mo in CloseOpen(Miter) + msize = ifelse((mo+1) == Miter, Mremfinal, ifelse(mo < Mrem, Mblock_Mrem, Mblock)) + # if ko == 0 + # loopmul!(C, A, B, α, β, msize, ksize, N) + # else + # loopmul!(C, A, B, α, One(), msize, ksize, N) + # end + if ko == 0 + packaloopmul!(C, A, B, α, β, msize, ksize, N) + else + packaloopmul!(C, A, B, α, One(), msize, ksize, N) + end + A = gesp(A, (msize, Zero())) + C = gesp(C, (msize, Zero())) + end + end + A = gesp(A, (Zero(), ksize)) + B = gesp(B, (ksize, Zero())) + end + nothing +end + +function matmul_st_pack_A_and_B!( + C::AbstractStridedPointer{T}, A::AbstractStridedPointer, B::AbstractStridedPointer, α, β, M, K, N, W₁, W₂, R₁, R₂, tid +) where {T} + # TODO: if this is nested in other threaded code, use only a piece of BCACHE and make R₂ (and thus L₂ₑ) smaller + (Mblock, Mblock_Mrem, Mremfinal, Mrem, Miter), (Kblock, Kblock_Krem, Krem, Kiter), (Nblock, Nblock_Nrem, Nrem, Niter) = + solve_block_sizes(T, M, K, N, W₁, W₂, R₁, R₂, mᵣ) + + bcache = _use_bcache(tid) + L3ptr = Base.unsafe_convert(Ptr{T}, bcache) + GC.@preserve BCACHE begin + for n ∈ CloseOpen(Niter) + nsize = ifelse(n < Nrem, Nblock_Nrem, Nblock) + let A = A, B = B + for k ∈ CloseOpen(Kiter) + ksize = ifelse(k < Krem, Kblock_Krem, Kblock) + _B = default_zerobased_stridedpointer(L3ptr, (One(),ksize)) + unsafe_copyto_avx!(_B, B, ksize, nsize) + let A = A, C = C, B = _B + for m in CloseOpen(Miter) + msize = ifelse((m+1) == Miter, Mremfinal, ifelse(m < Mrem, Mblock_Mrem, Mblock)) + if k == 0 + packaloopmul!(C, A, B, α, β, msize, ksize, nsize) + else + packaloopmul!(C, A, B, α, One(), msize, ksize, nsize) + end + A = gesp(A, (msize, 
Zero())) + C = gesp(C, (msize, Zero())) + end + end + A = gesp(A, (Zero(), ksize)) + B = gesp(B, (ksize, Zero())) + end + end + B = gesp(B, (Zero(), nsize)) + C = gesp(C, (Zero(), nsize)) + end + end # GC.@preserve + _free_bcache!(bcache) + nothing +end + +@inline contiguousstride1(A) = ArrayInterface.contiguous_axis(A) === ArrayInterface.Contiguous{1}() +@inline contiguousstride1(A::AbstractStridedPointer{T,N,1}) where {T,N} = true +# @inline bytestride(A::AbstractArray, i) = VectorizationBase.bytestrides(A)[i] +@inline bytestride(A::AbstractStridedPointer, i) = strides(A)[i] +@inline firstbytestride(A::AbstractStridedPointer) = bytestride(A, One()) + +@inline function vectormultiple(bytex, ::Type{Tc}, ::Type{Ta}) where {Tc,Ta} + Wc = VectorizationBase.pick_vector_width_val(Tc) * static_sizeof(Ta) - One() + iszero(bytex & (VectorizationBase.REGISTER_SIZE - 1)) +end +@inline function dontpack(pA::AbstractStridedPointer{Ta}, M, K, ::StaticInt{mc}, ::StaticInt{kc}, ::Type{Tc}) where {mc, kc, Tc, Ta} + (contiguousstride1(pA) && + ((((VectorizationBase.AVX512F ? 9 : 13) * VectorizationBase.pick_vector_width(Tc)) ≥ M) || + (vectormultiple(bytestride(pA, StaticInt{2}()), Tc, Ta) && ((M * K) ≤ (mc * kc)) && iszero(reinterpret(Int, pointer(pA)) & (VectorizationBase.REGISTER_SIZE - 1))))) +end -_dim1contig(::ArrayInterface.Contiguous) = false -_dim1contig(::ArrayInterface.Contiguous{1}) = true -dim1contig(::Type{A}) where {A <: StridedArray} = _dim1contig(ArrayInterface.contiguous_axis(A)) -dim1contig(::Type) = false -dim1contig(A) = dim1contig(typeof(A)) + +@inline function alloc_matmul_product(A::AbstractArray{TA}, B::AbstractArray{TB}) where {TA,TB} + # TODO: if `M` and `N` are statically sized, shouldn't return a `Matrix`. + M, KA = size(A) + KB, N = size(B) + @assert KA == KB "Size mismatch." 
+    Matrix{promote_type(TA,TB)}(undef, M, N), (M, KA, N)
+end
+@inline function matmul_serial(A::AbstractMatrix, B::AbstractMatrix)
+    C, (M,K,N) = alloc_matmul_product(A, B)
+    _matmul_serial!(C, A, B, One(), Zero(), (M,K,N))
+    return C
+end
+
+
+# These methods must evaluate to compile-time constants
+maybeinline(::Any, ::Any, ::Any, ::Any) = false
+function maybeinline(::StaticInt{M}, ::StaticInt{N}, ::Type{T}, ::Val{true}) where {M,N,T}
+    static_sizeof(T) * StaticInt{M}() * StaticInt{N}() < StaticInt{176}() * StaticInt(mᵣ) * StaticInt{nᵣ}()
+end
+function maybeinline(::StaticInt{M}, ::StaticInt{N}, ::Type{T}, ::Val{false}) where {M,N,T}
+    StaticInt{M}() * static_sizeof(T) ≤ StaticInt{2}() * StaticInt{VectorizationBase.REGISTER_SIZE}()
+end
+
+
+@inline function matmul_serial!(C::AbstractMatrix{T}, A::AbstractMatrix, B::AbstractMatrix) where {T}
+    matmul_serial!(C, A, B, One(), Zero(), nothing, ArrayInterface.contiguous_axis(C))
+end
+@inline function matmul_serial!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α)
+    matmul_serial!(C, A, B, α, Zero(), nothing, ArrayInterface.contiguous_axis(C))
+end
+@inline function matmul_serial!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α, β)
+    matmul_serial!(C, A, B, α, β, nothing, ArrayInterface.contiguous_axis(C))
+end
+@inline function matmul_serial!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α, β, MKN, ::ArrayInterface.Contiguous{2})
+    _matmul_serial!(C', B', A', α, β, nothing)
+    return C
+end
+@inline function matmul_serial!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α, β, MKN, ::ArrayInterface.Contiguous)
+    _matmul_serial!(C, A, B, α, β, nothing)
+    return C
+end
 """
-    matmul!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, _α = 1, _β = 0)
+    matmul_serial!(C, A, B[, α = 1, β = 0])
+
+Calculates `C = α * (A * B) + β * C` in place.
+
+A single-threaded matrix-matrix-multiply implementation.
+Supports dynamically and statically sized arrays.
+
+Organizationally, `matmul_serial!` checks the arrays' properties to try to dispatch to an appropriate implementation.
+If the arrays are small and statically sized, it will dispatch to an inlined multiply.
+
+Otherwise, based on the arrays' sizes, whether they are transposed, and whether the columns are already aligned,
+it decides whether to not pack at all, to pack only `A`, or to pack both `A` and `B`.
 """
+@inline function _matmul_serial!(
+    C::AbstractMatrix{T}, A::AbstractMatrix, B::AbstractMatrix, α, β, MKN
+) where {T}
+    M, K, N = MKN === nothing ? matmul_sizes(C, A, B) : MKN
+    pA = zstridedpointer(A); pB = zstridedpointer(B); pC = zstridedpointer(C);
+    Cb = preserve_buffer(C); Ab = preserve_buffer(A); Bb = preserve_buffer(B);
+    Mc, Kc, Nc = block_sizes(T)
+    GC.@preserve Cb Ab Bb begin
+        if maybeinline(M, N, T, ArrayInterface.is_column_major(A)) # check MUST be compile-time resolvable
+            inlineloopmul!(pC, pA, pB, One(), Zero(), M, K, N)
+            return
+        elseif VectorizationBase.CACHE_SIZE[2] === nothing || (nᵣ ≥ N) || dontpack(pA, M, K, Mc, Kc, T)
+            loopmul!(pC, pA, pB, α, β, M, K, N)
+            return
+        else
+            matmul_st_pack_dispatcher!(pC, pA, pB, α, β, M, K, N)
+            return
+        end
+    end
+end # function
+
+function matmul_st_pack_dispatcher!(pC::AbstractStridedPointer{T}, pA, pB, α, β, M, K, N, notnested::Union{Nothing,Bool} = nothing) where {T}
+    Mc, Kc, Nc = block_sizes(T)
+    if VectorizationBase.CACHE_SIZE[3] === nothing || (contiguousstride1(pB) ? (Kc * Nc ≥ K * N) : (firstbytestride(pB) ≤ 1600))
+        matmul_st_only_pack_A!(pC, pA, pB, α, β, M, K, N, StaticFloat{W₁Default}(), StaticFloat{W₂Default}(), StaticFloat{R₁Default}(), StaticFloat{R₂Default}())
+    elseif notnested === nothing ? iszero(ccall(:jl_in_threaded_region, Cint, ())) : notnested
+        matmul_st_pack_A_and_B!(pC, pA, pB, α, β, M, K, N, StaticFloat{W₁Default}(), StaticFloat{W₂Default}(), StaticFloat{R₁Default}(), StaticFloat{R₂Default}(), nothing)
+    else
+        matmul_st_pack_A_and_B!(pC, pA, pB, α, β, M, K, N, StaticFloat{W₁Default}(), StaticFloat{W₂Default}(), StaticFloat{R₁Default}(), R₂Default/Threads.nthreads(), Threads.threadid())
+    end
+    nothing
+end
+
+
+"""
+    matmul(A, B)
+
+Multiply matrices `A` and `B`.
+"""
+@inline function matmul(A::AbstractMatrix, B::AbstractMatrix)
+    C, (M,K,N) = alloc_matmul_product(A, B)
+    _matmul!(C, A, B, One(), Zero(), nothing, (M,K,N))
+    return C
+end
+
+"""
+    matmul!(C, A, B[, α, β, max_threads])
+
+Calculates `C = α * A * B + β * C` in place, overwriting the contents of `C`.
+It may use up to `max_threads` threads. It will not use threads when nested in other threaded code.
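+
+For example (a small correctness check; any sizes with a matching inner dimension work):
+```julia
+julia> using Octavian
+
+julia> A = rand(100, 40); B = rand(40, 50); C = Matrix{Float64}(undef, 100, 50);
+
+julia> matmul!(C, A, B) ≈ A * B
+true
+```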
+""" +@inline function matmul!(C::AbstractMatrix{T}, A::AbstractMatrix, B::AbstractMatrix) where {T} + matmul!(C, A, B, One(), Zero(), nothing, nothing, ArrayInterface.contiguous_axis(C)) +end +@inline function matmul!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α) + matmul!(C, A, B, α, Zero(), nothing, nothing, ArrayInterface.contiguous_axis(C)) +end +@inline function matmul!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α, β) + matmul!(C, A, B, α, β, nothing, nothing, ArrayInterface.contiguous_axis(C)) +end +@inline function matmul!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α, β, nthread) + matmul!(C, A, B, α, β, nthread, nothing, ArrayInterface.contiguous_axis(C)) +end +@inline function matmul!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α, β, nthread, MKN, ::ArrayInterface.Contiguous{2}) + _matmul!(C', B', A', α, β, nthread, MKN) + return C +end +@inline function matmul!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α, β, nthread, MKN, ::ArrayInterface.Contiguous) + _matmul!(C, A, B, α, β, nthread, MKN) + return C +end + + +@inline function dontpack(pA::AbstractStridedPointer{Ta}, M, K, ::StaticInt{mc}, ::StaticInt{kc}, ::Type{Tc}, nspawn) where {mc, kc, Tc, Ta} + # MᵣW = VectorizationBase.pick_vector_width_val(Tc) * StaticInt{mᵣ}() + # TODO: perhaps consider K vs kc by themselves? + (contiguousstride1(pA) && ((M * K) ≤ (mc * kc) * nspawn >>> 1)) +end + +# passing MKN directly would let osmeone skip the size check. +@inline function _matmul!(C::AbstractMatrix{T}, A, B, α, β, nthread, MKN) where {T}#::Union{Nothing,Tuple{Vararg{Integer,3}}}) where {T} + M, K, N = MKN === nothing ? matmul_sizes(C, A, B) : MKN + W = VectorizationBase.pick_vector_width_val(T) + pA = zstridedpointer(A); pB = zstridedpointer(B); pC = zstridedpointer(C); + Cb = preserve_buffer(C); Ab = preserve_buffer(A); Bb = preserve_buffer(B); + GC.@preserve Cb Ab Bb begin + if maybeinline(M, N, T, ArrayInterface.is_column_major(A)) # check MUST be compile-time resolvable + inlineloopmul!(pC, pA, pB, One(), Zero(), M, K, N) + return + elseif (nᵣ ≥ N) || (M*K*N < (StaticInt{13824}() * W)) + loopmul!(pC, pA, pB, α, β, M, K, N) + return + else + __matmul!(pC, pA, pB, α, β, M, K, N, nthread) + return + end end +end - # check if we want to skip packing B - do_not_pack_B = (B isa DenseArray && (K * N ≤ _Kc * _Nc)) || N ≤ LoopVectorization.nᵣ - - Bptr = Base.unsafe_convert(Ptr{T}, BCACHE); - - Mc = evenly_divide(M, _Mc, VectorizationBase.pick_vector_width_val(T) * StaticInt{LoopVectorization.mᵣ}()) - Kc = evenly_divide(M, _Kc) - Nc = evenly_divide(M, _Nc, StaticInt{LoopVectorization.nᵣ}()) - - α = T(_α); - for n ∈ StaticInt{1}():Nc:N # loop 5 - nsize = min(Int(n + Nc), Int(N + 1)) - n - β = T(_β) - for k ∈ StaticInt{1}():Kc:K # loop 4 - ksize = min(Int(k + Kc), Int(K + 1)) - k - Bview = view(B, k:k+ksize-1, n:n+nsize-1) - # seperate out loop 3, because of _Bblock type instability - if do_not_pack_B - # _Bblock is likely to have the same type as _Bblock; it'd be nice to reduce the amount of compilation - # by homogenizing types across branches, but for now I'm prefering the simplicity of using `Bview` - # _Bblock = PointerMatrix(gesp1(stridedpointer(B), (k,n)), ksize, nsize) - # matmul_loop3!(C, T, Ablock, A, _Bblock, α, β, msize, ksize, nsize, M, k, n, Mc) - matmul_loop3!(C, T, A, Bview, α, β, ksize, nsize, M, k, n, Mc) - else - Bblock = PointerMatrix(Bptr, (ksize,nsize)) - unsafe_copyto_avx!(Bblock, Bview) - matmul_loop3!(C, T, A, Bblock, α, β, ksize, nsize, M, k, n, 
-            end
-            β = one(T) # re-writing to the same blocks of `C`, so replace original factor with `1`
-        end # loop 4
-    end # loop 5
-    C
-end
-
-function matmul_loop3!(C, ::Type{T}, A, Bblock, α, β, ksize, nsize, M, k, n, Mc) where {T}
-    full_range = StaticInt{1}():Mc:M
-    partitions = Iterators.partition(full_range, OCTAVIAN_NUM_TASKS[])
-    @sync for partition ∈ partitions
-        Threads.@spawn begin
-            # Create L2-buffer for `A`; it should be stack-allocated
-            Amem = L2Buffer(T)
-            Aptr = Base.unsafe_convert(Ptr{T}, Amem);
-            GC.@preserve Amem begin
-                for m ∈ partition # loop 3
-                    msize = min(Int(m + Mc), Int(M + 1)) - m
-                    Ablock = PointerMatrix(Aptr, (msize, ksize), true)
-                    unsafe_copyto_avx!(Ablock, view(A, m:m+msize-1, k:k+ksize-1))
-
-                    Cblock = view(C, m:m+msize-1, n:n+nsize-1)
-                    macrokernel!(Cblock, Ablock, Bblock, α, β)
-                end # loop 3
-            end # GC.@preserve
-        end
-    end
-end
-
-"""
-    matmul(A::AbstractMatrix, B::AbstractMatrix)
-
-Return the matrix product A*B.
-"""
-function matmul(A::AbstractMatrix{Ta}, B::AbstractMatrix{Tb}) where {Ta, Tb}
-    # TODO: use `similar` / make it more generic; ideally should work with `StaticArrays.MArray`
-    C = Matrix{promote_type(Ta, Tb)}(undef, size(A,1), size(B,2))
-    matmul!(C, A, B)
+# This function's name is sort of a pun: it splits aggressively (it does a lot of "splittin'"), which often means it will split `N`.
+function matmulsplitn!(C::AbstractStridedPointer{T}, A, B, α, β, ::StaticInt{Mc}, M, K, N, nspawn, ::Val{PACK}) where {T, Mc, PACK}
+    Mᵣ = StaticInt{mᵣ}(); Nᵣ = StaticInt{nᵣ}();
+    W = VectorizationBase.pick_vector_width_val(T)
+    MᵣW = Mᵣ*W
+    _Mblocks, Nblocks = divide_blocks(M, cld_fast(N, Nᵣ), nspawn, W)
+    Mbsize, Mrem, Mremfinal, Mblocks = split_m(M, _Mblocks, W)
+    # Nblocks = min(N, _Nblocks)
+    Nbsize, Nrem = divrem_fast(N, Nblocks)
+
+    _nspawn = Mblocks * Nblocks
+    Mbsize_Mrem, Mbsize_ = promote(Mbsize + W, Mbsize)
+    Nbsize_Nrem, Nbsize_ = promote(Nbsize + One(), Nbsize)
+
+    let _A = A, _B = B, _C = C, n = 0, tnum = 0, Nrc = Nblocks - Nrem, Mrc = Mblocks - Mrem, __Mblocks = Mblocks - One()
+        while true
+            nsize = ifelse(Nblocks > Nrc, Nbsize_Nrem, Nbsize_); Nblocks -= 1
+            let _A = _A, _C = _C, __Mblocks = __Mblocks
+                while __Mblocks != 0
+                    msize = ifelse(__Mblocks ≥ Mrc, Mbsize_Mrem, Mbsize_); __Mblocks -= 1
+                    launch_thread_mul!(_C, _A, _B, α, β, msize, K, nsize, (tnum += 1), Val{PACK}())
+                    _A = gesp(_A, (msize, Zero()))
+                    _C = gesp(_C, (msize, Zero()))
+                end
+                if Nblocks != 0
+                    launch_thread_mul!(_C, _A, _B, α, β, Mremfinal, K, nsize, (tnum += 1), Val{PACK}())
+                else
+                    call_loopmul!(_C, _A, _B, α, β, Mremfinal, K, nsize, Val{PACK}())
+                    waitonmultasks(CloseOpen(One(), _nspawn))
+                    return
+                end
+            end
+            _B = gesp(_B, (Zero(), nsize))
+            _C = gesp(_C, (Zero(), nsize))
+        end
+    end
+end
+
+function __matmul!(
+    C::AbstractStridedPointer{T}, A::AbstractStridedPointer, B::AbstractStridedPointer, α, β, M, K, N, nthread
+) where {T}
+    Mᵣ = StaticInt{mᵣ}(); Nᵣ = StaticInt{nᵣ}();
+    W = VectorizationBase.pick_vector_width_val(T)
+    Mc, Kc, Nc = block_sizes(T)
+    MᵣW = Mᵣ*W
+
+    # Not taking the fast path
+    # But maybe we don't want to thread anyway
+    # Maybe this is nested, or we have ≤ 1 thread
+    nt = _nthreads()
+    _nthread = nthread === nothing ? nt : min(nt, nthread)
+    not_in_threaded = iszero(ccall(:jl_in_threaded_region, Cint, ()))
+    if (!not_in_threaded) | (_nthread ≤ 1)
+        matmul_st_pack_dispatcher!(C, A, B, α, β, M, K, N, not_in_threaded)
+        return
+    end
+    # We are threading, but how many threads?
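+    # (Assumed reading of the heuristic below: spawn roughly one task per `128 * W` elements
+    # of `C`, clamped to `[1, _nthread]`, so that small products don't pay threading overhead.)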
+    L = StaticInt{128}() * W
+    # L = StaticInt{64}() * W
+    nspawn = clamp(div_fast(M * N, L), 1, _nthread)
+
+    # nkern = cld_fast(M * N, MᵣW * Nᵣ)
+    # Approach:
+    # Check if we don't want to pack A,
+    #    if not, aggressively subdivide
+    #    if so, check if we don't want to pack B
+    #       if not, check if we want to thread `N` loop anyway
+    #          if so, divide `M` first, then use ratio of desired divisions / divisions along `M` to calc divisions along `N`
+    #          if not, only thread along `M`. These don't need syncing, as we're not packing `B`
+    #       if so, `matmul_pack_A_and_B!`
+    #
+    # MᵣW * (MᵣW_mul_factor - One()) # gives a smaller Mc, then
+    # if 2M/nspawn is less than it, we don't pack `A`
+    # First check is: do we just want to split aggressively?
+    if VectorizationBase.CACHE_SIZE[2] === nothing || # do not pack A
+        dontpack(A, M, K, Mc, Kc, T, nspawn) || (W ≥ M) || (nᵣ*((NUM_CORES ≥ 8) ? max(nspawn,8) : 8) ≥ N)
+        # `nᵣ*nspawn ≥ N` is needed at the moment to avoid accidentally splitting `N` to be `< nᵣ` while packing
+        # Should probably handle that with a smarter splitting function...
+        matmulsplitn!(C, A, B, α, β, Mc, M, K, N, nspawn, Val{false}())
+    elseif ((nspawn*(W+W) > M) || (contiguousstride1(B) ? (roundtostaticint(Kc * Nc * StaticFloat{R₂Default}()) ≥ K * N) : (firstbytestride(B) ≤ 1600)))
+        matmulsplitn!(C, A, B, α, β, Mc, M, K, N, nspawn, Val{true}())
+    else # TODO: Allow splitting along `N` for `matmul_pack_A_and_B!`
+        matmul_pack_A_and_B!(C, A, B, α, β, M, K, N, nspawn, StaticFloat{W₁Default}(), StaticFloat{W₂Default}(), StaticFloat{R₁Default}(), StaticFloat{R₂Default}())
+    end
+    nothing
+end
+
+
+# If tasks is [0,1,2,3] (e.g., `CloseOpen(0,4)`), it will wait on `MULTASKS[i]` for `i = [1,2,3]`.
+function waitonmultasks(tasks)
+    for tid ∈ tasks
+        __wait(tid)
+    end
+end
+
+@inline allocref(::StaticInt{N}) where {N} = Ref{NTuple{N,UInt8}}()
+function matmul_pack_A_and_B!(
+    C::AbstractStridedPointer{T}, A::AbstractStridedPointer, B::AbstractStridedPointer, α, β, M, K, N,
+    tospawn::Int, ::StaticFloat{W₁}, ::StaticFloat{W₂}, ::StaticFloat{R₁}, ::StaticFloat{R₂}#, ::Val{1}
+) where {T,W₁,W₂,R₁,R₂}
+    W = VectorizationBase.pick_vector_width_val(T)
+    mᵣW = StaticInt{mᵣ}() * W
+    # atomicsync = Ref{NTuple{16,UInt}}()
+    Mbsize, Mrem, Mremfinal, _to_spawn = split_m(M, tospawn, W) # M is guaranteed to be > W because of the `W ≥ M` condition for `matmulsplitn!`...
+    atomicsync = allocref(StaticInt{NUM_CORES}()*StaticInt{2}()*StaticInt(CACHELINESIZE))
+    p = reinterpret(Ptr{UInt}, Base.unsafe_convert(Ptr{UInt8}, atomicsync))
+    GC.@preserve atomicsync begin
+        for i ∈ CloseOpen(2*_to_spawn)
+            _atomic_store!(p + i*CACHELINESIZE, zero(UInt))
+        end
+        # _atomic_umin!(p, zero(UInt)); _atomic_umin!(p + 8sizeof(UInt), zero(UInt))
+        # Mbsize, Mrem, Mremfinal, _to_spawn = split_m(M, tospawn, mᵣW) # M is guaranteed to be > W because of the `W ≥ M` condition for `matmulsplitn!`...
+        Mblock_Mrem, Mblock_ = promote(Mbsize + W, Mbsize)
+        u_to_spawn = _to_spawn % UInt
+        tid = 0
+        bc = _use_bcache()
+        bc_ptr = Base.unsafe_convert(typeof(pointer(C)), pointer(bc))
+        last_id = _to_spawn - One()
+        for m ∈ CloseOpen(last_id) # ...thus the fact that `CloseOpen()` iterates at least once is okay.
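+            # Each task below multiplies an `Mblock × K` strip of `A` with all of `B`; the workers
+            # cooperatively pack `B` into the shared `BCACHE` and synchronize through the
+            # cacheline-spaced counters behind `p`.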
+ Mblock = ifelse(m < Mrem, Mblock_Mrem, Mblock_) + launch_thread_mul!(C, A, B, α, β, Mblock, K, N, p, bc_ptr, m % UInt, u_to_spawn, StaticFloat{W₁}(),StaticFloat{W₂}(),StaticFloat{R₁}(),StaticFloat{R₂}()) + A = gesp(A, (Mblock, Zero())) + C = gesp(C, (Mblock, Zero())) + end + sync_mul!(C, A, B, α, β, Mremfinal, K, N, p, bc_ptr, last_id % UInt, u_to_spawn, StaticFloat{W₁}(), StaticFloat{W₂}(), StaticFloat{R₁}(), StaticFloat{R₂}()) + waitonmultasks(CloseOpen(One(), _to_spawn)) + end + _free_bcache!(bc) + return +end + +function sync_mul!( + C::AbstractStridedPointer{T}, A::AbstractStridedPointer, B::AbstractStridedPointer, α, β, M, K, N, atomicp::Ptr{UInt}, bc::Ptr, id::UInt, total_ids::UInt, + ::StaticFloat{W₁}, ::StaticFloat{W₂}, ::StaticFloat{R₁}, ::StaticFloat{R₂} +) where {T, W₁, W₂, R₁, R₂} + + (Mblock, Mblock_Mrem, Mremfinal, Mrem, Miter), (Kblock, Kblock_Krem, Krem, Kiter), (Nblock, Nblock_Nrem, Nrem, Niter) = + solve_block_sizes(T, M, K, N, StaticFloat{W₁}(), StaticFloat{W₂}(), StaticFloat{R₁}(), StaticFloat{R₂}(), One()) + + # atomics = atomicp + 8sizeof(UInt) + sync_iters = zero(UInt) + myp = atomicp + id *CACHELINESIZE + atomicp -= CACHELINESIZE + atomics = atomicp + total_ids*CACHELINESIZE + mys = myp + total_ids*(CACHELINESIZE % UInt) + Npackb_r_div, Npackb_r_rem = divrem_fast(Nblock_Nrem, total_ids) + Npackb_r_block_rem, Npackb_r_block_ = promote(Npackb_r_div + One(), Npackb_r_div) + + Npackb___div, Npackb___rem = divrem_fast(Nblock, total_ids) + Npackb___block_rem, Npackb___block_ = promote(Npackb___div + One(), Npackb___div) + + pack_r_offset = Npackb_r_div * id + min(id, Npackb_r_rem) + pack___offset = Npackb___div * id + min(id, Npackb___rem) + + pack_r_len = ifelse(id < Npackb_r_rem, Npackb_r_block_rem, Npackb_r_block_) + pack___len = ifelse(id < Npackb___rem, Npackb___block_rem, Npackb___block_) + + GC.@preserve BCACHE begin + for n in CloseOpen(Niter) + # Krem + # pack kc x nc block of B + nfull = n < Nrem + nsize = ifelse(nfull, Nblock_Nrem, Nblock) + pack_offset = ifelse(nfull, pack_r_offset, pack___offset) + pack_len = ifelse(nfull, pack_r_len, pack___len) + let A = A, B = B + for k ∈ CloseOpen(Kiter) + ksize = ifelse(k < Krem, Kblock_Krem, Kblock) + _B = default_zerobased_stridedpointer(bc, (One(), ksize)) + unsafe_copyto_avx!(gesp(_B, (Zero(), pack_offset)), gesp(B, (Zero(), pack_offset)), ksize, pack_len) + # synchronize before starting the multiplication, to ensure `B` is packed + # sync_iters += total_ids + # _mv = _atomic_add!(atomicp, one(UInt)) + # while _mv < sync_iters + # pause() + # _mv = _atomic_umax!(atomicp, zero(UInt)) + # end + _mv = _atomic_add!(myp, one(UInt)) + sync_iters += one(UInt) + let atomp = atomicp + for _ ∈ CloseOpen(total_ids) + atomp += CACHELINESIZE + atomp == myp && continue + # while !_atomic_cas_cmp!(atomp, sync_iters, sync_iters) + while _atomic_load(atomp) != sync_iters + pause() + end + end + end + # multiply + let A = A, B = _B, C = C + for m in CloseOpen(Miter) + msize = ifelse((m+1) == Miter, Mremfinal, ifelse(m < Mrem, Mblock_Mrem, Mblock)) + if k == 0 + packaloopmul!(C, A, B, α, β, msize, ksize, nsize) + else + packaloopmul!(C, A, B, α, One(), msize, ksize, nsize) + end + A = gesp(A, (msize, Zero())) + C = gesp(C, (msize, Zero())) + end + end + A = gesp(A, (Zero(), ksize)) + B = gesp(B, (ksize, Zero())) + # synchronize on completion so we wait until every thread is done with `Bpacked` before beginning to overwrite it + # _mv = _atomic_add!(atomics, one(UInt)) + # while _mv < sync_iters + # pause() + # _mv = 
_atomic_umax!(atomics, zero(UInt))
+ # end
+ _mv = _atomic_add!(mys, one(UInt))
+ let atoms = atomics
+ for _ ∈ CloseOpen(total_ids)
+ atoms += CACHELINESIZE
+ atoms == mys && continue
+ # while !_atomic_cas_cmp!(atoms, sync_iters, sync_iters)
+ while _atomic_load(atoms) != sync_iters
+ pause()
+ end
+ end
+ end
+ end
+ end
+ B = gesp(B, (Zero(), nsize))
+ C = gesp(C, (Zero(), nsize))
+ end
+ end # GC.@preserve
+ nothing
end
diff --git a/src/memory_buffer.jl b/src/memory_buffer.jl
index b804c82..ac38827 100644
--- a/src/memory_buffer.jl
+++ b/src/memory_buffer.jl
@@ -1,5 +1,57 @@
-Base.unsafe_convert(::Type{Ptr{T}}, m::MemoryBuffer) where {T} = Base.unsafe_convert(Ptr{T}, Base.pointer_from_objref(m))
-@inline MemoryBuffer(::StaticInt{L}, ::Type{T}) where {L,T} = MemoryBuffer{L,T}(undef)
-@inline function L2Buffer(::Type{T}) where {T}
- MemoryBuffer(StaticInt{VectorizationBase.CACHE_SIZE[2]}() ÷ VectorizationBase.static_sizeof(T), T)
+@inline Base.unsafe_convert(::Type{Ptr{T}}, d::MemoryBuffer) where {T} = Base.unsafe_convert(Ptr{T}, Base.pointer_from_objref(d))
+@inline MemoryBuffer{T}(::UndefInitializer, ::StaticInt{L}) where {L,T} = MemoryBuffer{L,T}(undef)
+Base.size(::MemoryBuffer{L}) where L = (L,)
+@inline Base.similar(::MemoryBuffer{L,T}) where {L,T} = MemoryBuffer{L,T}(undef)
+# Base.IndexStyle(::Type{<:MemoryBuffer}) = Base.IndexLinear()
+@inline function Base.getindex(m::MemoryBuffer{L,T}, i::Int) where {L,T}
+ @boundscheck checkbounds(m, i)
+ GC.@preserve m x = vload(pointer(m), VectorizationBase.lazymul(VectorizationBase.static_sizeof(T), i - one(i)))
+ x
end
+@inline function Base.setindex!(m::MemoryBuffer{L,T}, x, i::Int) where {L,T}
+ @boundscheck checkbounds(m, i)
+ GC.@preserve m vstore!(pointer(m), convert(T, x), lazymul(static_sizeof(T), i - one(i)))
+end
+
+if Sys.WORD_SIZE == 32
+ @inline function first_cache_buffer(::Type{T}) where {T}
+ ACACHEPTR[] + Threads.threadid() * FIRST__CACHE_SIZE - FIRST__CACHE_SIZE
+ end
+else
+ @inline function first_cache_buffer(::Type{T}) where {T}
+ MemoryBuffer{T}(undef, StaticInt{FIRST__CACHE_SIZE}() ÷ static_sizeof(T))
+ end
+end
+
+BCache(i::Integer) = BCache(pointer(BCACHE)+cld_fast(SECOND_CACHE_SIZE*i,Threads.nthreads()), i % UInt)
+BCache(::Nothing) = BCache(pointer(BCACHE), nothing)
+
+@inline Base.pointer(b::BCache) = b.p
+@inline Base.unsafe_convert(::Type{Ptr{T}}, b::BCache) where {T} = Base.unsafe_convert(Ptr{T}, b.p)
+
+function _use_bcache()
+ while Threads.atomic_cas!(BCACHE_LOCK, zero(UInt), typemax(UInt)) != zero(UInt)
+ pause()
+ end
+ return BCache(nothing)
+end
+@inline _free_bcache!(b::BCache{Nothing}) = reset_bcache_lock!()
+
+_use_bcache(::Nothing) = _use_bcache()
+function _use_bcache(i)
+ f = one(UInt) << i
+ while (Threads.atomic_or!(BCACHE_LOCK, f) & f) != zero(UInt)
+ pause()
+ end
+ BCache(i)
+end
+_free_bcache!(b::BCache{UInt}) = (Threads.atomic_xor!(BCACHE_LOCK, one(UInt) << b.i); nothing)
+
+"""
+ reset_bcache_lock!()
+
+The matmul routines do not currently wrap the `BCACHE` lock in `try`/`finally`, so if a call
+errors while holding it, you may need to call `reset_bcache_lock!()` manually.
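+
+One way to guard a single call by hand (an illustrative sketch, not part of the API;
+assumes no other Octavian call is running concurrently, and `C`, `A`, `B` are placeholders):
+
+    try
+        matmul!(C, A, B)
+    catch e
+        reset_bcache_lock!() # the lock may still be held after an error
+        rethrow(e)
+    end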
+""" +@inline reseet_bcache_lock!() = (BCACHE_LOCK[] = zero(UInt); nothing) + diff --git a/src/pointer_matrix.jl b/src/pointer_matrix.jl deleted file mode 100644 index fcc7c33..0000000 --- a/src/pointer_matrix.jl +++ /dev/null @@ -1,33 +0,0 @@ -PointerMatrix(p::P, s::S) where {T,P<:VectorizationBase.AbstractStridedPointer{T},S} = PointerMatrix{T,P,S}(p, s) -Base.size(A::PointerMatrix) = map(Int, A.s) -VectorizationBase.stridedpointer(A::PointerMatrix) = A.p -Base.unsafe_convert(::Type{Ptr{T}}, A::PointerMatrix{T}) where {T} = pointer(A.p) -@inline function Base.getindex(A::PointerMatrix, i::Integer, j::Integer) - @boundscheck checkbounds(A, i, j) - VectorizationBase.vload(VectorizationBase.stridedpointer(A), (i,j)) -end -@inline function Base.getindex(A::PointerMatrix, i::Integer) - @boundscheck checkbounds(A, i) - VectorizationBase.vload(VectorizationBase.stridedpointer(A), (i-1,)) -end -@inline function Base.setindex!(A::PointerMatrix{T}, v, i::Integer, j::Integer) where {T} - @boundscheck checkbounds(A, i, j) - VectorizationBase.vstore!(VectorizationBase.stridedpointer(A), convert(T, v), (i,j)) - v -end -@inline function Base.setindex!(A::PointerMatrix{T}, v, i::Integer) where {T} - @boundscheck checkbounds(A, i) - VectorizationBase.vstore!(VectorizationBase.stridedpointer(A), convert(T, v), (i-1,)) - v -end - -function PointerMatrix(Bptr::Ptr{T}, (M,N), padcols::Bool = false) where {T} - st = VectorizationBase.static_sizeof(T) - _M = padcols ? VectorizationBase.align(M, T) : M - # Should maybe add a more convenient column major constructor - Bsptr = VectorizationBase.stridedpointer( - Bptr, VectorizationBase.ArrayInterface.Contiguous{1}(), VectorizationBase.ArrayInterface.ContiguousBatch{0}(), - VectorizationBase.ArrayInterface.StrideRank{(1,2)}(), (st, _M*st), (StaticInt{1}(),StaticInt{1}()) - ) - PointerMatrix(Bsptr, (M,N)) -end diff --git a/src/staticfloats.jl b/src/staticfloats.jl new file mode 100644 index 0000000..2e7000d --- /dev/null +++ b/src/staticfloats.jl @@ -0,0 +1,41 @@ +Base.convert(::Type{T}, ::StaticFloat{N}) where {N,T<:AbstractFloat} = T(N) +Base.promote_rule(::Type{StaticFloat{N}}, ::Type{T}) where {N,T} = promote_type(T, Float64) + +@generated Base.:+(::StaticFloat{M}, ::StaticFloat{N}) where {M,N} = Expr(:call, Expr(:curly, :StaticFloat, M+N)) +@generated Base.:+(::StaticFloat{N}, ::StaticInt{M}) where {M,N} = Expr(:call, Expr(:curly, :StaticFloat, M+N)) +@generated Base.:+(::StaticInt{M}, ::StaticFloat{N}) where {M,N} = Expr(:call, Expr(:curly, :StaticFloat, M+N)) + +@generated Base.:+(::StaticFloat{N}, ::StaticInt{0}) where {N} = Expr(:call, Expr(:curly, :StaticFloat, N)) +@generated Base.:+(::StaticInt{0}, ::StaticFloat{N}) where {N} = Expr(:call, Expr(:curly, :StaticFloat, N)) + +@generated Base.:-(::StaticFloat{M}, ::StaticFloat{N}) where {M,N} = Expr(:call, Expr(:curly, :StaticFloat, M - N)) +@generated Base.:-(::StaticFloat{N}, ::StaticInt{M}) where {M,N} = Expr(:call, Expr(:curly, :StaticFloat, N - M)) +@generated Base.:-(::StaticInt{M}, ::StaticFloat{N}) where {M,N} = Expr(:call, Expr(:curly, :StaticFloat, M - N)) + +@generated Base.:-(::StaticFloat{N}, ::StaticInt{0}) where {N} = Expr(:call, Expr(:curly, :StaticFloat, N)) +@generated Base.:-(::StaticInt{0}, ::StaticFloat{N}) where {N} = Expr(:call, Expr(:curly, :StaticFloat, -N)) + + +@generated Base.:*(::StaticFloat{M}, ::StaticFloat{N}) where {M,N} = Expr(:call, Expr(:curly, :StaticFloat, M*N)) +@generated Base.:*(::StaticFloat{N}, ::StaticInt{M}) where {M,N} = Expr(:call, Expr(:curly, :StaticFloat, 
M*N))
+@generated Base.:*(::StaticInt{M}, ::StaticFloat{N}) where {M,N} = Expr(:call, Expr(:curly, :StaticFloat, M*N))
+
+@generated Base.:*(::StaticFloat{N}, ::StaticInt{0}) where {N} = Expr(:call, Expr(:curly, :StaticInt, 0))
+@generated Base.:*(::StaticInt{0}, ::StaticFloat{N}) where {N} = Expr(:call, Expr(:curly, :StaticInt, 0))
+@generated Base.:*(::StaticFloat{N}, ::StaticInt{1}) where {N} = Expr(:call, Expr(:curly, :StaticFloat, N))
+@generated Base.:*(::StaticInt{1}, ::StaticFloat{N}) where {N} = Expr(:call, Expr(:curly, :StaticFloat, N))
+
+@generated Base.:/(::StaticFloat{M}, ::StaticFloat{N}) where {M,N} = Expr(:call, Expr(:curly, :StaticFloat, M/N))
+@generated Base.:/(::StaticFloat{N}, ::StaticInt{M}) where {M,N} = Expr(:call, Expr(:curly, :StaticFloat, N/M))
+@generated Base.:/(::StaticInt{M}, ::StaticFloat{N}) where {M,N} = Expr(:call, Expr(:curly, :StaticFloat, M/N))
+
+@generated Base.sqrt(::StaticInt{M}) where {M} = Expr(:call, Expr(:curly, :StaticFloat, sqrt(M)))
+@generated Base.sqrt(::StaticFloat{M}) where {M} = Expr(:call, Expr(:curly, :StaticFloat, sqrt(M)))
+
+@generated Base.round(::StaticFloat{M}) where {M} = Expr(:call, Expr(:curly, :StaticFloat, round(M)))
+@generated roundtostaticint(::StaticFloat{M}) where {M} = Expr(:call, Expr(:curly, :StaticInt, round(Int, M)))
+roundtostaticint(x::AbstractFloat) = round(Int, x)
+@generated floortostaticint(::StaticFloat{M}) where {M} = Expr(:call, Expr(:curly, :StaticInt, floor(Int, M)))
+floortostaticint(x::AbstractFloat) = Base.fptosi(Int, x)
+@generated Base.inv(::StaticFloat{N}) where {N} = Expr(:call, Expr(:curly, :StaticFloat, inv(N)))
+
diff --git a/src/types.jl b/src/types.jl
index 72c8ba4..4a97feb 100644
--- a/src/types.jl
+++ b/src/types.jl
@@ -1,9 +1,18 @@
-struct PointerMatrix{T,P<:VectorizationBase.AbstractStridedPointer,S<:Tuple{Vararg{Integer,2}}} <: DenseMatrix{T}
- p::P
- s::S
+mutable struct MemoryBuffer{L,T} <: DenseVector{T}
+ data::NTuple{L,T}
+ @inline function MemoryBuffer{L,T}(::UndefInitializer) where {L,T}
+ @assert isbitstype(T) "Memory buffers must point to bits types, but `isbitstype($T) == false`."
+ new{L,T}()
+ end
end
-mutable struct MemoryBuffer{L,T}
- data::NTuple{L,T}
- MemoryBuffer{L,T}(::UndefInitializer) where {L,T} = new{L,T}()
+struct StaticFloat{N} <: AbstractFloat
+ StaticFloat{N}() where {N} = new{N::Float64}()
+end
+
+struct BCache{T<:Union{UInt,Nothing}}
+ p::Ptr{UInt8}
+ i::T
end
+
+
diff --git a/src/utils.jl b/src/utils.jl
index a2842cc..f457f47 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -1,29 +1,50 @@
-check_sizes(::StaticInt{M}, ::StaticInt{M}) where {M} = StaticInt{M}()
-check_sizes(::StaticInt{M}, ::StaticInt{N}) where {M,N} = throw(ErrorException("$M ≠ $N"))
-check_sizes(::StaticInt{M}, m) where {M} = (@assert M == m; StaticInt{M}())
-check_sizes(m, ::StaticInt{M}) where {M} = (@assert M == m; StaticInt{M}())
-check_sizes(m, n) = (@assert m == n; m)
+@inline _select(::StaticInt{M}, ::StaticInt{M}) where {M} = StaticInt{M}()
+@noinline _select(::StaticInt{M}, ::StaticInt{N}) where {M,N} = throw(ErrorException("$M ≠ $N"))
+@inline _select(::StaticInt{M}, _) where {M} = StaticInt{M}()
+@inline _select(_, ::StaticInt{M}) where {M} = StaticInt{M}()
+@inline _select(x, _) = x
"""
Checks sizes for compatibility, and preserves the static size information if given a mix of static and dynamic sizes.
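+
+For example (illustrative): if `size(C) == (StaticInt(4), 6)`, `size(A) == (4, StaticInt(5))`,
+and `size(B) == (5, 6)`, then `matmul_sizes(C, A, B)` returns `(StaticInt(4), StaticInt(5), 6)`.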
""" -function matmul_sizes(C, A, B) - MC, NC = VectorizationBase.ArrayInterface.size(C) - MA, KA = VectorizationBase.ArrayInterface.size(A) - KB, NB = VectorizationBase.ArrayInterface.size(B) - M = check_sizes(MC, MA) - K = check_sizes(KA, KB) - N = check_sizes(NC, NB) - M, K, N +@inline function matmul_sizes(C,A,B) + MC, NC = size(C) + MA, KA = size(A) + KB, NB = size(B) + @assert ((MC == MA) & (KA == KB) & (NC == NB)) "Size mismatch." + (_select(MA, MC), _select(KA, KB), _select(NB, NC)) end -function unsafe_copyto_avx!(B, A) - LoopVectorization.@avx for i ∈ eachindex(B, A) - B[i] = A[i] +function unsafe_copyto_avx!(pB, pA, M, N) + LoopVectorization.@avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + pB[m,n] = pA[m,n] end end -# @inline function gesp1(sp::P, inds) where {P <: VectorizationBase.AbstractStridedPointer} -# P(VectorizationBase.gep(sp, inds), sp.strd, sp.offsets) -# end +function default_stridedpointer_quote(::Type{T}, N, Ot) where {T} + C = 1 + B = 0 + R = Expr(:tuple) + o = Expr(:tuple) + xt = Expr(:tuple) + st = Expr(:call, Expr(:curly, :StaticInt, sizeof(T))) + for n ∈ 1:N + push!(xt.args, Expr(:call, :*, st, Expr(:ref, :x, n))) + push!(R.args, n) + push!(o.args, Expr(:call, Ot)) + end + quote + $(Expr(:meta,:inline)) + StridedPointer{$T,$N,$C,$B,$R}(ptr, $xt, $o) + end +end + +@generated function default_stridedpointer(ptr::Ptr{T}, x::X) where {T, N, X <: Tuple{Vararg{Integer,N}}} + default_stridedpointer_quote(T, N, :One) +end +@generated function default_zerobased_stridedpointer(ptr::Ptr{T}, x::X) where {T, N, X <: Tuple{Vararg{Integer,N}}} + default_stridedpointer_quote(T, N, :Zero) +end + + diff --git a/test/block_sizes.jl b/test/block_sizes.jl index 3e51e08..9f26279 100644 --- a/test/block_sizes.jl +++ b/test/block_sizes.jl @@ -1,4 +1,4 @@ @time @testset "block_sizes" begin - @test Octavian._calculate_L3(1, 1, 1, Val(true)) == 0 - @test Octavian._calculate_L3(1, 1, 1, Val(false)) == 1 + + end diff --git a/test/init.jl b/test/init.jl index eaf4e97..f11d282 100644 --- a/test/init.jl +++ b/test/init.jl @@ -2,7 +2,10 @@ withenv("OCTAVIAN_NUM_TASKS" => "") do @test Octavian._read_environment_num_tasks() == min(Threads.nthreads(), VectorizationBase.NUM_CORES) end + withenv("OCTAVIAN_NUM_TASKS" => "1") do + @test Octavian._read_environment_num_tasks() == 1 + end withenv("OCTAVIAN_NUM_TASKS" => "99") do - @test Octavian._read_environment_num_tasks() == 99 + @test Octavian._read_environment_num_tasks() == min(Threads.nthreads(), VectorizationBase.NUM_CORES) end end diff --git a/test/macrokernels.jl b/test/macrokernels.jl index 9bf9ee6..88c5803 100644 --- a/test/macrokernels.jl +++ b/test/macrokernels.jl @@ -10,7 +10,7 @@ C2 = deepcopy(C1) α = Float64(2.0) β = Float64(2.0) - Octavian.macrokernel!(C1, A1, B1, α, β) + Octavian.loopmul!(VectorizationBase.zstridedpointer(C1), VectorizationBase.zstridedpointer(A1), VectorizationBase.zstridedpointer(B1), α, β, m, k, n) C2 = α*A2*B2 + β*C2 @test C1 ≈ C2 end diff --git a/test/matmul.jl b/test/matmul.jl index 4b2b498..90ed726 100644 --- a/test/matmul.jl +++ b/test/matmul.jl @@ -11,10 +11,15 @@ A = rand(Float32, m, k) A′ = permutedims(A)' @show m, k, n - @test @time(Octavian.matmul(A, B)) ≈ A * B - @test @time(Octavian.matmul(A′, B)) ≈ A′ * B - @test @time(Octavian.matmul(A, B′)) ≈ A * B′ - @test @time(Octavian.matmul(A′, B′)) ≈ A′ * B′ + AB = A * B; A′B = A′ * B; AB′ = A * B′; A′B′ = A′ * B′; + @test @time(Octavian.matmul(A, B)) ≈ AB + @test @time(Octavian.matmul(A′, B)) ≈ A′B + @test @time(Octavian.matmul(A, B′)) ≈ AB′ + @test 
@time(Octavian.matmul(A′, B′)) ≈ A′B′ + @test @time(Octavian.matmul_serial(A, B)) ≈ AB + @test @time(Octavian.matmul_serial(A′, B)) ≈ A′B + @test @time(Octavian.matmul_serial(A, B′)) ≈ AB′ + @test @time(Octavian.matmul_serial(A′, B′)) ≈ A′B′ end end end @@ -33,10 +38,15 @@ end A = rand(Float64, m, k) A′ = permutedims(A)' @show m, k, n - @test @time(Octavian.matmul(A, B)) ≈ A * B - @test @time(Octavian.matmul(A′, B)) ≈ A′ * B - @test @time(Octavian.matmul(A, B′)) ≈ A * B′ - @test @time(Octavian.matmul(A′, B′)) ≈ A′ * B′ + AB = A * B; A′B = A′ * B; AB′ = A * B′; A′B′ = A′ * B′; + @test @time(Octavian.matmul(A, B)) ≈ AB + @test @time(Octavian.matmul(A′, B)) ≈ A′B + @test @time(Octavian.matmul(A, B′)) ≈ AB′ + @test @time(Octavian.matmul(A′, B′)) ≈ A′B′ + @test @time(Octavian.matmul_serial(A, B)) ≈ AB + @test @time(Octavian.matmul_serial(A′, B)) ≈ A′B + @test @time(Octavian.matmul_serial(A, B′)) ≈ AB′ + @test @time(Octavian.matmul_serial(A′, B′)) ≈ A′B′ end end end @@ -55,10 +65,15 @@ end A = rand(Int32, m, k) A′ = permutedims(A)' @show m, k, n - @test @time(Octavian.matmul(A, B)) == A * B - @test @time(Octavian.matmul(A′, B)) == A′ * B - @test @time(Octavian.matmul(A, B′)) == A * B′ - @test @time(Octavian.matmul(A′, B′)) == A′ * B′ + AB = A * B; A′B = A′ * B; AB′ = A * B′; A′B′ = A′ * B′; + @test @time(Octavian.matmul(A, B)) ≈ AB + @test @time(Octavian.matmul(A′, B)) ≈ A′B + @test @time(Octavian.matmul(A, B′)) ≈ AB′ + @test @time(Octavian.matmul(A′, B′)) ≈ A′B′ + @test @time(Octavian.matmul_serial(A, B)) ≈ AB + @test @time(Octavian.matmul_serial(A′, B)) ≈ A′B + @test @time(Octavian.matmul_serial(A, B′)) ≈ AB′ + @test @time(Octavian.matmul_serial(A′, B′)) ≈ A′B′ end end end @@ -77,11 +92,17 @@ end A = rand(Int64, m, k) A′ = permutedims(A)' @show m, k, n - @test @time(Octavian.matmul(A, B)) == A * B - @test @time(Octavian.matmul(A′, B)) == A′ * B - @test @time(Octavian.matmul(A, B′)) == A * B′ - @test @time(Octavian.matmul(A′, B′)) == A′ * B′ + AB = A * B; A′B = A′ * B; AB′ = A * B′; A′B′ = A′ * B′; + @test @time(Octavian.matmul(A, B)) ≈ AB + @test @time(Octavian.matmul(A′, B)) ≈ A′B + @test @time(Octavian.matmul(A, B′)) ≈ AB′ + @test @time(Octavian.matmul(A′, B′)) ≈ A′B′ + @test @time(Octavian.matmul_serial(A, B)) ≈ AB + @test @time(Octavian.matmul_serial(A′, B)) ≈ A′B + @test @time(Octavian.matmul_serial(A, B′)) ≈ AB′ + @test @time(Octavian.matmul_serial(A′, B′)) ≈ A′B′ end end end end + diff --git a/test/pointer_matrix.jl b/test/pointer_matrix.jl deleted file mode 100644 index ff00a1b..0000000 --- a/test/pointer_matrix.jl +++ /dev/null @@ -1,12 +0,0 @@ -@time Test.@testset "PointerMatrix" begin - mem = Octavian.L2Buffer(Float64); - ptr = Base.unsafe_convert(Ptr{Float64}, mem) - block = Octavian.PointerMatrix(ptr, (10, 20)) - Test.@test Base.unsafe_convert(Ptr{Float64}, block) == pointer(block.p) - GC.@preserve mem begin - block[1] = 2.3 - Test.@test block[1] == 2.3 - block[4, 5] = 67.89 - Test.@test block[4, 5] == 67.89 - end -end diff --git a/test/runtests.jl b/test/runtests.jl index f30b6db..384e49b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -19,12 +19,14 @@ include("test_suite_preamble.jl") Random.seed!(123) +using Aqua: test_all +test_all(Octavian) + +include("utils.jl") include("block_sizes.jl") include("init.jl") include("macrokernels.jl") include("matmul_coverage.jl") -include("pointer_matrix.jl") -include("utils.jl") if !coverage include("matmul.jl") diff --git a/test/utils.jl b/test/utils.jl index 6062834..9b6303a 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -2,11 
+2,12 @@ @testset "check_sizes" begin a = Octavian.StaticInt{1}() b = Octavian.StaticInt{2}() - @test Octavian.check_sizes(a, a) == a - @test_throws ErrorException Octavian.check_sizes(a, b) - @test Octavian.check_sizes(a, 1) == a - @test_throws AssertionError Octavian.check_sizes(a, 100) - @test Octavian.check_sizes(2, b) == b - @test_throws AssertionError Octavian.check_sizes(200, b) + @test Octavian._select(a, a) === a + @test Octavian._select(a, 1) === a + @test Octavian._select(1, a) === a + @test Octavian._select(1, 1) === 1 + @test_throws AssertionError Octavian.matmul_sizes(rand(3,2), rand(3,4), rand(5,2)) + @test_throws AssertionError Octavian.matmul_sizes(rand(3,2), rand(3,4), rand(4,5)) + @test_throws AssertionError Octavian.matmul_sizes(rand(3,2), rand(5,4), rand(4,2)) end end
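Note on the synchronization in `sync_mul!` above: the two spin loops form a counter barrier. Each id increments its own cache-line-spaced counter (`myp` after packing its share of `B`, `mys` after finishing its multiplies with the packed block), then spins until every other counter reaches the current iteration count, so no thread reads a half-packed `B` or overwrites one still in use. A minimal standalone sketch of the same pattern, using `Base.Threads.Atomic` counters instead of the raw pointers above (the names and setup here are illustrative, not part of Octavian):

    using Base.Threads: Atomic, atomic_add!

    # One counter per participant; the real code spaces them a cache line apart
    # (CACHELINESIZE) to avoid false sharing between spinning threads.
    function counter_barrier!(counters::Vector{Atomic{UInt}}, id::Int, phase::UInt)
        atomic_add!(counters[id], one(UInt))    # publish: `id` has reached `phase`
        for (i, c) in pairs(counters)
            i == id && continue
            while c[] < phase                   # spin until everyone else has, too
                ccall(:jl_cpu_pause, Cvoid, ()) # spin-wait hint, like `pause()`
            end
        end
    end

    # Hypothetical usage: `nt` tasks each call this once per packing phase,
    # with counters = [Atomic{UInt}(0) for _ in 1:nt] and phase = 1, 2, 3, ...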