3 changes: 3 additions & 0 deletions .gitignore
@@ -4,4 +4,7 @@
/docs/build/
Manifest.toml
*~
*#
*#*
*#*#

7 changes: 5 additions & 2 deletions Project.toml
@@ -1,20 +1,23 @@
name = "Octavian"
uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
authors = ["Mason Protter", "Chris Elrod", "Dilum Aluthge", "contributors"]
version = "0.1.0"
version = "0.2.0-DEV"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
ThreadingUtilities = "8290d209-cae3-49c0-8002-c8c24d57dab5"
VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"

[compat]
ArrayInterface = "2.14"
LoopVectorization = "0.9.18"
ThreadingUtilities = "0.2"
VectorizationBase = "0.15.2"
julia = "1.5"

[extras]
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -24,4 +27,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"

[targets]
test = ["BenchmarkTools", "InteractiveUtils", "LinearAlgebra", "LoopVectorization", "Random", "VectorizationBase", "Test"]
test = ["Aqua", "BenchmarkTools", "InteractiveUtils", "LinearAlgebra", "LoopVectorization", "Random", "VectorizationBase", "Test"]
30 changes: 20 additions & 10 deletions src/Octavian.jl
@@ -1,23 +1,33 @@
module Octavian

import ArrayInterface
import LoopVectorization
import VectorizationBase
using VectorizationBase, ArrayInterface, LoopVectorization

using VectorizationBase: StaticInt
using VectorizationBase: align, gep, AbstractStridedPointer, AbstractSIMDVector, vnoaliasstore!, staticm1,
static_sizeof, lazymul, vmul_fast, StridedPointer, gesp, zero_offsets, pause,
CACHE_COUNT, NUM_CORES, CACHE_INCLUSIVITY, zstridedpointer
using LoopVectorization: maybestaticsize, mᵣ, nᵣ, preserve_buffer, CloseOpen
using ArrayInterface: StaticInt, Zero, One, OptionallyStaticUnitRange, size, strides, offsets, indices,
static_length, static_first, static_last, axes,
dense_dims, DenseDims, stride_rank, StrideRank

export matmul
export matmul!
using ThreadingUtilities:
_atomic_add!, _atomic_umax!, _atomic_umin!,
_atomic_load, _atomic_store!, _atomic_cas_cmp!,
SPIN, WAIT, TASK, LOCK, STUP, taskpointer,
wake_thread!, __wait, load, store!

export matmul!, matmul_serial!, matmul, matmul_serial, StaticInt

include("global_constants.jl")
include("types.jl")

include("staticfloats.jl")
include("integerdivision.jl")
include("memory_buffer.jl")
include("block_sizes.jl")
include("funcptrs.jl")
include("macrokernels.jl")
include("matmul.jl")
include("memory_buffer.jl")
include("pointer_matrix.jl")
include("utils.jl")
include("matmul.jl")

include("init.jl") # `Octavian.__init__()` is defined in this file

283 changes: 223 additions & 60 deletions src/block_sizes.jl
@@ -1,70 +1,233 @@

first_effective_cache(::Type{T}) where {T} = StaticInt{FIRST__CACHE_SIZE}() ÷ static_sizeof(T)
second_effective_cache(::Type{T}) where {T} = StaticInt{SECOND_CACHE_SIZE}() ÷ static_sizeof(T)

function block_sizes(::Type{T}, _α, _β, R₁, R₂) where {T}
W = VectorizationBase.pick_vector_width_val(T)
α = _α * W
β = _β * W
L₁ₑ = first_effective_cache(T) * R₁
L₂ₑ = second_effective_cache(T) * R₂
block_sizes(W, α, β, L₁ₑ, L₂ₑ)
end
function block_sizes(W, α, β, L₁ₑ, L₂ₑ)
MᵣW = StaticInt{mᵣ}() * W

Mc = floortostaticint(√(L₁ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₂ₑ) / MᵣW) * MᵣW
Kc = roundtostaticint(√(L₁ₑ)*√(L₂ₑ)/√(L₁ₑ*β + L₂ₑ*α))
Nc = floortostaticint(√(L₂ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₁ₑ) / StaticInt{nᵣ}()) * StaticInt{nᵣ}()

Mc, Kc, Nc
end
function block_sizes(::Type{T}) where {T}
block_sizes(T, StaticFloat{W₁Default}(), StaticFloat{W₂Default}(), StaticFloat{R₁Default}(), StaticFloat{R₂Default}())
end

"""
block_sizes(::Type{T}) -> (Mc, Kc, Nc)

Returns the dimensions of our macrokernel, which iterates over the microkernel. That is, in
calculating `C = A * B`, our macrokernel will be called on `Mc × Kc` blocks of `A`, multiplying
them with `Kc × Nc` blocks of `B`, to update `Mc × Nc` blocks of `C`.

We want these blocks to fit nicely in the cache. There is a lot of room for improvement here, but this
initial implementation should work reasonably well.
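
To make the blocking concrete, here is a minimal single-threaded sketch of this structure (the name
`blocked_matmul!` and the plain triple loop standing in for the microkernel are purely illustrative, not
Octavian's implementation, which packs the blocks and multithreads):

```julia
function blocked_matmul!(C, A, B, Mc, Kc, Nc)
    M, K = size(A); N = size(B, 2)
    fill!(C, zero(eltype(C)))
    for n in 1:Nc:N, k in 1:Kc:K, m in 1:Mc:M
        # one "macrokernel" call: update an Mc × Nc block of C with an
        # Mc × Kc block of A times a Kc × Nc block of B
        for j in n:min(n + Nc - 1, N), p in k:min(k + Kc - 1, K), i in m:min(m + Mc - 1, M)
            C[i, j] += A[i, p] * B[p, j]
        end
    end
    return C
end
```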


The constants `LoopVectorization.mᵣ` and `LoopVectorization.nᵣ` are the factors by which LoopVectorization
wants to unroll the rows and columns of the microkernel, respectively. `LoopVectorization` defines these
constants by running its analysis on a gemm kernel; they are exposed for convenience, to make it easier to
implement matrix multiplication.
It also wants to vectorize the rows by `W = VectorizationBase.pick_vector_width_val(T)`.
Thus, the microkernel's dimensions are `(W * mᵣ) × nᵣ`; that is, the microkernel updates a `(W * mᵣ) × nᵣ`
block of `C`.

Because the macrokernel iterates over tiles and repeatedly applies the microkernel, we would prefer the
macrokernel's dimensions to be an integer multiple of the microkernel's.
That is, we want `Mc` to be an integer multiple of `W * mᵣ` and `Nc` to be an integer multiple of `nᵣ`.

Additionally, we want our blocks of `A` to fit in the core-local L2 cache.
Empirically, I found that when using `Float64` arrays, 72 rows works well on Haswell (where `W * mᵣ = 8`)
and 96 works well for Cascadelake (where `W * mᵣ = 24`).
So I kind of heuristically multiply `W * mᵣ` by `4` given 32 vector registers (as in Cascadelake), which
would yield `96`, and multiply by `9` otherwise, which would give `72` on Haswell.
Ideally, we'd have a better means of picking.
I suspect relatively small numbers work well because I'm currently using a column-major memory layout for
the internal packing arrays. A column-major memory layout means that if our macro-kernel had a lot of rows,
moving across columns would involve reading memory far apart, moving across memory pages more rapidly,
hitting the TLB harder. This is why libraries like OpenBLAS and BLIS don't use a column-major layout, but
reshape into a 3-d array, e.g. `A` will be reshaped into a `Mᵣ × Kc × (Mc ÷ Mᵣ)` array (also sometimes
referred to as a tile-major matrix), so that all memory reads happen in consecutive memory locations.
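
As a concrete illustration of the numbers quoted above (assuming, for illustration, `W = 4, mᵣ = 2` on
Haswell and `W = 8, mᵣ = 3` on Cascadelake, consistent with the `W * mᵣ` products mentioned; the helper
`heuristic_mc` is hypothetical, not an Octavian function):

```julia
heuristic_mc(W, mᵣ, nregisters) = (nregisters == 32 ? 4 : 9) * W * mᵣ
heuristic_mc(4, 2, 16)  # 72 rows for a Haswell-like CPU (W * mᵣ = 8, 16 vector registers)
heuristic_mc(8, 3, 32)  # 96 rows for a Cascadelake-like CPU (W * mᵣ = 24, 32 vector registers)
```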


Now that we have `Mc`, we use it and the `L2` cache size to calculate `Kc`, but shave off a percent to
leave room in the cache for some other things.

We want our blocks of `B` to fit in the `L3` cache, so we can use the `L3` cache size and `Kc` to
similarly calculate `Nc`, with the additional note that we also divide and multiply by `nᵣ` to ensure
that `Nc` is an integer multiple of `nᵣ`.
split_m(M, Miters_base, W)

Splits `M` into at most `Miters_base` iterations.
For example, if we wish to divide `517` into roughly `7` blocks using multiples of `8`:

```julia
julia> split_m(517, 7, 8)
(72, 2, 69, 7)
```

This suggests we have a base block size of `72`, with two iterations receiving an extra `8` (`= W`),
and a final block of `69` to handle the remainder. It also tells us that there are `7` total iterations, as requested.
```julia
julia> 80*2 + 72*(7-2-1) + 69
517
```
This is meant to produce roughly the requested number of blocks, with relatively even sizes.

This method is used fairly generally.
"""
function block_sizes(::Type{T}) where {T}
_L1, _L2, __L3, _L4 = VectorizationBase.CACHE_SIZE
L1c, L2c, L3c, L4c = VectorizationBase.CACHE_COUNT
# TODO: something better than treating it as 4 MiB if there is no L3 cache
# one possibility is to focus on the L1 and L2 caches instead of the L2 and L3.
_L3 = something(__L3, 4194304)
@assert L1c == L2c == VectorizationBase.NUM_CORES
@inline function split_m(M, _Mblocks, W)
Miters = cld_fast(M, W)
Mblocks = min(_Mblocks, Miters)
Miter_per_block, Mrem = divrem_fast(Miters, Mblocks)
Mbsize = Miter_per_block * W
Mremfinal = M - Mbsize*(Mblocks-1) - Mrem * W
Mbsize, Mrem, Mremfinal, Mblocks
end

"""
solve_block_sizes(::Type{T}, M, K, N, α, β, R₂, R₃)

This function returns iteration/blocking descriptions `Mc`, `Kc`, and `Nc` for use when packing both `A` and `B`.

It tries to roughly minimize the cost
```julia
MKN/(Kc*W) + α * MKN/Mc + β * MKN/Nc
```
subject to constraints
```julia
Mc - M ≤ 0
Kc - K ≤ 0
Nc - N ≤ 0
Mc*Kc - L₁ₑ ≤ 0
Kc*Nc - L₂ₑ ≤ 0
```
That is, our constraints say that our block sizes shouldn't be bigger than the actual dimensions, and also that
our packed `A` (`Mc × Kc`) should fit into the first packing cache (generally the `L₂`), and our packed
`B` (`Kc × Nc`) should fit into the second packing cache (generally the `L₃`).

st = VectorizationBase.static_sizeof(T)
Our cost model consists of three components:
1. Cost of moving data in and out of registers. This is done `(M/Mᵣ * K/Kc * N/Nᵣ)` times and the cost per iteration is `(Mᵣ/W * Nᵣ)`.
2. Cost of moving strips of the packed `B` from the lower cache levels to the highest cache level when multiplying `Aₚ * Bₚ`.
This is done `(M / Mc * K / Kc * N / Nᵣ)` times, and the cost per iteration is proportional to `(Kc * Nᵣ)`.
`α` is the proportionality-constant parameter.
3. Cost of packing `A`. This is done `(M / Mc * K / Kc * N / Nc)` times, and the cost per iteration is proportional to
`(Mc * Kc)`. `β` is the proportionality-constant parameter.

L2 = (StaticInt{_L2}() - StaticInt{_L1}()) ÷ st
L3 = _calculate_L3(StaticInt{_L2}(), StaticInt{_L3}(), st, Val{VectorizationBase.CACHE_INCLUSIVITY[3]}())
As `W` is a constant, we multiply the cost by `W` and absorb it into `α` and `β`. We drop it from the description
from here on out.
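
Written out as a sketch (these helper names are illustrative, not functions defined in Octavian), the
scaled objective and its feasibility check are:

```julia
blocking_cost(M, K, N, Mc, Kc, Nc, α, β) = M*K*N/Kc + α * M*K*N/Mc + β * M*K*N/Nc
blocking_feasible(M, K, N, Mc, Kc, Nc, L₁ₑ, L₂ₑ) =
    (Mc ≤ M) && (Kc ≤ K) && (Nc ≤ N) && (Mc*Kc ≤ L₁ₑ) && (Kc*Nc ≤ L₂ₑ)
```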

In the full problem, we would have the Lagrangian, with μ < 0:
f((Mc,Kc,Nc), (μ₁,μ₂,μ₃,μ₄,μ₅)) =
    MKN/Kc + α * MKN/Mc + β * MKN/Nc - μ₁(Mc - M) - μ₂(Kc - K) - μ₃(Nc - N) - μ₄(Mc*Kc - L₁ₑ) - μ₅(Kc*Nc - L₂ₑ)
```julia
0 = ∂L/∂Mc = - α * MKN / Mc² - μ₁ - μ₄*Kc
0 = ∂L/∂Kc = - MKN / Kc² - μ₂ - μ₄*Mc - μ₅*Nc
0 = ∂L/∂Nc = - β * MKN / Nc² - μ₃ - μ₅*Kc
0 = ∂L/∂μ₁ = M - Mc
0 = ∂L/∂μ₂ = K - Kc
0 = ∂L/∂μ₃ = N - Nc
0 = ∂L/∂μ₄ = L₁ₑ - Mc*Kc
0 = ∂L/∂μ₅ = L₂ₑ - Kc*Nc
```
The first 3 constraints complicate things, because they're trivially solved by setting `M = Mc`, `K = Kc`, and `N = Nc`.
But this will violate the last two constraints in general; normally we will be on the interior of the inequalities,
meaning we'd be dropping those constraints. Doing so, this leaves us with:

First, let's just solve the cost without constraints 1-3:
```julia
0 = ∂L/∂Mc = - α * MKN / Mc² - μ₄*Kc
0 = ∂L/∂Kc = - MKN / Kc² - μ₄*Mc - μ₅*Nc
0 = ∂L/∂Nc = - β * MKN / Nc² - μ₅*Kc
0 = ∂L/∂μ₄ = L₁ₑ - Mc*Kc
0 = ∂L/∂μ₅ = L₂ₑ - Kc*Nc
```
Solving:
```julia
Mc = √(L₁ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₂ₑ)
Kc = √(L₁ₑ)*√(L₂ₑ)/√(L₁ₑ*β + L₂ₑ*α)
Nc = √(L₂ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₁ₑ)
μ₄ = -K*√(L₂ₑ)*M*N*α/(L₁ₑ^(3/2)*√(L₁ₑ*β + L₂ₑ*α))
μ₅ = -K*√(L₁ₑ)*M*N*β/(L₂ₑ^(3/2)*√(L₁ₑ*β + L₂ₑ*α))
```
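
As a quick numeric sanity check (the effective cache sizes and weights below are made up for illustration),
these closed forms land exactly on the two active constraints:
```julia
L₁ₑ, L₂ₑ, α, β = 2.0^15, 2.0^20, 0.1, 0.05   # made-up effective cache sizes and weights
Mc = √(L₁ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₂ₑ)
Kc = √(L₁ₑ)*√(L₂ₑ)/√(L₁ₑ*β + L₂ₑ*α)
Nc = √(L₂ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₁ₑ)
Mc * Kc ≈ L₁ₑ  # true: packed A exactly fills the first packing cache
Kc * Nc ≈ L₂ₑ  # true: packed B exactly fills the second packing cache
```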
These solutions are independent of the matrix sizes.
The approach we'll take here is solving for `Nc`, `Kc`, and then finally `Mc` one after the other, incorporating sizes.

Starting with `N`, we check how many iterations would be implied by `Nc`, and then choose the smallest value that would
yield that number of iterations. This also ensures that `Nc ≤ N`.
```julia
Niter = cld(N, √(L₂ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₁ₑ))
Nblock, Nrem = divrem(N, Niter)
Nblock_Nrem = Nblock + (Nrem > 0)
```
We have `Nrem` iterations of size `Nblock_Nrem`, and `Niter - Nrem` iterations of size `Nblock`.
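
For example, with hypothetical numbers, say `N = 1002` and an optimal `Nc` of roughly `300`:
```julia
N, Nc_init = 1002, 300                         # hypothetical values
Niter = cld(N, Nc_init)                        # 4
Nblock, Nrem = divrem(N, Niter)                # (250, 2)
Nblock_Nrem = Nblock + (Nrem > 0)              # 251
Nrem * Nblock_Nrem + (Niter - Nrem) * Nblock   # 1002 == N
```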

We can now make `Nc = Nblock_Nrem` a constant, and solve the remaining three equations again:
```julia
0 = ∂L/∂Mc = - α * MKN / Mc² - μ₄*Kc
0 = ∂L/∂Kc = - MKN / Kc² - μ₄*Mc - μ₅*Nc
0 = ∂L/∂μ₄ = L₁ₑ - Mc*Kc
```
yielding
```julia
Mc = √(L₁ₑ)*√(α)
Kc = √(L₁ₑ)/√(α)
μ₄ = -K*M*N*√(α)/L₁ₑ^(3/2)
```
We proceed in the same fashion as for `Nc`, being sure to reapply the `Kc * Nc ≤ L₂ₑ` constraint:
```julia
Kiter = cld(K, min(√(L₁ₑ)/√(α), L₂ₑ/Nc))
Kblock, Krem = divrem(K, Kiter)
Kblock_Krem = Kblock + (Krem > 0)
```
This leaves the `Mc` partitioning, for which we use the constraint `Mc * Kc ≤ L₁ₑ` to set
the initial number of proposed iterations as `cld(M, L₁ₑ / Kcm)` for calling `split_m`.
```julia
Mbsize, Mrem, Mremfinal, Mblocks = split_m(M, cld(M, L₁ₑ / Kcm), StaticInt{W}())
```

Note that for synchronization on `B`, all threads must have the same values for `Kc` and `Nc`.
`K` and `N` will be equal between threads, but `M` may differ. By calculating `Kc` and `Nc`
independently of `M`, this algorithm guarantees all threads are on the same page.
"""
@inline function solve_block_sizes(::Type{T}, M, K, N, _α, _β, R₂, R₃, Wfactor) where {T}
W = VectorizationBase.pick_vector_width_val(T)
Mr = StaticInt{LoopVectorization.mᵣ}()
Nr = StaticInt{LoopVectorization.nᵣ}()
α = _α * W
β = _β * W
L₁ₑ = first_effective_cache(T) * R₂
L₂ₑ = second_effective_cache(T) * R₃

Mc = (VectorizationBase.REGISTER_COUNT == 32 ? StaticInt{4}() : StaticInt{9}()) * Mr * W
Kc = (StaticInt{5}() * L2) ÷ (StaticInt{7}() * Mc)
Nc = ((StaticInt{5}() * L3) ÷ (StaticInt{7}() * Kc * Nr)) * Nr
# Nc_init = round(Int, √(L₂ₑ)*√(α * L₂ₑ + β * L₁ₑ)/√(L₁ₑ))
Nc_init⁻¹ = √(L₁ₑ) / (√(L₂ₑ)*√(α * L₂ₑ + β * L₁ₑ))

Niter = cldapproxi(N, Nc_init⁻¹) # approximate `ceil`
Nblock, Nrem = divrem_fast(N, Niter)
Nblock_Nrem = Nblock + One()#(Nrem > 0)

Mc, Kc, Nc
((Mblock, Mblock_Mrem, Mremfinal, Mrem, Miter), (Kblock, Kblock_Krem, Krem, Kiter)) = solve_McKc(T, M, K, Nblock_Nrem, _α, _β, R₂, R₃, Wfactor)

(Mblock, Mblock_Mrem, Mremfinal, Mrem, Miter), (Kblock, Kblock_Krem, Krem, Kiter), promote(Nblock, Nblock_Nrem, Nrem, Niter)
end
# Takes Nc, calcs Mc and Kc
@inline function solve_McKc(::Type{T}, M, K, Nc, _α, _β, R₂, R₃, Wfactor) where {T}
W = VectorizationBase.pick_vector_width_val(T)
α = _α * W
β = _β * W
L₁ₑ = first_effective_cache(T) * R₂
L₂ₑ = second_effective_cache(T) * R₃

Kc_init⁻¹ = Base.FastMath.max_fast(√(α/L₁ₑ), Nc*inv(L₂ₑ))
Kiter = cldapproxi(K, Kc_init⁻¹) # approximate `ceil`
Kblock, Krem = divrem_fast(K, Kiter)
Kblock_Krem = Kblock + One()

Miter_init = cldapproxi(M * inv(L₁ₑ), Kblock_Krem) # Miter = M * Kc / L₁ₑ
Mbsize, Mrem, Mremfinal, Miter = split_m(M, Miter_init, W * Wfactor)
Mblock_Mrem = Mbsize + W * Wfactor

promote(Mbsize, Mblock_Mrem, Mremfinal, Mrem, Miter), promote(Kblock, Kblock_Krem, Krem, Kiter)
end

@inline cldapproxi(n, d⁻¹) = Base.fptosi(Int, Base.FastMath.add_fast(Base.FastMath.mul_fast(n, d⁻¹), 0.9999999999999432)) # approximate `ceil`
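# For example (illustrative): `cldapproxi(517, 1/72)` computes 517/72 + 0.999… ≈ 8.18 and truncates to 8,
# matching `cld(517, 72) == 8`, trading the integer division for a multiplication by a precomputed reciprocal.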

"""
find_first_acceptable(M, W)

Finds the first combination of `Miter` and `Niter` that doesn't make `M` too small while producing `Miter * Niter = NUM_CORES`.
This would be awkward if there were computers with prime numbers of cores. I should probably consider that possibility at some point.
"""
@inline function find_first_acceptable(M, W)
Mᵣ = StaticInt{mᵣ}() * W
for (miter,niter) ∈ CORE_FACTORS
if miter * ((MᵣW_mul_factor - One()) * Mᵣ) ≤ M + (W + W)
return miter, niter
end
end
last(CORE_FACTORS)
end
"""
divide_blocks(M, Ntotal, _nspawn, W)

Splits both `M` and `N` into blocks when trying to spawn a large number of threads relative to the size of the matrices.
"""
@inline function divide_blocks(M, Ntotal, _nspawn, W)
_nspawn == NUM_CORES && return find_first_acceptable(M, W)

Miter = clamp(div_fast(M, W*StaticInt{mᵣ}() * MᵣW_mul_factor), 1, _nspawn)
nspawn = div_fast(_nspawn, Miter)
if (nspawn ≤ 1) & (Miter < _nspawn)
# rebalance Miter
Miter = cld_fast(_nspawn, cld_fast(_nspawn, Miter))
nspawn = div_fast(_nspawn, Miter)
end
Miter, cld_fast(Ntotal, max(2, cld_fast(Ntotal, nspawn)))
end

_calculate_L3(_L2, _L3, st, ::Val{true}) = (_L3 - _L2) ÷ st
_calculate_L3(_L2, _L3, st, ::Val{false}) = _L3 ÷ st