diff --git a/.gitignore b/.gitignore index 20a8981..9930b45 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,7 @@ /docs/build/ Manifest.toml *~ +*# +*#* +*#*# diff --git a/Project.toml b/Project.toml index 9cd614c..14573f8 100644 --- a/Project.toml +++ b/Project.toml @@ -1,20 +1,23 @@ name = "Octavian" uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4" authors = ["Mason Protter", "Chris Elrod", "Dilum Aluthge", "contributors"] -version = "0.1.0" +version = "0.2.0-DEV" [deps] ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" +ThreadingUtilities = "8290d209-cae3-49c0-8002-c8c24d57dab5" VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f" [compat] ArrayInterface = "2.14" LoopVectorization = "0.9.18" +ThreadingUtilities = "0.2" VectorizationBase = "0.15.2" julia = "1.5" [extras] +Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -24,4 +27,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f" [targets] -test = ["BenchmarkTools", "InteractiveUtils", "LinearAlgebra", "LoopVectorization", "Random", "VectorizationBase", "Test"] +test = ["Aqua", "BenchmarkTools", "InteractiveUtils", "LinearAlgebra", "LoopVectorization", "Random", "VectorizationBase", "Test"] diff --git a/src/Octavian.jl b/src/Octavian.jl index ee420dc..0196b36 100644 --- a/src/Octavian.jl +++ b/src/Octavian.jl @@ -1,23 +1,33 @@ module Octavian -import ArrayInterface -import LoopVectorization -import VectorizationBase +using VectorizationBase, ArrayInterface, LoopVectorization -using VectorizationBase: StaticInt +using VectorizationBase: align, gep, AbstractStridedPointer, AbstractSIMDVector, vnoaliasstore!, staticm1, + static_sizeof, lazymul, vmul_fast, StridedPointer, gesp, zero_offsets, pause, + CACHE_COUNT, NUM_CORES, CACHE_INCLUSIVITY, zstridedpointer +using LoopVectorization: maybestaticsize, mᵣ, nᵣ, preserve_buffer, CloseOpen +using ArrayInterface: StaticInt, Zero, One, OptionallyStaticUnitRange, size, strides, offsets, indices, + static_length, static_first, static_last, axes, + dense_dims, DenseDims, stride_rank, StrideRank -export matmul -export matmul! +using ThreadingUtilities: + _atomic_add!, _atomic_umax!, _atomic_umin!, + _atomic_load, _atomic_store!, _atomic_cas_cmp!, + SPIN, WAIT, TASK, LOCK, STUP, taskpointer, + wake_thread!, __wait, load, store! 
+ +export matmul!, matmul_serial!, matmul, matmul_serial, StaticInt include("global_constants.jl") include("types.jl") - +include("staticfloats.jl") +include("integerdivision.jl") +include("memory_buffer.jl") include("block_sizes.jl") +include("funcptrs.jl") include("macrokernels.jl") -include("matmul.jl") -include("memory_buffer.jl") -include("pointer_matrix.jl") include("utils.jl") +include("matmul.jl") include("init.jl") # `Octavian.__init__()` is defined in this file diff --git a/src/block_sizes.jl b/src/block_sizes.jl index 80dbd96..7f7e502 100644 --- a/src/block_sizes.jl +++ b/src/block_sizes.jl @@ -1,70 +1,233 @@ + +first_effective_cache(::Type{T}) where {T} = StaticInt{FIRST__CACHE_SIZE}() ÷ static_sizeof(T) +second_effective_cache(::Type{T}) where {T} = StaticInt{SECOND_CACHE_SIZE}() ÷ static_sizeof(T) + +function block_sizes(::Type{T}, _α, _β, R₁, R₂) where {T} + W = VectorizationBase.pick_vector_width_val(T) + α = _α * W + β = _β * W + L₁ₑ = first_effective_cache(T) * R₁ + L₂ₑ = second_effective_cache(T) * R₂ + block_sizes(W, α, β, L₁ₑ, L₂ₑ) +end +function block_sizes(W, α, β, L₁ₑ, L₂ₑ) + MᵣW = StaticInt{mᵣ}() * W + + Mc = floortostaticint(√(L₁ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₂ₑ) / MᵣW) * MᵣW + Kc = roundtostaticint(√(L₁ₑ)*√(L₂ₑ)/√(L₁ₑ*β + L₂ₑ*α)) + Nc = floortostaticint(√(L₂ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₁ₑ) / StaticInt{nᵣ}()) * StaticInt{nᵣ}() + + Mc, Kc, Nc +end +function block_sizes(::Type{T}) where {T} + block_sizes(T, StaticFloat{W₁Default}(), StaticFloat{W₂Default}(), StaticFloat{R₁Default}(), StaticFloat{R₂Default}()) +end + """ - block_sizes(::Type{T}) -> (Mc, Kc, Nc) - -Returns the dimensions of our macrokernel, which iterates over the microkernel. That is, in -calculating `C = A * B`, our macrokernel will be called on `Mc × Kc` blocks of `A`, multiplying - them with `Kc × Nc` blocks of `B`, to update `Mc × Nc` blocks of `C`. - -We want these blocks to fit nicely in the cache. There is a lot of room for improvement here, but this -initial implementation should work reasonably well. - - -The constants `LoopVectorization.mᵣ` and `LoopVectorization.nᵣ` are the factors by which LoopVectorization -wants to unroll the rows and columns of the microkernel, respectively. `LoopVectorization` defines these -constants by running it's analysis on a gemm kernel; they're there for convenience/to make it easier to -implement matrix-multiply. -It also wants to vectorize the rows by `W = VectorizationBase.pick_vector_width_val(T)`. -Thus, the microkernel's dimensions are `(W * mᵣ) × nᵣ`; that is, the microkernel updates a `(W * mᵣ) × nᵣ` -block of `C`. - -Because the macrokernel iterates over tiles and repeatedly applies the microkernel, we would prefer the -macrokernel's dimensions to be an integer multiple of the microkernel's. -That is, we want `Mc` to be an integer multiple of `W * mᵣ` and `Nc` to be an integer multiple of `nᵣ`. - -Additionally, we want our blocks of `A` to fit in the core-local L2 cache. -Empirically, I found that when using `Float64` arrays, 72 rows works well on Haswell (where `W * mᵣ = 8`) -and 96 works well for Cascadelake (where `W * mᵣ = 24`). -So I kind of heuristically multiply `W * mᵣ` by `4` given 32 vector register (as in Cascadelake), which -would yield `96`, and multiply by `9` otherwise, which would give `72` on Haswell. -Ideally, we'd have a better means of picking. -I suspect relatively small numbers work well because I'm currently using a column-major memory layout for -the internal packing arrays. 
A column-major memory layout means that if our macro-kernel had a lot of rows,
-moving across columns would involve reading memory far apart, moving across memory pages more rapidly,
-hitting the TLB harder. This is why libraries like OpenBLAS and BLIS don't use a column-major layout, but
-reshape into a 3-d array, e.g. `A` will be reshaped into a `Mᵣ × Kc × (Mc ÷ Mᵣ)` array (also sometimes
-referred to as a tile-major matrix), so that all memory reads happen in consecutive memory locations.
-
-
-Now that we have `Mc`, we use it and the `L2` cache size to calculate `Kc`, but shave off a percent to
-leave room in the cache for some other things.
-
-We want out blocks of `B` to fir in the `L3` cache, so we can use the `L3` cache-size and `Kc` to
-similarly calculate `Nc`, with the additional note that we also divide and multiply by `nᵣ` to ensure
-that `Nc` is an integer multiple of `nᵣ`.
+    split_m(M, Miters_base, W)
+
+Splits `M` into at most `Miters_base` iterations.
+For example, if we wish to divide `517` iterations into roughly 7 blocks using multiples of `8`:
+
+```julia
+julia> split_m(517, 7, 8)
+(72, 2, 69, 7)
+```
+
+This suggests a base block size of `72`, with two of the iterations incremented by `8` (`= W`) to `80`,
+and a final block of `69` to handle the remainder. It also tells us that there are `7` total iterations, as requested.
+```julia
+julia> 80*2 + 72*(7-2-1) + 69
+517
+```
+This is meant to produce roughly the requested number of blocks, with relatively even sizes.
+
+This method is used fairly generally.
"""
-function block_sizes(::Type{T}) where {T}
-    _L1, _L2, __L3, _L4 = VectorizationBase.CACHE_SIZE
-    L1c, L2c, L3c, L4c = VectorizationBase.CACHE_COUNT
-    # TODO: something better than treating it as 4 MiB if there is no L3 cache
-    # one possibility is to focus on the L1 and L2 caches instead of the L2 and L3.
-    _L3 = something(__L3, 4194304)
-    @assert L1c == L2c == VectorizationBase.NUM_CORES
+@inline function split_m(M, _Mblocks, W)
+    Miters = cld_fast(M, W)
+    Mblocks = min(_Mblocks, Miters)
+    Miter_per_block, Mrem = divrem_fast(Miters, Mblocks)
+    Mbsize = Miter_per_block * W
+    Mremfinal = M - Mbsize*(Mblocks-1) - Mrem * W
+    Mbsize, Mrem, Mremfinal, Mblocks
+end
+
+"""
+    solve_block_sizes(::Type{T}, M, K, N, α, β, R₂, R₃, Wfactor)
+
+This function returns iteration/blocking descriptions `Mc`, `Kc`, and `Nc` for use when packing both `A` and `B`.
+
+It tries to roughly minimize the cost
+```julia
+MKN/(Kc*W) + α * MKN/Mc + β * MKN/Nc
+```
+subject to constraints
+```julia
+Mc - M ≤ 0
+Kc - K ≤ 0
+Nc - N ≤ 0
+Mc*Kc - L₁ₑ ≤ 0
+Kc*Nc - L₂ₑ ≤ 0
+```
+That is, our constraints say that our block sizes shouldn't be bigger than the actual dimensions, and also that
+our packed `A` (`Mc × Kc`) should fit into the first packing cache (generally, this is actually the `L₂`), and our
+packed `B` (`Kc × Nc`) should fit into the second packing cache (generally the `L₃`).
-    st = VectorizationBase.static_sizeof(T)
+Our cost model consists of three components:
+1. Cost of moving data in and out of registers. This is done `(M/Mᵣ * K/Kc * N/Nᵣ)` times and the cost per iteration is `(Mᵣ/W * Nᵣ)`.
+2. Cost of moving strips of the packed `B` from the lower cache levels to the highest cache levels when multiplying `Aₚ * Bₚ`.
+   This is done `(M / Mc * K / Kc * N / Nc)` times, and the cost per iteration is proportional to `(Kc * Nᵣ)`.
+   `α` is the proportionality-constant parameter.
+3. Cost of packing `A`. This is done `(M / Mc * K / Kc * N / Nc)` times, and the cost per iteration is proportional to
+   `(Mc * Kc)`.
+   `β` is the proportionality-constant parameter.
-    L2 = (StaticInt{_L2}() - StaticInt{_L1}()) ÷ st
-    L3 = _calculate_L3(StaticInt{_L2}(), StaticInt{_L3}(), st, Val{VectorizationBase.CACHE_INCLUSIVITY[3]}())
+As `W` is a constant, we multiply the cost by `W` and absorb it into `α` and `β`. We drop it from the description
+from here on out.
+In the full problem, we would have the Lagrangian, with μᵢ < 0:
+```julia
+f((Mc,Kc,Nc),(μ₁,μ₂,μ₃,μ₄,μ₅)) = MKN/Kc + α * MKN/Mc + β * MKN/Nc -
+    μ₁*(Mc - M) - μ₂*(Kc - K) - μ₃*(Nc - N) - μ₄*(Mc*Kc - L₁ₑ) - μ₅*(Kc*Nc - L₂ₑ)
+```
+```julia
+0 = ∂L/∂Mc = - α * MKN / Mc² - μ₁ - μ₄*Kc
+0 = ∂L/∂Kc = - MKN / Kc² - μ₂ - μ₄*Mc - μ₅*Nc
+0 = ∂L/∂Nc = - β * MKN / Nc² - μ₃ - μ₅*Kc
+0 = ∂L/∂μ₁ = M - Mc
+0 = ∂L/∂μ₂ = K - Kc
+0 = ∂L/∂μ₃ = N - Nc
+0 = ∂L/∂μ₄ = L₁ₑ - Mc*Kc
+0 = ∂L/∂μ₅ = L₂ₑ - Kc*Nc
+```
+The first 3 constraints complicate things, because they're trivially solved by setting `M = Mc`, `K = Kc`, and `N = Nc`.
+But this will violate the last two constraints in general; normally we will be on the interior of the inequalities,
+meaning we'd be dropping those constraints. So first, let's just solve the cost without constraints 1-3:
+```julia
+0 = ∂L/∂Mc = - α * MKN / Mc² - μ₄*Kc
+0 = ∂L/∂Kc = - MKN / Kc² - μ₄*Mc - μ₅*Nc
+0 = ∂L/∂Nc = - β * MKN / Nc² - μ₅*Kc
+0 = ∂L/∂μ₄ = L₁ₑ - Mc*Kc
+0 = ∂L/∂μ₅ = L₂ₑ - Kc*Nc
+```
+Solving:
+```julia
+Mc = √(L₁ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₂ₑ)
+Kc = √(L₁ₑ)*√(L₂ₑ)/√(L₁ₑ*β + L₂ₑ*α)
+Nc = √(L₂ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₁ₑ)
+μ₄ = -K*√(L₂ₑ)*M*N*α/(L₁ₑ^(3/2)*√(L₁ₑ*β + L₂ₑ*α))
+μ₅ = -K*√(L₁ₑ)*M*N*β/(L₂ₑ^(3/2)*√(L₁ₑ*β + L₂ₑ*α))
+```
+These solutions are independent of matrix size.
+The approach we'll take here is solving for `Nc`, `Kc`, and then finally `Mc` one after the other, incorporating sizes.
+
+Starting with `N`, we check how many iterations would be implied by `Nc`, and then choose the smallest value that would
+yield that number of iterations. This also ensures that `Nc ≤ N`.
+```julia
+Niter = cld(N, √(L₂ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₁ₑ))
+Nblock, Nrem = divrem(N, Niter)
+Nblock_Nrem = Nblock + (Nrem > 0)
+```
+We have `Nrem` iterations of size `Nblock_Nrem`, and `Niter - Nrem` iterations of size `Nblock`.
+
+We can now make `Nc = Nblock_Nrem` a constant, drop the `μ₅` constraint (we reapply `Kc * Nc ≤ L₂ₑ` below),
+and solve the remaining equations again:
+```julia
+0 = ∂L/∂Mc = - α * MKN / Mc² - μ₄*Kc
+0 = ∂L/∂Kc = - MKN / Kc² - μ₄*Mc
+0 = ∂L/∂μ₄ = L₁ₑ - Mc*Kc
+```
+yielding
+```julia
+Mc = √(L₁ₑ)*√(α)
+Kc = √(L₁ₑ)/√(α)
+μ₄ = -K*M*N*√(α)/L₁ₑ^(3/2)
+```
+We proceed in the same fashion as for `Nc`, being sure to reapply the `Kc * Nc ≤ L₂ₑ` constraint:
+```julia
+Kiter = cld(K, min(√(L₁ₑ)/√(α), L₂ₑ/Nc))
+Kblock, Krem = divrem(K, Kiter)
+Kblock_Krem = Kblock + (Krem > 0)
+```
+This leaves the `Mc` partitioning, for which we use the constraint `Mc * Kc ≤ L₁ₑ` to set
+the initial number of proposed iterations to `cld(M, L₁ₑ / Kblock_Krem)` when calling `split_m`:
+```julia
+Mbsize, Mrem, Mremfinal, Mblocks = split_m(M, cld(M, L₁ₑ / Kblock_Krem), StaticInt{W}())
+```
+
+Note that for synchronization on `B`, all threads must have the same values for `Kc` and `Nc`.
+`K` and `N` will be equal between threads, but `M` may differ. By calculating `Kc` and `Nc`
+independently of `M`, this algorithm guarantees all threads are on the same page.
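+
+As a quick numeric sanity check of the closed-form optimum (with made-up values for `α`, `β`, `L₁ₑ`, and `L₂ₑ`;
+the real values derive from the tuned `W₁`/`W₂`/`R₁`/`R₂` constants and the measured cache sizes), the solution
+is tight on both cache constraints, and perturbing it along those constraints only increases the cost:
+```julia
+α, β, L₁ₑ, L₂ₑ = 0.05, 0.75, 2^15, 2^18   # hypothetical values, for illustration only
+cost(Mc, Kc, Nc) = 1/Kc + α/Mc + β/Nc     # the objective, with MKN (and W) factored out
+Mc = √(L₁ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₂ₑ)
+Kc = √(L₁ₑ)*√(L₂ₑ)/√(L₁ₑ*β + L₂ₑ*α)
+Nc = √(L₂ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₁ₑ)
+Mc*Kc ≈ L₁ₑ && Kc*Nc ≈ L₂ₑ                # true: both cache constraints are tight
+onconstraints(m) = (m, L₁ₑ/m, L₂ₑ*m/L₁ₑ)  # points keeping both constraints tight
+cost(onconstraints(Mc)...) ≤
+    min(cost(onconstraints(0.9Mc)...), cost(onconstraints(1.1Mc)...)) # true
+```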
+""" +@inline function solve_block_sizes(::Type{T}, M, K, N, _α, _β, R₂, R₃, Wfactor) where {T} W = VectorizationBase.pick_vector_width_val(T) - Mr = StaticInt{LoopVectorization.mᵣ}() - Nr = StaticInt{LoopVectorization.nᵣ}() + α = _α * W + β = _β * W + L₁ₑ = first_effective_cache(T) * R₂ + L₂ₑ = second_effective_cache(T) * R₃ - Mc = (VectorizationBase.REGISTER_COUNT == 32 ? StaticInt{4}() : StaticInt{9}()) * Mr * W - Kc = (StaticInt{5}() * L2) ÷ (StaticInt{7}() * Mc) - Nc = ((StaticInt{5}() * L3) ÷ (StaticInt{7}() * Kc * Nr)) * Nr + # Nc_init = round(Int, √(L₂ₑ)*√(α * L₂ₑ + β * L₁ₑ)/√(L₁ₑ)) + Nc_init⁻¹ = √(L₁ₑ) / (√(L₂ₑ)*√(α * L₂ₑ + β * L₁ₑ)) + + Niter = cldapproxi(N, Nc_init⁻¹) # approximate `ceil` + Nblock, Nrem = divrem_fast(N, Niter) + Nblock_Nrem = Nblock + One()#(Nrem > 0) - Mc, Kc, Nc + ((Mblock, Mblock_Mrem, Mremfinal, Mrem, Miter), (Kblock, Kblock_Krem, Krem, Kiter)) = solve_McKc(T, M, K, Nblock_Nrem, _α, _β, R₂, R₃, Wfactor) + + (Mblock, Mblock_Mrem, Mremfinal, Mrem, Miter), (Kblock, Kblock_Krem, Krem, Kiter), promote(Nblock, Nblock_Nrem, Nrem, Niter) +end +# Takes Nc, calcs Mc and Kc +@inline function solve_McKc(::Type{T}, M, K, Nc, _α, _β, R₂, R₃, Wfactor) where {T} + W = VectorizationBase.pick_vector_width_val(T) + α = _α * W + β = _β * W + L₁ₑ = first_effective_cache(T) * R₂ + L₂ₑ = second_effective_cache(T) * R₃ + + Kc_init⁻¹ = Base.FastMath.max_fast(√(α/L₁ₑ), Nc*inv(L₂ₑ)) + Kiter = cldapproxi(K, Kc_init⁻¹) # approximate `ceil` + Kblock, Krem = divrem_fast(K, Kiter) + Kblock_Krem = Kblock + One() + + Miter_init = cldapproxi(M * inv(L₁ₑ), Kblock_Krem) # Miter = M * Kc / L₁ₑ + Mbsize, Mrem, Mremfinal, Miter = split_m(M, Miter_init, W * Wfactor) + Mblock_Mrem = Mbsize + W * Wfactor + + promote(Mbsize, Mblock_Mrem, Mremfinal, Mrem, Miter), promote(Kblock, Kblock_Krem, Krem, Kiter) +end + +@inline cldapproxi(n, d⁻¹) = Base.fptosi(Int, Base.FastMath.add_fast(Base.FastMath.mul_fast(n, d⁻¹), 0.9999999999999432)) # approximate `ceil` + +""" + find_first_acceptable(M, W) + +Finds first combination of `Miter` and `Niter` that doesn't make `M` too small while producing `Miter * Niter = NUM_CORES`. +This would be awkard if there are computers with prime numbers of cores. I should probably consider that possibility at some point. +""" +@inline function find_first_acceptable(M, W) + Mᵣ = StaticInt{mᵣ}() * W + for (miter,niter) ∈ CORE_FACTORS + if miter * ((MᵣW_mul_factor - One()) * Mᵣ) ≤ M + (W + W) + return miter, niter + end + end + last(CORE_FACTORS) +end +""" + divide_blocks(M, Ntotal, _nspawn, W) + +Splits both `M` and `N` into blocks when trying to spawn a large number of threads relative to the size of the matrices. 
+""" +@inline function divide_blocks(M, Ntotal, _nspawn, W) + _nspawn == NUM_CORES && return find_first_acceptable(M, W) + + Miter = clamp(div_fast(M, W*StaticInt{mᵣ}() * MᵣW_mul_factor), 1, _nspawn) + nspawn = div_fast(_nspawn, Miter) + Niter = if (nspawn ≤ 1) & (Miter < _nspawn) + # rebalance Miter + Miter = cld_fast(_nspawn, cld_fast(_nspawn, Miter)) + nspawn = div_fast(_nspawn, Miter) + end + Miter, cld_fast(Ntotal, max(2, cld_fast(Ntotal, nspawn))) end -_calculate_L3(_L2, _L3, st, ::Val{true}) = (_L3 - _L2) ÷ st -_calculate_L3(_L2, _L3, st, ::Val{false}) = _L3 ÷ st diff --git a/src/funcptrs.jl b/src/funcptrs.jl new file mode 100644 index 0000000..a021484 --- /dev/null +++ b/src/funcptrs.jl @@ -0,0 +1,129 @@ + + +struct LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd} <: Function end +function (::LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd})(p::Ptr{UInt}) where {P,TC,TA,TB,Α,Β,Md,Kd,Nd} + offset, C = load(p, TC, 1) + offset, A = load(p, TA, offset) + offset, B = load(p, TB, offset) + offset, α = load(p, Α, offset) + offset, β = load(p, Β, offset) + offset, M = load(p, Md, offset) + offset, K = load(p, Kd, offset) + offset, N = load(p, Nd, offset) + _call_loopmul!(C, A, B, α, β, M, K, N, Val{P}()) + nothing +end +@inline _call_loopmul!(C, A, B, α, β, M, K, N, ::Val{false}) = loopmul!(C, A, B, α, β, M, K, N) +@inline function _call_loopmul!(C::StridedPointer{T}, A, B, α, β, M, K, N, ::Val{true}) where {T} + if M*K < first_effective_cache(T) * R₂Default + packaloopmul!(C, A, B, α, β, M, K, N) + return + else + matmul_st_only_pack_A!(C, A, B, α, β, M, K, N, StaticFloat{W₁Default}(), StaticFloat{W₂Default}(), StaticFloat{R₁Default}(), StaticFloat{R₂Default}()) + return + end +end +call_loopmul!(C, A, B, α, β, M, K, N, ::Val{P}) where {P} = _call_loopmul!(C, A, B, α, β, M, K, N, Val{P}()) + +struct SyncMulFunc{TC,TA,TB,Α,Β,Md,Kd,Nd,AP,BCP,ID,TT,W₁,W₂,R₁,R₂} <: Function end +function (::SyncMulFunc{TC,TA,TB,Α,Β,Md,Kd,Nd,AP,BCP,ID,TT,W₁,W₂,R₁,R₂})(p::Ptr{UInt}) where {TC,TA,TB,Α,Β,Md,Kd,Nd,AP,BCP,ID,TT,W₁,W₂,R₁,R₂} + offset, C = load(p, TC, 1) + offset, A = load(p, TA, offset) + offset, B = load(p, TB, offset) + offset, α = load(p, Α, offset) + offset, β = load(p, Β, offset) + offset, M = load(p, Md, offset) + offset, K = load(p, Kd, offset) + offset, N = load(p, Nd, offset) + offset, atomicp = load(p, AP, offset) + offset, bcachep = load(p, BCP, offset) + offset, id = load(p, ID, offset) + offset, total_ids = load(p, TT, offset) + sync_mul!(C, A, B, α, β, M, K, N, atomicp, bcachep, id, total_ids, StaticFloat{W₁}(), StaticFloat{W₂}(), StaticFloat{R₁}(), StaticFloat{R₂}()) + nothing +end + +@generated function cfuncpointer(::T) where {T} + precompile(T(), (Ptr{UInt},)) + quote + $(Expr(:meta,:inline)) + @cfunction($(T()), Cvoid, (Ptr{UInt},)) + end +end + +@inline function setup_matmul!(p::Ptr{UInt}, C::TC, A::TA, B::TB, α::Α, β::Β, M::Md, K::Kd, N::Nd, ::Val{P}) where {P,TC,TA,TB,Α,Β,Md,Kd,Nd} + offset = store!(p, cfuncpointer(LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd}()), 0) + offset = store!(p, C, offset) + offset = store!(p, A, offset) + offset = store!(p, B, offset) + offset = store!(p, α, offset) + offset = store!(p, β, offset) + offset = store!(p, M, offset) + offset = store!(p, K, offset) + offset = store!(p, N, offset) + nothing +end + +@inline function setup_syncmul!( + p::Ptr{UInt}, C::TC, A::TA, B::TB, α::Α, β::Β, M::Md, K::Kd, N::Nd, + ap::AP,bcp::BCP,id::ID,tt::TT,::StaticFloat{W₁},::StaticFloat{W₂},::StaticFloat{R₁},::StaticFloat{R₂} +) where {TC,TA,TB,Α,Β,Md,Kd,Nd,AP,BCP,ID,TT,W₁,W₂,R₁,R₂} + offset = 
store!(p, cfuncpointer(SyncMulFunc{TC,TA,TB,Α,Β,Md,Kd,Nd,AP,BCP,ID,TT,W₁,W₂,R₁,R₂}()), 0) + offset = store!(p, C, offset) + offset = store!(p, A, offset) + offset = store!(p, B, offset) + offset = store!(p, α, offset) + offset = store!(p, β, offset) + offset = store!(p, M, offset) + offset = store!(p, K, offset) + offset = store!(p, N, offset) + offset = store!(p, ap, offset) + offset = store!(p, bcp, offset) + offset = store!(p, id, offset) + offset = store!(p, tt, offset) + nothing +end + +function launch_thread_mul!(C, A, B, α, β, M, K, N, tid::Int, ::Val{P}) where {P} + p = taskpointer(tid) + while true + if _atomic_cas_cmp!(p, SPIN, STUP) + setup_matmul!(p, C, A, B, α, β, M, K, N, Val{P}()) + @assert _atomic_cas_cmp!(p, STUP, TASK) + return + elseif _atomic_cas_cmp!(p, WAIT, STUP) + setup_matmul!(p, C, A, B, α, β, M, K, N, Val{P}()) + @assert _atomic_cas_cmp!(p, STUP, LOCK) + wake_thread!(tid % UInt) + return + end + pause() + end +end +function launch_thread_mul!( + C, A, B, α, β, M, K, N, ap, bcp, tid, tt,::StaticFloat{W₁},::StaticFloat{W₂},::StaticFloat{R₁},::StaticFloat{R₂} +) where {W₁,W₂,R₁,R₂} + p = taskpointer(tid+one(UInt)) + while true + if _atomic_cas_cmp!(p, SPIN, STUP) + setup_syncmul!( + p, C, A, B, α, β, M, K, N, ap, bcp, tid, tt, + StaticFloat{W₁}(),StaticFloat{W₂}(),StaticFloat{R₁}(),StaticFloat{R₂}() + ) + @assert _atomic_cas_cmp!(p, STUP, TASK) + return + elseif _atomic_cas_cmp!(p, WAIT, STUP) + # we immediately write the `atomicp, bc, tid, total_tids` part, so we can dispatch to the same code as the other methods + setup_syncmul!( + p, C, A, B, α, β, M, K, N, ap, bcp, tid, tt, + StaticFloat{W₁}(),StaticFloat{W₂}(),StaticFloat{R₁}(),StaticFloat{R₂}() + ) + @assert _atomic_cas_cmp!(p, STUP, LOCK) + wake_thread!(tid+one(UInt)) + return + end + pause() + end +end + + diff --git a/src/global_constants.jl b/src/global_constants.jl index 5cf2485..b13155e 100644 --- a/src/global_constants.jl +++ b/src/global_constants.jl @@ -1,3 +1,47 @@ const BCACHE = UInt8[] const OCTAVIAN_NUM_TASKS = Ref(1) +_nthreads() = OCTAVIAN_NUM_TASKS[] + +@generated function calc_factors(::Val{nc} = Val{NUM_CORES}()) where {nc} + t = Expr(:tuple) + for i ∈ nc:-1:1 + d, r = divrem(nc, i) + iszero(r) && push!(t.args, (i, d)) + end + t +end +const CORE_FACTORS = calc_factors() + +const MᵣW_mul_factor = VectorizationBase.REGISTER_SIZE === 64 ? StaticInt{4}() : StaticInt{9}() + +if VectorizationBase.AVX512F + const W₁Default = 0.006089395198610773 + const W₂Default = 0.7979822724696168 + const R₁Default = 0.5900561503730485 + const R₂Default = 0.762152930709678 +else + const W₁Default = 0.1 # TODO: relax bounds; this was the upper bound set for the optimizer. + const W₂Default = 0.15989396641218157 + const R₁Default = 0.4203583148344484 + const R₂Default = 0.6344856142604789 +end + +const FIRST__CACHE = 1 + (VectorizationBase.CACHE_SIZE[3] !== nothing) +const SECOND_CACHE = 2 + (VectorizationBase.CACHE_SIZE[3] !== nothing) +const FIRST__CACHE_SIZE = VectorizationBase.CACHE_SIZE[FIRST__CACHE] === nothing ? 262144 : + (((FIRST__CACHE == 2) & CACHE_INCLUSIVITY[2]) ? (VectorizationBase.CACHE_SIZE[2] - VectorizationBase.CACHE_SIZE[1]) : + VectorizationBase.CACHE_SIZE[FIRST__CACHE]) +const SECOND_CACHE_SIZE = (VectorizationBase.CACHE_SIZE[SECOND_CACHE] === nothing ? 3145728 : + (CACHE_INCLUSIVITY[SECOND_CACHE] ? 
(VectorizationBase.CACHE_SIZE[SECOND_CACHE] - VectorizationBase.CACHE_SIZE[FIRST__CACHE]) :
+    VectorizationBase.CACHE_SIZE[SECOND_CACHE])) * something(VectorizationBase.CACHE_COUNT[SECOND_CACHE], 1)
+
+const CACHELINESIZE = something(VectorizationBase.L₁CACHE.linesize, 64) % UInt
+const BCACHE_COUNT = something(VectorizationBase.CACHE_COUNT[3], 1);
+const BCACHE_LOCK = Threads.Atomic{UInt}(zero(UInt))
+
+if Sys.WORD_SIZE ≤ 32
+    const ACACHE = UInt8[]
+    const ACACHEPTR = Ref{Ptr{UInt8}}(C_NULL)
+end
+
diff --git a/src/init.jl b/src/init.jl
index 93a2fa3..f7172c0 100644
--- a/src/init.jl
+++ b/src/init.jl
@@ -1,11 +1,28 @@
 function __init__()
+    init_acache()
     init_bcache()
-    init_num_tasks()
-    return nothing
+    nt = init_num_tasks()
+    if nt < NUM_CORES && ("SUPPRESS_OCTAVIAN_WARNING" ∉ keys(ENV))
+        msg = string(
+            "Your system has $NUM_CORES physical cores, but `Octavian.jl` only has ",
+            "$(nt > 1 ? "$(nt) threads" : "1 thread") available. ",
+            "For the best performance, you should start Julia with at least $(NUM_CORES) threads.",
+        )
+        @warn msg
+    end
+    reset_bcache_lock!()
 end

 function init_bcache()
-    resize!(BCACHE, VectorizationBase.CACHE_SIZE[3] * VectorizationBase.CACHE_COUNT[3])
+    resize!(BCACHE, SECOND_CACHE_SIZE * BCACHE_COUNT); nothing
+end
+
+function init_acache()
+    Sys.WORD_SIZE ≤ 32 || return
+    resize!(ACACHE, FIRST__CACHE_SIZE * init_num_tasks() + 4095)
+    ACACHEPTR[] = align(pointer(ACACHE), 4096)
+    nothing
 end

 function init_num_tasks()
@@ -15,9 +32,10 @@ end

 function _read_environment_num_tasks()
     environment_variable = get(ENV, "OCTAVIAN_NUM_TASKS", "")::String
+    nt = min(Threads.nthreads(), VectorizationBase.NUM_CORES)::Int
     if isempty(environment_variable)
-        return min(Threads.nthreads(), VectorizationBase.NUM_CORES)::Int
+        return nt
     else
-        return parse(Int, environment_variable)::Int
+        return min(parse(Int, environment_variable)::Int, nt)
     end
 end
diff --git a/src/integerdivision.jl b/src/integerdivision.jl
new file mode 100644
index 0000000..c2b3925
--- /dev/null
+++ b/src/integerdivision.jl
@@ -0,0 +1,26 @@
+@inline cld_fast(x, y) = cld(x, y)
+@inline function cld_fast(x::I, y) where {I <: Integer}
+    d = div_fast(x, y)
+    (d + (d * unsigned(y) != unsigned(x))) % I
+end
+cld_fast(::StaticInt{N}, ::StaticInt{M}) where {N,M} = (StaticInt{N}() + StaticInt{M}() - One()) ÷ StaticInt{M}()
+@inline function divrem_fast(x::I, y) where {I <: Integer}
+    ux = unsigned(x); uy = unsigned(y)
+    d = Base.udiv_int(ux, uy)
+    r = ux - d * uy
+    d % I, r % I
+end
+@inline divrem_fast(::StaticInt{x}, y::I) where {x, I <: Integer} = divrem_fast(x % I, y)
+@inline div_fast(x::I, y::Integer) where {I <: Integer} = Base.udiv_int(unsigned(x), unsigned(y)) % I
+@inline div_fast(::StaticInt{x}, y::I) where {x, I <: Integer} = Base.udiv_int(unsigned(x), unsigned(y)) % I
+divrem_fast(::StaticInt{N}, ::StaticInt{M}) where {N,M} = divrem(StaticInt{N}(), StaticInt{M}())
+div_fast(::StaticInt{N}, ::StaticInt{M}) where {N,M} = StaticInt{N}() ÷ StaticInt{M}()
+@generated function div_fast(x::I, ::StaticInt{M}) where {I<:Integer,M}
+    if VectorizationBase.ispow2(M)
+        lm = VectorizationBase.intlog2(M)
+        Expr(:block, Expr(:meta,:inline), :(x >>> $lm))
+    else
+        Expr(:block, Expr(:meta,:inline), :(div_fast(x, $(I(M)))))
+    end
+end
+
diff --git a/src/macrokernels.jl b/src/macrokernels.jl
index 886dd5e..5b4f8a6 100644
--- a/src/macrokernels.jl
+++ b/src/macrokernels.jl
@@ -1,24 +1,347 @@
-"""
-The macrokernel. It iterates over our tiles, and applies the microkernel.
-""" -function macrokernel!(C, A, B, α, β) - iszero(β) && return macrokernel!(C, A, B, α, StaticInt{0}()) - LoopVectorization.@avx for n ∈ axes(C,2), m ∈ axes(C,1) - Cmn = zero(eltype(C)) - for k ∈ axes(B,1) - Cmn += A[m,k] * B[k,n] +@inline function loopmul!(C, A, B, ::StaticInt{1}, ::StaticInt{0}, M, K, N) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ += A[m,k] * B[k,n] end - # C[m,n] may be `NaN`, but if `β == 0` we still want to zero it - C[m,n] = (α * Cmn) + β * C[m,n] + C[m,n] = Cₘₙ end + nothing end -function macrokernel!(C, A, B, α, ::StaticInt{0}) - LoopVectorization.@avx for n ∈ axes(C,2), m ∈ axes(C,1) - Cmn = zero(eltype(C)) - for k ∈ axes(B,1) - Cmn += A[m,k] * B[k,n] +@inline function loopmul!(C, A, B, ::StaticInt{1}, ::StaticInt{1}, M, K, N) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ += A[m,k] * B[k,n] end - # C[m,n] may be `NaN`, but if `β == 0` we still want to zero it - C[m,n] = (α * Cmn) + C[m,n] += Cₘₙ end + nothing end +@inline function loopmul!(C, A, B, ::StaticInt{1}, β, M, K, N) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ += A[m,k] * B[k,n] + end + C[m,n] = Cₘₙ + β * C[m,n] + end + nothing +end +@inline function loopmul!(C, A, B, ::StaticInt{-1}, ::StaticInt{0}, M, K, N) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ -= A[m,k] * B[k,n] + end + C[m,n] = Cₘₙ + end + nothing +end +@inline function loopmul!(C, A, B, ::StaticInt{-1}, ::StaticInt{1}, M, K, N) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ -= A[m,k] * B[k,n] + end + C[m,n] += Cₘₙ + end + nothing +end +@inline function loopmul!(C, A, B, ::StaticInt{-1}, β, M, K, N) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ -= A[m,k] * B[k,n] + end + C[m,n] = Cₘₙ + β * C[m,n] + end + nothing +end +@inline function loopmul!(C, A, B, α, ::StaticInt{0}, M, K, N) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ += A[m,k] * B[k,n] + end + C[m,n] = α * Cₘₙ + end + nothing +end + +@inline function loopmul!(C, A, B, α, ::StaticInt{1}, M, K, N) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ += A[m,k] * B[k,n] + end + C[m,n] += α * Cₘₙ + end + nothing +end +@inline function loopmul!(C, A, B, α, β, M, K, N) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ += A[m,k] * B[k,n] + end + C[m,n] = α * Cₘₙ + β * C[m,n] + end + nothing +end + + +@inline function packamul!( + C, Ãₚ, A, B, + ::StaticInt{1}, ::StaticInt{0}, + M, K, N +) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Aₘₖ = A[m,k] + Cₘₙ += Aₘₖ * B[k,n] + Ãₚ[m,k] = Aₘₖ + end + C[m,n] = Cₘₙ + end +end +@inline function packamul!( + C, Ãₚ, A, B, + ::StaticInt{1}, ::StaticInt{1}, + M, K, N +) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Aₘₖ = A[m,k] + Cₘₙ += Aₘₖ * B[k,n] + Ãₚ[m,k] = Aₘₖ + end + C[m,n] += Cₘₙ + end +end +@inline function packamul!( + C, Ãₚ, A, B, + ::StaticInt{1}, β, + M, K, N +) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Aₘₖ = A[m,k] + Cₘₙ += Aₘₖ * B[k,n] + Ãₚ[m,k] = Aₘₖ + end + C[m,n] = Cₘₙ + β * C[m,n] + end +end +@inline function packamul!( + C, Ãₚ, A, 
B, + ::StaticInt{-1}, ::StaticInt{0}, + M, K, N +) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Aₘₖ = A[m,k] + Cₘₙ -= Aₘₖ * B[k,n] + Ãₚ[m,k] = Aₘₖ + end + C[m,n] = Cₘₙ + end +end +@inline function packamul!( + C, Ãₚ, A, B, + ::StaticInt{-1}, ::StaticInt{1}, + M, K, N +) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Aₘₖ = A[m,k] + Cₘₙ -= Aₘₖ * B[k,n] + Ãₚ[m,k] = Aₘₖ + end + C[m,n] += Cₘₙ + end +end +@inline function packamul!( + C, Ãₚ, A, B, + ::StaticInt{-1}, β, + M, K, N +) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Aₘₖ = A[m,k] + Cₘₙ -= Aₘₖ * B[k,n] + Ãₚ[m,k] = Aₘₖ + end + C[m,n] = Cₘₙ + β * C[m,n] + end +end +@inline function packamul!( + C, Ãₚ, A, B, + α, ::StaticInt{0}, + M, K, N +) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Aₘₖ = A[m,k] + Cₘₙ += Aₘₖ * B[k,n] + Ãₚ[m,k] = Aₘₖ + end + C[m,n] = α * Cₘₙ + end +end +@inline function packamul!( + C, Ãₚ, A, B, + α, ::StaticInt{1}, + M, K, N +) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Aₘₖ = A[m,k] + Cₘₙ += Aₘₖ * B[k,n] + Ãₚ[m,k] = Aₘₖ + end + C[m,n] += α * Cₘₙ + end +end +@inline function packamul!( + C, Ãₚ, A, B, + α, β, M, K, N +) + @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Aₘₖ = A[m,k] + Cₘₙ += Aₘₖ * B[k,n] + Ãₚ[m,k] = Aₘₖ + end + C[m,n] = α * Cₘₙ + β * C[m,n] + end +end +@inline function alloc_a_pack(A, M, ::Type{T}) where {T} + # buffer = Vector{T}(undef, FIRST__CACHE_SIZE ÷ sizeof(T)) + buffer = first_cache_buffer(T) + bufferptr = if Sys.WORD_SIZE == 32 + reinterpret(Ptr{T}, buffer) # Base.unsafe_convert(Ptr{T}, buffer) + else + align(pointer(buffer)) + end + Apack = default_zerobased_stridedpointer(bufferptr, (One(), align(M, T))) + Apack, buffer +end +@inline function packaloopmul!( + C::AbstractStridedPointer{T}, + A::AbstractStridedPointer, + B::AbstractStridedPointer, + α, β, M, K, N +) where {T} + Ãₚ, buffer = alloc_a_pack(A, M, T) + GC.@preserve buffer begin + Nᵣ = StaticInt{nᵣ}() + packamul!(C, Ãₚ, A, B, α, β, M, K, Nᵣ) + loopmul!(gesp(C, (Zero(), Nᵣ)), Ãₚ, gesp(B, (Zero(), Nᵣ)), α, β, M, K, N - Nᵣ) + end + nothing +end + + + +@inline function inlineloopmul!(C, A, B, ::StaticInt{1}, ::StaticInt{0}, M, K, N) + @avx inline=true for m ∈ CloseOpen(M), n ∈ CloseOpen(N) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ += A[m,k] * B[k,n] + end + C[m,n] = Cₘₙ + end + C +end +@inline function inlineloopmul!(C, A, B, ::StaticInt{1}, ::StaticInt{1}, M, K, N) + @avx inline=true for m ∈ CloseOpen(M), n ∈ CloseOpen(N) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ += A[m,k] * B[k,n] + end + C[m,n] += Cₘₙ + end + C +end +@inline function inlineloopmul!(C, A, B, ::StaticInt{1}, β, M, K, N) + @avx inline=true for m ∈ CloseOpen(M), n ∈ CloseOpen(N) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ += A[m,k] * B[k,n] + end + C[m,n] = Cₘₙ + β * C[m,n] + end + C +end +@inline function inlineloopmul!(C, A, B, ::StaticInt{-1}, ::StaticInt{0}, M, K, N) + @avx inline=true for m ∈ CloseOpen(M), n ∈ CloseOpen(N) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ -= A[m,k] * B[k,n] + end + C[m,n] = Cₘₙ + end + C +end +@inline function inlineloopmul!(C, A, B, ::StaticInt{-1}, ::StaticInt{1}, M, K, N) + @avx inline=true for m ∈ CloseOpen(M), n ∈ CloseOpen(N) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ -= A[m,k] * B[k,n] + end + 
C[m,n] += Cₘₙ + end + C +end +@inline function inlineloopmul!(C, A, B, ::StaticInt{-1}, β, M, K, N) + @avx inline=true for m ∈ CloseOpen(M), n ∈ CloseOpen(N) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ -= A[m,k] * B[k,n] + end + C[m,n] = Cₘₙ + β * C[m,n] + end + C +end +@inline function inlineloopmul!(C, A, B, α, ::StaticInt{0}, M, K, N) + @avx inline=true for m ∈ CloseOpen(M), n ∈ CloseOpen(N) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ += A[m,k] * B[k,n] + end + C[m,n] = α * Cₘₙ + end + C +end +@inline function inlineloopmul!(C, A, B, α, ::StaticInt{1}, M, K, N) + @avx inline=true for m ∈ CloseOpen(M), n ∈ CloseOpen(N) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ += A[m,k] * B[k,n] + end + C[m,n] += α * Cₘₙ + end + C +end +@inline function inlineloopmul!(C, A, B, α, β, M, K, N) + @avx inline=true for m ∈ CloseOpen(M), n ∈ CloseOpen(N) + Cₘₙ = zero(eltype(C)) + for k ∈ CloseOpen(K) + Cₘₙ += A[m,k] * B[k,n] + end + C[m,n] = α * Cₘₙ + β * C[m,n] + end + C +end + + diff --git a/src/matmul.jl b/src/matmul.jl index 658024d..470810b 100644 --- a/src/matmul.jl +++ b/src/matmul.jl @@ -1,90 +1,476 @@ -evenly_divide(x, y) = cld(x, cld(x, y)) -evenly_divide(x, y, z) = cld(evenly_divide(x, y), z) * z +""" +Only packs `A`. Primitively does column-major packing: it packs blocks of `A` into a column-major temporary. +""" +function matmul_st_only_pack_A!( + C::AbstractStridedPointer{T}, A::AbstractStridedPointer, B::AbstractStridedPointer, + α, β, M, K, N, ::StaticFloat{W₁}, ::StaticFloat{W₂}, ::StaticFloat{R₁}, ::StaticFloat{R₂} +) where {T, W₁, W₂, R₁, R₂} + + ((Mblock, Mblock_Mrem, Mremfinal, Mrem, Miter), (Kblock, Kblock_Krem, Krem, Kiter)) = + solve_McKc(T, M, K, N, StaticFloat{W₁}(), StaticFloat{W₂}(), StaticFloat{R₁}(), StaticFloat{R₂}(), StaticInt{mᵣ}()) + for ko ∈ CloseOpen(Kiter) + ksize = ifelse(ko < Krem, Kblock_Krem, Kblock) + let A = A, C = C + for mo in CloseOpen(Miter) + msize = ifelse((mo+1) == Miter, Mremfinal, ifelse(mo < Mrem, Mblock_Mrem, Mblock)) + # if ko == 0 + # loopmul!(C, A, B, α, β, msize, ksize, N) + # else + # loopmul!(C, A, B, α, One(), msize, ksize, N) + # end + if ko == 0 + packaloopmul!(C, A, B, α, β, msize, ksize, N) + else + packaloopmul!(C, A, B, α, One(), msize, ksize, N) + end + A = gesp(A, (msize, Zero())) + C = gesp(C, (msize, Zero())) + end + end + A = gesp(A, (Zero(), ksize)) + B = gesp(B, (ksize, Zero())) + end + nothing +end + +function matmul_st_pack_A_and_B!( + C::AbstractStridedPointer{T}, A::AbstractStridedPointer, B::AbstractStridedPointer, α, β, M, K, N, W₁, W₂, R₁, R₂, tid +) where {T} + # TODO: if this is nested in other threaded code, use only a piece of BCACHE and make R₂ (and thus L₂ₑ) smaller + (Mblock, Mblock_Mrem, Mremfinal, Mrem, Miter), (Kblock, Kblock_Krem, Krem, Kiter), (Nblock, Nblock_Nrem, Nrem, Niter) = + solve_block_sizes(T, M, K, N, W₁, W₂, R₁, R₂, mᵣ) + + bcache = _use_bcache(tid) + L3ptr = Base.unsafe_convert(Ptr{T}, bcache) + GC.@preserve BCACHE begin + for n ∈ CloseOpen(Niter) + nsize = ifelse(n < Nrem, Nblock_Nrem, Nblock) + let A = A, B = B + for k ∈ CloseOpen(Kiter) + ksize = ifelse(k < Krem, Kblock_Krem, Kblock) + _B = default_zerobased_stridedpointer(L3ptr, (One(),ksize)) + unsafe_copyto_avx!(_B, B, ksize, nsize) + let A = A, C = C, B = _B + for m in CloseOpen(Miter) + msize = ifelse((m+1) == Miter, Mremfinal, ifelse(m < Mrem, Mblock_Mrem, Mblock)) + if k == 0 + packaloopmul!(C, A, B, α, β, msize, ksize, nsize) + else + packaloopmul!(C, A, B, α, One(), msize, ksize, nsize) + end + A = gesp(A, (msize, 
Zero())) + C = gesp(C, (msize, Zero())) + end + end + A = gesp(A, (Zero(), ksize)) + B = gesp(B, (ksize, Zero())) + end + end + B = gesp(B, (Zero(), nsize)) + C = gesp(C, (Zero(), nsize)) + end + end # GC.@preserve + _free_bcache!(bcache) + nothing +end + +@inline contiguousstride1(A) = ArrayInterface.contiguous_axis(A) === ArrayInterface.Contiguous{1}() +@inline contiguousstride1(A::AbstractStridedPointer{T,N,1}) where {T,N} = true +# @inline bytestride(A::AbstractArray, i) = VectorizationBase.bytestrides(A)[i] +@inline bytestride(A::AbstractStridedPointer, i) = strides(A)[i] +@inline firstbytestride(A::AbstractStridedPointer) = bytestride(A, One()) + +@inline function vectormultiple(bytex, ::Type{Tc}, ::Type{Ta}) where {Tc,Ta} + Wc = VectorizationBase.pick_vector_width_val(Tc) * static_sizeof(Ta) - One() + iszero(bytex & (VectorizationBase.REGISTER_SIZE - 1)) +end +@inline function dontpack(pA::AbstractStridedPointer{Ta}, M, K, ::StaticInt{mc}, ::StaticInt{kc}, ::Type{Tc}) where {mc, kc, Tc, Ta} + (contiguousstride1(pA) && + ((((VectorizationBase.AVX512F ? 9 : 13) * VectorizationBase.pick_vector_width(Tc)) ≥ M) || + (vectormultiple(bytestride(pA, StaticInt{2}()), Tc, Ta) && ((M * K) ≤ (mc * kc)) && iszero(reinterpret(Int, pointer(pA)) & (VectorizationBase.REGISTER_SIZE - 1))))) +end -_dim1contig(::ArrayInterface.Contiguous) = false -_dim1contig(::ArrayInterface.Contiguous{1}) = true -dim1contig(::Type{A}) where {A <: StridedArray} = _dim1contig(ArrayInterface.contiguous_axis(A)) -dim1contig(::Type) = false -dim1contig(A) = dim1contig(typeof(A)) + +@inline function alloc_matmul_product(A::AbstractArray{TA}, B::AbstractArray{TB}) where {TA,TB} + # TODO: if `M` and `N` are statically sized, shouldn't return a `Matrix`. + M, KA = size(A) + KB, N = size(B) + @assert KA == KB "Size mismatch." 
+    Matrix{promote_type(TA,TB)}(undef, M, N), (M, KA, N)
+end
+@inline function matmul_serial(A::AbstractMatrix, B::AbstractMatrix)
+    C, (M,K,N) = alloc_matmul_product(A, B)
+    _matmul_serial!(C, A, B, One(), Zero(), (M,K,N))
+    return C
+end
+
+
+# These methods must evaluate to compile-time constants
+maybeinline(::Any, ::Any, ::Any, ::Any) = false
+function maybeinline(::StaticInt{M}, ::StaticInt{N}, ::Type{T}, ::Val{true}) where {M,N,T}
+    static_sizeof(T) * StaticInt{M}() * StaticInt{N}() < StaticInt{176}() * StaticInt(mᵣ) * StaticInt{nᵣ}()
+end
+function maybeinline(::StaticInt{M}, ::StaticInt{N}, ::Type{T}, ::Val{false}) where {M,N,T}
+    StaticInt{M}() * static_sizeof(T) ≤ StaticInt{2}() * StaticInt{VectorizationBase.REGISTER_SIZE}()
+end
+
+
+@inline function matmul_serial!(C::AbstractMatrix{T}, A::AbstractMatrix, B::AbstractMatrix) where {T}
+    matmul_serial!(C, A, B, One(), Zero(), nothing, ArrayInterface.contiguous_axis(C))
+end
+@inline function matmul_serial!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α)
+    matmul_serial!(C, A, B, α, Zero(), nothing, ArrayInterface.contiguous_axis(C))
+end
+@inline function matmul_serial!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α, β)
+    matmul_serial!(C, A, B, α, β, nothing, ArrayInterface.contiguous_axis(C))
+end
+@inline function matmul_serial!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α, β, MKN, ::ArrayInterface.Contiguous{2})
+    _matmul_serial!(C', B', A', α, β, nothing)
+    return C
+end
+@inline function matmul_serial!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α, β, MKN, ::ArrayInterface.Contiguous)
+    _matmul_serial!(C, A, B, α, β, nothing)
+    return C
+end
 """
-    matmul!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, _α = 1, _β = 0)
+    matmul_serial!(C, A, B[, α = 1, β = 0])
+
+Calculates `C = α * (A * B) + β * C` in place.
+
+A single-threaded matrix-matrix-multiply implementation.
+Supports dynamically and statically sized arrays.
+
+Organizationally, `matmul_serial!` checks the arrays' properties to try to dispatch to an appropriate implementation.
+If the arrays are small and statically sized, it will dispatch to an inlined multiply.
+
+Otherwise, based on the arrays' sizes, whether they are transposed, and whether the columns are already aligned,
+it decides whether to not pack at all, to pack only `A`, or to pack both `A` and `B`.
 """
+@inline function _matmul_serial!(
+    C::AbstractMatrix{T}, A::AbstractMatrix, B::AbstractMatrix, α, β, MKN
+) where {T}
+    M, K, N = MKN === nothing ? matmul_sizes(C, A, B) : MKN
+    pA = zstridedpointer(A); pB = zstridedpointer(B); pC = zstridedpointer(C);
+    Cb = preserve_buffer(C); Ab = preserve_buffer(A); Bb = preserve_buffer(B);
+    Mc, Kc, Nc = block_sizes(T)
+    GC.@preserve Cb Ab Bb begin
+        if maybeinline(M, N, T, ArrayInterface.is_column_major(A)) # check MUST be compile-time resolvable
+            inlineloopmul!(pC, pA, pB, One(), Zero(), M, K, N)
+            return
+        elseif VectorizationBase.CACHE_SIZE[2] === nothing || (nᵣ ≥ N) || dontpack(pA, M, K, Mc, Kc, T)
+            loopmul!(pC, pA, pB, α, β, M, K, N)
+            return
+        else
+            matmul_st_pack_dispatcher!(pC, pA, pB, α, β, M, K, N)
+            return
+        end
+    end
+end # function
+
+function matmul_st_pack_dispatcher!(pC::AbstractStridedPointer{T}, pA, pB, α, β, M, K, N, notnested::Union{Nothing,Bool} = nothing) where {T}
+    Mc, Kc, Nc = block_sizes(T)
+    if VectorizationBase.CACHE_SIZE[3] === nothing || (contiguousstride1(pB) ? (Kc * Nc ≥ K * N) : (firstbytestride(pB) ≤ 1600))
+        matmul_st_only_pack_A!(pC, pA, pB, α, β, M, K, N, StaticFloat{W₁Default}(), StaticFloat{W₂Default}(), StaticFloat{R₁Default}(), StaticFloat{R₂Default}())
+    elseif notnested === nothing ? iszero(ccall(:jl_in_threaded_region, Cint, ())) : notnested
+        matmul_st_pack_A_and_B!(pC, pA, pB, α, β, M, K, N, StaticFloat{W₁Default}(), StaticFloat{W₂Default}(), StaticFloat{R₁Default}(), StaticFloat{R₂Default}(), nothing)
+    else
+        matmul_st_pack_A_and_B!(pC, pA, pB, α, β, M, K, N, StaticFloat{W₁Default}(), StaticFloat{W₂Default}(), StaticFloat{R₁Default}(), R₂Default/Threads.nthreads(), Threads.threadid())
+    end
+    nothing
+end
+
+
+"""
+    matmul(A, B)
+
+Multiply matrices `A` and `B`.
+"""
+@inline function matmul(A::AbstractMatrix, B::AbstractMatrix)
+    C, (M,K,N) = alloc_matmul_product(A, B)
+    _matmul!(C, A, B, One(), Zero(), nothing, (M,K,N))
+    return C
+end
+
+"""
+    matmul!(C, A, B[, α, β, max_threads])
+
+Calculates `C = α * A * B + β * C` in place, overwriting the contents of `C`.
+It may use up to `max_threads` threads. It will not use threads when nested in other threaded code.
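+
+For example (a small correctness check; any sizes with a matching inner dimension work):
+```julia
+julia> using Octavian
+
+julia> A = rand(100, 40); B = rand(40, 50); C = Matrix{Float64}(undef, 100, 50);
+
+julia> matmul!(C, A, B) ≈ A * B
+true
+```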
+""" +@inline function matmul!(C::AbstractMatrix{T}, A::AbstractMatrix, B::AbstractMatrix) where {T} + matmul!(C, A, B, One(), Zero(), nothing, nothing, ArrayInterface.contiguous_axis(C)) +end +@inline function matmul!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α) + matmul!(C, A, B, α, Zero(), nothing, nothing, ArrayInterface.contiguous_axis(C)) +end +@inline function matmul!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α, β) + matmul!(C, A, B, α, β, nothing, nothing, ArrayInterface.contiguous_axis(C)) +end +@inline function matmul!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α, β, nthread) + matmul!(C, A, B, α, β, nthread, nothing, ArrayInterface.contiguous_axis(C)) +end +@inline function matmul!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α, β, nthread, MKN, ::ArrayInterface.Contiguous{2}) + _matmul!(C', B', A', α, β, nthread, MKN) + return C +end +@inline function matmul!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α, β, nthread, MKN, ::ArrayInterface.Contiguous) + _matmul!(C, A, B, α, β, nthread, MKN) + return C +end + + +@inline function dontpack(pA::AbstractStridedPointer{Ta}, M, K, ::StaticInt{mc}, ::StaticInt{kc}, ::Type{Tc}, nspawn) where {mc, kc, Tc, Ta} + # MᵣW = VectorizationBase.pick_vector_width_val(Tc) * StaticInt{mᵣ}() + # TODO: perhaps consider K vs kc by themselves? + (contiguousstride1(pA) && ((M * K) ≤ (mc * kc) * nspawn >>> 1)) +end + +# passing MKN directly would let osmeone skip the size check. +@inline function _matmul!(C::AbstractMatrix{T}, A, B, α, β, nthread, MKN) where {T}#::Union{Nothing,Tuple{Vararg{Integer,3}}}) where {T} + M, K, N = MKN === nothing ? matmul_sizes(C, A, B) : MKN + W = VectorizationBase.pick_vector_width_val(T) + pA = zstridedpointer(A); pB = zstridedpointer(B); pC = zstridedpointer(C); + Cb = preserve_buffer(C); Ab = preserve_buffer(A); Bb = preserve_buffer(B); + GC.@preserve Cb Ab Bb begin + if maybeinline(M, N, T, ArrayInterface.is_column_major(A)) # check MUST be compile-time resolvable + inlineloopmul!(pC, pA, pB, One(), Zero(), M, K, N) + return + elseif (nᵣ ≥ N) || (M*K*N < (StaticInt{13824}() * W)) + loopmul!(pC, pA, pB, α, β, M, K, N) + return + else + __matmul!(pC, pA, pB, α, β, M, K, N, nthread) + return + end end +end - # check if we want to skip packing B - do_not_pack_B = (B isa DenseArray && (K * N ≤ _Kc * _Nc)) || N ≤ LoopVectorization.nᵣ - - Bptr = Base.unsafe_convert(Ptr{T}, BCACHE); - - Mc = evenly_divide(M, _Mc, VectorizationBase.pick_vector_width_val(T) * StaticInt{LoopVectorization.mᵣ}()) - Kc = evenly_divide(M, _Kc) - Nc = evenly_divide(M, _Nc, StaticInt{LoopVectorization.nᵣ}()) - - α = T(_α); - for n ∈ StaticInt{1}():Nc:N # loop 5 - nsize = min(Int(n + Nc), Int(N + 1)) - n - β = T(_β) - for k ∈ StaticInt{1}():Kc:K # loop 4 - ksize = min(Int(k + Kc), Int(K + 1)) - k - Bview = view(B, k:k+ksize-1, n:n+nsize-1) - # seperate out loop 3, because of _Bblock type instability - if do_not_pack_B - # _Bblock is likely to have the same type as _Bblock; it'd be nice to reduce the amount of compilation - # by homogenizing types across branches, but for now I'm prefering the simplicity of using `Bview` - # _Bblock = PointerMatrix(gesp1(stridedpointer(B), (k,n)), ksize, nsize) - # matmul_loop3!(C, T, Ablock, A, _Bblock, α, β, msize, ksize, nsize, M, k, n, Mc) - matmul_loop3!(C, T, A, Bview, α, β, ksize, nsize, M, k, n, Mc) - else - Bblock = PointerMatrix(Bptr, (ksize,nsize)) - unsafe_copyto_avx!(Bblock, Bview) - matmul_loop3!(C, T, A, Bblock, α, β, ksize, nsize, M, k, n, 
-            end
-            β = one(T) # re-writing to the same blocks of `C`, so replace original factor with `1`
-        end # loop 4
-    end # loop 5
-    C
-end
-
-function matmul_loop3!(C, ::Type{T}, A, Bblock, α, β, ksize, nsize, M, k, n, Mc) where {T}
-    full_range = StaticInt{1}():Mc:M
-    partitions = Iterators.partition(full_range, OCTAVIAN_NUM_TASKS[])
-    @sync for partition ∈ partitions
-        Threads.@spawn begin
-            # Create L2-buffer for `A`; it should be stack-allocated
-            Amem = L2Buffer(T)
-            Aptr = Base.unsafe_convert(Ptr{T}, Amem);
-            GC.@preserve Amem begin
-                for m ∈ partition # loop 3
-                    msize = min(Int(m + Mc), Int(M + 1)) - m
-                    Ablock = PointerMatrix(Aptr, (msize, ksize), true)
-                    unsafe_copyto_avx!(Ablock, view(A, m:m+msize-1, k:k+ksize-1))
-
-                    Cblock = view(C, m:m+msize-1, n:n+nsize-1)
-                    macrokernel!(Cblock, Ablock, Bblock, α, β)
-                end # loop 3
-            end # GC.@preserve
-        end
-    end
-end
-
-"""
-    matmul(A::AbstractMatrix, B::AbstractMatrix)
-
-Return the matrix product A*B.
-"""
-function matmul(A::AbstractMatrix{Ta}, B::AbstractMatrix{Tb}) where {Ta, Tb}
-    # TODO: use `similar` / make it more generic; ideally should work with `StaticArrays.MArray`
-    C = Matrix{promote_type(Ta, Tb)}(undef, size(A,1), size(B,2))
-    matmul!(C, A, B)
+# This function's name is sort of a pun: it splits aggressively (it does a lot of "splittin'"), which often means it will split `N`.
+function matmulsplitn!(C::AbstractStridedPointer{T}, A, B, α, β, ::StaticInt{Mc}, M, K, N, nspawn, ::Val{PACK}) where {T, Mc, PACK}
+    Mᵣ = StaticInt{mᵣ}(); Nᵣ = StaticInt{nᵣ}();
+    W = VectorizationBase.pick_vector_width_val(T)
+    MᵣW = Mᵣ*W
+    _Mblocks, Nblocks = divide_blocks(M, cld_fast(N, Nᵣ), nspawn, W)
+    Mbsize, Mrem, Mremfinal, Mblocks = split_m(M, _Mblocks, W)
+    # Nblocks = min(N, _Nblocks)
+    Nbsize, Nrem = divrem_fast(N, Nblocks)
+
+    _nspawn = Mblocks * Nblocks
+    Mbsize_Mrem, Mbsize_ = promote(Mbsize + W, Mbsize)
+    Nbsize_Nrem, Nbsize_ = promote(Nbsize + One(), Nbsize)
+
+    let _A = A, _B = B, _C = C, n = 0, tnum = 0, Nrc = Nblocks - Nrem, Mrc = Mblocks - Mrem, __Mblocks = Mblocks - One()
+        while true
+            nsize = ifelse(Nblocks > Nrc, Nbsize_Nrem, Nbsize_); Nblocks -= 1
+            let _A = _A, _C = _C, __Mblocks = __Mblocks
+                while __Mblocks != 0
+                    msize = ifelse(__Mblocks ≥ Mrc, Mbsize_Mrem, Mbsize_); __Mblocks -= 1
+                    launch_thread_mul!(_C, _A, _B, α, β, msize, K, nsize, (tnum += 1), Val{PACK}())
+                    _A = gesp(_A, (msize, Zero()))
+                    _C = gesp(_C, (msize, Zero()))
+                end
+                if Nblocks != 0
+                    launch_thread_mul!(_C, _A, _B, α, β, Mremfinal, K, nsize, (tnum += 1), Val{PACK}())
+                else
+                    call_loopmul!(_C, _A, _B, α, β, Mremfinal, K, nsize, Val{PACK}())
+                    waitonmultasks(CloseOpen(One(), _nspawn))
+                    return
+                end
+            end
+            _B = gesp(_B, (Zero(), nsize))
+            _C = gesp(_C, (Zero(), nsize))
+        end
+    end
+end
+
+function __matmul!(
+    C::AbstractStridedPointer{T}, A::AbstractStridedPointer, B::AbstractStridedPointer, α, β, M, K, N, nthread
+) where {T}
+    Mᵣ = StaticInt{mᵣ}(); Nᵣ = StaticInt{nᵣ}();
+    W = VectorizationBase.pick_vector_width_val(T)
+    Mc, Kc, Nc = block_sizes(T)
+    MᵣW = Mᵣ*W
+
+    # Not taking the fast path
+    # But maybe we don't want to thread anyway
+    # Maybe this is nested, or we have ≤ 1 thread
+    nt = _nthreads()
+    _nthread = nthread === nothing ? nt : min(nt, nthread)
+    not_in_threaded = iszero(ccall(:jl_in_threaded_region, Cint, ()))
+    if (!not_in_threaded) | (_nthread ≤ 1)
+        matmul_st_pack_dispatcher!(C, A, B, α, β, M, K, N, not_in_threaded)
+        return
+    end
+    # We are threading, but how many threads?
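+    # (Assumed reading of the heuristic below: spawn roughly one task per `128 * W` elements
+    # of `C`, clamped to `[1, _nthread]`, so that small products don't pay threading overhead.)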
+    L = StaticInt{128}() * W
+    # L = StaticInt{64}() * W
+    nspawn = clamp(div_fast(M * N, L), 1, _nthread)
+
+    # nkern = cld_fast(M * N, MᵣW * Nᵣ)
+    # Approach:
+    # Check if we don't want to pack A,
+    #    if not, aggressively subdivide
+    #    if so, check if we don't want to pack B
+    #       if not, check if we want to thread `N` loop anyway
+    #          if so, divide `M` first, then use ratio of desired divisions / divisions along `M` to calc divisions along `N`
+    #          if not, only thread along `M`. These don't need syncing, as we're not packing `B`
+    #       if so, `matmul_pack_A_and_B!`
+    #
+    # MᵣW * (MᵣW_mul_factor - One()) # gives a smaller Mc, then
+    # if 2M/nspawn is less than it, we don't pack `A`
+    # First check is: do we just want to split aggressively?
+    if VectorizationBase.CACHE_SIZE[2] === nothing || # do not pack A
+        dontpack(A, M, K, Mc, Kc, T, nspawn) || (W ≥ M) || (nᵣ*((NUM_CORES ≥ 8) ? max(nspawn,8) : 8) ≥ N)
+        # `nᵣ*nspawn ≥ N` is needed at the moment to avoid accidentally splitting `N` to be `< nᵣ` while packing
+        # Should probably handle that with a smarter splitting function...
+        matmulsplitn!(C, A, B, α, β, Mc, M, K, N, nspawn, Val{false}())
+    elseif ((nspawn*(W+W) > M) || (contiguousstride1(B) ? (roundtostaticint(Kc * Nc * StaticFloat{R₂Default}()) ≥ K * N) : (firstbytestride(B) ≤ 1600)))
+        matmulsplitn!(C, A, B, α, β, Mc, M, K, N, nspawn, Val{true}())
+    else # TODO: Allow splitting along `N` for `matmul_pack_A_and_B!`
+        matmul_pack_A_and_B!(C, A, B, α, β, M, K, N, nspawn, StaticFloat{W₁Default}(), StaticFloat{W₂Default}(), StaticFloat{R₁Default}(), StaticFloat{R₂Default}())
+    end
+    nothing
+end
+
+
+# If tasks is [0,1,2,3] (e.g., `CloseOpen(0,4)`), it will wait on `MULTASKS[i]` for `i = [1,2,3]`.
+function waitonmultasks(tasks)
+    for tid ∈ tasks
+        __wait(tid)
+    end
+end
+
+@inline allocref(::StaticInt{N}) where {N} = Ref{NTuple{N,UInt8}}()
+function matmul_pack_A_and_B!(
+    C::AbstractStridedPointer{T}, A::AbstractStridedPointer, B::AbstractStridedPointer, α, β, M, K, N,
+    tospawn::Int, ::StaticFloat{W₁}, ::StaticFloat{W₂}, ::StaticFloat{R₁}, ::StaticFloat{R₂}#, ::Val{1}
+) where {T,W₁,W₂,R₁,R₂}
+    W = VectorizationBase.pick_vector_width_val(T)
+    mᵣW = StaticInt{mᵣ}() * W
+    # atomicsync = Ref{NTuple{16,UInt}}()
+    Mbsize, Mrem, Mremfinal, _to_spawn = split_m(M, tospawn, W) # M is guaranteed to be > W because of the `W ≥ M` condition for `matmulsplitn!`...
+    atomicsync = allocref(StaticInt{NUM_CORES}()*StaticInt{2}()*StaticInt(CACHELINESIZE))
+    p = reinterpret(Ptr{UInt}, Base.unsafe_convert(Ptr{UInt8}, atomicsync))
+    GC.@preserve atomicsync begin
+        for i ∈ CloseOpen(2*_to_spawn)
+            _atomic_store!(p + i*CACHELINESIZE, zero(UInt))
+        end
+        # _atomic_umin!(p, zero(UInt)); _atomic_umin!(p + 8sizeof(UInt), zero(UInt))
+        # Mbsize, Mrem, Mremfinal, _to_spawn = split_m(M, tospawn, mᵣW) # M is guaranteed to be > W because of the `W ≥ M` condition for `matmulsplitn!`...
+        Mblock_Mrem, Mblock_ = promote(Mbsize + W, Mbsize)
+        u_to_spawn = _to_spawn % UInt
+        tid = 0
+        bc = _use_bcache()
+        bc_ptr = Base.unsafe_convert(typeof(pointer(C)), pointer(bc))
+        last_id = _to_spawn - One()
+        for m ∈ CloseOpen(last_id) # ...thus the fact that `CloseOpen()` iterates at least once is okay.
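+            # Each task below multiplies an `Mblock × K` strip of `A` with all of `B`; the workers
+            # cooperatively pack `B` into the shared `BCACHE` and synchronize through the
+            # cacheline-spaced counters behind `p`.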
+ Mblock = ifelse(m < Mrem, Mblock_Mrem, Mblock_) + launch_thread_mul!(C, A, B, α, β, Mblock, K, N, p, bc_ptr, m % UInt, u_to_spawn, StaticFloat{W₁}(),StaticFloat{W₂}(),StaticFloat{R₁}(),StaticFloat{R₂}()) + A = gesp(A, (Mblock, Zero())) + C = gesp(C, (Mblock, Zero())) + end + sync_mul!(C, A, B, α, β, Mremfinal, K, N, p, bc_ptr, last_id % UInt, u_to_spawn, StaticFloat{W₁}(), StaticFloat{W₂}(), StaticFloat{R₁}(), StaticFloat{R₂}()) + waitonmultasks(CloseOpen(One(), _to_spawn)) + end + _free_bcache!(bc) + return +end + +function sync_mul!( + C::AbstractStridedPointer{T}, A::AbstractStridedPointer, B::AbstractStridedPointer, α, β, M, K, N, atomicp::Ptr{UInt}, bc::Ptr, id::UInt, total_ids::UInt, + ::StaticFloat{W₁}, ::StaticFloat{W₂}, ::StaticFloat{R₁}, ::StaticFloat{R₂} +) where {T, W₁, W₂, R₁, R₂} + + (Mblock, Mblock_Mrem, Mremfinal, Mrem, Miter), (Kblock, Kblock_Krem, Krem, Kiter), (Nblock, Nblock_Nrem, Nrem, Niter) = + solve_block_sizes(T, M, K, N, StaticFloat{W₁}(), StaticFloat{W₂}(), StaticFloat{R₁}(), StaticFloat{R₂}(), One()) + + # atomics = atomicp + 8sizeof(UInt) + sync_iters = zero(UInt) + myp = atomicp + id *CACHELINESIZE + atomicp -= CACHELINESIZE + atomics = atomicp + total_ids*CACHELINESIZE + mys = myp + total_ids*(CACHELINESIZE % UInt) + Npackb_r_div, Npackb_r_rem = divrem_fast(Nblock_Nrem, total_ids) + Npackb_r_block_rem, Npackb_r_block_ = promote(Npackb_r_div + One(), Npackb_r_div) + + Npackb___div, Npackb___rem = divrem_fast(Nblock, total_ids) + Npackb___block_rem, Npackb___block_ = promote(Npackb___div + One(), Npackb___div) + + pack_r_offset = Npackb_r_div * id + min(id, Npackb_r_rem) + pack___offset = Npackb___div * id + min(id, Npackb___rem) + + pack_r_len = ifelse(id < Npackb_r_rem, Npackb_r_block_rem, Npackb_r_block_) + pack___len = ifelse(id < Npackb___rem, Npackb___block_rem, Npackb___block_) + + GC.@preserve BCACHE begin + for n in CloseOpen(Niter) + # Krem + # pack kc x nc block of B + nfull = n < Nrem + nsize = ifelse(nfull, Nblock_Nrem, Nblock) + pack_offset = ifelse(nfull, pack_r_offset, pack___offset) + pack_len = ifelse(nfull, pack_r_len, pack___len) + let A = A, B = B + for k ∈ CloseOpen(Kiter) + ksize = ifelse(k < Krem, Kblock_Krem, Kblock) + _B = default_zerobased_stridedpointer(bc, (One(), ksize)) + unsafe_copyto_avx!(gesp(_B, (Zero(), pack_offset)), gesp(B, (Zero(), pack_offset)), ksize, pack_len) + # synchronize before starting the multiplication, to ensure `B` is packed + # sync_iters += total_ids + # _mv = _atomic_add!(atomicp, one(UInt)) + # while _mv < sync_iters + # pause() + # _mv = _atomic_umax!(atomicp, zero(UInt)) + # end + _mv = _atomic_add!(myp, one(UInt)) + sync_iters += one(UInt) + let atomp = atomicp + for _ ∈ CloseOpen(total_ids) + atomp += CACHELINESIZE + atomp == myp && continue + # while !_atomic_cas_cmp!(atomp, sync_iters, sync_iters) + while _atomic_load(atomp) != sync_iters + pause() + end + end + end + # multiply + let A = A, B = _B, C = C + for m in CloseOpen(Miter) + msize = ifelse((m+1) == Miter, Mremfinal, ifelse(m < Mrem, Mblock_Mrem, Mblock)) + if k == 0 + packaloopmul!(C, A, B, α, β, msize, ksize, nsize) + else + packaloopmul!(C, A, B, α, One(), msize, ksize, nsize) + end + A = gesp(A, (msize, Zero())) + C = gesp(C, (msize, Zero())) + end + end + A = gesp(A, (Zero(), ksize)) + B = gesp(B, (ksize, Zero())) + # synchronize on completion so we wait until every thread is done with `Bpacked` before beginning to overwrite it + # _mv = _atomic_add!(atomics, one(UInt)) + # while _mv < sync_iters + # pause() + # _mv = 
_atomic_umax!(atomics, zero(UInt))
+ # end
+ _mv = _atomic_add!(mys, one(UInt))
+ let atoms = atomics
+ for _ ∈ CloseOpen(total_ids)
+ atoms += CACHELINESIZE
+ atoms == mys && continue
+ # while !_atomic_cas_cmp!(atoms, sync_iters, sync_iters)
+ while _atomic_load(atoms) != sync_iters
+ pause()
+ end
+ end
+ end
+ end
+ end
+ B = gesp(B, (Zero(), nsize))
+ C = gesp(C, (Zero(), nsize))
+ end
+ end # GC.@preserve
+ nothing
end
diff --git a/src/memory_buffer.jl b/src/memory_buffer.jl
index b804c82..ac38827 100644
--- a/src/memory_buffer.jl
+++ b/src/memory_buffer.jl
@@ -1,5 +1,57 @@
-Base.unsafe_convert(::Type{Ptr{T}}, m::MemoryBuffer) where {T} = Base.unsafe_convert(Ptr{T}, Base.pointer_from_objref(m))
-@inline MemoryBuffer(::StaticInt{L}, ::Type{T}) where {L,T} = MemoryBuffer{L,T}(undef)
-@inline function L2Buffer(::Type{T}) where {T}
- MemoryBuffer(StaticInt{VectorizationBase.CACHE_SIZE[2]}() ÷ VectorizationBase.static_sizeof(T), T)
+@inline Base.unsafe_convert(::Type{Ptr{T}}, d::MemoryBuffer) where {T} = Base.unsafe_convert(Ptr{T}, Base.pointer_from_objref(d))
+@inline MemoryBuffer{T}(::UndefInitializer, ::StaticInt{L}) where {L,T} = MemoryBuffer{L,T}(undef)
+Base.size(::MemoryBuffer{L}) where L = (L,)
+@inline Base.similar(::MemoryBuffer{L,T}) where {L,T} = MemoryBuffer{L,T}(undef)
+# Base.IndexStyle(::Type{<:MemoryBuffer}) = Base.IndexLinear()
+@inline function Base.getindex(m::MemoryBuffer{L,T}, i::Int) where {L,T}
+ @boundscheck checkbounds(m, i)
+ GC.@preserve m x = vload(pointer(m), VectorizationBase.lazymul(VectorizationBase.static_sizeof(T), i - one(i)))
+ x
end
+@inline function Base.setindex!(m::MemoryBuffer{L,T}, x, i::Int) where {L,T}
+ @boundscheck checkbounds(m, i)
+ GC.@preserve m vstore!(pointer(m), convert(T, x), lazymul(static_sizeof(T), i - one(i)))
+end
+
+if Sys.WORD_SIZE == 32
+ @inline function first_cache_buffer(::Type{T}) where {T}
+ ACACHEPTR[] + Threads.threadid() * FIRST__CACHE_SIZE - FIRST__CACHE_SIZE
+ end
+else
+ @inline function first_cache_buffer(::Type{T}) where {T}
+ MemoryBuffer{T}(undef, StaticInt{FIRST__CACHE_SIZE}() ÷ static_sizeof(T))
+ end
+end
+
+BCache(i::Integer) = BCache(pointer(BCACHE)+cld_fast(SECOND_CACHE_SIZE*i,Threads.nthreads()), i % UInt)
+BCache(::Nothing) = BCache(pointer(BCACHE), nothing)
+
+@inline Base.pointer(b::BCache) = b.p
+@inline Base.unsafe_convert(::Type{Ptr{T}}, b::BCache) where {T} = Base.unsafe_convert(Ptr{T}, b.p)
+
+function _use_bcache()
+ while Threads.atomic_cas!(BCACHE_LOCK, zero(UInt), typemax(UInt)) != zero(UInt)
+ pause()
+ end
+ return BCache(nothing)
+end
+@inline _free_bcache!(b::BCache{Nothing}) = reset_bcache_lock!()
+
+_use_bcache(::Nothing) = _use_bcache()
+function _use_bcache(i)
+ f = one(UInt) << i
+ while (Threads.atomic_or!(BCACHE_LOCK, f) & f) != zero(UInt)
+ pause()
+ end
+ BCache(i)
+end
+_free_bcache!(b::BCache{UInt}) = (Threads.atomic_xor!(BCACHE_LOCK, one(UInt) << b.i); nothing)
+
+"""
+ reset_bcache_lock!()
+
+The matmul routines do not currently wrap the `BCACHE` lock in `try`/`finally`, so if a call
+errors while holding it, you may need to call `reset_bcache_lock!()` manually.
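+
+One way to guard a single call by hand (an illustrative sketch, not part of the API;
+assumes no other Octavian call is running concurrently, and `C`, `A`, `B` are placeholders):
+
+    try
+        matmul!(C, A, B)
+    catch e
+        reset_bcache_lock!() # the lock may still be held after an error
+        rethrow(e)
+    end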
+""" +@inline reseet_bcache_lock!() = (BCACHE_LOCK[] = zero(UInt); nothing) + diff --git a/src/pointer_matrix.jl b/src/pointer_matrix.jl deleted file mode 100644 index fcc7c33..0000000 --- a/src/pointer_matrix.jl +++ /dev/null @@ -1,33 +0,0 @@ -PointerMatrix(p::P, s::S) where {T,P<:VectorizationBase.AbstractStridedPointer{T},S} = PointerMatrix{T,P,S}(p, s) -Base.size(A::PointerMatrix) = map(Int, A.s) -VectorizationBase.stridedpointer(A::PointerMatrix) = A.p -Base.unsafe_convert(::Type{Ptr{T}}, A::PointerMatrix{T}) where {T} = pointer(A.p) -@inline function Base.getindex(A::PointerMatrix, i::Integer, j::Integer) - @boundscheck checkbounds(A, i, j) - VectorizationBase.vload(VectorizationBase.stridedpointer(A), (i,j)) -end -@inline function Base.getindex(A::PointerMatrix, i::Integer) - @boundscheck checkbounds(A, i) - VectorizationBase.vload(VectorizationBase.stridedpointer(A), (i-1,)) -end -@inline function Base.setindex!(A::PointerMatrix{T}, v, i::Integer, j::Integer) where {T} - @boundscheck checkbounds(A, i, j) - VectorizationBase.vstore!(VectorizationBase.stridedpointer(A), convert(T, v), (i,j)) - v -end -@inline function Base.setindex!(A::PointerMatrix{T}, v, i::Integer) where {T} - @boundscheck checkbounds(A, i) - VectorizationBase.vstore!(VectorizationBase.stridedpointer(A), convert(T, v), (i-1,)) - v -end - -function PointerMatrix(Bptr::Ptr{T}, (M,N), padcols::Bool = false) where {T} - st = VectorizationBase.static_sizeof(T) - _M = padcols ? VectorizationBase.align(M, T) : M - # Should maybe add a more convenient column major constructor - Bsptr = VectorizationBase.stridedpointer( - Bptr, VectorizationBase.ArrayInterface.Contiguous{1}(), VectorizationBase.ArrayInterface.ContiguousBatch{0}(), - VectorizationBase.ArrayInterface.StrideRank{(1,2)}(), (st, _M*st), (StaticInt{1}(),StaticInt{1}()) - ) - PointerMatrix(Bsptr, (M,N)) -end diff --git a/src/staticfloats.jl b/src/staticfloats.jl new file mode 100644 index 0000000..2e7000d --- /dev/null +++ b/src/staticfloats.jl @@ -0,0 +1,41 @@ +Base.convert(::Type{T}, ::StaticFloat{N}) where {N,T<:AbstractFloat} = T(N) +Base.promote_rule(::Type{StaticFloat{N}}, ::Type{T}) where {N,T} = promote_type(T, Float64) + +@generated Base.:+(::StaticFloat{M}, ::StaticFloat{N}) where {M,N} = Expr(:call, Expr(:curly, :StaticFloat, M+N)) +@generated Base.:+(::StaticFloat{N}, ::StaticInt{M}) where {M,N} = Expr(:call, Expr(:curly, :StaticFloat, M+N)) +@generated Base.:+(::StaticInt{M}, ::StaticFloat{N}) where {M,N} = Expr(:call, Expr(:curly, :StaticFloat, M+N)) + +@generated Base.:+(::StaticFloat{N}, ::StaticInt{0}) where {N} = Expr(:call, Expr(:curly, :StaticFloat, N)) +@generated Base.:+(::StaticInt{0}, ::StaticFloat{N}) where {N} = Expr(:call, Expr(:curly, :StaticFloat, N)) + +@generated Base.:-(::StaticFloat{M}, ::StaticFloat{N}) where {M,N} = Expr(:call, Expr(:curly, :StaticFloat, M - N)) +@generated Base.:-(::StaticFloat{N}, ::StaticInt{M}) where {M,N} = Expr(:call, Expr(:curly, :StaticFloat, N - M)) +@generated Base.:-(::StaticInt{M}, ::StaticFloat{N}) where {M,N} = Expr(:call, Expr(:curly, :StaticFloat, M - N)) + +@generated Base.:-(::StaticFloat{N}, ::StaticInt{0}) where {N} = Expr(:call, Expr(:curly, :StaticFloat, N)) +@generated Base.:-(::StaticInt{0}, ::StaticFloat{N}) where {N} = Expr(:call, Expr(:curly, :StaticFloat, -N)) + + +@generated Base.:*(::StaticFloat{M}, ::StaticFloat{N}) where {M,N} = Expr(:call, Expr(:curly, :StaticFloat, M*N)) +@generated Base.:*(::StaticFloat{N}, ::StaticInt{M}) where {M,N} = Expr(:call, Expr(:curly, :StaticFloat, 
M*N))
+@generated Base.:*(::StaticInt{M}, ::StaticFloat{N}) where {M,N} = Expr(:call, Expr(:curly, :StaticFloat, M*N))
+
+@generated Base.:*(::StaticFloat{N}, ::StaticInt{0}) where {N} = Expr(:call, Expr(:curly, :StaticInt, 0))
+@generated Base.:*(::StaticInt{0}, ::StaticFloat{N}) where {N} = Expr(:call, Expr(:curly, :StaticInt, 0))
+@generated Base.:*(::StaticFloat{N}, ::StaticInt{1}) where {N} = Expr(:call, Expr(:curly, :StaticFloat, N))
+@generated Base.:*(::StaticInt{1}, ::StaticFloat{N}) where {N} = Expr(:call, Expr(:curly, :StaticFloat, N))
+
+@generated Base.:/(::StaticFloat{M}, ::StaticFloat{N}) where {M,N} = Expr(:call, Expr(:curly, :StaticFloat, M/N))
+@generated Base.:/(::StaticFloat{N}, ::StaticInt{M}) where {M,N} = Expr(:call, Expr(:curly, :StaticFloat, N/M))
+@generated Base.:/(::StaticInt{M}, ::StaticFloat{N}) where {M,N} = Expr(:call, Expr(:curly, :StaticFloat, M/N))
+
+@generated Base.sqrt(::StaticInt{M}) where {M} = Expr(:call, Expr(:curly, :StaticFloat, sqrt(M)))
+@generated Base.sqrt(::StaticFloat{M}) where {M} = Expr(:call, Expr(:curly, :StaticFloat, sqrt(M)))
+
+@generated Base.round(::StaticFloat{M}) where {M} = Expr(:call, Expr(:curly, :StaticFloat, round(M)))
+@generated roundtostaticint(::StaticFloat{M}) where {M} = Expr(:call, Expr(:curly, :StaticInt, round(Int, M)))
+roundtostaticint(x::AbstractFloat) = round(Int, x)
+@generated floortostaticint(::StaticFloat{M}) where {M} = Expr(:call, Expr(:curly, :StaticInt, floor(Int, M)))
+floortostaticint(x::AbstractFloat) = Base.fptosi(Int, x)
+@generated Base.inv(::StaticFloat{N}) where {N} = Expr(:call, Expr(:curly, :StaticFloat, inv(N)))
+
diff --git a/src/types.jl b/src/types.jl
index 72c8ba4..4a97feb 100644
--- a/src/types.jl
+++ b/src/types.jl
@@ -1,9 +1,18 @@
-struct PointerMatrix{T,P<:VectorizationBase.AbstractStridedPointer,S<:Tuple{Vararg{Integer,2}}} <: DenseMatrix{T}
- p::P
- s::S
+mutable struct MemoryBuffer{L,T} <: DenseVector{T}
+ data::NTuple{L,T}
+ @inline function MemoryBuffer{L,T}(::UndefInitializer) where {L,T}
+ @assert isbitstype(T) "Memory buffers must point to bits types, but `isbitstype($T) == false`."
+ new{L,T}()
+ end
end
-mutable struct MemoryBuffer{L,T}
- data::NTuple{L,T}
- MemoryBuffer{L,T}(::UndefInitializer) where {L,T} = new{L,T}()
+struct StaticFloat{N} <: AbstractFloat
+ StaticFloat{N}() where {N} = new{N::Float64}()
+end
+
+struct BCache{T<:Union{UInt,Nothing}}
+ p::Ptr{UInt8}
+ i::T
end
+
+
diff --git a/src/utils.jl b/src/utils.jl
index a2842cc..f457f47 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -1,29 +1,50 @@
-check_sizes(::StaticInt{M}, ::StaticInt{M}) where {M} = StaticInt{M}()
-check_sizes(::StaticInt{M}, ::StaticInt{N}) where {M,N} = throw(ErrorException("$M ≠ $N"))
-check_sizes(::StaticInt{M}, m) where {M} = (@assert M == m; StaticInt{M}())
-check_sizes(m, ::StaticInt{M}) where {M} = (@assert M == m; StaticInt{M}())
-check_sizes(m, n) = (@assert m == n; m)
+@inline _select(::StaticInt{M}, ::StaticInt{M}) where {M} = StaticInt{M}()
+@noinline _select(::StaticInt{M}, ::StaticInt{N}) where {M,N} = throw(ErrorException("$M ≠ $N"))
+@inline _select(::StaticInt{M}, _) where {M} = StaticInt{M}()
+@inline _select(_, ::StaticInt{M}) where {M} = StaticInt{M}()
+@inline _select(x, _) = x
"""
Checks sizes for compatibility, and preserves the static size information if given a mix of static and dynamic sizes.
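+
+For example (illustrative): if `size(C) == (StaticInt(4), 6)`, `size(A) == (4, StaticInt(5))`,
+and `size(B) == (5, 6)`, then `matmul_sizes(C, A, B)` returns `(StaticInt(4), StaticInt(5), 6)`.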
""" -function matmul_sizes(C, A, B) - MC, NC = VectorizationBase.ArrayInterface.size(C) - MA, KA = VectorizationBase.ArrayInterface.size(A) - KB, NB = VectorizationBase.ArrayInterface.size(B) - M = check_sizes(MC, MA) - K = check_sizes(KA, KB) - N = check_sizes(NC, NB) - M, K, N +@inline function matmul_sizes(C,A,B) + MC, NC = size(C) + MA, KA = size(A) + KB, NB = size(B) + @assert ((MC == MA) & (KA == KB) & (NC == NB)) "Size mismatch." + (_select(MA, MC), _select(KA, KB), _select(NB, NC)) end -function unsafe_copyto_avx!(B, A) - LoopVectorization.@avx for i ∈ eachindex(B, A) - B[i] = A[i] +function unsafe_copyto_avx!(pB, pA, M, N) + LoopVectorization.@avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M) + pB[m,n] = pA[m,n] end end -# @inline function gesp1(sp::P, inds) where {P <: VectorizationBase.AbstractStridedPointer} -# P(VectorizationBase.gep(sp, inds), sp.strd, sp.offsets) -# end +function default_stridedpointer_quote(::Type{T}, N, Ot) where {T} + C = 1 + B = 0 + R = Expr(:tuple) + o = Expr(:tuple) + xt = Expr(:tuple) + st = Expr(:call, Expr(:curly, :StaticInt, sizeof(T))) + for n ∈ 1:N + push!(xt.args, Expr(:call, :*, st, Expr(:ref, :x, n))) + push!(R.args, n) + push!(o.args, Expr(:call, Ot)) + end + quote + $(Expr(:meta,:inline)) + StridedPointer{$T,$N,$C,$B,$R}(ptr, $xt, $o) + end +end + +@generated function default_stridedpointer(ptr::Ptr{T}, x::X) where {T, N, X <: Tuple{Vararg{Integer,N}}} + default_stridedpointer_quote(T, N, :One) +end +@generated function default_zerobased_stridedpointer(ptr::Ptr{T}, x::X) where {T, N, X <: Tuple{Vararg{Integer,N}}} + default_stridedpointer_quote(T, N, :Zero) +end + + diff --git a/test/block_sizes.jl b/test/block_sizes.jl index 3e51e08..9f26279 100644 --- a/test/block_sizes.jl +++ b/test/block_sizes.jl @@ -1,4 +1,4 @@ @time @testset "block_sizes" begin - @test Octavian._calculate_L3(1, 1, 1, Val(true)) == 0 - @test Octavian._calculate_L3(1, 1, 1, Val(false)) == 1 + + end diff --git a/test/init.jl b/test/init.jl index eaf4e97..f11d282 100644 --- a/test/init.jl +++ b/test/init.jl @@ -2,7 +2,10 @@ withenv("OCTAVIAN_NUM_TASKS" => "") do @test Octavian._read_environment_num_tasks() == min(Threads.nthreads(), VectorizationBase.NUM_CORES) end + withenv("OCTAVIAN_NUM_TASKS" => "1") do + @test Octavian._read_environment_num_tasks() == 1 + end withenv("OCTAVIAN_NUM_TASKS" => "99") do - @test Octavian._read_environment_num_tasks() == 99 + @test Octavian._read_environment_num_tasks() == min(Threads.nthreads(), VectorizationBase.NUM_CORES) end end diff --git a/test/macrokernels.jl b/test/macrokernels.jl index 9bf9ee6..88c5803 100644 --- a/test/macrokernels.jl +++ b/test/macrokernels.jl @@ -10,7 +10,7 @@ C2 = deepcopy(C1) α = Float64(2.0) β = Float64(2.0) - Octavian.macrokernel!(C1, A1, B1, α, β) + Octavian.loopmul!(VectorizationBase.zstridedpointer(C1), VectorizationBase.zstridedpointer(A1), VectorizationBase.zstridedpointer(B1), α, β, m, k, n) C2 = α*A2*B2 + β*C2 @test C1 ≈ C2 end diff --git a/test/matmul.jl b/test/matmul.jl index 4b2b498..90ed726 100644 --- a/test/matmul.jl +++ b/test/matmul.jl @@ -11,10 +11,15 @@ A = rand(Float32, m, k) A′ = permutedims(A)' @show m, k, n - @test @time(Octavian.matmul(A, B)) ≈ A * B - @test @time(Octavian.matmul(A′, B)) ≈ A′ * B - @test @time(Octavian.matmul(A, B′)) ≈ A * B′ - @test @time(Octavian.matmul(A′, B′)) ≈ A′ * B′ + AB = A * B; A′B = A′ * B; AB′ = A * B′; A′B′ = A′ * B′; + @test @time(Octavian.matmul(A, B)) ≈ AB + @test @time(Octavian.matmul(A′, B)) ≈ A′B + @test @time(Octavian.matmul(A, B′)) ≈ AB′ + @test 
@time(Octavian.matmul(A′, B′)) ≈ A′B′ + @test @time(Octavian.matmul_serial(A, B)) ≈ AB + @test @time(Octavian.matmul_serial(A′, B)) ≈ A′B + @test @time(Octavian.matmul_serial(A, B′)) ≈ AB′ + @test @time(Octavian.matmul_serial(A′, B′)) ≈ A′B′ end end end @@ -33,10 +38,15 @@ end A = rand(Float64, m, k) A′ = permutedims(A)' @show m, k, n - @test @time(Octavian.matmul(A, B)) ≈ A * B - @test @time(Octavian.matmul(A′, B)) ≈ A′ * B - @test @time(Octavian.matmul(A, B′)) ≈ A * B′ - @test @time(Octavian.matmul(A′, B′)) ≈ A′ * B′ + AB = A * B; A′B = A′ * B; AB′ = A * B′; A′B′ = A′ * B′; + @test @time(Octavian.matmul(A, B)) ≈ AB + @test @time(Octavian.matmul(A′, B)) ≈ A′B + @test @time(Octavian.matmul(A, B′)) ≈ AB′ + @test @time(Octavian.matmul(A′, B′)) ≈ A′B′ + @test @time(Octavian.matmul_serial(A, B)) ≈ AB + @test @time(Octavian.matmul_serial(A′, B)) ≈ A′B + @test @time(Octavian.matmul_serial(A, B′)) ≈ AB′ + @test @time(Octavian.matmul_serial(A′, B′)) ≈ A′B′ end end end @@ -55,10 +65,15 @@ end A = rand(Int32, m, k) A′ = permutedims(A)' @show m, k, n - @test @time(Octavian.matmul(A, B)) == A * B - @test @time(Octavian.matmul(A′, B)) == A′ * B - @test @time(Octavian.matmul(A, B′)) == A * B′ - @test @time(Octavian.matmul(A′, B′)) == A′ * B′ + AB = A * B; A′B = A′ * B; AB′ = A * B′; A′B′ = A′ * B′; + @test @time(Octavian.matmul(A, B)) ≈ AB + @test @time(Octavian.matmul(A′, B)) ≈ A′B + @test @time(Octavian.matmul(A, B′)) ≈ AB′ + @test @time(Octavian.matmul(A′, B′)) ≈ A′B′ + @test @time(Octavian.matmul_serial(A, B)) ≈ AB + @test @time(Octavian.matmul_serial(A′, B)) ≈ A′B + @test @time(Octavian.matmul_serial(A, B′)) ≈ AB′ + @test @time(Octavian.matmul_serial(A′, B′)) ≈ A′B′ end end end @@ -77,11 +92,17 @@ end A = rand(Int64, m, k) A′ = permutedims(A)' @show m, k, n - @test @time(Octavian.matmul(A, B)) == A * B - @test @time(Octavian.matmul(A′, B)) == A′ * B - @test @time(Octavian.matmul(A, B′)) == A * B′ - @test @time(Octavian.matmul(A′, B′)) == A′ * B′ + AB = A * B; A′B = A′ * B; AB′ = A * B′; A′B′ = A′ * B′; + @test @time(Octavian.matmul(A, B)) ≈ AB + @test @time(Octavian.matmul(A′, B)) ≈ A′B + @test @time(Octavian.matmul(A, B′)) ≈ AB′ + @test @time(Octavian.matmul(A′, B′)) ≈ A′B′ + @test @time(Octavian.matmul_serial(A, B)) ≈ AB + @test @time(Octavian.matmul_serial(A′, B)) ≈ A′B + @test @time(Octavian.matmul_serial(A, B′)) ≈ AB′ + @test @time(Octavian.matmul_serial(A′, B′)) ≈ A′B′ end end end end + diff --git a/test/pointer_matrix.jl b/test/pointer_matrix.jl deleted file mode 100644 index ff00a1b..0000000 --- a/test/pointer_matrix.jl +++ /dev/null @@ -1,12 +0,0 @@ -@time Test.@testset "PointerMatrix" begin - mem = Octavian.L2Buffer(Float64); - ptr = Base.unsafe_convert(Ptr{Float64}, mem) - block = Octavian.PointerMatrix(ptr, (10, 20)) - Test.@test Base.unsafe_convert(Ptr{Float64}, block) == pointer(block.p) - GC.@preserve mem begin - block[1] = 2.3 - Test.@test block[1] == 2.3 - block[4, 5] = 67.89 - Test.@test block[4, 5] == 67.89 - end -end diff --git a/test/runtests.jl b/test/runtests.jl index f30b6db..384e49b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -19,12 +19,14 @@ include("test_suite_preamble.jl") Random.seed!(123) +using Aqua: test_all +test_all(Octavian) + +include("utils.jl") include("block_sizes.jl") include("init.jl") include("macrokernels.jl") include("matmul_coverage.jl") -include("pointer_matrix.jl") -include("utils.jl") if !coverage include("matmul.jl") diff --git a/test/utils.jl b/test/utils.jl index 6062834..9b6303a 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -2,11 
+2,12 @@ @testset "check_sizes" begin a = Octavian.StaticInt{1}() b = Octavian.StaticInt{2}() - @test Octavian.check_sizes(a, a) == a - @test_throws ErrorException Octavian.check_sizes(a, b) - @test Octavian.check_sizes(a, 1) == a - @test_throws AssertionError Octavian.check_sizes(a, 100) - @test Octavian.check_sizes(2, b) == b - @test_throws AssertionError Octavian.check_sizes(200, b) + @test Octavian._select(a, a) === a + @test Octavian._select(a, 1) === a + @test Octavian._select(1, a) === a + @test Octavian._select(1, 1) === 1 + @test_throws AssertionError Octavian.matmul_sizes(rand(3,2), rand(3,4), rand(5,2)) + @test_throws AssertionError Octavian.matmul_sizes(rand(3,2), rand(3,4), rand(4,5)) + @test_throws AssertionError Octavian.matmul_sizes(rand(3,2), rand(5,4), rand(4,2)) end end
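Note on the synchronization in `sync_mul!` above: the two spin loops form a counter barrier. Each id increments its own cache-line-spaced counter (`myp` after packing its share of `B`, `mys` after finishing its multiplies with the packed block), then spins until every other counter reaches the current iteration count, so no thread reads a half-packed `B` or overwrites one still in use. A minimal standalone sketch of the same pattern, using `Base.Threads.Atomic` counters instead of the raw pointers above (the names and setup here are illustrative, not part of Octavian):

    using Base.Threads: Atomic, atomic_add!

    # One counter per participant; the real code spaces them a cache line apart
    # (CACHELINESIZE) to avoid false sharing between spinning threads.
    function counter_barrier!(counters::Vector{Atomic{UInt}}, id::Int, phase::UInt)
        atomic_add!(counters[id], one(UInt))    # publish: `id` has reached `phase`
        for (i, c) in pairs(counters)
            i == id && continue
            while c[] < phase                   # spin until everyone else has, too
                ccall(:jl_cpu_pause, Cvoid, ()) # spin-wait hint, like `pause()`
            end
        end
    end

    # Hypothetical usage: `nt` tasks each call this once per packing phase,
    # with counters = [Atomic{UInt}(0) for _ in 1:nt] and phase = 1, 2, 3, ...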