3 changes: 3 additions & 0 deletions .gitignore
@@ -4,4 +4,7 @@
/docs/build/
Manifest.toml
*~
*#
*#*
*#*#

7 changes: 5 additions & 2 deletions Project.toml
@@ -1,20 +1,23 @@
name = "Octavian"
uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
authors = ["Mason Protter", "Chris Elrod", "Dilum Aluthge", "contributors"]
version = "0.1.0"
version = "0.2.0-DEV"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
ThreadingUtilities = "8290d209-cae3-49c0-8002-c8c24d57dab5"
VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"

[compat]
ArrayInterface = "2.14"
LoopVectorization = "0.9.18"
ThreadingUtilities = "0.2"
VectorizationBase = "0.15.2"
julia = "1.5"

[extras]
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -24,4 +27,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"

[targets]
test = ["BenchmarkTools", "InteractiveUtils", "LinearAlgebra", "LoopVectorization", "Random", "VectorizationBase", "Test"]
test = ["Aqua", "BenchmarkTools", "InteractiveUtils", "LinearAlgebra", "LoopVectorization", "Random", "VectorizationBase", "Test"]
30 changes: 20 additions & 10 deletions src/Octavian.jl
@@ -1,23 +1,33 @@
module Octavian

import ArrayInterface
import LoopVectorization
import VectorizationBase
using VectorizationBase, ArrayInterface, LoopVectorization

using VectorizationBase: StaticInt
using VectorizationBase: align, gep, AbstractStridedPointer, AbstractSIMDVector, vnoaliasstore!, staticm1,
static_sizeof, lazymul, vmul_fast, StridedPointer, gesp, zero_offsets, pause,
CACHE_COUNT, NUM_CORES, CACHE_INCLUSIVITY, zstridedpointer
using LoopVectorization: maybestaticsize, mᵣ, nᵣ, preserve_buffer, CloseOpen
using ArrayInterface: StaticInt, Zero, One, OptionallyStaticUnitRange, size, strides, offsets, indices,
static_length, static_first, static_last, axes,
dense_dims, DenseDims, stride_rank, StrideRank

export matmul
export matmul!
using ThreadingUtilities:
_atomic_add!, _atomic_umax!, _atomic_umin!,
_atomic_load, _atomic_store!, _atomic_cas_cmp!,
SPIN, WAIT, TASK, LOCK, STUP, taskpointer,
wake_thread!, __wait, load, store!

export matmul!, matmul_serial!, matmul, matmul_serial, StaticInt

include("global_constants.jl")
include("types.jl")

include("staticfloats.jl")
include("integerdivision.jl")
include("memory_buffer.jl")
include("block_sizes.jl")
include("funcptrs.jl")
include("macrokernels.jl")
include("matmul.jl")
include("memory_buffer.jl")
include("pointer_matrix.jl")
include("utils.jl")
include("matmul.jl")

include("init.jl") # `Octavian.__init__()` is defined in this file

283 changes: 223 additions & 60 deletions src/block_sizes.jl
@@ -1,70 +1,233 @@

first_effective_cache(::Type{T}) where {T} = StaticInt{FIRST__CACHE_SIZE}() ÷ static_sizeof(T)
second_effective_cache(::Type{T}) where {T} = StaticInt{SECOND_CACHE_SIZE}() ÷ static_sizeof(T)

function block_sizes(::Type{T}, _α, _β, R₁, R₂) where {T}
W = VectorizationBase.pick_vector_width_val(T)
α = _α * W
β = _β * W
L₁ₑ = first_effective_cache(T) * R₁
L₂ₑ = second_effective_cache(T) * R₂
block_sizes(W, α, β, L₁ₑ, L₂ₑ)
end
function block_sizes(W, α, β, L₁ₑ, L₂ₑ)
MᵣW = StaticInt{mᵣ}() * W

Mc = floortostaticint(√(L₁ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₂ₑ) / MᵣW) * MᵣW
Kc = roundtostaticint(√(L₁ₑ)*√(L₂ₑ)/√(L₁ₑ*β + L₂ₑ*α))
Nc = floortostaticint(√(L₂ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₁ₑ) / StaticInt{nᵣ}()) * StaticInt{nᵣ}()

Mc, Kc, Nc
end
function block_sizes(::Type{T}) where {T}
block_sizes(T, StaticFloat{W₁Default}(), StaticFloat{W₂Default}(), StaticFloat{R₁Default}(), StaticFloat{R₂Default}())
end

"""
block_sizes(::Type{T}) -> (Mc, Kc, Nc)

Returns the dimensions of our macrokernel, which iterates over the microkernel. That is, in
calculating `C = A * B`, our macrokernel will be called on `Mc × Kc` blocks of `A`, multiplying
them with `Kc × Nc` blocks of `B`, to update `Mc × Nc` blocks of `C`.

We want these blocks to fit nicely in the cache. There is a lot of room for improvement here, but this
initial implementation should work reasonably well.
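
To make the blocking concrete, here is a minimal single-threaded sketch of this structure (the name
`blocked_matmul!` and the plain triple loop standing in for the microkernel are purely illustrative, not
Octavian's implementation, which packs the blocks and multithreads):

```julia
function blocked_matmul!(C, A, B, Mc, Kc, Nc)
    M, K = size(A); N = size(B, 2)
    fill!(C, zero(eltype(C)))
    for n in 1:Nc:N, k in 1:Kc:K, m in 1:Mc:M
        # one "macrokernel" call: update an Mc × Nc block of C with an
        # Mc × Kc block of A times a Kc × Nc block of B
        for j in n:min(n + Nc - 1, N), p in k:min(k + Kc - 1, K), i in m:min(m + Mc - 1, M)
            C[i, j] += A[i, p] * B[p, j]
        end
    end
    return C
end
```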


The constants `LoopVectorization.mᵣ` and `LoopVectorization.nᵣ` are the factors by which LoopVectorization
wants to unroll the rows and columns of the microkernel, respectively. `LoopVectorization` defines these
constants by running its analysis on a gemm kernel; they are exposed for convenience, to make it easier to
implement matrix multiplication.
It also wants to vectorize the rows by `W = VectorizationBase.pick_vector_width_val(T)`.
Thus, the microkernel's dimensions are `(W * mᵣ) × nᵣ`; that is, the microkernel updates a `(W * mᵣ) × nᵣ`
block of `C`.

Because the macrokernel iterates over tiles and repeatedly applies the microkernel, we would prefer the
macrokernel's dimensions to be an integer multiple of the microkernel's.
That is, we want `Mc` to be an integer multiple of `W * mᵣ` and `Nc` to be an integer multiple of `nᵣ`.

Additionally, we want our blocks of `A` to fit in the core-local L2 cache.
Empirically, I found that when using `Float64` arrays, 72 rows works well on Haswell (where `W * mᵣ = 8`)
and 96 works well for Cascadelake (where `W * mᵣ = 24`).
So I kind of heuristically multiply `W * mᵣ` by `4` given 32 vector registers (as in Cascadelake), which
would yield `96`, and multiply by `9` otherwise, which would give `72` on Haswell.
Ideally, we'd have a better means of picking.
I suspect relatively small numbers work well because I'm currently using a column-major memory layout for
the internal packing arrays. A column-major memory layout means that if our macro-kernel had a lot of rows,
moving across columns would involve reading memory far apart, moving across memory pages more rapidly,
hitting the TLB harder. This is why libraries like OpenBLAS and BLIS don't use a column-major layout, but
reshape into a 3-d array, e.g. `A` will be reshaped into a `Mᵣ × Kc × (Mc ÷ Mᵣ)` array (also sometimes
referred to as a tile-major matrix), so that all memory reads happen in consecutive memory locations.
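
As a concrete illustration of the numbers quoted above (assuming, for illustration, `W = 4, mᵣ = 2` on
Haswell and `W = 8, mᵣ = 3` on Cascadelake, consistent with the `W * mᵣ` products mentioned; the helper
`heuristic_mc` is hypothetical, not an Octavian function):

```julia
heuristic_mc(W, mᵣ, nregisters) = (nregisters == 32 ? 4 : 9) * W * mᵣ
heuristic_mc(4, 2, 16)  # 72 rows for a Haswell-like CPU (W * mᵣ = 8, 16 vector registers)
heuristic_mc(8, 3, 32)  # 96 rows for a Cascadelake-like CPU (W * mᵣ = 24, 32 vector registers)
```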


Now that we have `Mc`, we use it and the `L2` cache size to calculate `Kc`, but shave off a percent to
leave room in the cache for some other things.

We want our blocks of `B` to fit in the `L3` cache, so we can use the `L3` cache size and `Kc` to
similarly calculate `Nc`, with the additional note that we also divide and multiply by `nᵣ` to ensure
that `Nc` is an integer multiple of `nᵣ`.
split_m(M, Miters_base, W)

Splits `M` into at most `Miters_base` iterations.
For example, if we wish to divide `517` into roughly `7` blocks using multiples of `8`:

```julia
julia> split_m(517, 7, 8)
(72, 2, 69, 7)
```

This suggests we have a base block size of `72`, with two iterations receiving an extra `8` (`= W`),
and a final block of `69` to handle the remainder. It also tells us that there are `7` total iterations, as requested.
```julia
julia> 80*2 + 72*(7-2-1) + 69
517
```
This is meant to produce roughly the requested number of blocks, with relatively even sizes.

This method is used fairly generally.
"""
function block_sizes(::Type{T}) where {T}
_L1, _L2, __L3, _L4 = VectorizationBase.CACHE_SIZE
L1c, L2c, L3c, L4c = VectorizationBase.CACHE_COUNT
# TODO: something better than treating it as 4 MiB if there is no L3 cache
# one possibility is to focus on the L1 and L2 caches instead of the L2 and L3.
_L3 = something(__L3, 4194304)
@assert L1c == L2c == VectorizationBase.NUM_CORES
@inline function split_m(M, _Mblocks, W)
Miters = cld_fast(M, W)
Mblocks = min(_Mblocks, Miters)
Miter_per_block, Mrem = divrem_fast(Miters, Mblocks)
Mbsize = Miter_per_block * W
Mremfinal = M - Mbsize*(Mblocks-1) - Mrem * W
Mbsize, Mrem, Mremfinal, Mblocks
end

"""
solve_block_sizes(::Type{T}, M, K, N, α, β, R₂, R₃)

This function returns iteration/blocking descriptions `Mc`, `Kc`, and `Nc` for use when packing both `A` and `B`.

It tries to roughly minimize the cost
```julia
MKN/(Kc*W) + α * MKN/Mc + β * MKN/Nc
```
subject to constraints
```julia
Mc - M ≤ 0
Kc - K ≤ 0
Nc - N ≤ 0
Mc*Kc - L₁ₑ ≤ 0
Kc*Nc - L₂ₑ ≤ 0
```
That is, our constraints say that our block sizes shouldn't be bigger than the actual dimensions, and also that
our packed `A` (`Mc × Kc`) should fit into the first packing cache (generally the `L₂`), and our packed
`B` (`Kc × Nc`) should fit into the second packing cache (generally the `L₃`).

st = VectorizationBase.static_sizeof(T)
Our cost model consists of three components:
1. Cost of moving data in and out of registers. This is done `(M/Mᵣ * K/Kc * N/Nᵣ)` times and the cost per iteration is `(Mᵣ/W * Nᵣ)`.
2. Cost of moving strips of the packed `B` from the lower cache levels to the highest cache level when multiplying `Aₚ * Bₚ`.
This is done `(M / Mc * K / Kc * N / Nᵣ)` times, and the cost per iteration is proportional to `(Kc * Nᵣ)`.
`α` is the proportionality-constant parameter.
3. Cost of packing `A`. This is done `(M / Mc * K / Kc * N / Nc)` times, and the cost per iteration is proportional to
`(Mc * Kc)`. `β` is the proportionality-constant parameter.

L2 = (StaticInt{_L2}() - StaticInt{_L1}()) ÷ st
L3 = _calculate_L3(StaticInt{_L2}(), StaticInt{_L3}(), st, Val{VectorizationBase.CACHE_INCLUSIVITY[3]}())
As `W` is a constant, we multiply the cost by `W` and absorb it into `α` and `β`. We drop it from the description
from here on out.
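
Written out as a sketch (these helper names are illustrative, not functions defined in Octavian), the
scaled objective and its feasibility check are:

```julia
blocking_cost(M, K, N, Mc, Kc, Nc, α, β) = M*K*N/Kc + α * M*K*N/Mc + β * M*K*N/Nc
blocking_feasible(M, K, N, Mc, Kc, Nc, L₁ₑ, L₂ₑ) =
    (Mc ≤ M) && (Kc ≤ K) && (Nc ≤ N) && (Mc*Kc ≤ L₁ₑ) && (Kc*Nc ≤ L₂ₑ)
```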

In the full problem, we would have the Lagrangian, with μ < 0:
f((Mc,Kc,Nc), (μ₁,μ₂,μ₃,μ₄,μ₅)) =
    MKN/Kc + α * MKN/Mc + β * MKN/Nc - μ₁(Mc - M) - μ₂(Kc - K) - μ₃(Nc - N) - μ₄(Mc*Kc - L₁ₑ) - μ₅(Kc*Nc - L₂ₑ)
```julia
0 = ∂L/∂Mc = - α * MKN / Mc² - μ₁ - μ₄*Kc
0 = ∂L/∂Kc = - MKN / Kc² - μ₂ - μ₄*Mc - μ₅*Nc
0 = ∂L/∂Nc = - β * MKN / Nc² - μ₃ - μ₅*Kc
0 = ∂L/∂μ₁ = M - Mc
0 = ∂L/∂μ₂ = K - Kc
0 = ∂L/∂μ₃ = N - Nc
0 = ∂L/∂μ₄ = L₁ₑ - Mc*Kc
0 = ∂L/∂μ₅ = L₂ₑ - Kc*Nc
```
The first 3 constraints complicate things, because they're trivially solved by setting `M = Mc`, `K = Kc`, and `N = Nc`.
But this will violate the last two constraints in general; normally we will be on the interior of the inequalities,
meaning we'd be dropping those constraints. Doing so, this leaves us with:

First, let's just solve the cost without constraints 1-3:
```julia
0 = ∂L/∂Mc = - α * MKN / Mc² - μ₄*Kc
0 = ∂L/∂Kc = - MKN / Kc² - μ₄*Mc - μ₅*Nc
0 = ∂L/∂Nc = - β * MKN / Nc² - μ₅*Kc
0 = ∂L/∂μ₄ = L₁ₑ - Mc*Kc
0 = ∂L/∂μ₅ = L₂ₑ - Kc*Nc
```
Solving:
```julia
Mc = √(L₁ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₂ₑ)
Kc = √(L₁ₑ)*√(L₂ₑ)/√(L₁ₑ*β + L₂ₑ*α)
Nc = √(L₂ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₁ₑ)
μ₄ = -K*√(L₂ₑ)*M*N*α/(L₁ₑ^(3/2)*√(L₁ₑ*β + L₂ₑ*α))
μ₅ = -K*√(L₁ₑ)*M*N*β/(L₂ₑ^(3/2)*√(L₁ₑ*β + L₂ₑ*α))
```
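
As a quick numeric sanity check (the effective cache sizes and weights below are made up for illustration),
these closed forms land exactly on the two active constraints:
```julia
L₁ₑ, L₂ₑ, α, β = 2.0^15, 2.0^20, 0.1, 0.05   # made-up effective cache sizes and weights
Mc = √(L₁ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₂ₑ)
Kc = √(L₁ₑ)*√(L₂ₑ)/√(L₁ₑ*β + L₂ₑ*α)
Nc = √(L₂ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₁ₑ)
Mc * Kc ≈ L₁ₑ  # true: packed A exactly fills the first packing cache
Kc * Nc ≈ L₂ₑ  # true: packed B exactly fills the second packing cache
```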
These solutions are independent of the matrix sizes.
The approach we'll take here is solving for `Nc`, `Kc`, and then finally `Mc` one after the other, incorporating sizes.

Starting with `N`, we check how many iterations would be implied by `Nc`, and then choose the smallest value that would
yield that number of iterations. This also ensures that `Nc ≤ N`.
```julia
Niter = cld(N, √(L₂ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₁ₑ))
Nblock, Nrem = divrem(N, Niter)
Nblock_Nrem = Nblock + (Nrem > 0)
```
We have `Nrem` iterations of size `Nblock_Nrem`, and `Niter - Nrem` iterations of size `Nblock`.
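
For example, with hypothetical numbers, say `N = 1002` and an optimal `Nc` of roughly `300`:
```julia
N, Nc_init = 1002, 300                         # hypothetical values
Niter = cld(N, Nc_init)                        # 4
Nblock, Nrem = divrem(N, Niter)                # (250, 2)
Nblock_Nrem = Nblock + (Nrem > 0)              # 251
Nrem * Nblock_Nrem + (Niter - Nrem) * Nblock   # 1002 == N
```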

We can now make `Nc = Nblock_Nrem` a constant, and solve the remaining three equations again:
```julia
0 = ∂L/∂Mc = - α * MKN / Mc² - μ₄*Kc
0 = ∂L/∂Kc = - MKN / Kc² - μ₄*Mc - μ₅*Nc
0 = ∂L/∂μ₄ = L₁ₑ - Mc*Kc
```
yielding
```julia
Mc = √(L₁ₑ)*√(α)
Kc = √(L₁ₑ)/√(α)
μ₄ = -K*M*N*√(α)/L₁ₑ^(3/2)
```
We proceed in the same fashion as for `Nc`, being sure to reapply the `Kc * Nc ≤ L₂ₑ` constraint:
```julia
Kiter = cld(K, min(√(L₁ₑ)/√(α), L₂ₑ/Nc))
Kblock, Krem = divrem(K, Kiter)
Kblock_Krem = Kblock + (Krem > 0)
```
This leaves the `Mc` partitioning, for which we use the constraint `Mc * Kc ≤ L₁ₑ` to set
the initial number of proposed iterations as `cld(M, L₁ₑ / Kcm)` for calling `split_m`.
```julia
Mbsize, Mrem, Mremfinal, Mblocks = split_m(M, cld(M, L₁ₑ / Kcm), StaticInt{W}())
```

Note that for synchronization on `B`, all threads must have the same values for `Kc` and `Nc`.
`K` and `N` will be equal between threads, but `M` may differ. By calculating `Kc` and `Nc`
independently of `M`, this algorithm guarantees all threads are on the same page.
"""
@inline function solve_block_sizes(::Type{T}, M, K, N, _α, _β, R₂, R₃, Wfactor) where {T}
W = VectorizationBase.pick_vector_width_val(T)
Mr = StaticInt{LoopVectorization.mᵣ}()
Nr = StaticInt{LoopVectorization.nᵣ}()
α = _α * W
β = _β * W
L₁ₑ = first_effective_cache(T) * R₂
L₂ₑ = second_effective_cache(T) * R₃

Mc = (VectorizationBase.REGISTER_COUNT == 32 ? StaticInt{4}() : StaticInt{9}()) * Mr * W
Kc = (StaticInt{5}() * L2) ÷ (StaticInt{7}() * Mc)
Nc = ((StaticInt{5}() * L3) ÷ (StaticInt{7}() * Kc * Nr)) * Nr
# Nc_init = round(Int, √(L₂ₑ)*√(α * L₂ₑ + β * L₁ₑ)/√(L₁ₑ))
Nc_init⁻¹ = √(L₁ₑ) / (√(L₂ₑ)*√(α * L₂ₑ + β * L₁ₑ))

Niter = cldapproxi(N, Nc_init⁻¹) # approximate `ceil`
Nblock, Nrem = divrem_fast(N, Niter)
Nblock_Nrem = Nblock + One()#(Nrem > 0)

Mc, Kc, Nc
((Mblock, Mblock_Mrem, Mremfinal, Mrem, Miter), (Kblock, Kblock_Krem, Krem, Kiter)) = solve_McKc(T, M, K, Nblock_Nrem, _α, _β, R₂, R₃, Wfactor)

(Mblock, Mblock_Mrem, Mremfinal, Mrem, Miter), (Kblock, Kblock_Krem, Krem, Kiter), promote(Nblock, Nblock_Nrem, Nrem, Niter)
end
# Takes Nc, calcs Mc and Kc
@inline function solve_McKc(::Type{T}, M, K, Nc, _α, _β, R₂, R₃, Wfactor) where {T}
W = VectorizationBase.pick_vector_width_val(T)
α = _α * W
β = _β * W
L₁ₑ = first_effective_cache(T) * R₂
L₂ₑ = second_effective_cache(T) * R₃

Kc_init⁻¹ = Base.FastMath.max_fast(√(α/L₁ₑ), Nc*inv(L₂ₑ))
Kiter = cldapproxi(K, Kc_init⁻¹) # approximate `ceil`
Kblock, Krem = divrem_fast(K, Kiter)
Kblock_Krem = Kblock + One()

Miter_init = cldapproxi(M * inv(L₁ₑ), Kblock_Krem) # Miter = M * Kc / L₁ₑ
Mbsize, Mrem, Mremfinal, Miter = split_m(M, Miter_init, W * Wfactor)
Mblock_Mrem = Mbsize + W * Wfactor

promote(Mbsize, Mblock_Mrem, Mremfinal, Mrem, Miter), promote(Kblock, Kblock_Krem, Krem, Kiter)
end

@inline cldapproxi(n, d⁻¹) = Base.fptosi(Int, Base.FastMath.add_fast(Base.FastMath.mul_fast(n, d⁻¹), 0.9999999999999432)) # approximate `ceil`
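# For example (illustrative): `cldapproxi(517, 1/72)` computes 517/72 + 0.999… ≈ 8.18 and truncates to 8,
# matching `cld(517, 72) == 8`, trading the integer division for a multiplication by a precomputed reciprocal.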

"""
find_first_acceptable(M, W)

Finds the first combination of `Miter` and `Niter` that doesn't make `M` too small while producing `Miter * Niter = NUM_CORES`.
This would be awkward if there were computers with prime numbers of cores. I should probably consider that possibility at some point.
"""
@inline function find_first_acceptable(M, W)
Mᵣ = StaticInt{mᵣ}() * W
for (miter,niter) ∈ CORE_FACTORS
if miter * ((MᵣW_mul_factor - One()) * Mᵣ) ≤ M + (W + W)
return miter, niter
end
end
last(CORE_FACTORS)
end
"""
divide_blocks(M, Ntotal, _nspawn, W)

Splits both `M` and `N` into blocks when trying to spawn a large number of threads relative to the size of the matrices.
"""
@inline function divide_blocks(M, Ntotal, _nspawn, W)
_nspawn == NUM_CORES && return find_first_acceptable(M, W)

Miter = clamp(div_fast(M, W*StaticInt{mᵣ}() * MᵣW_mul_factor), 1, _nspawn)
nspawn = div_fast(_nspawn, Miter)
if (nspawn ≤ 1) & (Miter < _nspawn)
# rebalance Miter
Miter = cld_fast(_nspawn, cld_fast(_nspawn, Miter))
nspawn = div_fast(_nspawn, Miter)
end
Miter, cld_fast(Ntotal, max(2, cld_fast(Ntotal, nspawn)))
end

_calculate_L3(_L2, _L3, st, ::Val{true}) = (_L3 - _L2) ÷ st
_calculate_L3(_L2, _L3, st, ::Val{false}) = _L3 ÷ st