Skip to content

Commit 3b2e64a

Browse files
authored
Merge bceb269 into fe21ae1
2 parents fe21ae1 + bceb269 commit 3b2e64a

File tree

7 files changed

+365
-394
lines changed

7 files changed

+365
-394
lines changed

Project.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "Octavian"
22
uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
33
authors = ["Mason Protter", "Chris Elrod", "Dilum Aluthge", "contributors"]
4-
version = "0.2.8"
4+
version = "0.2.9"
55

66
[deps]
77
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
@@ -11,9 +11,9 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
1111

1212
[compat]
1313
ArrayInterface = "3"
14-
LoopVectorization = "0.11"
14+
LoopVectorization = "0.11.2"
1515
ThreadingUtilities = "0.2"
16-
VectorizationBase = "0.17"
16+
VectorizationBase = "0.18.1"
1717
julia = "1.5"
1818

1919
[extras]

benchmark/staticarraybench.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,11 +63,11 @@ rename!(df, matmulmethodnames);
6363
df.Size = sizerange
6464

6565
function pick_suffix(desc = "")
66-
suffix = if Octavian.VectorizationBase.has_feature("x86_64_avx512f")
66+
suffix = if Bool(Octavian.VectorizationBase.has_feature(Val(:x86_64_avx512f)))
6767
"AVX512"
68-
elseif Octavian.VectorizationBase.has_feature("x86_64_avx2")
68+
elseif Bool(Octavian.VectorizationBase.has_feature(Val(:x86_64_avx2)))
6969
"AVX2"
70-
elseif Octavian.VectorizationBase.has_feature("x86_64_avx")
70+
elseif Bool(Octavian.VectorizationBase.has_feature(Val(:x86_64_avx)))
7171
"AVX"
7272
else
7373
"REGSIZE$(Octavian.VectorizationBase.register_size())"

src/Octavian.jl

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,12 @@ module Octavian
33
using VectorizationBase, ArrayInterface, LoopVectorization
44

55
using VectorizationBase: align, AbstractStridedPointer, zstridedpointer,
6-
static_sizeof, lazymul, StridedPointer, gesp, pause, pick_vector_width_val,
7-
snum_cache_levels, scache_size, snum_cores, num_cores, cache_inclusivity, scacheline_size
6+
static_sizeof, lazymul, StridedPointer, gesp, pause, pick_vector_width, has_feature,
7+
num_cache_levels, cache_size, num_cores, num_cores, cache_inclusive, cache_linesize, ifelse
88
using LoopVectorization: maybestaticsize, matmul_params, preserve_buffer, CloseOpen
99
using ArrayInterface: StaticInt, Zero, One, OptionallyStaticUnitRange, size, strides, offsets, indices,
10-
static_length, static_first, static_last, axes, dense_dims, stride_rank
10+
static_length, static_first, static_last, axes, dense_dims, stride_rank,
11+
StaticBool, True, False, gt, eq
1112

1213
using ThreadingUtilities:
1314
_atomic_add!, _atomic_umax!, _atomic_umin!,

src/block_sizes.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11

22

33
function block_sizes(::Type{T}, _α, _β, R₁, R₂) where {T}
4-
W = pick_vector_width_val(T)
4+
W = pick_vector_width(T)
55
α = _α * W
66
β = _β * W
77
L₁ₑ = first_cache_size(T) * R₁
@@ -158,7 +158,7 @@ Note that for synchronization on `B`, all threads must have the same values for
158158
independently of `M`, this algorithm guarantees all threads are on the same page.
159159
"""
160160
@inline function solve_block_sizes(::Type{T}, M, K, N, _α, _β, R₂, R₃, Wfactor) where {T}
161-
W = pick_vector_width_val(T)
161+
W = pick_vector_width(T)
162162
α = _α * W
163163
β = _β * W
164164
L₁ₑ = first_cache_size(T) * R₂
@@ -177,7 +177,7 @@ independently of `M`, this algorithm guarantees all threads are on the same page
177177
end
178178
# Takes Nc, calcs Mc and Kc
179179
@inline function solve_McKc(::Type{T}, M, K, Nc, _α, _β, R₂, R₃, Wfactor) where {T}
180-
W = pick_vector_width_val(T)
180+
W = pick_vector_width(T)
181181
α = _α * W
182182
β = _β * W
183183
L₁ₑ = first_cache_size(T) * R₂

src/global_constants.jl

Lines changed: 48 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
const OCTAVIAN_NUM_TASKS = Ref(1)
33
# Current task count: reads the value stored in the `OCTAVIAN_NUM_TASKS` Ref.
function _nthreads()
    return OCTAVIAN_NUM_TASKS[]
end
44

5-
@generated function calc_factors(::Union{Val{nc},StaticInt{nc}} = snum_cores()) where {nc}
5+
@generated function calc_factors(::Union{Val{nc},StaticInt{nc}} = num_cores()) where {nc}
66
t = Expr(:tuple)
77
for i ∈ nc:-1:1
88
d, r = divrem(nc, i)
@@ -12,87 +12,58 @@ _nthreads() = OCTAVIAN_NUM_TASKS[]
1212
end
1313
# const CORE_FACTORS = calc_factors()
1414

15-
@generated function MᵣW_mul_factor()
16-
f = VectorizationBase.has_feature("x86_64_avx512f") ? 4 : 9
17-
Expr(:call, Expr(:curly, :StaticInt, f))
18-
end
15+
MᵣW_mul_factor(::True) = StaticInt{4}()
16+
MᵣW_mul_factor(::False) = StaticInt{9}()
17+
# Zero-argument entry point: queries the `x86_64_avx512f` feature flag and
# dispatches on the result to the `::True` / `::False` methods of this function.
function MᵣW_mul_factor()
    avx512f = has_feature(Val(:x86_64_avx512f))
    return MᵣW_mul_factor(avx512f)
end
1918

20-
@generated function W₁Default()
21-
w = if VectorizationBase.has_feature("x86_64_avx512f")
22-
0.006089395198610773
23-
elseif (Sys.CPU_NAME === "znver2") || (Sys.CPU_NAME === "znver3") # these are znver2 values, I'm assuming they're better for znver3 than generic
24-
0.1
25-
elseif Sys.CPU_NAME === "znver1"
26-
0.053918949422353986
27-
else
28-
0.1
29-
end
30-
Expr(:call, Expr(:curly, :StaticFloat, w))
31-
end
32-
@generated function W₂Default()
33-
w = if VectorizationBase.has_feature("x86_64_avx512f")
34-
0.7979822724696168
35-
elseif (Sys.CPU_NAME === "znver2") || (Sys.CPU_NAME === "znver3") # these are znver2 values, I'm assuming they're better for znver3 than generic
36-
0.993489411720157
37-
elseif Sys.CPU_NAME === "znver1"
38-
0.3013238122374886
39-
else
40-
0.15989396641218157
41-
end
42-
Expr(:call, Expr(:curly, :StaticFloat, w))
43-
end
44-
@generated function R₁Default()
45-
w = if VectorizationBase.has_feature("x86_64_avx512f")
46-
0.5900561503730485
47-
elseif (Sys.CPU_NAME === "znver2") || (Sys.CPU_NAME === "znver3") # these are znver2 values, I'm assuming they're better for znver3 than generic
48-
0.6052218809954467
49-
elseif Sys.CPU_NAME === "znver1"
50-
0.6077103834481342
51-
else
52-
0.4203583148344484
53-
end
54-
Expr(:call, Expr(:curly, :StaticFloat, w))
55-
end
56-
@generated function R₂Default()
57-
w = if VectorizationBase.has_feature("x86_64_avx512f")
58-
0.762152930709678
59-
elseif (Sys.CPU_NAME === "znver2") || (Sys.CPU_NAME === "znver3") # these are znver2 values, I'm assuming they're better for znver3 than generic
60-
0.7594052633561165
61-
elseif Sys.CPU_NAME === "znver1"
62-
0.8775382433240162
63-
else
64-
0.6344856142604789
65-
end
66-
Expr(:call, Expr(:curly, :StaticFloat, w))
67-
end
19+
W₁Default(::True) = StaticFloat{0.006089395198610773}()
20+
W₂Default(::True) = StaticFloat{0.7979822724696168}()
21+
R₁Default(::True) = StaticFloat{0.5900561503730485}()
22+
R₂Default(::True) = StaticFloat{0.762152930709678}()
23+
24+
W₁Default_arch(::Val{:znver1}) = StaticFloat{0.053918949422353986}()
25+
W₂Default_arch(::Val{:znver1}) = StaticFloat{0.3013238122374886}()
26+
R₁Default_arch(::Val{:znver1}) = StaticFloat{0.6077103834481342}()
27+
R₂Default_arch(::Val{:znver1}) = StaticFloat{0.8775382433240162}()
28+
29+
W₁Default_arch(::Union{Val{:znver2},Val{:znver3}}) = StaticFloat{0.1}()
30+
W₂Default_arch(::Union{Val{:znver2},Val{:znver3}}) = StaticFloat{0.993489411720157}()
31+
R₁Default_arch(::Union{Val{:znver2},Val{:znver3}}) = StaticFloat{0.6052218809954467}()
32+
R₂Default_arch(::Union{Val{:znver2},Val{:znver3}}) = StaticFloat{0.7594052633561165}()
33+
34+
W₁Default_arch(_) = StaticFloat{0.1}()
35+
W₂Default_arch(_) = StaticFloat{0.15989396641218157}()
36+
R₁Default_arch(_) = StaticFloat{0.4203583148344484}()
37+
# Generic R₂ fallback for CPU names without a dedicated `R₂Default_arch` method.
# The value must be the generic one (0.6344856142604789); 0.8775382433240162 is
# the znver1-specific value and must not be reused here.
R₂Default_arch(_) = StaticFloat{0.6344856142604789}()
38+
39+
# `False` branch of the W₁ default: defer to the per-architecture table,
# keyed on the value returned by `VectorizationBase.cpu_name()`.
function W₁Default(::False)
    arch = VectorizationBase.cpu_name()
    return W₁Default_arch(arch)
end
40+
# `False` branch of the W₂ default: defer to the per-architecture table,
# keyed on the value returned by `VectorizationBase.cpu_name()`.
function W₂Default(::False)
    arch = VectorizationBase.cpu_name()
    return W₂Default_arch(arch)
end
41+
# `False` branch of the R₁ default: defer to the per-architecture table,
# keyed on the value returned by `VectorizationBase.cpu_name()`.
function R₁Default(::False)
    arch = VectorizationBase.cpu_name()
    return R₁Default_arch(arch)
end
42+
# `False` branch of the R₂ default: defer to the per-architecture table,
# keyed on the value returned by `VectorizationBase.cpu_name()`.
function R₂Default(::False)
    arch = VectorizationBase.cpu_name()
    return R₂Default_arch(arch)
end
43+
44+
# Zero-argument W₁ default: dispatch on the `x86_64_avx512f` feature flag
# to the `::True` / `::False` methods.
function W₁Default()
    avx512f = has_feature(Val(:x86_64_avx512f))
    return W₁Default(avx512f)
end
45+
# Zero-argument W₂ default: dispatch on the `x86_64_avx512f` feature flag
# to the `::True` / `::False` methods.
function W₂Default()
    avx512f = has_feature(Val(:x86_64_avx512f))
    return W₂Default(avx512f)
end
46+
# Zero-argument R₁ default: dispatch on the `x86_64_avx512f` feature flag
# to the `::True` / `::False` methods.
function R₁Default()
    avx512f = has_feature(Val(:x86_64_avx512f))
    return R₁Default(avx512f)
end
47+
# Zero-argument R₂ default: dispatch on the `x86_64_avx512f` feature flag
# to the `::True` / `::False` methods.
function R₂Default()
    avx512f = has_feature(Val(:x86_64_avx512f))
    return R₂Default(avx512f)
end
48+
49+
50+
51+
52+
# Cache level used as the "first" cache: `StaticInt{2}()` when
# `num_cache_levels()` is greater than 2, otherwise `StaticInt{1}()`.
function first_cache()
    more_than_two_levels = gt(num_cache_levels(), StaticInt{2}())
    return ifelse(more_than_two_levels, StaticInt{2}(), StaticInt{1}())
end
53+
# Cache level used as the "second" cache: always one level above `first_cache()`.
function second_cache()
    level = first_cache()
    return level + One()
end
54+
55+
# Size to use for the first cache, given its reported size `fcs`.
# If `first_cache()` equals level 2 and `cache_inclusive(StaticInt(2))` holds,
# `cache_size(One())` is subtracted from `fcs`; otherwise `fcs` is returned as is.
function _first_cache_size(fcs::StaticInt)
    is_level_two = eq(first_cache(), StaticInt(2))
    level_two_inclusive = cache_inclusive(StaticInt(2))
    return ifelse(is_level_two & level_two_inclusive, fcs - cache_size(One()), fcs)
end
56+
# Fallback for a `nothing` cache size: use the fixed default 262144 (256 * 1024).
# NOTE(review): presumably bytes, since the typed variant divides by `static_sizeof(T)` — confirm.
function _first_cache_size(::Nothing)
    return StaticInt(262144)
end
57+
# Size of the first cache: looks up `cache_size(first_cache())` and hands the
# result to `_first_cache_size`, which has `StaticInt` and `Nothing` methods.
function first_cache_size()
    reported = cache_size(first_cache())
    return _first_cache_size(reported)
end
58+
59+
# Size to use for the second cache, given its reported size `scs`.
# If `cache_inclusive(second_cache())` holds, `cache_size(first_cache())` is
# subtracted from `scs`; otherwise `scs` is returned as is.
function _second_cache_size(scs::StaticInt)
    inclusive = cache_inclusive(second_cache())
    return ifelse(inclusive, scs - cache_size(first_cache()), scs)
end
60+
# Fallback for a `nothing` cache size: use the fixed default 3145728 (3 * 1024 * 1024).
# NOTE(review): presumably bytes, since the typed variant divides by `static_sizeof(T)` — confirm.
function _second_cache_size(::Nothing)
    return StaticInt(3145728)
end
61+
# Size of the second cache: looks up `cache_size(second_cache())` and hands the
# result to `_second_cache_size`, which has `StaticInt` and `Nothing` methods.
function second_cache_size()
    reported = cache_size(second_cache())
    return _second_cache_size(reported)
end
6862

69-
first_cache() = StaticInt{1}() + (snum_cache_levels() > StaticInt{2}() ? One() : Zero())
70-
second_cache() = StaticInt{2}() + (snum_cache_levels() > StaticInt{2}() ? One() : Zero())
71-
72-
function first_cache_size()
73-
fcs = scache_size(first_cache())
74-
if fcs === Zero()
75-
return StaticInt(262144)
76-
elseif (first_cache() === StaticInt(2)) && cache_inclusivity()[2]
77-
return fcs - scache_size(One())
78-
else
79-
return fcs
80-
end
81-
end
82-
function second_cache_size()
83-
scs = scache_size(second_cache())
84-
if scs === Zero()
85-
return StaticInt(3145728)
86-
elseif cache_inclusivity()[second_cache()]
87-
return scs - scache_size(first_cache())
88-
else
89-
return scs
90-
end
91-
end
9263
# First cache size integer-divided by `static_sizeof(T)`.
# NOTE(review): presumably the capacity in elements of type `T` — confirm.
function first_cache_size(::Type{T}) where {T}
    return div(first_cache_size(), static_sizeof(T))
end
9364
# Second cache size integer-divided by `static_sizeof(T)`.
# NOTE(review): presumably the capacity in elements of type `T` — confirm.
function second_cache_size(::Type{T}) where {T}
    return div(second_cache_size(), static_sizeof(T))
end
9465

95-
bcache_count() = VectorizationBase.scache_count(second_cache())
66+
# Result of `VectorizationBase.num_cache` for the level returned by `second_cache()`.
function bcache_count()
    level = second_cache()
    return VectorizationBase.num_cache(level)
end
9667

9768
const BCACHEPTR = Ref{Ptr{Cvoid}}(C_NULL)
9869
const BCACHE_LOCK = Threads.Atomic{UInt}(zero(UInt))

0 commit comments

Comments
 (0)