From 84c9749da4160ed468d9330971942d59a89a97cd Mon Sep 17 00:00:00 2001 From: N5N3 <2642243996@qq.com> Date: Sat, 25 Dec 2021 13:54:20 +0800 Subject: [PATCH 1/3] enable fma on Windows --- src/llvm-cpufeatures.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/llvm-cpufeatures.cpp b/src/llvm-cpufeatures.cpp index dc35aeb569cfa..8accd399371ae 100644 --- a/src/llvm-cpufeatures.cpp +++ b/src/llvm-cpufeatures.cpp @@ -36,10 +36,7 @@ Optional always_have_fma(Function &intr) { auto intr_name = intr.getName(); auto typ = intr_name.substr(strlen("julia.cpu.have_fma.")); -#if defined(_OS_WINDOWS_) - // FMA on Windows is weirdly broken (#43088) - return false; -#elif defined(_CPU_AARCH64_) +#if defined(_CPU_AARCH64_) return typ == "f32" || typ == "f64"; #else (void)typ; From 1b849ff65a40263724845e39713701f6fd096b5e Mon Sep 17 00:00:00 2001 From: N5N3 <2642243996@qq.com> Date: Sat, 25 Dec 2021 13:56:07 +0800 Subject: [PATCH 2/3] define `julia_fma(f)` on windows 1. add `volatile` to fix Win32 2. fix `issubnormal` on Win32 --- src/runtime_intrinsics.c | 104 +++++++++++++++++++++++++++++++++++++++ test/math.jl | 42 ++++++++-------- 2 files changed, 124 insertions(+), 22 deletions(-) diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c index d64e1945c52a1..5acd26ca64776 100644 --- a/src/runtime_intrinsics.c +++ b/src/runtime_intrinsics.c @@ -1175,8 +1175,112 @@ bi_fintrinsic(div,div_float) bi_fintrinsic(frem,rem_float) // ternary operators // +// runtime fma is broken on windows, define julia_fma(f) ourself with fma_emulated as reference. +#if defined(_OS_WINDOWS_) +// reinterpret(UInt64, ::Float64) +uint64_t bitcast_d2u(double d) { + uint64_t r; + memcpy(&r, &d, 8); + return r; +} +// reinterpret(Float64, ::UInt64) +double bitcast_u2d(uint64_t d) { + double r; + memcpy(&r, &d, 8); + return r; +} +// Base.splitbits(::Float64) +void splitbits(double *hi, double *lo, double d) { + *hi = bitcast_u2d(bitcast_d2u(d) & 0xfffffffff8000000); + *lo = d - *hi; +} +// Base.exponent(::Float64) +int exponent(double a) { + int e; + frexp(a, &e); + return e - 1; +} +// Base.fma_emulated(::Float32, ::Float32, ::Float32) +float julia_fmaf(float a, float b, float c) { + double ab, res; + ab = (double)a * b; + res = ab + (double)c; + if ((bitcast_d2u(res) & 0x1fffffff) == 0x10000000){ + double reslo = fabsf(c) > fabs(ab) ? ab-(res - c) : c-(res - ab); + if (reslo != 0) + res = nextafter(res, copysign(1.0/0.0, reslo)); + } + return (float)res; +} +// Base.twomul(::Float64, ::Float64) +void two_mul(double *abhi, double *ablo, double a, double b) { + double ahi, alo, bhi, blo, blohi, blolo; + splitbits(&ahi, &alo, a); + splitbits(&bhi, &blo, b); + splitbits(&blohi, &blolo, blo); + *abhi = a*b; + *ablo = alo*blohi - (((*abhi - ahi*bhi) - alo*bhi) - ahi*blo) + blolo*alo; +} +// Base.issubnormal(::Float64) (Win32's fpclassify seems broken) +int issubnormal(double d) { + uint64_t y = bitcast_d2u(d); + return ((y & 0x7ff0000000000000) == 0) & ((y & 0x000fffffffffffff) != 0); +} +#if defined(_WIN32) +// Win32 needs volatile (avoid over optimization?) +#define VDOUBLE volatile double +#else +#define VDOUBLE double +#endif + +// Base.fma_emulated(::Float64, ::Float64, ::Float64) +double julia_fma(double a, double b, double c) { + double abhi, ablo, r, s; + two_mul(&abhi, &ablo, a, b); + if (!isfinite(abhi+c) || fabs(abhi) < 2.0041683600089732e-292 || + issubnormal(a) || issubnormal(b)) { + int aandbfinite = isfinite(a) && isfinite(b); + if (!(aandbfinite && isfinite(c))) + return aandbfinite ? c : abhi+c; + if (a == 0 || b == 0) + return abhi+c; + int bias = exponent(a) + exponent(b); + VDOUBLE c_denorm = ldexp(c, -bias); + if (isfinite(c_denorm)) { + if (issubnormal(a)) + a *= 4.503599627370496e15; + if (issubnormal(b)) + b *= 4.503599627370496e15; + a = bitcast_u2d((bitcast_d2u(a) & 0x800fffffffffffff) | 0x3ff0000000000000); + b = bitcast_u2d((bitcast_d2u(b) & 0x800fffffffffffff) | 0x3ff0000000000000); + c = c_denorm; + two_mul(&abhi, &ablo, a, b); + r = abhi+c; + s = (fabs(abhi) > fabs(c)) ? (abhi-r+c+ablo) : (c-r+abhi+ablo); + double sumhi = r+s; + if (issubnormal(ldexp(sumhi, bias))) { + double sumlo = r-sumhi+s; + int bits_lost = -bias-exponent(sumhi)-1022; + if ((bits_lost != 1) ^ ((bitcast_d2u(sumhi)&1) == 1)) + if (sumlo != 0) + sumhi = nextafter(sumhi, copysign(1.0/0.0, sumlo)); + } + return ldexp(sumhi, bias); + } + if (isinf(abhi) && signbit(c) == signbit(a*b)) + return abhi; + } + r = abhi+c; + s = (fabs(abhi) > fabs(c)) ? (abhi-r+c+ablo) : (c-r+abhi+ablo); + return r+s; +} +#define fma(a, b, c) \ + sizeof(a) == sizeof(float) ? julia_fmaf(a, b, c) : julia_fma(a, b, c) +#else // On other systems use fma(f) directly #define fma(a, b, c) \ sizeof(a) == sizeof(float) ? fmaf(a, b, c) : fma(a, b, c) +#endif + #define muladd(a, b, c) a * b + c ter_fintrinsic(fma,fma_float) ter_fintrinsic(muladd,muladd_float) diff --git a/test/math.jl b/test/math.jl index 664a1f637314c..a0c6d84ff2d86 100644 --- a/test/math.jl +++ b/test/math.jl @@ -1289,29 +1289,27 @@ end end @testset "fma" begin - if !(@static Sys.iswindows() && Int===Int64) # windows fma currently seems broken somehow. - for func in (fma, Base.fma_emulated) - @test func(nextfloat(1.),nextfloat(1.),-1.0) === 4.440892098500626e-16 - @test func(nextfloat(1f0),nextfloat(1f0),-1f0) === 2.3841858f-7 - @testset "$T" for T in (Float32, Float64) - @test func(floatmax(T), T(2), -floatmax(T)) === floatmax(T) - @test func(floatmax(T), T(1), eps(floatmax((T)))) === T(Inf) - @test func(T(Inf), T(Inf), T(Inf)) === T(Inf) - @test func(floatmax(T), floatmax(T), -T(Inf)) === -T(Inf) - @test func(floatmax(T), -floatmax(T), T(Inf)) === T(Inf) - @test isnan_type(T, func(T(Inf), T(1), -T(Inf))) - @test isnan_type(T, func(T(Inf), T(0), -T(0))) - @test func(-zero(T), zero(T), -zero(T)) === -zero(T) - for _ in 1:2^18 - a, b, c = reinterpret.(T, rand(Base.uinttype(T), 3)) - @test isequal(func(a, b, c), fma(a, b, c)) || (a,b,c) - end + for func in (fma, Base.fma_emulated, Base.fma_float) # use fma_float to test runtime fma + @test func(nextfloat(1.),nextfloat(1.),-1.0) === 4.440892098500626e-16 + @test func(nextfloat(1f0),nextfloat(1f0),-1f0) === 2.3841858f-7 + @testset "$T" for T in (Float32, Float64) + @test func(floatmax(T), T(2), -floatmax(T)) === floatmax(T) + @test func(floatmax(T), T(1), eps(floatmax((T)))) === T(Inf) + @test func(T(Inf), T(Inf), T(Inf)) === T(Inf) + @test func(floatmax(T), floatmax(T), -T(Inf)) === -T(Inf) + @test func(floatmax(T), -floatmax(T), T(Inf)) === T(Inf) + @test isnan_type(T, func(T(Inf), T(1), -T(Inf))) + @test isnan_type(T, func(T(Inf), T(0), -T(0))) + @test func(-zero(T), zero(T), -zero(T)) === -zero(T) + for _ in 1:2^18 + a, b, c = reinterpret.(T, rand(Base.uinttype(T), 3)) + @test isequal(func(a, b, c), fma(a, b, c)) || (a,b,c) end - @test func(floatmax(Float64), nextfloat(1.0), -floatmax(Float64)) === 3.991680619069439e292 - @test func(floatmax(Float32), nextfloat(1f0), -floatmax(Float32)) === 4.0564817f31 - @test func(1.6341681540852291e308, -2., floatmax(Float64)) == -1.4706431733081426e308 # case where inv(a)*c*a == Inf - @test func(-2., 1.6341681540852291e308, floatmax(Float64)) == -1.4706431733081426e308 # case where inv(b)*c*b == Inf - @test func(-1.9369631f13, 2.1513551f-7, -1.7354427f-24) == -4.1670958f6 end + @test func(floatmax(Float64), nextfloat(1.0), -floatmax(Float64)) === 3.991680619069439e292 + @test func(floatmax(Float32), nextfloat(1f0), -floatmax(Float32)) === 4.0564817f31 + @test func(1.6341681540852291e308, -2., floatmax(Float64)) == -1.4706431733081426e308 # case where inv(a)*c*a == Inf + @test func(-2., 1.6341681540852291e308, floatmax(Float64)) == -1.4706431733081426e308 # case where inv(b)*c*b == Inf + @test func(-1.9369631f13, 2.1513551f-7, -1.7354427f-24) == -4.1670958f6 end end From 1148b79c690fbf7e8144740aa481216ac20ef103 Mon Sep 17 00:00:00 2001 From: N5N3 <2642243996@qq.com> Date: Sat, 25 Dec 2021 14:01:57 +0800 Subject: [PATCH 3/3] skip runtime fma test on linux32 Update math.jl --- test/math.jl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/math.jl b/test/math.jl index a0c6d84ff2d86..b5a6d3abf55a4 100644 --- a/test/math.jl +++ b/test/math.jl @@ -1289,7 +1289,11 @@ end end @testset "fma" begin - for func in (fma, Base.fma_emulated, Base.fma_float) # use fma_float to test runtime fma + fma_list = (fma, Base.fma_emulated) + if !(Sys.islinux() && Int == Int32) # test runtime fma (skip linux32) + fma_list = (fma_list..., Base.fma_float) + end + for func in fma_list @test func(nextfloat(1.),nextfloat(1.),-1.0) === 4.440892098500626e-16 @test func(nextfloat(1f0),nextfloat(1f0),-1f0) === 2.3841858f-7 @testset "$T" for T in (Float32, Float64)