From 84c9749da4160ed468d9330971942d59a89a97cd Mon Sep 17 00:00:00 2001
From: N5N3 <2642243996@qq.com>
Date: Sat, 25 Dec 2021 13:54:20 +0800
Subject: [PATCH 1/3] enable fma on Windows

---
 src/llvm-cpufeatures.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/src/llvm-cpufeatures.cpp b/src/llvm-cpufeatures.cpp
index dc35aeb569cfa..8accd399371ae 100644
--- a/src/llvm-cpufeatures.cpp
+++ b/src/llvm-cpufeatures.cpp
@@ -36,10 +36,7 @@ Optional<bool> always_have_fma(Function &intr) {
     auto intr_name = intr.getName();
     auto typ = intr_name.substr(strlen("julia.cpu.have_fma."));
 
-#if defined(_OS_WINDOWS_)
-    // FMA on Windows is weirdly broken (#43088)
-    return false;
-#elif defined(_CPU_AARCH64_)
+#if defined(_CPU_AARCH64_)
     return typ == "f32" || typ == "f64";
 #else
     (void)typ;

From 1b849ff65a40263724845e39713701f6fd096b5e Mon Sep 17 00:00:00 2001
From: N5N3 <2642243996@qq.com>
Date: Sat, 25 Dec 2021 13:56:07 +0800
Subject: [PATCH 2/3] define `julia_fma(f)` on windows

1. add `volatile` to fix Win32
2. fix `issubnormal` on Win32
---
 src/runtime_intrinsics.c | 104 +++++++++++++++++++++++++++++++++++++++
 test/math.jl             |  42 ++++++++--------
 2 files changed, 124 insertions(+), 22 deletions(-)

diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c
index d64e1945c52a1..5acd26ca64776 100644
--- a/src/runtime_intrinsics.c
+++ b/src/runtime_intrinsics.c
@@ -1175,8 +1175,112 @@ bi_fintrinsic(div,div_float)
 bi_fintrinsic(frem,rem_float)
 
 // ternary operators //
+// runtime fma is broken on windows, define julia_fma(f) ourself with fma_emulated as reference.
+#if defined(_OS_WINDOWS_)
+// reinterpret(UInt64, ::Float64)
+uint64_t bitcast_d2u(double d) {
+    uint64_t r;
+    memcpy(&r, &d, 8);
+    return r;
+}
+// reinterpret(Float64, ::UInt64)
+double bitcast_u2d(uint64_t d) {
+    double r;
+    memcpy(&r, &d, 8);
+    return r;
+}
+// Base.splitbits(::Float64)
+void splitbits(double *hi, double *lo, double d) {
+    *hi = bitcast_u2d(bitcast_d2u(d) & 0xfffffffff8000000);
+    *lo = d - *hi;
+}
+// Base.exponent(::Float64)
+int exponent(double a) {
+    int e;
+    frexp(a, &e);
+    return e - 1;
+}
+// Base.fma_emulated(::Float32, ::Float32, ::Float32)
+float julia_fmaf(float a, float b, float c) {
+    double ab, res;
+    ab = (double)a * b;
+    res = ab + (double)c;
+    if ((bitcast_d2u(res) & 0x1fffffff) == 0x10000000){
+        double reslo = fabsf(c) > fabs(ab) ? ab-(res - c) : c-(res - ab);
+        if (reslo != 0)
+            res = nextafter(res, copysign(1.0/0.0, reslo));
+    }
+    return (float)res;
+}
+// Base.twomul(::Float64, ::Float64)
+void two_mul(double *abhi, double *ablo, double a, double b) {
+    double ahi, alo, bhi, blo, blohi, blolo;
+    splitbits(&ahi, &alo, a);
+    splitbits(&bhi, &blo, b);
+    splitbits(&blohi, &blolo, blo);
+    *abhi = a*b;
+    *ablo = alo*blohi - (((*abhi - ahi*bhi) - alo*bhi) - ahi*blo) + blolo*alo;
+}
+// Base.issubnormal(::Float64) (Win32's fpclassify seems broken)
+int issubnormal(double d) {
+    uint64_t y = bitcast_d2u(d);
+    return ((y & 0x7ff0000000000000) == 0) & ((y & 0x000fffffffffffff) != 0);
+}
+#if defined(_WIN32)
+// Win32 needs volatile (avoid over optimization?)
+#define VDOUBLE volatile double
+#else
+#define VDOUBLE double
+#endif
+
+// Base.fma_emulated(::Float64, ::Float64, ::Float64)
+double julia_fma(double a, double b, double c) {
+    double abhi, ablo, r, s;
+    two_mul(&abhi, &ablo, a, b);
+    if (!isfinite(abhi+c) || fabs(abhi) < 2.0041683600089732e-292 ||
+        issubnormal(a) || issubnormal(b)) {
+        int aandbfinite = isfinite(a) && isfinite(b);
+        if (!(aandbfinite && isfinite(c)))
+            return aandbfinite ? c : abhi+c;
+        if (a == 0 || b == 0)
+            return abhi+c;
+        int bias = exponent(a) + exponent(b);
+        VDOUBLE c_denorm = ldexp(c, -bias);
+        if (isfinite(c_denorm)) {
+            if (issubnormal(a))
+                a *= 4.503599627370496e15;
+            if (issubnormal(b))
+                b *= 4.503599627370496e15;
+            a = bitcast_u2d((bitcast_d2u(a) & 0x800fffffffffffff) | 0x3ff0000000000000);
+            b = bitcast_u2d((bitcast_d2u(b) & 0x800fffffffffffff) | 0x3ff0000000000000);
+            c = c_denorm;
+            two_mul(&abhi, &ablo, a, b);
+            r = abhi+c;
+            s = (fabs(abhi) > fabs(c)) ? (abhi-r+c+ablo) : (c-r+abhi+ablo);
+            double sumhi = r+s;
+            if (issubnormal(ldexp(sumhi, bias))) {
+                double sumlo = r-sumhi+s;
+                int bits_lost = -bias-exponent(sumhi)-1022;
+                if ((bits_lost != 1) ^ ((bitcast_d2u(sumhi)&1) == 1))
+                    if (sumlo != 0)
+                        sumhi = nextafter(sumhi, copysign(1.0/0.0, sumlo));
+            }
+            return ldexp(sumhi, bias);
+        }
+        if (isinf(abhi) && signbit(c) == signbit(a*b))
+            return abhi;
+    }
+    r = abhi+c;
+    s = (fabs(abhi) > fabs(c)) ? (abhi-r+c+ablo) : (c-r+abhi+ablo);
+    return r+s;
+}
+#define fma(a, b, c) \
+    sizeof(a) == sizeof(float) ? julia_fmaf(a, b, c) : julia_fma(a, b, c)
+#else // On other systems use fma(f) directly
 #define fma(a, b, c) \
     sizeof(a) == sizeof(float) ? fmaf(a, b, c) : fma(a, b, c)
+#endif
+
 #define muladd(a, b, c) a * b + c
 ter_fintrinsic(fma,fma_float)
 ter_fintrinsic(muladd,muladd_float)
diff --git a/test/math.jl b/test/math.jl
index 664a1f637314c..a0c6d84ff2d86 100644
--- a/test/math.jl
+++ b/test/math.jl
@@ -1289,29 +1289,27 @@ end
 end
 
 @testset "fma" begin
-    if !(@static Sys.iswindows() && Int===Int64) # windows fma currently seems broken somehow.
-        for func in (fma, Base.fma_emulated)
-            @test func(nextfloat(1.),nextfloat(1.),-1.0) === 4.440892098500626e-16
-            @test func(nextfloat(1f0),nextfloat(1f0),-1f0) === 2.3841858f-7
-            @testset "$T" for T in (Float32, Float64)
-                @test func(floatmax(T), T(2), -floatmax(T)) === floatmax(T)
-                @test func(floatmax(T), T(1), eps(floatmax((T)))) === T(Inf)
-                @test func(T(Inf), T(Inf), T(Inf)) === T(Inf)
-                @test func(floatmax(T), floatmax(T), -T(Inf)) === -T(Inf)
-                @test func(floatmax(T), -floatmax(T), T(Inf)) === T(Inf)
-                @test isnan_type(T, func(T(Inf), T(1), -T(Inf)))
-                @test isnan_type(T, func(T(Inf), T(0), -T(0)))
-                @test func(-zero(T), zero(T), -zero(T)) === -zero(T)
-                for _ in 1:2^18
-                    a, b, c = reinterpret.(T, rand(Base.uinttype(T), 3))
-                    @test isequal(func(a, b, c), fma(a, b, c)) || (a,b,c)
-                end
+    for func in (fma, Base.fma_emulated, Base.fma_float) # use fma_float to test runtime fma
+        @test func(nextfloat(1.),nextfloat(1.),-1.0) === 4.440892098500626e-16
+        @test func(nextfloat(1f0),nextfloat(1f0),-1f0) === 2.3841858f-7
+        @testset "$T" for T in (Float32, Float64)
+            @test func(floatmax(T), T(2), -floatmax(T)) === floatmax(T)
+            @test func(floatmax(T), T(1), eps(floatmax((T)))) === T(Inf)
+            @test func(T(Inf), T(Inf), T(Inf)) === T(Inf)
+            @test func(floatmax(T), floatmax(T), -T(Inf)) === -T(Inf)
+            @test func(floatmax(T), -floatmax(T), T(Inf)) === T(Inf)
+            @test isnan_type(T, func(T(Inf), T(1), -T(Inf)))
+            @test isnan_type(T, func(T(Inf), T(0), -T(0)))
+            @test func(-zero(T), zero(T), -zero(T)) === -zero(T)
+            for _ in 1:2^18
+                a, b, c = reinterpret.(T, rand(Base.uinttype(T), 3))
+                @test isequal(func(a, b, c), fma(a, b, c)) || (a,b,c)
             end
-            @test func(floatmax(Float64), nextfloat(1.0), -floatmax(Float64)) === 3.991680619069439e292
-            @test func(floatmax(Float32), nextfloat(1f0), -floatmax(Float32)) === 4.0564817f31
-            @test func(1.6341681540852291e308, -2., floatmax(Float64)) == -1.4706431733081426e308 # case where inv(a)*c*a == Inf
-            @test func(-2., 1.6341681540852291e308, floatmax(Float64)) == -1.4706431733081426e308 # case where inv(b)*c*b == Inf
-            @test func(-1.9369631f13, 2.1513551f-7, -1.7354427f-24) == -4.1670958f6
         end
+        @test func(floatmax(Float64), nextfloat(1.0), -floatmax(Float64)) === 3.991680619069439e292
+        @test func(floatmax(Float32), nextfloat(1f0), -floatmax(Float32)) === 4.0564817f31
+        @test func(1.6341681540852291e308, -2., floatmax(Float64)) == -1.4706431733081426e308 # case where inv(a)*c*a == Inf
+        @test func(-2., 1.6341681540852291e308, floatmax(Float64)) == -1.4706431733081426e308 # case where inv(b)*c*b == Inf
+        @test func(-1.9369631f13, 2.1513551f-7, -1.7354427f-24) == -4.1670958f6
     end
 end

From 1148b79c690fbf7e8144740aa481216ac20ef103 Mon Sep 17 00:00:00 2001
From: N5N3 <2642243996@qq.com>
Date: Sat, 25 Dec 2021 14:01:57 +0800
Subject: [PATCH 3/3] skip runtime fma test on linux32

Update math.jl
---
 test/math.jl | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/test/math.jl b/test/math.jl
index a0c6d84ff2d86..b5a6d3abf55a4 100644
--- a/test/math.jl
+++ b/test/math.jl
@@ -1289,7 +1289,11 @@ end
 end
 
 @testset "fma" begin
-    for func in (fma, Base.fma_emulated, Base.fma_float) # use fma_float to test runtime fma
+    fma_list = (fma, Base.fma_emulated)
+    if !(Sys.islinux() && Int == Int32) # test runtime fma (skip linux32)
+        fma_list = (fma_list..., Base.fma_float)
+    end
+    for func in fma_list
         @test func(nextfloat(1.),nextfloat(1.),-1.0) === 4.440892098500626e-16
         @test func(nextfloat(1f0),nextfloat(1f0),-1f0) === 2.3841858f-7
         @testset "$T" for T in (Float32, Float64)