Skip to content

Add emulation for CPU without FMA instruction set#59

Closed
lnuic wants to merge 1 commit into mitsuba-renderer:master from
lnuic:llvm_fma_instruction_set_emulation
Closed

Add emulation for CPU without FMA instruction set#59
lnuic wants to merge 1 commit into mitsuba-renderer:master from
lnuic:llvm_fma_instruction_set_emulation

Conversation

@lnuic
Copy link
Contributor

@lnuic lnuic commented Apr 7, 2023

Description

FMA instruction set was introduced in 2012 (AMD Piledriver, Intel Haswell), but architectures before that already had AVX and SSE4.2 instruction sets. Dr. Jit doesn't currently verify whether a CPU supports the FMA instruction set. As a result, LLVM generates a global offset table when FMA is not available, which leads to a critical compiler failure in Dr. Jit.

To address this issue, emulation of the FMA instruction was implemented using the existing fmul and fadd instructions. This will ensure that the code runs smoothly on CPUs that do not have native FMA support, preventing any potential failures caused by the absence of the instruction set.

This PR should also fix: mitsuba-renderer/drjit#46

Code to reproduce:

import drjit as dr
dr.set_log_level(dr.LogLevel.Trace)
a = dr.llvm.Array3f([1.], [2.], [3.])
b = dr.fma(a, a, a)
print(b)

Error message:

Critical Dr.Jit compiler failure: jit_llvm_compile(): a global offset table was generated by LLVM, which typically means that a compiler intrinsic was not supported by the target architecture. DrJit cannot handle this case and will terminate the application now. For reference, the following kernel code was responsible for this problem:

define void @drjit_4300624cc1a9b63fa571334796cbc01c(i64 %start, i64 %end, i8** noalias %params) #0 {
entry:
    br label %body

body:
    %index = phi i64 [ %index_next, %suffix ], [ %start, %entry ]
    %f1_p1 = getelementptr inbounds i8*, i8** %params, i32 3
    %f1_p2 = load i8*, i8** %f1_p1, align 8, !alias.scope !2
    %f1_p3 = bitcast i8* %f1_p2 to float*
    %f1_0 = load float, float* %f1_p3, align 4, !alias.scope !2
    %f1_1 = insertelement <8 x float> undef, float %f1_0, i32 0
    %f1 = shufflevector <8 x float> %f1_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f2_p1 = getelementptr inbounds i8*, i8** %params, i32 4
    %f2_p2 = load i8*, i8** %f2_p1, align 8, !alias.scope !2
    %f2_p3 = bitcast i8* %f2_p2 to float*
    %f2_p4 = getelementptr inbounds float, float* %f2_p3, i64 %index
    %f2_p5 = bitcast float* %f2_p4 to <8 x float>*
    %f2 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f1, <8 x float> %f1, <8 x float> %f1)
    store <8 x float> %f2, <8 x float>* %f2_p5, align 32, !noalias !2, !nontemporal !3
    %f3_p1 = getelementptr inbounds i8*, i8** %params, i32 5
    %f3_p2 = load i8*, i8** %f3_p1, align 8, !alias.scope !2
    %f3_p3 = bitcast i8* %f3_p2 to float*
    %f3_0 = load float, float* %f3_p3, align 4, !alias.scope !2
    %f3_1 = insertelement <8 x float> undef, float %f3_0, i32 0
    %f3 = shufflevector <8 x float> %f3_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f4_p1 = getelementptr inbounds i8*, i8** %params, i32 6
    %f4_p2 = load i8*, i8** %f4_p1, align 8, !alias.scope !2
    %f4_p3 = bitcast i8* %f4_p2 to float*
    %f4_p4 = getelementptr inbounds float, float* %f4_p3, i64 %index
    %f4_p5 = bitcast float* %f4_p4 to <8 x float>*
    %f4 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f3, <8 x float> %f3, <8 x float> %f3)
    store <8 x float> %f4, <8 x float>* %f4_p5, align 32, !noalias !2, !nontemporal !3
    %f5_p1 = getelementptr inbounds i8*, i8** %params, i32 7
    %f5_p2 = load i8*, i8** %f5_p1, align 8, !alias.scope !2
    %f5_p3 = bitcast i8* %f5_p2 to float*
    %f5_0 = load float, float* %f5_p3, align 4, !alias.scope !2
    %f5_1 = insertelement <8 x float> undef, float %f5_0, i32 0
    %f5 = shufflevector <8 x float> %f5_1, <8 x float> undef, <8 x i32> zeroinitializer
    %f6_p1 = getelementptr inbounds i8*, i8** %params, i32 8
    %f6_p2 = load i8*, i8** %f6_p1, align 8, !alias.scope !2
    %f6_p3 = bitcast i8* %f6_p2 to float*
    %f6_p4 = getelementptr inbounds float, float* %f6_p3, i64 %index
    %f6_p5 = bitcast float* %f6_p4 to <8 x float>*
    %f6 = call <8 x float> @llvm.fma.v8f32(<8 x float> %f5, <8 x float> %f5, <8 x float> %f5)
    store <8 x float> %f6, <8 x float>* %f6_p5, align 32, !noalias !2, !nontemporal !3
    br label %suffix

suffix:
    %index_next = add i64 %index, 8
    %cond = icmp uge i64 %index_next, %end
    br i1 %cond, label %done, label %body, !llvm.loop !4

done:
    ret void
}

declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>)


!0 = !{!0}
!1 = !{!1, !0}
!2 = !{!1}
!3 = !{i32 1}
!4 = !{!"llvm.loop.unroll.disable", !"llvm.loop.vectorize.enable", i1 0}

attributes #0 = { norecurse nounwind "frame-pointer"="none" "no-builtins" "no-stack-arg-probe" "target-cpu"="ivybridge" "target-features"="-vzeroupper,+sse2,-tsxldtrk,+cx16,+sahf,-tbm,-avx512ifma,-sha,-gfni,-fma4,-vpclmulqdq,-prfchw,-bmi2,-cldemote,+fsgsbase,-ptwrite,-amx-tile,-uintr,+popcnt,-widekl,+aes,-avx512bitalg,-movdiri,-xsaves,-avx512er,-avxvnni,-avx512vnni,-amx-bf16,-avx512vpopcntdq,-pconfig,-clwb,-avx512f,-xsavec,-clzero,-pku,+mmx,-lwp,-rdpid,-xop,-rdseed,-waitpkg,-kl,-movdir64b,-sse4a,-avx512bw,-clflushopt,+xsave,-avx512vbmi2,+64bit,-avx512vl,-serialize,-hreset,-invpcid,-avx512cd,+avx,-vaes,-avx512bf16,+cx8,-fma,-rtm,-bmi,-enqcmd,+rdrnd,-mwaitx,+sse4.1,+sse4.2,-avx2,+fxsr,-wbnoinvd,+sse,-lzcnt,+pclmul,-prefetchwt1,+f16c,+ssse3,-sgx,-shstk,+cmov,-avx512vbmi,-amx-int8,-movbe,-avx512vp2intersect,+xsaveopt,-avx512dq,-adx,-avx512pf,+sse3" }
[1]    19864 abort      python drjit_test.py

@njroussel
Copy link
Member

njroussel commented Apr 11, 2023

Hi @lnuic

This looks good to me.

One question, have you taken a look at the fmuladd LLVM intrinsic (link)? It's unclear to me if we shouldn't just always use this or at least when fma is not supported on the target cpu.

@wjakob
Copy link
Member

wjakob commented Apr 11, 2023

fmul+fadd have different rounding behavior, which could lead to subtle platform-dependent inconsistencies. I am thinking that it might be easier to just error out in this case.

For example, our pip wheels depend on Haswell IIRC. So even if the LLVM codegen is adjusted, they will still generate invalid instruction failures on older hardware.

@njroussel
Copy link
Member

I see.

I double-checked, the pip wheels target Ivy Bridge currently. So, we can either error out (and bump the pip wheel architecture) or have a fix for Ivy Bridge specifically. I'm fine with dropping Ivy Bridge support.

@lnuic
Copy link
Contributor Author

lnuic commented Apr 11, 2023

Thank you, @njroussel, for bringing the existence of fmuladd to my attention.

I conducted experiments with it, and it appears to work effectively; the code can be simplified that way.
The problem with fmuladd is that, per the LLVM documentation, "Fusion is not guaranteed, even if the target platform supports it," which can introduce additional ambiguity.

The challenges related to precision and architecture support remain.

@njroussel
Copy link
Member

Follows-up here: #60

@lnuic lnuic deleted the llvm_fma_instruction_set_emulation branch May 2, 2023 13:38
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

Compiler intrinsic not supported

3 participants