diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index ba9d0682b26dd..c54452b13898f 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -8162,6 +8162,7 @@ SDValue LoongArchTargetLowering::LowerFormalArguments(
     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
   MachineFunction &MF = DAG.getMachineFunction();
+  auto *LoongArchFI = MF.getInfo<LoongArchMachineFunctionInfo>();
 
   switch (CallConv) {
   default:
@@ -8225,6 +8226,8 @@ SDValue LoongArchTargetLowering::LowerFormalArguments(
       continue;
     }
     InVals.push_back(ArgValue);
+    if (Ins[InsIdx].Flags.isByVal())
+      LoongArchFI->addIncomingByValArgs(ArgValue);
   }
 
   if (IsVarArg) {
@@ -8233,7 +8236,6 @@ SDValue LoongArchTargetLowering::LowerFormalArguments(
     const TargetRegisterClass *RC = &LoongArch::GPRRegClass;
     MachineFrameInfo &MFI = MF.getFrameInfo();
     MachineRegisterInfo &RegInfo = MF.getRegInfo();
-    auto *LoongArchFI = MF.getInfo<LoongArchMachineFunctionInfo>();
 
     // Offset of the first variable argument from stack pointer, and size of
     // the vararg save area. For now, the varargs save area is either zero or
@@ -8283,6 +8285,8 @@ SDValue LoongArchTargetLowering::LowerFormalArguments(
     LoongArchFI->setVarArgsSaveSize(VarArgsSaveSize);
   }
 
+  LoongArchFI->setArgumentStackSize(CCInfo.getStackSize());
+
   // All stores are grouped in one node to allow the matching between
   // the size of Ins and InVals. This only happens for vararg functions.
   if (!OutChains.empty()) {
@@ -8339,9 +8343,11 @@ bool LoongArchTargetLowering::isEligibleForTailCallOptimization(
   auto &Outs = CLI.Outs;
   auto &Caller = MF.getFunction();
   auto CallerCC = Caller.getCallingConv();
+  auto *LoongArchFI = MF.getInfo<LoongArchMachineFunctionInfo>();
 
-  // Do not tail call opt if the stack is used to pass parameters.
-  if (CCInfo.getStackSize() != 0)
+  // If the stack arguments for this call do not fit into our own save area
+  // then the call cannot be made tail.
+  if (CCInfo.getStackSize() > LoongArchFI->getArgumentStackSize())
     return false;
 
   // Do not tail call opt if any parameters need to be passed indirectly.
@@ -8353,13 +8359,18 @@ bool LoongArchTargetLowering::isEligibleForTailCallOptimization(
   // semantics.
   auto IsCallerStructRet = Caller.hasStructRetAttr();
   auto IsCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();
-  if (IsCallerStructRet || IsCalleeStructRet)
+  if (IsCallerStructRet != IsCalleeStructRet)
     return false;
 
-  // Do not tail call opt if either the callee or caller has a byval argument.
-  for (auto &Arg : Outs)
-    if (Arg.Flags.isByVal())
+  // Do not tail call opt if caller's and callee's byval arguments do not match.
+  for (unsigned i = 0, j = 0; i < Outs.size(); i++) {
+    if (!Outs[i].Flags.isByVal())
+      continue;
+    if (j >= LoongArchFI->getIncomingByValArgsSize())
+      return false;
+    if (LoongArchFI->getIncomingByValArgs(j++).getValueType() != Outs[i].ArgVT)
      return false;
+  }
 
   // The callee has to preserve all registers the caller needs to preserve.
   const LoongArchRegisterInfo *TRI = Subtarget.getRegisterInfo();
@@ -8369,6 +8380,14 @@ bool LoongArchTargetLowering::isEligibleForTailCallOptimization(
     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
       return false;
   }
+
+  // If the callee takes no arguments then go on to check the results of the
+  // call.
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+  if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
+    return false;
+
   return true;
 }
 
@@ -8396,6 +8415,7 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI,
   bool &IsTailCall = CLI.IsTailCall;
   MachineFunction &MF = DAG.getMachineFunction();
+  auto *LoongArchFI = MF.getInfo<LoongArchMachineFunctionInfo>();
 
   // Analyze the operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign> ArgLocs;
@@ -8421,7 +8441,7 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   // Create local copies for byval args.
   SmallVector<SDValue> ByValArgs;
-  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+  for (unsigned i = 0, j = 0, e = Outs.size(); i != e; ++i) {
     ISD::ArgFlagsTy Flags = Outs[i].Flags;
     if (!Flags.isByVal())
       continue;
@@ -8429,22 +8449,39 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI,
     SDValue Arg = OutVals[i];
     unsigned Size = Flags.getByValSize();
     Align Alignment = Flags.getNonZeroByValAlign();
-
-    int FI =
-        MF.getFrameInfo().CreateStackObject(Size, Alignment, /*isSS=*/false);
-    SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
     SDValue SizeNode = DAG.getConstant(Size, DL, GRLenVT);
+    SDValue Dst;
 
-    Chain = DAG.getMemcpy(Chain, DL, FIPtr, Arg, SizeNode, Alignment,
-                          /*IsVolatile=*/false,
-                          /*AlwaysInline=*/false, /*CI=*/nullptr, std::nullopt,
-                          MachinePointerInfo(), MachinePointerInfo());
-    ByValArgs.push_back(FIPtr);
+    if (IsTailCall) {
+      SDValue CallerArg = LoongArchFI->getIncomingByValArgs(j++);
+      if (isa<FrameIndexSDNode>(Arg) || isa<GlobalAddressSDNode>(Arg) ||
+          isa<ExternalSymbolSDNode>(Arg))
+        Dst = CallerArg;
+    } else {
+      int FI =
+          MF.getFrameInfo().CreateStackObject(Size, Alignment, /*isSS=*/false);
+      Dst = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+    }
+    if (Dst) {
+      Chain =
+          DAG.getMemcpy(Chain, DL, Dst, Arg, SizeNode, Alignment,
+                        /*IsVolatile=*/false,
+                        /*AlwaysInline=*/false, /*CI=*/nullptr, std::nullopt,
+                        MachinePointerInfo(), MachinePointerInfo());
+      ByValArgs.push_back(Dst);
+    }
   }
 
   if (!IsTailCall)
     Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL);
 
+  // During a tail call, stores to the argument area must happen after all of
+  // the function's incoming arguments have been loaded because they may alias.
+  // This is done by folding in a TokenFactor from LowerFormalArguments, but
+  // there's no point in doing so repeatedly so this tracks whether that's
+  // happened yet.
+  bool AfterFormalArgLoads = false;
+
   // Copy argument values to their designated locations.
   SmallVector<std::pair<Register, SDValue>> RegsToPass;
   SmallVector<SDValue> MemOpChains;
@@ -8539,27 +8576,44 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI,
     }
 
     // Use local copy if it is a byval arg.
-    if (Flags.isByVal())
-      ArgValue = ByValArgs[j++];
+    if (Flags.isByVal()) {
+      if (!IsTailCall || (isa<FrameIndexSDNode>(ArgValue) ||
+                          isa<GlobalAddressSDNode>(ArgValue) ||
+                          isa<ExternalSymbolSDNode>(ArgValue)))
+        ArgValue = ByValArgs[j++];
+    }
 
     if (VA.isRegLoc()) {
       // Queue up the argument copies and emit them at the end.
       RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue));
     } else {
       assert(VA.isMemLoc() && "Argument not register or memory");
-      assert(!IsTailCall && "Tail call not allowed if stack is used "
-                            "for passing parameters");
+      SDValue DstAddr;
+      MachinePointerInfo DstInfo;
+      int32_t Offset = VA.getLocMemOffset();
 
       // Work out the address of the stack slot.
       if (!StackPtr.getNode())
         StackPtr = DAG.getCopyFromReg(Chain, DL, LoongArch::R3, PtrVT);
-      SDValue Address =
-          DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr,
-                      DAG.getIntPtrConstant(VA.getLocMemOffset(), DL));
+
+      if (IsTailCall) {
+        unsigned OpSize = divideCeil(VA.getValVT().getSizeInBits(), 8);
+        int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
+        DstAddr = DAG.getFrameIndex(FI, PtrVT);
+        DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
+        if (!AfterFormalArgLoads) {
+          Chain = DAG.getStackArgumentTokenFactor(Chain);
+          AfterFormalArgLoads = true;
+        }
+      } else {
+        SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
+        DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
+        DstInfo = MachinePointerInfo::getStack(MF, Offset);
+      }
 
       // Emit the store.
       MemOpChains.push_back(
-          DAG.getStore(Chain, DL, ArgValue, Address, MachinePointerInfo()));
+          DAG.getStore(Chain, DL, ArgValue, DstAddr, DstInfo));
     }
   }
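
For a source-level picture of what the lowering changes above enable (illustrative only, not part of the patch; assumes clang --target=loongarch64 -O2, where integer arguments beyond the eighth go on the stack), consider a caller whose outgoing stack arguments alias its own incoming ones. The new eligibility check admits the tail call because caller and callee need the same amount of argument stack, and the getStackArgumentTokenFactor chain ordering guarantees the incoming values are loaded before their fixed stack slots are overwritten:

int callee(int a0, int a1, int a2, int a3, int a4, int a5, int a6, int a7,
           int s0, int s1);

// s0 and s1 arrive in the caller's incoming stack slots. The tail call
// stores its outgoing ninth/tenth arguments into those same fixed slots,
// so both incoming values must be loaded before either store runs.
int caller(int a0, int a1, int a2, int a3, int a4, int a5, int a6, int a7,
           int s0, int s1) {
  return callee(a0, a1, a2, a3, a4, a5, a6, a7, s1, s0); // swapped on purpose
}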
diff --git a/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h b/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h
index 904985c189dba..75db3365415e8 100644
--- a/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h
+++ b/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h
@@ -32,10 +32,17 @@ class LoongArchMachineFunctionInfo : public MachineFunctionInfo {
   /// Size of stack frame to save callee saved registers
   unsigned CalleeSavedStackSize = 0;
 
+  /// Amount of bytes on stack consumed by the arguments being passed on
+  /// the stack
+  unsigned ArgumentStackSize = 0;
+
   /// FrameIndex of the spill slot when there is no scavenged register in
   /// insertIndirectBranch.
   int BranchRelaxationSpillFrameIndex = -1;
 
+  /// Incoming ByVal arguments
+  SmallVector<SDValue> IncomingByValArgs;
+
   /// Registers that have been sign extended from i32.
   SmallVector<Register> SExt32Registers;
 
@@ -63,6 +70,9 @@ class LoongArchMachineFunctionInfo : public MachineFunctionInfo {
   unsigned getCalleeSavedStackSize() const { return CalleeSavedStackSize; }
   void setCalleeSavedStackSize(unsigned Size) { CalleeSavedStackSize = Size; }
 
+  unsigned getArgumentStackSize() const { return ArgumentStackSize; }
+  void setArgumentStackSize(unsigned Size) { ArgumentStackSize = Size; }
+
   int getBranchRelaxationSpillFrameIndex() {
     return BranchRelaxationSpillFrameIndex;
   }
@@ -70,6 +80,10 @@ class LoongArchMachineFunctionInfo : public MachineFunctionInfo {
     BranchRelaxationSpillFrameIndex = Index;
   }
 
+  void addIncomingByValArgs(SDValue Val) { IncomingByValArgs.push_back(Val); }
+  SDValue &getIncomingByValArgs(int Idx) { return IncomingByValArgs[Idx]; }
+  unsigned getIncomingByValArgsSize() { return IncomingByValArgs.size(); }
+
   void addSExt32Register(Register Reg) { SExt32Registers.push_back(Reg); }
 
   bool isSExt32Register(Register Reg) const {
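
To make the predicate that consumes these new fields concrete, here is a standalone, simplified C++ model of the two checks isEligibleForTailCallOptimization performs with them (a sketch only: the real code compares SDValue value types and ISD::OutputArg flags via CCState, and the names below are invented for illustration):

#include <cstddef>
#include <vector>

// Stand-in for one outgoing argument of the candidate tail call.
struct OutArg {
  bool IsByVal;
  int ArgVT; // stand-in for the real MVT
};

bool eligibleForTailCall(unsigned CalleeStackSize, unsigned CallerArgStackSize,
                         const std::vector<OutArg> &Outs,
                         const std::vector<int> &IncomingByValVTs) {
  // 1) Outgoing stack arguments must fit in the caller's own argument area
  //    (the ArgumentStackSize recorded by LowerFormalArguments).
  if (CalleeStackSize > CallerArgStackSize)
    return false;
  // 2) Each outgoing byval must pair up, in order and by type, with one of
  //    the caller's incoming byval arguments (IncomingByValArgs).
  std::size_t J = 0;
  for (const OutArg &O : Outs) {
    if (!O.IsByVal)
      continue;
    if (J >= IncomingByValVTs.size() || IncomingByValVTs[J++] != O.ArgVT)
      return false;
  }
  return true;
}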
diff --git a/llvm/test/CodeGen/LoongArch/musttail.ll b/llvm/test/CodeGen/LoongArch/musttail.ll
new file mode 100644
index 0000000000000..4d9be2869fd9f
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/musttail.ll
@@ -0,0 +1,566 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=loongarch32 %s -o - | FileCheck %s --check-prefix=LA32
+; RUN: llc -mtriple=loongarch64 %s -o - | FileCheck %s --check-prefix=LA64
+
+declare i32 @many_args_callee(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9)
+
+define i32 @many_args_tail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9) {
+; LA32-LABEL: many_args_tail:
+; LA32:       # %bb.0:
+; LA32-NEXT:    ori $a0, $zero, 8
+; LA32-NEXT:    st.w $a0, $sp, 0
+; LA32-NEXT:    ori $a0, $zero, 9
+; LA32-NEXT:    ori $a1, $zero, 1
+; LA32-NEXT:    ori $a2, $zero, 2
+; LA32-NEXT:    ori $a3, $zero, 3
+; LA32-NEXT:    ori $a4, $zero, 4
+; LA32-NEXT:    ori $a5, $zero, 5
+; LA32-NEXT:    ori $a6, $zero, 6
+; LA32-NEXT:    ori $a7, $zero, 7
+; LA32-NEXT:    st.w $a0, $sp, 4
+; LA32-NEXT:    move $a0, $zero
+; LA32-NEXT:    b many_args_callee
+;
+; LA64-LABEL: many_args_tail:
+; LA64:       # %bb.0:
+; LA64-NEXT:    ori $a0, $zero, 8
+; LA64-NEXT:    st.d $a0, $sp, 0
+; LA64-NEXT:    ori $a0, $zero, 9
+; LA64-NEXT:    ori $a1, $zero, 1
+; LA64-NEXT:    ori $a2, $zero, 2
+; LA64-NEXT:    ori $a3, $zero, 3
+; LA64-NEXT:    ori $a4, $zero, 4
+; LA64-NEXT:    ori $a5, $zero, 5
+; LA64-NEXT:    ori $a6, $zero, 6
+; LA64-NEXT:    ori $a7, $zero, 7
+; LA64-NEXT:    st.d $a0, $sp, 8
+; LA64-NEXT:    move $a0, $zero
+; LA64-NEXT:    pcaddu18i $t8, %call36(many_args_callee)
+; LA64-NEXT:    jr $t8
+  %ret = tail call i32 @many_args_callee(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9)
+  ret i32 %ret
+}
+
+define i32 @many_args_musttail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9) {
+; LA32-LABEL: many_args_musttail:
+; LA32:       # %bb.0:
+; LA32-NEXT:    ori $a0, $zero, 8
+; LA32-NEXT:    st.w $a0, $sp, 0
+; LA32-NEXT:    ori $a0, $zero, 9
+; LA32-NEXT:    ori $a1, $zero, 1
+; LA32-NEXT:    ori $a2, $zero, 2
+; LA32-NEXT:    ori $a3, $zero, 3
+; LA32-NEXT:    ori $a4, $zero, 4
+; LA32-NEXT:    ori $a5, $zero, 5
+; LA32-NEXT:    ori $a6, $zero, 6
+; LA32-NEXT:    ori $a7, $zero, 7
+; LA32-NEXT:    st.w $a0, $sp, 4
+; LA32-NEXT:    move $a0, $zero
+; LA32-NEXT:    b many_args_callee
+;
+; LA64-LABEL: many_args_musttail:
+; LA64:       # %bb.0:
+; LA64-NEXT:    ori $a0, $zero, 8
+; LA64-NEXT:    st.d $a0, $sp, 0
+; LA64-NEXT:    ori $a0, $zero, 9
+; LA64-NEXT:    ori $a1, $zero, 1
+; LA64-NEXT:    ori $a2, $zero, 2
+; LA64-NEXT:    ori $a3, $zero, 3
+; LA64-NEXT:    ori $a4, $zero, 4
+; LA64-NEXT:    ori $a5, $zero, 5
+; LA64-NEXT:    ori $a6, $zero, 6
+; LA64-NEXT:    ori $a7, $zero, 7
+; LA64-NEXT:    st.d $a0, $sp, 8
+; LA64-NEXT:    move $a0, $zero
+; LA64-NEXT:    pcaddu18i $t8, %call36(many_args_callee)
+; LA64-NEXT:    jr $t8
+  %ret = musttail call i32 @many_args_callee(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9)
+  ret i32 %ret
+}
+
+; This function has more arguments than its tail-callee. This isn't valid for
+; the musttail attribute, but can still be tail-called as a non-guaranteed
+; optimisation, because the outgoing arguments to @many_args_callee fit in the
+; stack space allocated by the caller of @more_args_tail.
+define i32 @more_args_tail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9) {
+; LA32-LABEL: more_args_tail:
+; LA32:       # %bb.0:
+; LA32-NEXT:    ori $a0, $zero, 8
+; LA32-NEXT:    st.w $a0, $sp, 0
+; LA32-NEXT:    ori $a0, $zero, 9
+; LA32-NEXT:    ori $a1, $zero, 1
+; LA32-NEXT:    ori $a2, $zero, 2
+; LA32-NEXT:    ori $a3, $zero, 3
+; LA32-NEXT:    ori $a4, $zero, 4
+; LA32-NEXT:    ori $a5, $zero, 5
+; LA32-NEXT:    ori $a6, $zero, 6
+; LA32-NEXT:    ori $a7, $zero, 7
+; LA32-NEXT:    st.w $a0, $sp, 4
+; LA32-NEXT:    move $a0, $zero
+; LA32-NEXT:    b many_args_callee
+;
+; LA64-LABEL: more_args_tail:
+; LA64:       # %bb.0:
+; LA64-NEXT:    ori $a0, $zero, 8
+; LA64-NEXT:    st.d $a0, $sp, 0
+; LA64-NEXT:    ori $a0, $zero, 9
+; LA64-NEXT:    ori $a1, $zero, 1
+; LA64-NEXT:    ori $a2, $zero, 2
+; LA64-NEXT:    ori $a3, $zero, 3
+; LA64-NEXT:    ori $a4, $zero, 4
+; LA64-NEXT:    ori $a5, $zero, 5
+; LA64-NEXT:    ori $a6, $zero, 6
+; LA64-NEXT:    ori $a7, $zero, 7
+; LA64-NEXT:    st.d $a0, $sp, 8
+; LA64-NEXT:    move $a0, $zero
+; LA64-NEXT:    pcaddu18i $t8, %call36(many_args_callee)
+; LA64-NEXT:    jr $t8
+  %ret = tail call i32 @many_args_callee(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9)
+  ret i32 %ret
+}
+
+; Again, this isn't valid for musttail, but can be tail-called in practice
+; because the stack size is the same.
+define i32 @different_args_tail_32bit(i64 %0, i64 %1, i64 %2, i64 %3, i64 %4) nounwind {
+; LA32-LABEL: different_args_tail_32bit:
+; LA32:       # %bb.0:
+; LA32-NEXT:    ori $a0, $zero, 8
+; LA32-NEXT:    st.w $a0, $sp, 0
+; LA32-NEXT:    ori $a0, $zero, 9
+; LA32-NEXT:    ori $a1, $zero, 1
+; LA32-NEXT:    ori $a2, $zero, 2
+; LA32-NEXT:    ori $a3, $zero, 3
+; LA32-NEXT:    ori $a4, $zero, 4
+; LA32-NEXT:    ori $a5, $zero, 5
+; LA32-NEXT:    ori $a6, $zero, 6
+; LA32-NEXT:    ori $a7, $zero, 7
+; LA32-NEXT:    st.w $a0, $sp, 4
+; LA32-NEXT:    move $a0, $zero
+; LA32-NEXT:    b many_args_callee
+;
+; LA64-LABEL: different_args_tail_32bit:
+; LA64:       # %bb.0:
+; LA64-NEXT:    addi.d $sp, $sp, -32
+; LA64-NEXT:    st.d $ra, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT:    ori $a0, $zero, 9
+; LA64-NEXT:    st.d $a0, $sp, 8
+; LA64-NEXT:    ori $a0, $zero, 8
+; LA64-NEXT:    ori $a1, $zero, 1
+; LA64-NEXT:    ori $a2, $zero, 2
+; LA64-NEXT:    ori $a3, $zero, 3
+; LA64-NEXT:    ori $a4, $zero, 4
+; LA64-NEXT:    ori $a5, $zero, 5
+; LA64-NEXT:    ori $a6, $zero, 6
+; LA64-NEXT:    ori $a7, $zero, 7
+; LA64-NEXT:    st.d $a0, $sp, 0
+; LA64-NEXT:    move $a0, $zero
+; LA64-NEXT:    pcaddu18i $ra, %call36(many_args_callee)
+; LA64-NEXT:    jirl $ra, $ra, 0
+; LA64-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 32
+; LA64-NEXT:    ret
+  %ret = tail call i32 @many_args_callee(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9)
+  ret i32 %ret
+}
+
+define i32 @different_args_tail_64bit(i128 %0, i128 %1, i128 %2, i128 %3, i128 %4) nounwind {
+; LA32-LABEL: different_args_tail_64bit:
+; LA32:       # %bb.0:
+; LA32-NEXT:    addi.w $sp, $sp, -16
+; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT:    ori $a0, $zero, 9
+; LA32-NEXT:    st.w $a0, $sp, 4
+; LA32-NEXT:    ori $a0, $zero, 8
+; LA32-NEXT:    ori $a1, $zero, 1
+; LA32-NEXT:    ori $a2, $zero, 2
+; LA32-NEXT:    ori $a3, $zero, 3
+; LA32-NEXT:    ori $a4, $zero, 4
+; LA32-NEXT:    ori $a5, $zero, 5
+; LA32-NEXT:    ori $a6, $zero, 6
+; LA32-NEXT:    ori $a7, $zero, 7
+; LA32-NEXT:    st.w $a0, $sp, 0
+; LA32-NEXT:    move $a0, $zero
+; LA32-NEXT:    bl many_args_callee
+; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 16
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: different_args_tail_64bit:
+; LA64:       # %bb.0:
+; LA64-NEXT:    ori $a0, $zero, 8
+; LA64-NEXT:    st.d $a0, $sp, 0
+; LA64-NEXT:    ori $a0, $zero, 9
+; LA64-NEXT:    ori $a1, $zero, 1
+; LA64-NEXT:    ori $a2, $zero, 2
+; LA64-NEXT:    ori $a3, $zero, 3
+; LA64-NEXT:    ori $a4, $zero, 4
+; LA64-NEXT:    ori $a5, $zero, 5
+; LA64-NEXT:    ori $a6, $zero, 6
+; LA64-NEXT:    ori $a7, $zero, 7
+; LA64-NEXT:    st.d $a0, $sp, 8
+; LA64-NEXT:    move $a0, $zero
+; LA64-NEXT:    pcaddu18i $t8, %call36(many_args_callee)
+; LA64-NEXT:    jr $t8
+  %ret = tail call i32 @many_args_callee(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9)
+  ret i32 %ret
+}
+
+; Here, the caller requires less stack space for its arguments than the
+; callee, so it would not be valid to do a tail-call.
+define i32 @fewer_args_tail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4) nounwind {
+; LA32-LABEL: fewer_args_tail:
+; LA32:       # %bb.0:
+; LA32-NEXT:    addi.w $sp, $sp, -16
+; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT:    ori $a0, $zero, 9
+; LA32-NEXT:    st.w $a0, $sp, 4
+; LA32-NEXT:    ori $a0, $zero, 8
+; LA32-NEXT:    ori $a1, $zero, 1
+; LA32-NEXT:    ori $a2, $zero, 2
+; LA32-NEXT:    ori $a3, $zero, 3
+; LA32-NEXT:    ori $a4, $zero, 4
+; LA32-NEXT:    ori $a5, $zero, 5
+; LA32-NEXT:    ori $a6, $zero, 6
+; LA32-NEXT:    ori $a7, $zero, 7
+; LA32-NEXT:    st.w $a0, $sp, 0
+; LA32-NEXT:    move $a0, $zero
+; LA32-NEXT:    bl many_args_callee
+; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 16
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: fewer_args_tail:
+; LA64:       # %bb.0:
+; LA64-NEXT:    addi.d $sp, $sp, -32
+; LA64-NEXT:    st.d $ra, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT:    ori $a0, $zero, 9
+; LA64-NEXT:    st.d $a0, $sp, 8
+; LA64-NEXT:    ori $a0, $zero, 8
+; LA64-NEXT:    ori $a1, $zero, 1
+; LA64-NEXT:    ori $a2, $zero, 2
+; LA64-NEXT:    ori $a3, $zero, 3
+; LA64-NEXT:    ori $a4, $zero, 4
+; LA64-NEXT:    ori $a5, $zero, 5
+; LA64-NEXT:    ori $a6, $zero, 6
+; LA64-NEXT:    ori $a7, $zero, 7
+; LA64-NEXT:    st.d $a0, $sp, 0
+; LA64-NEXT:    move $a0, $zero
+; LA64-NEXT:    pcaddu18i $ra, %call36(many_args_callee)
+; LA64-NEXT:    jirl $ra, $ra, 0
+; LA64-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 32
+; LA64-NEXT:    ret
+  %ret = tail call i32 @many_args_callee(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9)
+  ret i32 %ret
+}
+
+declare void @foo(i32, i32, i32, i32, i32, i32, i32, i32, i32)
+
+define void @bar(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8) nounwind {
+; LA32-LABEL: bar:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    addi.w $sp, $sp, -48
+; LA32-NEXT:    st.w $ra, $sp, 44 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $fp, $sp, 40 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s0, $sp, 36 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s1, $sp, 32 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s2, $sp, 28 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s3, $sp, 24 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s4, $sp, 20 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s5, $sp, 16 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s6, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT:    move $fp, $a7
+; LA32-NEXT:    move $s0, $a6
+; LA32-NEXT:    move $s1, $a5
+; LA32-NEXT:    move $s2, $a4
+; LA32-NEXT:    move $s3, $a3
+; LA32-NEXT:    move $s4, $a2
+; LA32-NEXT:    move $s5, $a1
+; LA32-NEXT:    move $s6, $a0
+; LA32-NEXT:    ori $a0, $zero, 1
+; LA32-NEXT:    st.w $a0, $sp, 0
+; LA32-NEXT:    move $a0, $s6
+; LA32-NEXT:    bl foo
+; LA32-NEXT:    ori $a0, $zero, 2
+; LA32-NEXT:    st.w $a0, $sp, 48
+; LA32-NEXT:    move $a0, $s6
+; LA32-NEXT:    move $a1, $s5
+; LA32-NEXT:    move $a2, $s4
+; LA32-NEXT:    move $a3, $s3
+; LA32-NEXT:    move $a4, $s2
+; LA32-NEXT:    move $a5, $s1
+; LA32-NEXT:    move $a6, $s0
+; LA32-NEXT:    move $a7, $fp
+; LA32-NEXT:    ld.w $s6, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s5, $sp, 16 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s4, $sp, 20 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s3, $sp, 24 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s2, $sp, 28 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s1, $sp, 32 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s0, $sp, 36 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $fp, $sp, 40 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $ra, $sp, 44 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 48
+; LA32-NEXT:    b foo
+;
+; LA64-LABEL: bar:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    addi.d $sp, $sp, -96
+; LA64-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s0, $sp, 72 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s1, $sp, 64 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s2, $sp, 56 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s3, $sp, 48 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s4, $sp, 40 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s5, $sp, 32 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s6, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT:    move $fp, $a7
+; LA64-NEXT:    move $s0, $a6
+; LA64-NEXT:    move $s1, $a5
+; LA64-NEXT:    move $s2, $a4
+; LA64-NEXT:    move $s3, $a3
+; LA64-NEXT:    move $s4, $a2
+; LA64-NEXT:    move $s5, $a1
+; LA64-NEXT:    move $s6, $a0
+; LA64-NEXT:    ori $a0, $zero, 1
+; LA64-NEXT:    st.d $a0, $sp, 0
+; LA64-NEXT:    move $a0, $s6
+; LA64-NEXT:    pcaddu18i $ra, %call36(foo)
+; LA64-NEXT:    jirl $ra, $ra, 0
+; LA64-NEXT:    ori $a0, $zero, 2
+; LA64-NEXT:    st.d $a0, $sp, 96
+; LA64-NEXT:    move $a0, $s6
+; LA64-NEXT:    move $a1, $s5
+; LA64-NEXT:    move $a2, $s4
+; LA64-NEXT:    move $a3, $s3
+; LA64-NEXT:    move $a4, $s2
+; LA64-NEXT:    move $a5, $s1
+; LA64-NEXT:    move $a6, $s0
+; LA64-NEXT:    move $a7, $fp
+; LA64-NEXT:    ld.d $s6, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s5, $sp, 32 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s4, $sp, 40 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s3, $sp, 48 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s2, $sp, 56 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s1, $sp, 64 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s0, $sp, 72 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 96
+; LA64-NEXT:    pcaddu18i $t8, %call36(foo)
+; LA64-NEXT:    jr $t8
+entry:
+  call void @foo(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 1)
+  musttail call void @foo(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 2)
+  ret void
+}
+
+declare void @sret_callee(ptr sret({ double, double }) align 8)
+
+; Functions which return by sret can be tail-called because the incoming sret
+; pointer gets passed through to the callee.
+define void @sret_caller_tail(ptr sret({ double, double }) align 8 %result) {
+; LA32-LABEL: sret_caller_tail:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    b sret_callee
+;
+; LA64-LABEL: sret_caller_tail:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcaddu18i $t8, %call36(sret_callee)
+; LA64-NEXT:    jr $t8
+entry:
+  tail call void @sret_callee(ptr sret({ double, double }) align 8 %result)
+  ret void
+}
+
+define void @sret_caller_musttail(ptr sret({ double, double }) align 8 %result) {
+; LA32-LABEL: sret_caller_musttail:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    b sret_callee
+;
+; LA64-LABEL: sret_caller_musttail:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcaddu18i $t8, %call36(sret_callee)
+; LA64-NEXT:    jr $t8
+entry:
+  musttail call void @sret_callee(ptr sret({ double, double }) align 8 %result)
+  ret void
+}
+
+%twenty_bytes = type { [5 x i32] }
+declare void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4)
+
+; Functions with byval parameters can be tail-called, because the value is
+; actually passed in registers in the same way for the caller and callee.
+define void @large_caller(%twenty_bytes* byval(%twenty_bytes) align 4 %a) {
+; LA32-LABEL: large_caller:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    b large_callee
+;
+; LA64-LABEL: large_caller:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcaddu18i $t8, %call36(large_callee)
+; LA64-NEXT:    jr $t8
+entry:
+  musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %a)
+  ret void
+}
+
+; As above, but with some inline asm to test that the argument in $r4 is
+; re-loaded before the call.
+define void @large_caller_check_regs(%twenty_bytes* byval(%twenty_bytes) align 4 %a) nounwind {
+; LA32-LABEL: large_caller_check_regs:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    move $a1, $a0
+; LA32-NEXT:    #APP
+; LA32-NEXT:    #NO_APP
+; LA32-NEXT:    move $a0, $a1
+; LA32-NEXT:    b large_callee
+;
+; LA64-LABEL: large_caller_check_regs:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    move $a1, $a0
+; LA64-NEXT:    #APP
+; LA64-NEXT:    #NO_APP
+; LA64-NEXT:    move $a0, $a1
+; LA64-NEXT:    pcaddu18i $t8, %call36(large_callee)
+; LA64-NEXT:    jr $t8
+entry:
+  tail call void asm sideeffect "", "~{r4}"()
+  musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %a)
+  ret void
+}
+
+; The IR for this one looks dodgy, because it has an alloca passed to a
+; musttail function, but it is passed as a byval argument, so it will be
+; copied into the stack space allocated by @large_caller_new_value's caller,
+; so it is valid.
+define void @large_caller_new_value(%twenty_bytes* byval(%twenty_bytes) align 4 %a) nounwind {
+; LA32-LABEL: large_caller_new_value:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    addi.w $sp, $sp, -32
+; LA32-NEXT:    st.w $zero, $sp, 12
+; LA32-NEXT:    ori $a1, $zero, 1
+; LA32-NEXT:    st.w $a1, $sp, 16
+; LA32-NEXT:    ori $a2, $zero, 2
+; LA32-NEXT:    st.w $a2, $sp, 20
+; LA32-NEXT:    ori $a3, $zero, 3
+; LA32-NEXT:    st.w $a3, $sp, 24
+; LA32-NEXT:    ori $a4, $zero, 4
+; LA32-NEXT:    st.w $a4, $sp, 28
+; LA32-NEXT:    st.w $a4, $a0, 16
+; LA32-NEXT:    st.w $a3, $a0, 12
+; LA32-NEXT:    st.w $a2, $a0, 8
+; LA32-NEXT:    st.w $a1, $a0, 4
+; LA32-NEXT:    st.w $zero, $a0, 0
+; LA32-NEXT:    addi.w $sp, $sp, 32
+; LA32-NEXT:    b large_callee
+;
+; LA64-LABEL: large_caller_new_value:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    addi.d $sp, $sp, -32
+; LA64-NEXT:    ori $a1, $zero, 0
+; LA64-NEXT:    lu32i.d $a1, 1
+; LA64-NEXT:    st.d $a1, $sp, 12
+; LA64-NEXT:    ori $a1, $zero, 2
+; LA64-NEXT:    lu32i.d $a1, 3
+; LA64-NEXT:    st.d $a1, $sp, 20
+; LA64-NEXT:    ori $a1, $zero, 4
+; LA64-NEXT:    st.w $a1, $sp, 28
+; LA64-NEXT:    st.w $a1, $a0, 16
+; LA64-NEXT:    vld $vr0, $sp, 12
+; LA64-NEXT:    vst $vr0, $a0, 0
+; LA64-NEXT:    addi.d $sp, $sp, 32
+; LA64-NEXT:    pcaddu18i $t8, %call36(large_callee)
+; LA64-NEXT:    jr $t8
+entry:
+  %y = alloca %twenty_bytes, align 4
+  store i32 0, ptr %y, align 4
+  %0 = getelementptr inbounds i8, ptr %y, i32 4
+  store i32 1, ptr %0, align 4
+  %1 = getelementptr inbounds i8, ptr %y, i32 8
+  store i32 2, ptr %1, align 4
+  %2 = getelementptr inbounds i8, ptr %y, i32 12
+  store i32 3, ptr %2, align 4
+  %3 = getelementptr inbounds i8, ptr %y, i32 16
+  store i32 4, ptr %3, align 4
+  musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %y)
+  ret void
+}
+
+declare void @two_byvals_callee(%twenty_bytes* byval(%twenty_bytes) align 4, %twenty_bytes* byval(%twenty_bytes) align 4)
+define void @swap_byvals(%twenty_bytes* byval(%twenty_bytes) align 4 %a, %twenty_bytes* byval(%twenty_bytes) align 4 %b) {
+; LA32-LABEL: swap_byvals:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    move $a2, $a0
+; LA32-NEXT:    move $a0, $a1
+; LA32-NEXT:    move $a1, $a2
+; LA32-NEXT:    b two_byvals_callee
+;
+; LA64-LABEL: swap_byvals:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    move $a2, $a0
+; LA64-NEXT:    move $a0, $a1
+; LA64-NEXT:    move $a1, $a2
+; LA64-NEXT:    pcaddu18i $t8, %call36(two_byvals_callee)
+; LA64-NEXT:    jr $t8
+entry:
+  musttail call void @two_byvals_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %b, %twenty_bytes* byval(%twenty_bytes) align 4 %a)
+  ret void
+}
+
+; A forwarded byval arg, but in a different argument register, so it needs to
+; be moved between registers first. This can't be musttail because of the
+; different signatures, but is still tail-called as an optimisation.
+declare void @shift_byval_callee(%twenty_bytes* byval(%twenty_bytes) align 4)
+define void @shift_byval(i32 %a, %twenty_bytes* byval(%twenty_bytes) align 4 %b) {
+; LA32-LABEL: shift_byval:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    move $a0, $a1
+; LA32-NEXT:    b shift_byval_callee
+;
+; LA64-LABEL: shift_byval:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    move $a0, $a1
+; LA64-NEXT:    pcaddu18i $t8, %call36(shift_byval_callee)
+; LA64-NEXT:    jr $t8
+entry:
+  tail call void @shift_byval_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %b)
+  ret void
+}
+
+; A global object is passed to a byval argument, so it must be copied, but
+; doesn't need a stack temporary.
+@large_global = external global %twenty_bytes
+define void @large_caller_from_global(%twenty_bytes* byval(%twenty_bytes) align 4 %a) {
+; LA32-LABEL: large_caller_from_global:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    pcalau12i $a1, %got_pc_hi20(large_global)
+; LA32-NEXT:    ld.w $a1, $a1, %got_pc_lo12(large_global)
+; LA32-NEXT:    ld.w $a2, $a1, 16
+; LA32-NEXT:    st.w $a2, $a0, 16
+; LA32-NEXT:    ld.w $a2, $a1, 12
+; LA32-NEXT:    st.w $a2, $a0, 12
+; LA32-NEXT:    ld.w $a2, $a1, 8
+; LA32-NEXT:    st.w $a2, $a0, 8
+; LA32-NEXT:    ld.w $a2, $a1, 4
+; LA32-NEXT:    st.w $a2, $a0, 4
+; LA32-NEXT:    ld.w $a1, $a1, 0
+; LA32-NEXT:    st.w $a1, $a0, 0
+; LA32-NEXT:    b large_callee
+;
+; LA64-LABEL: large_caller_from_global:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcalau12i $a1, %got_pc_hi20(large_global)
+; LA64-NEXT:    ld.d $a1, $a1, %got_pc_lo12(large_global)
+; LA64-NEXT:    ld.w $a2, $a1, 16
+; LA64-NEXT:    st.w $a2, $a0, 16
+; LA64-NEXT:    vld $vr0, $a1, 0
+; LA64-NEXT:    vst $vr0, $a0, 0
+; LA64-NEXT:    pcaddu18i $t8, %call36(large_callee)
+; LA64-NEXT:    jr $t8
+entry:
+  musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 @large_global)
+  ret void
+}
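
The sret tests above have a straightforward source-level analogue (illustrative only, not from the patch): a function returning a large aggregate receives a hidden sret pointer, and returning another function's result forwards that pointer unchanged, which is exactly the pass-through case sret_caller_tail and sret_caller_musttail check:

struct Pair {
  double a, b;
};

Pair make();                   // callee writes through the caller-provided
                               // hidden sret pointer
Pair wrap() { return make(); } // sret pointer forwarded: tail-callable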
diff --git a/llvm/test/CodeGen/LoongArch/tail-calls.ll b/llvm/test/CodeGen/LoongArch/tail-calls.ll
index 533761c8a1c70..e14fbc2302cce 100644
--- a/llvm/test/CodeGen/LoongArch/tail-calls.ll
+++ b/llvm/test/CodeGen/LoongArch/tail-calls.ll
@@ -80,20 +80,15 @@ entry:
   ret void
 }
 
-;; Do not tail call optimize if stack is used to pass parameters.
+;; Perform tail call optimization if the callee's stack argument usage does not exceed the caller's.
 declare i32 @callee_args(i32 %a, i32 %b, i32 %c, i32 %dd, i32 %e, i32 %ff, i32 %g, i32 %h, i32 %i)
 define i32 @caller_args(i32 %a, i32 %b, i32 %c, i32 %dd, i32 %e, i32 %ff, i32 %g, i32 %h, i32 %i) nounwind {
 ; CHECK-LABEL: caller_args:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi.d $sp, $sp, -16
-; CHECK-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
-; CHECK-NEXT:    ld.d $t0, $sp, 16
+; CHECK-NEXT:    ld.d $t0, $sp, 0
 ; CHECK-NEXT:    st.d $t0, $sp, 0
-; CHECK-NEXT:    pcaddu18i $ra, %call36(callee_args)
-; CHECK-NEXT:    jirl $ra, $ra, 0
-; CHECK-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 16
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    pcaddu18i $t8, %call36(callee_args)
+; CHECK-NEXT:    jr $t8
 entry:
   %r = tail call i32 @callee_args(i32 %a, i32 %b, i32 %c, i32 %dd, i32 %e, i32 %ff, i32 %g, i32 %h, i32 %i)
   ret i32 %r
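
The tail-calls.ll update corresponds to the simplest source pattern (again illustrative, not part of the patch; assumes clang --target=loongarch64 -O2): with nine int parameters the ninth is passed on the stack, so caller and callee need the same 8 bytes of argument stack, and the forwarding call below can now drop its frame setup and lower to a plain jump (b / jr $t8) instead of a full call sequence.

int callee_args(int a, int b, int c, int d, int e, int f, int g, int h, int i);

// Same signature as the callee: per the CHECK lines above, the ninth
// argument is reloaded from the caller's incoming stack slot and stored
// back to the same offset before the jump.
int caller_args(int a, int b, int c, int d, int e, int f, int g, int h,
                int i) {
  return callee_args(a, b, c, d, e, f, g, h, i);
}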