Skip to content

Commit 9022f47

Browse files
authored
[AMDGPU] Implement llvm.sponentry (#176357)
In some of our use cases, the GPU runtime stores some data at the top of the stack. It figures out where it's safe to store it by using the PAL metadata generated by the backend, which includes the total stack size. However, the metadata does not include the space reserved at the bottom of the stack for the trap handler when CWSR is enabled in dynamic VGPR mode. This space is reserved dynamically based on whether or not the code is running on the compute queue. Therefore, the runtime needs a way to take that into account. Add support for `llvm.sponentry`, which should return the base of the stack, skipping over any reserved areas. This allows us to keep this computation in one place rather than duplicate it between the backend and the runtime. The implementation for functions that set up their own stack uses a pseudo that is expanded to the same code sequence as that used in the prolog to set up the stack in the first place. In callable functions, we generate a fixed stack object and use that instead, similar to the Arm/AArch64 approach. This wastes some stack space but that's not a problem for now because we're not planning to use this in callable functions yet.
1 parent b36d14d commit 9022f47

File tree

12 files changed

+565
-10
lines changed

12 files changed

+565
-10
lines changed

llvm/include/llvm/Target/TargetSelectionDAG.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -918,6 +918,10 @@ def convergencectrl_loop : SDNode<"ISD::CONVERGENCECTRL_LOOP",
918918
def convergencectrl_glue : SDNode<"ISD::CONVERGENCECTRL_GLUE",
919919
SDTypeProfile<0, 1, [SDTCisVT<0, untyped>]>>;
920920

921+
def sponentry : SDNode<
922+
"ISD::SPONENTRY", SDTypeProfile <1, 0, [SDTCisPtrTy<0>]>
923+
>;
924+
921925
//===----------------------------------------------------------------------===//
922926
// Selection DAG Condition Codes
923927

llvm/lib/IR/Verifier.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7207,6 +7207,13 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
72077207
&Call);
72087208
break;
72097209
}
7210+
case Intrinsic::sponentry: {
7211+
const unsigned StackAS = DL.getAllocaAddrSpace();
7212+
const Type *RetTy = Call.getFunctionType()->getReturnType();
7213+
Check(RetTy->getPointerAddressSpace() == StackAS,
7214+
"llvm.sponentry must return a pointer to the stack", &Call);
7215+
break;
7216+
}
72107217
};
72117218

72127219
// Verify that there aren't any unmediated control transfers between funclets.

llvm/lib/Target/AMDGPU/AMDGPUGISel.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,8 @@ def : GINodeEquiv<G_AMDGPU_WHOLE_WAVE_FUNC_SETUP, AMDGPUwhole_wave_setup>;
329329
// G_AMDGPU_WHOLE_WAVE_FUNC_RETURN is simpler than AMDGPUwhole_wave_return,
330330
// so we don't mark it as equivalent.
331331

332+
def : GINodeEquiv<G_AMDGPU_SPONENTRY, sponentry>;
333+
332334
class GISelSop2Pat <
333335
SDPatternOperator node,
334336
Instruction inst,

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
3131
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
3232
#include "llvm/CodeGen/GlobalISel/Utils.h"
33+
#include "llvm/CodeGen/MachineFrameInfo.h"
3334
#include "llvm/CodeGen/PseudoSourceValueManager.h"
3435
#include "llvm/CodeGen/TargetOpcodes.h"
3536
#include "llvm/IR/DiagnosticInfo.h"
@@ -7759,6 +7760,24 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
77597760
// Replace the use G_BRCOND with the exec manipulate and branch pseudos.
77607761
auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
77617762
switch (IntrID) {
7763+
case Intrinsic::sponentry:
7764+
if (B.getMF().getInfo<SIMachineFunctionInfo>()->isBottomOfStack()) {
7765+
// FIXME: The imported pattern checks for i32 instead of p5; if we fix
7766+
// that we can remove this cast.
7767+
const LLT S32 = LLT::scalar(32);
7768+
Register TmpReg = MRI.createGenericVirtualRegister(S32);
7769+
B.buildInstr(AMDGPU::G_AMDGPU_SPONENTRY).addDef(TmpReg);
7770+
7771+
Register DstReg = MI.getOperand(0).getReg();
7772+
B.buildIntToPtr(DstReg, TmpReg);
7773+
MI.eraseFromParent();
7774+
} else {
7775+
int FI = B.getMF().getFrameInfo().CreateFixedObject(
7776+
1, 0, /*IsImmutable=*/false);
7777+
B.buildFrameIndex(MI.getOperand(0), FI);
7778+
MI.eraseFromParent();
7779+
}
7780+
return true;
77627781
case Intrinsic::amdgcn_if:
77637782
case Intrinsic::amdgcn_else: {
77647783
MachineInstr *Br = nullptr;

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4585,6 +4585,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
45854585
OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
45864586
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
45874587
break;
4588+
case AMDGPU::G_AMDGPU_SPONENTRY: {
4589+
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4590+
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4591+
break;
4592+
}
45884593
case AMDGPU::G_INTRINSIC:
45894594
case AMDGPU::G_INTRINSIC_CONVERGENT: {
45904595
switch (cast<GIntrinsic>(MI).getIntrinsicID()) {

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -724,14 +724,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
724724
FrameInfo.getMaxAlign());
725725
MFI->setScratchReservedForDynamicVGPRs(VGPRSize);
726726

727-
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), FPReg)
728-
.addImm(AMDGPU::Hwreg::HwregEncoding::encode(
729-
AMDGPU::Hwreg::ID_HW_ID2, AMDGPU::Hwreg::OFFSET_ME_ID, 2));
730-
// The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
731-
// (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
732-
// SCC, so we need to check for 0 manually.
733-
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(FPReg);
734-
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMOVK_I32), FPReg).addImm(VGPRSize);
727+
BuildMI(MBB, I, DL, TII->get(AMDGPU::GET_STACK_BASE), FPReg);
735728
if (requiresStackPointerReference(MF)) {
736729
Register SPReg = MFI->getStackPtrOffsetReg();
737730
assert(SPReg != AMDGPU::SP_REG);

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6985,6 +6985,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
69856985
return LowerBRCOND(Op, DAG);
69866986
case ISD::RETURNADDR:
69876987
return LowerRETURNADDR(Op, DAG);
6988+
case ISD::SPONENTRY:
6989+
return LowerSPONENTRY(Op, DAG);
69886990
case ISD::LOAD: {
69896991
SDValue Result = LowerLOAD(Op, DAG);
69906992
assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
@@ -7998,6 +8000,20 @@ SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
79988000
return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
79998001
}
80008002

8003+
SDValue SITargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
8004+
MachineFunction &MF = DAG.getMachineFunction();
8005+
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
8006+
8007+
// For functions that set up their own stack, select the GET_STACK_BASE
8008+
// pseudo.
8009+
if (MFI->isBottomOfStack())
8010+
return Op;
8011+
8012+
// For everything else, create a dummy stack object.
8013+
int FI = MF.getFrameInfo().CreateFixedObject(1, 0, /*IsImmutable=*/false);
8014+
return DAG.getFrameIndex(FI, Op.getValueType());
8015+
}
8016+
80018017
SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
80028018
const SDLoc &DL, EVT VT) const {
80038019
return Op.getValueType().bitsLE(VT)

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
132132
SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
133133
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
134134
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
135+
SDValue LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const;
135136
SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M,
136137
SelectionDAG &DAG, ArrayRef<SDValue> Ops,
137138
bool IsIntrinsic = false) const;

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2567,6 +2567,38 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
25672567
Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
25682568
break;
25692569
}
2570+
2571+
case AMDGPU::GET_STACK_BASE:
2572+
// The stack starts at offset 0 unless we need to reserve some space at the
2573+
// bottom.
2574+
if (ST.getFrameLowering()->mayReserveScratchForCWSR(*MBB.getParent())) {
2575+
// When CWSR is used in dynamic VGPR mode, the trap handler needs to save
2576+
// some of the VGPRs. The size of the required scratch space has already
2577+
// been computed by prolog epilog insertion.
2578+
const SIMachineFunctionInfo *MFI =
2579+
MBB.getParent()->getInfo<SIMachineFunctionInfo>();
2580+
unsigned VGPRSize = MFI->getScratchReservedForDynamicVGPRs();
2581+
Register DestReg = MI.getOperand(0).getReg();
2582+
BuildMI(MBB, MI, DL, get(AMDGPU::S_GETREG_B32), DestReg)
2583+
.addImm(AMDGPU::Hwreg::HwregEncoding::encode(
2584+
AMDGPU::Hwreg::ID_HW_ID2, AMDGPU::Hwreg::OFFSET_ME_ID, 2));
2585+
// The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
2586+
// (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
2587+
// SCC, so we need to check for 0 manually.
2588+
BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(DestReg);
2589+
// Change the implicit-def of SCC to an explicit use (but first remove
2590+
// the dead flag if present).
2591+
MI.getOperand(MI.getNumExplicitOperands()).setIsDead(false);
2592+
MI.getOperand(MI.getNumExplicitOperands()).setIsUse();
2593+
MI.setDesc(get(AMDGPU::S_CMOVK_I32));
2594+
MI.addOperand(MachineOperand::CreateImm(VGPRSize));
2595+
} else {
2596+
MI.setDesc(get(AMDGPU::S_MOV_B32));
2597+
MI.addOperand(MachineOperand::CreateImm(0));
2598+
MI.removeOperand(
2599+
MI.getNumExplicitOperands()); // Drop implicit def of SCC.
2600+
}
2601+
break;
25702602
}
25712603

25722604
return true;

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -930,6 +930,7 @@ multiclass si_cs_chain_tc_dvgpr_patterns<
930930

931931
defm : si_cs_chain_tc_dvgpr_patterns<i32>; // On GFX12, dVGPR mode is wave32-only.
932932

933+
let Defs = [SCC] in {
933934
def ADJCALLSTACKUP : SPseudoInstSI<
934935
(outs), (ins i32imm:$amt0, i32imm:$amt1),
935936
[(callseq_start timm:$amt0, timm:$amt1)],
@@ -939,7 +940,6 @@ def ADJCALLSTACKUP : SPseudoInstSI<
939940
let hasSideEffects = 1;
940941
let usesCustomInserter = 1;
941942
let SchedRW = [WriteSALU];
942-
let Defs = [SCC];
943943
}
944944

945945
def ADJCALLSTACKDOWN : SPseudoInstSI<
@@ -950,9 +950,16 @@ def ADJCALLSTACKDOWN : SPseudoInstSI<
950950
let hasSideEffects = 1;
951951
let usesCustomInserter = 1;
952952
let SchedRW = [WriteSALU];
953-
let Defs = [SCC];
954953
}
955954

955+
// Get the offset of the base of the stack, skipping any reserved areas.
956+
def GET_STACK_BASE : SPseudoInstSI<(outs SGPR_32:$dst), (ins),
957+
[(set p5:$dst, (sponentry))]> {
958+
let Size = 16; // Worst case (s_getreg, s_cmp, s_cselect + constant).
959+
let SchedRW = [WriteSALU];
960+
}
961+
} // End Defs = [SCC]
962+
956963
let Defs = [M0, EXEC, SCC],
957964
UseNamedOperandTable = 1 in {
958965

@@ -4830,6 +4837,13 @@ def G_AMDGPU_READANYLANE : AMDGPUGenericInstruction {
48304837
let hasSideEffects = 0;
48314838
}
48324839

4840+
// llvm.sponentry
4841+
def G_AMDGPU_SPONENTRY : AMDGPUGenericInstruction {
4842+
let OutOperandList = (outs type0:$dst);
4843+
let InOperandList = (ins);
4844+
let hasSideEffects = 0;
4845+
}
4846+
48334847
//============================================================================//
48344848
// Dummy Instructions
48354849
//============================================================================//

0 commit comments

Comments
 (0)