Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions llvm/include/llvm/Target/TargetSelectionDAG.td
Original file line number Diff line number Diff line change
Expand Up @@ -918,6 +918,10 @@ def convergencectrl_loop : SDNode<"ISD::CONVERGENCECTRL_LOOP",
def convergencectrl_glue : SDNode<"ISD::CONVERGENCECTRL_GLUE",
SDTypeProfile<0, 1, [SDTCisVT<0, untyped>]>>;

// ISD::SPONENTRY — a node with no operands producing a single pointer-typed
// result (the stack pointer value on entry to the function, per the
// llvm.sponentry intrinsic this lowers from).
def sponentry : SDNode<
  "ISD::SPONENTRY", SDTypeProfile <1, 0, [SDTCisPtrTy<0>]>
>;

//===----------------------------------------------------------------------===//
// Selection DAG Condition Codes

Expand Down
7 changes: 7 additions & 0 deletions llvm/lib/IR/Verifier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7207,6 +7207,13 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
&Call);
break;
}
case Intrinsic::sponentry: {
const unsigned StackAS = DL.getAllocaAddrSpace();
const Type *RetTy = Call.getFunctionType()->getReturnType();
Check(RetTy->getPointerAddressSpace() == StackAS,
"llvm.sponentry must return a pointer to the stack", &Call);
break;
}
};

// Verify that there aren't any unmediated control transfers between funclets.
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUGISel.td
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,8 @@ def : GINodeEquiv<G_AMDGPU_WHOLE_WAVE_FUNC_SETUP, AMDGPUwhole_wave_setup>;
// G_AMDGPU_WHOLE_WAVE_FUNC_RETURN is simpler than AMDGPUwhole_wave_return,
// so we don't mark it as equivalent.

// Let GlobalISel reuse the SelectionDAG patterns written on the sponentry
// node when selecting G_AMDGPU_SPONENTRY.
def : GINodeEquiv<G_AMDGPU_SPONENTRY, sponentry>;

class GISelSop2Pat <
SDPatternOperator node,
Instruction inst,
Expand Down
19 changes: 19 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/PseudoSourceValueManager.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DiagnosticInfo.h"
Expand Down Expand Up @@ -7759,6 +7760,24 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
// Replace the use G_BRCOND with the exec manipulate and branch pseudos.
auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
switch (IntrID) {
case Intrinsic::sponentry:
if (B.getMF().getInfo<SIMachineFunctionInfo>()->isBottomOfStack()) {
// FIXME: The imported pattern checks for i32 instead of p5; if we fix
// that we can remove this cast.
const LLT S32 = LLT::scalar(32);
Comment on lines +7765 to +7767
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you add an explicit p5 does it work? I thought this was a solved problem

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure where to add the explicit p5. Did you mean in the TableGen definition?

The pattern in SIInstructions.td is already written with p5 for the output, but the generated code in AMDGPUGenGlobalISel.inc looks like this (note the GILLT_s32):

/* 2023706 */ // Label 168: @2023706
/* 2023706 */ GIM_Try, /*On fail goto*//*Label 29065*/ GIMT_Encode4(2023731), // Rule ID 4295 //
/* 2023711 */   GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s32,
/* 2023714 */   GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(AMDGPU::SGPR_32RegClassID),
/* 2023718 */   // (sponentry:{ *:[i32] })  =>  (GET_STACK_BASE:{ *:[i32] }:{ *:[i1] })
/* 2023718 */   GIR_MutateOpcode, /*InsnID*/0, /*RecycleInsnID*/0, /*Opcode*/GIMT_Encode2(AMDGPU::GET_STACK_BASE),
/* 2023723 */   GIR_AddImplicitDef, /*InsnID*/0, GIMT_Encode2(AMDGPU::SCC), GIMT_Encode2(static_cast<unsigned>(RegState::Dead)),
/* 2023729 */   GIR_RootConstrainSelectedInstOperands,
/* 2023730 */   // GIR_Coverage, 4295,
/* 2023730 */   GIR_Done,

Did you have some workaround in mind?

For reference, this is what I was trying to select without the cast: LLVM ERROR: cannot select: %2:sreg_32(p5) = G_AMDGPU_SPONENTRY (in function: sponentry_cs_dvgpr_16).

Register TmpReg = MRI.createGenericVirtualRegister(S32);
B.buildInstr(AMDGPU::G_AMDGPU_SPONENTRY).addDef(TmpReg);

Register DstReg = MI.getOperand(0).getReg();
B.buildIntToPtr(DstReg, TmpReg);
MI.eraseFromParent();
} else {
int FI = B.getMF().getFrameInfo().CreateFixedObject(
1, 0, /*IsImmutable=*/false);
B.buildFrameIndex(MI.getOperand(0), FI);
MI.eraseFromParent();
}
return true;
case Intrinsic::amdgcn_if:
case Intrinsic::amdgcn_else: {
MachineInstr *Br = nullptr;
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4585,6 +4585,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
break;
case AMDGPU::G_AMDGPU_SPONENTRY: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
}
case AMDGPU::G_INTRINSIC:
case AMDGPU::G_INTRINSIC_CONVERGENT: {
switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
Expand Down
9 changes: 1 addition & 8 deletions llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -724,14 +724,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
FrameInfo.getMaxAlign());
MFI->setScratchReservedForDynamicVGPRs(VGPRSize);

BuildMI(MBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), FPReg)
.addImm(AMDGPU::Hwreg::HwregEncoding::encode(
AMDGPU::Hwreg::ID_HW_ID2, AMDGPU::Hwreg::OFFSET_ME_ID, 2));
// The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
// (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
// SCC, so we need to check for 0 manually.
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(FPReg);
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMOVK_I32), FPReg).addImm(VGPRSize);
BuildMI(MBB, I, DL, TII->get(AMDGPU::GET_STACK_BASE), FPReg);
if (requiresStackPointerReference(MF)) {
Register SPReg = MFI->getStackPtrOffsetReg();
assert(SPReg != AMDGPU::SP_REG);
Expand Down
16 changes: 16 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6894,6 +6894,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerBRCOND(Op, DAG);
case ISD::RETURNADDR:
return LowerRETURNADDR(Op, DAG);
case ISD::SPONENTRY:
return LowerSPONENTRY(Op, DAG);
case ISD::LOAD: {
SDValue Result = LowerLOAD(Op, DAG);
assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
Expand Down Expand Up @@ -7907,6 +7909,20 @@ SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
}

/// Lower ISD::SPONENTRY (llvm.sponentry).
///
/// Functions that set up their own stack (bottom-of-stack functions) keep the
/// node unchanged so instruction selection can match it to the GET_STACK_BASE
/// pseudo. All other functions get a dummy fixed stack object at offset 0 and
/// return its frame index, which later resolves to the entry stack address.
SDValue SITargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();

  if (MF.getInfo<SIMachineFunctionInfo>()->isBottomOfStack())
    return Op;

  int FrameIdx =
      MF.getFrameInfo().CreateFixedObject(1, 0, /*IsImmutable=*/false);
  return DAG.getFrameIndex(FrameIdx, Op.getValueType());
}

SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
const SDLoc &DL, EVT VT) const {
return Op.getValueType().bitsLE(VT)
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const;
SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M,
SelectionDAG &DAG, ArrayRef<SDValue> Ops,
bool IsIntrinsic = false) const;
Expand Down
32 changes: 32 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2560,6 +2560,38 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
break;
}

case AMDGPU::GET_STACK_BASE:
// The stack starts at offset 0 unless we need to reserve some space at the
// bottom.
if (ST.getFrameLowering()->mayReserveScratchForCWSR(*MBB.getParent())) {
// When CWSR is used in dynamic VGPR mode, the trap handler needs to save
// some of the VGPRs. The size of the required scratch space has already
// been computed by prolog epilog insertion.
const SIMachineFunctionInfo *MFI =
MBB.getParent()->getInfo<SIMachineFunctionInfo>();
unsigned VGPRSize = MFI->getScratchReservedForDynamicVGPRs();
Register DestReg = MI.getOperand(0).getReg();
BuildMI(MBB, MI, DL, get(AMDGPU::S_GETREG_B32), DestReg)
.addImm(AMDGPU::Hwreg::HwregEncoding::encode(
AMDGPU::Hwreg::ID_HW_ID2, AMDGPU::Hwreg::OFFSET_ME_ID, 2));
// The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
// (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
// SCC, so we need to check for 0 manually.
BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(DestReg);
    // Change the implicit-def of SCC to an explicit use (but first remove
// the dead flag if present).
MI.getOperand(MI.getNumExplicitOperands()).setIsDead(false);
MI.getOperand(MI.getNumExplicitOperands()).setIsUse();
MI.setDesc(get(AMDGPU::S_CMOVK_I32));
MI.addOperand(MachineOperand::CreateImm(VGPRSize));
} else {
MI.setDesc(get(AMDGPU::S_MOV_B32));
MI.addOperand(MachineOperand::CreateImm(0));
MI.removeOperand(
MI.getNumExplicitOperands()); // Drop implicit def of SCC.
}
break;
}

return true;
Expand Down
18 changes: 16 additions & 2 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -928,6 +928,7 @@ multiclass si_cs_chain_tc_dvgpr_patterns<

defm : si_cs_chain_tc_dvgpr_patterns<i32>; // On GFX12, dVGPR mode is wave32-only.

let Defs = [SCC] in {
def ADJCALLSTACKUP : SPseudoInstSI<
(outs), (ins i32imm:$amt0, i32imm:$amt1),
[(callseq_start timm:$amt0, timm:$amt1)],
Expand All @@ -937,7 +938,6 @@ def ADJCALLSTACKUP : SPseudoInstSI<
let hasSideEffects = 1;
let usesCustomInserter = 1;
let SchedRW = [WriteSALU];
let Defs = [SCC];
}

def ADJCALLSTACKDOWN : SPseudoInstSI<
Expand All @@ -948,9 +948,16 @@ def ADJCALLSTACKDOWN : SPseudoInstSI<
let hasSideEffects = 1;
let usesCustomInserter = 1;
let SchedRW = [WriteSALU];
let Defs = [SCC];
}

// Get the offset of the base of the stack, skipping any reserved areas.
// Expanded post-RA (SIInstrInfo::expandPostRAPseudo) into either a plain
// s_mov_b32 0, or — when scratch is reserved for dynamic VGPRs under CWSR —
// an s_getreg / s_cmp / s_cmovk sequence, which is why SCC is in Defs here.
def GET_STACK_BASE : SPseudoInstSI<(outs SGPR_32:$dst), (ins),
    [(set p5:$dst, (sponentry))]> {
  let Size = 16; // Worst case (s_getreg, s_cmp, s_cmovk + 32-bit literal).
  let SchedRW = [WriteSALU];
}
} // End Defs = [SCC]

let Defs = [M0, EXEC, SCC],
UseNamedOperandTable = 1 in {

Expand Down Expand Up @@ -4828,6 +4835,13 @@ def G_AMDGPU_READANYLANE : AMDGPUGenericInstruction {
let hasSideEffects = 0;
}

// llvm.sponentry
// Target-specific generic instruction: no inputs, one result of generic type
// type0 (a pointer in practice, see the sponentry SDNode), no side effects.
def G_AMDGPU_SPONENTRY : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins);
  let hasSideEffects = 0;
}

//============================================================================//
// Dummy Instructions
//============================================================================//
Expand Down
Loading