Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -3799,6 +3799,12 @@ def int_amdgcn_cooperative_atomic_store_16x8B : AMDGPUCooperativeAtomicStore<llv
def int_amdgcn_cooperative_atomic_load_8x16B : AMDGPUCooperativeAtomicLoad<llvm_v4i32_ty>;
def int_amdgcn_cooperative_atomic_store_8x16B : AMDGPUCooperativeAtomicStore<llvm_v4i32_ty>;

// Return the offset for the actual base of the stack, skipping over any
// reserved areas (e.g. the area reserved for saving the dynamic VGPRs when CWSR
// is active). The returned value only makes sense in functions that set up
// their own stack.
def int_amdgcn_get_stack_base : PureIntrinsic<[llvm_i32_ty]>;

//===----------------------------------------------------------------------===//
// Special Intrinsics for backend internal use only. No frontend
// should emit calls to these.
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4882,6 +4882,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI))
return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);
case Intrinsic::amdgcn_get_stack_base:
case Intrinsic::amdgcn_kernarg_segment_ptr:
case Intrinsic::amdgcn_s_getpc:
case Intrinsic::amdgcn_groupstaticsize:
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,7 @@ def : AlwaysUniform<int_amdgcn_s_getpc>;
def : AlwaysUniform<int_amdgcn_s_getreg>;
def : AlwaysUniform<int_amdgcn_s_memrealtime>;
def : AlwaysUniform<int_amdgcn_s_memtime>;
def : AlwaysUniform<int_amdgcn_get_stack_base>;

def AMDGPUImageDMaskIntrinsicTable : GenericTable {
let FilterClass = "AMDGPUImageDMaskIntrinsic";
Expand Down
9 changes: 1 addition & 8 deletions llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -724,14 +724,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
FrameInfo.getMaxAlign());
MFI->setScratchReservedForDynamicVGPRs(VGPRSize);

BuildMI(MBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), FPReg)
.addImm(AMDGPU::Hwreg::HwregEncoding::encode(
AMDGPU::Hwreg::ID_HW_ID2, AMDGPU::Hwreg::OFFSET_ME_ID, 2));
// The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
// (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
// SCC, so we need to check for 0 manually.
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(FPReg);
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMOVK_I32), FPReg).addImm(VGPRSize);
BuildMI(MBB, I, DL, TII->get(AMDGPU::GET_STACK_BASE), FPReg);
if (requiresStackPointerReference(MF)) {
Register SPReg = MFI->getStackPtrOffsetReg();
assert(SPReg != AMDGPU::SP_REG);
Expand Down
34 changes: 33 additions & 1 deletion llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2537,7 +2537,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
}
break;

case AMDGPU::V_MAX_BF16_PSEUDO_e64:
case AMDGPU::V_MAX_BF16_PSEUDO_e64: {
assert(ST.hasBF16PackedInsts());
MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
Expand All @@ -2550,6 +2550,38 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
break;
}

case AMDGPU::GET_STACK_BASE:
// The stack starts at offset 0 unless we need to reserve some space at the
// bottom.
if (ST.getFrameLowering()->mayReserveScratchForCWSR(*MBB.getParent())) {
// When CWSR is used in dynamic VGPR mode, the trap handler needs to save
// some of the VGPRs. The size of the required scratch space has already
// been computed by prolog epilog insertion.
const SIMachineFunctionInfo *MFI =
MBB.getParent()->getInfo<SIMachineFunctionInfo>();
unsigned VGPRSize = MFI->getScratchReservedForDynamicVGPRs();
Register DestReg = MI.getOperand(0).getReg();
BuildMI(MBB, MI, DL, get(AMDGPU::S_GETREG_B32), DestReg)
.addImm(AMDGPU::Hwreg::HwregEncoding::encode(
AMDGPU::Hwreg::ID_HW_ID2, AMDGPU::Hwreg::OFFSET_ME_ID, 2));
// The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
// (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
// SCC, so we need to check for 0 manually.
BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(DestReg);
MI.setDesc(get(AMDGPU::S_CMOVK_I32));
MI.addOperand(MachineOperand::CreateImm(VGPRSize));
// Change the implicit-def of SCC to an implicit use (but first remove
// the dead flag if present).
MI.getOperand(MI.getNumExplicitOperands()).setIsDead(false);
MI.getOperand(MI.getNumExplicitOperands()).setIsUse();
} else {
MI.setDesc(get(AMDGPU::S_MOV_B32));
MI.addOperand(MachineOperand::CreateImm(0));
MI.removeOperand(MI.getNumExplicitOperands()); // Drop implicit def of SCC.
}
break;
}

return true;
}

Expand Down
11 changes: 9 additions & 2 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -926,6 +926,7 @@ multiclass si_cs_chain_tc_dvgpr_patterns<

defm : si_cs_chain_tc_dvgpr_patterns<i32>; // On GFX12, dVGPR mode is wave32-only.

let Defs = [SCC] in {
def ADJCALLSTACKUP : SPseudoInstSI<
(outs), (ins i32imm:$amt0, i32imm:$amt1),
[(callseq_start timm:$amt0, timm:$amt1)],
Expand All @@ -935,7 +936,6 @@ def ADJCALLSTACKUP : SPseudoInstSI<
let hasSideEffects = 1;
let usesCustomInserter = 1;
let SchedRW = [WriteSALU];
let Defs = [SCC];
}

def ADJCALLSTACKDOWN : SPseudoInstSI<
Expand All @@ -946,9 +946,16 @@ def ADJCALLSTACKDOWN : SPseudoInstSI<
let hasSideEffects = 1;
let usesCustomInserter = 1;
let SchedRW = [WriteSALU];
let Defs = [SCC];
}

// Get the offset of the base of the stack, skipping any reserved areas.
// This pseudo is expanded in SIInstrInfo::expandPostRAPseudo: when scratch may
// be reserved for CWSR it becomes S_GETREG_B32 + S_CMP_LG_U32 + S_CMOVK_I32,
// otherwise it becomes S_MOV_B32 0. It is declared under Defs = [SCC] because
// the first expansion emits a compare; the second drops the SCC def again.
def GET_STACK_BASE : SPseudoInstSI<(outs SGPR_32:$dst), (ins),
[(set SGPR_32:$dst, (int_amdgcn_get_stack_base))]> {
let hasSideEffects = 0;
let SchedRW = [WriteSALU];
}
} // End Defs = [SCC]

let Defs = [M0, EXEC, SCC],
UseNamedOperandTable = 1 in {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,13 @@ define void @s_memrealtime(ptr addrspace(1) inreg %out) {
ret void
}

; CHECK-LABEL: for function 'get_stack_base':
; CHECK: ALL VALUES UNIFORM
define amdgpu_cs void @get_stack_base(ptr addrspace(1) inreg %out) {
%v = call i32 @llvm.amdgcn.get.stack.base()
store i32 %v, ptr addrspace(1) %out
ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #0
declare i32 @llvm.amdgcn.readfirstlane(i32) #0
Expand All @@ -216,6 +223,7 @@ declare i32 @llvm.amdgcn.cluster.workgroup.max.id.x()
declare i32 @llvm.amdgcn.cluster.workgroup.max.id.y()
declare i32 @llvm.amdgcn.cluster.workgroup.max.id.z()
declare i32 @llvm.amdgcn.cluster.workgroup.max.flat.id()
declare i32 @llvm.amdgcn.get.stack.base()

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind readnone convergent }
Expand Down
101 changes: 101 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.get.stack.base.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck %s

; Test that the llvm.amdgcn.get.stack.base intrinsic returns the correct value:
; - for functions that need to reserve space for CWSR, it should return the offset
; past the reserved area (i.e. the offset of the first spill or local variables)
; - for functions that don't reserve any space, it should return 0

define amdgpu_cs i32 @stack_base_cs_dvgpr_16(i32 %val) #0 {
; CHECK-LABEL: stack_base_cs_dvgpr_16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2)
; CHECK-NEXT: s_getreg_b32 s0, hwreg(HW_REG_WAVE_HW_ID2, 8, 2)
; CHECK-NEXT: s_cmp_lg_u32 0, s33
; CHECK-NEXT: s_cmovk_i32 s33, 0x1c0
; CHECK-NEXT: s_cmp_lg_u32 0, s0
; CHECK-NEXT: scratch_store_b32 off, v0, s33 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: s_cmovk_i32 s0, 0x1c0
; CHECK-NEXT: ; return to shader part epilog
%local = alloca i32, addrspace(5)
store volatile i32 %val, ptr addrspace(5) %local
%stack.base = call i32 @llvm.amdgcn.get.stack.base()
ret i32 %stack.base
}

define amdgpu_cs i32 @stack_base_cs_dvgpr_32(i32 %val) #1 {
; CHECK-LABEL: stack_base_cs_dvgpr_32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2)
; CHECK-NEXT: s_getreg_b32 s0, hwreg(HW_REG_WAVE_HW_ID2, 8, 2)
; CHECK-NEXT: s_cmp_lg_u32 0, s33
; CHECK-NEXT: s_cmovk_i32 s33, 0x380
; CHECK-NEXT: s_cmp_lg_u32 0, s0
; CHECK-NEXT: scratch_store_b32 off, v0, s33 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: s_cmovk_i32 s0, 0x380
; CHECK-NEXT: ; return to shader part epilog
%local = alloca i32, addrspace(5)
store volatile i32 %val, ptr addrspace(5) %local
%stack.base = call i32 @llvm.amdgcn.get.stack.base()
ret i32 %stack.base
}

define amdgpu_cs i32 @stack_base_cs_no_dvgpr(i32 %val) #2 {
; CHECK-LABEL: stack_base_cs_no_dvgpr:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: scratch_store_b32 off, v0, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: ; return to shader part epilog
%local = alloca i32, addrspace(5)
store volatile i32 %val, ptr addrspace(5) %local
%stack.base = call i32 @llvm.amdgcn.get.stack.base()
ret i32 %stack.base
}

; Same as stack_base_cs_dvgpr_16, but the intrinsic is called in a conditional
; block (%if.then) rather than in the entry block, so its expansion is checked
; separately from the frame setup emitted in the prologue.
define amdgpu_cs i32 @stack_base_cs_dvgpr_control_flow(i32 %val) #0 {
; CHECK-LABEL: stack_base_cs_dvgpr_control_flow:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2)
; CHECK-NEXT: s_mov_b32 s0, exec_lo
; CHECK-NEXT: s_cmp_lg_u32 0, s33
; CHECK-NEXT: s_cmovk_i32 s33, 0x1c0
; CHECK-NEXT: scratch_store_b32 off, v0, s33 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: v_cmpx_gt_i32_e32 0x43, v0
; CHECK-NEXT: ; %bb.1: ; %if.then
; CHECK-NEXT: s_getreg_b32 s1, hwreg(HW_REG_WAVE_HW_ID2, 8, 2)
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; CHECK-NEXT: s_cmp_lg_u32 0, s1
; CHECK-NEXT: s_cmovk_i32 s1, 0x1c0
; CHECK-NEXT: v_mov_b32_e32 v0, s1
; CHECK-NEXT: ; %bb.2: ; %if.end
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s0
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: s_wait_alu depctr_va_sdst(0)
; CHECK-NEXT: ; return to shader part epilog
entry:
%local = alloca i32, addrspace(5)
store volatile i32 %val, ptr addrspace(5) %local
%which = icmp slt i32 %val, 67
br i1 %which, label %if.then, label %if.end

if.then:
%stack.base = call i32 @llvm.amdgcn.get.stack.base()
br label %if.end

if.end:
%ret = phi i32 [ %stack.base, %if.then ], [ %val, %entry ]
ret i32 %ret
}


attributes #0 = { nounwind "amdgpu-dynamic-vgpr-block-size"="16" }
attributes #1 = { nounwind "amdgpu-dynamic-vgpr-block-size"="32" }
attributes #2 = { nounwind "amdgpu-dynamic-vgpr-block-size"="0" }
Loading