llvm · rovka · Feb 3, 2026 · Jan 15, 2026 · Jan 16, 2026 · Jan 22, 2026
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3799,6 +3799,12 @@ def int_amdgcn_cooperative_atomic_store_16x8B : AMDGPUCooperativeAtomicStore<llv
 def int_amdgcn_cooperative_atomic_load_8x16B : AMDGPUCooperativeAtomicLoad<llvm_v4i32_ty>;
 def int_amdgcn_cooperative_atomic_store_8x16B : AMDGPUCooperativeAtomicStore<llvm_v4i32_ty>;
 
+// Return the offset for the actual base of the stack, skipping over any
+// reserved areas (e.g. the area reserved for saving the dynamic VGPRs when CWSR
+// is active). The returned value only makes sense in functions that set up
+// their own stack.
+def int_amdgcn_get_stack_base : PureIntrinsic<[llvm_i32_ty]>;
+
 //===----------------------------------------------------------------------===//
 // Special Intrinsics for backend internal use only. No frontend
 // should emit calls to these.

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4882,6 +4882,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI))
         return getDefaultMappingSOP(MI);
       return getDefaultMappingVOP(MI);
+    case Intrinsic::amdgcn_get_stack_base:
     case Intrinsic::amdgcn_kernarg_segment_ptr:
     case Intrinsic::amdgcn_s_getpc:
     case Intrinsic::amdgcn_groupstaticsize:

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -411,6 +411,7 @@ def : AlwaysUniform<int_amdgcn_s_getpc>;
 def : AlwaysUniform<int_amdgcn_s_getreg>;
 def : AlwaysUniform<int_amdgcn_s_memrealtime>;
 def : AlwaysUniform<int_amdgcn_s_memtime>;
+def : AlwaysUniform<int_amdgcn_get_stack_base>;
 
 def AMDGPUImageDMaskIntrinsicTable : GenericTable {
   let FilterClass = "AMDGPUImageDMaskIntrinsic";

diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -724,14 +724,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
         FrameInfo.getMaxAlign());
     MFI->setScratchReservedForDynamicVGPRs(VGPRSize);
 
-    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), FPReg)
-        .addImm(AMDGPU::Hwreg::HwregEncoding::encode(
-            AMDGPU::Hwreg::ID_HW_ID2, AMDGPU::Hwreg::OFFSET_ME_ID, 2));
-    // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
-    // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
-    // SCC, so we need to check for 0 manually.
-    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(FPReg);
-    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMOVK_I32), FPReg).addImm(VGPRSize);
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::GET_STACK_BASE), FPReg);
     if (requiresStackPointerReference(MF)) {
       Register SPReg = MFI->getStackPtrOffsetReg();
       assert(SPReg != AMDGPU::SP_REG);

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2537,7 +2537,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     }
     break;
 
-  case AMDGPU::V_MAX_BF16_PSEUDO_e64:
+  case AMDGPU::V_MAX_BF16_PSEUDO_e64: {
     assert(ST.hasBF16PackedInsts());
     MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
     MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
@@ -2550,6 +2550,38 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     break;
   }
 
+  case AMDGPU::GET_STACK_BASE:
+    // The stack starts at offset 0 unless we need to reserve some space at the
+    // bottom.
+    if (ST.getFrameLowering()->mayReserveScratchForCWSR(*MBB.getParent())) {
+      // When CWSR is used in dynamic VGPR mode, the trap handler needs to save
+      // some of the VGPRs. The size of the required scratch space has already
+      // been computed by prolog epilog insertion.
+      const SIMachineFunctionInfo *MFI =
+          MBB.getParent()->getInfo<SIMachineFunctionInfo>();
+      unsigned VGPRSize = MFI->getScratchReservedForDynamicVGPRs();
+      Register DestReg = MI.getOperand(0).getReg();
+      BuildMI(MBB, MI, DL, get(AMDGPU::S_GETREG_B32), DestReg)
+          .addImm(AMDGPU::Hwreg::HwregEncoding::encode(
+              AMDGPU::Hwreg::ID_HW_ID2, AMDGPU::Hwreg::OFFSET_ME_ID, 2));
+      // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
+      // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
+      // SCC, so we need to check for 0 manually.
+      BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(DestReg);
+      MI.setDesc(get(AMDGPU::S_CMOVK_I32));
+      MI.addOperand(MachineOperand::CreateImm(VGPRSize));
+      // Change the implicit-def of SCC to an implicit use (but first remove
+      // the dead flag if present).
+      MI.getOperand(MI.getNumExplicitOperands()).setIsDead(false);
+      MI.getOperand(MI.getNumExplicitOperands()).setIsUse();
+    } else {
+      MI.setDesc(get(AMDGPU::S_MOV_B32));
+      MI.addOperand(MachineOperand::CreateImm(0));
+      MI.removeOperand(MI.getNumExplicitOperands()); // Drop implicit def of SCC.
+    }
+    break;
+  }
+
   return true;
 }
 

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -926,6 +926,7 @@ multiclass si_cs_chain_tc_dvgpr_patterns<
 
 defm : si_cs_chain_tc_dvgpr_patterns<i32>; // On GFX12, dVGPR mode is wave32-only.
 
+let Defs = [SCC] in {
 def ADJCALLSTACKUP : SPseudoInstSI<
   (outs), (ins i32imm:$amt0, i32imm:$amt1),
   [(callseq_start timm:$amt0, timm:$amt1)],
@@ -935,7 +936,6 @@ def ADJCALLSTACKUP : SPseudoInstSI<
   let hasSideEffects = 1;
   let usesCustomInserter = 1;
   let SchedRW = [WriteSALU];
-  let Defs = [SCC];
 }
 
 def ADJCALLSTACKDOWN : SPseudoInstSI<
@@ -946,9 +946,16 @@ def ADJCALLSTACKDOWN : SPseudoInstSI<
   let hasSideEffects = 1;
   let usesCustomInserter = 1;
   let SchedRW = [WriteSALU];
-  let Defs = [SCC];
 }
 
+// Get the offset of the base of the stack, skipping any reserved areas.
+def GET_STACK_BASE : SPseudoInstSI<(outs SGPR_32:$dst), (ins),
+  [(set SGPR_32:$dst, (int_amdgcn_get_stack_base))]> {
+  let hasSideEffects = 0;
+  let SchedRW = [WriteSALU];
+}
+} // End Defs = [SCC]
+
 let Defs = [M0, EXEC, SCC],
   UseNamedOperandTable = 1 in {
 

diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
@@ -199,6 +199,13 @@ define void @s_memrealtime(ptr addrspace(1) inreg %out) {
   ret void
 }
 
+; CHECK-LABEL: for function 'get_stack_base':
+; CHECK: ALL VALUES UNIFORM
+define amdgpu_cs void @get_stack_base(ptr addrspace(1) inreg %out) {
+  %v = call i32 @llvm.amdgcn.get.stack.base()
+  store i32 %v, ptr addrspace(1) %out
+  ret void
+}
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 declare i32 @llvm.amdgcn.readfirstlane(i32) #0
@@ -216,6 +223,7 @@ declare i32 @llvm.amdgcn.cluster.workgroup.max.id.x()
 declare i32 @llvm.amdgcn.cluster.workgroup.max.id.y()
 declare i32 @llvm.amdgcn.cluster.workgroup.max.id.z()
 declare i32 @llvm.amdgcn.cluster.workgroup.max.flat.id()
+declare i32 @llvm.amdgcn.get.stack.base()
 
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind readnone convergent }

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.get.stack.base.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.get.stack.base.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck %s
+
+; Test that the llvm.amdgcn.get.stack.base intrinsic returns the correct value:
+; - for functions that need to reserve space for CWSR, it should return the offset
+; past the reserved area (i.e. the offset of the first spill or local variables)
+; - for functions that don't reserve any space, it should return 0
+
+define amdgpu_cs i32 @stack_base_cs_dvgpr_16(i32 %val) #0 {
+; CHECK-LABEL: stack_base_cs_dvgpr_16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2)
+; CHECK-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_WAVE_HW_ID2, 8, 2)
+; CHECK-NEXT:    s_cmp_lg_u32 0, s33
+; CHECK-NEXT:    s_cmovk_i32 s33, 0x1c0
+; CHECK-NEXT:    s_cmp_lg_u32 0, s0
+; CHECK-NEXT:    scratch_store_b32 off, v0, s33 scope:SCOPE_SYS
+; CHECK-NEXT:    s_wait_storecnt 0x0
+; CHECK-NEXT:    s_cmovk_i32 s0, 0x1c0
+; CHECK-NEXT:    ; return to shader part epilog
+  %local = alloca i32, addrspace(5)
+  store volatile i32 %val, ptr addrspace(5) %local
+  %stack.base = call i32 @llvm.amdgcn.get.stack.base()
+  ret i32 %stack.base
+}
+
+define amdgpu_cs i32 @stack_base_cs_dvgpr_32(i32 %val) #1 {
+; CHECK-LABEL: stack_base_cs_dvgpr_32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2)
+; CHECK-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_WAVE_HW_ID2, 8, 2)
+; CHECK-NEXT:    s_cmp_lg_u32 0, s33
+; CHECK-NEXT:    s_cmovk_i32 s33, 0x380
+; CHECK-NEXT:    s_cmp_lg_u32 0, s0
+; CHECK-NEXT:    scratch_store_b32 off, v0, s33 scope:SCOPE_SYS
+; CHECK-NEXT:    s_wait_storecnt 0x0
+; CHECK-NEXT:    s_cmovk_i32 s0, 0x380
+; CHECK-NEXT:    ; return to shader part epilog
+  %local = alloca i32, addrspace(5)
+  store volatile i32 %val, ptr addrspace(5) %local
+  %stack.base = call i32 @llvm.amdgcn.get.stack.base()
+  ret i32 %stack.base
+}
+
+define amdgpu_cs i32 @stack_base_cs_no_dvgpr(i32 %val) #2 {
+; CHECK-LABEL: stack_base_cs_no_dvgpr:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_mov_b32 s0, 0
+; CHECK-NEXT:    scratch_store_b32 off, v0, off scope:SCOPE_SYS
+; CHECK-NEXT:    s_wait_storecnt 0x0
+; CHECK-NEXT:    ; return to shader part epilog
+  %local = alloca i32, addrspace(5)
+  store volatile i32 %val, ptr addrspace(5) %local
+  %stack.base = call i32 @llvm.amdgcn.get.stack.base()
+  ret i32 %stack.base
+}
+
+define amdgpu_cs i32 @stack_base_cs_dvgpr_control_flow(i32 %val) #0 {
+; CHECK-LABEL: stack_base_cs_dvgpr_control_flow:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2)
+; CHECK-NEXT:    s_mov_b32 s0, exec_lo
+; CHECK-NEXT:    s_cmp_lg_u32 0, s33
+; CHECK-NEXT:    s_cmovk_i32 s33, 0x1c0
+; CHECK-NEXT:    scratch_store_b32 off, v0, s33 scope:SCOPE_SYS
+; CHECK-NEXT:    s_wait_storecnt 0x0
+; CHECK-NEXT:    v_cmpx_gt_i32_e32 0x43, v0
+; CHECK-NEXT:  ; %bb.1: ; %if.then
+; CHECK-NEXT:    s_getreg_b32 s1, hwreg(HW_REG_WAVE_HW_ID2, 8, 2)
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; CHECK-NEXT:    s_cmp_lg_u32 0, s1
+; CHECK-NEXT:    s_cmovk_i32 s1, 0x1c0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s1
+; CHECK-NEXT:  ; %bb.2: ; %if.end
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-NEXT:    s_wait_alu depctr_va_sdst(0)
+; CHECK-NEXT:    ; return to shader part epilog
+entry:
+  %local = alloca i32, addrspace(5)
+  store volatile i32 %val, ptr addrspace(5) %local
+  %which = icmp slt i32 %val, 67
+  br i1 %which, label %if.then, label %if.end
+
+if.then:
+  %stack.base = call i32 @llvm.amdgcn.get.stack.base()
+  br label %if.end
+
+if.end:
+  %ret = phi i32 [ %stack.base, %if.then ], [ %val, %entry ]
+  ret i32 %ret
+}
+
+
+attributes #0 = { nounwind "amdgpu-dynamic-vgpr-block-size"="16" }
+attributes #1 = { nounwind "amdgpu-dynamic-vgpr-block-size"="32" }
+attributes #2 = { nounwind "amdgpu-dynamic-vgpr-block-size"="0" }