From 4518b6908faf2d09c4799dd844fec109b96c2375 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Csan=C3=A1d=20Hajd=C3=BA?=
Date: Fri, 10 Oct 2025 17:37:58 +0200
Subject: [PATCH 1/3] [AArch64] Convert `CSEL(X, 1)` into `CSINC(X, XZR)` in
 early-ifcvt

Early if-conversion can create instruction sequences such as
```
mov x1, #1
csel x0, x1, x2, eq
```
which can be simplified to
```
csinc x0, x2, xzr, ne
```
One notable example that generates code like this is `cmpxchg weak`.

Fix this by handling an immediate value of 1 as `add(wzr, 1)`, so that the
addition can be folded into the CSEL by emitting a CSINC instead.
---
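As a sanity check of the rewrite above (an editorial sketch, not part of the
patch): `csinc Rd, Rn, Rm, cond` yields `Rn` when the condition holds and
`Rm + 1` otherwise, so selecting the constant 1 is the same as incrementing
the zero register under the inverted condition. A self-contained model:

```cpp
#include <cassert>
#include <cstdint>

// Instruction semantics from the Arm ARM:
//   csel  Rd, Rn, Rm, cond : Rd = cond ? Rn : Rm
//   csinc Rd, Rn, Rm, cond : Rd = cond ? Rn : Rm + 1
uint64_t csel(bool cond, uint64_t rn, uint64_t rm) { return cond ? rn : rm; }
uint64_t csinc(bool cond, uint64_t rn, uint64_t rm) {
  return cond ? rn : rm + 1;
}

int main() {
  const uint64_t xzr = 0; // the zero register always reads as 0
  for (uint64_t x2 : {UINT64_C(0), UINT64_C(7), UINT64_C(0xffffffffffffffff)}) {
    for (bool eq : {false, true}) {
      // mov x1, #1; csel x0, x1, x2, eq  ==  csinc x0, x2, xzr, ne
      assert(csel(eq, UINT64_C(1), x2) == csinc(!eq, x2, xzr));
    }
  }
  return 0;
}
```

The condition inversion (`eq` becoming `ne`) is also why `insertSelect` swaps
`TrueReg` and `FalseReg` when the fold applies to the true operand.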
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  | 44 ++++++++--
 .../test/CodeGen/AArch64/arm64-early-ifcvt.ll | 80 +++++++++++++++++++
 llvm/test/CodeGen/AArch64/peephole-csel.ll    |  5 +-
 3 files changed, 120 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 12c600f0f2661..1d10ae4628fc8 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -708,8 +708,32 @@ static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
   bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
   const MachineInstr *DefMI = MRI.getVRegDef(VReg);
   unsigned Opc = 0;
-  unsigned SrcOpNum = 0;
+  unsigned SrcReg = 0;
   switch (DefMI->getOpcode()) {
+  case AArch64::SUBREG_TO_REG:
+    // Check for the following way to define a 64-bit immediate:
+    //   %0:gpr32 = MOVi32imm 1
+    //   %1:gpr64 = SUBREG_TO_REG 0, %0:gpr32, %subreg.sub_32
+    if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 0)
+      return 0;
+    if (!DefMI->getOperand(2).isReg())
+      return 0;
+    if (!DefMI->getOperand(3).isImm() ||
+        DefMI->getOperand(3).getImm() != AArch64::sub_32)
+      return 0;
+    DefMI = MRI.getVRegDef(DefMI->getOperand(2).getReg());
+    if (DefMI->getOpcode() != AArch64::MOVi32imm)
+      return 0;
+    // fall-through to MOVi32imm case.
+    [[fallthrough]];
+  case AArch64::MOVi32imm:
+  case AArch64::MOVi64imm:
+    if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1)
+      return 0;
+    SrcReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
+    Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
+    break;
+
   case AArch64::ADDSXri:
   case AArch64::ADDSWri:
     // if NZCV is used, do not fold.
@@ -724,7 +748,7 @@ static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
     if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
         DefMI->getOperand(3).getImm() != 0)
       return 0;
-    SrcOpNum = 1;
+    SrcReg = DefMI->getOperand(1).getReg();
     Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
     break;
 
@@ -734,7 +758,7 @@ static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
     unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
     if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
       return 0;
-    SrcOpNum = 2;
+    SrcReg = DefMI->getOperand(2).getReg();
     Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
     break;
   }
@@ -753,17 +777,17 @@ static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
     unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
     if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
       return 0;
-    SrcOpNum = 2;
+    SrcReg = DefMI->getOperand(2).getReg();
     Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
     break;
   }
   default:
     return 0;
   }
-  assert(Opc && SrcOpNum && "Missing parameters");
+  assert(Opc && SrcReg && "Missing parameters");
 
   if (NewVReg)
-    *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
+    *NewVReg = SrcReg;
   return Opc;
 }
 
@@ -976,6 +1000,14 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
 
     // Fold the operation. Leave any dead instructions for DCE to clean up.
     if (FoldedOpc) {
+      // NewVReg might be XZR/WZR. In that case create a COPY into a virtual
+      // register.
+      if (!Register::isVirtualRegister(NewVReg)) {
+        unsigned ZeroReg = NewVReg;
+        NewVReg = MRI.createVirtualRegister(RC);
+        BuildMI(MBB, I, DL, get(TargetOpcode::COPY), NewVReg).addReg(ZeroReg);
+      }
+
       FalseReg = NewVReg;
       Opc = FoldedOpc;
       // The extends the live range of NewVReg.
diff --git a/llvm/test/CodeGen/AArch64/arm64-early-ifcvt.ll b/llvm/test/CodeGen/AArch64/arm64-early-ifcvt.ll
index 97a7741bcde75..849323f0fedf3 100644
--- a/llvm/test/CodeGen/AArch64/arm64-early-ifcvt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-early-ifcvt.ll
@@ -421,3 +421,83 @@ for.body51:                                       ; preds = %is_sbox.exit155
   unreachable
 }
 declare fastcc void @get_switch_type(i32, i32, i16 signext, i16 signext, ptr nocapture) nounwind ssp
+
+; CHECK-LABEL: fold_imm1_csinc_32:
+; CHECK: cmp w0, w1
+; CHECK-NEXT: csinc w0, w2, wzr, ge
+; CHECK-NEXT: ret
+define i32 @fold_imm1_csinc_32(i32 %x, i32 %y, i32 %n) nounwind ssp {
+entry:
+  %cmp = icmp slt i32 %x, %y
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  br label %exit
+
+if.else:
+  br label %exit
+
+exit:
+  %result = phi i32 [ 1, %if.then ], [ %n, %if.else ]
+  ret i32 %result
+}
+
+; CHECK-LABEL: fold_imm1_csinc_64:
+; CHECK: cmp x0, x1
+; CHECK-NEXT: csinc x0, x2, xzr, ge
+; CHECK-NEXT: ret
+define i64 @fold_imm1_csinc_64(i64 %x, i64 %y, i64 %n) nounwind ssp {
+entry:
+  %cmp = icmp slt i64 %x, %y
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  br label %exit
+
+if.else:
+  br label %exit
+
+exit:
+  %result = phi i64 [ 1, %if.then ], [ %n, %if.else ]
+  ret i64 %result
+}
+
+; CHECK-LABEL: fold_imm1_cset_32:
+; CHECK: cmp w0, w1
+; CHECK-NEXT: cset w0, lt
+; CHECK-NEXT: ret
+define i32 @fold_imm1_cset_32(i32 %x, i32 %y) nounwind ssp {
+entry:
+  %cmp = icmp slt i32 %x, %y
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  br label %exit
+
+if.else:
+  br label %exit
+
+exit:
+  %result = phi i32 [ 1, %if.then ], [ 0, %if.else ]
+  ret i32 %result
+}
+
+; CHECK-LABEL: fold_imm1_cset_64:
+; CHECK: cmp x0, x1
+; CHECK-NEXT: cset x0, lt
+; CHECK-NEXT: ret
+define i64 @fold_imm1_cset_64(i64 %x, i64 %y) nounwind ssp {
+entry:
+  %cmp = icmp slt i64 %x, %y
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  br label %exit
+
+if.else:
+  br label %exit
+
+exit:
+  %result = phi i64 [ 1, %if.then ], [ 0, %if.else ]
+  ret i64 %result
+}
diff --git a/llvm/test/CodeGen/AArch64/peephole-csel.ll b/llvm/test/CodeGen/AArch64/peephole-csel.ll
index 868b9f1f2f6ac..b085258059b7e 100644
--- a/llvm/test/CodeGen/AArch64/peephole-csel.ll
+++ b/llvm/test/CodeGen/AArch64/peephole-csel.ll
@@ -5,10 +5,9 @@ define void @peephole_csel(ptr %dst, i1 %0, i1 %cmp) {
 ; CHECK-LABEL: peephole_csel:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    tst w2, #0x1
-; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    mov x9, xzr
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    tst w1, #0x1
-; CHECK-NEXT:    csel x8, x8, x9, eq
+; CHECK-NEXT:    csinc x8, x8, xzr, ne
 ; CHECK-NEXT:    str x8, [x0]
 ; CHECK-NEXT:    ret
 entry:
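For context: the `cmpxchg weak` case called out in the first commit message
can arise from C++ source along these lines (a hypothetical reproducer,
assumed shape only, mirroring the `fold_imm1_csinc_64` test added above):

```cpp
#include <atomic>
#include <cstdint>

// Hypothetical reproducer (not taken from the patch): the success flag of a
// weak compare-exchange feeds a select between the constant 1 and another
// value, the "csel of 1" shape that early if-conversion now folds to csinc.
int64_t claim_or_retry(std::atomic<int64_t> &slot, int64_t expected,
                       int64_t desired, int64_t retries) {
  return slot.compare_exchange_weak(expected, desired) ? 1 : retries;
}
```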
From 311dcc91394db153909d934f5fc834d4ef6755d6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Csan=C3=A1d=20Hajd=C3=BA?=
Date: Wed, 15 Oct 2025 21:50:13 +0200
Subject: [PATCH 2/3] Address review feedback

* Remove fallthrough
* Rename NewVReg -> NewReg
---
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 33 +++++++++++---------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 1d10ae4628fc8..777df137b9d6a 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -700,7 +700,7 @@ static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
 // csel instruction. If so, return the folded opcode, and the replacement
 // register.
 static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
-                                unsigned *NewVReg = nullptr) {
+                                unsigned *NewReg = nullptr) {
   VReg = removeCopies(MRI, VReg);
   if (!Register::isVirtualRegister(VReg))
     return 0;
@@ -724,8 +724,13 @@ static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
     DefMI = MRI.getVRegDef(DefMI->getOperand(2).getReg());
     if (DefMI->getOpcode() != AArch64::MOVi32imm)
       return 0;
-    // fall-through to MOVi32imm case.
-    [[fallthrough]];
+    if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1)
+      return 0;
+    assert(Is64Bit);
+    SrcReg = AArch64::XZR;
+    Opc = AArch64::CSINCXr;
+    break;
+
   case AArch64::MOVi32imm:
   case AArch64::MOVi64imm:
     if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1)
@@ -786,8 +791,8 @@ static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
   }
 
   assert(Opc && SrcReg && "Missing parameters");
-  if (NewVReg)
-    *NewVReg = SrcReg;
+  if (NewReg)
+    *NewReg = SrcReg;
   return Opc;
 }
 
@@ -988,30 +993,30 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
 
   // Try folding simple instructions into the csel.
   if (TryFold) {
-    unsigned NewVReg = 0;
-    unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
+    unsigned NewReg = 0;
+    unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewReg);
     if (FoldedOpc) {
       // The folded opcodes csinc, csinc and csneg apply the operation to
       // FalseReg, so we need to invert the condition.
       CC = AArch64CC::getInvertedCondCode(CC);
       TrueReg = FalseReg;
     } else
-      FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
+      FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewReg);
 
     // Fold the operation. Leave any dead instructions for DCE to clean up.
     if (FoldedOpc) {
       // NewVReg might be XZR/WZR. In that case create a COPY into a virtual
       // register.
-      if (!Register::isVirtualRegister(NewVReg)) {
-        unsigned ZeroReg = NewVReg;
-        NewVReg = MRI.createVirtualRegister(RC);
-        BuildMI(MBB, I, DL, get(TargetOpcode::COPY), NewVReg).addReg(ZeroReg);
+      if (!Register::isVirtualRegister(NewReg)) {
+        unsigned ZeroReg = NewReg;
+        NewReg = MRI.createVirtualRegister(RC);
+        BuildMI(MBB, I, DL, get(TargetOpcode::COPY), NewReg).addReg(ZeroReg);
       }
 
-      FalseReg = NewVReg;
+      FalseReg = NewReg;
       Opc = FoldedOpc;
       // The extends the live range of NewVReg.
-      MRI.clearKillFlags(NewVReg);
+      MRI.clearKillFlags(NewReg);
     }
   }

From 51f0dc6d27586efc55a2ab907c9eb7c9e06ab2f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Csan=C3=A1d=20Hajd=C3=BA?=
Date: Fri, 17 Oct 2025 15:35:03 +0200
Subject: [PATCH 3/3] Address review feedback

* NewVReg -> NewReg in comments.
* Use WZR/XZR directly in the folded instruction.
---
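A note on the final shape (editorial, not part of the patch): the zero
register is a legal CSINC operand, so the COPY introduced in the previous
revision is unnecessary; the only remaining care is that register classes can
be constrained only on virtual registers. Distilled, the fold path in
`insertSelect` now behaves as follows (a sketch using the surrounding LLVM
names, not the literal diff):

```cpp
// Sketch of insertSelect's fold path after this revision (assumed shape;
// see the hunks below for the real change).
if (FoldedOpc) {
  FalseReg = NewReg;          // may now be WZR/XZR directly, no COPY needed
  Opc = FoldedOpc;            // e.g. AArch64::CSINCWr or AArch64::CSINCXr
  MRI.clearKillFlags(NewReg); // the fold extends NewReg's live range
}

MRI.constrainRegClass(TrueReg, RC);
if (FalseReg.isVirtual())     // WZR/XZR are physical; nothing to constrain
  MRI.constrainRegClass(FalseReg, RC);
```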
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 777df137b9d6a..5d13691a328b9 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1005,24 +1005,22 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
 
     // Fold the operation. Leave any dead instructions for DCE to clean up.
     if (FoldedOpc) {
-      // NewVReg might be XZR/WZR. In that case create a COPY into a virtual
-      // register.
-      if (!Register::isVirtualRegister(NewReg)) {
-        unsigned ZeroReg = NewReg;
-        NewReg = MRI.createVirtualRegister(RC);
-        BuildMI(MBB, I, DL, get(TargetOpcode::COPY), NewReg).addReg(ZeroReg);
-      }
-
       FalseReg = NewReg;
       Opc = FoldedOpc;
-      // The extends the live range of NewVReg.
+      // Extend the live range of NewReg.
       MRI.clearKillFlags(NewReg);
     }
   }
 
   // Pull all virtual register into the appropriate class.
   MRI.constrainRegClass(TrueReg, RC);
-  MRI.constrainRegClass(FalseReg, RC);
+  // FalseReg might be WZR or XZR if the folded operand is a literal 1.
+  assert(
+      (FalseReg.isVirtual() || FalseReg == AArch64::WZR ||
+       FalseReg == AArch64::XZR) &&
+      "FalseReg was folded into a non-virtual register other than WZR or XZR");
+  if (FalseReg.isVirtual())
+    MRI.constrainRegClass(FalseReg, RC);
 
   // Insert the csel.
   BuildMI(MBB, I, DL, get(Opc), DstReg)