[VPlan] Introduce and use m_SpecificSInt, rebase updates

artagnon · artagnon · commit f39a3bed56be · 2026-01-13T12:29:14.000Z
diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h
@@ -561,6 +561,18 @@ class [[nodiscard]] APInt {
     return I1.zext(I2.getBitWidth()) == I2;
   }
 
+  /// Determine if two APInts have the same value, after sign-extending
+  /// one of them (if needed!) to ensure that the bit-widths match.
+  static bool isSameSignedValue(const APInt &I1, const APInt &I2) {
+    if (I1.getBitWidth() == I2.getBitWidth())
+      return I1 == I2;
+
+    if (I1.getBitWidth() > I2.getBitWidth())
+      return I1 == I2.sext(I1.getBitWidth());
+
+    return I1.sext(I2.getBitWidth()) == I2;
+  }
+
   /// Overload to compute a hash_code for an APInt value.
   LLVM_ABI friend hash_code hash_value(const APInt &Arg);
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -95,10 +95,9 @@ struct deferredval_ty {
 /// whichever value m_VPValue(X) populated.
 inline deferredval_ty m_Deferred(VPValue *const &V) { return V; }
 
-/// Match an integer constant or vector of constants if Pred::isValue returns
-/// true for the APInt. \p BitWidth optionally specifies the bitwidth the
-/// matched constant must have. If it is 0, the matched constant can have any
-/// bitwidth.
+/// Match an integer constant if Pred::isValue returns true for the APInt. \p
+/// BitWidth optionally specifies the bitwidth the matched constant must have.
+/// If it is 0, the matched constant can have any bitwidth.
 template <typename Pred, unsigned BitWidth = 0> struct int_pred_ty {
   Pred P;
 
@@ -123,15 +122,18 @@ template <typename Pred, unsigned BitWidth = 0> struct int_pred_ty {
   }
 };
 
-/// Match a specified integer value or vector of all elements of that
-/// value. \p BitWidth optionally specifies the bitwidth the matched constant
-/// must have. If it is 0, the matched constant can have any bitwidth.
+/// Match a specified signed or unsigned integer value.
 struct is_specific_int {
   APInt Val;
+  bool IsSigned;
 
-  is_specific_int(APInt Val) : Val(std::move(Val)) {}
+  is_specific_int(APInt Val, bool IsSigned = false)
+      : Val(std::move(Val)), IsSigned(IsSigned) {}
 
-  bool isValue(const APInt &C) const { return APInt::isSameValue(Val, C); }
+  bool isValue(const APInt &C) const {
+    return IsSigned ? APInt::isSameSignedValue(Val, C)
+                    : APInt::isSameValue(Val, C);
+  }
 };
 
 template <unsigned Bitwidth = 0>
@@ -141,6 +143,11 @@ inline specific_intval<0> m_SpecificInt(uint64_t V) {
   return specific_intval<0>(is_specific_int(APInt(64, V)));
 }
 
+inline specific_intval<0> m_SpecificSInt(int64_t V) {
+  return specific_intval<0>(
+      is_specific_int(APInt(64, V, /*isSigned=*/true), /*IsSigned=*/true));
+}
+
 inline specific_intval<1> m_False() {
   return specific_intval<1>(is_specific_int(APInt(64, 0)));
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2933,11 +2933,11 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
         VEPR->getGEPNoWrapFlags(), &EVL, VEPR->getDebugLoc());
   };
 
-  auto m_VecEndPtrVF = [&Plan](VPValue *&Addr) { // NOLINT
+  auto m_VecEndPtrVF = [&Plan](VPValue *&Addr, int64_t Stride) { // NOLINT
     return m_VecEndPtr(
         m_VPValue(Addr),
         m_c_Mul(
-            m_VPValue(),
+            m_SpecificSInt(Stride),
             m_Sub(m_ZExtOrTruncOrSelf(m_Specific(&Plan->getVF())), m_One())));
   };
 
@@ -2951,7 +2951,10 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
   if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) &&
       match(ReversedVal,
             m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
-      match(EndPtr, m_VecEndPtrVF(Addr)) &&
+      isa<VPVectorEndPointerRecipe>(EndPtr) &&
+      match(EndPtr,
+            m_VecEndPtrVF(
+                Addr, cast<VPVectorEndPointerRecipe>(EndPtr)->getStride())) &&
       cast<VPWidenLoadRecipe>(ReversedVal)->isReverse()) {
     auto *LoadR = new VPWidenLoadEVLRecipe(
         *cast<VPWidenLoadRecipe>(ReversedVal), AdjustEndPtr(EndPtr), EVL, Mask);
@@ -2971,7 +2974,10 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
   if (match(&CurRecipe,
             m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),
                           m_RemoveMask(HeaderMask, Mask))) &&
-      match(EndPtr, m_VecEndPtrVF(Addr)) &&
+      isa<VPVectorEndPointerRecipe>(EndPtr) &&
+      match(EndPtr,
+            m_VecEndPtrVF(
+                Addr, cast<VPVectorEndPointerRecipe>(EndPtr)->getStride())) &&
       cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) {
     auto *NewReverse = new VPWidenIntrinsicRecipe(
         Intrinsic::experimental_vp_reverse,
@@ -3550,10 +3556,9 @@ void VPlanTransforms::createInterleaveGroups(
     // the pointer operand of the interleaved access is supposed to be uniform.
     if (IG->isReverse()) {
       B.setInsertPoint(InsertPos);
-      auto *ReversePtr = B.createVectorEndPointerRecipe(
+      Addr = B.createVectorEndPointerRecipe(
           Addr, getLoadStoreType(IRInsertPos), -(int64_t)IG->getFactor(), NW,
           &Plan.getVF(), InsertPos->getDebugLoc());
-      Addr = ReversePtr;
     }
     auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
                                         InsertPos->getMask(), NeedsMaskForGaps,
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
@@ -1008,13 +1008,11 @@ define i64 @select_decreasing_induction_icmp_iv_just_within_bounds(ptr %a, ptr %
 ; IC1VF4-NEXT:    [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1)
 ; IC1VF4-NEXT:    [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
 ; IC1VF4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
-; IC1VF4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0
-; IC1VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 -3
+; IC1VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 -3
 ; IC1VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
 ; IC1VF4-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; IC1VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
-; IC1VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 0
-; IC1VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 -3
+; IC1VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 -3
 ; IC1VF4-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1
 ; IC1VF4-NEXT:    [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; IC1VF4-NEXT:    [[TMP8:%.*]] = icmp sgt <4 x i8> [[REVERSE]], [[REVERSE2]]
@@ -1068,14 +1066,10 @@ define i64 @select_decreasing_induction_icmp_iv_just_within_bounds(ptr %a, ptr %
 ; IC4VF4-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 -1)
 ; IC4VF4-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 -1)
 ; IC4VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
-; IC4VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 0
-; IC4VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 -3
-; IC4VF4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 -4
-; IC4VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 -3
-; IC4VF4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 -8
-; IC4VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 -3
-; IC4VF4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 -12
-; IC4VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 -3
+; IC4VF4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 -3
+; IC4VF4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 -7
+; IC4VF4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 -11
+; IC4VF4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 -15
 ; IC4VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1
 ; IC4VF4-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1
 ; IC4VF4-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1
@@ -1085,14 +1079,10 @@ define i64 @select_decreasing_induction_icmp_iv_just_within_bounds(ptr %a, ptr %
 ; IC4VF4-NEXT:    [[REVERSE8:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD5]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; IC4VF4-NEXT:    [[REVERSE9:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD6]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; IC4VF4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
-; IC4VF4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 0
-; IC4VF4-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i64 -3
-; IC4VF4-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -4
-; IC4VF4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP17]], i64 -3
-; IC4VF4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -8
-; IC4VF4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP19]], i64 -3
-; IC4VF4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -12
-; IC4VF4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP21]], i64 -3
+; IC4VF4-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -3
+; IC4VF4-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -7
+; IC4VF4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -11
+; IC4VF4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -15
 ; IC4VF4-NEXT:    [[WIDE_LOAD10:%.*]] = load <4 x i8>, ptr [[TMP16]], align 1
 ; IC4VF4-NEXT:    [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP18]], align 1
 ; IC4VF4-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x i8>, ptr [[TMP20]], align 1