diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index c4ba8e9857dc4..af78e0c1e4799 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1647,12 +1647,12 @@ class TargetTransformInfo {
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;

   /// Calculate the cost of an extended reduction pattern, similar to
-  /// getArithmeticReductionCost of an Add reduction with multiply and optional
-  /// extensions. This is the cost of as:
-  /// ResTy vecreduce.add(mul (A, B)).
-  /// ResTy vecreduce.add(mul(ext(Ty A), ext(Ty B)).
+  /// getArithmeticReductionCost of an Add/Sub reduction with multiply and
+  /// optional extensions. This is the cost of:
+  /// * ResTy vecreduce.add/sub(mul (A, B)), or
+  /// * ResTy vecreduce.add/sub(mul(ext(Ty A), ext(Ty B))).
   LLVM_ABI InstructionCost getMulAccReductionCost(
-      bool IsUnsigned, Type *ResTy, VectorType *Ty,
+      bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;

   /// Calculate the cost of an extended reduction pattern, similar to
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 43813d2f3acb5..9c2ebb1891cac 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -971,8 +971,8 @@ class TargetTransformInfoImplBase {
   }

   virtual InstructionCost
-  getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty,
-                         TTI::TargetCostKind CostKind) const {
+  getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy,
+                         VectorType *Ty, TTI::TargetCostKind CostKind) const {
     return 1;
   }

diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 0a10b51f97c63..dce423fc1b18b 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -3260,14 +3260,17 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
   }

   InstructionCost
-  getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty,
+  getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy,
+                         VectorType *Ty,
                          TTI::TargetCostKind CostKind) const override {
     // Without any native support, this is equivalent to the cost of
     // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or
     // vecreduce.add(mul(A, B)).
+    assert((RedOpcode == Instruction::Add || RedOpcode == Instruction::Sub) &&
+           "The reduction opcode is expected to be Add or Sub.");
     VectorType *ExtTy = VectorType::get(ResTy, Ty);
     InstructionCost RedCost = thisT()->getArithmeticReductionCost(
-        Instruction::Add, ExtTy, std::nullopt, CostKind);
+        RedOpcode, ExtTy, std::nullopt, CostKind);
     InstructionCost ExtCost = thisT()->getCastInstrCost(
         IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
         TTI::CastContextHint::None, CostKind);
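
The fallback above prices the fused pattern as its discrete parts: one reduction at the (possibly extended) result type, plus the extends and the multiply. For orientation, a minimal sketch of querying the new overload follows; it is not part of the patch, and Ctx (an LLVMContext) and TTI (a TargetTransformInfo) are assumed to be in scope:

  // Cost of i32 vecreduce.sub(mul(zext(<16 x i8> a), zext(<16 x i8> b))),
  // i.e. the Sub form of the mul-accumulate pattern this patch enables.
  Type *ResTy = Type::getInt32Ty(Ctx);
  auto *SrcVecTy = FixedVectorType::get(Type::getInt8Ty(Ctx), 16);
  InstructionCost Cost = TTI.getMulAccReductionCost(
      /*IsUnsigned=*/true, Instruction::Sub, ResTy, SrcVecTy,
      TTI::TCK_RecipThroughput);
  // An invalid result means the target rejected the opcode outright, as the
  // ARM/MVE override below does for any reduction opcode other than Add.
  bool Feasible = Cost.isValid();
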
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 4ac8f03e6dbf5..b4fa0d5964cb6 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1283,9 +1283,10 @@ InstructionCost TargetTransformInfo::getExtendedReductionCost(
 }

 InstructionCost TargetTransformInfo::getMulAccReductionCost(
-    bool IsUnsigned, Type *ResTy, VectorType *Ty,
+    bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty,
     TTI::TargetCostKind CostKind) const {
-  return TTIImpl->getMulAccReductionCost(IsUnsigned, ResTy, Ty, CostKind);
+  return TTIImpl->getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, Ty,
+                                         CostKind);
 }

 InstructionCost
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 490f6391c15a0..922da10f4e39f 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5486,13 +5486,14 @@ InstructionCost AArch64TTIImpl::getExtendedReductionCost(
 }

 InstructionCost
-AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
-                                       VectorType *VecTy,
+AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
+                                       Type *ResTy, VectorType *VecTy,
                                        TTI::TargetCostKind CostKind) const {
   EVT VecVT = TLI->getValueType(DL, VecTy);
   EVT ResVT = TLI->getValueType(DL, ResTy);

-  if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple()) {
+  if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple() &&
+      RedOpcode == Instruction::Add) {
     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);

     // The legal cases with dotprod are
@@ -5503,7 +5504,8 @@ AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
     return LT.first + 2;
   }

-  return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, VecTy, CostKind);
+  return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, VecTy,
+                                       CostKind);
 }

 InstructionCost
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 42ae962b3b426..b994ca74aa222 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -460,7 +460,7 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
       TTI::TargetCostKind CostKind) const override;

   InstructionCost getMulAccReductionCost(
-      bool IsUnsigned, Type *ResTy, VectorType *Ty,
+      bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const override;

   InstructionCost
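
Why the dotprod fast path is now additionally gated on Instruction::Add: AArch64 udot/sdot can only accumulate. As a scalar model of what one sdot lane computes (an ISA fact rather than code from this patch, with hypothetical names):

  #include <cstdint>

  // One i32 lane of sdot: four i8*i8 products added into the accumulator.
  // No subtracting variant exists, so a Sub reduction cannot map onto the
  // instruction and falls through to the BaseT costing above.
  int32_t SDotLane(int32_t Acc, const int8_t A[4], const int8_t B[4]) {
    for (int I = 0; I < 4; ++I)
      Acc += int32_t(A[I]) * int32_t(B[I]);
    return Acc;
  }
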
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 6b2854171c819..9b250e6cac3ab 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1916,9 +1916,11 @@ InstructionCost ARMTTIImpl::getExtendedReductionCost(
 }

 InstructionCost
-ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
-                                   VectorType *ValTy,
+ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
+                                   Type *ResTy, VectorType *ValTy,
                                    TTI::TargetCostKind CostKind) const {
+  if (RedOpcode != Instruction::Add)
+    return InstructionCost::getInvalid(CostKind);
   EVT ValVT = TLI->getValueType(DL, ValTy);
   EVT ResVT = TLI->getValueType(DL, ResTy);

@@ -1939,7 +1941,8 @@ ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
     return ST->getMVEVectorCostFactor(CostKind) * LT.first;
   }

-  return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, ValTy, CostKind);
+  return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, ValTy,
+                                       CostKind);
 }

 InstructionCost
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index cdd8bcb9f7416..0810c5532ed91 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -299,7 +299,8 @@ class ARMTTIImpl final : public BasicTTIImplBase<ARMTTIImpl> {
                                            VectorType *ValTy, std::optional<FastMathFlags> FMF,
                                            TTI::TargetCostKind CostKind) const override;
   InstructionCost
-  getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *ValTy,
+  getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy,
+                         VectorType *ValTy,
                          TTI::TargetCostKind CostKind) const override;

   InstructionCost
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1b1797ab30a35..7c43da0b4d552 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5414,7 +5414,8 @@ LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
                                  TTI::CastContextHint::None, CostKind, RedOp);

       InstructionCost RedCost = TTI.getMulAccReductionCost(
-          IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
+          IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType,
+          CostKind);

       if (RedCost.isValid() &&
           RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
@@ -5459,7 +5460,8 @@ LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);

       InstructionCost RedCost = TTI.getMulAccReductionCost(
-          IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
+          IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType,
+          CostKind);
       InstructionCost ExtraExtCost = 0;
       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
@@ -5478,7 +5480,8 @@ LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);

       InstructionCost RedCost = TTI.getMulAccReductionCost(
-          true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
+          true, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), VectorTy,
+          CostKind);

       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
         return I == RetI ? RedCost : 0;
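
All three call sites above share the same acceptance test: the fused reduction cost must be valid and beat the sum of the discrete instructions it replaces. Restated as a standalone predicate (illustrative names mirroring the first call site, not code from the patch):

  // InstructionCost carries an invalid state, so a target that returns
  // InstructionCost::getInvalid() for a Sub reduction loses automatically.
  bool PreferFusedMulAcc(InstructionCost RedCost, InstructionCost ExtCost,
                         InstructionCost MulCost, InstructionCost Ext2Cost,
                         InstructionCost BaseCost) {
    return RedCost.isValid() &&
           RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost;
  }
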
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index bd9a93ed57b8a..93e97faaefb4e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2803,10 +2803,10 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
       toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
   assert(RedTy->isIntegerTy() &&
          "VPExpressionRecipe only supports integer types currently.");
+  unsigned Opcode = RecurrenceDescriptor::getOpcode(
+      cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind());
   switch (ExpressionType) {
   case ExpressionTypes::ExtendedReduction: {
-    unsigned Opcode = RecurrenceDescriptor::getOpcode(
-        cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind());
     return Ctx.TTI.getExtendedReductionCost(
         Opcode,
         cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
@@ -2814,13 +2814,14 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
         RedTy, SrcVecTy, std::nullopt, Ctx.CostKind);
   }
   case ExpressionTypes::MulAccReduction:
-    return Ctx.TTI.getMulAccReductionCost(false, RedTy, SrcVecTy, Ctx.CostKind);
+    return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy,
+                                          Ctx.CostKind);
   case ExpressionTypes::ExtMulAccReduction:
     return Ctx.TTI.getMulAccReductionCost(
         cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
             Instruction::ZExt,
-        RedTy, SrcVecTy, Ctx.CostKind);
+        Opcode, RedTy, SrcVecTy, Ctx.CostKind);
   }
   llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
 }
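
Hoisting Opcode above the switch works because every ExpressionType here ends in a reduction recipe, and RecurrenceDescriptor::getOpcode maps its recurrence kind to the IR opcode the TTI hooks now take. A simplified sketch of that mapping for the two kinds involved here (the real function covers many more kinds):

  static unsigned GetRedOpcode(RecurKind Kind) {
    switch (Kind) {
    case RecurKind::Add:
      return Instruction::Add;
    case RecurKind::Sub:
      return Instruction::Sub; // the kind this patch starts costing
    default:
      llvm_unreachable("kind not covered by this sketch");
    }
  }
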
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 6c5f9b7302292..362480a923b6f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3151,7 +3151,7 @@ static VPExpressionRecipe *
 tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
                                           VPCostContext &Ctx, VFRange &Range) {
   unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
-  if (Opcode != Instruction::Add)
+  if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
     return nullptr;

   Type *RedTy = Ctx.Types.inferScalarType(Red);
@@ -3166,8 +3166,8 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
     Type *SrcTy = Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
     auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
-    InstructionCost MulAccCost =
-        Ctx.TTI.getMulAccReductionCost(isZExt, RedTy, SrcVecTy, CostKind);
+    InstructionCost MulAccCost = Ctx.TTI.getMulAccReductionCost(
+        isZExt, Opcode, RedTy, SrcVecTy, CostKind);
     InstructionCost MulCost = Mul->computeCost(VF, Ctx);
     InstructionCost RedCost = Red->computeCost(VF, Ctx);
     InstructionCost ExtCost = 0;
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index c88ed95de2946..bc93cc6ab725a 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1468,8 +1468,8 @@ static void analyzeCostOfVecReduction(const IntrinsicInst &II,
                                   TTI::CastContextHint::None, CostKind, RedOp);
     CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
-    CostAfterReduction =
-        TTI.getMulAccReductionCost(IsUnsigned, II.getType(), ExtType, CostKind);
+    CostAfterReduction = TTI.getMulAccReductionCost(
+        IsUnsigned, ReductionOpc, II.getType(), ExtType, CostKind);
     return;
   }
   CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
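
The LoopVectorize tests below exercise the new path end to end. The mul-accumulate variant corresponds to this scalar model (a sketch with assumed names and types, matching print_mulacc_sub below, which zero-extends i8 inputs, multiplies in i32, and subtracts from the accumulator):

  #include <cstddef>
  #include <cstdint>

  int32_t MulAccSub(const uint8_t *A, const uint8_t *B, size_t N) {
    uint32_t Acc = 0; // i32 two's-complement wraparound, as in the IR
    for (size_t I = 0; I != N; ++I)
      Acc -= uint32_t(A[I]) * uint32_t(B[I]); // zext, mul, sub-accumulate
    return int32_t(Acc);
  }
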
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
index 4af3fa9202c77..2ffb8203d49dd 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
@@ -416,3 +416,238 @@ exit:
   %r.0.lcssa = phi i64 [ %rdx.next, %loop ]
   ret i64 %r.0.lcssa
 }
+
+define i64 @print_extended_sub_reduction(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) {
+; CHECK-LABEL: 'print_extended_sub_reduction'
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<%n> = original trip-count
+; CHECK-EMPTY:
+; CHECK: vector.ph:
+; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<1>
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
+; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[ADDR:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN ir<[[LOAD:%.+]]> = load vp<[[ADDR]]>
+; CHECK-NEXT: EXPRESSION vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.sub (ir<[[LOAD]]> zext to i64)
+; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
+  %rdx = phi i64 [ %rdx.next, %loop ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %iv
+  %load0 = load i32, ptr %arrayidx, align 4
+  %conv0 = zext i32 %load0 to i64
+  %rdx.next = sub nsw i64 %rdx, %conv0
+  %iv.next = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, %n
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  %r.0.lcssa = phi i64 [ %rdx.next, %loop ]
+  ret i64 %r.0.lcssa
+}
+
+define i32 @print_mulacc_sub(ptr %a, ptr %b) {
+; CHECK-LABEL: 'print_mulacc_sub'
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF
+; CHECK-NEXT: Live-in vp<%1> = VF * UF
+; CHECK-NEXT: Live-in vp<%2> = vector-trip-count
+; CHECK-NEXT: Live-in ir<1024> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: EMIT vp<%3> = reduction-start-vector ir<0>, ir<0>, ir<1>
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%4> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi vp<%3>, vp<%8>
+; CHECK-NEXT: vp<%5> = SCALAR-STEPS vp<%4>, ir<1>, vp<%0>
+; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<%5>
+; CHECK-NEXT: vp<%6> = vector-pointer ir<%gep.a>
+; CHECK-NEXT: WIDEN ir<%load.a> = load vp<%6>
+; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<%5>
+; CHECK-NEXT: vp<%7> = vector-pointer ir<%gep.b>
+; CHECK-NEXT: WIDEN ir<%load.b> = load vp<%7>
+; CHECK-NEXT: EXPRESSION vp<%8> = ir<%accum> + reduce.sub (mul (ir<%load.b> zext to i32), (ir<%load.a> zext to i32))
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%4>, vp<%1>
+; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<%10> = compute-reduction-result ir<%accum>, vp<%8>
+; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<1024>, vp<%2>
+; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n>
+; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<exit>:
+; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %loop ] (extra operand: vp<%10> from middle.block)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%2>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%10>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: Successor(s): ir-bb<loop>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop>:
+; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph)
+; CHECK-NEXT: IR %accum = phi i32 [ 0, %entry ], [ %add, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph)
+; CHECK-NEXT: IR %gep.a = getelementptr i8, ptr %a, i64 %iv
+; CHECK-NEXT: IR %load.a = load i8, ptr %gep.a, align 1
+; CHECK-NEXT: IR %ext.a = zext i8 %load.a to i32
+; CHECK-NEXT: IR %gep.b = getelementptr i8, ptr %b, i64 %iv
+; CHECK-NEXT: IR %load.b = load i8, ptr %gep.b, align 1
+; CHECK-NEXT: IR %ext.b = zext i8 %load.b to i32
+; CHECK-NEXT: IR %mul = mul i32 %ext.b, %ext.a
+; CHECK-NEXT: IR %add = sub i32 %accum, %mul
+; CHECK-NEXT: IR %iv.next = add i64 %iv, 1
+; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %iv.next, 1024
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' {
+; CHECK-NEXT: Live-in ir<1024> = vector-trip-count
+; CHECK-NEXT: Live-in ir<1024> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: EMIT branch-on-cond ir<false>
+; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector.body
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ]
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi ir<0>, ir<%add>
+; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<%index>
+; CHECK-NEXT: WIDEN ir<%load.a> = load ir<%gep.a>
+; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<%index>
+; CHECK-NEXT: WIDEN ir<%load.b> = load ir<%gep.b>
+; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32
+; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32
+; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a>
+; CHECK-NEXT: REDUCE ir<%add> = ir<%accum> + reduce.sub (ir<%mul>)
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4>
+; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<1024>
+; CHECK-NEXT: Successor(s): middle.block, vector.body
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<%accum>, ir<%add>
+; CHECK-NEXT: Successor(s): ir-bb<exit>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<exit>:
+; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %loop ] (extra operand: vp<[[RED_RESULT]]> from middle.block)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<scalar.ph>:
+; CHECK-NEXT: Successor(s): ir-bb<loop>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop>:
+; CHECK-NEXT: IR %iv = phi i64 [ 0, %scalar.ph ], [ %iv.next, %loop ] (extra operand: ir<0> from ir-bb<scalar.ph>)
+; CHECK-NEXT: IR %accum = phi i32 [ 0, %scalar.ph ], [ %add, %loop ] (extra operand: ir<0> from ir-bb<scalar.ph>)
+; CHECK-NEXT: IR %gep.a = getelementptr i8, ptr %a, i64 %iv
+; CHECK-NEXT: IR %load.a = load i8, ptr %gep.a, align 1
+; CHECK-NEXT: IR %ext.a = zext i8 %load.a to i32
+; CHECK-NEXT: IR %gep.b = getelementptr i8, ptr %b, i64 %iv
+; CHECK-NEXT: IR %load.b = load i8, ptr %gep.b, align 1
+; CHECK-NEXT: IR %ext.b = zext i8 %load.b to i32
+; CHECK-NEXT: IR %mul = mul i32 %ext.b, %ext.a
+; CHECK-NEXT: IR %add = sub i32 %accum, %mul
+; CHECK-NEXT: IR %iv.next = add i64 %iv, 1
+; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %iv.next, 1024
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %loop ]
+  %gep.a = getelementptr i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = zext i8 %load.a to i32
+  %gep.b = getelementptr i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %ext.b = zext i8 %load.b to i32
+  %mul = mul i32 %ext.b, %ext.a
+  %add = sub i32 %accum, %mul
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret i32 %add
+}
+
+define i64 @print_mulacc_sub_extended(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) {
+; CHECK-LABEL: 'print_mulacc_sub_extended'
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<%n> = original trip-count
+; CHECK-EMPTY:
+; CHECK: vector.ph:
+; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<1>
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
+; CHECK-NEXT: CLONE ir<[[ARRAYIDX0:%.+]]> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[ADDR0:%.+]]> = vector-pointer ir<[[ARRAYIDX0]]>
+; CHECK-NEXT: WIDEN ir<[[LOAD0:%.+]]> = load vp<[[ADDR0]]>
+; CHECK-NEXT: CLONE ir<[[ARRAYIDX1:%.+]]> = getelementptr inbounds ir<%y>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[ADDR1:%.+]]> = vector-pointer ir<[[ARRAYIDX1]]>
+; CHECK-NEXT: WIDEN ir<[[LOAD1:%.+]]> = load vp<[[ADDR1]]>
+; CHECK-NEXT: EXPRESSION vp<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.sub (mul nsw (ir<[[LOAD0]]> sext to i64), (ir<[[LOAD1]]> sext to i64))
+; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
+  %rdx = phi i64 [ %rdx.next, %loop ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i16, ptr %x, i32 %iv
+  %load0 = load i16, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i16, ptr %y, i32 %iv
+  %load1 = load i16, ptr %arrayidx1, align 4
+  %conv0 = sext i16 %load0 to i32
+  %conv1 = sext i16 %load1 to i32
+  %mul = mul nsw i32 %conv0, %conv1
+  %conv = sext i32 %mul to i64
+  %rdx.next = sub nsw i64 %rdx, %conv
+  %iv.next = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, %n
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  %r.0.lcssa = phi i64 [ %rdx.next, %loop ]
+  ret i64 %r.0.lcssa
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll
index 36826eb6681c8..c1a87f0c5f907 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll
@@ -1149,3 +1149,264 @@ entry:
   %add.15 = add nsw i32 %mul.15, %add.14
   ret i32 %add.15
 }
+
+; COST-LABEL: Function: mla_v16i8_i32_sub
+; COST: Cost: '-2'
+define i32 @mla_v16i8_i32_sub(ptr %x, ptr %y) "target-features"="+dotprod" {
+; CHECK-LABEL: @mla_v16i8_i32_sub(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[X:%.*]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP0]] to i32
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[Y:%.*]], align 1
+; CHECK-NEXT: [[CONV3:%.*]] = sext i8 [[TMP1]] to i32
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV3]], [[CONV]]
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX_1]], align 1
+; CHECK-NEXT: [[CONV_1:%.*]] = sext i8 [[TMP2]] to i32
+; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX2_1]], align 1
+; CHECK-NEXT: [[CONV3_1:%.*]] = sext i8 [[TMP3]] to i32
+; CHECK-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[CONV3_1]], [[CONV_1]]
+; CHECK-NEXT: [[SUB_1:%.*]] = sub nsw i32 [[MUL_1]], [[MUL]]
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 2
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX_2]], align 1
+; CHECK-NEXT: [[CONV_2:%.*]] = sext i8 [[TMP4]] to i32
+; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 2
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX2_2]], align 1
+; CHECK-NEXT: [[CONV3_2:%.*]] = sext i8 [[TMP5]] to i32
+; CHECK-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[CONV3_2]], [[CONV_2]]
+; CHECK-NEXT: [[SUB_2:%.*]] = sub nsw i32 [[MUL_2]], [[SUB_1]]
+; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 3
+; CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr [[ARRAYIDX_3]], align 1
+; CHECK-NEXT: [[CONV_3:%.*]] = sext i8 [[TMP6]] to i32
+; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 3
+; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX2_3]], align 1
+; CHECK-NEXT: [[CONV3_3:%.*]] = sext i8 [[TMP7]] to i32
+; CHECK-NEXT: [[MUL_3:%.*]] = mul nsw i32 [[CONV3_3]], [[CONV_3]]
+; CHECK-NEXT: [[SUB_3:%.*]] = sub nsw i32 [[MUL_3]], [[SUB_2]]
+; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 4
+; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_4]], align 1
+; CHECK-NEXT: [[CONV_4:%.*]] = sext i8 [[TMP8]] to i32
+; CHECK-NEXT: [[ARRAYIDX2_4:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 4
+; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX2_4]], align 1
+; CHECK-NEXT: [[CONV3_4:%.*]] = sext i8 [[TMP9]] to i32
+; CHECK-NEXT: [[MUL_4:%.*]] = mul nsw i32 [[CONV3_4]], [[CONV_4]]
+; CHECK-NEXT: [[SUB_4:%.*]] = sub nsw i32 [[MUL_4]], [[SUB_3]]
+; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 5
+; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX_5]], align 1
+; CHECK-NEXT: [[CONV_5:%.*]] = sext i8 [[TMP10]] to i32
+; CHECK-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 5
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX2_5]], align 1
+; CHECK-NEXT: [[CONV3_5:%.*]] = sext i8 [[TMP11]] to i32
+; CHECK-NEXT: [[MUL_5:%.*]] = mul nsw i32 [[CONV3_5]], [[CONV_5]]
+; CHECK-NEXT: [[SUB_5:%.*]] = sub nsw i32 [[MUL_5]], [[SUB_4]]
+; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 6
+; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX_6]], align 1
+; CHECK-NEXT: [[CONV_6:%.*]] = sext i8 [[TMP12]] to i32
+; CHECK-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 6
+; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX2_6]], align 1
+; CHECK-NEXT: [[CONV3_6:%.*]] = sext i8 [[TMP13]] to i32
+; CHECK-NEXT: [[MUL_6:%.*]] = mul nsw i32 [[CONV3_6]], [[CONV_6]]
+; CHECK-NEXT: [[SUB_6:%.*]] = sub nsw i32 [[MUL_6]], [[SUB_5]]
+; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 7
+; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX_7]], align 1
+; CHECK-NEXT: [[CONV_7:%.*]] = sext i8 [[TMP14]] to i32
+; CHECK-NEXT: [[ARRAYIDX2_7:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 7
+; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX2_7]], align 1
+; CHECK-NEXT: [[CONV3_7:%.*]] = sext i8 [[TMP15]] to i32
+; CHECK-NEXT: [[MUL_7:%.*]] = mul nsw i32 [[CONV3_7]], [[CONV_7]]
+; CHECK-NEXT: [[SUB_7:%.*]] = sub nsw i32 [[MUL_7]], [[SUB_6]]
+; CHECK-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 8
+; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX_8]], align 1
+; CHECK-NEXT: [[CONV_8:%.*]] = sext i8 [[TMP16]] to i32
+; CHECK-NEXT: [[ARRAYIDX2_8:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 8
+; CHECK-NEXT: [[TMP17:%.*]] = load i8, ptr [[ARRAYIDX2_8]], align 1
+; CHECK-NEXT: [[CONV3_8:%.*]] = sext i8 [[TMP17]] to i32
+; CHECK-NEXT: [[MUL_8:%.*]] = mul nsw i32 [[CONV3_8]], [[CONV_8]]
+; CHECK-NEXT: [[SUB_8:%.*]] = sub nsw i32 [[MUL_8]], [[SUB_7]]
+; CHECK-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 9
+; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX_9]], align 1
+; CHECK-NEXT: [[CONV_9:%.*]] = sext i8 [[TMP18]] to i32
+; CHECK-NEXT: [[ARRAYIDX2_9:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 9
+; CHECK-NEXT: [[TMP19:%.*]] = load i8, ptr [[ARRAYIDX2_9]], align 1
+; CHECK-NEXT: [[CONV3_9:%.*]] = sext i8 [[TMP19]] to i32
+; CHECK-NEXT: [[MUL_9:%.*]] = mul nsw i32 [[CONV3_9]], [[CONV_9]]
+; CHECK-NEXT: [[SUB_9:%.*]] = sub nsw i32 [[MUL_9]], [[SUB_8]]
+; CHECK-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 10
+; CHECK-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX_10]], align 1
+; CHECK-NEXT: [[CONV_10:%.*]] = sext i8 [[TMP20]] to i32
+; CHECK-NEXT: [[ARRAYIDX2_10:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 10
+; CHECK-NEXT: [[TMP21:%.*]] = load i8, ptr [[ARRAYIDX2_10]], align 1
+; CHECK-NEXT: [[CONV3_10:%.*]] = sext i8 [[TMP21]] to i32
+; CHECK-NEXT: [[MUL_10:%.*]] = mul nsw i32 [[CONV3_10]], [[CONV_10]]
+; CHECK-NEXT: [[SUB_10:%.*]] = sub nsw i32 [[MUL_10]], [[SUB_9]]
+; CHECK-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 11
+; CHECK-NEXT: [[TMP22:%.*]] = load i8, ptr [[ARRAYIDX_11]], align 1
+; CHECK-NEXT: [[CONV_11:%.*]] = sext i8 [[TMP22]] to i32
+; CHECK-NEXT: [[ARRAYIDX2_11:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 11
+; CHECK-NEXT: [[TMP23:%.*]] = load i8, ptr [[ARRAYIDX2_11]], align 1
+; CHECK-NEXT: [[CONV3_11:%.*]] = sext i8 [[TMP23]] to i32
+; CHECK-NEXT: [[MUL_11:%.*]] = mul nsw i32 [[CONV3_11]], [[CONV_11]]
+; CHECK-NEXT: [[SUB_11:%.*]] = sub nsw i32 [[MUL_11]], [[SUB_10]]
+; CHECK-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 12
+; CHECK-NEXT: [[TMP24:%.*]] = load i8, ptr [[ARRAYIDX_12]], align 1
+; CHECK-NEXT: [[CONV_12:%.*]] = sext i8 [[TMP24]] to i32
+; CHECK-NEXT: [[ARRAYIDX2_12:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 12
+; CHECK-NEXT: [[TMP25:%.*]] = load i8, ptr [[ARRAYIDX2_12]], align 1
+; CHECK-NEXT: [[CONV3_12:%.*]] = sext i8 [[TMP25]] to i32
+; CHECK-NEXT: [[MUL_12:%.*]] = mul nsw i32 [[CONV3_12]], [[CONV_12]]
+; CHECK-NEXT: [[SUB_12:%.*]] = sub nsw i32 [[MUL_12]], [[SUB_11]]
+; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 13
+; CHECK-NEXT: [[TMP26:%.*]] = load i8, ptr [[ARRAYIDX_13]], align 1
+; CHECK-NEXT: [[CONV_13:%.*]] = sext i8 [[TMP26]] to i32
+; CHECK-NEXT: [[ARRAYIDX2_13:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 13
+; CHECK-NEXT: [[TMP27:%.*]] = load i8, ptr [[ARRAYIDX2_13]], align 1
+; CHECK-NEXT: [[CONV3_13:%.*]] = sext i8 [[TMP27]] to i32
+; CHECK-NEXT: [[MUL_13:%.*]] = mul nsw i32 [[CONV3_13]], [[CONV_13]]
+; CHECK-NEXT: [[SUB_13:%.*]] = sub nsw i32 [[MUL_13]], [[SUB_12]]
+; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 14
+; CHECK-NEXT: [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX_14]], align 1
+; CHECK-NEXT: [[CONV_14:%.*]] = sext i8 [[TMP28]] to i32
+; CHECK-NEXT: [[ARRAYIDX2_14:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 14
+; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX2_14]], align 1
+; CHECK-NEXT: [[CONV3_14:%.*]] = sext i8 [[TMP29]] to i32
+; CHECK-NEXT: [[MUL_14:%.*]] = mul nsw i32 [[CONV3_14]], [[CONV_14]]
+; CHECK-NEXT: [[SUB_14:%.*]] = sub nsw i32 [[MUL_14]], [[SUB_13]]
+; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 15
+; CHECK-NEXT: [[TMP30:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1
+; CHECK-NEXT: [[CONV_15:%.*]] = sext i8 [[TMP30]] to i32
+; CHECK-NEXT: [[ARRAYIDX2_15:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 15
+; CHECK-NEXT: [[TMP31:%.*]] = load i8, ptr [[ARRAYIDX2_15]], align 1
+; CHECK-NEXT: [[CONV3_15:%.*]] = sext i8 [[TMP31]] to i32
+; CHECK-NEXT: [[MUL_15:%.*]] = mul nsw i32 [[CONV3_15]], [[CONV_15]]
+; CHECK-NEXT: [[SUB_15:%.*]] = sub nsw i32 [[MUL_15]], [[SUB_14]]
+; CHECK-NEXT: ret i32 [[SUB_15]]
+;
+entry:
+  %0 = load i8, ptr %x
+  %conv = sext i8 %0 to i32
+  %1 = load i8, ptr %y
+  %conv3 = sext i8 %1 to i32
+  %mul = mul nsw i32 %conv3, %conv
+  %arrayidx.1 = getelementptr inbounds nuw i8, ptr %x, i64 1
+  %2 = load i8, ptr %arrayidx.1
+  %conv.1 = sext i8 %2 to i32
+  %arrayidx2.1 = getelementptr inbounds nuw i8, ptr %y, i64 1
+  %3 = load i8, ptr %arrayidx2.1
+  %conv3.1 = sext i8 %3 to i32
+  %mul.1 = mul nsw i32 %conv3.1, %conv.1
+  %sub.1 = sub nsw i32 %mul.1, %mul
+  %arrayidx.2 = getelementptr inbounds nuw i8, ptr %x, i64 2
+  %4 = load i8, ptr %arrayidx.2
+  %conv.2 = sext i8 %4 to i32
+  %arrayidx2.2 = getelementptr inbounds nuw i8, ptr %y, i64 2
+  %5 = load i8, ptr %arrayidx2.2
+  %conv3.2 = sext i8 %5 to i32
+  %mul.2 = mul nsw i32 %conv3.2, %conv.2
+  %sub.2 = sub nsw i32 %mul.2, %sub.1
+  %arrayidx.3 = getelementptr inbounds nuw i8, ptr %x, i64 3
+  %6 = load i8, ptr %arrayidx.3
+  %conv.3 = sext i8 %6 to i32
+  %arrayidx2.3 = getelementptr inbounds nuw i8, ptr %y, i64 3
+  %7 = load i8, ptr %arrayidx2.3
+  %conv3.3 = sext i8 %7 to i32
+  %mul.3 = mul nsw i32 %conv3.3, %conv.3
+  %sub.3 = sub nsw i32 %mul.3, %sub.2
+  %arrayidx.4 = getelementptr inbounds nuw i8, ptr %x, i64 4
+  %8 = load i8, ptr %arrayidx.4
+  %conv.4 = sext i8 %8 to i32
+  %arrayidx2.4 = getelementptr inbounds nuw i8, ptr %y, i64 4
+  %9 = load i8, ptr %arrayidx2.4
+  %conv3.4 = sext i8 %9 to i32
+  %mul.4 = mul nsw i32 %conv3.4, %conv.4
+  %sub.4 = sub nsw i32 %mul.4, %sub.3
+  %arrayidx.5 = getelementptr inbounds nuw i8, ptr %x, i64 5
+  %10 = load i8, ptr %arrayidx.5
+  %conv.5 = sext i8 %10 to i32
+  %arrayidx2.5 = getelementptr inbounds nuw i8, ptr %y, i64 5
+  %11 = load i8, ptr %arrayidx2.5
+  %conv3.5 = sext i8 %11 to i32
+  %mul.5 = mul nsw i32 %conv3.5, %conv.5
+  %sub.5 = sub nsw i32 %mul.5, %sub.4
+  %arrayidx.6 = getelementptr inbounds nuw i8, ptr %x, i64 6
+  %12 = load i8, ptr %arrayidx.6
+  %conv.6 = sext i8 %12 to i32
+  %arrayidx2.6 = getelementptr inbounds nuw i8, ptr %y, i64 6
+  %13 = load i8, ptr %arrayidx2.6
+  %conv3.6 = sext i8 %13 to i32
+  %mul.6 = mul nsw i32 %conv3.6, %conv.6
+  %sub.6 = sub nsw i32 %mul.6, %sub.5
+  %arrayidx.7 = getelementptr inbounds nuw i8, ptr %x, i64 7
+  %14 = load i8, ptr %arrayidx.7
+  %conv.7 = sext i8 %14 to i32
+  %arrayidx2.7 = getelementptr inbounds nuw i8, ptr %y, i64 7
+  %15 = load i8, ptr %arrayidx2.7
+  %conv3.7 = sext i8 %15 to i32
+  %mul.7 = mul nsw i32 %conv3.7, %conv.7
+  %sub.7 = sub nsw i32 %mul.7, %sub.6
+  %arrayidx.8 = getelementptr inbounds nuw i8, ptr %x, i64 8
+  %16 = load i8, ptr %arrayidx.8
+  %conv.8 = sext i8 %16 to i32
+  %arrayidx2.8 = getelementptr inbounds nuw i8, ptr %y, i64 8
+  %17 = load i8, ptr %arrayidx2.8
+  %conv3.8 = sext i8 %17 to i32
+  %mul.8 = mul nsw i32 %conv3.8, %conv.8
+  %sub.8 = sub nsw i32 %mul.8, %sub.7
+  %arrayidx.9 = getelementptr inbounds nuw i8, ptr %x, i64 9
+  %18 = load i8, ptr %arrayidx.9
+  %conv.9 = sext i8 %18 to i32
+  %arrayidx2.9 = getelementptr inbounds nuw i8, ptr %y, i64 9
+  %19 = load i8, ptr %arrayidx2.9
+  %conv3.9 = sext i8 %19 to i32
+  %mul.9 = mul nsw i32 %conv3.9, %conv.9
+  %sub.9 = sub nsw i32 %mul.9, %sub.8
+  %arrayidx.10 = getelementptr inbounds nuw i8, ptr %x, i64 10
+  %20 = load i8, ptr %arrayidx.10
+  %conv.10 = sext i8 %20 to i32
+  %arrayidx2.10 = getelementptr inbounds nuw i8, ptr %y, i64 10
+  %21 = load i8, ptr %arrayidx2.10
+  %conv3.10 = sext i8 %21 to i32
+  %mul.10 = mul nsw i32 %conv3.10, %conv.10
+  %sub.10 = sub nsw i32 %mul.10, %sub.9
+  %arrayidx.11 = getelementptr inbounds nuw i8, ptr %x, i64 11
+  %22 = load i8, ptr %arrayidx.11
+  %conv.11 = sext i8 %22 to i32
+  %arrayidx2.11 = getelementptr inbounds nuw i8, ptr %y, i64 11
+  %23 = load i8, ptr %arrayidx2.11
+  %conv3.11 = sext i8 %23 to i32
+  %mul.11 = mul nsw i32 %conv3.11, %conv.11
+  %sub.11 = sub nsw i32 %mul.11, %sub.10
+  %arrayidx.12 = getelementptr inbounds nuw i8, ptr %x, i64 12
+  %24 = load i8, ptr %arrayidx.12
+  %conv.12 = sext i8 %24 to i32
+  %arrayidx2.12 = getelementptr inbounds nuw i8, ptr %y, i64 12
+  %25 = load i8, ptr %arrayidx2.12
+  %conv3.12 = sext i8 %25 to i32
+  %mul.12 = mul nsw i32 %conv3.12, %conv.12
+  %sub.12 = sub nsw i32 %mul.12, %sub.11
+  %arrayidx.13 = getelementptr inbounds nuw i8, ptr %x, i64 13
+  %26 = load i8, ptr %arrayidx.13
+  %conv.13 = sext i8 %26 to i32
+  %arrayidx2.13 = getelementptr inbounds nuw i8, ptr %y, i64 13
+  %27 = load i8, ptr %arrayidx2.13
+  %conv3.13 = sext i8 %27 to i32
+  %mul.13 = mul nsw i32 %conv3.13, %conv.13
+  %sub.13 = sub nsw i32 %mul.13, %sub.12
+  %arrayidx.14 = getelementptr inbounds nuw i8, ptr %x, i64 14
+  %28 = load i8, ptr %arrayidx.14
+  %conv.14 = sext i8 %28 to i32
+  %arrayidx2.14 = getelementptr inbounds nuw i8, ptr %y, i64 14
+  %29 = load i8, ptr %arrayidx2.14
+  %conv3.14 = sext i8 %29 to i32
+  %mul.14 = mul nsw i32 %conv3.14, %conv.14
+  %sub.14 = sub nsw i32 %mul.14, %sub.13
+  %arrayidx.15 = getelementptr inbounds nuw i8, ptr %x, i64 15
+  %30 = load i8, ptr %arrayidx.15
+  %conv.15 = sext i8 %30 to i32
+  %arrayidx2.15 = getelementptr inbounds nuw i8, ptr %y, i64 15
+  %31 = load i8, ptr %arrayidx2.15
+  %conv3.15 = sext i8 %31 to i32
+  %mul.15 = mul nsw i32 %conv3.15, %conv.15
+  %sub.15 = sub nsw i32 %mul.15, %sub.14
+  ret i32 %sub.15
+}
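
A closing note on the semantics the VPlan tests above make visible: a recipe such as REDUCE ir<%add> = ir<%accum> + reduce.sub (ir<%mul>) computes the accumulator minus a plain add-reduction of the products, which is why the Sub case can reuse the Add reduction cost machinery almost unchanged. A self-contained model of that identity (assumed names, not from the patch):

  #include <cstddef>
  #include <cstdint>

  // acc -= a[i]*b[i] iterated N times equals init - sum(a[i]*b[i]) in i32
  // wraparound arithmetic; only the final combine differs from an add
  // reduction.
  int32_t SubReduce(int32_t Init, const uint8_t *A, const uint8_t *B,
                    size_t N) {
    uint32_t Sum = 0;
    for (size_t I = 0; I != N; ++I)
      Sum += uint32_t(A[I]) * uint32_t(B[I]); // the reusable add reduction
    return int32_t(uint32_t(Init) - Sum);     // single subtract at the end
  }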