diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index c4ba8e9857dc4..af78e0c1e4799 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1647,12 +1647,12 @@ class TargetTransformInfo {
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;

   /// Calculate the cost of an extended reduction pattern, similar to
-  /// getArithmeticReductionCost of an Add reduction with multiply and optional
-  /// extensions. This is the cost of as:
-  /// ResTy vecreduce.add(mul (A, B)).
-  /// ResTy vecreduce.add(mul(ext(Ty A), ext(Ty B)).
+  /// getArithmeticReductionCost of an Add/Sub reduction with multiply and
+  /// optional extensions. This is the cost of:
+  /// * ResTy vecreduce.add/sub(mul (A, B)), or
+  /// * ResTy vecreduce.add/sub(mul(ext(Ty A), ext(Ty B))).
   LLVM_ABI InstructionCost getMulAccReductionCost(
-      bool IsUnsigned, Type *ResTy, VectorType *Ty,
+      bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;

   /// Calculate the cost of an extended reduction pattern, similar to
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 43813d2f3acb5..9c2ebb1891cac 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -971,8 +971,8 @@ class TargetTransformInfoImplBase {
   }

   virtual InstructionCost
-  getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty,
-                         TTI::TargetCostKind CostKind) const {
+  getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy,
+                         VectorType *Ty, TTI::TargetCostKind CostKind) const {
     return 1;
   }

diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 0a10b51f97c63..dce423fc1b18b 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -3260,14 +3260,17 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
   }

   InstructionCost
-  getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty,
+  getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy,
+                         VectorType *Ty,
                          TTI::TargetCostKind CostKind) const override {
     // Without any native support, this is equivalent to the cost of
     // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or
     // vecreduce.add(mul(A, B)).
+    assert((RedOpcode == Instruction::Add || RedOpcode == Instruction::Sub) &&
+           "The reduction opcode is expected to be Add or Sub.");
     VectorType *ExtTy = VectorType::get(ResTy, Ty);
     InstructionCost RedCost = thisT()->getArithmeticReductionCost(
-        Instruction::Add, ExtTy, std::nullopt, CostKind);
+        RedOpcode, ExtTy, std::nullopt, CostKind);
     InstructionCost ExtCost = thisT()->getCastInstrCost(
         IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
         TTI::CastContextHint::None, CostKind);
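
The fallback above prices the fused pattern as its discrete parts: one reduction at the (possibly extended) result type, plus the extends and the multiply. For orientation, a minimal sketch of querying the new overload follows; it is not part of the patch, and Ctx (an LLVMContext) and TTI (a TargetTransformInfo) are assumed to be in scope:

  // Cost of i32 vecreduce.sub(mul(zext(<16 x i8> a), zext(<16 x i8> b))),
  // i.e. the Sub form of the mul-accumulate pattern this patch enables.
  Type *ResTy = Type::getInt32Ty(Ctx);
  auto *SrcVecTy = FixedVectorType::get(Type::getInt8Ty(Ctx), 16);
  InstructionCost Cost = TTI.getMulAccReductionCost(
      /*IsUnsigned=*/true, Instruction::Sub, ResTy, SrcVecTy,
      TTI::TCK_RecipThroughput);
  // An invalid result means the target rejected the opcode outright, as the
  // ARM/MVE override below does for any reduction opcode other than Add.
  bool Feasible = Cost.isValid();
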
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 4ac8f03e6dbf5..b4fa0d5964cb6 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1283,9 +1283,10 @@ InstructionCost TargetTransformInfo::getExtendedReductionCost(
 }

 InstructionCost TargetTransformInfo::getMulAccReductionCost(
-    bool IsUnsigned, Type *ResTy, VectorType *Ty,
+    bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty,
     TTI::TargetCostKind CostKind) const {
-  return TTIImpl->getMulAccReductionCost(IsUnsigned, ResTy, Ty, CostKind);
+  return TTIImpl->getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, Ty,
+                                         CostKind);
 }

 InstructionCost
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 490f6391c15a0..922da10f4e39f 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5486,13 +5486,14 @@ InstructionCost AArch64TTIImpl::getExtendedReductionCost(
 }

 InstructionCost
-AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
-                                       VectorType *VecTy,
+AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
+                                       Type *ResTy, VectorType *VecTy,
                                        TTI::TargetCostKind CostKind) const {
   EVT VecVT = TLI->getValueType(DL, VecTy);
   EVT ResVT = TLI->getValueType(DL, ResTy);

-  if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple()) {
+  if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple() &&
+      RedOpcode == Instruction::Add) {
     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);

     // The legal cases with dotprod are
@@ -5503,7 +5504,8 @@ AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
     return LT.first + 2;
   }

-  return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, VecTy, CostKind);
+  return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, VecTy,
+                                       CostKind);
 }

 InstructionCost
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 42ae962b3b426..b994ca74aa222 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -460,7 +460,7 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
       TTI::TargetCostKind CostKind) const override;

   InstructionCost getMulAccReductionCost(
-      bool IsUnsigned, Type *ResTy, VectorType *Ty,
+      bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const override;

   InstructionCost
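
Why the dotprod fast path is now additionally gated on Instruction::Add: AArch64 udot/sdot can only accumulate. As a scalar model of what one sdot lane computes (an ISA fact rather than code from this patch, with hypothetical names):

  #include <cstdint>

  // One i32 lane of sdot: four i8*i8 products added into the accumulator.
  // No subtracting variant exists, so a Sub reduction cannot map onto the
  // instruction and falls through to the BaseT costing above.
  int32_t SDotLane(int32_t Acc, const int8_t A[4], const int8_t B[4]) {
    for (int I = 0; I < 4; ++I)
      Acc += int32_t(A[I]) * int32_t(B[I]);
    return Acc;
  }
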
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 6b2854171c819..9b250e6cac3ab 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1916,9 +1916,11 @@ InstructionCost ARMTTIImpl::getExtendedReductionCost(
 }

 InstructionCost
-ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
-                                   VectorType *ValTy,
+ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
+                                   Type *ResTy, VectorType *ValTy,
                                    TTI::TargetCostKind CostKind) const {
+  if (RedOpcode != Instruction::Add)
+    return InstructionCost::getInvalid(CostKind);
   EVT ValVT = TLI->getValueType(DL, ValTy);
   EVT ResVT = TLI->getValueType(DL, ResTy);

@@ -1939,7 +1941,8 @@ ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
     return ST->getMVEVectorCostFactor(CostKind) * LT.first;
   }

-  return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, ValTy, CostKind);
+  return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, ValTy,
+                                       CostKind);
 }

 InstructionCost
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index cdd8bcb9f7416..0810c5532ed91 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -299,7 +299,8 @@ class ARMTTIImpl final : public BasicTTIImplBase<ARMTTIImpl> {
                                            VectorType *ValTy, std::optional<FastMathFlags> FMF,
                                            TTI::TargetCostKind CostKind) const override;
   InstructionCost
-  getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *ValTy,
+  getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy,
+                         VectorType *ValTy,
                          TTI::TargetCostKind CostKind) const override;

   InstructionCost
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1b1797ab30a35..7c43da0b4d552 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5414,7 +5414,8 @@ LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
                                  TTI::CastContextHint::None, CostKind, RedOp);

       InstructionCost RedCost = TTI.getMulAccReductionCost(
-          IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
+          IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType,
+          CostKind);

       if (RedCost.isValid() &&
           RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
@@ -5459,7 +5460,8 @@ LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);

       InstructionCost RedCost = TTI.getMulAccReductionCost(
-          IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
+          IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType,
+          CostKind);
       InstructionCost ExtraExtCost = 0;
       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
@@ -5478,7 +5480,8 @@ LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);

       InstructionCost RedCost = TTI.getMulAccReductionCost(
-          true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
+          true, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), VectorTy,
+          CostKind);

       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
         return I == RetI ? RedCost : 0;
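
All three call sites above share the same acceptance test: the fused reduction cost must be valid and beat the sum of the discrete instructions it replaces. Restated as a standalone predicate (illustrative names mirroring the first call site, not code from the patch):

  // InstructionCost carries an invalid state, so a target that returns
  // InstructionCost::getInvalid() for a Sub reduction loses automatically.
  bool PreferFusedMulAcc(InstructionCost RedCost, InstructionCost ExtCost,
                         InstructionCost MulCost, InstructionCost Ext2Cost,
                         InstructionCost BaseCost) {
    return RedCost.isValid() &&
           RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost;
  }
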
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index bd9a93ed57b8a..93e97faaefb4e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2803,10 +2803,10 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
       toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
   assert(RedTy->isIntegerTy() &&
          "VPExpressionRecipe only supports integer types currently.");
+  unsigned Opcode = RecurrenceDescriptor::getOpcode(
+      cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind());
   switch (ExpressionType) {
   case ExpressionTypes::ExtendedReduction: {
-    unsigned Opcode = RecurrenceDescriptor::getOpcode(
-        cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind());
     return Ctx.TTI.getExtendedReductionCost(
         Opcode,
         cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
@@ -2814,13 +2814,14 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
         RedTy, SrcVecTy, std::nullopt, Ctx.CostKind);
   }
   case ExpressionTypes::MulAccReduction:
-    return Ctx.TTI.getMulAccReductionCost(false, RedTy, SrcVecTy, Ctx.CostKind);
+    return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy,
+                                          Ctx.CostKind);
   case ExpressionTypes::ExtMulAccReduction:
     return Ctx.TTI.getMulAccReductionCost(
         cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
             Instruction::ZExt,
-        RedTy, SrcVecTy, Ctx.CostKind);
+        Opcode, RedTy, SrcVecTy, Ctx.CostKind);
   }
   llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
 }
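
Hoisting Opcode above the switch works because every ExpressionType here ends in a reduction recipe, and RecurrenceDescriptor::getOpcode maps its recurrence kind to the IR opcode the TTI hooks now take. A simplified sketch of that mapping for the two kinds involved here (the real function covers many more kinds):

  static unsigned GetRedOpcode(RecurKind Kind) {
    switch (Kind) {
    case RecurKind::Add:
      return Instruction::Add;
    case RecurKind::Sub:
      return Instruction::Sub; // the kind this patch starts costing
    default:
      llvm_unreachable("kind not covered by this sketch");
    }
  }
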
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 6c5f9b7302292..362480a923b6f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3151,7 +3151,7 @@ static VPExpressionRecipe *
 tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
                                           VPCostContext &Ctx, VFRange &Range) {
   unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
-  if (Opcode != Instruction::Add)
+  if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
     return nullptr;

   Type *RedTy = Ctx.Types.inferScalarType(Red);
@@ -3166,8 +3166,8 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
     Type *SrcTy = Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
     auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
-    InstructionCost MulAccCost =
-        Ctx.TTI.getMulAccReductionCost(isZExt, RedTy, SrcVecTy, CostKind);
+    InstructionCost MulAccCost = Ctx.TTI.getMulAccReductionCost(
+        isZExt, Opcode, RedTy, SrcVecTy, CostKind);
     InstructionCost MulCost = Mul->computeCost(VF, Ctx);
     InstructionCost RedCost = Red->computeCost(VF, Ctx);
     InstructionCost ExtCost = 0;
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index c88ed95de2946..bc93cc6ab725a 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1468,8 +1468,8 @@ static void analyzeCostOfVecReduction(const IntrinsicInst &II,
                                   TTI::CastContextHint::None, CostKind, RedOp);
     CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
-    CostAfterReduction =
-        TTI.getMulAccReductionCost(IsUnsigned, II.getType(), ExtType, CostKind);
+    CostAfterReduction = TTI.getMulAccReductionCost(
+        IsUnsigned, ReductionOpc, II.getType(), ExtType, CostKind);
     return;
   }
   CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
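
The LoopVectorize tests below exercise the new path end to end. The mul-accumulate variant corresponds to this scalar model (a sketch with assumed names and types, matching print_mulacc_sub below, which zero-extends i8 inputs, multiplies in i32, and subtracts from the accumulator):

  #include <cstddef>
  #include <cstdint>

  int32_t MulAccSub(const uint8_t *A, const uint8_t *B, size_t N) {
    uint32_t Acc = 0; // i32 two's-complement wraparound, as in the IR
    for (size_t I = 0; I != N; ++I)
      Acc -= uint32_t(A[I]) * uint32_t(B[I]); // zext, mul, sub-accumulate
    return int32_t(Acc);
  }
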
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
index 4af3fa9202c77..2ffb8203d49dd 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
@@ -416,3 +416,238 @@ exit:
   %r.0.lcssa = phi i64 [ %rdx.next, %loop ]
   ret i64 %r.0.lcssa
 }
+
+define i64 @print_extended_sub_reduction(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) {
+; CHECK-LABEL: 'print_extended_sub_reduction'
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<%n> = original trip-count
+; CHECK-EMPTY:
+; CHECK: vector.ph:
+; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<1>
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
+; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[ADDR:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN ir<[[LOAD:%.+]]> = load vp<[[ADDR]]>
+; CHECK-NEXT: EXPRESSION vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.sub (ir<[[LOAD]]> zext to i64)
+; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
+  %rdx = phi i64 [ %rdx.next, %loop ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %iv
+  %load0 = load i32, ptr %arrayidx, align 4
+  %conv0 = zext i32 %load0 to i64
+  %rdx.next = sub nsw i64 %rdx, %conv0
+  %iv.next = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, %n
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  %r.0.lcssa = phi i64 [ %rdx.next, %loop ]
+  ret i64 %r.0.lcssa
+}
+
+define i32 @print_mulacc_sub(ptr %a, ptr %b) {
+; CHECK-LABEL: 'print_mulacc_sub'
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF
+; CHECK-NEXT: Live-in vp<%1> = VF * UF
+; CHECK-NEXT: Live-in vp<%2> = vector-trip-count
+; CHECK-NEXT: Live-in ir<1024> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: EMIT vp<%3> = reduction-start-vector ir<0>, ir<0>, ir<1>
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%4> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi vp<%3>, vp<%8>
+; CHECK-NEXT: vp<%5> = SCALAR-STEPS vp<%4>, ir<1>, vp<%0>
+; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<%5>
+; CHECK-NEXT: vp<%6> = vector-pointer ir<%gep.a>
+; CHECK-NEXT: WIDEN ir<%load.a> = load vp<%6>
+; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<%5>
+; CHECK-NEXT: vp<%7> = vector-pointer ir<%gep.b>
+; CHECK-NEXT: WIDEN ir<%load.b> = load vp<%7>
+; CHECK-NEXT: EXPRESSION vp<%8> = ir<%accum> + reduce.sub (mul (ir<%load.b> zext to i32), (ir<%load.a> zext to i32))
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%4>, vp<%1>
+; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<%10> = compute-reduction-result ir<%accum>, vp<%8>
+; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<1024>, vp<%2>
+; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n>
+; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<exit>:
+; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %loop ] (extra operand: vp<%10> from middle.block)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%2>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%10>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: Successor(s): ir-bb<loop>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop>:
+; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph)
+; CHECK-NEXT: IR %accum = phi i32 [ 0, %entry ], [ %add, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph)
+; CHECK-NEXT: IR %gep.a = getelementptr i8, ptr %a, i64 %iv
+; CHECK-NEXT: IR %load.a = load i8, ptr %gep.a, align 1
+; CHECK-NEXT: IR %ext.a = zext i8 %load.a to i32
+; CHECK-NEXT: IR %gep.b = getelementptr i8, ptr %b, i64 %iv
+; CHECK-NEXT: IR %load.b = load i8, ptr %gep.b, align 1
+; CHECK-NEXT: IR %ext.b = zext i8 %load.b to i32
+; CHECK-NEXT: IR %mul = mul i32 %ext.b, %ext.a
+; CHECK-NEXT: IR %add = sub i32 %accum, %mul
+; CHECK-NEXT: IR %iv.next = add i64 %iv, 1
+; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %iv.next, 1024
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' {
+; CHECK-NEXT: Live-in ir<1024> = vector-trip-count
+; CHECK-NEXT: Live-in ir<1024> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: EMIT branch-on-cond ir<false>
+; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector.body
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ]
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi ir<0>, ir<%add>
+; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<%index>
+; CHECK-NEXT: WIDEN ir<%load.a> = load ir<%gep.a>
+; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<%index>
+; CHECK-NEXT: WIDEN ir<%load.b> = load ir<%gep.b>
+; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32
+; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32
+; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a>
+; CHECK-NEXT: REDUCE ir<%add> = ir<%accum> + reduce.sub (ir<%mul>)
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4>
+; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<1024>
+; CHECK-NEXT: Successor(s): middle.block, vector.body
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<%accum>, ir<%add>
+; CHECK-NEXT: Successor(s): ir-bb<exit>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<exit>:
+; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %loop ] (extra operand: vp<[[RED_RESULT]]> from middle.block)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<scalar.ph>:
+; CHECK-NEXT: Successor(s): ir-bb<loop>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop>:
+; CHECK-NEXT: IR %iv = phi i64 [ 0, %scalar.ph ], [ %iv.next, %loop ] (extra operand: ir<0> from ir-bb<scalar.ph>)
+; CHECK-NEXT: IR %accum = phi i32 [ 0, %scalar.ph ], [ %add, %loop ] (extra operand: ir<0> from ir-bb<scalar.ph>)
+; CHECK-NEXT: IR %gep.a = getelementptr i8, ptr %a, i64 %iv
+; CHECK-NEXT: IR %load.a = load i8, ptr %gep.a, align 1
+; CHECK-NEXT: IR %ext.a = zext i8 %load.a to i32
+; CHECK-NEXT: IR %gep.b = getelementptr i8, ptr %b, i64 %iv
+; CHECK-NEXT: IR %load.b = load i8, ptr %gep.b, align 1
+; CHECK-NEXT: IR %ext.b = zext i8 %load.b to i32
+; CHECK-NEXT: IR %mul = mul i32 %ext.b, %ext.a
+; CHECK-NEXT: IR %add = sub i32 %accum, %mul
+; CHECK-NEXT: IR %iv.next = add i64 %iv, 1
+; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %iv.next, 1024
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %loop ]
+  %gep.a = getelementptr i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = zext i8 %load.a to i32
+  %gep.b = getelementptr i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %ext.b = zext i8 %load.b to i32
+  %mul = mul i32 %ext.b, %ext.a
+  %add = sub i32 %accum, %mul
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret i32 %add
+}
+
+define i64 @print_mulacc_sub_extended(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) {
+; CHECK-LABEL: 'print_mulacc_sub_extended'
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<%n> = original trip-count
+; CHECK-EMPTY:
+; CHECK: vector.ph:
+; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<1>
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
+; CHECK-NEXT: CLONE ir<[[ARRAYIDX0:%.+]]> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[ADDR0:%.+]]> = vector-pointer ir<[[ARRAYIDX0]]>
+; CHECK-NEXT: WIDEN ir<[[LOAD0:%.+]]> = load vp<[[ADDR0]]>
+; CHECK-NEXT: CLONE ir<[[ARRAYIDX1:%.+]]> = getelementptr inbounds ir<%y>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[ADDR1:%.+]]> = vector-pointer ir<[[ARRAYIDX1]]>
+; CHECK-NEXT: WIDEN ir<[[LOAD1:%.+]]> = load vp<[[ADDR1]]>
+; CHECK-NEXT: EXPRESSION vp<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.sub (mul nsw (ir<[[LOAD0]]> sext to i64), (ir<[[LOAD1]]> sext to i64))
+; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
+  %rdx = phi i64 [ %rdx.next, %loop ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i16, ptr %x, i32 %iv
+  %load0 = load i16, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i16, ptr %y, i32 %iv
+  %load1 = load i16, ptr %arrayidx1, align 4
+  %conv0 = sext i16 %load0 to i32
+  %conv1 = sext i16 %load1 to i32
+  %mul = mul nsw i32 %conv0, %conv1
+  %conv = sext i32 %mul to i64
+  %rdx.next = sub nsw i64 %rdx, %conv
+  %iv.next = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, %n
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  %r.0.lcssa = phi i64 [ %rdx.next, %loop ]
+  ret i64 %r.0.lcssa
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll
index 36826eb6681c8..c1a87f0c5f907 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll
@@ -1149,3 +1149,264 @@ entry:
   %add.15 = add nsw i32 %mul.15, %add.14
   ret i32 %add.15
 }
+
+; COST-LABEL: Function: mla_v16i8_i32_sub
+; COST: Cost: '-2'
+define i32 @mla_v16i8_i32_sub(ptr %x, ptr %y) "target-features"="+dotprod" {
+; CHECK-LABEL: @mla_v16i8_i32_sub(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[X:%.*]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP0]] to i32
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[Y:%.*]], align 1
+; CHECK-NEXT: [[CONV3:%.*]] = sext i8 [[TMP1]] to i32
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV3]], [[CONV]]
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX_1]], align 1
+; CHECK-NEXT: [[CONV_1:%.*]] = sext i8 [[TMP2]] to i32
+; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX2_1]], align 1
+; CHECK-NEXT: [[CONV3_1:%.*]] = sext i8 [[TMP3]] to i32
+; CHECK-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[CONV3_1]], [[CONV_1]]
+; CHECK-NEXT: [[SUB_1:%.*]] = sub nsw i32 [[MUL_1]], [[MUL]]
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 2
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX_2]], align 1
+; CHECK-NEXT: [[CONV_2:%.*]] = sext i8 [[TMP4]] to i32
+; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 2
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX2_2]], align 1
+; CHECK-NEXT: [[CONV3_2:%.*]] = sext i8 [[TMP5]] to i32
+; CHECK-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[CONV3_2]], [[CONV_2]]
+; CHECK-NEXT: [[SUB_2:%.*]] = sub nsw i32 [[MUL_2]], [[SUB_1]]
+; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 3
+; CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr [[ARRAYIDX_3]], align 1
+; CHECK-NEXT: [[CONV_3:%.*]] = sext i8 [[TMP6]] to i32
+; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 3
+; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX2_3]], align 1
+; CHECK-NEXT: [[CONV3_3:%.*]] = sext i8 [[TMP7]] to i32
+; CHECK-NEXT: [[MUL_3:%.*]] = mul nsw i32 [[CONV3_3]], [[CONV_3]]
+; CHECK-NEXT: [[SUB_3:%.*]] = sub nsw i32 [[MUL_3]], [[SUB_2]]
+; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 4
+; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_4]], align 1
+; CHECK-NEXT: [[CONV_4:%.*]] = sext i8 [[TMP8]] to i32
+; CHECK-NEXT: [[ARRAYIDX2_4:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 4
+; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX2_4]], align 1
+; CHECK-NEXT: [[CONV3_4:%.*]] = sext i8 [[TMP9]] to i32
+; CHECK-NEXT: [[MUL_4:%.*]] = mul nsw i32 [[CONV3_4]], [[CONV_4]]
+; CHECK-NEXT: [[SUB_4:%.*]] = sub nsw i32 [[MUL_4]], [[SUB_3]]
+; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 5
+; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX_5]], align 1
+; CHECK-NEXT: [[CONV_5:%.*]] = sext i8 [[TMP10]] to i32
+; CHECK-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 5
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX2_5]], align 1
+; CHECK-NEXT: [[CONV3_5:%.*]] = sext i8 [[TMP11]] to i32
+; CHECK-NEXT: [[MUL_5:%.*]] = mul nsw i32 [[CONV3_5]], [[CONV_5]]
+; CHECK-NEXT: [[SUB_5:%.*]] = sub nsw i32 [[MUL_5]], [[SUB_4]]
+; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 6
+; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX_6]], align 1
+; CHECK-NEXT: [[CONV_6:%.*]] = sext i8 [[TMP12]] to i32
+; CHECK-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 6
+; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX2_6]], align 1
+; CHECK-NEXT: [[CONV3_6:%.*]] = sext i8 [[TMP13]] to i32
+; CHECK-NEXT: [[MUL_6:%.*]] = mul nsw i32 [[CONV3_6]], [[CONV_6]]
+; CHECK-NEXT: [[SUB_6:%.*]] = sub nsw i32 [[MUL_6]], [[SUB_5]]
+; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 7
+; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX_7]], align 1
+; CHECK-NEXT: [[CONV_7:%.*]] = sext i8 [[TMP14]] to i32
+; CHECK-NEXT: [[ARRAYIDX2_7:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 7
+; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX2_7]], align 1
+; CHECK-NEXT: [[CONV3_7:%.*]] = sext i8 [[TMP15]] to i32
+; CHECK-NEXT: [[MUL_7:%.*]] = mul nsw i32 [[CONV3_7]], [[CONV_7]]
+; CHECK-NEXT: [[SUB_7:%.*]] = sub nsw i32 [[MUL_7]], [[SUB_6]]
+; CHECK-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 8
+; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX_8]], align 1
+; CHECK-NEXT: [[CONV_8:%.*]] = sext i8 [[TMP16]] to i32
+; CHECK-NEXT: [[ARRAYIDX2_8:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 8
+; CHECK-NEXT: [[TMP17:%.*]] = load i8, ptr [[ARRAYIDX2_8]], align 1
+; CHECK-NEXT: [[CONV3_8:%.*]] = sext i8 [[TMP17]] to i32
+; CHECK-NEXT: [[MUL_8:%.*]] = mul nsw i32 [[CONV3_8]], [[CONV_8]]
+; CHECK-NEXT: [[SUB_8:%.*]] = sub nsw i32 [[MUL_8]], [[SUB_7]]
+; CHECK-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 9
+; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX_9]], align 1
+; CHECK-NEXT: [[CONV_9:%.*]] = sext i8 [[TMP18]] to i32
+; CHECK-NEXT: [[ARRAYIDX2_9:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 9
+; CHECK-NEXT: [[TMP19:%.*]] = load i8, ptr [[ARRAYIDX2_9]], align 1
+; CHECK-NEXT: [[CONV3_9:%.*]] = sext i8 [[TMP19]] to i32
+; CHECK-NEXT: [[MUL_9:%.*]] = mul nsw i32 [[CONV3_9]], [[CONV_9]]
+; CHECK-NEXT: [[SUB_9:%.*]] = sub nsw i32 [[MUL_9]], [[SUB_8]]
+; CHECK-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 10
+; CHECK-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX_10]], align 1
+; CHECK-NEXT: [[CONV_10:%.*]] = sext i8 [[TMP20]] to i32
+; CHECK-NEXT: [[ARRAYIDX2_10:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 10
+; CHECK-NEXT: [[TMP21:%.*]] = load i8, ptr [[ARRAYIDX2_10]], align 1
+; CHECK-NEXT: [[CONV3_10:%.*]] = sext i8 [[TMP21]] to i32
+; CHECK-NEXT: [[MUL_10:%.*]] = mul nsw i32 [[CONV3_10]], [[CONV_10]]
+; CHECK-NEXT: [[SUB_10:%.*]] = sub nsw i32 [[MUL_10]], [[SUB_9]]
+; CHECK-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 11
+; CHECK-NEXT: [[TMP22:%.*]] = load i8, ptr [[ARRAYIDX_11]], align 1
+; CHECK-NEXT: [[CONV_11:%.*]] = sext i8 [[TMP22]] to i32
+; CHECK-NEXT: [[ARRAYIDX2_11:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 11
+; CHECK-NEXT: [[TMP23:%.*]] = load i8, ptr [[ARRAYIDX2_11]], align 1
+; CHECK-NEXT: [[CONV3_11:%.*]] = sext i8 [[TMP23]] to i32
+; CHECK-NEXT: [[MUL_11:%.*]] = mul nsw i32 [[CONV3_11]], [[CONV_11]]
+; CHECK-NEXT: [[SUB_11:%.*]] = sub nsw i32 [[MUL_11]], [[SUB_10]]
+; CHECK-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 12
+; CHECK-NEXT: [[TMP24:%.*]] = load i8, ptr [[ARRAYIDX_12]], align 1
+; CHECK-NEXT: [[CONV_12:%.*]] = sext i8 [[TMP24]] to i32
+; CHECK-NEXT: [[ARRAYIDX2_12:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 12
+; CHECK-NEXT: [[TMP25:%.*]] = load i8, ptr [[ARRAYIDX2_12]], align 1
+; CHECK-NEXT: [[CONV3_12:%.*]] = sext i8 [[TMP25]] to i32
+; CHECK-NEXT: [[MUL_12:%.*]] = mul nsw i32 [[CONV3_12]], [[CONV_12]]
+; CHECK-NEXT: [[SUB_12:%.*]] = sub nsw i32 [[MUL_12]], [[SUB_11]]
+; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 13
+; CHECK-NEXT: [[TMP26:%.*]] = load i8, ptr [[ARRAYIDX_13]], align 1
+; CHECK-NEXT: [[CONV_13:%.*]] = sext i8 [[TMP26]] to i32
+; CHECK-NEXT: [[ARRAYIDX2_13:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 13
+; CHECK-NEXT: [[TMP27:%.*]] = load i8, ptr [[ARRAYIDX2_13]], align 1
+; CHECK-NEXT: [[CONV3_13:%.*]] = sext i8 [[TMP27]] to i32
+; CHECK-NEXT: [[MUL_13:%.*]] = mul nsw i32 [[CONV3_13]], [[CONV_13]]
+; CHECK-NEXT: [[SUB_13:%.*]] = sub nsw i32 [[MUL_13]], [[SUB_12]]
+; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 14
+; CHECK-NEXT: [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX_14]], align 1
+; CHECK-NEXT: [[CONV_14:%.*]] = sext i8 [[TMP28]] to i32
+; CHECK-NEXT: [[ARRAYIDX2_14:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 14
+; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX2_14]], align 1
+; CHECK-NEXT: [[CONV3_14:%.*]] = sext i8 [[TMP29]] to i32
+; CHECK-NEXT: [[MUL_14:%.*]] = mul nsw i32 [[CONV3_14]], [[CONV_14]]
+; CHECK-NEXT: [[SUB_14:%.*]] = sub nsw i32 [[MUL_14]], [[SUB_13]]
+; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 15
+; CHECK-NEXT: [[TMP30:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1
+; CHECK-NEXT: [[CONV_15:%.*]] = sext i8 [[TMP30]] to i32
+; CHECK-NEXT: [[ARRAYIDX2_15:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 15
+; CHECK-NEXT: [[TMP31:%.*]] = load i8, ptr [[ARRAYIDX2_15]], align 1
+; CHECK-NEXT: [[CONV3_15:%.*]] = sext i8 [[TMP31]] to i32
+; CHECK-NEXT: [[MUL_15:%.*]] = mul nsw i32 [[CONV3_15]], [[CONV_15]]
+; CHECK-NEXT: [[SUB_15:%.*]] = sub nsw i32 [[MUL_15]], [[SUB_14]]
+; CHECK-NEXT: ret i32 [[SUB_15]]
+;
+entry:
+  %0 = load i8, ptr %x
+  %conv = sext i8 %0 to i32
+  %1 = load i8, ptr %y
+  %conv3 = sext i8 %1 to i32
+  %mul = mul nsw i32 %conv3, %conv
+  %arrayidx.1 = getelementptr inbounds nuw i8, ptr %x, i64 1
+  %2 = load i8, ptr %arrayidx.1
+  %conv.1 = sext i8 %2 to i32
+  %arrayidx2.1 = getelementptr inbounds nuw i8, ptr %y, i64 1
+  %3 = load i8, ptr %arrayidx2.1
+  %conv3.1 = sext i8 %3 to i32
+  %mul.1 = mul nsw i32 %conv3.1, %conv.1
+  %sub.1 = sub nsw i32 %mul.1, %mul
+  %arrayidx.2 = getelementptr inbounds nuw i8, ptr %x, i64 2
+  %4 = load i8, ptr %arrayidx.2
+  %conv.2 = sext i8 %4 to i32
+  %arrayidx2.2 = getelementptr inbounds nuw i8, ptr %y, i64 2
+  %5 = load i8, ptr %arrayidx2.2
+  %conv3.2 = sext i8 %5 to i32
+  %mul.2 = mul nsw i32 %conv3.2, %conv.2
+  %sub.2 = sub nsw i32 %mul.2, %sub.1
+  %arrayidx.3 = getelementptr inbounds nuw i8, ptr %x, i64 3
+  %6 = load i8, ptr %arrayidx.3
+  %conv.3 = sext i8 %6 to i32
+  %arrayidx2.3 = getelementptr inbounds nuw i8, ptr %y, i64 3
+  %7 = load i8, ptr %arrayidx2.3
+  %conv3.3 = sext i8 %7 to i32
+  %mul.3 = mul nsw i32 %conv3.3, %conv.3
+  %sub.3 = sub nsw i32 %mul.3, %sub.2
+  %arrayidx.4 = getelementptr inbounds nuw i8, ptr %x, i64 4
+  %8 = load i8, ptr %arrayidx.4
+  %conv.4 = sext i8 %8 to i32
+  %arrayidx2.4 = getelementptr inbounds nuw i8, ptr %y, i64 4
+  %9 = load i8, ptr %arrayidx2.4
+  %conv3.4 = sext i8 %9 to i32
+  %mul.4 = mul nsw i32 %conv3.4, %conv.4
+  %sub.4 = sub nsw i32 %mul.4, %sub.3
+  %arrayidx.5 = getelementptr inbounds nuw i8, ptr %x, i64 5
+  %10 = load i8, ptr %arrayidx.5
+  %conv.5 = sext i8 %10 to i32
+  %arrayidx2.5 = getelementptr inbounds nuw i8, ptr %y, i64 5
+  %11 = load i8, ptr %arrayidx2.5
+  %conv3.5 = sext i8 %11 to i32
+  %mul.5 = mul nsw i32 %conv3.5, %conv.5
+  %sub.5 = sub nsw i32 %mul.5, %sub.4
+  %arrayidx.6 = getelementptr inbounds nuw i8, ptr %x, i64 6
+  %12 = load i8, ptr %arrayidx.6
+  %conv.6 = sext i8 %12 to i32
+  %arrayidx2.6 = getelementptr inbounds nuw i8, ptr %y, i64 6
+  %13 = load i8, ptr %arrayidx2.6
+  %conv3.6 = sext i8 %13 to i32
+  %mul.6 = mul nsw i32 %conv3.6, %conv.6
+  %sub.6 = sub nsw i32 %mul.6, %sub.5
+  %arrayidx.7 = getelementptr inbounds nuw i8, ptr %x, i64 7
+  %14 = load i8, ptr %arrayidx.7
+  %conv.7 = sext i8 %14 to i32
+  %arrayidx2.7 = getelementptr inbounds nuw i8, ptr %y, i64 7
+  %15 = load i8, ptr %arrayidx2.7
+  %conv3.7 = sext i8 %15 to i32
+  %mul.7 = mul nsw i32 %conv3.7, %conv.7
+  %sub.7 = sub nsw i32 %mul.7, %sub.6
+  %arrayidx.8 = getelementptr inbounds nuw i8, ptr %x, i64 8
+  %16 = load i8, ptr %arrayidx.8
+  %conv.8 = sext i8 %16 to i32
+  %arrayidx2.8 = getelementptr inbounds nuw i8, ptr %y, i64 8
+  %17 = load i8, ptr %arrayidx2.8
+  %conv3.8 = sext i8 %17 to i32
+  %mul.8 = mul nsw i32 %conv3.8, %conv.8
+  %sub.8 = sub nsw i32 %mul.8, %sub.7
+  %arrayidx.9 = getelementptr inbounds nuw i8, ptr %x, i64 9
+  %18 = load i8, ptr %arrayidx.9
+  %conv.9 = sext i8 %18 to i32
+  %arrayidx2.9 = getelementptr inbounds nuw i8, ptr %y, i64 9
+  %19 = load i8, ptr %arrayidx2.9
+  %conv3.9 = sext i8 %19 to i32
+  %mul.9 = mul nsw i32 %conv3.9, %conv.9
+  %sub.9 = sub nsw i32 %mul.9, %sub.8
+  %arrayidx.10 = getelementptr inbounds nuw i8, ptr %x, i64 10
+  %20 = load i8, ptr %arrayidx.10
+  %conv.10 = sext i8 %20 to i32
+  %arrayidx2.10 = getelementptr inbounds nuw i8, ptr %y, i64 10
+  %21 = load i8, ptr %arrayidx2.10
+  %conv3.10 = sext i8 %21 to i32
+  %mul.10 = mul nsw i32 %conv3.10, %conv.10
+  %sub.10 = sub nsw i32 %mul.10, %sub.9
+  %arrayidx.11 = getelementptr inbounds nuw i8, ptr %x, i64 11
+  %22 = load i8, ptr %arrayidx.11
+  %conv.11 = sext i8 %22 to i32
+  %arrayidx2.11 = getelementptr inbounds nuw i8, ptr %y, i64 11
+  %23 = load i8, ptr %arrayidx2.11
+  %conv3.11 = sext i8 %23 to i32
+  %mul.11 = mul nsw i32 %conv3.11, %conv.11
+  %sub.11 = sub nsw i32 %mul.11, %sub.10
+  %arrayidx.12 = getelementptr inbounds nuw i8, ptr %x, i64 12
+  %24 = load i8, ptr %arrayidx.12
+  %conv.12 = sext i8 %24 to i32
+  %arrayidx2.12 = getelementptr inbounds nuw i8, ptr %y, i64 12
+  %25 = load i8, ptr %arrayidx2.12
+  %conv3.12 = sext i8 %25 to i32
+  %mul.12 = mul nsw i32 %conv3.12, %conv.12
+  %sub.12 = sub nsw i32 %mul.12, %sub.11
+  %arrayidx.13 = getelementptr inbounds nuw i8, ptr %x, i64 13
+  %26 = load i8, ptr %arrayidx.13
+  %conv.13 = sext i8 %26 to i32
+  %arrayidx2.13 = getelementptr inbounds nuw i8, ptr %y, i64 13
+  %27 = load i8, ptr %arrayidx2.13
+  %conv3.13 = sext i8 %27 to i32
+  %mul.13 = mul nsw i32 %conv3.13, %conv.13
+  %sub.13 = sub nsw i32 %mul.13, %sub.12
+  %arrayidx.14 = getelementptr inbounds nuw i8, ptr %x, i64 14
+  %28 = load i8, ptr %arrayidx.14
+  %conv.14 = sext i8 %28 to i32
+  %arrayidx2.14 = getelementptr inbounds nuw i8, ptr %y, i64 14
+  %29 = load i8, ptr %arrayidx2.14
+  %conv3.14 = sext i8 %29 to i32
+  %mul.14 = mul nsw i32 %conv3.14, %conv.14
+  %sub.14 = sub nsw i32 %mul.14, %sub.13
+  %arrayidx.15 = getelementptr inbounds nuw i8, ptr %x, i64 15
+  %30 = load i8, ptr %arrayidx.15
+  %conv.15 = sext i8 %30 to i32
+  %arrayidx2.15 = getelementptr inbounds nuw i8, ptr %y, i64 15
+  %31 = load i8, ptr %arrayidx2.15
+  %conv3.15 = sext i8 %31 to i32
+  %mul.15 = mul nsw i32 %conv3.15, %conv.15
+  %sub.15 = sub nsw i32 %mul.15, %sub.14
+  ret i32 %sub.15
+}
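
A closing note on the semantics the VPlan tests above make visible: a recipe such as REDUCE ir<%add> = ir<%accum> + reduce.sub (ir<%mul>) computes the accumulator minus a plain add-reduction of the products, which is why the Sub case can reuse the Add reduction cost machinery almost unchanged. A self-contained model of that identity (assumed names, not from the patch):

  #include <cstddef>
  #include <cstdint>

  // acc -= a[i]*b[i] iterated N times equals init - sum(a[i]*b[i]) in i32
  // wraparound arithmetic; only the final combine differs from an add
  // reduction.
  int32_t SubReduce(int32_t Init, const uint8_t *A, const uint8_t *B,
                    size_t N) {
    uint32_t Sum = 0;
    for (size_t I = 0; I != N; ++I)
      Sum += uint32_t(A[I]) * uint32_t(B[I]); // the reusable add reduction
    return int32_t(uint32_t(Init) - Sum);     // single subtract at the end
  }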