From 85eb08f0fa264f44041395290321496bc154293c Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 18 Jun 2025 14:07:04 +0100 Subject: [PATCH] [LV] Vectorize maxnum/minnum w/o fast-math flags. Update LV to vectorize maxnum/minnum reductions without fast-math flags, by adding an extra check in the loop if any inputs to maxnum/minnum are NaN. If any input is NaN, *exit the vector loop, *compute the reduction result up to the vector iteration that contained NaN inputs and * resume in the scalar loop New recurrence kinds are added for reductions using maxnum/minnum without fast-math flags. The new recurrence kinds are not supported in the code to generate IR to perform the reductions to prevent accidential mis-use. Users need to add the required checks ensuring no NaN inputs, and convert to regular FMin/FMax recurrence kinds. --- llvm/include/llvm/Analysis/IVDescriptors.h | 2 + llvm/lib/Analysis/IVDescriptors.cpp | 24 +++- llvm/lib/Transforms/Utils/LoopUtils.cpp | 2 + .../Transforms/Vectorize/LoopVectorize.cpp | 14 ++- llvm/lib/Transforms/Vectorize/VPlan.h | 16 +-- .../Transforms/Vectorize/VPlanAnalysis.cpp | 1 + .../Vectorize/VPlanConstruction.cpp | 108 ++++++++++++++++++ .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 15 ++- .../Transforms/Vectorize/VPlanTransforms.h | 5 + .../AArch64/fmax-without-fast-math-flags.ll | 52 +++++++-- ...fmax-without-fast-math-flags-interleave.ll | 52 +++++++-- .../fmax-without-fast-math-flags.ll | 86 +++++++++++--- .../LoopVectorize/reduction-small-size.ll | 3 +- 13 files changed, 335 insertions(+), 45 deletions(-) diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h index b985292ccee40..e28901c072e29 100644 --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -47,6 +47,8 @@ enum class RecurKind { FMul, ///< Product of floats. FMin, ///< FP min implemented in terms of select(cmp()). FMax, ///< FP max implemented in terms of select(cmp()). + FMinNumNoFMFs, ///< FP min with llvm.minnum semantics and no fast-math flags. + FMaxNumNoFMFs, ///< FP max with llvm.maxnumsemantics and no fast-math flags. FMinimum, ///< FP min with llvm.minimum semantics FMaximum, ///< FP max with llvm.maximum semantics FMinimumNum, ///< FP min with llvm.minimumnum semantics diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index 39f74beca082f..d8923ab7bc908 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -941,10 +941,28 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr( m_Intrinsic(m_Value(), m_Value())) || match(I, m_Intrinsic(m_Value(), m_Value())); }; - if (isIntMinMaxRecurrenceKind(Kind) || - (HasRequiredFMF() && isFPMinMaxRecurrenceKind(Kind))) + if (isIntMinMaxRecurrenceKind(Kind)) return isMinMaxPattern(I, Kind, Prev); - else if (isFMulAddIntrinsic(I)) + if (isFPMinMaxRecurrenceKind(Kind)) { + if (HasRequiredFMF()) + return isMinMaxPattern(I, Kind, Prev); + // For FMax/FMin reductions using maxnum/minnum intrinsics with non-NaN + // start value, we may be able to vectorize with extra checks ensuring the + // inputs are not NaN. + auto *StartV = dyn_cast( + OrigPhi->getIncomingValueForBlock(L->getLoopPredecessor())); + if (StartV && !StartV->getValue().isNaN() && + isMinMaxPattern(I, Kind, Prev).isRecurrence()) { + if (((Kind == RecurKind::FMax && + match(I, m_Intrinsic(m_Value(), m_Value()))) || + Kind == RecurKind::FMaxNumNoFMFs)) + return InstDesc(I, RecurKind::FMaxNumNoFMFs); + if (((Kind == RecurKind::FMin && + match(I, m_Intrinsic(m_Value(), m_Value()))) || + Kind == RecurKind::FMinNumNoFMFs)) + return InstDesc(I, RecurKind::FMinNumNoFMFs); + } + } else if (isFMulAddIntrinsic(I)) return InstDesc(Kind == RecurKind::FMulAdd, I, I->hasAllowReassoc() ? nullptr : I); return InstDesc(false, I); diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 200d1fb854155..0cf7bc7af878c 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -938,8 +938,10 @@ constexpr Intrinsic::ID llvm::getReductionIntrinsicID(RecurKind RK) { case RecurKind::UMin: return Intrinsic::vector_reduce_umin; case RecurKind::FMax: + case RecurKind::FMaxNumNoFMFs: return Intrinsic::vector_reduce_fmax; case RecurKind::FMin: + case RecurKind::FMinNumNoFMFs: return Intrinsic::vector_reduce_fmin; case RecurKind::FMaximum: return Intrinsic::vector_reduce_fmaximum; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f3de24aa4c3d1..ffd9d5c22cc13 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4346,8 +4346,15 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( ElementCount VF) const { // Cross iteration phis such as reductions need special handling and are // currently unsupported. - if (any_of(OrigLoop->getHeader()->phis(), - [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); })) + if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) { + if (Legal->isReductionVariable(&Phi)) { + RecurKind RK = + Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind(); + return RK == RecurKind::FMinNumNoFMFs || + RK == RecurKind::FMaxNumNoFMFs; + } + return Legal->isFixedOrderRecurrence(&Phi); + })) return false; // Phis with uses outside of the loop require special handling and are @@ -8808,6 +8815,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // Adjust the recipes for any inloop reductions. adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start); + if (!VPlanTransforms::runPass( + VPlanTransforms::handleMaxMinNumReductionsWithoutFastMath, *Plan)) + return nullptr; // Transform recipes to abstract recipes if it is legal and beneficial and // clamp the range for better cost estimation. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 9a6e4b36397b3..96eced547d0ba 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1326,20 +1326,22 @@ class LLVM_ABI_FOR_TEST VPWidenRecipe : public VPRecipeWithIRFlags, unsigned Opcode; public: - VPWidenRecipe(unsigned Opcode, ArrayRef Operands, - const VPIRFlags &Flags, DebugLoc DL) - : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, Flags, DL), - Opcode(Opcode) {} - VPWidenRecipe(Instruction &I, ArrayRef Operands) : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I), VPIRMetadata(I), Opcode(I.getOpcode()) {} + VPWidenRecipe(unsigned Opcode, ArrayRef Operands, + const VPIRFlags &Flags, const VPIRMetadata &Metadata, + DebugLoc DL) + : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, Flags, DL), + VPIRMetadata(Metadata), Opcode(Opcode) {} + ~VPWidenRecipe() override = default; VPWidenRecipe *clone() override { - auto *R = new VPWidenRecipe(*getUnderlyingInstr(), operands()); - R->transferFlags(*this); + auto *R = + new VPWidenRecipe(getOpcode(), operands(), *this, *this, getDebugLoc()); + R->setUnderlyingValue(getUnderlyingValue()); return R; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 92db9674ef42b..971e86c3c7b8c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -85,6 +85,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { return ResTy; } case Instruction::ICmp: + case Instruction::FCmp: case VPInstruction::ActiveLaneMask: assert(inferScalarType(R->getOperand(0)) == inferScalarType(R->getOperand(1)) && diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 37e15326e01f9..005392eaaea76 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -628,3 +628,111 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond, Term->addMetadata(LLVMContext::MD_prof, BranchWeights); } } + +static VPValue *getMinMaxCompareValue(VPSingleDefRecipe *MinMaxOp, + VPReductionPHIRecipe *RedPhi) { + auto *RepR = dyn_cast(MinMaxOp); + if (!isa(MinMaxOp) && + !(RepR && (isa(RepR->getUnderlyingInstr())))) + return nullptr; + + if (MinMaxOp->getOperand(0) == RedPhi) + return MinMaxOp->getOperand(1); + return MinMaxOp->getOperand(0); +} + +bool VPlanTransforms::handleMaxMinNumReductionsWithoutFastMath(VPlan &Plan) { + VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); + VPValue *AnyNaN = nullptr; + VPReductionPHIRecipe *RedPhiR = nullptr; + VPRecipeWithIRFlags *MinMaxOp = nullptr; + for (auto &R : LoopRegion->getEntryBasicBlock()->phis()) { + auto *Cur = dyn_cast(&R); + if (!Cur) + continue; + if (RedPhiR) + return false; + if (Cur->getRecurrenceKind() != RecurKind::FMaxNumNoFMFs && + Cur->getRecurrenceKind() != RecurKind::FMinNumNoFMFs) + continue; + + RedPhiR = Cur; + + MinMaxOp = dyn_cast( + RedPhiR->getBackedgeValue()->getDefiningRecipe()); + if (!MinMaxOp) + return false; + VPValue *In = getMinMaxCompareValue(MinMaxOp, RedPhiR); + if (!In) + return false; + + auto *IsNaN = + new VPInstruction(Instruction::FCmp, {In, In}, {CmpInst::FCMP_UNO}, {}); + IsNaN->insertBefore(MinMaxOp); + AnyNaN = new VPInstruction(VPInstruction::AnyOf, {IsNaN}); + AnyNaN->getDefiningRecipe()->insertAfter(IsNaN); + } + + if (!AnyNaN) + return true; + + auto *MiddleVPBB = Plan.getMiddleBlock(); + auto *RdxResult = cast(&MiddleVPBB->front()); + auto *ScalarPH = Plan.getScalarPreheader(); + // Update the resume phis in the scalar preheader. They all must either resume + // from the reduction result or the canonical induction. Bail out if there are + // other resume phis. + for (auto &R : ScalarPH->phis()) { + auto *ResumeR = cast(&R); + VPValue *VecV = ResumeR->getOperand(0); + VPValue *BypassV = ResumeR->getOperand(ResumeR->getNumOperands() - 1); + if (VecV != RdxResult && VecV != &Plan.getVectorTripCount()) + return false; + ResumeR->setOperand( + 1, VecV == &Plan.getVectorTripCount() ? Plan.getCanonicalIV() : VecV); + ResumeR->addOperand(BypassV); + } + + // Create a new reduction phi recipe with either FMin/FMax, replacing + // FMinNumNoFMFs/FMaxNumNoFMFs. + RecurKind NewRK = RedPhiR->getRecurrenceKind() != RecurKind::FMinNumNoFMFs + ? RecurKind::FMin + : RecurKind::FMax; + auto *NewRedPhiR = new VPReductionPHIRecipe( + cast(RedPhiR->getUnderlyingValue()), NewRK, + *RedPhiR->getStartValue(), RedPhiR->isInLoop(), RedPhiR->isOrdered()); + NewRedPhiR->addOperand(RedPhiR->getOperand(1)); + NewRedPhiR->insertBefore(RedPhiR); + RedPhiR->replaceAllUsesWith(NewRedPhiR); + RedPhiR->eraseFromParent(); + + // Update the loop exit condition to exit if either any of the inputs is NaN + // or the vector trip count is reached. + VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock(); + VPBuilder Builder(LatchVPBB->getTerminator()); + auto *LatchExitingBranch = cast(LatchVPBB->getTerminator()); + assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount && + "Unexpected terminator"); + auto *IsLatchExitTaken = + Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0), + LatchExitingBranch->getOperand(1)); + auto *AnyExitTaken = + Builder.createNaryOp(Instruction::Or, {AnyNaN, IsLatchExitTaken}); + Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken); + LatchExitingBranch->eraseFromParent(); + + // Split the middle block and introduce a new block, branching to the scalar + // preheader to resume iteration in the scalar loop if any NaNs have been + // encountered. + MiddleVPBB->splitAt(std::prev(MiddleVPBB->end())); + Builder.setInsertPoint(MiddleVPBB, MiddleVPBB->begin()); + auto *NewSel = + Builder.createSelect(AnyNaN, NewRedPhiR, RdxResult->getOperand(1)); + RdxResult->setOperand(1, NewSel); + Builder.setInsertPoint(MiddleVPBB); + Builder.createNaryOp(VPInstruction::BranchOnCond, AnyNaN); + VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); + MiddleVPBB->swapSuccessors(); + std::swap(ScalarPH->getPredecessors()[1], ScalarPH->getPredecessors().back()); + return true; +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 75ade13b09d9c..ea360fb63f69e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -585,6 +585,7 @@ Value *VPInstruction::generate(VPTransformState &State) { Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this)); return Builder.CreateFreeze(Op, Name); } + case Instruction::FCmp: case Instruction::ICmp: { bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); Value *A = State.get(getOperand(0), OnlyFirstLaneUsed); @@ -595,8 +596,11 @@ Value *VPInstruction::generate(VPTransformState &State) { llvm_unreachable("should be handled by VPPhi::execute"); } case Instruction::Select: { - bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); - Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed); + bool OnlyFirstLaneUsed = + State.VF.isScalar() || vputils::onlyFirstLaneUsed(this); + Value *Cond = + State.get(getOperand(0), + OnlyFirstLaneUsed || vputils::isSingleScalar(getOperand(0))); Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed); Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed); return Builder.CreateSelect(Cond, Op1, Op2, Name); @@ -858,7 +862,7 @@ Value *VPInstruction::generate(VPTransformState &State) { Value *Res = State.get(getOperand(0)); for (VPValue *Op : drop_begin(operands())) Res = Builder.CreateOr(Res, State.get(Op)); - return Builder.CreateOrReduce(Res); + return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res); } case VPInstruction::FirstActiveLane: { if (getNumOperands() == 1) { @@ -1031,6 +1035,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { switch (getOpcode()) { case Instruction::ExtractElement: case Instruction::Freeze: + case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: case VPInstruction::AnyOf: @@ -1066,6 +1071,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { return Op == getOperand(1); case Instruction::PHI: return true; + case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: case Instruction::Or: @@ -1098,6 +1104,7 @@ bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const { switch (getOpcode()) { default: return false; + case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: return vputils::onlyFirstPartUsed(this); @@ -1782,7 +1789,7 @@ bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const { return Opcode == Instruction::ZExt; break; case OperationType::Cmp: - return Opcode == Instruction::ICmp; + return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp; case OperationType::Other: return true; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 870b1bb68b79a..fe590e9410941 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -99,6 +99,11 @@ struct VPlanTransforms { /// not valid. static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder); + /// Check if \p Plan contains any FMaxNumNoFMFs or FMinNumNoFMFs reductions. + /// If they do, try to update the vector loop to exit early if any input is + /// NaN and resume executing in the scalar loop to handle the NaNs there. + static bool handleMaxMinNumReductionsWithoutFastMath(VPlan &Plan); + /// Clear NSW/NUW flags from reduction instructions if necessary. static void clearReductionWrapFlags(VPlan &Plan); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll index 77b40dabae1e1..2cc456627f6b0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll @@ -42,18 +42,56 @@ define float @fmaxnum(ptr %src, i64 %n) { ; CHECK-LABEL: define float @fmaxnum( ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ splat (float 0xFFF8000000000000), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]] +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP11:%.*]] = select i1 [[TMP6]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP8]] +; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp olt <4 x float> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP11]], <4 x float> [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[RDX_MINMAX_SELECT]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[SCALAR_PH]], label %[[MIDDLE_BLOCK_SPLIT:.*]] +; CHECK: [[MIDDLE_BLOCK_SPLIT]]: +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK_SPLIT]] ], [ [[IV]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], %[[MIDDLE_BLOCK_SPLIT]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4 ; CHECK-NEXT: [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[MAX]], float [[L]]) -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP13]], %[[MIDDLE_BLOCK_SPLIT]] ] ; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll index fb68d4cbd9e4b..36df05a7c14f2 100644 --- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll @@ -42,18 +42,56 @@ define float @fmaxnum(ptr %src, i64 %n) { ; CHECK-LABEL: define float @fmaxnum( ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ splat (float 0xFFF8000000000000), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]] +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP11:%.*]] = select i1 [[TMP6]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP8]] +; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp olt <4 x float> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP11]], <4 x float> [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[RDX_MINMAX_SELECT]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[SCALAR_PH]], label %[[MIDDLE_BLOCK_SPLIT:.*]] +; CHECK: [[MIDDLE_BLOCK_SPLIT]]: +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK_SPLIT]] ], [ [[IV]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], %[[MIDDLE_BLOCK_SPLIT]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4 ; CHECK-NEXT: [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[MAX]], float [[L]]) -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP13]], %[[MIDDLE_BLOCK_SPLIT]] ] ; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll index 3a8ef7e0b08c0..2d24aae0a9eac 100644 --- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll @@ -192,18 +192,47 @@ define float @fmaxnum_1(ptr %src, i64 %n) { ; CHECK-LABEL: define float @fmaxnum_1( ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: [[TMP4]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP3]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]] +; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP7]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP3]], label %[[SCALAR_PH]], label %[[MIDDLE_BLOCK_SPLIT:.*]] +; CHECK: [[MIDDLE_BLOCK_SPLIT]]: +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK_SPLIT]] ], [ [[IV]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], %[[MIDDLE_BLOCK_SPLIT]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4 ; CHECK-NEXT: [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[L]], float [[MAX]]) -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK_SPLIT]] ] ; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] ; entry: @@ -227,18 +256,47 @@ define float @fmaxnum_2(ptr %src, i64 %n) { ; CHECK-LABEL: define float @fmaxnum_2( ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: [[TMP4]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP3]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]] +; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP7]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP3]], label %[[SCALAR_PH]], label %[[MIDDLE_BLOCK_SPLIT:.*]] +; CHECK: [[MIDDLE_BLOCK_SPLIT]]: +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK_SPLIT]] ], [ [[IV]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], %[[MIDDLE_BLOCK_SPLIT]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4 ; CHECK-NEXT: [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[MAX]], float [[L]]) -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK_SPLIT]] ] ; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll index 13cc1b657d231..fbf16b3b1a750 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll @@ -88,7 +88,8 @@ define i8 @PR34687_no_undef(i1 %c, i32 %x, i32 %n) { ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i1> poison, i1 [[C:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT2]], <4 x i1> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT2]], <4 x i32> splat (i32 1) +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[BROADCAST_SPLAT]], i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = select i1 [[TMP2]], <4 x i32> [[BROADCAST_SPLAT2]], <4 x i32> splat (i32 1) ; CHECK-NEXT: [[TMP1:%.*]] = sdiv <4 x i32> splat (i32 99), [[TMP0]] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]