-
Notifications
You must be signed in to change notification settings - Fork 14.5k
[LV] Vectorize FMax w/o fast-math flags. #146711
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
9d28282
5675396
ec473e5
caae126
92ebac1
25a1f39
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,6 +25,7 @@ | |
#define DEBUG_TYPE "vplan" | ||
|
||
using namespace llvm; | ||
using namespace VPlanPatternMatch; | ||
|
||
namespace { | ||
// Class that is used to build the plain CFG for the incoming IR. | ||
|
@@ -427,7 +428,6 @@ static void createLoopRegion(VPlan &Plan, VPBlockBase *HeaderVPB) { | |
static void addCanonicalIVRecipes(VPlan &Plan, VPBasicBlock *HeaderVPBB, | ||
VPBasicBlock *LatchVPBB, Type *IdxTy, | ||
DebugLoc DL) { | ||
using namespace VPlanPatternMatch; | ||
Value *StartIdx = ConstantInt::get(IdxTy, 0); | ||
auto *StartV = Plan.getOrAddLiveIn(StartIdx); | ||
|
||
|
@@ -628,3 +628,118 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond, | |
Term->addMetadata(LLVMContext::MD_prof, BranchWeights); | ||
} | ||
} | ||
|
||
/// Rewrites a single FMax-style reduction (RecurKind::OrderedFCmpSelect) found
/// among the header phis of the vector loop region of \p Plan. An additional
/// FindFirstIVUMin index reduction is created to track where the maximum was
/// selected, and the users of the reduction result are redirected to the value
/// extracted from the lane holding the first maximum.
///
/// \returns true if no such reduction phi was found (nothing to do) or the plan
/// was updated; false if the pattern is not supported: another reduction phi is
/// visited after the selected one, the backedge value is not a (wide or
/// replicated) select, the reduction phi is operand 1 of that select, the
/// compare predicate is not strict, or no canonical wide induction was found.
bool VPlanTransforms::handleFMaxReductionsWithoutFastMath(VPlan &Plan) { | ||
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); | ||
VPReductionPHIRecipe *RedPhiR = nullptr; | ||
VPRecipeWithIRFlags *MaxOp = nullptr; | ||
VPWidenIntOrFpInductionRecipe *WideIV = nullptr; | ||
|
||
// Check if there are any OrderedFCmpSelect reductions using wide selects that | ||
// we can fix up. To do so, we also need a wide canonical IV to keep track of | ||
// the indices of the max values. | ||
for (auto &R : LoopRegion->getEntryBasicBlock()->phis()) { | ||
// We need a wide canonical IV | ||
// If several canonical wide inductions are present, the last one visited is
// the one that ends up in WideIV.
if (auto *CurIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) { | ||
if (!CurIV->isCanonical()) | ||
continue; | ||
WideIV = CurIV; | ||
continue; | ||
} | ||
|
||
// And a single OrderedFCmpSelect reduction phi. | ||
// TODO: Support FMin reductions as well. | ||
auto *CurRedPhiR = dyn_cast<VPReductionPHIRecipe>(&R); | ||
if (!CurRedPhiR) | ||
continue; | ||
// NOTE(review): this bail-out runs before the kind check below, so any
// reduction phi visited after the selected one rejects the plan, whereas
// reduction phis of other kinds visited before it are simply skipped.
if (RedPhiR) | ||
return false; | ||
if (CurRedPhiR->getRecurrenceKind() != RecurKind::OrderedFCmpSelect || | ||
CurRedPhiR->isInLoop() || CurRedPhiR->isOrdered()) | ||
continue; | ||
RedPhiR = CurRedPhiR; | ||
|
||
// MaxOp feeding the reduction phi must be a select (either wide or a | ||
// replicate recipe), where the phi is the last operand, and the compare | ||
// predicate is strict. This ensures NaNs won't get propagated unless the | ||
// initial value is NaN | ||
VPRecipeBase *Inc = RedPhiR->getBackedgeValue()->getDefiningRecipe(); | ||
auto *RepR = dyn_cast<VPReplicateRecipe>(Inc); | ||
if (!isa<VPWidenSelectRecipe>(Inc) && | ||
!(RepR && (isa<SelectInst>(RepR->getUnderlyingInstr())))) | ||
return false; | ||
|
||
// NOTE(review): both casts below are unchecked cast<>, not dyn_cast<>;
// operand 0 of the select is presumably always defined by a compare recipe
// carrying IR flags at this point - TODO confirm.
MaxOp = cast<VPRecipeWithIRFlags>(Inc); | ||
auto *Cmp = cast<VPRecipeWithIRFlags>(MaxOp->getOperand(0)); | ||
if (MaxOp->getOperand(1) == RedPhiR || | ||
!CmpInst::isStrictPredicate(Cmp->getPredicate())) | ||
return false; | ||
} | ||
|
||
// Nothing to do. | ||
if (!RedPhiR) | ||
return true; | ||
|
||
// A wide canonical IV is currently required. | ||
// TODO: Create an induction if no suitable existing one is available. | ||
if (!WideIV) | ||
return false; | ||
Comment on lines
+683
to
+686
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note that a scalar canonical IV always exists, and is unique. But widen ones may exist (last one found is used?) or not. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep, at this stage, all inductions will still be widened, but may not be canonical. |
||
|
||
// Create a reduction that tracks the first indices where the latest maximum | ||
// value has been selected. This is later used to select the max value from | ||
// the partial reductions in a way that correctly handles signed zeros and | ||
// NaNs in the input. | ||
// Note that we do not need to check if the induction may hit the sentinel | ||
// value. If the sentinel value gets hit, the final reduction value is at the | ||
// last index or the maximum was never set and all lanes contain the start | ||
// value. In either case, the correct value is selected. | ||
// The sentinel is the all-ones (maximum unsigned) value of the IV's bit width.
unsigned IVWidth = | ||
VPTypeAnalysis(Plan).inferScalarType(WideIV)->getScalarSizeInBits(); | ||
LLVMContext &Ctx = Plan.getScalarHeader()->getIRBasicBlock()->getContext(); | ||
VPValue *UMinSentinel = | ||
Plan.getOrAddLiveIn(ConstantInt::get(Ctx, APInt::getMaxValue(IVWidth))); | ||
// The index phi is created from the sentinel (presumably its start value -
// confirm against the VPReductionPHIRecipe constructor); MinIdxSel is added
// as a further operand below, once it has been created.
auto *IdxPhi = new VPReductionPHIRecipe(nullptr, RecurKind::FindFirstIVUMin, | ||
*UMinSentinel, false, false, 1); | ||
IdxPhi->insertBefore(RedPhiR); | ||
// Reuse the condition of the max select (operand 0 of MaxOp): pick the wide
// IV when it holds, otherwise keep the value of the index phi.
auto *MinIdxSel = new VPInstruction(Instruction::Select, | ||
{MaxOp->getOperand(0), WideIV, IdxPhi}); | ||
MinIdxSel->insertAfter(MaxOp); | ||
IdxPhi->addOperand(MinIdxSel); | ||
|
||
// Find the first index holding the maximum value. This is used to | ||
// extract the lane with the final max value and is needed to handle signed | ||
// zeros and NaNs in the input. | ||
auto *MaxResult = find_singleton<VPSingleDefRecipe>( | ||
RedPhiR->users(), [](VPUser *U, bool) -> VPSingleDefRecipe * { | ||
auto *VPI = dyn_cast<VPInstruction>(U); | ||
if (VPI && VPI->getOpcode() == VPInstruction::ComputeReductionResult) | ||
return VPI; | ||
return nullptr; | ||
}); | ||
// NOTE(review): MaxResult is dereferenced without a null check; this relies on
// a ComputeReductionResult user of RedPhiR having been found - confirm.
// The builder inserts the new recipes directly after MaxResult.
VPBuilder Builder(MaxResult->getParent(), | ||
std::next(MaxResult->getIterator())); | ||
|
||
// Create mask for lanes that have the max value and use it to mask out | ||
// indices that don't contain maximum values. | ||
auto *MaskFinalMaxValue = Builder.createNaryOp( | ||
Instruction::FCmp, {MaxResult->getOperand(1), MaxResult}, | ||
VPIRFlags(CmpInst::FCMP_OEQ)); | ||
auto *IndicesWithMaxValue = Builder.createNaryOp( | ||
Instruction::Select, {MaskFinalMaxValue, MinIdxSel, UMinSentinel}); | ||
auto *FirstMaxIdx = Builder.createNaryOp( | ||
VPInstruction::ComputeFindIVResult, | ||
{IdxPhi, WideIV->getStartValue(), UMinSentinel, IndicesWithMaxValue}); | ||
// Convert the index of the first max value to an index in the vector lanes of | ||
// the partial reduction results. This ensures we select the first max value | ||
// and acts as a tie-breaker if the partial reductions contain signed zeros. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The vertical computation of each partial reduction result takes care of NaNs and signed zeroes, it is only the horizontal reduction of these vector lanes that require tie-breaking, to handle potential signed zeroes? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, the tie-breaking is only needed to handle signed zeroes when computing the final reduction results. Consider a final partial reduction vector with We then compare the partial reduction values to the result of the horizontal reduction (-0.0 == +0.0 will also be true, selecting all lanes with zeros of any signed-ness) Out of those, we select the one encountered first using FindFirstIV. Note that this only works for strict predicates. |
||
auto *FirstMaxLane = | ||
Builder.createNaryOp(Instruction::URem, {FirstMaxIdx, &Plan.getVFxUF()}); | ||
|
||
// Extract the final max value and update the users. | ||
// MaskFinalMaxValue must keep comparing against the original MaxResult, so it
// is the one user excluded from the replacement.
auto *Res = Builder.createNaryOp(VPInstruction::ExtractLane, | ||
{FirstMaxLane, MaxResult->getOperand(1)}); | ||
MaxResult->replaceUsesWithIf(Res, [MaskFinalMaxValue](VPUser &U, unsigned) { | ||
return &U != MaskFinalMaxValue; | ||
}); | ||
return true; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure why the feature is designed to be in VPlanTransforms?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
See response here https://github.com/llvm/llvm-project/pull/146711/files/598c59f3c7d565ff7994c816f52c0aaeed138402#r2189438081