Skip to content

[LV] Vectorize FMax w/o fast-math flags. #146711

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions llvm/include/llvm/Analysis/IVDescriptors.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ enum class RecurKind {
FMul, ///< Product of floats.
FMin, ///< FP min implemented in terms of select(cmp()).
FMax, ///< FP max implemented in terms of select(cmp()).
FCmpOGTSelect, ///< FP max implemented in terms of select(cmp()), but without
/// any fast-math flags. Users need to handle NaNs and signed
/// zeros when generating code.
FMinimum, ///< FP min with llvm.minimum semantics
FMaximum, ///< FP max with llvm.maximum semantics
FMinimumNum, ///< FP min with llvm.minimumnum semantics
Expand All @@ -57,6 +60,9 @@ enum class RecurKind {
FindFirstIVSMin, /// FindFirst reduction with select(icmp(),x,y) where one of
///< (x,y) is a decreasing loop induction, and both x and y
///< are integer type, producing a SMin reduction.
FindFirstIVUMin, /// FindFirst reduction with select(icmp(),x,y) where one of
///< (x,y) is a decreasing loop induction, and both x and y
///< are integer type, producing a UMin reduction.
FindLastIVSMax, ///< FindLast reduction with select(cmp(),x,y) where one of
///< (x,y) is increasing loop induction, and both x and y
///< are integer type, producing a SMax reduction.
Expand Down Expand Up @@ -247,8 +253,9 @@ class RecurrenceDescriptor {
/// Returns true if the recurrence kind is a floating-point min/max kind.
static bool isFPMinMaxRecurrenceKind(RecurKind Kind) {
return Kind == RecurKind::FMin || Kind == RecurKind::FMax ||
Kind == RecurKind::FMinimum || Kind == RecurKind::FMaximum ||
Kind == RecurKind::FMinimumNum || Kind == RecurKind::FMaximumNum;
Kind == RecurKind::FCmpOGTSelect || Kind == RecurKind::FMinimum ||
Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimumNum ||
Kind == RecurKind::FMaximumNum;
}

/// Returns true if the recurrence kind is any min/max kind.
Expand All @@ -265,7 +272,8 @@ class RecurrenceDescriptor {
/// Returns true if the recurrence kind is of the form
/// select(cmp(),x,y) where one of (x,y) is decreasing loop induction.
static bool isFindFirstIVRecurrenceKind(RecurKind Kind) {
return Kind == RecurKind::FindFirstIVSMin;
return Kind == RecurKind::FindFirstIVSMin ||
Kind == RecurKind::FindFirstIVUMin;
}

/// Returns true if the recurrence kind is of the form
Expand Down
26 changes: 18 additions & 8 deletions llvm/lib/Analysis/IVDescriptors.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ bool RecurrenceDescriptor::isIntegerRecurrenceKind(RecurKind Kind) {
case RecurKind::UMin:
case RecurKind::AnyOf:
case RecurKind::FindFirstIVSMin:
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
return true;
Expand Down Expand Up @@ -741,10 +742,9 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop,
: APInt::getMinValue(NumBits);
ValidRange = ConstantRange::getNonEmpty(Sentinel + 1, Sentinel);
} else {
assert(IsSigned && "Only FindFirstIV with SMax is supported currently");
ValidRange =
ConstantRange::getNonEmpty(APInt::getSignedMinValue(NumBits),
APInt::getSignedMaxValue(NumBits) - 1);
APInt Sentinel = IsSigned ? APInt::getSignedMaxValue(NumBits)
: APInt::getMaxValue(NumBits);
ValidRange = ConstantRange::getNonEmpty(Sentinel, Sentinel - 1);
}

LLVM_DEBUG(dbgs() << "LV: "
Expand All @@ -770,6 +770,8 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop,

if (CheckRange(true))
return RecurKind::FindFirstIVSMin;
if (CheckRange(false))
return RecurKind::FindFirstIVUMin;
return std::nullopt;
};

Expand Down Expand Up @@ -815,7 +817,8 @@ RecurrenceDescriptor::isMinMaxPattern(Instruction *I, RecurKind Kind,
if (match(I, m_OrdOrUnordFMin(m_Value(), m_Value())))
return InstDesc(Kind == RecurKind::FMin, I);
if (match(I, m_OrdOrUnordFMax(m_Value(), m_Value())))
return InstDesc(Kind == RecurKind::FMax, I);
return InstDesc(Kind == RecurKind::FMax || Kind == RecurKind::FCmpOGTSelect,
I);
if (match(I, m_FMinNum(m_Value(), m_Value())))
return InstDesc(Kind == RecurKind::FMin, I);
if (match(I, m_FMaxNum(m_Value(), m_Value())))
Expand Down Expand Up @@ -937,10 +940,15 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr(
m_Intrinsic<Intrinsic::minimumnum>(m_Value(), m_Value())) ||
match(I, m_Intrinsic<Intrinsic::maximumnum>(m_Value(), m_Value()));
};
if (isIntMinMaxRecurrenceKind(Kind) ||
(HasRequiredFMF() && isFPMinMaxRecurrenceKind(Kind)))
if (isIntMinMaxRecurrenceKind(Kind))
return isMinMaxPattern(I, Kind, Prev);
else if (isFMulAddIntrinsic(I))
if (isFPMinMaxRecurrenceKind(Kind)) {
if (HasRequiredFMF())
return isMinMaxPattern(I, Kind, Prev);
if ((Kind == RecurKind::FMax || Kind == RecurKind::FCmpOGTSelect) &&
isMinMaxPattern(I, Kind, Prev).isRecurrence())
return InstDesc(I, RecurKind::FCmpOGTSelect);
} else if (isFMulAddIntrinsic(I))
return InstDesc(Kind == RecurKind::FMulAdd, I,
I->hasAllowReassoc() ? nullptr : I);
return InstDesc(false, I);
Expand Down Expand Up @@ -1183,6 +1191,7 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) {
return Instruction::Mul;
case RecurKind::AnyOf:
case RecurKind::FindFirstIVSMin:
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
case RecurKind::Or:
Expand All @@ -1202,6 +1211,7 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) {
case RecurKind::UMin:
return Instruction::ICmp;
case RecurKind::FMax:
case RecurKind::FCmpOGTSelect:
case RecurKind::FMin:
case RecurKind::FMaximum:
case RecurKind::FMinimum:
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Transforms/Utils/LoopUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -937,6 +937,7 @@ constexpr Intrinsic::ID llvm::getReductionIntrinsicID(RecurKind RK) {
return Intrinsic::vector_reduce_umax;
case RecurKind::UMin:
return Intrinsic::vector_reduce_umin;
case RecurKind::FCmpOGTSelect:
case RecurKind::FMax:
return Intrinsic::vector_reduce_fmax;
case RecurKind::FMin:
Expand Down Expand Up @@ -1084,6 +1085,7 @@ CmpInst::Predicate llvm::getMinMaxReductionPredicate(RecurKind RK) {
return CmpInst::ICMP_SGT;
case RecurKind::FMin:
return CmpInst::FCMP_OLT;
case RecurKind::FCmpOGTSelect:
case RecurKind::FMax:
return CmpInst::FCMP_OGT;
// We do not add FMinimum/FMaximum recurrence kind here since there is no
Expand Down Expand Up @@ -1306,6 +1308,7 @@ Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src,
case RecurKind::SMin:
case RecurKind::UMax:
case RecurKind::UMin:
case RecurKind::FCmpOGTSelect:
case RecurKind::FMax:
case RecurKind::FMin:
case RecurKind::FMinimum:
Expand Down
10 changes: 8 additions & 2 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4458,8 +4458,11 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
ElementCount VF) const {
// Cross iteration phis such as reductions need special handling and are
// currently unsupported.
if (any_of(OrigLoop->getHeader()->phis(),
[&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) {
return Legal->isFixedOrderRecurrence(&Phi) ||
Legal->getReductionVars().lookup(&Phi).getRecurrenceKind() ==
RecurKind::FCmpOGTSelect;
}))
return false;

// Phis with uses outside of the loop require special handling and are
Expand Down Expand Up @@ -8907,6 +8910,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(

// Adjust the recipes for any inloop reductions.
adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
if (!VPlanTransforms::runPass(
VPlanTransforms::handleFMaxReductionsWithoutFastMath, *Plan))
return nullptr;
Comment on lines +8913 to +8915
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure why the feature is designed to be in VPlanTransforms?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


// Transform recipes to abstract recipes if it is legal and beneficial and
// clamp the range for better cost estimation.
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23181,8 +23181,10 @@ class HorizontalReduction {
case RecurKind::FMulAdd:
case RecurKind::AnyOf:
case RecurKind::FindFirstIVSMin:
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
case RecurKind::FCmpOGTSelect:
case RecurKind::FMaximumNum:
case RecurKind::FMinimumNum:
case RecurKind::None:
Expand Down Expand Up @@ -23317,8 +23319,10 @@ class HorizontalReduction {
case RecurKind::FMulAdd:
case RecurKind::AnyOf:
case RecurKind::FindFirstIVSMin:
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
case RecurKind::FCmpOGTSelect:
case RecurKind::FMaximumNum:
case RecurKind::FMinimumNum:
case RecurKind::None:
Expand Down Expand Up @@ -23418,8 +23422,10 @@ class HorizontalReduction {
case RecurKind::FMulAdd:
case RecurKind::AnyOf:
case RecurKind::FindFirstIVSMin:
case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
case RecurKind::FCmpOGTSelect:
case RecurKind::FMaximumNum:
case RecurKind::FMinimumNum:
case RecurKind::None:
Expand Down
5 changes: 4 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -980,7 +980,10 @@ class VPInstruction : public VPRecipeWithIRFlags,
ReductionStartVector,
// Creates a step vector starting from 0 to VF with a step of 1.
StepVector,

/// Extracts a single lane (first operand) from a set of vector operands.
/// The lane specifies an index into a vector formed by combining all vector
/// operands (all operands after the first one).
ExtractLane,
};

private:
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
return ResTy;
}
case Instruction::ICmp:
case Instruction::FCmp:
case VPInstruction::ActiveLaneMask:
assert(inferScalarType(R->getOperand(0)) ==
inferScalarType(R->getOperand(1)) &&
Expand All @@ -110,6 +111,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::BuildStructVector:
case VPInstruction::BuildVector:
return SetResultTyFromOp();
case VPInstruction::ExtractLane:
return inferScalarType(R->getOperand(1));
case VPInstruction::FirstActiveLane:
return Type::getIntNTy(Ctx, 64);
case VPInstruction::ExtractLastElement:
Expand Down
113 changes: 112 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#define DEBUG_TYPE "vplan"

using namespace llvm;
using namespace VPlanPatternMatch;

namespace {
// Class that is used to build the plain CFG for the incoming IR.
Expand Down Expand Up @@ -426,7 +427,6 @@ static void createLoopRegion(VPlan &Plan, VPBlockBase *HeaderVPB) {
static void addCanonicalIVRecipes(VPlan &Plan, VPBasicBlock *HeaderVPBB,
VPBasicBlock *LatchVPBB, Type *IdxTy,
DebugLoc DL) {
using namespace VPlanPatternMatch;
Value *StartIdx = ConstantInt::get(IdxTy, 0);
auto *StartV = Plan.getOrAddLiveIn(StartIdx);

Expand Down Expand Up @@ -589,3 +589,114 @@ void VPlanTransforms::createLoopRegions(VPlan &Plan) {
TopRegion->setName("vector loop");
TopRegion->getEntryBasicBlock()->setName("vector.body");
}

/// Fix up a single FCmpOGTSelect reduction (an FMax-style select(fcmp)
/// reduction without fast-math flags) in the vector loop region of \p Plan.
///
/// The transform adds a second reduction that tracks, per lane, the index at
/// which the current maximum was selected, and then rewrites the middle block
/// so that the final result is extracted from the lane holding the first
/// maximum instead of being taken directly from the original reduction result.
///
/// \returns true if there is no FCmpOGTSelect reduction to handle or if the
/// plan was rewritten; false if such a reduction exists but is in a form this
/// transform does not support (see the individual bail-outs below).
bool VPlanTransforms::handleFMaxReductionsWithoutFastMath(VPlan &Plan) {
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
// State collected while scanning the header phis: the reduction phi to fix
// up, the select feeding its backedge, and a canonical wide induction.
VPReductionPHIRecipe *RedPhiR = nullptr;
VPRecipeWithIRFlags *MaxOp = nullptr;
VPWidenIntOrFpInductionRecipe *WideIV = nullptr;

// Check if there are any FCmpOGTSelect reductions using wide selects that we
// can fix up. To do so, we also need a wide canonical IV to keep track of
// the indices of the max values.
for (auto &R : LoopRegion->getEntryBasicBlock()->phis()) {
// We need a wide canonical IV; non-canonical wide inductions are ignored.
if (auto *CurIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
if (!CurIV->isCanonical())
continue;
WideIV = CurIV;
continue;
}

// And a single FCmpOGTSelect reduction phi.
// TODO: Support FMin reductions as well.
auto *CurRedPhiR = dyn_cast<VPReductionPHIRecipe>(&R);
if (!CurRedPhiR)
continue;
// Once a candidate has been recorded, any further reduction phi (of any
// kind) makes the plan unsupported.
// NOTE(review): reduction phis visited *before* the candidate do not hit
// this bail-out, so the outcome depends on phi order - confirm intended.
if (RedPhiR)
return false;
// Skip reductions of other kinds, as well as in-loop and ordered ones.
if (CurRedPhiR->getRecurrenceKind() != RecurKind::FCmpOGTSelect ||
CurRedPhiR->isInLoop() || CurRedPhiR->isOrdered())
continue;
RedPhiR = CurRedPhiR;

// MaxOp feeding the reduction phi must be a select (either wide or a
// replicate recipe), where the phi is the last operand, and the compare
// predicate is strict. This ensures NaNs won't get propagated unless the
// initial value is NaN.
// NOTE(review): assumes the backedge value has a defining recipe (Inc is
// non-null) - TODO confirm.
VPRecipeBase *Inc = RedPhiR->getBackedgeValue()->getDefiningRecipe();
auto *RepR = dyn_cast<VPReplicateRecipe>(Inc);
if (!isa<VPWidenSelectRecipe>(Inc) &&
!(RepR && (isa<SelectInst>(RepR->getUnderlyingInstr()))))
return false;

MaxOp = cast<VPRecipeWithIRFlags>(Inc);
// Operand 0 of the select is its condition; it is expected to be a recipe
// carrying a compare predicate.
auto *Cmp = cast<VPRecipeWithIRFlags>(MaxOp->getOperand(0));
// Reject selects whose true-value (operand 1) is the phi, and non-strict
// predicates.
if (MaxOp->getOperand(1) == RedPhiR ||
!CmpInst::isStrictPredicate(Cmp->getPredicate()))
return false;
}

// Nothing to do.
if (!RedPhiR)
return true;

// A wide canonical IV is currently required.
// TODO: Create an induction if no suitable existing one is available.
if (!WideIV)
return false;

// Create a reduction that tracks the first indices where the latest maximum
// value has been selected. This is later used to select the max value from
// the partial reductions in a way that correctly handles signed zeros and
// NaNs in the input.
// Note that we do not need to check if the induction may hit the sentinel
// value. If the sentinel value gets hit, the final reduction value is at the
// last index or the maximum was never set and all lanes contain the start
// value. In either case, the correct value is selected.
unsigned IVWidth =
VPTypeAnalysis(Plan).inferScalarType(WideIV)->getScalarSizeInBits();
LLVMContext &Ctx = Plan.getScalarHeader()->getIRBasicBlock()->getContext();
// The sentinel is the all-ones (unsigned max) value of the IV's bit width.
VPValue *UMinSentinel =
Plan.getOrAddLiveIn(ConstantInt::get(Ctx, APInt::getMaxValue(IVWidth)));
// Index reduction phi: starts at the sentinel and is placed right before the
// FMax reduction phi.
auto *IdxPhi = new VPReductionPHIRecipe(nullptr, RecurKind::FindFirstIVUMin,
*UMinSentinel, false, false, 1);
IdxPhi->insertBefore(RedPhiR);
// Reuse MaxOp's condition: take the current wide IV value when the select
// picks a new value, otherwise keep the previously recorded index.
auto *MinIdxSel = new VPInstruction(Instruction::Select,
{MaxOp->getOperand(0), WideIV, IdxPhi});
MinIdxSel->insertAfter(MaxOp);
// Feed the select back into the index phi.
IdxPhi->addOperand(MinIdxSel);

// Find the first index with the maximum value. This is used to extract the
// lane with the final max value and is needed to handle signed zeros and NaNs
// in the input.
// NOTE(review): assumes the first recipe in the middle block is the result
// recipe of the reduction being fixed up - TODO confirm.
auto *MiddleVPBB = Plan.getMiddleBlock();
auto *OrigRdxResult = cast<VPSingleDefRecipe>(&MiddleVPBB->front());
// New recipes are emitted immediately after the original reduction result.
VPBuilder Builder(OrigRdxResult->getParent(),
std::next(OrigRdxResult->getIterator()));

// Create mask for lanes that have the max value and use it to mask out
// indices that don't contain maximum values.
// NOTE(review): operand 1 of OrigRdxResult is presumably the vector of
// partial reduction values - verify against the result recipe's operands.
auto *MaskFinalMaxValue = Builder.createNaryOp(
Instruction::FCmp, {OrigRdxResult->getOperand(1), OrigRdxResult},
VPIRFlags(CmpInst::FCMP_OEQ));
// Lanes that do not hold the final max get the sentinel as their index.
auto *IndicesWithMaxValue = Builder.createNaryOp(
Instruction::Select, {MaskFinalMaxValue, MinIdxSel, UMinSentinel});
auto *FirstMaxIdx = Builder.createNaryOp(
VPInstruction::ComputeFindIVResult,
{IdxPhi, WideIV->getStartValue(), UMinSentinel, IndicesWithMaxValue});
// Convert the index of the first max value to an index in the vector lanes of
// the partial reduction results. This ensures we select the first max value
// and acts as a tie-breaker if the partial reductions contain signed zeros.
auto *FirstMaxLane =
Builder.createNaryOp(Instruction::URem, {FirstMaxIdx, &Plan.getVFxUF()});

// Extract the final max value and update the users.
auto *Res = Builder.createNaryOp(
VPInstruction::ExtractLane, {FirstMaxLane, OrigRdxResult->getOperand(1)});
// Redirect every user of the original result to the extracted lane, except
// the mask compare created above, which must keep using the original result.
OrigRdxResult->replaceUsesWithIf(Res,
[MaskFinalMaxValue](VPUser &U, unsigned) {
return &U != MaskFinalMaxValue;
});
return true;
}
Loading