From 1512049b40b73bac7efb31dafff5b981b324833a Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 4 Jul 2025 13:44:53 +0100 Subject: [PATCH 1/4] [LAA] Support assumptions in evaluatePtrAddRecAtMaxBTCWillNotWrap This patch extends the logic added in https://github.com/llvm/llvm-project/pull/128061 to support dereferenceability information from assumptions as well. Unfortunately, both the assumption cache and the dominator tree need to be threaded through multiple layers to make them available where needed. --- .../llvm/Analysis/LoopAccessAnalysis.h | 23 ++++-- llvm/lib/Analysis/Loads.cpp | 2 +- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 76 +++++++++++++------ llvm/lib/Transforms/Scalar/LoopFlatten.cpp | 3 +- .../Transforms/Scalar/LoopVersioningLICM.cpp | 2 +- .../early-exit-runtime-checks.ll | 8 +- .../single-early-exit-deref-assumptions.ll | 35 ++++++++- .../Transforms/Vectorize/VPlanSlpTest.cpp | 3 +- 8 files changed, 109 insertions(+), 43 deletions(-) diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h index 1415da14a3494..a1a5a4625a047 100644 --- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -180,10 +180,12 @@ class MemoryDepChecker { const SmallVectorImpl &Instrs) const; }; - MemoryDepChecker(PredicatedScalarEvolution &PSE, const Loop *L, + MemoryDepChecker(PredicatedScalarEvolution &PSE, AssumptionCache *AC, + DominatorTree *DT, const Loop *L, const DenseMap &SymbolicStrides, unsigned MaxTargetVectorWidthInBits) - : PSE(PSE), InnermostLoop(L), SymbolicStrides(SymbolicStrides), + : PSE(PSE), AC(AC), DT(DT), InnermostLoop(L), + SymbolicStrides(SymbolicStrides), MaxTargetVectorWidthInBits(MaxTargetVectorWidthInBits) {} /// Register the location (instructions are given increasing numbers) @@ -288,6 +290,9 @@ class MemoryDepChecker { return PointerBounds; } + AssumptionCache *getAC() const { return AC; } + DominatorTree *getDT() const { return DT; } 
+ private: /// A wrapper around ScalarEvolution, used to add runtime SCEV checks, and /// applies dynamic knowledge to simplify SCEV expressions and convert them @@ -296,6 +301,10 @@ class MemoryDepChecker { /// example we might assume a unit stride for a pointer in order to prove /// that a memory access is strided and doesn't wrap. PredicatedScalarEvolution &PSE; + + AssumptionCache *AC; + DominatorTree *DT; + const Loop *InnermostLoop; /// Reference to map of pointer values to @@ -665,7 +674,7 @@ class LoopAccessInfo { LLVM_ABI LoopAccessInfo(Loop *L, ScalarEvolution *SE, const TargetTransformInfo *TTI, const TargetLibraryInfo *TLI, AAResults *AA, - DominatorTree *DT, LoopInfo *LI, + DominatorTree *DT, LoopInfo *LI, AssumptionCache *AC, bool AllowPartial = false); /// Return true we can analyze the memory accesses in the loop and there are @@ -917,7 +926,8 @@ LLVM_ABI std::pair getStartAndEndForAccess( const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *BTC, const SCEV *MaxBTC, ScalarEvolution *SE, DenseMap, - std::pair> *PointerBounds); + std::pair> *PointerBounds, + AssumptionCache *AC, DominatorTree *DT); class LoopAccessInfoManager { /// The cache. 
@@ -930,12 +940,13 @@ class LoopAccessInfoManager { LoopInfo &LI; TargetTransformInfo *TTI; const TargetLibraryInfo *TLI = nullptr; + AssumptionCache *AC; public: LoopAccessInfoManager(ScalarEvolution &SE, AAResults &AA, DominatorTree &DT, LoopInfo &LI, TargetTransformInfo *TTI, - const TargetLibraryInfo *TLI) - : SE(SE), AA(AA), DT(DT), LI(LI), TTI(TTI), TLI(TLI) {} + const TargetLibraryInfo *TLI, AssumptionCache *AC) + : SE(SE), AA(AA), DT(DT), LI(LI), TTI(TTI), TLI(TLI), AC(AC) {} LLVM_ABI const LoopAccessInfo &getInfo(Loop &L, bool AllowPartial = false); diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index 880249588f0b2..7b4e00b298657 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -326,7 +326,7 @@ bool llvm::isDereferenceableAndAlignedInLoop( return false; const auto &[AccessStart, AccessEnd] = getStartAndEndForAccess( - L, PtrScev, LI->getType(), BECount, MaxBECount, &SE, nullptr); + L, PtrScev, LI->getType(), BECount, MaxBECount, &SE, nullptr, AC, &DT); if (isa(AccessStart) || isa(AccessEnd)) return false; diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 7b71dad23948c..2719ed79cf1fb 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -23,6 +23,8 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/AssumeBundleQueries.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" @@ -208,28 +210,52 @@ static const SCEV *mulSCEVOverflow(const SCEV *A, const SCEV *B, /// Return true, if evaluating \p AR at \p MaxBTC cannot wrap, because \p AR at /// \p MaxBTC is guaranteed inbounds of the accessed object. 
-static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR, - const SCEV *MaxBTC, - const SCEV *EltSize, - ScalarEvolution &SE, - const DataLayout &DL) { +static bool +evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR, + const SCEV *MaxBTC, const SCEV *EltSize, + ScalarEvolution &SE, const DataLayout &DL, + AssumptionCache *AC, DominatorTree *DT) { auto *PointerBase = SE.getPointerBase(AR->getStart()); auto *StartPtr = dyn_cast(PointerBase); if (!StartPtr) return false; + const Loop *L = AR->getLoop(); bool CheckForNonNull, CheckForFreed; - uint64_t DerefBytes = StartPtr->getValue()->getPointerDereferenceableBytes( + Value *StartPtrV = StartPtr->getValue(); + uint64_t DerefBytes = StartPtrV->getPointerDereferenceableBytes( DL, CheckForNonNull, CheckForFreed); - if (CheckForNonNull || CheckForFreed) + if (DerefBytes && (CheckForNonNull || CheckForFreed)) return false; const SCEV *Step = AR->getStepRecurrence(SE); + Type *WiderTy = SE.getWiderType(MaxBTC->getType(), Step->getType()); + const SCEV *DerefBytesSCEV = SE.getConstant(WiderTy, DerefBytes); + + // Check if we have a suitable dereferencable assumption we can use. 
+ RetainedKnowledge DerefRK; + if (!StartPtrV->canBeFreed() && + getKnowledgeForValue( + StartPtrV, {Attribute::Dereferenceable}, *AC, + [&](RetainedKnowledge RK, Instruction *Assume, auto) { + if (!isValidAssumeForContext( + Assume, L->getLoopPredecessor()->getTerminator(), DT)) + return false; + if (RK.AttrKind == Attribute::Dereferenceable) { + DerefRK = std::max(DerefRK, RK); + return true; + } + return false; + }) && + DerefRK.ArgValue) { + DerefBytesSCEV = SE.getUMaxExpr(DerefBytesSCEV, + SE.getConstant(WiderTy, DerefRK.ArgValue)); + } + bool IsKnownNonNegative = SE.isKnownNonNegative(Step); if (!IsKnownNonNegative && !SE.isKnownNegative(Step)) return false; - Type *WiderTy = SE.getWiderType(MaxBTC->getType(), Step->getType()); Step = SE.getNoopOrSignExtend(Step, WiderTy); MaxBTC = SE.getNoopOrZeroExtend(MaxBTC, WiderTy); @@ -256,8 +282,7 @@ static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR, const SCEV *EndBytes = addSCEVNoOverflow(StartOffset, OffsetEndBytes, SE); if (!EndBytes) return false; - return SE.isKnownPredicate(CmpInst::ICMP_ULE, EndBytes, - SE.getConstant(WiderTy, DerefBytes)); + return SE.isKnownPredicate(CmpInst::ICMP_ULE, EndBytes, DerefBytesSCEV); } // For negative steps check if @@ -265,15 +290,15 @@ static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR, // * StartOffset <= DerefBytes. 
assert(SE.isKnownNegative(Step) && "must be known negative"); return SE.isKnownPredicate(CmpInst::ICMP_SGE, StartOffset, OffsetEndBytes) && - SE.isKnownPredicate(CmpInst::ICMP_ULE, StartOffset, - SE.getConstant(WiderTy, DerefBytes)); + SE.isKnownPredicate(CmpInst::ICMP_ULE, StartOffset, DerefBytesSCEV); } std::pair llvm::getStartAndEndForAccess( const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *BTC, const SCEV *MaxBTC, ScalarEvolution *SE, DenseMap, - std::pair> *PointerBounds) { + std::pair> *PointerBounds, + AssumptionCache *AC, DominatorTree *DT) { std::pair *PtrBoundsPair; if (PointerBounds) { auto [Iter, Ins] = PointerBounds->insert( @@ -308,8 +333,8 @@ std::pair llvm::getStartAndEndForAccess( // sets ScEnd to the maximum unsigned value for the type. Note that LAA // separately checks that accesses cannot not wrap, so unsigned max // represents an upper bound. - if (evaluatePtrAddRecAtMaxBTCWillNotWrap(AR, MaxBTC, EltSizeSCEV, *SE, - DL)) { + if (evaluatePtrAddRecAtMaxBTCWillNotWrap(AR, MaxBTC, EltSizeSCEV, *SE, DL, + AC, DT)) { ScEnd = AR->evaluateAtIteration(MaxBTC, *SE); } else { ScEnd = SE->getAddExpr( @@ -356,9 +381,9 @@ void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr, bool NeedsFreeze) { const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount(); const SCEV *BTC = PSE.getBackedgeTakenCount(); - const auto &[ScStart, ScEnd] = - getStartAndEndForAccess(Lp, PtrExpr, AccessTy, BTC, SymbolicMaxBTC, - PSE.getSE(), &DC.getPointerBounds()); + const auto &[ScStart, ScEnd] = getStartAndEndForAccess( + Lp, PtrExpr, AccessTy, BTC, SymbolicMaxBTC, PSE.getSE(), + &DC.getPointerBounds(), DC.getAC(), DC.getDT()); assert(!isa(ScStart) && !isa(ScEnd) && "must be able to compute both start and end expressions"); @@ -2011,10 +2036,10 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize( const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount(); const auto &[SrcStart_, SrcEnd_] = 
getStartAndEndForAccess(InnermostLoop, Src, ATy, BTC, SymbolicMaxBTC, - PSE.getSE(), &PointerBounds); + PSE.getSE(), &PointerBounds, AC, DT); const auto &[SinkStart_, SinkEnd_] = getStartAndEndForAccess(InnermostLoop, Sink, BTy, BTC, SymbolicMaxBTC, - PSE.getSE(), &PointerBounds); + PSE.getSE(), &PointerBounds, AC, DT); if (!isa(SrcStart_) && !isa(SrcEnd_) && !isa(SinkStart_) && @@ -2975,7 +3000,7 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE, const TargetTransformInfo *TTI, const TargetLibraryInfo *TLI, AAResults *AA, DominatorTree *DT, LoopInfo *LI, - bool AllowPartial) + AssumptionCache *AC, bool AllowPartial) : PSE(std::make_unique(*SE, *L)), PtrRtChecking(nullptr), TheLoop(L), AllowPartial(AllowPartial) { unsigned MaxTargetVectorWidthInBits = std::numeric_limits::max(); @@ -2985,8 +3010,8 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE, MaxTargetVectorWidthInBits = TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) * 2; - DepChecker = std::make_unique(*PSE, L, SymbolicStrides, - MaxTargetVectorWidthInBits); + DepChecker = std::make_unique( + *PSE, AC, DT, L, SymbolicStrides, MaxTargetVectorWidthInBits); PtrRtChecking = std::make_unique(*DepChecker, SE); if (canAnalyzeLoop()) CanVecMem = analyzeLoop(AA, LI, TLI, DT); @@ -3055,7 +3080,7 @@ const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L, // or if it was created with a different value of AllowPartial. 
if (Inserted || It->second->hasAllowPartial() != AllowPartial) It->second = std::make_unique(&L, &SE, TTI, TLI, &AA, &DT, - &LI, AllowPartial); + &LI, AC, AllowPartial); return *It->second; } @@ -3098,7 +3123,8 @@ LoopAccessInfoManager LoopAccessAnalysis::run(Function &F, auto &LI = FAM.getResult(F); auto &TTI = FAM.getResult(F); auto &TLI = FAM.getResult(F); - return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI); + auto &AC = FAM.getResult(F); + return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI, &AC); } AnalysisKey LoopAccessAnalysis::Key; diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp index f3e992c039178..b1096ce5ddd9f 100644 --- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp @@ -1009,7 +1009,8 @@ PreservedAnalyses LoopFlattenPass::run(LoopNest &LN, LoopAnalysisManager &LAM, // in simplified form, and also needs LCSSA. Running // this pass will simplify all loops that contain inner loops, // regardless of whether anything ends up being flattened. 
- LoopAccessInfoManager LAIM(AR.SE, AR.AA, AR.DT, AR.LI, &AR.TTI, nullptr); + LoopAccessInfoManager LAIM(AR.SE, AR.AA, AR.DT, AR.LI, &AR.TTI, nullptr, + nullptr); for (Loop *InnerLoop : LN.getLoops()) { auto *OuterLoop = InnerLoop->getParentLoop(); if (!OuterLoop) diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp index 4f2bfb073bafa..8e2cf832024ae 100644 --- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp +++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp @@ -551,7 +551,7 @@ PreservedAnalyses LoopVersioningLICMPass::run(Loop &L, LoopAnalysisManager &AM, const Function *F = L.getHeader()->getParent(); OptimizationRemarkEmitter ORE(F); - LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI, nullptr, nullptr); + LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI, nullptr, nullptr, nullptr); if (!LoopVersioningLICM(AA, SE, &ORE, LAIs, LAR.LI, &L).run(DT)) return PreservedAnalyses::all(); return getLoopPassPreservedAnalyses(); diff --git a/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll b/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll index 1dc8d4a7e73f8..207a44d5d08d4 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll @@ -505,7 +505,7 @@ e.1: ret i32 1 } -define void @all_exits_dominate_latch_countable_exits_at_most_500_iterations_known_deref_via_assumption(ptr %A, ptr %B) { +define void @all_exits_dominate_latch_countable_exits_at_most_500_iterations_known_deref_via_assumption(ptr %A, ptr %B) nosync nofree { ; CHECK-LABEL: 'all_exits_dominate_latch_countable_exits_at_most_500_iterations_known_deref_via_assumption' ; CHECK-NEXT: loop.header: ; CHECK-NEXT: Memory dependences are safe with run-time checks @@ -518,10 +518,10 @@ define void @all_exits_dominate_latch_countable_exits_at_most_500_iterations_kno ; CHECK-NEXT: %gep.A = getelementptr inbounds i32, ptr %A, 
i64 %iv ; CHECK-NEXT: Grouped accesses: ; CHECK-NEXT: Group GRP0: -; CHECK-NEXT: (Low: %B High: inttoptr (i64 -1 to ptr)) +; CHECK-NEXT: (Low: %B High: (2000 + %B)) ; CHECK-NEXT: Member: {%B,+,4}<%loop.header> ; CHECK-NEXT: Group GRP1: -; CHECK-NEXT: (Low: %A High: inttoptr (i64 -1 to ptr)) +; CHECK-NEXT: (Low: %A High: (2000 + %A)) ; CHECK-NEXT: Member: {%A,+,4}<%loop.header> ; CHECK-EMPTY: ; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. @@ -565,7 +565,7 @@ e.2: ret void } -define void @all_exits_dominate_latch_countable_exits_at_most_500_iterations_deref_via_assumption_too_small(ptr %A, ptr %B) { +define void @all_exits_dominate_latch_countable_exits_at_most_500_iterations_deref_via_assumption_too_small(ptr %A, ptr %B) nosync nofree { ; CHECK-LABEL: 'all_exits_dominate_latch_countable_exits_at_most_500_iterations_deref_via_assumption_too_small' ; CHECK-NEXT: loop.header: ; CHECK-NEXT: Memory dependences are safe with run-time checks diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll index 0fe893abec86c..c42b4f66da27b 100644 --- a/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll +++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll @@ -7,21 +7,48 @@ define i64 @early_exit_alignment_and_deref_known_via_assumption_with_constant_si ; CHECK-NEXT: entry: ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P1]], i64 4), "dereferenceable"(ptr [[P1]], i64 1024) ] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P2]], i64 4), "dereferenceable"(ptr [[P2]], i64 1024) ] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP0:%.*]] = 
getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX1]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX1]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 1024 +; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], [[TMP8]] +; CHECK-NEXT: br label [[LOOP_END]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr 
[[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] ; CHECK: loop.inc: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ -1, [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP1]] ], [ -1, [[LOOP_INC]] ], [ -1, [[MIDDLE_BLOCK]] ], [ [[TMP9]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: diff --git a/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp index 118bf67320a3b..c365c95da6bff 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp @@ -41,7 +41,8 @@ class VPlanSlpTest : public VPlanTestIRBase { AARes.reset(new AAResults(*TLI)); AARes->addAAResult(*BasicAA); PSE.reset(new PredicatedScalarEvolution(*SE, *L)); - LAI.reset(new LoopAccessInfo(L, &*SE, nullptr, &*TLI, &*AARes, &*DT, &*LI)); + LAI.reset(new LoopAccessInfo(L, &*SE, nullptr, &*TLI, &*AARes, &*DT, &*LI, + nullptr)); IAI.reset(new InterleavedAccessInfo(*PSE, L, &*DT, &*LI, &*LAI)); IAI->analyzeInterleaving(false); return {Plan, *IAI}; From 8db1162495bd5d0a5901e3821dec264bfcd774a9 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 7 Jul 2025 21:22:51 +0100 Subject: [PATCH 2/4] !fixup address comments, thanks --- .../llvm/Analysis/LoopAccessAnalysis.h | 17 ++++++--- llvm/lib/Analysis/Loads.cpp | 2 +- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 37 +++++++------------ llvm/lib/Transforms/Scalar/LoopFlatten.cpp | 3 +- 
.../Transforms/Scalar/LoopVersioningLICM.cpp | 2 +- 5 files changed, 29 insertions(+), 32 deletions(-) diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h index a1a5a4625a047..7eb9b19d70e67 100644 --- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -290,8 +290,14 @@ class MemoryDepChecker { return PointerBounds; } - AssumptionCache *getAC() const { return AC; } - DominatorTree *getDT() const { return DT; } + DominatorTree *getDT() const { + assert(AC && "requested DT, but it is not available"); + return DT; + } + AssumptionCache *getAC() const { + assert(AC && "requested AC, but it is not available"); + return AC; + } private: /// A wrapper around ScalarEvolution, used to add runtime SCEV checks, and @@ -927,7 +933,7 @@ LLVM_ABI std::pair getStartAndEndForAccess( const SCEV *MaxBTC, ScalarEvolution *SE, DenseMap, std::pair> *PointerBounds, - AssumptionCache *AC, DominatorTree *DT); + DominatorTree *DT, AssumptionCache *AC); class LoopAccessInfoManager { /// The cache. 
@@ -944,8 +950,9 @@ class LoopAccessInfoManager { public: LoopAccessInfoManager(ScalarEvolution &SE, AAResults &AA, DominatorTree &DT, - LoopInfo &LI, TargetTransformInfo *TTI, - const TargetLibraryInfo *TLI, AssumptionCache *AC) + LoopInfo &LI, TargetTransformInfo *TTI = nullptr, + const TargetLibraryInfo *TLI = nullptr, + AssumptionCache *AC = nullptr) : SE(SE), AA(AA), DT(DT), LI(LI), TTI(TTI), TLI(TLI), AC(AC) {} LLVM_ABI const LoopAccessInfo &getInfo(Loop &L, bool AllowPartial = false); diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index 7b4e00b298657..d099d8e43f636 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -326,7 +326,7 @@ bool llvm::isDereferenceableAndAlignedInLoop( return false; const auto &[AccessStart, AccessEnd] = getStartAndEndForAccess( - L, PtrScev, LI->getType(), BECount, MaxBECount, &SE, nullptr, AC, &DT); + L, PtrScev, LI->getType(), BECount, MaxBECount, &SE, nullptr, &DT, AC); if (isa(AccessStart) || isa(AccessEnd)) return false; diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 2719ed79cf1fb..ad4da1876e8c4 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -214,7 +214,7 @@ static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR, const SCEV *MaxBTC, const SCEV *EltSize, ScalarEvolution &SE, const DataLayout &DL, - AssumptionCache *AC, DominatorTree *DT) { + DominatorTree *DT, AssumptionCache *AC) { auto *PointerBase = SE.getPointerBase(AR->getStart()); auto *StartPtr = dyn_cast(PointerBase); if (!StartPtr) @@ -233,23 +233,14 @@ evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR, const SCEV *DerefBytesSCEV = SE.getConstant(WiderTy, DerefBytes); // Check if we have a suitable dereferencable assumption we can use. 
- RetainedKnowledge DerefRK; - if (!StartPtrV->canBeFreed() && - getKnowledgeForValue( - StartPtrV, {Attribute::Dereferenceable}, *AC, - [&](RetainedKnowledge RK, Instruction *Assume, auto) { - if (!isValidAssumeForContext( - Assume, L->getLoopPredecessor()->getTerminator(), DT)) - return false; - if (RK.AttrKind == Attribute::Dereferenceable) { - DerefRK = std::max(DerefRK, RK); - return true; - } - return false; - }) && - DerefRK.ArgValue) { - DerefBytesSCEV = SE.getUMaxExpr(DerefBytesSCEV, - SE.getConstant(WiderTy, DerefRK.ArgValue)); + if (!StartPtrV->canBeFreed()) { + RetainedKnowledge DerefRK = getKnowledgeValidInContext( + StartPtrV, {Attribute::Dereferenceable}, *AC, + L->getLoopPredecessor()->getTerminator(), DT); + if (DerefRK) { + DerefBytesSCEV = SE.getUMaxExpr( + DerefBytesSCEV, SE.getConstant(WiderTy, DerefRK.ArgValue)); + } } bool IsKnownNonNegative = SE.isKnownNonNegative(Step); @@ -298,7 +289,7 @@ std::pair llvm::getStartAndEndForAccess( const SCEV *MaxBTC, ScalarEvolution *SE, DenseMap, std::pair> *PointerBounds, - AssumptionCache *AC, DominatorTree *DT) { + DominatorTree *DT, AssumptionCache *AC) { std::pair *PtrBoundsPair; if (PointerBounds) { auto [Iter, Ins] = PointerBounds->insert( @@ -334,7 +325,7 @@ std::pair llvm::getStartAndEndForAccess( // separately checks that accesses cannot not wrap, so unsigned max // represents an upper bound. 
if (evaluatePtrAddRecAtMaxBTCWillNotWrap(AR, MaxBTC, EltSizeSCEV, *SE, DL, - AC, DT)) { + DT, AC)) { ScEnd = AR->evaluateAtIteration(MaxBTC, *SE); } else { ScEnd = SE->getAddExpr( @@ -383,7 +374,7 @@ void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr, const SCEV *BTC = PSE.getBackedgeTakenCount(); const auto &[ScStart, ScEnd] = getStartAndEndForAccess( Lp, PtrExpr, AccessTy, BTC, SymbolicMaxBTC, PSE.getSE(), - &DC.getPointerBounds(), DC.getAC(), DC.getDT()); + &DC.getPointerBounds(), DC.getDT(), DC.getAC()); assert(!isa(ScStart) && !isa(ScEnd) && "must be able to compute both start and end expressions"); @@ -2036,10 +2027,10 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize( const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount(); const auto &[SrcStart_, SrcEnd_] = getStartAndEndForAccess(InnermostLoop, Src, ATy, BTC, SymbolicMaxBTC, - PSE.getSE(), &PointerBounds, AC, DT); + PSE.getSE(), &PointerBounds, DT, AC); const auto &[SinkStart_, SinkEnd_] = getStartAndEndForAccess(InnermostLoop, Sink, BTy, BTC, SymbolicMaxBTC, - PSE.getSE(), &PointerBounds, AC, DT); + PSE.getSE(), &PointerBounds, DT, AC); if (!isa(SrcStart_) && !isa(SrcEnd_) && !isa(SinkStart_) && diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp index b1096ce5ddd9f..1a86626999878 100644 --- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp @@ -1009,8 +1009,7 @@ PreservedAnalyses LoopFlattenPass::run(LoopNest &LN, LoopAnalysisManager &LAM, // in simplified form, and also needs LCSSA. Running // this pass will simplify all loops that contain inner loops, // regardless of whether anything ends up being flattened. 
- LoopAccessInfoManager LAIM(AR.SE, AR.AA, AR.DT, AR.LI, &AR.TTI, nullptr, - nullptr); + LoopAccessInfoManager LAIM(AR.SE, AR.AA, AR.DT, AR.LI, &AR.TTI); for (Loop *InnerLoop : LN.getLoops()) { auto *OuterLoop = InnerLoop->getParentLoop(); if (!OuterLoop) diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp index 8e2cf832024ae..603ec25ea4f45 100644 --- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp +++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp @@ -551,7 +551,7 @@ PreservedAnalyses LoopVersioningLICMPass::run(Loop &L, LoopAnalysisManager &AM, const Function *F = L.getHeader()->getParent(); OptimizationRemarkEmitter ORE(F); - LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI, nullptr, nullptr, nullptr); + LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI); if (!LoopVersioningLICM(AA, SE, &ORE, LAIs, LAR.LI, &L).run(DT)) return PreservedAnalyses::all(); return getLoopPassPreservedAnalyses(); From e755778d5dcc1328cbca9459a499d6ecf7d34662 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 8 Jul 2025 12:31:29 +0100 Subject: [PATCH 3/4] !fixup pass AC when needed. 
--- llvm/include/llvm/Analysis/LoopAccessAnalysis.h | 2 +- llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp | 2 +- llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h index 7eb9b19d70e67..a357abcd495c1 100644 --- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -291,7 +291,7 @@ class MemoryDepChecker { } DominatorTree *getDT() const { - assert(AC && "requested DT, but it is not available"); + assert(DT && "requested DT, but it is not available"); return DT; } AssumptionCache *getAC() const { diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp index 603ec25ea4f45..448dc2b8b52b0 100644 --- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp +++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp @@ -551,7 +551,7 @@ PreservedAnalyses LoopVersioningLICMPass::run(Loop &L, LoopAnalysisManager &AM, const Function *F = L.getHeader()->getParent(); OptimizationRemarkEmitter ORE(F); - LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI); + LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI, nullptr, nullptr, &LAR.AC); if (!LoopVersioningLICM(AA, SE, &ORE, LAIs, LAR.LI, &L).run(DT)) return PreservedAnalyses::all(); return getLoopPassPreservedAnalyses(); diff --git a/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp index c365c95da6bff..7471355603640 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp @@ -41,8 +41,8 @@ class VPlanSlpTest : public VPlanTestIRBase { AARes.reset(new AAResults(*TLI)); AARes->addAAResult(*BasicAA); PSE.reset(new PredicatedScalarEvolution(*SE, *L)); - LAI.reset(new LoopAccessInfo(L, &*SE, nullptr, &*TLI, &*AARes, &*DT, &*LI, - nullptr)); + 
LAI.reset( + new LoopAccessInfo(L, &*SE, nullptr, &*TLI, &*AARes, &*DT, &*LI, &*AC)); IAI.reset(new InterleavedAccessInfo(*PSE, L, &*DT, &*LI, &*LAI)); IAI->analyzeInterleaving(false); return {Plan, *IAI}; From d00836a58ac42464bdc77619e1e487b0cf3eca2a Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 8 Jul 2025 12:40:44 +0100 Subject: [PATCH 4/4] !fixup exit early on zero --- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index ad4da1876e8c4..657f59478e19d 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -243,6 +243,9 @@ evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR, } } + if (DerefBytesSCEV->isZero()) + return false; + bool IsKnownNonNegative = SE.isKnownNonNegative(Step); if (!IsKnownNonNegative && !SE.isKnownNegative(Step)) return false;