diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 98b64f52457d5..3900acbc8f223 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2254,23 +2254,6 @@ class slpvectorizer::BoUpSLP {
   /// effectively than the base graph.
   bool isTreeNotExtendable() const;
 
-  /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
-  /// can be load combined in the backend. Load combining may not be allowed in
-  /// the IR optimizer, so we do not want to alter the pattern. For example,
-  /// partially transforming a scalar bswap() pattern into vector code is
-  /// effectively impossible for the backend to undo.
-  /// TODO: If load combining is allowed in the IR optimizer, this analysis
-  /// may not be necessary.
-  bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
-
-  /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
-  /// can be load combined in the backend. Load combining may not be allowed in
-  /// the IR optimizer, so we do not want to alter the pattern. For example,
-  /// partially transforming a scalar bswap() pattern into vector code is
-  /// effectively impossible for the backend to undo.
-  /// TODO: If load combining is allowed in the IR optimizer, this analysis
-  /// may not be necessary.
-  bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
 
   bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
                      Align Alignment, const int64_t Diff, const size_t Sz) const;
@@ -15608,69 +15591,6 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
   return true;
 }
 
-static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
-                                       TargetTransformInfo *TTI,
-                                       bool MustMatchOrInst) {
-  // Look past the root to find a source value. Arbitrarily follow the
-  // path through operand 0 of any 'or'. Also, peek through optional
-  // shift-left-by-multiple-of-8-bits.
-  Value *ZextLoad = Root;
-  const APInt *ShAmtC;
-  bool FoundOr = false;
-  while (!isa<ConstantExpr>(ZextLoad) &&
-         (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
-          (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
-           ShAmtC->urem(8) == 0))) {
-    auto *BinOp = cast<BinaryOperator>(ZextLoad);
-    ZextLoad = BinOp->getOperand(0);
-    if (BinOp->getOpcode() == Instruction::Or)
-      FoundOr = true;
-  }
-  // Check if the input is an extended load of the required or/shift expression.
-  Value *Load;
-  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
-      !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
-    return false;
-
-  // Require that the total load bit width is a legal integer type.
-  // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
-  // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
-  Type *SrcTy = Load->getType();
-  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
-  if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
-    return false;
-
-  // Everything matched - assume that we can fold the whole sequence using
-  // load combining.
-  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
-                    << *(cast<Instruction>(Root)) << "\n");
-
-  return true;
-}
-
-bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
-  if (RdxKind != RecurKind::Or)
-    return false;
-
-  unsigned NumElts = VectorizableTree[0]->Scalars.size();
-  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
-  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
-                                    /* MatchOr */ false);
-}
-
-bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
-  // Peek through a final sequence of stores and check if all operations are
-  // likely to be load-combined.
-  unsigned NumElts = Stores.size();
-  for (Value *Scalar : Stores) {
-    Value *X;
-    if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
-        !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
-      return false;
-  }
-  return true;
-}
-
 bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
   if (!DebugCounter::shouldExecute(VectorizedGraphs))
     return true;
@@ -23497,8 +23417,6 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
       return false;
     }
   }
-  if (R.isLoadCombineCandidate(Chain))
-    return true;
   R.buildTree(Chain);
   // Check if tree tiny and store itself or its value is not vectorized.
   if (R.isTreeTinyAndNotFullyVectorizable()) {
@@ -25112,11 +25030,6 @@ class HorizontalReduction {
         V.analyzedReductionVals(VL);
         continue;
       }
-      if (V.isLoadCombineReductionCandidate(RdxKind)) {
-        if (!AdjustReducedVals())
-          V.analyzedReductionVals(VL);
-        continue;
-      }
       V.reorderTopToBottom();
       // No need to reorder the root node at all for reassociative reduction.
       V.reorderBottomToTop(/*IgnoreReorder=*/RdxFMF.allowReassoc() ||
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/loadcombine.ll b/llvm/test/Transforms/PhaseOrdering/X86/loadcombine.ll
index fe49ba9d61d98..d44ae86484316 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/loadcombine.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/loadcombine.ll
@@ -70,23 +70,10 @@ define i32 @loadCombine_4consecutive_1243(ptr %p) {
 
 define i32 @loadCombine_4consecutive_1324(ptr %p) {
 ; CHECK-LABEL: @loadCombine_4consecutive_1324(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr i8, ptr [[P]], i64 3
-; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr [[P]], align 1
-; CHECK-NEXT:    [[L2:%.*]] = load i8, ptr [[P1]], align 1
-; CHECK-NEXT:    [[L3:%.*]] = load i8, ptr [[P2]], align 1
-; CHECK-NEXT:    [[L4:%.*]] = load i8, ptr [[P3]], align 1
-; CHECK-NEXT:    [[E1:%.*]] = zext i8 [[L1]] to i32
-; CHECK-NEXT:    [[E2:%.*]] = zext i8 [[L2]] to i32
-; CHECK-NEXT:    [[E3:%.*]] = zext i8 [[L3]] to i32
-; CHECK-NEXT:    [[E4:%.*]] = zext i8 [[L4]] to i32
-; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i32 [[E2]], 8
-; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i32 [[E3]], 16
-; CHECK-NEXT:    [[S4:%.*]] = shl nuw i32 [[E4]], 24
-; CHECK-NEXT:    [[O1:%.*]] = or disjoint i32 [[S2]], [[E1]]
-; CHECK-NEXT:    [[O2:%.*]] = or disjoint i32 [[O1]], [[S3]]
-; CHECK-NEXT:    [[O3:%.*]] = or disjoint i32 [[O2]], [[S4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw <4 x i32> [[TMP2]], <i32 0, i32 8, i32 16, i32 24>
+; CHECK-NEXT:    [[O3:%.*]] = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    ret i32 [[O3]]
 ;
   %p1 = getelementptr i8, ptr %p, i32 1
@@ -114,23 +101,10 @@ define i32 @loadCombine_4consecutive_1342(ptr %p) {
 ; CHECK-LABEL: @loadCombine_4consecutive_1342(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr i8, ptr [[P]], i64 3
-; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr [[P]], align 1
-; CHECK-NEXT:    [[L2:%.*]] = load i8, ptr [[P1]], align 1
-; CHECK-NEXT:    [[L3:%.*]] = load i8, ptr [[P2]], align 1
-; CHECK-NEXT:    [[L4:%.*]] = load i8, ptr [[P3]], align 1
-; CHECK-NEXT:    [[E1:%.*]] = zext i8 [[L1]] to i32
-; CHECK-NEXT:    [[E2:%.*]] = zext i8 [[L2]] to i32
-; CHECK-NEXT:    [[E3:%.*]] = zext i8 [[L3]] to i32
-; CHECK-NEXT:    [[E4:%.*]] = zext i8 [[L4]] to i32
-; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i32 [[E2]], 8
-; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i32 [[E3]], 16
-; CHECK-NEXT:    [[S4:%.*]] = shl nuw i32 [[E4]], 24
-; CHECK-NEXT:    [[O1:%.*]] = or disjoint i32 [[S2]], [[E1]]
-; CHECK-NEXT:    [[O2:%.*]] = or disjoint i32 [[O1]], [[S3]]
-; CHECK-NEXT:    [[O3:%.*]] = or disjoint i32 [[O2]], [[S4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw <4 x i32> [[TMP2]], <i32 0, i32 8, i32 16, i32 24>
+; CHECK-NEXT:    [[O3:%.*]] = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    ret i32 [[O3]]
 ;
   %p1 = getelementptr i8, ptr %p, i32 1
@@ -158,23 +132,10 @@ define i32 @loadCombine_4consecutive_1342(ptr %p) {
 
 define i32 @loadCombine_4consecutive_1423(ptr %p) {
 ; CHECK-LABEL: @loadCombine_4consecutive_1423(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr i8, ptr [[P]], i64 3
-; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr [[P]], align 1
-; CHECK-NEXT:    [[L2:%.*]] = load i8, ptr [[P1]], align 1
-; CHECK-NEXT:    [[L3:%.*]] = load i8, ptr [[P2]], align 1
-; CHECK-NEXT:    [[L4:%.*]] = load i8, ptr [[P3]], align 1
-; CHECK-NEXT:    [[E1:%.*]] = zext i8 [[L1]] to i32
-; CHECK-NEXT:    [[E2:%.*]] = zext i8 [[L2]] to i32
-; CHECK-NEXT:    [[E3:%.*]] = zext i8 [[L3]] to i32
-; CHECK-NEXT:    [[E4:%.*]] = zext i8 [[L4]] to i32
-; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i32 [[E2]], 8
-; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i32 [[E3]], 16
-; CHECK-NEXT:    [[S4:%.*]] = shl nuw i32 [[E4]], 24
-; CHECK-NEXT:    [[O1:%.*]] = or disjoint i32 [[S2]], [[E1]]
-; CHECK-NEXT:    [[O2:%.*]] = or disjoint i32 [[O1]], [[S3]]
-; CHECK-NEXT:    [[O3:%.*]] = or disjoint i32 [[O2]], [[S4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw <4 x i32> [[TMP2]], <i32 0, i32 8, i32 16, i32 24>
+; CHECK-NEXT:    [[O3:%.*]] = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    ret i32 [[O3]]
 ;
   %p1 = getelementptr i8, ptr %p, i32 1
@@ -202,23 +163,10 @@ define i32 @loadCombine_4consecutive_1432(ptr %p) {
 ; CHECK-LABEL: @loadCombine_4consecutive_1432(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr i8, ptr [[P]], i64 3
-; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr [[P]], align 1
-; CHECK-NEXT:    [[L2:%.*]] = load i8, ptr [[P1]], align 1
-; CHECK-NEXT:    [[L3:%.*]] = load i8, ptr [[P2]], align 1
-; CHECK-NEXT:    [[L4:%.*]] = load i8, ptr [[P3]], align 1
-; CHECK-NEXT:    [[E1:%.*]] = zext i8 [[L1]] to i32
-; CHECK-NEXT:    [[E2:%.*]] = zext i8 [[L2]] to i32
-; CHECK-NEXT:    [[E3:%.*]] = zext i8 [[L3]] to i32
-; CHECK-NEXT:    [[E4:%.*]] = zext i8 [[L4]] to i32
-; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i32 [[E2]], 8
-; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i32 [[E3]], 16
-; CHECK-NEXT:    [[S4:%.*]] = shl nuw i32 [[E4]], 24
-; CHECK-NEXT:    [[O1:%.*]] = or disjoint i32 [[S2]], [[E1]]
-; CHECK-NEXT:    [[O2:%.*]] = or disjoint i32 [[O1]], [[S3]]
-; CHECK-NEXT:    [[O3:%.*]] = or disjoint i32 [[O2]], [[S4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw <4 x i32> [[TMP2]], <i32 0, i32 8, i32 16, i32 24>
+; CHECK-NEXT:    [[O3:%.*]] = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    ret i32 [[O3]]
 ;
   %p1 = getelementptr i8, ptr %p, i32 1
@@ -369,23 +317,10 @@ define i32 @loadCombine_4consecutive_2341(ptr %p) {
 
 define i32 @loadCombine_4consecutive_2413(ptr %p) {
 ; CHECK-LABEL: @loadCombine_4consecutive_2413(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr i8, ptr [[P]], i64 3
-; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr [[P]], align 1
-; CHECK-NEXT:    [[L2:%.*]] = load i8, ptr [[P1]], align 1
-; CHECK-NEXT:    [[L3:%.*]] = load i8, ptr [[P2]], align 1
-; CHECK-NEXT:    [[L4:%.*]] = load i8, ptr [[P3]], align 1
-; CHECK-NEXT:    [[E1:%.*]] = zext i8 [[L1]] to i32
-; CHECK-NEXT:    [[E2:%.*]] = zext i8 [[L2]] to i32
-; CHECK-NEXT:    [[E3:%.*]] = zext i8 [[L3]] to i32
-; CHECK-NEXT:    [[E4:%.*]] = zext i8 [[L4]] to i32
-; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i32 [[E2]], 8
-; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i32 [[E3]], 16
-; CHECK-NEXT:    [[S4:%.*]] = shl nuw i32 [[E4]], 24
-; CHECK-NEXT:    [[O1:%.*]] = or disjoint i32 [[S2]], [[E1]]
-; CHECK-NEXT:    [[O2:%.*]] = or disjoint i32 [[O1]], [[S3]]
-; CHECK-NEXT:    [[O3:%.*]] = or disjoint i32 [[O2]], [[S4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw <4 x i32> [[TMP2]], <i32 0, i32 8, i32 16, i32 24>
+; CHECK-NEXT:    [[O3:%.*]] = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    ret i32 [[O3]]
 ;
   %p1 = getelementptr i8, ptr %p, i32 1
@@ -413,23 +348,10 @@ define i32 @loadCombine_4consecutive_2413(ptr %p) {
 
 define i32 @loadCombine_4consecutive_2431(ptr %p) {
 ; CHECK-LABEL: @loadCombine_4consecutive_2431(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr i8, ptr [[P]], i64 3
-; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr [[P]], align 1
-; CHECK-NEXT:    [[L2:%.*]] = load i8, ptr [[P1]], align 1
-; CHECK-NEXT:    [[L3:%.*]] = load i8, ptr [[P2]], align 1
-; CHECK-NEXT:    [[L4:%.*]] = load i8, ptr [[P3]], align 1
-; CHECK-NEXT:    [[E1:%.*]] = zext i8 [[L1]] to i32
-; CHECK-NEXT:    [[E2:%.*]] = zext i8 [[L2]] to i32
-; CHECK-NEXT:    [[E3:%.*]] = zext i8 [[L3]] to i32
-; CHECK-NEXT:    [[E4:%.*]] = zext i8 [[L4]] to i32
-; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i32 [[E2]], 8
-; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i32 [[E3]], 16
-; CHECK-NEXT:    [[S4:%.*]] = shl nuw i32 [[E4]], 24
-; CHECK-NEXT:    [[O1:%.*]] = or disjoint i32 [[S2]], [[E1]]
-; CHECK-NEXT:    [[O2:%.*]] = or disjoint i32 [[O1]], [[S3]]
-; CHECK-NEXT:    [[O3:%.*]] = or disjoint i32 [[O2]], [[S4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw <4 x i32> [[TMP2]], <i32 0, i32 8, i32 16, i32 24>
+; CHECK-NEXT:    [[O3:%.*]] = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    ret i32 [[O3]]
 ;
   %p1 = getelementptr i8, ptr %p, i32 1
@@ -457,23 +379,10 @@ define i32 @loadCombine_4consecutive_2431(ptr %p) {
 
 define i32 @loadCombine_4consecutive_3124(ptr %p) {
 ; CHECK-LABEL: @loadCombine_4consecutive_3124(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr i8, ptr [[P]], i64 3
-; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr [[P]], align 1
-; CHECK-NEXT:    [[L2:%.*]] = load i8, ptr [[P1]], align 1
-; CHECK-NEXT:    [[L3:%.*]] = load i8, ptr [[P2]], align 1
-; CHECK-NEXT:    [[L4:%.*]] = load i8, ptr [[P3]], align 1
-; CHECK-NEXT:    [[E1:%.*]] = zext i8 [[L1]] to i32
-; CHECK-NEXT:    [[E2:%.*]] = zext i8 [[L2]] to i32
-; CHECK-NEXT:    [[E3:%.*]] = zext i8 [[L3]] to i32
-; CHECK-NEXT:    [[E4:%.*]] = zext i8 [[L4]] to i32
-; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i32 [[E2]], 8
-; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i32 [[E3]], 16
-; CHECK-NEXT:    [[S4:%.*]] = shl nuw i32 [[E4]], 24
-; CHECK-NEXT:    [[O1:%.*]] = or disjoint i32 [[S2]], [[E1]]
-; CHECK-NEXT:    [[O2:%.*]] = or disjoint i32 [[O1]], [[S3]]
-; CHECK-NEXT:    [[O3:%.*]] = or disjoint i32 [[O2]], [[S4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw <4 x i32> [[TMP2]], <i32 0, i32 8, i32 16, i32 24>
+; CHECK-NEXT:    [[O3:%.*]] = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    ret i32 [[O3]]
 ;
   %p1 = getelementptr i8, ptr %p, i32 1
@@ -501,23 +410,10 @@ define i32 @loadCombine_4consecutive_3124(ptr %p) {
 
 define i32 @loadCombine_4consecutive_3142(ptr %p) {
 ; CHECK-LABEL: @loadCombine_4consecutive_3142(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr i8, ptr [[P]], i64 3
-; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr [[P]], align 1
-; CHECK-NEXT:    [[L2:%.*]] = load i8, ptr [[P1]], align 1
-; CHECK-NEXT:    [[L3:%.*]] = load i8, ptr [[P2]], align 1
-; CHECK-NEXT:    [[L4:%.*]] = load i8, ptr [[P3]], align 1
-; CHECK-NEXT:    [[E1:%.*]] = zext i8 [[L1]] to i32
-; CHECK-NEXT:    [[E2:%.*]] = zext i8 [[L2]] to i32
-; CHECK-NEXT:    [[E3:%.*]] = zext i8 [[L3]] to i32
-; CHECK-NEXT:    [[E4:%.*]] = zext i8 [[L4]] to i32
-; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i32 [[E2]], 8
-; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i32 [[E3]], 16
-; CHECK-NEXT:    [[S4:%.*]] = shl nuw i32 [[E4]], 24
-; CHECK-NEXT:    [[O1:%.*]] = or disjoint i32 [[S2]], [[E1]]
-; CHECK-NEXT:    [[O2:%.*]] = or disjoint i32 [[O1]], [[S3]]
-; CHECK-NEXT:    [[O3:%.*]] = or disjoint i32 [[O2]], [[S4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw <4 x i32> [[TMP2]], <i32 0, i32 8, i32 16, i32 24>
+; CHECK-NEXT:    [[O3:%.*]] = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    ret i32 [[O3]]
 ;
   %p1 = getelementptr i8, ptr %p, i32 1
@@ -668,23 +564,10 @@ define i32 @loadCombine_4consecutive_3421(ptr %p) {
 
 define i32 @loadCombine_4consecutive_4123(ptr %p) {
 ; CHECK-LABEL: @loadCombine_4consecutive_4123(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr i8, ptr [[P]], i64 3
-; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr [[P]], align 1
-; CHECK-NEXT:    [[L2:%.*]] = load i8, ptr [[P1]], align 1
-; CHECK-NEXT:    [[L3:%.*]] = load i8, ptr [[P2]], align 1
-; CHECK-NEXT:    [[L4:%.*]] = load i8, ptr [[P3]], align 1
-; CHECK-NEXT:    [[E1:%.*]] = zext i8 [[L1]] to i32
-; CHECK-NEXT:    [[E2:%.*]] = zext i8 [[L2]] to i32
-; CHECK-NEXT:    [[E3:%.*]] = zext i8 [[L3]] to i32
-; CHECK-NEXT:    [[E4:%.*]] = zext i8 [[L4]] to i32
-; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i32 [[E2]], 8
-; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i32 [[E3]], 16
-; CHECK-NEXT:    [[S4:%.*]] = shl nuw i32 [[E4]], 24
-; CHECK-NEXT:    [[O1:%.*]] = or disjoint i32 [[S2]], [[E1]]
-; CHECK-NEXT:    [[O2:%.*]] = or disjoint i32 [[O1]], [[S3]]
-; CHECK-NEXT:    [[O3:%.*]] = or disjoint i32 [[O2]], [[S4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw <4 x i32> [[TMP2]], <i32 0, i32 8, i32 16, i32 24>
+; CHECK-NEXT:    [[O3:%.*]] = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    ret i32 [[O3]]
 ;
   %p1 = getelementptr i8, ptr %p, i32 1
@@ -712,23 +595,10 @@ define i32 @loadCombine_4consecutive_4123(ptr %p) {
 
 define i32 @loadCombine_4consecutive_4132(ptr %p) {
 ; CHECK-LABEL: @loadCombine_4consecutive_4132(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr i8, ptr [[P]], i64 3
-; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr [[P]], align 1
-; CHECK-NEXT:    [[L2:%.*]] = load i8, ptr [[P1]], align 1
-; CHECK-NEXT:    [[L3:%.*]] = load i8, ptr [[P2]], align 1
-; CHECK-NEXT:    [[L4:%.*]] = load i8, ptr [[P3]], align 1
-; CHECK-NEXT:    [[E1:%.*]] = zext i8 [[L1]] to i32
-; CHECK-NEXT:    [[E2:%.*]] = zext i8 [[L2]] to i32
-; CHECK-NEXT:    [[E3:%.*]] = zext i8 [[L3]] to i32
-; CHECK-NEXT:    [[E4:%.*]] = zext i8 [[L4]] to i32
-; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i32 [[E2]], 8
-; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i32 [[E3]], 16
-; CHECK-NEXT:    [[S4:%.*]] = shl nuw i32 [[E4]], 24
-; CHECK-NEXT:    [[O1:%.*]] = or disjoint i32 [[S2]], [[E1]]
-; CHECK-NEXT:    [[O2:%.*]] = or disjoint i32 [[O1]], [[S3]]
-; CHECK-NEXT:    [[O3:%.*]] = or disjoint i32 [[O2]], [[S4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw <4 x i32> [[TMP2]], <i32 0, i32 8, i32 16, i32 24>
+; CHECK-NEXT:    [[O3:%.*]] = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    ret i32 [[O3]]
 ;
   %p1 = getelementptr i8, ptr %p, i32 1
@@ -756,23 +626,10 @@ define i32 @loadCombine_4consecutive_4132(ptr %p) {
 
 define i32 @loadCombine_4consecutive_4213(ptr %p) {
 ; CHECK-LABEL: @loadCombine_4consecutive_4213(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr i8, ptr [[P]], i64 3
-; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr [[P]], align 1
-; CHECK-NEXT:    [[L2:%.*]] = load i8, ptr [[P1]], align 1
-; CHECK-NEXT:    [[L3:%.*]] = load i8, ptr [[P2]], align 1
-; CHECK-NEXT:    [[L4:%.*]] = load i8, ptr [[P3]], align 1
-; CHECK-NEXT:    [[E1:%.*]] = zext i8 [[L1]] to i32
-; CHECK-NEXT:    [[E2:%.*]] = zext i8 [[L2]] to i32
-; CHECK-NEXT:    [[E3:%.*]] = zext i8 [[L3]] to i32
-; CHECK-NEXT:    [[E4:%.*]] = zext i8 [[L4]] to i32
-; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i32 [[E2]], 8
-; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i32 [[E3]], 16
-; CHECK-NEXT:    [[S4:%.*]] = shl nuw i32 [[E4]], 24
-; CHECK-NEXT:    [[O1:%.*]] = or disjoint i32 [[S2]], [[E1]]
-; CHECK-NEXT:    [[O2:%.*]] = or disjoint i32 [[O1]], [[S3]]
-; CHECK-NEXT:    [[O3:%.*]] = or disjoint i32 [[O2]], [[S4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw <4 x i32> [[TMP2]], <i32 0, i32 8, i32 16, i32 24>
+; CHECK-NEXT:    [[O3:%.*]] = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    ret i32 [[O3]]
 ;
   %p1 = getelementptr i8, ptr %p, i32 1
@@ -800,23 +657,10 @@ define i32 @loadCombine_4consecutive_4213(ptr %p) {
 
 define i32 @loadCombine_4consecutive_4231(ptr %p) {
 ; CHECK-LABEL: @loadCombine_4consecutive_4231(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr i8, ptr [[P]], i64 3
-; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr [[P]], align 1
-; CHECK-NEXT:    [[L2:%.*]] = load i8, ptr [[P1]], align 1
-; CHECK-NEXT:    [[L3:%.*]] = load i8, ptr [[P2]], align 1
-; CHECK-NEXT:    [[L4:%.*]] = load i8, ptr [[P3]], align 1
-; CHECK-NEXT:    [[E1:%.*]] = zext i8 [[L1]] to i32
-; CHECK-NEXT:    [[E2:%.*]] = zext i8 [[L2]] to i32
-; CHECK-NEXT:    [[E3:%.*]] = zext i8 [[L3]] to i32
-; CHECK-NEXT:    [[E4:%.*]] = zext i8 [[L4]] to i32
-; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i32 [[E2]], 8
-; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i32 [[E3]], 16
-; CHECK-NEXT:    [[S4:%.*]] = shl nuw i32 [[E4]], 24
-; CHECK-NEXT:    [[O1:%.*]] = or disjoint i32 [[S2]], [[E1]]
-; CHECK-NEXT:    [[O2:%.*]] = or disjoint i32 [[O1]], [[S3]]
-; CHECK-NEXT:    [[O3:%.*]] = or disjoint i32 [[O2]], [[S4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw <4 x i32> [[TMP2]], <i32 0, i32 8, i32 16, i32 24>
+; CHECK-NEXT:    [[O3:%.*]] = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    ret i32 [[O3]]
 ;
   %p1 = getelementptr i8, ptr %p, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll b/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
index dcc9693860322..4a111bcc35282 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
@@ -7,43 +7,10 @@ define i64 @load_bswap(ptr %p) {
 ; CHECK-LABEL: @load_bswap(
-; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds [[V8I8:%.*]], ptr [[P:%.*]], i64 0, i32 1
-; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds [[V8I8]], ptr [[P]], i64 0, i32 2
-; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds [[V8I8]], ptr [[P]], i64 0, i32 3
-; CHECK-NEXT:    [[G4:%.*]] = getelementptr inbounds [[V8I8]], ptr [[P]], i64 0, i32 4
-; CHECK-NEXT:    [[G5:%.*]] = getelementptr inbounds [[V8I8]], ptr [[P]], i64 0, i32 5
-; CHECK-NEXT:    [[G6:%.*]] = getelementptr inbounds [[V8I8]], ptr [[P]], i64 0, i32 6
-; CHECK-NEXT:    [[G7:%.*]] = getelementptr inbounds [[V8I8]], ptr [[P]], i64 0, i32 7
-; CHECK-NEXT:    [[T0:%.*]] = load i8, ptr [[P]], align 1
-; CHECK-NEXT:    [[T1:%.*]] = load i8, ptr [[G1]], align 1
-; CHECK-NEXT:    [[T2:%.*]] = load i8, ptr [[G2]], align 1
-; CHECK-NEXT:    [[T3:%.*]] = load i8, ptr [[G3]], align 1
-; CHECK-NEXT:    [[T4:%.*]] = load i8, ptr [[G4]], align 1
-; CHECK-NEXT:    [[T5:%.*]] = load i8, ptr [[G5]], align 1
-; CHECK-NEXT:    [[T6:%.*]] = load i8, ptr [[G6]], align 1
-; CHECK-NEXT:    [[T7:%.*]] = load i8, ptr [[G7]], align 1
-; CHECK-NEXT:    [[Z0:%.*]] = zext i8 [[T0]] to i64
-; CHECK-NEXT:    [[Z1:%.*]] = zext i8 [[T1]] to i64
-; CHECK-NEXT:    [[Z2:%.*]] = zext i8 [[T2]] to i64
-; CHECK-NEXT:    [[Z3:%.*]] = zext i8 [[T3]] to i64
-; CHECK-NEXT:    [[Z4:%.*]] = zext i8 [[T4]] to i64
-; CHECK-NEXT:    [[Z5:%.*]] = zext i8 [[T5]] to i64
-; CHECK-NEXT:    [[Z6:%.*]] = zext i8 [[T6]] to i64
-; CHECK-NEXT:    [[Z7:%.*]] = zext i8 [[T7]] to i64
-; CHECK-NEXT:    [[SH0:%.*]] = shl nuw i64 [[Z0]], 56
-; CHECK-NEXT:    [[SH1:%.*]] = shl nuw nsw i64 [[Z1]], 48
-; CHECK-NEXT:    [[SH2:%.*]] = shl nuw nsw i64 [[Z2]], 40
-; CHECK-NEXT:    [[SH3:%.*]] = shl nuw nsw i64 [[Z3]], 32
-; CHECK-NEXT:    [[SH4:%.*]] = shl nuw nsw i64 [[Z4]], 24
-; CHECK-NEXT:    [[SH5:%.*]] = shl nuw nsw i64 [[Z5]], 16
-; CHECK-NEXT:    [[SH6:%.*]] = shl nuw nsw i64 [[Z6]], 8
-; CHECK-NEXT:    [[OR01:%.*]] = or i64 [[SH0]], [[SH1]]
-; CHECK-NEXT:    [[OR012:%.*]] = or i64 [[OR01]], [[SH2]]
-; CHECK-NEXT:    [[OR0123:%.*]] = or i64 [[OR012]], [[SH3]]
-; CHECK-NEXT:    [[OR01234:%.*]] = or i64 [[OR0123]], [[SH4]]
-; CHECK-NEXT:    [[OR012345:%.*]] = or i64 [[OR01234]], [[SH5]]
-; CHECK-NEXT:    [[OR0123456:%.*]] = or i64 [[OR012345]], [[SH6]]
-; CHECK-NEXT:    [[OR01234567:%.*]] = or i64 [[OR0123456]], [[Z7]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw <8 x i64> [[TMP2]], <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 8, i64 0>
+; CHECK-NEXT:    [[OR01234567:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP3]])
 ; CHECK-NEXT:    ret i64 [[OR01234567]]
 ;
   %g1 = getelementptr inbounds %v8i8, ptr %p, i64 0, i32 1
@@ -93,44 +60,10 @@ define i64 @load_bswap_nop_shift(ptr %p) {
 ; CHECK-LABEL: @load_bswap_nop_shift(
-; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds [[V8I8:%.*]], ptr [[P:%.*]], i64 0, i32 1
-; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds [[V8I8]], ptr [[P]], i64 0, i32 2
-; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds [[V8I8]], ptr [[P]], i64 0, i32 3
-; CHECK-NEXT:    [[G4:%.*]] = getelementptr inbounds [[V8I8]], ptr [[P]], i64 0, i32 4
-; CHECK-NEXT:    [[G5:%.*]] = getelementptr inbounds [[V8I8]], ptr [[P]], i64 0, i32 5
-; CHECK-NEXT:    [[G6:%.*]] = getelementptr inbounds [[V8I8]], ptr [[P]], i64 0, i32 6
-; CHECK-NEXT:    [[G7:%.*]] = getelementptr inbounds [[V8I8]], ptr [[P]], i64 0, i32 7
-; CHECK-NEXT:    [[T0:%.*]] = load i8, ptr [[P]], align 1
-; CHECK-NEXT:    [[T1:%.*]] = load i8, ptr [[G1]], align 1
-; CHECK-NEXT:    [[T2:%.*]] = load i8, ptr [[G2]], align 1
-; CHECK-NEXT:    [[T3:%.*]] = load i8, ptr [[G3]], align 1
-; CHECK-NEXT:    [[T4:%.*]] = load i8, ptr [[G4]], align 1
-; CHECK-NEXT:    [[T5:%.*]] = load i8, ptr [[G5]], align 1
-; CHECK-NEXT:    [[T6:%.*]] = load i8, ptr [[G6]], align 1
-; CHECK-NEXT:    [[T7:%.*]] = load i8, ptr [[G7]], align 1
-; CHECK-NEXT:    [[Z0:%.*]] = zext i8 [[T0]] to i64
-; CHECK-NEXT:    [[Z1:%.*]] = zext i8 [[T1]] to i64
-; CHECK-NEXT:    [[Z2:%.*]] = zext i8 [[T2]] to i64
-; CHECK-NEXT:    [[Z3:%.*]] = zext i8 [[T3]] to i64
-; CHECK-NEXT:    [[Z4:%.*]] = zext i8 [[T4]] to i64
-; CHECK-NEXT:    [[Z5:%.*]] = zext i8 [[T5]] to i64
-; CHECK-NEXT:    [[Z6:%.*]] = zext i8 [[T6]] to i64
-; CHECK-NEXT:    [[Z7:%.*]] = zext i8 [[T7]] to i64
-; CHECK-NEXT:    [[SH0:%.*]] = shl nuw i64 [[Z0]], 56
-; CHECK-NEXT:    [[SH1:%.*]] = shl nuw nsw i64 [[Z1]], 48
-; CHECK-NEXT:    [[SH2:%.*]] = shl nuw nsw i64 [[Z2]], 40
-; CHECK-NEXT:    [[SH3:%.*]] = shl nuw nsw i64 [[Z3]], 32
-; CHECK-NEXT:    [[SH4:%.*]] = shl nuw nsw i64 [[Z4]], 24
-; CHECK-NEXT:    [[SH5:%.*]] = shl nuw nsw i64 [[Z5]], 16
-; CHECK-NEXT:    [[SH6:%.*]] = shl nuw nsw i64 [[Z6]], 8
-; CHECK-NEXT:    [[SH7:%.*]] = shl nuw nsw i64 [[Z7]], 0
-; CHECK-NEXT:    [[OR01:%.*]] = or i64 [[SH0]], [[SH1]]
-; CHECK-NEXT:    [[OR012:%.*]] = or i64 [[OR01]], [[SH2]]
-; CHECK-NEXT:    [[OR0123:%.*]] = or i64 [[OR012]], [[SH3]]
-; CHECK-NEXT:    [[OR01234:%.*]] = or i64 [[OR0123]], [[SH4]]
-; CHECK-NEXT:    [[OR012345:%.*]] = or i64 [[OR01234]], [[SH5]]
-; CHECK-NEXT:    [[OR0123456:%.*]] = or i64 [[OR012345]], [[SH6]]
-; CHECK-NEXT:    [[OR01234567:%.*]] = or i64 [[OR0123456]], [[SH7]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw <8 x i64> [[TMP2]], <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 8, i64 0>
+; CHECK-NEXT:    [[OR01234567:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP3]])
 ; CHECK-NEXT:    ret i64 [[OR01234567]]
 ;
   %g1 = getelementptr inbounds %v8i8, ptr %p, i64 0, i32 1
@@ -182,43 +115,10 @@ define i64 @load64le(ptr %arg) {
 ; CHECK-LABEL: @load64le(
-; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i64 1
-; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 2
-; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 3
-; CHECK-NEXT:    [[G4:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 4
-; CHECK-NEXT:    [[G5:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 5
-; CHECK-NEXT:    [[G6:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 6
-; CHECK-NEXT:    [[G7:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 7
-; CHECK-NEXT:    [[LD0:%.*]] = load i8, ptr [[ARG]], align 1
-; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[G1]], align 1
-; CHECK-NEXT:    [[LD2:%.*]] = load i8, ptr [[G2]], align 1
-; CHECK-NEXT:    [[LD3:%.*]] = load i8, ptr [[G3]], align 1
-; CHECK-NEXT:    [[LD4:%.*]] = load i8, ptr [[G4]], align 1
-; CHECK-NEXT:    [[LD5:%.*]] = load i8, ptr [[G5]], align 1
-; CHECK-NEXT:    [[LD6:%.*]] = load i8, ptr [[G6]], align 1
-; CHECK-NEXT:    [[LD7:%.*]] = load i8, ptr [[G7]], align 1
-; CHECK-NEXT:    [[Z0:%.*]] = zext i8 [[LD0]] to i64
-; CHECK-NEXT:    [[Z1:%.*]] = zext i8 [[LD1]] to i64
-; CHECK-NEXT:    [[Z2:%.*]] = zext i8 [[LD2]] to i64
-; CHECK-NEXT:    [[Z3:%.*]] = zext i8 [[LD3]] to i64
-; CHECK-NEXT:    [[Z4:%.*]] = zext i8 [[LD4]] to i64
-; CHECK-NEXT:    [[Z5:%.*]] = zext i8 [[LD5]] to i64
-; CHECK-NEXT:    [[Z6:%.*]] = zext i8 [[LD6]] to i64
-; CHECK-NEXT:    [[Z7:%.*]] = zext i8 [[LD7]] to i64
-; CHECK-NEXT:    [[S1:%.*]] = shl nuw nsw i64 [[Z1]], 8
-; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i64 [[Z2]], 16
-; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i64 [[Z3]], 24
-; CHECK-NEXT:    [[S4:%.*]] = shl nuw nsw i64 [[Z4]], 32
-; CHECK-NEXT:    [[S5:%.*]] = shl nuw nsw i64 [[Z5]], 40
-; CHECK-NEXT:    [[S6:%.*]] = shl nuw nsw i64 [[Z6]], 48
-; CHECK-NEXT:    [[S7:%.*]] = shl nuw i64 [[Z7]], 56
-; CHECK-NEXT:    [[O1:%.*]] = or i64 [[S1]], [[Z0]]
-; CHECK-NEXT:    [[O2:%.*]] = or i64 [[O1]], [[S2]]
-; CHECK-NEXT:    [[O3:%.*]] = or i64 [[O2]], [[S3]]
-; CHECK-NEXT:    [[O4:%.*]] = or i64 [[O3]], [[S4]]
-; CHECK-NEXT:    [[O5:%.*]] = or i64 [[O4]], [[S5]]
-; CHECK-NEXT:    [[O6:%.*]] = or i64 [[O5]], [[S6]]
-; CHECK-NEXT:    [[O7:%.*]] = or i64 [[O6]], [[S7]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[ARG:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl <8 x i64> [[TMP2]], <i64 0, i64 8, i64 16, i64 24, i64 32, i64 40, i64 48, i64 56>
+; CHECK-NEXT:    [[O7:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP3]])
 ; CHECK-NEXT:    ret i64 [[O7]]
 ;
   %g1 = getelementptr inbounds i8, ptr %arg, i64 1
@@ -268,44 +168,10 @@ define i64 @load64le_nop_shift(ptr %arg) {
 ; CHECK-LABEL: @load64le_nop_shift(
-; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i64 1
-; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 2
-; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 3
-; CHECK-NEXT:    [[G4:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 4
-; CHECK-NEXT:    [[G5:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 5
-; CHECK-NEXT:    [[G6:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 6
-; CHECK-NEXT:    [[G7:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 7
-; CHECK-NEXT:    [[LD0:%.*]] = load i8, ptr [[ARG]], align 1
-; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[G1]], align 1
-; CHECK-NEXT:    [[LD2:%.*]] = load i8, ptr [[G2]], align 1
-; CHECK-NEXT:    [[LD3:%.*]] = load i8, ptr [[G3]], align 1
-; CHECK-NEXT:    [[LD4:%.*]] = load i8, ptr [[G4]], align 1
-; CHECK-NEXT:    [[LD5:%.*]] = load i8, ptr [[G5]], align 1
-; CHECK-NEXT:    [[LD6:%.*]] = load i8, ptr [[G6]], align 1
-; CHECK-NEXT:    [[LD7:%.*]] = load i8, ptr [[G7]], align 1
-; CHECK-NEXT:    [[Z0:%.*]] = zext i8 [[LD0]] to i64
-; CHECK-NEXT:    [[Z1:%.*]] = zext i8 [[LD1]] to i64
-; CHECK-NEXT:    [[Z2:%.*]] = zext i8 [[LD2]] to i64
-; CHECK-NEXT:    [[Z3:%.*]] = zext i8 [[LD3]] to i64
-; CHECK-NEXT:    [[Z4:%.*]] = zext i8 [[LD4]] to i64
-; CHECK-NEXT:    [[Z5:%.*]] = zext i8 [[LD5]] to i64
-; CHECK-NEXT:    [[Z6:%.*]] = zext i8 [[LD6]] to i64
-; CHECK-NEXT:    [[Z7:%.*]] = zext i8 [[LD7]] to i64
-; CHECK-NEXT:    [[S0:%.*]] = shl nuw nsw i64 [[Z0]], 0
-; CHECK-NEXT:    [[S1:%.*]] = shl nuw nsw i64 [[Z1]], 8
-; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i64 [[Z2]], 16
-; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i64 [[Z3]], 24
-; CHECK-NEXT:    [[S4:%.*]] = shl nuw nsw i64 [[Z4]], 32
-; CHECK-NEXT:    [[S5:%.*]] = shl nuw nsw i64 [[Z5]], 40
-; CHECK-NEXT:    [[S6:%.*]] = shl nuw nsw i64 [[Z6]], 48
-; CHECK-NEXT:    [[S7:%.*]] = shl nuw i64 [[Z7]], 56
-; CHECK-NEXT:    [[O1:%.*]] = or i64 [[S1]], [[S0]]
-; CHECK-NEXT:    [[O2:%.*]] = or i64 [[O1]], [[S2]]
-; CHECK-NEXT:    [[O3:%.*]] = or i64 [[O2]], [[S3]]
-; CHECK-NEXT:    [[O4:%.*]] = or i64 [[O3]], [[S4]]
-; CHECK-NEXT:    [[O5:%.*]] = or i64 [[O4]], [[S5]]
-; CHECK-NEXT:    [[O6:%.*]] = or i64 [[O5]], [[S6]]
-; CHECK-NEXT:    [[O7:%.*]] = or i64 [[O6]], [[S7]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[ARG:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw <8 x i64> [[TMP2]], <i64 0, i64 8, i64 16, i64 24, i64 32, i64 40, i64 48, i64 56>
+; CHECK-NEXT:    [[O7:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP3]])
 ; CHECK-NEXT:    ret i64 [[O7]]
 ;
   %g1 = getelementptr inbounds i8, ptr %arg, i64 1
@@ -355,84 +221,22 @@ define void @PR39538(ptr %t0, ptr %t1) {
 ; CHECK-LABEL: @PR39538(
-; CHECK-NEXT:    [[T6:%.*]] = getelementptr inbounds i8, ptr [[T0:%.*]], i64 1
-; CHECK-NEXT:    [[T11:%.*]] = getelementptr inbounds i8, ptr [[T0]], i64 2
-; CHECK-NEXT:    [[T16:%.*]] = getelementptr inbounds i8, ptr [[T0]], i64 3
-; CHECK-NEXT:    [[T20:%.*]] = getelementptr inbounds i8, ptr [[T0]], i64 4
-; CHECK-NEXT:    [[T24:%.*]] = getelementptr inbounds i8, ptr [[T0]], i64 5
-; CHECK-NEXT:    [[T29:%.*]] = getelementptr inbounds i8, ptr [[T0]], i64 6
-; CHECK-NEXT:    [[T34:%.*]] = getelementptr inbounds i8, ptr [[T0]], i64 7
-; CHECK-NEXT:    [[T39:%.*]] = getelementptr inbounds i8, ptr [[T0]], i64 8
-; CHECK-NEXT:    [[T43:%.*]] = getelementptr inbounds i8, ptr [[T0]], i64 9
-; CHECK-NEXT:    [[T48:%.*]] = getelementptr inbounds i8, ptr [[T0]], i64 10
-; CHECK-NEXT:    [[T53:%.*]] = getelementptr inbounds i8, ptr [[T0]], i64 11
-; CHECK-NEXT:    [[T58:%.*]] = getelementptr inbounds i8, ptr [[T0]], i64 12
-; CHECK-NEXT:    [[T62:%.*]] = getelementptr inbounds i8, ptr [[T0]], i64 13
-; CHECK-NEXT:    [[T67:%.*]] = getelementptr inbounds i8, ptr [[T0]], i64 14
-; CHECK-NEXT:    [[T72:%.*]] = getelementptr inbounds i8, ptr [[T0]], i64 15
-; CHECK-NEXT:    [[T38:%.*]] = getelementptr inbounds i32, ptr [[T1:%.*]], i64 1
-; CHECK-NEXT:    [[T57:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 2
-; CHECK-NEXT:    [[T76:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 3
-; CHECK-NEXT:    [[T3:%.*]] = load i8, ptr [[T0]], align 1
-; CHECK-NEXT:    [[T7:%.*]] = load i8, ptr [[T6]], align 1
-; CHECK-NEXT:    [[T12:%.*]] = load i8, ptr [[T11]], align 1
-; CHECK-NEXT:    [[T17:%.*]] = load i8, ptr [[T16]], align 1
-; CHECK-NEXT:    [[T21:%.*]] = load i8, ptr [[T20]], align 1
-; CHECK-NEXT:    [[T25:%.*]] = load i8, ptr [[T24]], align 1
-; CHECK-NEXT:    [[T30:%.*]] = load i8, ptr [[T29]], align 1
-; CHECK-NEXT:    [[T35:%.*]] = load i8, ptr [[T34]], align 1
-; CHECK-NEXT:    [[T40:%.*]] = load i8, ptr [[T39]], align 1
-; CHECK-NEXT:    [[T44:%.*]] = load i8, ptr [[T43]], align 1
-; CHECK-NEXT:    [[T49:%.*]] = load i8, ptr [[T48]], align 1
-; CHECK-NEXT:    [[T54:%.*]] = load i8, ptr [[T53]], align 1
-; CHECK-NEXT:    [[T59:%.*]] = load i8, ptr [[T58]], align 1
-; CHECK-NEXT:    [[T63:%.*]] = load i8, ptr [[T62]], align 1
-; CHECK-NEXT:    [[T68:%.*]] = load i8, ptr [[T67]], align 1
-; CHECK-NEXT:    [[T73:%.*]] = load i8, ptr [[T72]], align 1
-; CHECK-NEXT:    [[T4:%.*]] = zext i8 [[T3]] to i32
-; CHECK-NEXT:    [[T8:%.*]] = zext i8 [[T7]] to i32
-; CHECK-NEXT:    [[T13:%.*]] = zext i8 [[T12]] to i32
-; CHECK-NEXT:    [[T18:%.*]] = zext i8 [[T17]] to i32
-; CHECK-NEXT:    [[T22:%.*]] = zext i8 [[T21]] to i32
-; CHECK-NEXT:    [[T26:%.*]] = zext i8 [[T25]] to i32
-; CHECK-NEXT:    [[T31:%.*]] = zext i8 [[T30]] to i32
-; CHECK-NEXT:    [[T36:%.*]] = zext i8 [[T35]] to i32
-; CHECK-NEXT:    [[T41:%.*]] = zext i8 [[T40]] to i32
-; CHECK-NEXT:    [[T45:%.*]] = zext i8 [[T44]] to i32
-; CHECK-NEXT:    [[T50:%.*]] = zext i8 [[T49]] to i32
-; CHECK-NEXT:    [[T55:%.*]] = zext i8 [[T54]] to i32
-; CHECK-NEXT:    [[T60:%.*]] = zext i8 [[T59]] to i32
-; CHECK-NEXT:    [[T64:%.*]] = zext i8 [[T63]] to i32
-; CHECK-NEXT:    [[T69:%.*]] = zext i8 [[T68]] to i32
-; CHECK-NEXT:    [[T74:%.*]] = zext i8 [[T73]] to i32
-; CHECK-NEXT:    [[T5:%.*]] = shl nuw i32 [[T4]], 24
-; CHECK-NEXT:    [[T23:%.*]] = shl nuw i32 [[T22]], 24
-; CHECK-NEXT:    [[T42:%.*]] = shl nuw i32 [[T41]], 24
-; CHECK-NEXT:    [[T61:%.*]] = shl nuw i32 [[T60]], 24
-; CHECK-NEXT:    [[T9:%.*]] = shl nuw nsw i32 [[T8]], 16
-; CHECK-NEXT:    [[T27:%.*]] = shl nuw nsw i32 [[T26]], 16
-; CHECK-NEXT:    [[T46:%.*]] = shl nuw nsw i32 [[T45]], 16
-; CHECK-NEXT:    [[T65:%.*]] = shl nuw nsw i32 [[T64]], 16
-; CHECK-NEXT:    [[T14:%.*]] = shl nuw nsw i32 [[T13]], 8
-; CHECK-NEXT:    [[T32:%.*]] = shl nuw nsw i32 [[T31]], 8
-; CHECK-NEXT:    [[T51:%.*]] = shl nuw nsw i32 [[T50]], 8
-; CHECK-NEXT:    [[T70:%.*]] = shl nuw nsw i32 [[T69]], 8
-; CHECK-NEXT:    [[T10:%.*]] = or i32 [[T9]], [[T5]]
-; CHECK-NEXT:    [[T15:%.*]] = or i32 [[T10]], [[T14]]
-; CHECK-NEXT:    [[T19:%.*]] = or i32 [[T15]], [[T18]]
-; CHECK-NEXT:    [[T28:%.*]] = or i32 [[T27]], [[T23]]
-; CHECK-NEXT:    [[T33:%.*]] = or i32 [[T28]], [[T32]]
-; CHECK-NEXT:    [[T37:%.*]] = or i32 [[T33]], [[T36]]
-; CHECK-NEXT:    [[T47:%.*]] = or i32 [[T46]], [[T42]]
-; CHECK-NEXT:    [[T52:%.*]] = or i32 [[T47]], [[T51]]
-; CHECK-NEXT:    [[T56:%.*]] = or i32 [[T52]], [[T55]]
-; CHECK-NEXT:    [[T66:%.*]] = or i32 [[T65]], [[T61]]
-; CHECK-NEXT:    [[T71:%.*]] = or i32 [[T66]], [[T70]]
-; CHECK-NEXT:    [[T75:%.*]] = or i32 [[T71]], [[T74]]
-; CHECK-NEXT:    store i32 [[T19]], ptr [[T1]], align 4
-; CHECK-NEXT:    store i32 [[T37]], ptr [[T38]], align 4
-; CHECK-NEXT:    store i32 [[T56]], ptr [[T57]], align 4
-; CHECK-NEXT:    store i32 [[T75]], ptr [[T76]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[T0:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT:    [[TMP7:%.*]] = zext <4 x i8> [[TMP6]] to <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:    [[TMP9:%.*]] = zext <4 x i8> [[TMP8]] to <4 x i32>
+; CHECK-NEXT:    [[TMP10:%.*]] = shl nuw <4 x i32> [[TMP3]], <i32 24, i32 24, i32 24, i32 24>
+; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw <4 x i32> [[TMP5]], <i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    [[TMP12:%.*]] = shl nuw nsw <4 x i32> [[TMP7]], splat (i32 8)
+; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i32> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i32> [[TMP13]], [[TMP12]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i32> [[TMP14]], [[TMP9]]
+; CHECK-NEXT:    store <4 x i32> [[TMP15]], ptr [[T1:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %t6 = getelementptr inbounds i8, ptr %t0, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
index c02ef8388b066..7936d1e77c702 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
@@ -10,23 +10,10 @@ define i32 @_Z9load_le32Ph(ptr nocapture readonly %data) {
 ; CHECK-LABEL: @_Z9load_le32Ph(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[DATA:%.*]], align 1
-; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP0]] to i32
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DATA]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT:    [[CONV2:%.*]] = zext i8 [[TMP1]] to i32
-; CHECK-NEXT:    [[SHL3:%.*]] = shl nuw nsw i32 [[CONV2]], 8
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHL3]], [[CONV]]
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[DATA]], i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; CHECK-NEXT:    [[CONV5:%.*]] = zext i8 [[TMP2]] to i32
-; CHECK-NEXT:    [[SHL6:%.*]] = shl nuw nsw i32 [[CONV5]], 16
-; CHECK-NEXT:    [[OR7:%.*]] = or i32 [[OR]], [[SHL6]]
-; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[DATA]], i64 3
-; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX8]], align 1
-; CHECK-NEXT:    [[CONV9:%.*]] = zext i8 [[TMP3]] to i32
-; CHECK-NEXT:    [[SHL10:%.*]] = shl nuw i32 [[CONV9]], 24
-; CHECK-NEXT:    [[OR11:%.*]] = or i32 [[OR7]], [[SHL10]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[DATA:%.*]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = shl <4 x i32> [[TMP1]], <i32 0, i32 8, i32 16, i32 24>
+; CHECK-NEXT:    [[OR11:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP2]])
 ; CHECK-NEXT:    ret i32 [[OR11]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
index 0545e5403f594..5f8f7ac9ed36c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
@@ -10,23 +10,10 @@ define i32 @_Z9load_le32Ph(ptr nocapture readonly %data) {
 ; CHECK-LABEL: @_Z9load_le32Ph(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[DATA:%.*]], align 1
-; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP0]] to i32
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DATA]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT:    [[CONV2:%.*]] = zext i8 [[TMP1]] to i32
-; CHECK-NEXT:    [[SHL3:%.*]] = shl nuw nsw i32 [[CONV2]], 8
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHL3]], [[CONV]]
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[DATA]], i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; CHECK-NEXT:    [[CONV5:%.*]] = zext i8 [[TMP2]] to i32
-; CHECK-NEXT:    [[SHL6:%.*]] = shl nuw nsw i32 [[CONV5]], 16
-; CHECK-NEXT:    [[OR7:%.*]] = or i32 [[OR]], [[SHL6]]
-; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[DATA]], i64 3
-; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX8]], align 1
-; CHECK-NEXT:    [[CONV9:%.*]] = zext i8 [[TMP3]] to i32
-; CHECK-NEXT:    [[SHL10:%.*]] = shl nuw i32 [[CONV9]], 24
-; CHECK-NEXT:    [[OR11:%.*]] = or i32 [[OR7]], [[SHL10]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[DATA:%.*]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = shl <4 x i32> [[TMP1]], <i32 0, i32 8, i32 16, i32 24>
+; CHECK-NEXT:    [[OR11:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP2]])
 ; CHECK-NEXT:    ret i32 [[OR11]]
 ;
 entry: