Skip to content

Commit ccb9835

Browse files
authored
[X86] LowerShift - lower vXi8 shifts of a uniform constant using PSHUFB (llvm#112175)
If each 128-bit vXi8 lane is shifting the same constant value, we can pre-compute the 8 valid shift results and use PSHUFB as a LUT indexed by the shift amount. Fixes llvm#110317.
1 parent 4bf6e83 commit ccb9835

File tree

2 files changed

+350
-1727
lines changed

2 files changed

+350
-1727
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

+33
Original file line numberDiff line numberDiff line change
@@ -30143,6 +30143,39 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
3014330143
return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
3014430144
}
3014530145

30146+
// If we're shifting (per-lane) uniform vXi8 constants, we can use PSHUFB to
30147+
// look up the pre-computed shift values.
30148+
if ((VT == MVT::v16i8 && Subtarget.hasSSSE3()) ||
30149+
(VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30150+
(VT == MVT::v64i8 && Subtarget.hasBWI())) {
30151+
unsigned NumElts = VT.getVectorNumElements();
30152+
unsigned NumLanes = VT.getSizeInBits() / 128u;
30153+
unsigned NumEltsPerLane = NumElts / NumLanes;
30154+
SmallVector<APInt, 16> LUT;
30155+
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
30156+
unsigned LoElt = Lane * NumEltsPerLane;
30157+
APInt EltMask = APInt::getBitsSet(NumElts, LoElt, LoElt + NumEltsPerLane);
30158+
KnownBits KnownLane = DAG.computeKnownBits(R, EltMask);
30159+
if (!KnownLane.isConstant())
30160+
break;
30161+
const APInt &LaneSplat = KnownLane.getConstant();
30162+
for (unsigned I = 0; I != 8; ++I) {
30163+
if (Opc == ISD::SHL)
30164+
LUT.push_back(LaneSplat.shl(I));
30165+
else if (Opc == ISD::SRL)
30166+
LUT.push_back(LaneSplat.lshr(I));
30167+
else if (Opc == ISD::SRA)
30168+
LUT.push_back(LaneSplat.ashr(I));
30169+
}
30170+
LUT.append(8, APInt::getZero(8));
30171+
}
30172+
if (LUT.size() == NumElts) {
30173+
APInt Undefs = APInt::getSplat(NumElts, APInt(16, 0xFF00));
30174+
SDValue Mask = getConstVector(LUT, Undefs, VT, DAG, dl);
30175+
return DAG.getNode(X86ISD::PSHUFB, dl, VT, Mask, Amt);
30176+
}
30177+
}
30178+
3014630179
// It's worth extending once and using the vXi16/vXi32 shifts for smaller
3014730180
// types, but without AVX512 the extra overheads to get from vXi8 to vXi32
3014830181
// make the existing SSE solution better.

0 commit comments

Comments
 (0)