Skip to content

Commit 6f236cd

Browse files
authored
Revert "[AMDGPU] Use LSH for lowering ctlz_zero_undef.i8/i16 (#88512)"
This reverts commit fb2c659.
1 parent 2a90d59 commit 6f236cd

File tree

5 files changed: +178 -169 lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

+6 -16
Original file line number | Diff line number | Diff line change
@@ -3117,30 +3117,20 @@ static bool isCttzOpc(unsigned Opc) {
31173117
SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
31183118
SelectionDAG &DAG) const {
31193119
auto SL = SDLoc(Op);
3120-
auto Opc = Op.getOpcode();
31213120
auto Arg = Op.getOperand(0u);
31223121
auto ResultVT = Op.getValueType();
31233122

31243123
if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
31253124
return {};
31263125

3127-
assert(isCtlzOpc(Opc));
3126+
assert(isCtlzOpc(Op.getOpcode()));
31283127
assert(ResultVT == Arg.getValueType());
31293128

3130-
const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3131-
SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
3132-
SDValue NewOp;
3133-
3134-
if (Opc == ISD::CTLZ_ZERO_UNDEF) {
3135-
NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
3136-
NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
3137-
NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3138-
} else {
3139-
NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3140-
NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3141-
NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
3142-
}
3143-
3129+
auto const LeadingZeroes = 32u - ResultVT.getFixedSizeInBits();
3130+
auto SubVal = DAG.getConstant(LeadingZeroes, SL, MVT::i32);
3131+
auto NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3132+
NewOp = DAG.getNode(Op.getOpcode(), SL, MVT::i32, NewOp);
3133+
NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, SubVal);
31443134
return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
31453135
}
31463136

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

+7 -37
Original file line number | Diff line number | Diff line change
@@ -1270,22 +1270,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
12701270
.custom();
12711271

12721272
// The 64-bit versions produce 32-bit results, but only on the SALU.
1273-
getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
1274-
.legalFor({{S32, S32}, {S32, S64}})
1275-
.customIf(scalarNarrowerThan(1, 32))
1276-
.clampScalar(0, S32, S32)
1277-
.clampScalar(1, S32, S64)
1278-
.scalarize(0)
1279-
.widenScalarToNextPow2(0, 32)
1280-
.widenScalarToNextPow2(1, 32);
1281-
1282-
getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
1283-
.legalFor({{S32, S32}, {S32, S64}})
1284-
.clampScalar(0, S32, S32)
1285-
.clampScalar(1, S32, S64)
1286-
.scalarize(0)
1287-
.widenScalarToNextPow2(0, 32)
1288-
.widenScalarToNextPow2(1, 32);
1273+
getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
1274+
.legalFor({{S32, S32}, {S32, S64}})
1275+
.clampScalar(0, S32, S32)
1276+
.clampScalar(1, S32, S64)
1277+
.scalarize(0)
1278+
.widenScalarToNextPow2(0, 32)
1279+
.widenScalarToNextPow2(1, 32);
12891280

12901281
// S64 is only legal on SALU, and needs to be broken into 32-bit elements in
12911282
// RegBankSelect.
@@ -2137,8 +2128,6 @@ bool AMDGPULegalizerInfo::legalizeCustom(
21372128
case TargetOpcode::G_CTLZ:
21382129
case TargetOpcode::G_CTTZ:
21392130
return legalizeCTLZ_CTTZ(MI, MRI, B);
2140-
case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2141-
return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
21422131
case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
21432132
return legalizeFPTruncRound(MI, B);
21442133
case TargetOpcode::G_STACKSAVE:
@@ -4156,25 +4145,6 @@ bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
41564145
return true;
41574146
}
41584147

4159-
bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
4160-
MachineRegisterInfo &MRI,
4161-
MachineIRBuilder &B) const {
4162-
Register Dst = MI.getOperand(0).getReg();
4163-
Register Src = MI.getOperand(1).getReg();
4164-
LLT SrcTy = MRI.getType(Src);
4165-
TypeSize NumBits = SrcTy.getSizeInBits();
4166-
4167-
assert(NumBits < 32u);
4168-
4169-
auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
4170-
auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4171-
auto Shift = B.buildLShr(S32, {Extend}, ShiftAmt);
4172-
auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4173-
B.buildTrunc(Dst, Ctlz);
4174-
MI.eraseFromParent();
4175-
return true;
4176-
}
4177-
41784148
// Check that this is a G_XOR x, -1
41794149
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
41804150
if (MI.getOpcode() != TargetOpcode::G_XOR)

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h

-2
Original file line number | Diff line number | Diff line change
@@ -108,8 +108,6 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
108108
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const;
109109
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI,
110110
MachineIRBuilder &B) const;
111-
bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI,
112-
MachineIRBuilder &B) const;
113111

114112
bool loadInputValue(Register DstReg, MachineIRBuilder &B,
115113
const ArgDescriptor *Arg,

llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir

+27 -20
Original file line number | Diff line number | Diff line change
@@ -81,12 +81,14 @@ body: |
8181
; CHECK: liveins: $vgpr0
8282
; CHECK-NEXT: {{ $}}
8383
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
84-
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
85-
; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
86-
; CHECK-NEXT: [[AMDGPU_FFBH_U32:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR]](s32)
87-
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
88-
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U32]], [[C1]]
89-
; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32)
84+
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
85+
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
86+
; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32)
87+
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
88+
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF]], [[C1]]
89+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
90+
; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
91+
; CHECK-NEXT: $vgpr0 = COPY [[AND1]](s32)
9092
%0:_(s32) = COPY $vgpr0
9193
%1:_(s16) = G_TRUNC %0
9294
%2:_(s16) = G_CTLZ_ZERO_UNDEF %1
@@ -147,15 +149,18 @@ body: |
147149
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
148150
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
149151
; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
150-
; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
151-
; CHECK-NEXT: [[AMDGPU_FFBH_U32:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR1]](s32)
152-
; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[LSHR]], [[C]](s32)
153-
; CHECK-NEXT: [[AMDGPU_FFBH_U321:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR2]](s32)
154152
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
155-
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U32]], [[C1]]
156-
; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U321]], [[C1]]
157-
; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
158-
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
153+
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C1]]
154+
; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32)
155+
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF]], [[C]]
156+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
157+
; CHECK-NEXT: [[CTLZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[LSHR]](s32)
158+
; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF1]], [[C]]
159+
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SUB1]](s32)
160+
; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]]
161+
; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
162+
; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32)
163+
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[SHL]]
159164
; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
160165
; CHECK-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>)
161166
%0:_(<2 x s16>) = COPY $vgpr0
@@ -174,12 +179,14 @@ body: |
174179
; CHECK: liveins: $vgpr0
175180
; CHECK-NEXT: {{ $}}
176181
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
177-
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 25
178-
; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
179-
; CHECK-NEXT: [[FFBH:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR]](s32)
180-
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
181-
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FFBH]], [[C1]]
182-
; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32)
182+
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
183+
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
184+
; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32)
185+
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 25
186+
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF]], [[C1]]
187+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
188+
; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
189+
; CHECK-NEXT: $vgpr0 = COPY [[AND1]](s32)
183190
%0:_(s32) = COPY $vgpr0
184191
%1:_(s7) = G_TRUNC %0
185192
%2:_(s7) = G_CTLZ_ZERO_UNDEF %1

0 commit comments

Comments
 (0)