Skip to content

Commit 04336da

Browse files
toppercvirnarula
authored and committed
[TargetLowering][RISCV][X86] Support even divisors in expandDIVREMByConstant.
If the divisor is even, we can first shift the dividend and divisor right by the number of trailing zeros. Now the divisor is odd and we can do the original algorithm to calculate a remainder. Then we shift that remainder left by the number of trailing zeros and add the bits that were shifted out of the dividend. Reviewed By: RKSimon Differential Revision: https://reviews.llvm.org/D135541
1 parent 78a064d commit 04336da

File tree

5 files changed

+276
-117
lines changed

5 files changed

+276
-117
lines changed

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 79 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -7168,8 +7168,17 @@ bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
71687168
// Remainder = Sum % Constant
71697169
// This is based on "Remainder by Summing Digits" from Hacker's Delight.
71707170
//
7171-
// For division, we can compute the remainder, subtract it from the dividend,
7172-
// and then multiply by the multiplicative inverse modulo (1 << (BitWidth / 2)).
7171+
// For division, we can compute the remainder using the algorithm described
7172+
// above, subtract it from the dividend to get an exact multiple of Constant.
7173+
// Then multiply that exact multiple by the multiplicative inverse modulo
7174+
// (1 << (BitWidth / 2)) to get the quotient.
7175+
7176+
// If Constant is even, we can shift right the dividend and the divisor by the
7177+
// number of trailing zeros in Constant before applying the remainder algorithm.
7178+
// If we're after the quotient, we can subtract this value from the shifted
7179+
// dividend and multiply by the multiplicative inverse of the shifted divisor.
7180+
// If we want the remainder, we shift the value left by the number of trailing
7181+
// zeros and add the bits that were shifted out of the dividend.
71737182
bool TargetLowering::expandDIVREMByConstant(SDNode *N,
71747183
SmallVectorImpl<SDValue> &Result,
71757184
EVT HiLoVT, SelectionDAG &DAG,
@@ -7188,7 +7197,7 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
71887197
if (!CN)
71897198
return false;
71907199

7191-
const APInt &Divisor = CN->getAPIntValue();
7200+
APInt Divisor = CN->getAPIntValue();
71927201
unsigned BitWidth = Divisor.getBitWidth();
71937202
unsigned HBitWidth = BitWidth / 2;
71947203
assert(VT.getScalarSizeInBits() == BitWidth &&
@@ -7209,12 +7218,20 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
72097218
if (DAG.shouldOptForSize())
72107219
return false;
72117220

7212-
// Early out for 0, 1 or even divisors.
7213-
if (Divisor.ule(1) || Divisor[0] == 0)
7221+
// Early out for 0 or 1 divisors.
7222+
if (Divisor.ule(1))
72147223
return false;
72157224

7225+
// If the divisor is even, shift it until it becomes odd.
7226+
unsigned TrailingZeros = 0;
7227+
if (!Divisor[0]) {
7228+
TrailingZeros = Divisor.countTrailingZeros();
7229+
Divisor.lshrInPlace(TrailingZeros);
7230+
}
7231+
72167232
SDLoc dl(N);
72177233
SDValue Sum;
7234+
SDValue PartialRem;
72187235

72197236
// If (1 << HBitWidth) % divisor == 1, we can add the two halves together and
72207237
// then add in the carry.
@@ -7229,6 +7246,27 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
72297246
DAG.getIntPtrConstant(1, dl));
72307247
}
72317248

7249+
// Shift the input by the number of TrailingZeros in the divisor. The
7250+
// shifted out bits will be added to the remainder later.
7251+
if (TrailingZeros) {
7252+
LL = DAG.getNode(
7253+
ISD::OR, dl, HiLoVT,
7254+
DAG.getNode(ISD::SRL, dl, HiLoVT, LL,
7255+
DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl)),
7256+
DAG.getNode(ISD::SHL, dl, HiLoVT, LH,
7257+
DAG.getShiftAmountConstant(HBitWidth - TrailingZeros,
7258+
HiLoVT, dl)));
7259+
LH = DAG.getNode(ISD::SRL, dl, HiLoVT, LH,
7260+
DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl));
7261+
7262+
// Save the shifted off bits if we need the remainder.
7263+
if (Opcode != ISD::UDIV) {
7264+
APInt Mask = APInt::getLowBitsSet(HBitWidth, TrailingZeros);
7265+
PartialRem = DAG.getNode(ISD::AND, dl, HiLoVT, LL,
7266+
DAG.getConstant(Mask, dl, HiLoVT));
7267+
}
7268+
}
7269+
72327270
// Use addcarry if we can, otherwise use a compare to detect overflow.
72337271
EVT SetCCType =
72347272
getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), HiLoVT);
@@ -7260,45 +7298,45 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
72607298
SDValue RemL =
72617299
DAG.getNode(ISD::UREM, dl, HiLoVT, Sum,
72627300
DAG.getConstant(Divisor.trunc(HBitWidth), dl, HiLoVT));
7263-
// High half of the remainder is 0.
72647301
SDValue RemH = DAG.getConstant(0, dl, HiLoVT);
72657302

7266-
// If we only want remainder, we're done.
7267-
if (Opcode == ISD::UREM) {
7268-
Result.push_back(RemL);
7269-
Result.push_back(RemH);
7270-
return true;
7271-
}
7272-
7273-
// Otherwise, we need to compute the quotient.
7274-
7275-
// Join the remainder halves.
7276-
SDValue Rem = DAG.getNode(ISD::BUILD_PAIR, dl, VT, RemL, RemH);
7277-
7278-
// Subtract the remainder from the input.
7279-
SDValue In = DAG.getNode(ISD::SUB, dl, VT, N->getOperand(0), Rem);
7280-
7281-
// Multiply by the multiplicative inverse of the divisor modulo
7282-
// (1 << BitWidth).
7283-
APInt Mod = APInt::getSignedMinValue(BitWidth + 1);
7284-
APInt MulFactor = Divisor.zext(BitWidth + 1);
7285-
MulFactor = MulFactor.multiplicativeInverse(Mod);
7286-
MulFactor = MulFactor.trunc(BitWidth);
7287-
7288-
SDValue Quotient =
7289-
DAG.getNode(ISD::MUL, dl, VT, In, DAG.getConstant(MulFactor, dl, VT));
7290-
7291-
// Split the quotient into low and high parts.
7292-
SDValue QuotL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HiLoVT, Quotient,
7293-
DAG.getIntPtrConstant(0, dl));
7294-
SDValue QuotH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HiLoVT, Quotient,
7295-
DAG.getIntPtrConstant(1, dl));
7296-
Result.push_back(QuotL);
7297-
Result.push_back(QuotH);
7298-
// For DIVREM, also return the remainder parts.
7299-
if (Opcode == ISD::UDIVREM) {
7303+
if (Opcode != ISD::UREM) {
7304+
// Subtract the remainder from the shifted dividend.
7305+
SDValue Dividend = DAG.getNode(ISD::BUILD_PAIR, dl, VT, LL, LH);
7306+
SDValue Rem = DAG.getNode(ISD::BUILD_PAIR, dl, VT, RemL, RemH);
7307+
7308+
Dividend = DAG.getNode(ISD::SUB, dl, VT, Dividend, Rem);
7309+
7310+
// Multiply by the multiplicative inverse of the divisor modulo
7311+
// (1 << BitWidth).
7312+
APInt Mod = APInt::getSignedMinValue(BitWidth + 1);
7313+
APInt MulFactor = Divisor.zext(BitWidth + 1);
7314+
MulFactor = MulFactor.multiplicativeInverse(Mod);
7315+
MulFactor = MulFactor.trunc(BitWidth);
7316+
7317+
SDValue Quotient = DAG.getNode(ISD::MUL, dl, VT, Dividend,
7318+
DAG.getConstant(MulFactor, dl, VT));
7319+
7320+
// Split the quotient into low and high parts.
7321+
SDValue QuotL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HiLoVT, Quotient,
7322+
DAG.getIntPtrConstant(0, dl));
7323+
SDValue QuotH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HiLoVT, Quotient,
7324+
DAG.getIntPtrConstant(1, dl));
7325+
Result.push_back(QuotL);
7326+
Result.push_back(QuotH);
7327+
}
7328+
7329+
if (Opcode != ISD::UDIV) {
7330+
// If we shifted the input, shift the remainder left and add the bits we
7331+
// shifted off the input.
7332+
if (TrailingZeros) {
7333+
APInt Mask = APInt::getLowBitsSet(HBitWidth, TrailingZeros);
7334+
RemL = DAG.getNode(ISD::SHL, dl, HiLoVT, RemL,
7335+
DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl));
7336+
RemL = DAG.getNode(ISD::ADD, dl, HiLoVT, RemL, PartialRem);
7337+
}
73007338
Result.push_back(RemL);
7301-
Result.push_back(RemH);
7339+
Result.push_back(DAG.getConstant(0, dl, HiLoVT));
73027340
}
73037341

73047342
return true;

llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll

Lines changed: 49 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -502,24 +502,59 @@ define iXLen2 @test_udiv_65537(iXLen2 %x) nounwind {
502502
define iXLen2 @test_udiv_12(iXLen2 %x) nounwind {
503503
; RV32-LABEL: test_udiv_12:
504504
; RV32: # %bb.0:
505-
; RV32-NEXT: addi sp, sp, -16
506-
; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
507-
; RV32-NEXT: li a2, 12
508-
; RV32-NEXT: li a3, 0
509-
; RV32-NEXT: call __udivdi3@plt
510-
; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
511-
; RV32-NEXT: addi sp, sp, 16
505+
; RV32-NEXT: slli a2, a1, 30
506+
; RV32-NEXT: srli a0, a0, 2
507+
; RV32-NEXT: or a0, a0, a2
508+
; RV32-NEXT: srli a1, a1, 2
509+
; RV32-NEXT: add a2, a0, a1
510+
; RV32-NEXT: sltu a3, a2, a0
511+
; RV32-NEXT: add a2, a2, a3
512+
; RV32-NEXT: lui a3, 699051
513+
; RV32-NEXT: addi a4, a3, -1365
514+
; RV32-NEXT: mulhu a5, a2, a4
515+
; RV32-NEXT: srli a6, a5, 1
516+
; RV32-NEXT: andi a5, a5, -2
517+
; RV32-NEXT: add a5, a5, a6
518+
; RV32-NEXT: sub a2, a2, a5
519+
; RV32-NEXT: sub a5, a0, a2
520+
; RV32-NEXT: addi a3, a3, -1366
521+
; RV32-NEXT: mul a3, a5, a3
522+
; RV32-NEXT: mulhu a6, a5, a4
523+
; RV32-NEXT: add a3, a6, a3
524+
; RV32-NEXT: sltu a0, a0, a2
525+
; RV32-NEXT: sub a0, a1, a0
526+
; RV32-NEXT: mul a0, a0, a4
527+
; RV32-NEXT: add a1, a3, a0
528+
; RV32-NEXT: mul a0, a5, a4
512529
; RV32-NEXT: ret
513530
;
514531
; RV64-LABEL: test_udiv_12:
515532
; RV64: # %bb.0:
516-
; RV64-NEXT: addi sp, sp, -16
517-
; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
518-
; RV64-NEXT: li a2, 12
519-
; RV64-NEXT: li a3, 0
520-
; RV64-NEXT: call __udivti3@plt
521-
; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
522-
; RV64-NEXT: addi sp, sp, 16
533+
; RV64-NEXT: slli a2, a1, 62
534+
; RV64-NEXT: srli a0, a0, 2
535+
; RV64-NEXT: or a0, a0, a2
536+
; RV64-NEXT: srli a1, a1, 2
537+
; RV64-NEXT: lui a2, %hi(.LCPI10_0)
538+
; RV64-NEXT: ld a2, %lo(.LCPI10_0)(a2)
539+
; RV64-NEXT: add a3, a0, a1
540+
; RV64-NEXT: sltu a4, a3, a0
541+
; RV64-NEXT: add a3, a3, a4
542+
; RV64-NEXT: mulhu a4, a3, a2
543+
; RV64-NEXT: srli a5, a4, 1
544+
; RV64-NEXT: andi a4, a4, -2
545+
; RV64-NEXT: lui a6, %hi(.LCPI10_1)
546+
; RV64-NEXT: ld a6, %lo(.LCPI10_1)(a6)
547+
; RV64-NEXT: add a4, a4, a5
548+
; RV64-NEXT: sub a3, a3, a4
549+
; RV64-NEXT: sub a4, a0, a3
550+
; RV64-NEXT: mul a5, a4, a6
551+
; RV64-NEXT: mulhu a6, a4, a2
552+
; RV64-NEXT: add a5, a6, a5
553+
; RV64-NEXT: sltu a0, a0, a3
554+
; RV64-NEXT: sub a0, a1, a0
555+
; RV64-NEXT: mul a0, a0, a2
556+
; RV64-NEXT: add a1, a5, a0
557+
; RV64-NEXT: mul a0, a4, a2
523558
; RV64-NEXT: ret
524559
%a = udiv iXLen2 %x, 12
525560
ret iXLen2 %a

llvm/test/CodeGen/RISCV/split-urem-by-constant.ll

Lines changed: 36 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -335,24 +335,46 @@ define iXLen2 @test_urem_65537(iXLen2 %x) nounwind {
335335
define iXLen2 @test_urem_12(iXLen2 %x) nounwind {
336336
; RV32-LABEL: test_urem_12:
337337
; RV32: # %bb.0:
338-
; RV32-NEXT: addi sp, sp, -16
339-
; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
340-
; RV32-NEXT: li a2, 12
341-
; RV32-NEXT: li a3, 0
342-
; RV32-NEXT: call __umoddi3@plt
343-
; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
344-
; RV32-NEXT: addi sp, sp, 16
338+
; RV32-NEXT: slli a2, a1, 30
339+
; RV32-NEXT: srli a0, a0, 2
340+
; RV32-NEXT: or a0, a0, a2
341+
; RV32-NEXT: srli a1, a1, 2
342+
; RV32-NEXT: add a1, a0, a1
343+
; RV32-NEXT: sltu a2, a1, a0
344+
; RV32-NEXT: add a1, a1, a2
345+
; RV32-NEXT: lui a2, 699051
346+
; RV32-NEXT: addi a2, a2, -1365
347+
; RV32-NEXT: mulhu a2, a1, a2
348+
; RV32-NEXT: srli a3, a2, 1
349+
; RV32-NEXT: andi a2, a2, -2
350+
; RV32-NEXT: add a2, a2, a3
351+
; RV32-NEXT: sub a1, a1, a2
352+
; RV32-NEXT: slli a1, a1, 2
353+
; RV32-NEXT: andi a0, a0, 3
354+
; RV32-NEXT: or a0, a1, a0
355+
; RV32-NEXT: li a1, 0
345356
; RV32-NEXT: ret
346357
;
347358
; RV64-LABEL: test_urem_12:
348359
; RV64: # %bb.0:
349-
; RV64-NEXT: addi sp, sp, -16
350-
; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
351-
; RV64-NEXT: li a2, 12
352-
; RV64-NEXT: li a3, 0
353-
; RV64-NEXT: call __umodti3@plt
354-
; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
355-
; RV64-NEXT: addi sp, sp, 16
360+
; RV64-NEXT: slli a2, a1, 62
361+
; RV64-NEXT: srli a0, a0, 2
362+
; RV64-NEXT: or a0, a0, a2
363+
; RV64-NEXT: srli a1, a1, 2
364+
; RV64-NEXT: lui a2, %hi(.LCPI10_0)
365+
; RV64-NEXT: ld a2, %lo(.LCPI10_0)(a2)
366+
; RV64-NEXT: add a1, a0, a1
367+
; RV64-NEXT: sltu a3, a1, a0
368+
; RV64-NEXT: add a1, a1, a3
369+
; RV64-NEXT: mulhu a2, a1, a2
370+
; RV64-NEXT: srli a3, a2, 1
371+
; RV64-NEXT: andi a2, a2, -2
372+
; RV64-NEXT: add a2, a2, a3
373+
; RV64-NEXT: sub a1, a1, a2
374+
; RV64-NEXT: slli a1, a1, 2
375+
; RV64-NEXT: andi a0, a0, 3
376+
; RV64-NEXT: or a0, a1, a0
377+
; RV64-NEXT: li a1, 0
356378
; RV64-NEXT: ret
357379
%a = urem iXLen2 %x, 12
358380
ret iXLen2 %a

llvm/test/CodeGen/X86/divide-by-constant.ll

Lines changed: 44 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -735,13 +735,23 @@ entry:
735735
define i64 @urem_i64_12(i64 %x) nounwind {
736736
; X32-LABEL: urem_i64_12:
737737
; X32: # %bb.0: # %entry
738-
; X32-NEXT: subl $12, %esp
739-
; X32-NEXT: pushl $0
740-
; X32-NEXT: pushl $12
741-
; X32-NEXT: pushl {{[0-9]+}}(%esp)
742-
; X32-NEXT: pushl {{[0-9]+}}(%esp)
743-
; X32-NEXT: calll __umoddi3
744-
; X32-NEXT: addl $28, %esp
738+
; X32-NEXT: pushl %esi
739+
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
740+
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
741+
; X32-NEXT: shrdl $2, %ecx, %esi
742+
; X32-NEXT: shrl $2, %ecx
743+
; X32-NEXT: addl %esi, %ecx
744+
; X32-NEXT: adcl $0, %ecx
745+
; X32-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
746+
; X32-NEXT: movl %ecx, %eax
747+
; X32-NEXT: mull %edx
748+
; X32-NEXT: shrl %edx
749+
; X32-NEXT: leal (%edx,%edx,2), %eax
750+
; X32-NEXT: subl %eax, %ecx
751+
; X32-NEXT: andl $3, %esi
752+
; X32-NEXT: leal (%esi,%ecx,4), %eax
753+
; X32-NEXT: xorl %edx, %edx
754+
; X32-NEXT: popl %esi
745755
; X32-NEXT: retl
746756
;
747757
; X64-LABEL: urem_i64_12:
@@ -1116,13 +1126,33 @@ entry:
11161126
define i64 @udiv_i64_12(i64 %x) nounwind {
11171127
; X32-LABEL: udiv_i64_12:
11181128
; X32: # %bb.0: # %entry
1119-
; X32-NEXT: subl $12, %esp
1120-
; X32-NEXT: pushl $0
1121-
; X32-NEXT: pushl $12
1122-
; X32-NEXT: pushl {{[0-9]+}}(%esp)
1123-
; X32-NEXT: pushl {{[0-9]+}}(%esp)
1124-
; X32-NEXT: calll __udivdi3
1125-
; X32-NEXT: addl $28, %esp
1129+
; X32-NEXT: pushl %ebx
1130+
; X32-NEXT: pushl %edi
1131+
; X32-NEXT: pushl %esi
1132+
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
1133+
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
1134+
; X32-NEXT: shrdl $2, %edi, %ecx
1135+
; X32-NEXT: shrl $2, %edi
1136+
; X32-NEXT: movl %ecx, %esi
1137+
; X32-NEXT: addl %edi, %esi
1138+
; X32-NEXT: adcl $0, %esi
1139+
; X32-NEXT: movl $-1431655765, %ebx # imm = 0xAAAAAAAB
1140+
; X32-NEXT: movl %esi, %eax
1141+
; X32-NEXT: mull %ebx
1142+
; X32-NEXT: shrl %edx
1143+
; X32-NEXT: leal (%edx,%edx,2), %eax
1144+
; X32-NEXT: subl %eax, %esi
1145+
; X32-NEXT: subl %esi, %ecx
1146+
; X32-NEXT: sbbl $0, %edi
1147+
; X32-NEXT: movl %ecx, %eax
1148+
; X32-NEXT: mull %ebx
1149+
; X32-NEXT: imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA
1150+
; X32-NEXT: addl %ecx, %edx
1151+
; X32-NEXT: imull $-1431655765, %edi, %ecx # imm = 0xAAAAAAAB
1152+
; X32-NEXT: addl %ecx, %edx
1153+
; X32-NEXT: popl %esi
1154+
; X32-NEXT: popl %edi
1155+
; X32-NEXT: popl %ebx
11261156
; X32-NEXT: retl
11271157
;
11281158
; X64-LABEL: udiv_i64_12:

0 commit comments

Comments
 (0)