Skip to content

Commit 04336da

Browse files
toppercvirnarula
authored and committed
[TargetLowering][RISCV][X86] Support even divisors in expandDIVREMByConstant.
If the divisor is even, we can first shift the dividend and divisor right by the number of trailing zeros. Now the divisor is odd and we can do the original algorithm to calculate a remainder. Then we shift that remainder left by the number of trailing zeros and add the bits that were shifted out of the dividend. Reviewed By: RKSimon Differential Revision: https://reviews.llvm.org/D135541
1 parent 78a064d commit 04336da

File tree

5 files changed

+276
-117
lines changed

5 files changed

+276
-117
lines changed

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 79 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -7168,8 +7168,17 @@ bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
71687168
// Remainder = Sum % Constant
71697169
// This is based on "Remainder by Summing Digits" from Hacker's Delight.
71707170
//
7171-
// For division, we can compute the remainder, subtract it from the dividend,
7172-
// and then multiply by the multiplicative inverse modulo (1 << (BitWidth / 2)).
7171+
// For division, we can compute the remainder using the algorithm described
7172+
// above, subtract it from the dividend to get an exact multiple of Constant.
7173+
// Then multiply that exact multiple by the multiplicative inverse modulo
7174+
// (1 << (BitWidth / 2)) to get the quotient.
7175+
7176+
// If Constant is even, we can shift right the dividend and the divisor by the
7177+
// number of trailing zeros in Constant before applying the remainder algorithm.
7178+
// If we're after the quotient, we can subtract this value from the shifted
7179+
// dividend and multiply by the multiplicative inverse of the shifted divisor.
7180+
// If we want the remainder, we shift the value left by the number of trailing
7181+
// zeros and add the bits that were shifted out of the dividend.
71737182
bool TargetLowering::expandDIVREMByConstant(SDNode *N,
71747183
SmallVectorImpl<SDValue> &Result,
71757184
EVT HiLoVT, SelectionDAG &DAG,
@@ -7188,7 +7197,7 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
71887197
if (!CN)
71897198
return false;
71907199

7191-
const APInt &Divisor = CN->getAPIntValue();
7200+
APInt Divisor = CN->getAPIntValue();
71927201
unsigned BitWidth = Divisor.getBitWidth();
71937202
unsigned HBitWidth = BitWidth / 2;
71947203
assert(VT.getScalarSizeInBits() == BitWidth &&
@@ -7209,12 +7218,20 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
72097218
if (DAG.shouldOptForSize())
72107219
return false;
72117220

7212-
// Early out for 0, 1 or even divisors.
7213-
if (Divisor.ule(1) || Divisor[0] == 0)
7221+
// Early out for 0 or 1 divisors.
7222+
if (Divisor.ule(1))
72147223
return false;
72157224

7225+
// If the divisor is even, shift it until it becomes odd.
7226+
unsigned TrailingZeros = 0;
7227+
if (!Divisor[0]) {
7228+
TrailingZeros = Divisor.countTrailingZeros();
7229+
Divisor.lshrInPlace(TrailingZeros);
7230+
}
7231+
72167232
SDLoc dl(N);
72177233
SDValue Sum;
7234+
SDValue PartialRem;
72187235

72197236
// If (1 << HBitWidth) % divisor == 1, we can add the two halves together and
72207237
// then add in the carry.
@@ -7229,6 +7246,27 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
72297246
DAG.getIntPtrConstant(1, dl));
72307247
}
72317248

7249+
// Shift the input by the number of TrailingZeros in the divisor. The
7250+
// shifted out bits will be added to the remainder later.
7251+
if (TrailingZeros) {
7252+
LL = DAG.getNode(
7253+
ISD::OR, dl, HiLoVT,
7254+
DAG.getNode(ISD::SRL, dl, HiLoVT, LL,
7255+
DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl)),
7256+
DAG.getNode(ISD::SHL, dl, HiLoVT, LH,
7257+
DAG.getShiftAmountConstant(HBitWidth - TrailingZeros,
7258+
HiLoVT, dl)));
7259+
LH = DAG.getNode(ISD::SRL, dl, HiLoVT, LH,
7260+
DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl));
7261+
7262+
// Save the shifted off bits if we need the remainder.
7263+
if (Opcode != ISD::UDIV) {
7264+
APInt Mask = APInt::getLowBitsSet(HBitWidth, TrailingZeros);
7265+
PartialRem = DAG.getNode(ISD::AND, dl, HiLoVT, LL,
7266+
DAG.getConstant(Mask, dl, HiLoVT));
7267+
}
7268+
}
7269+
72327270
// Use addcarry if we can, otherwise use a compare to detect overflow.
72337271
EVT SetCCType =
72347272
getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), HiLoVT);
@@ -7260,45 +7298,45 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
72607298
SDValue RemL =
72617299
DAG.getNode(ISD::UREM, dl, HiLoVT, Sum,
72627300
DAG.getConstant(Divisor.trunc(HBitWidth), dl, HiLoVT));
7263-
// High half of the remainder is 0.
72647301
SDValue RemH = DAG.getConstant(0, dl, HiLoVT);
72657302

7266-
// If we only want remainder, we're done.
7267-
if (Opcode == ISD::UREM) {
7268-
Result.push_back(RemL);
7269-
Result.push_back(RemH);
7270-
return true;
7271-
}
7272-
7273-
// Otherwise, we need to compute the quotient.
7274-
7275-
// Join the remainder halves.
7276-
SDValue Rem = DAG.getNode(ISD::BUILD_PAIR, dl, VT, RemL, RemH);
7277-
7278-
// Subtract the remainder from the input.
7279-
SDValue In = DAG.getNode(ISD::SUB, dl, VT, N->getOperand(0), Rem);
7280-
7281-
// Multiply by the multiplicative inverse of the divisor modulo
7282-
// (1 << BitWidth).
7283-
APInt Mod = APInt::getSignedMinValue(BitWidth + 1);
7284-
APInt MulFactor = Divisor.zext(BitWidth + 1);
7285-
MulFactor = MulFactor.multiplicativeInverse(Mod);
7286-
MulFactor = MulFactor.trunc(BitWidth);
7287-
7288-
SDValue Quotient =
7289-
DAG.getNode(ISD::MUL, dl, VT, In, DAG.getConstant(MulFactor, dl, VT));
7290-
7291-
// Split the quotient into low and high parts.
7292-
SDValue QuotL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HiLoVT, Quotient,
7293-
DAG.getIntPtrConstant(0, dl));
7294-
SDValue QuotH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HiLoVT, Quotient,
7295-
DAG.getIntPtrConstant(1, dl));
7296-
Result.push_back(QuotL);
7297-
Result.push_back(QuotH);
7298-
// For DIVREM, also return the remainder parts.
7299-
if (Opcode == ISD::UDIVREM) {
7303+
if (Opcode != ISD::UREM) {
7304+
// Subtract the remainder from the shifted dividend.
7305+
SDValue Dividend = DAG.getNode(ISD::BUILD_PAIR, dl, VT, LL, LH);
7306+
SDValue Rem = DAG.getNode(ISD::BUILD_PAIR, dl, VT, RemL, RemH);
7307+
7308+
Dividend = DAG.getNode(ISD::SUB, dl, VT, Dividend, Rem);
7309+
7310+
// Multiply by the multiplicative inverse of the divisor modulo
7311+
// (1 << BitWidth).
7312+
APInt Mod = APInt::getSignedMinValue(BitWidth + 1);
7313+
APInt MulFactor = Divisor.zext(BitWidth + 1);
7314+
MulFactor = MulFactor.multiplicativeInverse(Mod);
7315+
MulFactor = MulFactor.trunc(BitWidth);
7316+
7317+
SDValue Quotient = DAG.getNode(ISD::MUL, dl, VT, Dividend,
7318+
DAG.getConstant(MulFactor, dl, VT));
7319+
7320+
// Split the quotient into low and high parts.
7321+
SDValue QuotL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HiLoVT, Quotient,
7322+
DAG.getIntPtrConstant(0, dl));
7323+
SDValue QuotH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HiLoVT, Quotient,
7324+
DAG.getIntPtrConstant(1, dl));
7325+
Result.push_back(QuotL);
7326+
Result.push_back(QuotH);
7327+
}
7328+
7329+
if (Opcode != ISD::UDIV) {
7330+
// If we shifted the input, shift the remainder left and add the bits we
7331+
// shifted off the input.
7332+
if (TrailingZeros) {
7333+
APInt Mask = APInt::getLowBitsSet(HBitWidth, TrailingZeros);
7334+
RemL = DAG.getNode(ISD::SHL, dl, HiLoVT, RemL,
7335+
DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl));
7336+
RemL = DAG.getNode(ISD::ADD, dl, HiLoVT, RemL, PartialRem);
7337+
}
73007338
Result.push_back(RemL);
7301-
Result.push_back(RemH);
7339+
Result.push_back(DAG.getConstant(0, dl, HiLoVT));
73027340
}
73037341

73047342
return true;

llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll

Lines changed: 49 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -502,24 +502,59 @@ define iXLen2 @test_udiv_65537(iXLen2 %x) nounwind {
502502
define iXLen2 @test_udiv_12(iXLen2 %x) nounwind {
503503
; RV32-LABEL: test_udiv_12:
504504
; RV32: # %bb.0:
505-
; RV32-NEXT: addi sp, sp, -16
506-
; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
507-
; RV32-NEXT: li a2, 12
508-
; RV32-NEXT: li a3, 0
509-
; RV32-NEXT: call __udivdi3@plt
510-
; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
511-
; RV32-NEXT: addi sp, sp, 16
505+
; RV32-NEXT: slli a2, a1, 30
506+
; RV32-NEXT: srli a0, a0, 2
507+
; RV32-NEXT: or a0, a0, a2
508+
; RV32-NEXT: srli a1, a1, 2
509+
; RV32-NEXT: add a2, a0, a1
510+
; RV32-NEXT: sltu a3, a2, a0
511+
; RV32-NEXT: add a2, a2, a3
512+
; RV32-NEXT: lui a3, 699051
513+
; RV32-NEXT: addi a4, a3, -1365
514+
; RV32-NEXT: mulhu a5, a2, a4
515+
; RV32-NEXT: srli a6, a5, 1
516+
; RV32-NEXT: andi a5, a5, -2
517+
; RV32-NEXT: add a5, a5, a6
518+
; RV32-NEXT: sub a2, a2, a5
519+
; RV32-NEXT: sub a5, a0, a2
520+
; RV32-NEXT: addi a3, a3, -1366
521+
; RV32-NEXT: mul a3, a5, a3
522+
; RV32-NEXT: mulhu a6, a5, a4
523+
; RV32-NEXT: add a3, a6, a3
524+
; RV32-NEXT: sltu a0, a0, a2
525+
; RV32-NEXT: sub a0, a1, a0
526+
; RV32-NEXT: mul a0, a0, a4
527+
; RV32-NEXT: add a1, a3, a0
528+
; RV32-NEXT: mul a0, a5, a4
512529
; RV32-NEXT: ret
513530
;
514531
; RV64-LABEL: test_udiv_12:
515532
; RV64: # %bb.0:
516-
; RV64-NEXT: addi sp, sp, -16
517-
; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
518-
; RV64-NEXT: li a2, 12
519-
; RV64-NEXT: li a3, 0
520-
; RV64-NEXT: call __udivti3@plt
521-
; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
522-
; RV64-NEXT: addi sp, sp, 16
533+
; RV64-NEXT: slli a2, a1, 62
534+
; RV64-NEXT: srli a0, a0, 2
535+
; RV64-NEXT: or a0, a0, a2
536+
; RV64-NEXT: srli a1, a1, 2
537+
; RV64-NEXT: lui a2, %hi(.LCPI10_0)
538+
; RV64-NEXT: ld a2, %lo(.LCPI10_0)(a2)
539+
; RV64-NEXT: add a3, a0, a1
540+
; RV64-NEXT: sltu a4, a3, a0
541+
; RV64-NEXT: add a3, a3, a4
542+
; RV64-NEXT: mulhu a4, a3, a2
543+
; RV64-NEXT: srli a5, a4, 1
544+
; RV64-NEXT: andi a4, a4, -2
545+
; RV64-NEXT: lui a6, %hi(.LCPI10_1)
546+
; RV64-NEXT: ld a6, %lo(.LCPI10_1)(a6)
547+
; RV64-NEXT: add a4, a4, a5
548+
; RV64-NEXT: sub a3, a3, a4
549+
; RV64-NEXT: sub a4, a0, a3
550+
; RV64-NEXT: mul a5, a4, a6
551+
; RV64-NEXT: mulhu a6, a4, a2
552+
; RV64-NEXT: add a5, a6, a5
553+
; RV64-NEXT: sltu a0, a0, a3
554+
; RV64-NEXT: sub a0, a1, a0
555+
; RV64-NEXT: mul a0, a0, a2
556+
; RV64-NEXT: add a1, a5, a0
557+
; RV64-NEXT: mul a0, a4, a2
523558
; RV64-NEXT: ret
524559
%a = udiv iXLen2 %x, 12
525560
ret iXLen2 %a

llvm/test/CodeGen/RISCV/split-urem-by-constant.ll

Lines changed: 36 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -335,24 +335,46 @@ define iXLen2 @test_urem_65537(iXLen2 %x) nounwind {
335335
define iXLen2 @test_urem_12(iXLen2 %x) nounwind {
336336
; RV32-LABEL: test_urem_12:
337337
; RV32: # %bb.0:
338-
; RV32-NEXT: addi sp, sp, -16
339-
; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
340-
; RV32-NEXT: li a2, 12
341-
; RV32-NEXT: li a3, 0
342-
; RV32-NEXT: call __umoddi3@plt
343-
; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
344-
; RV32-NEXT: addi sp, sp, 16
338+
; RV32-NEXT: slli a2, a1, 30
339+
; RV32-NEXT: srli a0, a0, 2
340+
; RV32-NEXT: or a0, a0, a2
341+
; RV32-NEXT: srli a1, a1, 2
342+
; RV32-NEXT: add a1, a0, a1
343+
; RV32-NEXT: sltu a2, a1, a0
344+
; RV32-NEXT: add a1, a1, a2
345+
; RV32-NEXT: lui a2, 699051
346+
; RV32-NEXT: addi a2, a2, -1365
347+
; RV32-NEXT: mulhu a2, a1, a2
348+
; RV32-NEXT: srli a3, a2, 1
349+
; RV32-NEXT: andi a2, a2, -2
350+
; RV32-NEXT: add a2, a2, a3
351+
; RV32-NEXT: sub a1, a1, a2
352+
; RV32-NEXT: slli a1, a1, 2
353+
; RV32-NEXT: andi a0, a0, 3
354+
; RV32-NEXT: or a0, a1, a0
355+
; RV32-NEXT: li a1, 0
345356
; RV32-NEXT: ret
346357
;
347358
; RV64-LABEL: test_urem_12:
348359
; RV64: # %bb.0:
349-
; RV64-NEXT: addi sp, sp, -16
350-
; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
351-
; RV64-NEXT: li a2, 12
352-
; RV64-NEXT: li a3, 0
353-
; RV64-NEXT: call __umodti3@plt
354-
; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
355-
; RV64-NEXT: addi sp, sp, 16
360+
; RV64-NEXT: slli a2, a1, 62
361+
; RV64-NEXT: srli a0, a0, 2
362+
; RV64-NEXT: or a0, a0, a2
363+
; RV64-NEXT: srli a1, a1, 2
364+
; RV64-NEXT: lui a2, %hi(.LCPI10_0)
365+
; RV64-NEXT: ld a2, %lo(.LCPI10_0)(a2)
366+
; RV64-NEXT: add a1, a0, a1
367+
; RV64-NEXT: sltu a3, a1, a0
368+
; RV64-NEXT: add a1, a1, a3
369+
; RV64-NEXT: mulhu a2, a1, a2
370+
; RV64-NEXT: srli a3, a2, 1
371+
; RV64-NEXT: andi a2, a2, -2
372+
; RV64-NEXT: add a2, a2, a3
373+
; RV64-NEXT: sub a1, a1, a2
374+
; RV64-NEXT: slli a1, a1, 2
375+
; RV64-NEXT: andi a0, a0, 3
376+
; RV64-NEXT: or a0, a1, a0
377+
; RV64-NEXT: li a1, 0
356378
; RV64-NEXT: ret
357379
%a = urem iXLen2 %x, 12
358380
ret iXLen2 %a

llvm/test/CodeGen/X86/divide-by-constant.ll

Lines changed: 44 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -735,13 +735,23 @@ entry:
735735
define i64 @urem_i64_12(i64 %x) nounwind {
736736
; X32-LABEL: urem_i64_12:
737737
; X32: # %bb.0: # %entry
738-
; X32-NEXT: subl $12, %esp
739-
; X32-NEXT: pushl $0
740-
; X32-NEXT: pushl $12
741-
; X32-NEXT: pushl {{[0-9]+}}(%esp)
742-
; X32-NEXT: pushl {{[0-9]+}}(%esp)
743-
; X32-NEXT: calll __umoddi3
744-
; X32-NEXT: addl $28, %esp
738+
; X32-NEXT: pushl %esi
739+
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
740+
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
741+
; X32-NEXT: shrdl $2, %ecx, %esi
742+
; X32-NEXT: shrl $2, %ecx
743+
; X32-NEXT: addl %esi, %ecx
744+
; X32-NEXT: adcl $0, %ecx
745+
; X32-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
746+
; X32-NEXT: movl %ecx, %eax
747+
; X32-NEXT: mull %edx
748+
; X32-NEXT: shrl %edx
749+
; X32-NEXT: leal (%edx,%edx,2), %eax
750+
; X32-NEXT: subl %eax, %ecx
751+
; X32-NEXT: andl $3, %esi
752+
; X32-NEXT: leal (%esi,%ecx,4), %eax
753+
; X32-NEXT: xorl %edx, %edx
754+
; X32-NEXT: popl %esi
745755
; X32-NEXT: retl
746756
;
747757
; X64-LABEL: urem_i64_12:
@@ -1116,13 +1126,33 @@ entry:
11161126
define i64 @udiv_i64_12(i64 %x) nounwind {
11171127
; X32-LABEL: udiv_i64_12:
11181128
; X32: # %bb.0: # %entry
1119-
; X32-NEXT: subl $12, %esp
1120-
; X32-NEXT: pushl $0
1121-
; X32-NEXT: pushl $12
1122-
; X32-NEXT: pushl {{[0-9]+}}(%esp)
1123-
; X32-NEXT: pushl {{[0-9]+}}(%esp)
1124-
; X32-NEXT: calll __udivdi3
1125-
; X32-NEXT: addl $28, %esp
1129+
; X32-NEXT: pushl %ebx
1130+
; X32-NEXT: pushl %edi
1131+
; X32-NEXT: pushl %esi
1132+
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
1133+
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
1134+
; X32-NEXT: shrdl $2, %edi, %ecx
1135+
; X32-NEXT: shrl $2, %edi
1136+
; X32-NEXT: movl %ecx, %esi
1137+
; X32-NEXT: addl %edi, %esi
1138+
; X32-NEXT: adcl $0, %esi
1139+
; X32-NEXT: movl $-1431655765, %ebx # imm = 0xAAAAAAAB
1140+
; X32-NEXT: movl %esi, %eax
1141+
; X32-NEXT: mull %ebx
1142+
; X32-NEXT: shrl %edx
1143+
; X32-NEXT: leal (%edx,%edx,2), %eax
1144+
; X32-NEXT: subl %eax, %esi
1145+
; X32-NEXT: subl %esi, %ecx
1146+
; X32-NEXT: sbbl $0, %edi
1147+
; X32-NEXT: movl %ecx, %eax
1148+
; X32-NEXT: mull %ebx
1149+
; X32-NEXT: imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA
1150+
; X32-NEXT: addl %ecx, %edx
1151+
; X32-NEXT: imull $-1431655765, %edi, %ecx # imm = 0xAAAAAAAB
1152+
; X32-NEXT: addl %ecx, %edx
1153+
; X32-NEXT: popl %esi
1154+
; X32-NEXT: popl %edi
1155+
; X32-NEXT: popl %ebx
11261156
; X32-NEXT: retl
11271157
;
11281158
; X64-LABEL: udiv_i64_12:

0 commit comments

Comments
 (0)