Skip to content

Commit fdbef6c

Browse files
authored
Faster unsigned division by constants (dotnet#52893)
* Faster unsigned division by constants * Fix Arm build and add some tests. * Improve register allocation * Fix ARM64 codegen * Fix MULHI flags * Remove ARM32 codegen * Widen 32bit UDIV to 64bit MULHI when possible. Improve register allocation. * Always widen 32bit UDIV to 64bit MUL/MULHI * Cleanup * Final optimization * Fix typo * Rebase and use inst_Mov instead of inst_RV_RV(INS_mov) * Fix formatting (1 space)
1 parent 7b7954a commit fdbef6c

15 files changed

+323
-120
lines changed

THIRD-PARTY-NOTICES.TXT

+10
Original file line numberDiff line numberDiff line change
@@ -942,3 +942,13 @@ OF SUCH DAMAGES.
942942
You acknowledge that this software is not designed, licensed or
943943
intended for use in the design, construction, operation or
944944
maintenance of any nuclear facility.
945+
946+
947+
License notice for "Faster Unsigned Division by Constants"
948+
------------------------------
949+
950+
Reference implementations of computing and using the "magic number" approach to dividing
951+
by constants, including codegen instructions. The unsigned division incorporates the
952+
"round down" optimization per ridiculous_fish.
953+
954+
This is free and unencumbered software. Any copyright is dedicated to the Public Domain.

src/coreclr/jit/assertionprop.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -5152,8 +5152,9 @@ Compiler::fgWalkResult Compiler::optVNConstantPropCurStmt(BasicBlock* block, Sta
51525152
case GT_INTRINSIC:
51535153
break;
51545154

5155+
case GT_INC_SATURATE:
51555156
case GT_MULHI:
5156-
assert(false && "Unexpected GT_MULHI node encountered before lowering");
5157+
assert(false && "Unexpected GT_INC_SATURATE/GT_MULHI node encountered before lowering");
51575158
break;
51585159

51595160
case GT_JTRUE:

src/coreclr/jit/codegen.h

+1
Original file line numberDiff line numberDiff line change
@@ -843,6 +843,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
843843

844844
void genCodeForDivMod(GenTreeOp* treeNode);
845845
void genCodeForMul(GenTreeOp* treeNode);
846+
void genCodeForIncSaturate(GenTree* treeNode);
846847
void genCodeForMulHi(GenTreeOp* treeNode);
847848
void genLeaInstruction(GenTreeAddrMode* lea);
848849
void genSetRegToCond(regNumber dstReg, GenTree* tree);

src/coreclr/jit/codegenarm64.cpp

+21
Original file line numberDiff line numberDiff line change
@@ -1753,6 +1753,27 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre
17531753
}
17541754
}
17551755

1756+
// Produce code for a GT_INC_SATURATE node.
1757+
void CodeGen::genCodeForIncSaturate(GenTree* tree)
1758+
{
1759+
regNumber targetReg = tree->GetRegNum();
1760+
1761+
// The arithmetic node must be sitting in a register (since it's not contained)
1762+
assert(!tree->isContained());
1763+
// The dst can only be a register.
1764+
assert(targetReg != REG_NA);
1765+
1766+
GenTree* operand = tree->gtGetOp1();
1767+
assert(!operand->isContained());
1768+
// The src must be a register.
1769+
regNumber operandReg = genConsumeReg(operand);
1770+
1771+
GetEmitter()->emitIns_R_R_I(INS_adds, emitActualTypeSize(tree), targetReg, operandReg, 1);
1772+
GetEmitter()->emitIns_R_R_COND(INS_cinv, emitActualTypeSize(tree), targetReg, targetReg, INS_COND_HS);
1773+
1774+
genProduceReg(tree);
1775+
}
1776+
17561777
// Generate code to get the high N bits of a N*N=2N bit multiplication result
17571778
void CodeGen::genCodeForMulHi(GenTreeOp* treeNode)
17581779
{

src/coreclr/jit/codegenarmarch.cpp

+4
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,10 @@ void CodeGen::genCodeForTreeNode(GenTree* treeNode)
303303

304304
#ifdef TARGET_ARM64
305305

306+
case GT_INC_SATURATE:
307+
genCodeForIncSaturate(treeNode);
308+
break;
309+
306310
case GT_MULHI:
307311
genCodeForMulHi(treeNode->AsOp());
308312
break;

src/coreclr/jit/codegenxarch.cpp

+21
Original file line numberDiff line numberDiff line change
@@ -605,6 +605,23 @@ void CodeGen::genCodeForBswap(GenTree* tree)
605605
genProduceReg(tree);
606606
}
607607

608+
// Produce code for a GT_INC_SATURATE node.
609+
void CodeGen::genCodeForIncSaturate(GenTree* tree)
610+
{
611+
regNumber targetReg = tree->GetRegNum();
612+
var_types targetType = tree->TypeGet();
613+
614+
GenTree* operand = tree->gtGetOp1();
615+
assert(operand->isUsedFromReg());
616+
regNumber operandReg = genConsumeReg(operand);
617+
618+
inst_Mov(targetType, targetReg, operandReg, /* canSkip */ true);
619+
inst_RV_IV(INS_add, targetReg, 1, emitActualTypeSize(targetType));
620+
inst_RV_IV(INS_sbb, targetReg, 0, emitActualTypeSize(targetType));
621+
622+
genProduceReg(tree);
623+
}
624+
608625
// Generate code to get the high N bits of a N*N=2N bit multiplication result
609626
void CodeGen::genCodeForMulHi(GenTreeOp* treeNode)
610627
{
@@ -1608,6 +1625,10 @@ void CodeGen::genCodeForTreeNode(GenTree* treeNode)
16081625
genCodeForIndir(treeNode->AsIndir());
16091626
break;
16101627

1628+
case GT_INC_SATURATE:
1629+
genCodeForIncSaturate(treeNode);
1630+
break;
1631+
16111632
case GT_MULHI:
16121633
#ifdef TARGET_X86
16131634
case GT_MUL_LONG:

src/coreclr/jit/compiler.h

+1
Original file line numberDiff line numberDiff line change
@@ -10806,6 +10806,7 @@ class GenTreeVisitor
1080610806
case GT_RETFILT:
1080710807
case GT_RUNTIMELOOKUP:
1080810808
case GT_KEEPALIVE:
10809+
case GT_INC_SATURATE:
1080910810
{
1081010811
GenTreeUnOp* const unOp = node->AsUnOp();
1081110812
if (unOp->gtOp1 != nullptr)

src/coreclr/jit/compiler.hpp

+1
Original file line numberDiff line numberDiff line change
@@ -4328,6 +4328,7 @@ void GenTree::VisitOperands(TVisitor visitor)
43284328
#endif // FEATURE_ARG_SPLIT
43294329
case GT_RETURNTRAP:
43304330
case GT_KEEPALIVE:
4331+
case GT_INC_SATURATE:
43314332
visitor(this->AsUnOp()->gtOp1);
43324333
return;
43334334

src/coreclr/jit/gentree.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -5217,6 +5217,7 @@ bool GenTree::TryGetUse(GenTree* def, GenTree*** use)
52175217
case GT_BSWAP:
52185218
case GT_BSWAP16:
52195219
case GT_KEEPALIVE:
5220+
case GT_INC_SATURATE:
52205221
if (def == this->AsUnOp()->gtOp1)
52215222
{
52225223
*use = &this->AsUnOp()->gtOp1;
@@ -9315,6 +9316,7 @@ GenTreeUseEdgeIterator::GenTreeUseEdgeIterator(GenTree* node)
93159316
case GT_BSWAP:
93169317
case GT_BSWAP16:
93179318
case GT_KEEPALIVE:
9319+
case GT_INC_SATURATE:
93189320
#if FEATURE_ARG_SPLIT
93199321
case GT_PUTARG_SPLIT:
93209322
#endif // FEATURE_ARG_SPLIT

src/coreclr/jit/gtlist.h

+1
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ GTNODE(RSH , GenTreeOp ,0,GTK_BINOP)
132132
GTNODE(RSZ , GenTreeOp ,0,GTK_BINOP)
133133
GTNODE(ROL , GenTreeOp ,0,GTK_BINOP)
134134
GTNODE(ROR , GenTreeOp ,0,GTK_BINOP)
135+
GTNODE(INC_SATURATE , GenTreeOp ,0,GTK_UNOP) // saturating increment, used in division by a constant (LowerUnsignedDivOrMod)
135136
GTNODE(MULHI , GenTreeOp ,1,GTK_BINOP) // returns high bits (top N bits of the 2N bit result of an NxN multiply)
136137
// GT_MULHI is used in division by a constant (fgMorphDivByConst). We turn
137138
// the div into a MULHI + some adjustments. In codegen, we only use the

src/coreclr/jit/lower.cpp

+117-55
Original file line numberDiff line numberDiff line change
@@ -5171,31 +5171,48 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod)
51715171
if (!comp->opts.MinOpts() && (divisorValue >= 3))
51725172
{
51735173
size_t magic;
5174-
bool add;
5175-
int shift;
5174+
bool increment;
5175+
int preShift;
5176+
int postShift;
5177+
bool simpleMul = false;
51765178

51775179
if (type == TYP_INT)
51785180
{
5179-
magic = MagicDivide::GetUnsigned32Magic(static_cast<uint32_t>(divisorValue), &add, &shift);
5181+
magic =
5182+
MagicDivide::GetUnsigned32Magic(static_cast<uint32_t>(divisorValue), &increment, &preShift, &postShift);
5183+
5184+
#ifdef TARGET_64BIT
5185+
// avoid inc_saturate/multiple shifts by widening to 32x64 MULHI
5186+
if (increment || (preShift
5187+
#ifdef TARGET_XARCH
5188+
// IMUL reg,reg,imm32 can't be used if magic<0 because of sign-extension
5189+
&& static_cast<int32_t>(magic) < 0
5190+
#endif
5191+
))
5192+
{
5193+
magic = MagicDivide::GetUnsigned64Magic(static_cast<uint64_t>(divisorValue), &increment, &preShift,
5194+
&postShift, 32);
5195+
}
5196+
// otherwise just widen to regular multiplication
5197+
else
5198+
{
5199+
postShift += 32;
5200+
simpleMul = true;
5201+
}
5202+
#endif
51805203
}
51815204
else
51825205
{
51835206
#ifdef TARGET_64BIT
5184-
magic = MagicDivide::GetUnsigned64Magic(static_cast<uint64_t>(divisorValue), &add, &shift);
5207+
magic =
5208+
MagicDivide::GetUnsigned64Magic(static_cast<uint64_t>(divisorValue), &increment, &preShift, &postShift);
51855209
#else
51865210
unreached();
51875211
#endif
51885212
}
51895213
assert(divMod->MarkedDivideByConstOptimized());
51905214

5191-
// Depending on the "add" flag returned by GetUnsignedMagicNumberForDivide we need to generate:
5192-
// add == false (when divisor == 3 for example):
5193-
// div = (dividend MULHI magic) RSZ shift
5194-
// add == true (when divisor == 7 for example):
5195-
// mulhi = dividend MULHI magic
5196-
// div = (((dividend SUB mulhi) RSZ 1) ADD mulhi)) RSZ (shift - 1)
5197-
const bool requiresAdjustment = add;
5198-
const bool requiresDividendMultiuse = requiresAdjustment || !isDiv;
5215+
const bool requiresDividendMultiuse = !isDiv;
51995216
const BasicBlock::weight_t curBBWeight = m_block->getBBWeight(comp);
52005217

52015218
if (requiresDividendMultiuse)
@@ -5204,62 +5221,107 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod)
52045221
dividend = ReplaceWithLclVar(dividendUse);
52055222
}
52065223

5207-
// Insert a new GT_MULHI node before the existing GT_UDIV/GT_UMOD node.
5208-
// The existing node will later be transformed into a GT_RSZ/GT_SUB that
5209-
// computes the final result. This way don't need to find and change the use
5210-
// of the existing node.
5211-
GenTree* mulhi = comp->gtNewOperNode(GT_MULHI, type, dividend, divisor);
5212-
mulhi->gtFlags |= GTF_UNSIGNED;
5213-
divisor->AsIntCon()->SetIconValue(magic);
5214-
BlockRange().InsertBefore(divMod, mulhi);
5215-
GenTree* firstNode = mulhi;
5224+
GenTree* firstNode = nullptr;
5225+
GenTree* adjustedDividend = dividend;
52165226

5217-
if (requiresAdjustment)
5227+
// If "increment" flag is returned by GetUnsignedMagic we need to do Saturating Increment first
5228+
if (increment)
52185229
{
5219-
dividend = comp->gtNewLclvNode(dividend->AsLclVar()->GetLclNum(), dividend->TypeGet());
5220-
GenTree* sub = comp->gtNewOperNode(GT_SUB, type, dividend, mulhi);
5221-
BlockRange().InsertBefore(divMod, dividend, sub);
5222-
5223-
GenTree* one = comp->gtNewIconNode(1, TYP_INT);
5224-
GenTree* rsz = comp->gtNewOperNode(GT_RSZ, type, sub, one);
5225-
BlockRange().InsertBefore(divMod, one, rsz);
5226-
5227-
LIR::Use mulhiUse(BlockRange(), &sub->AsOp()->gtOp2, sub);
5228-
mulhi = ReplaceWithLclVar(mulhiUse);
5229-
5230-
mulhi = comp->gtNewLclvNode(mulhi->AsLclVar()->GetLclNum(), mulhi->TypeGet());
5231-
GenTree* add = comp->gtNewOperNode(GT_ADD, type, rsz, mulhi);
5232-
BlockRange().InsertBefore(divMod, mulhi, add);
5233-
5234-
mulhi = add;
5235-
shift -= 1;
5230+
adjustedDividend = comp->gtNewOperNode(GT_INC_SATURATE, type, adjustedDividend);
5231+
BlockRange().InsertBefore(divMod, adjustedDividend);
5232+
firstNode = adjustedDividend;
5233+
assert(!preShift);
52365234
}
5235+
// if "preShift" is required, then do a right shift before
5236+
else if (preShift)
5237+
{
5238+
GenTree* preShiftBy = comp->gtNewIconNode(preShift, TYP_INT);
5239+
adjustedDividend = comp->gtNewOperNode(GT_RSZ, type, adjustedDividend, preShiftBy);
5240+
BlockRange().InsertBefore(divMod, preShiftBy, adjustedDividend);
5241+
firstNode = preShiftBy;
5242+
}
5243+
else if (type != TYP_I_IMPL)
5244+
{
5245+
adjustedDividend = comp->gtNewCastNode(TYP_I_IMPL, adjustedDividend, true, TYP_U_IMPL);
5246+
BlockRange().InsertBefore(divMod, adjustedDividend);
5247+
firstNode = adjustedDividend;
5248+
}
5249+
5250+
#ifdef TARGET_XARCH
5251+
// force input transformation to RAX because the following MULHI will kill RDX:RAX anyway and LSRA often causes
5252+
// redundant copies otherwise
5253+
if (firstNode && !simpleMul)
5254+
adjustedDividend->SetRegNum(REG_RAX);
5255+
#endif
52375256

5238-
GenTree* shiftBy = comp->gtNewIconNode(shift, TYP_INT);
5239-
BlockRange().InsertBefore(divMod, shiftBy);
5257+
divisor->gtType = TYP_I_IMPL;
5258+
divisor->AsIntCon()->SetIconValue(magic);
52405259

5241-
if (isDiv)
5260+
if (isDiv && !postShift && type == TYP_I_IMPL)
52425261
{
5243-
divMod->SetOper(GT_RSZ);
5244-
divMod->gtOp1 = mulhi;
5245-
divMod->gtOp2 = shiftBy;
5262+
divMod->SetOper(GT_MULHI);
5263+
divMod->gtOp1 = adjustedDividend;
5264+
divMod->gtFlags |= GTF_UNSIGNED;
52465265
}
52475266
else
52485267
{
5249-
GenTree* div = comp->gtNewOperNode(GT_RSZ, type, mulhi, shiftBy);
5268+
// Insert a new GT_MULHI node before the existing GT_UDIV/GT_UMOD node.
5269+
// The existing node will later be transformed into a GT_RSZ/GT_SUB that
5270+
// computes the final result. This way we don't need to find and change the use
5271+
// of the existing node.
5272+
GenTree* mulhi = comp->gtNewOperNode(simpleMul ? GT_MUL : GT_MULHI, TYP_I_IMPL, adjustedDividend, divisor);
5273+
mulhi->gtFlags |= GTF_UNSIGNED;
5274+
BlockRange().InsertBefore(divMod, mulhi);
5275+
if (!firstNode)
5276+
firstNode = mulhi;
5277+
5278+
if (postShift)
5279+
{
5280+
GenTree* shiftBy = comp->gtNewIconNode(postShift, TYP_INT);
5281+
BlockRange().InsertBefore(divMod, shiftBy);
52505282

5251-
// divisor UMOD dividend = dividend SUB (div MUL divisor)
5252-
GenTree* divisor = comp->gtNewIconNode(divisorValue, type);
5253-
GenTree* mul = comp->gtNewOperNode(GT_MUL, type, div, divisor);
5254-
dividend = comp->gtNewLclvNode(dividend->AsLclVar()->GetLclNum(), dividend->TypeGet());
5283+
if (isDiv && type == TYP_I_IMPL)
5284+
{
5285+
divMod->SetOper(GT_RSZ);
5286+
divMod->gtOp1 = mulhi;
5287+
divMod->gtOp2 = shiftBy;
5288+
}
5289+
else
5290+
{
5291+
mulhi = comp->gtNewOperNode(GT_RSZ, TYP_I_IMPL, mulhi, shiftBy);
5292+
BlockRange().InsertBefore(divMod, mulhi);
5293+
}
5294+
}
5295+
5296+
if (!isDiv)
5297+
{
5298+
// dividend UMOD divisor = dividend SUB (div MUL divisor)
5299+
GenTree* divisor = comp->gtNewIconNode(divisorValue, type);
5300+
GenTree* mul = comp->gtNewOperNode(GT_MUL, type, mulhi, divisor);
5301+
dividend = comp->gtNewLclvNode(dividend->AsLclVar()->GetLclNum(), dividend->TypeGet());
52555302

5256-
divMod->SetOper(GT_SUB);
5257-
divMod->gtOp1 = dividend;
5258-
divMod->gtOp2 = mul;
5303+
divMod->SetOper(GT_SUB);
5304+
divMod->gtOp1 = dividend;
5305+
divMod->gtOp2 = mul;
52595306

5260-
BlockRange().InsertBefore(divMod, div, divisor, mul, dividend);
5307+
BlockRange().InsertBefore(divMod, divisor, mul, dividend);
5308+
}
5309+
else if (type != TYP_I_IMPL)
5310+
{
5311+
#ifdef TARGET_ARMARCH
5312+
divMod->SetOper(GT_CAST);
5313+
divMod->gtFlags |= GTF_UNSIGNED;
5314+
divMod->AsCast()->gtCastType = TYP_UINT;
5315+
#else
5316+
divMod->SetOper(GT_BITCAST);
5317+
#endif
5318+
divMod->gtOp1 = mulhi;
5319+
divMod->gtOp2 = nullptr;
5320+
}
52615321
}
5262-
ContainCheckRange(firstNode, divMod);
5322+
5323+
if (firstNode)
5324+
ContainCheckRange(firstNode, divMod);
52635325
return true;
52645326
}
52655327
#endif

0 commit comments

Comments
 (0)