Skip to content

Commit 354f01b

Browse files
pkwasnie-intelfda0
authored and committed
precompiled emulation inlining improvements
Improve the inlining mechanism for precompiled emulation functions (int64 math, DP math, etc.). Instead of disabling inlining entirely when the total number of inlined instructions reaches the threshold, inline as many functions as possible until the threshold is reached. (cherry picked from commit a239bf5)
1 parent 69e6245 commit 354f01b

File tree

5 files changed

+330
-66
lines changed

5 files changed

+330
-66
lines changed

IGC/Compiler/Optimizer/PreCompiledFuncImport.cpp

Lines changed: 211 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ SPDX-License-Identifier: MIT
1717
#include "llvm/IR/InstIterator.h"
1818
#include "llvm/Support/MemoryBuffer.h"
1919
#include "llvm/Support/GenericDomTree.h"
20+
#include "llvm/Transforms/Utils/Cloning.h"
2021
#include "llvm/Bitcode/BitcodeReader.h"
2122
#include "llvm/Bitcode/BitcodeWriter.h"
2223
#include "llvm/Linker/Linker.h"
@@ -632,11 +633,16 @@ bool PreCompiledFuncImport::runOnModule(Module& M)
632633
m_changed = false;
633634

634635
// When we test it, we need to set emuKind
635-
if (IGC_IS_FLAG_ENABLED(TestIGCPreCompiledFunctions))
636+
if (IGC_GET_FLAG_VALUE(TestIGCPreCompiledFunctions) == 1)
636637
{
637638
m_emuKind = EmuKind::EMU_DP;
638639
checkAndSetEnableSubroutine();
639640
}
641+
else if (IGC_GET_FLAG_VALUE(TestIGCPreCompiledFunctions) == 2)
642+
{
643+
m_emuKind = EmuKind::EMU_DP_DIV_SQRT;
644+
checkAndSetEnableSubroutine();
645+
}
640646
// sanity check
641647
if (m_emuKind == 0) {
642648
// Nothing to emulate
@@ -826,12 +832,11 @@ bool PreCompiledFuncImport::runOnModule(Module& M)
826832
}
827833
}
828834

829-
unsigned totalNumberOfInlinedInst = 0;
835+
llvm::SmallVector<ImportedFunction, 32> importedFunctions;
836+
unsigned totalNumberOfInlinedInst = 0, totalNumberOfPotentiallyInlinedInst = 0;
830837
int emuFC = (int)IGC_GET_FLAG_VALUE(EmulationFunctionControl);
831838

832-
// Post processing, set those imported functions as internal linkage
833-
// and alwaysinline. Also count how many instructions would be added
834-
// to the shader if inlining occurred.
839+
// Post processing, set those imported functions as internal linkage.
835840
for (auto II = M.begin(), IE = M.end(); II != IE; )
836841
{
837842
Function* Func = &(*II);
@@ -853,92 +858,101 @@ bool PreCompiledFuncImport::runOnModule(Module& M)
853858
continue;
854859
}
855860

856-
// Remove noinline/AlwaysInline attr if present.
857-
Func->removeFnAttr(llvm::Attribute::NoInline);
861+
if (std::find(importedFunctions.begin(), importedFunctions.end(), Func) == importedFunctions.end())
862+
importedFunctions.push_back(Func);
863+
}
864+
else
865+
{
866+
// Make sure original func isn't inlined accidentally.
858867
Func->removeFnAttr(llvm::Attribute::AlwaysInline);
868+
}
869+
}
859870

860-
if (m_enableCallForEmulation &&
861-
emuFC != FLAG_FCALL_DEFAULT &&
862-
emuFC != FLAG_FCALL_FORCE_INLINE)
863-
{
864-
// Disable inlining completely.
865-
continue;
866-
}
867-
868-
if (Func->hasOneUse() || emuFC == FLAG_FCALL_FORCE_INLINE)
869-
{
870-
Func->addFnAttr(llvm::Attribute::AlwaysInline);
871-
continue;
872-
}
871+
// Sort imported functions in preferred inlining order.
872+
std::sort(importedFunctions.begin(), importedFunctions.end(), ImportedFunction::compare);
873873

874-
// Count number of instructions in the function
875-
unsigned NumInst = 0;
876-
for (BasicBlock& BB : Func->getBasicBlockList()) {
877-
NumInst += BB.getInstList().size();
878-
}
874+
// Post processing, set those imported functions as alwaysinline.
875+
// Also count how many instructions would be added to the shader
876+
// if inlining occurred.
877+
for (auto II = importedFunctions.begin(), IE = importedFunctions.end(); II != IE; ++II)
878+
{
879+
Function* Func = II->F;
879880

880-
// Don't want to subroutine small functions
881-
if (NumInst <= 5)
882-
{
883-
// Add AlwaysInline attribute to force inlining all calls.
884-
Func->addFnAttr(llvm::Attribute::AlwaysInline);
881+
// Remove noinline/AlwaysInline attr if present.
882+
Func->removeFnAttr(llvm::Attribute::NoInline);
883+
Func->removeFnAttr(llvm::Attribute::AlwaysInline);
885884

886-
continue;
887-
}
885+
if (m_enableCallForEmulation &&
886+
emuFC != FLAG_FCALL_DEFAULT &&
887+
emuFC != FLAG_FCALL_FORCE_INLINE)
888+
{
889+
// Disable inlining completely.
890+
continue;
891+
}
888892

889-
totalNumberOfInlinedInst += NumInst * Func->getNumUses();
893+
if (Func->hasOneUse() || emuFC == FLAG_FCALL_FORCE_INLINE)
894+
{
895+
Func->addFnAttr(llvm::Attribute::AlwaysInline);
896+
continue;
890897
}
891-
else
898+
899+
// Don't want to subroutine small functions
900+
if (II->funcInstructions <= 5)
892901
{
893-
// Make sure original func isn't inlined accidentally.
894-
Func->removeFnAttr(llvm::Attribute::AlwaysInline);
902+
// Add AlwaysInline attribute to force inlining all calls.
903+
Func->addFnAttr(llvm::Attribute::AlwaysInline);
904+
905+
continue;
895906
}
896-
}
897907

898-
// If true, it is a slow version of DP emu functions. Those functions
899-
// are the original ones for just passing conformance, not for perf.
900-
auto isSlowDPEmuFunc = [](Function* F) {
901-
StringRef FN = F->getName();
902-
if (FN.equals("__igcbuiltin_dp_add") ||
903-
FN.equals("__igcbuiltin_dp_sub") ||
904-
FN.equals("__igcbuiltin_dp_fma") ||
905-
FN.equals("__igcbuiltin_dp_mul") ||
906-
FN.equals("__igcbuiltin_dp_div") ||
907-
FN.equals("__igcbuiltin_dp_cmp") ||
908-
FN.equals("__igcbuiltin_dp_to_int32") ||
909-
FN.equals("__igcbuiltin_dp_to_uint32") ||
910-
FN.equals("__igcbuiltin_int32_to_dp") ||
911-
FN.equals("__igcbuiltin_uint32_to_dp") ||
912-
FN.equals("__igcbuiltin_dp_to_sp") ||
913-
FN.equals("__igcbuiltin_sp_to_dp") ||
914-
FN.equals("__igcbuiltin_dp_sqrt")) {
915-
return true;
908+
totalNumberOfPotentiallyInlinedInst += II->totalInstructions;
909+
910+
// If function fits in threshold, always inline.
911+
if (totalNumberOfInlinedInst + II->totalInstructions <= (unsigned)IGC_GET_FLAG_VALUE(InlinedEmulationThreshold))
912+
{
913+
totalNumberOfInlinedInst += II->totalInstructions;
914+
Func->addFnAttr(llvm::Attribute::AlwaysInline);
916915
}
917-
return false;
918-
};
916+
}
919917

920-
for (auto II = M.begin(), IE = M.end(); II != IE; )
918+
// Check if more functions can fit in threshold if they would be split into inline/noinline copies.
919+
if (m_enableCallForEmulation && emuFC == FLAG_FCALL_DEFAULT && totalNumberOfInlinedInst < (unsigned)IGC_GET_FLAG_VALUE(InlinedEmulationThreshold))
921920
{
922-
Function* Func = &(*II);
923-
++II;
924-
if (!Func || Func->isDeclaration())
921+
for (auto II = importedFunctions.begin(); II != importedFunctions.end(); ++II)
925922
{
926-
continue;
923+
Function* Func = II->F;
924+
925+
if (Func->hasFnAttribute(llvm::Attribute::AlwaysInline))
926+
continue;
927+
928+
unsigned calls = ((unsigned)IGC_GET_FLAG_VALUE(InlinedEmulationThreshold) - totalNumberOfInlinedInst) / II->funcInstructions;
929+
if (calls > 0)
930+
{
931+
// Split function into inline/no-inline copies.
932+
ImportedFunction copy = createInlinedCopy(*II, calls);
933+
importedFunctions.push_back(copy);
934+
totalNumberOfInlinedInst += copy.totalInstructions;
935+
}
927936
}
937+
}
928938

929-
if (!origFunctions.count(Func) && !Func->hasFnAttribute(llvm::Attribute::AlwaysInline))
939+
for (auto II = importedFunctions.begin(), IE = importedFunctions.end(); II != IE; ++II)
940+
{
941+
Function* Func = II->F;
942+
943+
if (!Func->hasFnAttribute(llvm::Attribute::AlwaysInline))
930944
{
931945
// Special handling of DP functions: any one that has not been marked as inline
932946
// at this point, it will be either subroutine or stackcall.
933-
const bool isDPCallFunc = (isDPEmu() && isSlowDPEmuFunc(Func));
947+
const bool isDPCallFunc = (isDPEmu() && II->isSlowDPEmuFunc());
934948

935949
// Use subroutine/stackcall for some DP emulation functions if
936950
// EmulationFunctionControl is set so, or
937951
// use subroutines if total number of instructions added when
938952
// all emulated functions are inlined exceed InlinedEmulationThreshold.
939953
// If Func is a slow version of DP emu func, perf isn't important.
940954
if (m_enableCallForEmulation &&
941-
(totalNumberOfInlinedInst > (unsigned)IGC_GET_FLAG_VALUE(InlinedEmulationThreshold) ||
955+
(totalNumberOfPotentiallyInlinedInst > (unsigned)IGC_GET_FLAG_VALUE(InlinedEmulationThreshold) ||
942956
isDPCallFunc))
943957
{
944958
Func->addFnAttr(llvm::Attribute::NoInline);
@@ -1003,6 +1017,128 @@ bool PreCompiledFuncImport::runOnModule(Module& M)
10031017
return m_changed;
10041018
}
10051019

1020+
// Wraps an imported emulation function: records the size of its body,
// derives the total instruction count from its current uses, and classifies
// it by name into an EmuType (FASTDP, SLOWDP, INT64, or OTHER when no name
// matches).
PreCompiledFuncImport::ImportedFunction::ImportedFunction(Function* F)
    : F(F), type(EmuType::OTHER), funcInstructions(0), totalInstructions(0)
{
    // Body size = sum of the instruction counts of all basic blocks.
    for (BasicBlock& block : F->getBasicBlockList())
    {
        funcInstructions += block.getInstList().size();
    }

    // totalInstructions depends on funcInstructions, so refresh it only now.
    updateUses();

    const StringRef funcName = F->getName();

    // DP div/sqrt "nomadm" variants.
    const bool isFastDP =
        funcName.equals("__igcbuiltin_dp_div_nomadm_ieee") ||
        funcName.equals("__igcbuiltin_dp_div_nomadm_fast") ||
        funcName.equals("__igcbuiltin_dp_sqrt_nomadm_ieee") ||
        funcName.equals("__igcbuiltin_dp_sqrt_nomadm_fast");
    if (isFastDP)
    {
        type = EmuType::FASTDP;
        return;
    }

    // Slow versions of the DP emu functions. Those functions are the
    // original ones for just passing conformance, not for perf.
    const bool isSlowDP =
        funcName.equals("__igcbuiltin_dp_add") ||
        funcName.equals("__igcbuiltin_dp_sub") ||
        funcName.equals("__igcbuiltin_dp_fma") ||
        funcName.equals("__igcbuiltin_dp_mul") ||
        funcName.equals("__igcbuiltin_dp_div") ||
        funcName.equals("__igcbuiltin_dp_cmp") ||
        funcName.equals("__igcbuiltin_dp_to_int32") ||
        funcName.equals("__igcbuiltin_dp_to_uint32") ||
        funcName.equals("__igcbuiltin_int32_to_dp") ||
        funcName.equals("__igcbuiltin_uint32_to_dp") ||
        funcName.equals("__igcbuiltin_dp_to_sp") ||
        funcName.equals("__igcbuiltin_sp_to_dp") ||
        funcName.equals("__igcbuiltin_dp_sqrt");
    if (isSlowDP)
    {
        type = EmuType::SLOWDP;
        return;
    }

    // Int64 div/rem emulation: look the name up in both name tables and
    // stop at the first match. With no match the type stays OTHER.
    for (int fn = 0; fn < NUM_FUNCTIONS; ++fn)
    {
        for (int ty = 0; ty < NUM_TYPES; ++ty)
        {
            if (funcName.equals(m_Int64SpDivRemFunctionNames[fn][ty]) ||
                funcName.equals(m_Int64DpDivRemFunctionNames[fn][ty]))
            {
                type = EmuType::INT64;
                return;
            }
        }
    }
}
1072+
1073+
void PreCompiledFuncImport::ImportedFunction::updateUses()
1074+
{
1075+
totalInstructions = funcInstructions * F->getNumUses();
1076+
}
1077+
1078+
// Clones other.F with CloneFunction and wraps the clone in a new
// ImportedFunction that reuses other's type and funcInstructions. The last
// constructor argument is 0 (presumably totalInstructions; confirm against
// the class declaration), so callers are expected to refresh it, e.g. via
// updateUses(), once the clone has uses.
PreCompiledFuncImport::ImportedFunction PreCompiledFuncImport::ImportedFunction::copy(ImportedFunction& other)
{
    ValueToValueMapTy valueMap;
    Function* const clonedFunc = CloneFunction(other.F, valueMap);

    return ImportedFunction(clonedFunc, other.type, other.funcInstructions, 0);
}
1084+
1085+
// Compare two imported functions in order preferred for inlining.
1086+
bool PreCompiledFuncImport::ImportedFunction::compare(ImportedFunction& L, ImportedFunction& R)
1087+
{
1088+
// First sort by preferred type of emulation.
1089+
if (L.type != R.type)
1090+
return L.type < R.type;
1091+
1092+
// Then sort by number of inlined instructions.
1093+
return L.totalInstructions < R.totalInstructions;
1094+
};
1095+
1096+
// Splits IF into an always-inline copy and the original.
//
// Clones IF.F, names the clone "<original name>_always_inline", marks it
// AlwaysInline, and redirects up to n direct calls of IF.F to the clone.
// The use-derived instruction totals of both the copy and IF are refreshed
// before returning.
//
// IF - imported function to split; its uses are modified.
// n  - maximum number of calls to redirect. Fewer are redirected when IF.F
//      has fewer direct call users.
// Returns the ImportedFunction wrapping the always-inline clone.
PreCompiledFuncImport::ImportedFunction PreCompiledFuncImport::createInlinedCopy(ImportedFunction& IF, unsigned n)
{
    // Make copy that is always inlined.
    ImportedFunction copy = ImportedFunction::copy(IF);
    copy.F->setName(IF.F->getName() + "_always_inline");
    copy.F->addFnAttr(llvm::Attribute::AlwaysInline);

    // Collect up to n direct calls to replace with the copy. Collecting first
    // keeps the user list stable while it is walked; the walk is bounded by
    // user_end() so it never runs past the available users.
    llvm::SmallVector<CallInst*, 8> calls;
    for (auto it = IF.F->user_begin(), end = IF.F->user_end();
         it != end && calls.size() < n; ++it)
    {
        CallInst* oldCall = dyn_cast<CallInst>(*it);
        IGC_ASSERT(oldCall);

        // Skip anything that is not a direct call of IF.F (also covers a
        // null result when asserts are compiled out, and calls that only
        // pass IF.F as an argument).
        if (oldCall && oldCall->getCalledFunction() == IF.F)
        {
            calls.push_back(oldCall);
        }
    }

    // Replace with always inlined copy.
    for (CallInst* oldCall : calls)
    {
        const unsigned numArgs = IGCLLVM::getNumArgOperands(oldCall);

        std::vector<Value*> args;
        args.reserve(numArgs);
        for (unsigned arg = 0; arg < numArgs; ++arg)
        {
            args.push_back(oldCall->getArgOperand(arg));
        }

        // Create new call and insert it before old one
        CallInst* newCall = CallInst::Create(copy.F, args, "", oldCall);

        newCall->setCallingConv(copy.F->getCallingConv());
        newCall->setAttributes(oldCall->getAttributes());
        newCall->setDebugLoc(oldCall->getDebugLoc());

        // The old call has no remaining users after the replacement, so it
        // can be erased right away.
        oldCall->replaceAllUsesWith(newCall);
        oldCall->eraseFromParent();
    }

    // Uses of both functions changed; refresh the derived totals.
    copy.updateUses();
    IF.updateUses();

    return copy;
}
1141+
10061142
void PreCompiledFuncImport::visitBinaryOperator(BinaryOperator& I)
10071143
{
10081144
if (I.getOperand(0)->getType()->isIntOrIntVectorTy())
@@ -2547,6 +2683,7 @@ void PreCompiledFuncImport::checkAndSetEnableSubroutine()
25472683
bool SPDiv = isSPDiv();
25482684
bool DPEmu = isDPEmu();
25492685
bool DPDivSqrtEmu = isDPDivSqrtEmu();
2686+
bool I64DivRem = isI64DivRem();
25502687

25512688
Module* M = m_pCtx->getModule();
25522689
for (auto FI = M->begin(), FE = M->end(); FI != FE; ++FI)
@@ -2589,6 +2726,15 @@ void PreCompiledFuncImport::checkAndSetEnableSubroutine()
25892726
m_enableCallForEmulation = true;
25902727
}
25912728
break;
2729+
case Instruction::UDiv:
2730+
case Instruction::URem:
2731+
case Instruction::SDiv:
2732+
case Instruction::SRem:
2733+
if (I64DivRem && I->getOperand(0)->getType()->isIntegerTy(64))
2734+
{
2735+
m_enableCallForEmulation = true;
2736+
}
2737+
break;
25922738
}
25932739

25942740
GenIntrinsicInst* GII = dyn_cast<GenIntrinsicInst>(I);

0 commit comments

Comments
 (0)