@@ -17,6 +17,7 @@ SPDX-License-Identifier: MIT
17
17
#include " llvm/IR/InstIterator.h"
18
18
#include " llvm/Support/MemoryBuffer.h"
19
19
#include " llvm/Support/GenericDomTree.h"
20
+ #include " llvm/Transforms/Utils/Cloning.h"
20
21
#include " llvm/Bitcode/BitcodeReader.h"
21
22
#include " llvm/Bitcode/BitcodeWriter.h"
22
23
#include " llvm/Linker/Linker.h"
@@ -632,11 +633,16 @@ bool PreCompiledFuncImport::runOnModule(Module& M)
632
633
m_changed = false ;
633
634
634
635
// When we test it, we need to set emuKind
635
- if (IGC_IS_FLAG_ENABLED (TestIGCPreCompiledFunctions))
636
+ if (IGC_GET_FLAG_VALUE (TestIGCPreCompiledFunctions) == 1 )
636
637
{
637
638
m_emuKind = EmuKind::EMU_DP;
638
639
checkAndSetEnableSubroutine ();
639
640
}
641
+ else if (IGC_GET_FLAG_VALUE (TestIGCPreCompiledFunctions) == 2 )
642
+ {
643
+ m_emuKind = EmuKind::EMU_DP_DIV_SQRT;
644
+ checkAndSetEnableSubroutine ();
645
+ }
640
646
// sanity check
641
647
if (m_emuKind == 0 ) {
642
648
// Nothing to emulate
@@ -826,12 +832,11 @@ bool PreCompiledFuncImport::runOnModule(Module& M)
826
832
}
827
833
}
828
834
829
- unsigned totalNumberOfInlinedInst = 0 ;
835
+ llvm::SmallVector<ImportedFunction, 32 > importedFunctions;
836
+ unsigned totalNumberOfInlinedInst = 0 , totalNumberOfPotentiallyInlinedInst = 0 ;
830
837
int emuFC = (int )IGC_GET_FLAG_VALUE (EmulationFunctionControl);
831
838
832
- // Post processing, set those imported functions as internal linkage
833
- // and alwaysinline. Also count how many instructions would be added
834
- // to the shader if inlining occurred.
839
+ // Post processing, set those imported functions as internal linkage.
835
840
for (auto II = M.begin (), IE = M.end (); II != IE; )
836
841
{
837
842
Function* Func = &(*II);
@@ -853,92 +858,101 @@ bool PreCompiledFuncImport::runOnModule(Module& M)
853
858
continue ;
854
859
}
855
860
856
- // Remove noinline/AlwaysInline attr if present.
857
- Func->removeFnAttr (llvm::Attribute::NoInline);
861
+ if (std::find (importedFunctions.begin (), importedFunctions.end (), Func) == importedFunctions.end ())
862
+ importedFunctions.push_back (Func);
863
+ }
864
+ else
865
+ {
866
+ // Make sure original func isn't inlined accidentally.
858
867
Func->removeFnAttr (llvm::Attribute::AlwaysInline);
868
+ }
869
+ }
859
870
860
- if (m_enableCallForEmulation &&
861
- emuFC != FLAG_FCALL_DEFAULT &&
862
- emuFC != FLAG_FCALL_FORCE_INLINE)
863
- {
864
- // Disable inlining completely.
865
- continue ;
866
- }
867
-
868
- if (Func->hasOneUse () || emuFC == FLAG_FCALL_FORCE_INLINE)
869
- {
870
- Func->addFnAttr (llvm::Attribute::AlwaysInline);
871
- continue ;
872
- }
871
+ // Sort imported functions in preferred inlining order.
872
+ std::sort (importedFunctions.begin (), importedFunctions.end (), ImportedFunction::compare);
873
873
874
- // Count number of instructions in the function
875
- unsigned NumInst = 0 ;
876
- for (BasicBlock& BB : Func->getBasicBlockList ()) {
877
- NumInst += BB.getInstList ().size ();
878
- }
874
+ // Post processing, set those imported functions as alwaysinline.
875
+ // Also count how many instructions would be added to the shader
876
+ // if inlining occurred.
877
+ for (auto II = importedFunctions.begin (), IE = importedFunctions.end (); II != IE; ++II)
878
+ {
879
+ Function* Func = II->F ;
879
880
880
- // Don't want to subroutine small functions
881
- if (NumInst <= 5 )
882
- {
883
- // Add AlwaysInline attribute to force inlining all calls.
884
- Func->addFnAttr (llvm::Attribute::AlwaysInline);
881
+ // Remove noinline/AlwaysInline attr if present.
882
+ Func->removeFnAttr (llvm::Attribute::NoInline);
883
+ Func->removeFnAttr (llvm::Attribute::AlwaysInline);
885
884
886
- continue ;
887
- }
885
+ if (m_enableCallForEmulation &&
886
+ emuFC != FLAG_FCALL_DEFAULT &&
887
+ emuFC != FLAG_FCALL_FORCE_INLINE)
888
+ {
889
+ // Disable inlining completely.
890
+ continue ;
891
+ }
888
892
889
- totalNumberOfInlinedInst += NumInst * Func->getNumUses ();
893
+ if (Func->hasOneUse () || emuFC == FLAG_FCALL_FORCE_INLINE)
894
+ {
895
+ Func->addFnAttr (llvm::Attribute::AlwaysInline);
896
+ continue ;
890
897
}
891
- else
898
+
899
+ // Don't want to subroutine small functions
900
+ if (II->funcInstructions <= 5 )
892
901
{
893
- // Make sure original func isn't inlined accidentally.
894
- Func->removeFnAttr (llvm::Attribute::AlwaysInline);
902
+ // Add AlwaysInline attribute to force inlining all calls.
903
+ Func->addFnAttr (llvm::Attribute::AlwaysInline);
904
+
905
+ continue ;
895
906
}
896
- }
897
907
898
- // If true, it is a slow version of DP emu functions. Those functions
899
- // are the original ones for just passing conformance, not for perf.
900
- auto isSlowDPEmuFunc = [](Function* F) {
901
- StringRef FN = F->getName ();
902
- if (FN.equals (" __igcbuiltin_dp_add" ) ||
903
- FN.equals (" __igcbuiltin_dp_sub" ) ||
904
- FN.equals (" __igcbuiltin_dp_fma" ) ||
905
- FN.equals (" __igcbuiltin_dp_mul" ) ||
906
- FN.equals (" __igcbuiltin_dp_div" ) ||
907
- FN.equals (" __igcbuiltin_dp_cmp" ) ||
908
- FN.equals (" __igcbuiltin_dp_to_int32" ) ||
909
- FN.equals (" __igcbuiltin_dp_to_uint32" ) ||
910
- FN.equals (" __igcbuiltin_int32_to_dp" ) ||
911
- FN.equals (" __igcbuiltin_uint32_to_dp" ) ||
912
- FN.equals (" __igcbuiltin_dp_to_sp" ) ||
913
- FN.equals (" __igcbuiltin_sp_to_dp" ) ||
914
- FN.equals (" __igcbuiltin_dp_sqrt" )) {
915
- return true ;
908
+ totalNumberOfPotentiallyInlinedInst += II->totalInstructions ;
909
+
910
+ // If function fits in threshold, always inline.
911
+ if (totalNumberOfInlinedInst + II->totalInstructions <= (unsigned )IGC_GET_FLAG_VALUE (InlinedEmulationThreshold))
912
+ {
913
+ totalNumberOfInlinedInst += II->totalInstructions ;
914
+ Func->addFnAttr (llvm::Attribute::AlwaysInline);
916
915
}
917
- return false ;
918
- };
916
+ }
919
917
920
- for (auto II = M.begin (), IE = M.end (); II != IE; )
918
+ // Check if more functions can fit in threshold if they would be split into inline/noinline copies.
919
+ if (m_enableCallForEmulation && emuFC == FLAG_FCALL_DEFAULT && totalNumberOfInlinedInst < (unsigned )IGC_GET_FLAG_VALUE (InlinedEmulationThreshold))
921
920
{
922
- Function* Func = &(*II);
923
- ++II;
924
- if (!Func || Func->isDeclaration ())
921
+ for (auto II = importedFunctions.begin (); II != importedFunctions.end (); ++II)
925
922
{
926
- continue ;
923
+ Function* Func = II->F ;
924
+
925
+ if (Func->hasFnAttribute (llvm::Attribute::AlwaysInline))
926
+ continue ;
927
+
928
+ unsigned calls = ((unsigned )IGC_GET_FLAG_VALUE (InlinedEmulationThreshold) - totalNumberOfInlinedInst) / II->funcInstructions ;
929
+ if (calls > 0 )
930
+ {
931
+ // Split function into inline/no-inline copies.
932
+ ImportedFunction copy = createInlinedCopy (*II, calls);
933
+ importedFunctions.push_back (copy);
934
+ totalNumberOfInlinedInst += copy.totalInstructions ;
935
+ }
927
936
}
937
+ }
928
938
929
- if (!origFunctions.count (Func) && !Func->hasFnAttribute (llvm::Attribute::AlwaysInline))
939
+ for (auto II = importedFunctions.begin (), IE = importedFunctions.end (); II != IE; ++II)
940
+ {
941
+ Function* Func = II->F ;
942
+
943
+ if (!Func->hasFnAttribute (llvm::Attribute::AlwaysInline))
930
944
{
931
945
// Special handling of DP functions: any one that has not been marked as inline
932
946
// at this point, it will be either subroutine or stackcall.
933
- const bool isDPCallFunc = (isDPEmu () && isSlowDPEmuFunc (Func ));
947
+ const bool isDPCallFunc = (isDPEmu () && II-> isSlowDPEmuFunc ());
934
948
935
949
// Use subroutine/stackcall for some DP emulation functions if
936
950
// EmulationFunctionControl is set so, or
937
951
// use subroutines if total number of instructions added when
938
952
// all emulated functions are inlined exceed InlinedEmulationThreshold.
939
953
// If Func is a slow version of DP emu func, perf isn't important.
940
954
if (m_enableCallForEmulation &&
941
- (totalNumberOfInlinedInst > (unsigned )IGC_GET_FLAG_VALUE (InlinedEmulationThreshold) ||
955
+ (totalNumberOfPotentiallyInlinedInst > (unsigned )IGC_GET_FLAG_VALUE (InlinedEmulationThreshold) ||
942
956
isDPCallFunc))
943
957
{
944
958
Func->addFnAttr (llvm::Attribute::NoInline);
@@ -1003,6 +1017,128 @@ bool PreCompiledFuncImport::runOnModule(Module& M)
1003
1017
return m_changed;
1004
1018
}
1005
1019
1020
+ PreCompiledFuncImport::ImportedFunction::ImportedFunction (Function* F)
1021
+ : F(F), type(EmuType::OTHER), funcInstructions(0 ), totalInstructions(0 )
1022
+ {
1023
+ // Count number of new instructions added by inlining.
1024
+ for (BasicBlock& BB : F->getBasicBlockList ())
1025
+ funcInstructions += BB.getInstList ().size ();
1026
+
1027
+ updateUses ();
1028
+
1029
+ // Get type of imported function.
1030
+ StringRef name = F->getName ();
1031
+
1032
+ if (name.equals (" __igcbuiltin_dp_div_nomadm_ieee" ) ||
1033
+ name.equals (" __igcbuiltin_dp_div_nomadm_fast" ) ||
1034
+ name.equals (" __igcbuiltin_dp_sqrt_nomadm_ieee" ) ||
1035
+ name.equals (" __igcbuiltin_dp_sqrt_nomadm_fast" ))
1036
+ {
1037
+ type = EmuType::FASTDP;
1038
+ }
1039
+ else if (name.equals (" __igcbuiltin_dp_add" ) ||
1040
+ name.equals (" __igcbuiltin_dp_sub" ) ||
1041
+ name.equals (" __igcbuiltin_dp_fma" ) ||
1042
+ name.equals (" __igcbuiltin_dp_mul" ) ||
1043
+ name.equals (" __igcbuiltin_dp_div" ) ||
1044
+ name.equals (" __igcbuiltin_dp_cmp" ) ||
1045
+ name.equals (" __igcbuiltin_dp_to_int32" ) ||
1046
+ name.equals (" __igcbuiltin_dp_to_uint32" ) ||
1047
+ name.equals (" __igcbuiltin_int32_to_dp" ) ||
1048
+ name.equals (" __igcbuiltin_uint32_to_dp" ) ||
1049
+ name.equals (" __igcbuiltin_dp_to_sp" ) ||
1050
+ name.equals (" __igcbuiltin_sp_to_dp" ) ||
1051
+ name.equals (" __igcbuiltin_dp_sqrt" ))
1052
+ {
1053
+ // If true, it is a slow version of DP emu functions. Those functions
1054
+ // are the original ones for just passing conformance, not for perf.
1055
+ type = EmuType::SLOWDP;
1056
+ }
1057
+ else
1058
+ {
1059
+ for (int i = 0 ; i < NUM_FUNCTIONS && type == EmuType::OTHER; ++i)
1060
+ {
1061
+ for (int j = 0 ; j < NUM_TYPES && type == EmuType::OTHER; ++j)
1062
+ {
1063
+ if (name.equals (m_Int64SpDivRemFunctionNames[i][j]) ||
1064
+ name.equals (m_Int64DpDivRemFunctionNames[i][j]))
1065
+ {
1066
+ type = EmuType::INT64;
1067
+ }
1068
+ }
1069
+ }
1070
+ }
1071
+ }
1072
+
1073
+ void PreCompiledFuncImport::ImportedFunction::updateUses ()
1074
+ {
1075
+ totalInstructions = funcInstructions * F->getNumUses ();
1076
+ }
1077
+
1078
+ PreCompiledFuncImport::ImportedFunction PreCompiledFuncImport::ImportedFunction::copy (ImportedFunction& other)
1079
+ {
1080
+ ValueToValueMapTy VM;
1081
+ Function* copy = CloneFunction (other.F , VM);
1082
+ return PreCompiledFuncImport::ImportedFunction (copy, other.type , other.funcInstructions , 0 );
1083
+ }
1084
+
1085
+ // Compare two imported functions in order preferred for inlining.
1086
+ bool PreCompiledFuncImport::ImportedFunction::compare (ImportedFunction& L, ImportedFunction& R)
1087
+ {
1088
+ // First sort by preferred type of emulation.
1089
+ if (L.type != R.type )
1090
+ return L.type < R.type ;
1091
+
1092
+ // Then sort by number of inlined instructions.
1093
+ return L.totalInstructions < R.totalInstructions ;
1094
+ };
1095
+
1096
+ PreCompiledFuncImport::ImportedFunction PreCompiledFuncImport::createInlinedCopy (ImportedFunction& IF, unsigned n)
1097
+ {
1098
+ std::vector<CallInst*> toDelete;
1099
+
1100
+ // Make copy that is always inlined.
1101
+ ImportedFunction copy = ImportedFunction::copy (IF);
1102
+ copy.F ->setName (IF.F ->getName () + " _always_inline" );
1103
+ copy.F ->addFnAttr (llvm::Attribute::AlwaysInline);
1104
+
1105
+ // Collect first n calls to replace with copy.
1106
+ llvm::SmallVector<CallInst*, 8 > calls;
1107
+ auto it = IF.F ->user_begin ();
1108
+ for (unsigned i = 0 ; i < n; ++i)
1109
+ {
1110
+ CallInst* oldCall = dyn_cast<CallInst>(*(it++));
1111
+ IGC_ASSERT (oldCall);
1112
+ calls.push_back (oldCall);
1113
+ }
1114
+
1115
+ // Replace with always inlined copy.
1116
+ for (CallInst* oldCall : calls)
1117
+ {
1118
+ std::vector<Value*> args;
1119
+ for (unsigned arg = 0 ; arg < IGCLLVM::getNumArgOperands (oldCall); ++arg)
1120
+ args.push_back (oldCall->getArgOperand (arg));
1121
+
1122
+ // Create new call and insert it before old one
1123
+ CallInst* newCall = CallInst::Create (copy.F , args, " " , oldCall);
1124
+
1125
+ newCall->setCallingConv (copy.F ->getCallingConv ());
1126
+ newCall->setAttributes (oldCall->getAttributes ());
1127
+ newCall->setDebugLoc (oldCall->getDebugLoc ());
1128
+
1129
+ oldCall->replaceAllUsesWith (newCall);
1130
+ toDelete.push_back (oldCall);
1131
+ }
1132
+
1133
+ for (auto C : toDelete)
1134
+ C->eraseFromParent ();
1135
+
1136
+ copy.updateUses ();
1137
+ IF.updateUses ();
1138
+
1139
+ return copy;
1140
+ }
1141
+
1006
1142
void PreCompiledFuncImport::visitBinaryOperator (BinaryOperator& I)
1007
1143
{
1008
1144
if (I.getOperand (0 )->getType ()->isIntOrIntVectorTy ())
@@ -2547,6 +2683,7 @@ void PreCompiledFuncImport::checkAndSetEnableSubroutine()
2547
2683
bool SPDiv = isSPDiv ();
2548
2684
bool DPEmu = isDPEmu ();
2549
2685
bool DPDivSqrtEmu = isDPDivSqrtEmu ();
2686
+ bool I64DivRem = isI64DivRem ();
2550
2687
2551
2688
Module* M = m_pCtx->getModule ();
2552
2689
for (auto FI = M->begin (), FE = M->end (); FI != FE; ++FI)
@@ -2589,6 +2726,15 @@ void PreCompiledFuncImport::checkAndSetEnableSubroutine()
2589
2726
m_enableCallForEmulation = true ;
2590
2727
}
2591
2728
break ;
2729
+ case Instruction::UDiv:
2730
+ case Instruction::URem:
2731
+ case Instruction::SDiv:
2732
+ case Instruction::SRem:
2733
+ if (I64DivRem && I->getOperand (0 )->getType ()->isIntegerTy (64 ))
2734
+ {
2735
+ m_enableCallForEmulation = true ;
2736
+ }
2737
+ break ;
2592
2738
}
2593
2739
2594
2740
GenIntrinsicInst* GII = dyn_cast<GenIntrinsicInst>(I);
0 commit comments