@@ -8572,50 +8572,34 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
8572
8572
const bool useInlineData = builder.getOption(vISA_useInlineData);
8573
8573
8574
8574
// preparation of thread payload size and start offsets
8575
-
8576
- // Payload in Memory Payload in GRF (T0)
8577
- // (Prepared by Runtime)
8578
- // (Does not contain inlineData)
8579
- // ----------------------- R1 ----------------------- <-- perThreadLoadStartGRF
8580
- // | cross thread data | \ | per thread data T0 |
8581
- // | | numCrossThreadDW R4 -----------------------
8582
- // | | / | inline data |
8583
- // ----------------------- <-- localIDsOffset | (if enable) |
8584
- // | per thread data T0 | R5 ----------------------- <-- crossThreadLoadStart, crossThreadLoadStartGRF
8585
- // ----------------------- | cross thread data | \
8586
- // | per thread data T1 | | | numCrossThreadDW
8587
- // ----------------------- | | /
8588
- // | ... | -----------------------
8589
- // -----------------------
8590
-
8591
8575
const uint32_t perThreadLoadStartGRF = kernel.getOptions()->getuInt32Option(vISA_loadThreadPayloadStartReg);
8592
8576
int PTIS = kernel.getInt32KernelAttr(Attributes::ATTR_PerThreadInputSize);
8593
8577
uint32_t numPerThreadGRF = PTIS / kernel.numEltPerGRF<Type_UB>();
8594
8578
uint32_t crossThreadLoadStart = 0; // register file (grf) offset in byte
8595
8579
uint32_t crossThreadLoadStartGRF = 0; // grf number
8596
8580
// cross thread size (not including inline data size and alignment)
8597
8581
const uint32_t loadedCrossThreadInputSize = findLoadedInputSize(crossThreadLoadStart);
8598
- // final cross thread size to be loaded as number of DW (including aligenment)
8599
- uint32_t numCrossThreadDW = 0;
8582
+ // final cross thread size to be loaded
8583
+ uint32_t numCrossThreadGRF = 0;
8600
8584
// payload memory offset of where local id should be loaded from
8601
8585
uint32_t localIDsOffset = 0;
8602
8586
int CTIS = kernel.getInt32KernelAttr(Attributes::ATTR_CrossThreadInputSize);
8603
8587
if (CTIS < 0)
8604
8588
{
8605
8589
// per-thread payload vars
8606
8590
// N = inlinedata size
8607
- // Cross thread data size is aligned to 32byte ,
8591
+ // Payload is aligned to GRF size,
8608
8592
// if inlinedata is used, runtime puts first N bytes of payload in inlinedata.
8609
8593
// Rest of payload is shifted in the buffer by N bytes.
8610
8594
// So payload args which start at N offset, now start at 0 offset.
8611
8595
// Because of this we need to calculate localID offset:
8612
8596
const uint32_t inlineDataSize = builder.getInlineDataSize();
8613
8597
uint32_t correction = useInlineData ? inlineDataSize : 0;
8614
- localIDsOffset = AlignUp(loadedCrossThreadInputSize + correction, 32 );
8598
+ localIDsOffset = AlignUp(loadedCrossThreadInputSize + correction, kernel.getGRFSize() );
8615
8599
localIDsOffset -= useInlineData ? inlineDataSize : 0;
8616
8600
8617
8601
// cross-thread payload vars
8618
- numCrossThreadDW = AlignUp(loadedCrossThreadInputSize, 32) / TypeSize(Type_UD );
8602
+ numCrossThreadGRF = AlignUp(loadedCrossThreadInputSize, kernel.getGRFSize()) / kernel.numEltPerGRF<Type_UB>( );
8619
8603
crossThreadLoadStartGRF = crossThreadLoadStart / kernel.getGRFSize();
8620
8604
}
8621
8605
else
@@ -8625,13 +8609,13 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
8625
8609
localIDsOffset -= useInlineData ? kernel.getGRFSize() : 0;
8626
8610
8627
8611
// cross-thread payload vars
8628
- numCrossThreadDW = CTIS / TypeSize(Type_UD );
8612
+ numCrossThreadGRF = CTIS / kernel.numEltPerGRF<Type_UB>( );
8629
8613
crossThreadLoadStartGRF = perThreadLoadStartGRF + numPerThreadGRF;
8630
8614
if (useInlineData)
8631
8615
{
8632
8616
// first GRF of cross-thread data is already loaded
8633
8617
crossThreadLoadStartGRF++;
8634
- numCrossThreadDW -= builder.getInlineDataSize() / TypeSize(Type_UD) ;
8618
+ numCrossThreadGRF-- ;
8635
8619
}
8636
8620
}
8637
8621
@@ -8661,21 +8645,18 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
8661
8645
8662
8646
// load <numGRF> GRFs from the address "loadAddress", starting from <startGRF>
8663
8647
auto loadFromMemory = [this, &instBuffer, getHWordBlockEncoding](
8664
- G4_Declare* loadAddress, uint32_t startGRF, uint32_t numTotalDW )
8648
+ G4_Declare* loadAddress, uint32_t startGRF, uint32_t numGRF )
8665
8649
{
8666
- for (uint32_t numRemainingDW = numTotalDW, nextGRF = startGRF; numRemainingDW > 0; /* updated in body */)
8650
+ bool useHword = builder.hasHWordBlockLoad();
8651
+ for (int numRemaining = numGRF, nextGRF = startGRF; numRemaining > 0; /* updated in body */)
8667
8652
{
8668
- // can load 4, 2 or 1 grf per send.
8669
- // Still load 1 GRF if the remainingDW is less than 1 GRF. The addtional bytes those being loaded won't be used.
8670
- uint32_t DWin4GRF = 4 * builder.numEltPerGRF<Type_UD>();
8671
- uint32_t DWin2GRF = DWin4GRF / 2;
8672
- uint32_t DWin1GRF = DWin2GRF / 2;
8673
- uint32_t numGRFToLoad =
8674
- numRemainingDW >= DWin4GRF ? 4 : // 4 GRF
8675
- numRemainingDW >= DWin2GRF ? 2 : // 2 GRF
8676
- 1; // 1 GRF or less than 1 GRF
8677
-
8678
- bool useHword = builder.hasHWordBlockLoad();
8653
+ int numGRFToLoad = numRemaining > 2 ? 4 : numRemaining;
8654
+ if (numRemaining == 3)
8655
+ {
8656
+ // we can't do 4GRF load since it may overwrite values pushed from inline data,
8657
+ // break load to 2+1 instead
8658
+ numGRFToLoad = 2;
8659
+ }
8679
8660
uint32_t numElts = (numGRFToLoad * kernel.getGRFSize()) / (useHword ? 32 : 16);
8680
8661
uint32_t dataBlocks = useHword ? getHWordBlockEncoding(numElts) :
8681
8662
(numElts == 2 ? 2 : (numElts == 4 ? 3 : 4));
@@ -8690,11 +8671,9 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
8690
8671
auto sendInst = builder.createSendInst(nullptr, G4_send, g4::SIMD8, sendDst, sendSrc,
8691
8672
builder.createImm(msgDescVal, Type_UD), InstOpt_WriteEnable, desc, true);
8692
8673
instBuffer.push_back(sendInst);
8693
- if (numRemainingDW < DWin1GRF)
8694
- break;
8695
- numRemainingDW -= numGRFToLoad * builder.numEltPerGRF<Type_UD>();
8674
+ numRemaining -= numGRFToLoad;
8696
8675
nextGRF += numGRFToLoad;
8697
- if (numRemainingDW > 0)
8676
+ if (numRemaining > 0)
8698
8677
{
8699
8678
// advance the address offset
8700
8679
// (W) add (1) loadAddress.2 loadAddress.2 numGRFToLoad*32
@@ -8708,36 +8687,18 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
8708
8687
}
8709
8688
}
8710
8689
};
8711
-
8712
- // a helper function for loadFromMemoryLSC to get the max DW number which can fulfill
8713
- // LSC element number
8714
- auto getMaxNumDWforLscElementRequirement = [this](uint32_t numDW) {
8715
- if (builder.lscGetElementNum(numDW) != LSC_DATA_ELEMS_INVALID)
8716
- return numDW;
8717
- if (numDW > builder.numEltPerGRF<Type_UD>()) {
8718
- if (numDW > 64) return (uint32_t)64;
8719
- else if (numDW > 32) return (uint32_t)32;
8720
- else if (numDW > 16) return (uint32_t)16;
8721
- else if (numDW > 8) return (uint32_t)8;
8722
- assert(0 && "unreachable");
8723
- }
8724
- // when the numDW is less than 1 grf, we want to load all within one send
8725
- // The additional bytes being loaded won't be used so should be fine
8726
- if (numDW < 2) return (uint32_t)2;
8727
- else if (numDW < 4) return (uint32_t)4;
8728
- else if (numDW < 8) return (uint32_t)8;
8729
- else if (numDW < 16) return (uint32_t)16;
8730
- assert(0 && "unreachable");
8731
- return (uint32_t)0;
8732
- };
8733
-
8734
- auto loadFromMemoryLSC = [this, &instBuffer, &getMaxNumDWforLscElementRequirement](
8735
- G4_Declare* loadAddress, uint32_t startGRF, uint32_t numTotalDW)
8690
+ auto loadFromMemoryLSC = [this, &instBuffer](
8691
+ G4_Declare* loadAddress, uint32_t startGRF, uint32_t numGRF)
8736
8692
{
8737
8693
const auto ADDR_TYPE = LSC_ADDR_TYPE_BTI;
8738
8694
8739
- for (uint32_t numRemainingDW = numTotalDW , nextGRF = startGRF; numRemainingDW > 0; /* updated in body */)
8695
+ for (int numRemaining = numGRF , nextGRF = startGRF; numRemaining > 0; /* updated in body */)
8740
8696
{
8697
+ int numGRFToLoad =
8698
+ numRemaining > 4 ? 4 :
8699
+ numRemaining == 3 ? 2 : // split to 2+1
8700
+ numRemaining; // 2 or 1
8701
+
8741
8702
// Generate an A32 transpose LSC load to BTI 255. size is d32x{16/32}t
8742
8703
LSC_OP op = LSC_LOAD;
8743
8704
LSC_SFID lscSfid = LSC_UGM;
@@ -8748,16 +8709,15 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
8748
8709
addrInfo.immScale = 1;
8749
8710
addrInfo.immOffset = 0;
8750
8711
addrInfo.size = LSC_ADDR_SIZE_32b;
8751
-
8712
+ auto numDW = numGRFToLoad * (kernel.getGRFSize() / 4);
8752
8713
LSC_DATA_SHAPE dataShape { };
8753
8714
dataShape.size = LSC_DATA_SIZE_32b; //in the unit of 32b
8754
8715
dataShape.order = LSC_DATA_ORDER_TRANSPOSE;
8755
- uint32_t numDWToLoad = getMaxNumDWforLscElementRequirement(numRemainingDW);
8756
- dataShape.elems = builder.lscGetElementNum(numDWToLoad);
8716
+ dataShape.elems = builder.lscGetElementNum(numDW);
8757
8717
8758
8718
G4_Imm* surfaceBTI = builder.createImm(255, Type_UW);
8759
8719
8760
- auto sendDstDcl = builder.createHardwiredDeclare(numDWToLoad , Type_UD, nextGRF, 0);
8720
+ auto sendDstDcl = builder.createHardwiredDeclare(numDW , Type_UD, nextGRF, 0);
8761
8721
auto dstRead = builder.createDstRegRegion(sendDstDcl, 1);
8762
8722
auto src0Addr = builder.createSrcRegRegion(loadAddress, builder.getRegionStride1()); // address base
8763
8723
@@ -8769,7 +8729,7 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
8769
8729
addrInfo,
8770
8730
dataShape,
8771
8731
surfaceBTI,
8772
- numDWToLoad < builder.numEltPerGRF<Type_UD>() ? 1 : numDWToLoad / builder.numEltPerGRF<Type_UD>() ,
8732
+ numGRFToLoad ,
8773
8733
1);
8774
8734
8775
8735
G4_InstSend *sendInst = builder.createLscSendInst(
@@ -8784,19 +8744,15 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
8784
8744
true);
8785
8745
8786
8746
instBuffer.push_back(sendInst);
8787
- // we pick to load all data within one send in getMaxNumDWforLscElementRequirement if
8788
- // numRemainingDW is less than one grf. All should be loaded at this point.
8789
- if (numRemainingDW < builder.numEltPerGRF<Type_UD>())
8790
- break;
8791
- numRemainingDW -= numDWToLoad;
8792
- nextGRF += numDWToLoad / builder.numEltPerGRF<Type_UD>();
8793
- bool advanceLoadAddress = numRemainingDW > 0;
8747
+ numRemaining -= numGRFToLoad;
8748
+ nextGRF += numGRFToLoad;
8749
+ bool advanceLoadAddress = numRemaining > 0;
8794
8750
if (advanceLoadAddress)
8795
8751
{
8796
8752
// advance the address offset
8797
8753
// (W) add (1) loadAddress.0 loadAddress.0 numGRFToLoad*numEltPerGRF<Type_UB>
8798
8754
auto addSrc0 = builder.createSrcRegRegion(loadAddress, builder.getRegionScalar());
8799
- auto addSrc1 = builder.createImm(numDWToLoad * TypeSize(Type_UD ), Type_UW);
8755
+ auto addSrc1 = builder.createImm(numGRFToLoad * kernel.numEltPerGRF<Type_UB>( ), Type_UW);
8800
8756
auto addDst = builder.createDst(loadAddress->getRegVar(), 0, 0, 1, Type_UD);
8801
8757
auto addInst = builder.createBinOp(G4_add, g4::SIMD1, addDst,
8802
8758
addSrc0, addSrc1, InstOpt_WriteEnable, false);
@@ -8955,11 +8911,11 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
8955
8911
8956
8912
if (useLSC)
8957
8913
{
8958
- loadFromMemoryLSC(rtail, perThreadLoadStartGRF, numPerThreadGRF * builder.numEltPerGRF<Type_UD>() );
8914
+ loadFromMemoryLSC(rtail, perThreadLoadStartGRF, numPerThreadGRF);
8959
8915
}
8960
8916
else
8961
8917
{
8962
- loadFromMemory(rtail, perThreadLoadStartGRF, numPerThreadGRF * builder.numEltPerGRF<Type_UD>() );
8918
+ loadFromMemory(rtail, perThreadLoadStartGRF, numPerThreadGRF);
8963
8919
}
8964
8920
perThreadBB = kernel.fg.createNewBB();
8965
8921
perThreadBB->insert(perThreadBB->begin(), instBuffer.begin(), instBuffer.end());
@@ -9002,11 +8958,11 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
9002
8958
{
9003
8959
if (useLSC)
9004
8960
{
9005
- loadFromMemoryLSC(rtail, crossThreadLoadStartGRF, numCrossThreadDW );
8961
+ loadFromMemoryLSC(rtail, crossThreadLoadStartGRF, numCrossThreadGRF );
9006
8962
}
9007
8963
else
9008
8964
{
9009
- loadFromMemory(rtail, crossThreadLoadStartGRF, numCrossThreadDW );
8965
+ loadFromMemory(rtail, crossThreadLoadStartGRF, numCrossThreadGRF );
9010
8966
}
9011
8967
}
9012
8968
0 commit comments