Skip to content

Commit dd18534

Browse files
DianaChen authored and igcbot committed
Revert "Cross-thread size should be 32-byte aligned instead of GRF size aligned (4th try)"
Revert "Cross-thread size should be 32-byte aligned instead of GRF size aligned (4th try)"
1 parent 55e74dc commit dd18534

File tree

2 files changed

+41
-86
lines changed

2 files changed

+41
-86
lines changed

IGC/AdaptorOCL/OCL/sp/sp_g8.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1950,9 +1950,8 @@ RETVAL CGen8OpenCLStateProcessor::CreatePatchList(
19501950
annotations.m_PrivateMemSize->Offset + iOpenCL::DATA_PARAMETER_DATA_SIZE );
19511951
}
19521952

1953-
// Payload must be a multiple of 32 bytes
1954-
// This assumption has to be the same as in vISA::Optimizer::loadThreadPayload
1955-
dataParameterStreamSize += GetAlignmentOffset(dataParameterStreamSize, 32);
1953+
// Payload must be a multiple of a GRF register
1954+
dataParameterStreamSize += GetAlignmentOffset(dataParameterStreamSize, CPlatform(m_Platform).getGRFSize());
19561955

19571956
if( retValue.Success )
19581957
{

visa/Optimizer.cpp

Lines changed: 39 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -8572,50 +8572,34 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
85728572
const bool useInlineData = builder.getOption(vISA_useInlineData);
85738573

85748574
// preparation of thread payload size and start offsets
8575-
8576-
// Payload in Memory Payload in GRF (T0)
8577-
// (Prepared by Runtime)
8578-
// (Does not contain inlineData)
8579-
// ----------------------- R1 ----------------------- <-- perThreadLoadStartGRF
8580-
// | cross thread data | \ | per thread data T0 |
8581-
// | | numCrossThreadDW R4 -----------------------
8582-
// | | / | inline data |
8583-
// ----------------------- <-- localIDsOffset | (if enable) |
8584-
// | per thread data T0 | R5 ----------------------- <-- crossThreadLoadStart, crossThreadLoadStartGRF
8585-
// ----------------------- | cross thread data | \
8586-
// | per thread data T1 | | | numCrossThreadDW
8587-
// ----------------------- | | /
8588-
// | ... | -----------------------
8589-
// -----------------------
8590-
85918575
const uint32_t perThreadLoadStartGRF = kernel.getOptions()->getuInt32Option(vISA_loadThreadPayloadStartReg);
85928576
int PTIS = kernel.getInt32KernelAttr(Attributes::ATTR_PerThreadInputSize);
85938577
uint32_t numPerThreadGRF = PTIS / kernel.numEltPerGRF<Type_UB>();
85948578
uint32_t crossThreadLoadStart = 0; // register file (grf) offset in byte
85958579
uint32_t crossThreadLoadStartGRF = 0; // grf number
85968580
// cross thread size (not including inlinedata size and alignment)
85978581
const uint32_t loadedCrossThreadInputSize = findLoadedInputSize(crossThreadLoadStart);
8598-
// final cross thread size to be loaded as number of DW (including alignment)
8599-
uint32_t numCrossThreadDW = 0;
8582+
// final cross thread size to be loaded
8583+
uint32_t numCrossThreadGRF = 0;
86008584
// payload memory offset of where local id should be loaded from
86018585
uint32_t localIDsOffset = 0;
86028586
int CTIS = kernel.getInt32KernelAttr(Attributes::ATTR_CrossThreadInputSize);
86038587
if (CTIS < 0)
86048588
{
86058589
// per-thread payload vars
86068590
// N = inlinedata size
8607-
// Cross thread data size is aligned to 32byte,
8591+
// Payload is aligned to grf size,
86088592
// if inlinedata is used, runtime puts first N bytes of payload in inlinedata.
86098593
// Rest of payload is shifted in the buffer by N bytes.
86108594
// So payload args which start at N offset, now start at 0 offset.
86118595
// Because of this we need to calculate localID offset:
86128596
const uint32_t inlineDataSize = builder.getInlineDataSize();
86138597
uint32_t correction = useInlineData ? inlineDataSize : 0;
8614-
localIDsOffset = AlignUp(loadedCrossThreadInputSize + correction, 32);
8598+
localIDsOffset = AlignUp(loadedCrossThreadInputSize + correction, kernel.getGRFSize());
86158599
localIDsOffset -= useInlineData ? inlineDataSize : 0;
86168600

86178601
// cross-thread payload vars
8618-
numCrossThreadDW = AlignUp(loadedCrossThreadInputSize, 32) / TypeSize(Type_UD);
8602+
numCrossThreadGRF = AlignUp(loadedCrossThreadInputSize, kernel.getGRFSize()) / kernel.numEltPerGRF<Type_UB>();
86198603
crossThreadLoadStartGRF = crossThreadLoadStart / kernel.getGRFSize();
86208604
}
86218605
else
@@ -8625,13 +8609,13 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
86258609
localIDsOffset -= useInlineData ? kernel.getGRFSize() : 0;
86268610

86278611
// cross-thread payload vars
8628-
numCrossThreadDW = CTIS / TypeSize(Type_UD);
8612+
numCrossThreadGRF = CTIS / kernel.numEltPerGRF<Type_UB>();
86298613
crossThreadLoadStartGRF = perThreadLoadStartGRF + numPerThreadGRF;
86308614
if (useInlineData)
86318615
{
86328616
// first GRF of cross-thread data is already loaded
86338617
crossThreadLoadStartGRF++;
8634-
numCrossThreadDW -= builder.getInlineDataSize() / TypeSize(Type_UD);
8618+
numCrossThreadGRF--;
86358619
}
86368620
}
86378621

@@ -8661,21 +8645,18 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
86618645

86628646
// load <numGRF> GRFs from the address "loadAddress", starting from <startGRF>
86638647
auto loadFromMemory = [this, &instBuffer, getHWordBlockEncoding](
8664-
G4_Declare* loadAddress, uint32_t startGRF, uint32_t numTotalDW)
8648+
G4_Declare* loadAddress, uint32_t startGRF, uint32_t numGRF)
86658649
{
8666-
for (uint32_t numRemainingDW = numTotalDW, nextGRF = startGRF; numRemainingDW > 0; /* updated in body */)
8650+
bool useHword = builder.hasHWordBlockLoad();
8651+
for (int numRemaining = numGRF, nextGRF = startGRF; numRemaining > 0; /* updated in body */)
86678652
{
8668-
// can load 4, 2 or 1 grf per send.
8669-
// Still load 1 GRF if the remainingDW is less than 1 GRF. The additional bytes being loaded won't be used.
8670-
uint32_t DWin4GRF = 4 * builder.numEltPerGRF<Type_UD>();
8671-
uint32_t DWin2GRF = DWin4GRF / 2;
8672-
uint32_t DWin1GRF = DWin2GRF / 2;
8673-
uint32_t numGRFToLoad =
8674-
numRemainingDW >= DWin4GRF ? 4 : // 4 GRF
8675-
numRemainingDW >= DWin2GRF ? 2 : // 2 GRF
8676-
1; // 1 GRF or less than 1 GRF
8677-
8678-
bool useHword = builder.hasHWordBlockLoad();
8653+
int numGRFToLoad = numRemaining > 2 ? 4 : numRemaining;
8654+
if (numRemaining == 3)
8655+
{
8656+
// we can't do 4GRF load since it may overwrite values pushed from inline data,
8657+
// break load to 2+1 instead
8658+
numGRFToLoad = 2;
8659+
}
86798660
uint32_t numElts = (numGRFToLoad * kernel.getGRFSize()) / (useHword ? 32 : 16);
86808661
uint32_t dataBlocks = useHword ? getHWordBlockEncoding(numElts) :
86818662
(numElts == 2 ? 2 : (numElts == 4 ? 3 : 4));
@@ -8690,11 +8671,9 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
86908671
auto sendInst = builder.createSendInst(nullptr, G4_send, g4::SIMD8, sendDst, sendSrc,
86918672
builder.createImm(msgDescVal, Type_UD), InstOpt_WriteEnable, desc, true);
86928673
instBuffer.push_back(sendInst);
8693-
if (numRemainingDW < DWin1GRF)
8694-
break;
8695-
numRemainingDW -= numGRFToLoad * builder.numEltPerGRF<Type_UD>();
8674+
numRemaining -= numGRFToLoad;
86968675
nextGRF += numGRFToLoad;
8697-
if (numRemainingDW > 0)
8676+
if (numRemaining > 0)
86988677
{
86998678
// advance the address offset
87008679
// (W) add (1) loadAddress.2 loadAddress.2 numGRFToLoad*32
@@ -8708,36 +8687,18 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
87088687
}
87098688
}
87108689
};
8711-
8712-
// a helper function for loadFromMemoryLSC to get the max DW number which can fulfill
8713-
// LSC element number
8714-
auto getMaxNumDWforLscElementRequirement = [this](uint32_t numDW) {
8715-
if (builder.lscGetElementNum(numDW) != LSC_DATA_ELEMS_INVALID)
8716-
return numDW;
8717-
if (numDW > builder.numEltPerGRF<Type_UD>()) {
8718-
if (numDW > 64) return (uint32_t)64;
8719-
else if (numDW > 32) return (uint32_t)32;
8720-
else if (numDW > 16) return (uint32_t)16;
8721-
else if (numDW > 8) return (uint32_t)8;
8722-
assert(0 && "unreachable");
8723-
}
8724-
// when the numDW is less than 1 grf, we want to load all within one send
8725-
// The additional bytes being loaded won't be used so should be fine
8726-
if (numDW < 2) return (uint32_t)2;
8727-
else if (numDW < 4) return (uint32_t)4;
8728-
else if (numDW < 8) return (uint32_t)8;
8729-
else if (numDW < 16) return (uint32_t)16;
8730-
assert(0 && "unreachable");
8731-
return (uint32_t)0;
8732-
};
8733-
8734-
auto loadFromMemoryLSC = [this, &instBuffer, &getMaxNumDWforLscElementRequirement](
8735-
G4_Declare* loadAddress, uint32_t startGRF, uint32_t numTotalDW)
8690+
auto loadFromMemoryLSC = [this, &instBuffer](
8691+
G4_Declare* loadAddress, uint32_t startGRF, uint32_t numGRF)
87368692
{
87378693
const auto ADDR_TYPE = LSC_ADDR_TYPE_BTI;
87388694

8739-
for (uint32_t numRemainingDW = numTotalDW, nextGRF = startGRF; numRemainingDW > 0; /* updated in body */)
8695+
for (int numRemaining = numGRF, nextGRF = startGRF; numRemaining > 0; /* updated in body */)
87408696
{
8697+
int numGRFToLoad =
8698+
numRemaining > 4 ? 4 :
8699+
numRemaining == 3 ? 2 : // split to 2+1
8700+
numRemaining; // 2 or 1
8701+
87418702
// Generate an A32 transpose LSC load to BTI 255. size is d32x{16/32}t
87428703
LSC_OP op = LSC_LOAD;
87438704
LSC_SFID lscSfid = LSC_UGM;
@@ -8748,16 +8709,15 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
87488709
addrInfo.immScale = 1;
87498710
addrInfo.immOffset = 0;
87508711
addrInfo.size = LSC_ADDR_SIZE_32b;
8751-
8712+
auto numDW = numGRFToLoad * (kernel.getGRFSize() / 4);
87528713
LSC_DATA_SHAPE dataShape { };
87538714
dataShape.size = LSC_DATA_SIZE_32b; //in the unit of 32b
87548715
dataShape.order = LSC_DATA_ORDER_TRANSPOSE;
8755-
uint32_t numDWToLoad = getMaxNumDWforLscElementRequirement(numRemainingDW);
8756-
dataShape.elems = builder.lscGetElementNum(numDWToLoad);
8716+
dataShape.elems = builder.lscGetElementNum(numDW);
87578717

87588718
G4_Imm* surfaceBTI = builder.createImm(255, Type_UW);
87598719

8760-
auto sendDstDcl = builder.createHardwiredDeclare(numDWToLoad, Type_UD, nextGRF, 0);
8720+
auto sendDstDcl = builder.createHardwiredDeclare(numDW, Type_UD, nextGRF, 0);
87618721
auto dstRead = builder.createDstRegRegion(sendDstDcl, 1);
87628722
auto src0Addr = builder.createSrcRegRegion(loadAddress, builder.getRegionStride1()); // address base
87638723

@@ -8769,7 +8729,7 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
87698729
addrInfo,
87708730
dataShape,
87718731
surfaceBTI,
8772-
numDWToLoad < builder.numEltPerGRF<Type_UD>() ? 1 : numDWToLoad / builder.numEltPerGRF<Type_UD>(),
8732+
numGRFToLoad,
87738733
1);
87748734

87758735
G4_InstSend *sendInst = builder.createLscSendInst(
@@ -8784,19 +8744,15 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
87848744
true);
87858745

87868746
instBuffer.push_back(sendInst);
8787-
// we pick to load all data within one send in getMaxNumDWforLscElementRequirement if
8788-
// numRemainingDW is less than one grf. All should be loaded at this point.
8789-
if (numRemainingDW < builder.numEltPerGRF<Type_UD>())
8790-
break;
8791-
numRemainingDW -= numDWToLoad;
8792-
nextGRF += numDWToLoad / builder.numEltPerGRF<Type_UD>();
8793-
bool advanceLoadAddress = numRemainingDW > 0;
8747+
numRemaining -= numGRFToLoad;
8748+
nextGRF += numGRFToLoad;
8749+
bool advanceLoadAddress = numRemaining > 0;
87948750
if (advanceLoadAddress)
87958751
{
87968752
// advance the address offset
87978753
// (W) add (1) loadAddress.0 loadAddress.0 numGRFToLoad*32
87988754
auto addSrc0 = builder.createSrcRegRegion(loadAddress, builder.getRegionScalar());
8799-
auto addSrc1 = builder.createImm(numDWToLoad * TypeSize(Type_UD), Type_UW);
8755+
auto addSrc1 = builder.createImm(numGRFToLoad * kernel.numEltPerGRF<Type_UB>(), Type_UW);
88008756
auto addDst = builder.createDst(loadAddress->getRegVar(), 0, 0, 1, Type_UD);
88018757
auto addInst = builder.createBinOp(G4_add, g4::SIMD1, addDst,
88028758
addSrc0, addSrc1, InstOpt_WriteEnable, false);
@@ -8955,11 +8911,11 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
89558911

89568912
if (useLSC)
89578913
{
8958-
loadFromMemoryLSC(rtail, perThreadLoadStartGRF, numPerThreadGRF * builder.numEltPerGRF<Type_UD>());
8914+
loadFromMemoryLSC(rtail, perThreadLoadStartGRF, numPerThreadGRF);
89598915
}
89608916
else
89618917
{
8962-
loadFromMemory(rtail, perThreadLoadStartGRF, numPerThreadGRF * builder.numEltPerGRF<Type_UD>());
8918+
loadFromMemory(rtail, perThreadLoadStartGRF, numPerThreadGRF);
89638919
}
89648920
perThreadBB = kernel.fg.createNewBB();
89658921
perThreadBB->insert(perThreadBB->begin(), instBuffer.begin(), instBuffer.end());
@@ -9002,11 +8958,11 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
90028958
{
90038959
if (useLSC)
90048960
{
9005-
loadFromMemoryLSC(rtail, crossThreadLoadStartGRF, numCrossThreadDW);
8961+
loadFromMemoryLSC(rtail, crossThreadLoadStartGRF, numCrossThreadGRF);
90068962
}
90078963
else
90088964
{
9009-
loadFromMemory(rtail, crossThreadLoadStartGRF, numCrossThreadDW);
8965+
loadFromMemory(rtail, crossThreadLoadStartGRF, numCrossThreadGRF);
90108966
}
90118967
}
90128968

0 commit comments

Comments
 (0)