@@ -14490,10 +14490,19 @@ void EmitPass::emitReductionTree( e_opcode op, VISA_Type type, CVariable* src, C
14490
14490
for( unsigned int i = 0; i < numIterations; i++ )
14491
14491
{
14492
14492
// Get alias for src0, src1, and dst based on offsets and SIMD size
14493
- auto* layerSrc0 = m_currShader->GetNewAlias( src, type, i * 2 * layerMaxSimdLanes * m_encoder->GetCISADataTypeSize( type ), layerMaxSimdLanes );
14494
- auto* layerSrc1 = m_currShader->GetNewAlias( src, type, ( i * 2 * layerMaxSimdLanes + src1Offset ) * m_encoder->GetCISADataTypeSize( type ), layerMaxSimdLanes );
14495
- auto* layerDst = m_currShader->GetNewAlias( src, type, i * layerMaxSimdLanes * m_encoder->GetCISADataTypeSize( type ), layerMaxSimdLanes );
14496
-
14493
+ auto* layerSrc0 = m_currShader->GetNewAlias( src, type, i * 2 * layerMaxSimdLanes * m_encoder->GetCISADataTypeSize( type ), layerMaxSimdLanes, false );
14494
+ auto* layerSrc1 = m_currShader->GetNewAlias( src, type, ( i * 2 * layerMaxSimdLanes + src1Offset ) * m_encoder->GetCISADataTypeSize( type ), layerMaxSimdLanes, false );
14495
+ CVariable* layerDst;
14496
+ if( (srcElementCount >> 1 <= dst->GetNumberElement()) && (i + 1 == numIterations ))
14497
+ {
14498
+ // Final layer, use destination of WaveAll vector intrinsic inst (passed in with correct offset)
14499
+ layerDst = dst;
14500
+ }
14501
+ else
14502
+ {
14503
+ // Use src as workspace to store intermediate values
14504
+ layerDst = m_currShader->GetNewAlias( src, type, i * layerMaxSimdLanes * m_encoder->GetCISADataTypeSize( type ), layerMaxSimdLanes, false );
14505
+ }
14497
14506
if( !int64EmulationNeeded )
14498
14507
{
14499
14508
m_encoder->SetNoMask();
@@ -14522,13 +14531,6 @@ void EmitPass::emitReductionTree( e_opcode op, VISA_Type type, CVariable* src, C
14522
14531
srcElementCount >>= 1;
14523
14532
reductionElementCount >>= 1;
14524
14533
}
14525
-
14526
- // copy fully reduced elements from src to dst
14527
- auto* finalLayerDst = m_currShader->GetNewAlias( src, type, 0, dst->GetNumberElement() );
14528
- m_encoder->SetNoMask();
14529
- m_encoder->SetSimdSize( lanesToSIMDMode( dst->GetNumberElement() ) );
14530
- m_encoder->Copy( dst, finalLayerDst );
14531
- m_encoder->Push();
14532
14534
}
14533
14535
14534
14536
// Recursive function that emits one or more joint reduction trees based on the joint output width
@@ -14542,8 +14544,8 @@ void EmitPass::emitReductionTrees( e_opcode op, VISA_Type type, SIMDMode simdMod
14542
14544
// Do full tree reduction
14543
14545
unsigned int reductionElements = src->GetNumberElement() / dst->GetNumberElement();
14544
14546
unsigned int groupReductionElementCount = reductionElements * simdLanes;
14545
- CVariable* srcAlias = m_currShader->GetNewAlias( src, type, startIdx * reductionElements * m_encoder->GetCISADataTypeSize( type ), groupReductionElementCount );
14546
- CVariable* dstAlias = m_currShader->GetNewAlias( dst, type, startIdx * m_encoder->GetCISADataTypeSize( type ), simdLanes);
14547
+ CVariable* srcAlias = m_currShader->GetNewAlias( src, type, startIdx * reductionElements * m_encoder->GetCISADataTypeSize( type ), groupReductionElementCount, false );
14548
+ CVariable* dstAlias = m_currShader->GetNewAlias( dst, type, startIdx * m_encoder->GetCISADataTypeSize( type ), simdLanes, false );
14547
14549
emitReductionTree( op, type, srcAlias, dstAlias );
14548
14550
// Start new recursive tree if any elements are left
14549
14551
if ( numGroups > simdLanes )
@@ -23010,13 +23012,13 @@ void EmitPass::emitWaveAll(llvm::GenIntrinsicInst* inst)
23010
23012
for( uint16_t i = 0; i < dst->GetNumberElement(); i++ )
23011
23013
{
23012
23014
// Prepare reduceSrc
23013
- CVariable* srcAlias = m_currShader->GetNewAlias( src, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ) );
23014
- CVariable* reduceSrcAlias = m_currShader->GetNewAlias( reduceSrc, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ) );
23015
+ CVariable* srcAlias = m_currShader->GetNewAlias( src, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ), false );
23016
+ CVariable* reduceSrcAlias = m_currShader->GetNewAlias( reduceSrc, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ), false );
23015
23017
ScanReducePrepareSrc( type, identity, false, false, srcAlias, reduceSrcAlias );
23016
23018
23017
23019
// Prepare reduceSrcSecondHalf
23018
- CVariable* srcSecondHalfAlias = m_currShader->GetNewAlias( src, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ) );
23019
- CVariable* reduceSrcSecondHalfAlias = m_currShader->GetNewAlias( reduceSrcSecondHalf, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ) );
23020
+ CVariable* srcSecondHalfAlias = m_currShader->GetNewAlias( src, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ), false );
23021
+ CVariable* reduceSrcSecondHalfAlias = m_currShader->GetNewAlias( reduceSrcSecondHalf, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ), false );
23020
23022
ScanReducePrepareSrc( type, identity, false, true, srcSecondHalfAlias, reduceSrcSecondHalfAlias );
23021
23023
23022
23024
// Emit correct operations
0 commit comments