Skip to content

Commit

Permalink
Update gpurt from commit 80269d10
Browse files Browse the repository at this point in the history
Remove unused path from BuildParallel
Set triangle 1 bits in triangle ID only when it is a pair compressed triangle
[Continuations] Change static ID handling for continuations
Fix TDR in rayquery apps when RDP attaches
Deprecate CmdBarrier() path
Rename BuildBVHPLOC to BuildPLOC
Fix cmake defines and includes for validation
Add deviceSetting to disable compaction
Enable GPU debugging in all build stages
Move cpp-shared raytracingdefs.h into non-shared ShaderDefs.hlsli
Reduce CopyAS.hlsl dependencies
Move bit-related utils from Math to Bits.hlsli
Replace bufferView with typed and untyped BufferView
Continuations persistent launch support
[Continuations] Fix legacy compilation
Initialize parentId to -1 for ray history counter
Skip redundant copy in merge-sort iteration
Copy Indirect Args in InitExecuteIndirect
[Continuations] Remove redundant repacking of constant known ray flags
[Continuations] Add options to override TraceRay flags
Fix barrier corner cases
Softcode validation file extension and directory
[Continuations] Remove stack lowering guard
Add spirv pass to validation of clean shaders
Do not limit number of waves per simd for the encode path
Recompute the dispatchID when threadGroupSize != 32
Separate merge sort local/global dispatches
  • Loading branch information
qiaojbao committed Sep 30, 2024
1 parent cf31636 commit 95c27c4
Show file tree
Hide file tree
Showing 60 changed files with 2,679 additions and 2,190 deletions.
104 changes: 34 additions & 70 deletions backends/pal/gpurtPalBackend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,9 @@ uint32 PalBackend::GetMaxDescriptorTableSize(
ClientCmdBufferHandle cmdBuffer
) const
{
const uint32 bufferSrdSizeDw = m_deviceProperties.gfxipProperties.srdSizes.bufferView / sizeof(uint32);
const uint32 bufferSrdSizeDw = Util::Max(m_deviceProperties.gfxipProperties.srdSizes.typedBufferView,
m_deviceProperties.gfxipProperties.srdSizes.untypedBufferView) /
sizeof(uint32);
return GetCmdBuffer(cmdBuffer)->GetLargeEmbeddedDataLimit() / bufferSrdSizeDw;
}

Expand Down Expand Up @@ -239,81 +241,39 @@ void PalBackend::InsertBarrier(
const bool syncPostCpWrite = flags & BarrierFlagSyncPostCpWrite;

Pal::ICmdBuffer* pCmdBuffer = GetCmdBuffer(cmdBuffer);
if (m_deviceSettings.enableAcquireReleaseInterface)
{
Pal::AcquireReleaseInfo acqRelInfo = {};
Pal::MemBarrier memoryBarrier = {};

if (syncDispatch || syncIndirectArgs)
{
memoryBarrier.srcStageMask = Pal::PipelineStageCs;
memoryBarrier.srcAccessMask = Pal::CoherShader;
}

if (syncPostCpWrite)
{
memoryBarrier.srcStageMask |= Pal::PipelineStagePostPrefetch;
memoryBarrier.srcAccessMask |= Pal::CoherCp;
}

if (syncDispatch || syncPostCpWrite)
{
memoryBarrier.dstStageMask = Pal::PipelineStageCs;
memoryBarrier.dstAccessMask = Pal::CoherShader;
}

if (syncIndirectArgs)
{
memoryBarrier.dstStageMask |= Pal::PipelineStageFetchIndirectArgs;
memoryBarrier.dstAccessMask |= Pal::CoherIndirectArgs;
}

acqRelInfo.memoryBarrierCount = 1;
acqRelInfo.pMemoryBarriers = &memoryBarrier;
acqRelInfo.reason = m_deviceSettings.rgpBarrierReason;

pCmdBuffer->CmdReleaseThenAcquire(acqRelInfo);
}
else
{
Pal::BarrierInfo barrierInfo = {};

const uint32 pipePointCount = (syncDispatch || syncIndirectArgs) ? 1 : 0;
Pal::HwPipePoint pipePoint = Pal::HwPipePostCs;
Pal::AcquireReleaseInfo acqRelInfo = {};
Pal::MemBarrier memoryBarrier = {};

Pal::BarrierTransition transition = {};

if (syncDispatch)
{
transition.srcCacheMask = Pal::CoherShader;
}

if (syncPostCpWrite)
{
transition.srcCacheMask |= Pal::CoherCp;
}
if (syncDispatch || syncIndirectArgs)
{
memoryBarrier.srcStageMask = Pal::PipelineStageCs;
memoryBarrier.srcAccessMask = Pal::CoherShader;
}

if (syncDispatch || syncPostCpWrite)
{
barrierInfo.waitPoint = Pal::HwPipePreCs;
transition.dstCacheMask = Pal::CoherShader;
}
if (syncPostCpWrite)
{
memoryBarrier.srcStageMask |= Pal::PipelineStagePostPrefetch;
memoryBarrier.srcAccessMask |= Pal::CoherCp;
}

if (syncIndirectArgs)
{
barrierInfo.waitPoint = Pal::HwPipeTop;
transition.dstCacheMask |= Pal::CoherIndirectArgs;
}
if (syncDispatch || syncPostCpWrite)
{
memoryBarrier.dstStageMask = Pal::PipelineStageCs;
memoryBarrier.dstAccessMask = Pal::CoherShader;
}

barrierInfo.pipePointWaitCount = pipePointCount;
barrierInfo.pPipePoints = &pipePoint;
barrierInfo.transitionCount = 1;
barrierInfo.pTransitions = &transition;
if (syncIndirectArgs)
{
memoryBarrier.dstStageMask |= Pal::PipelineStageFetchIndirectArgs;
memoryBarrier.dstAccessMask |= Pal::CoherIndirectArgs;
}

barrierInfo.reason = m_deviceSettings.rgpBarrierReason;
acqRelInfo.memoryBarrierCount = 1;
acqRelInfo.pMemoryBarriers = &memoryBarrier;
acqRelInfo.reason = m_deviceSettings.rgpBarrierReason;

pCmdBuffer->CmdBarrier(barrierInfo);
}
pCmdBuffer->CmdReleaseThenAcquire(acqRelInfo);
}

// =====================================================================================================================
Expand All @@ -324,7 +284,11 @@ void PalBackend::CreateBufferViewSrds(
bool isTyped
) const
{
const uint32 bufferSrdSizeDw = m_deviceProperties.gfxipProperties.srdSizes.bufferView / sizeof(uint32);
const uint32 bufferSrdSizeDw = ((isTyped) ?
m_deviceProperties.gfxipProperties.srdSizes.typedBufferView :
m_deviceProperties.gfxipProperties.srdSizes.untypedBufferView)
/ sizeof(uint32);

const Pal::BufferViewInfo palBufferViewInfo = ConvertBufferViewToPalBufferView(bufferViewInfo);
const void* pNullBuffer = m_deviceProperties.gfxipProperties.nullSrds.pNullBufferView;

Expand Down
11 changes: 10 additions & 1 deletion gpurt/gpurt.h
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,10 @@ enum class InternalRayTracingCsType : uint32
BuildBVH,
BuildBVHTD,
BuildBVHTDTR,
BuildBVHPLOC,
BuildPLOC,
#if GPURT_CLIENT_INTERFACE_MAJOR_VERSION < 52
BuildBVHPLOC = BuildPLOC,
#endif
UpdateQBVH,
UpdateParallel,
RefitBounds,
Expand All @@ -311,6 +314,9 @@ enum class InternalRayTracingCsType : uint32
InitExecuteIndirect,
PairCompression,
MergeSort,
MergeSortLocal,
MergeSortGlobalIteration,
MergeSortCopyLastLevel,
UpdateTriangles,
UpdateAabbs,
InitAccelerationStructure,
Expand Down Expand Up @@ -753,7 +759,9 @@ struct DeviceSettings
uint32 enableParallelUpdate : 1;
uint32 enableParallelBuild : 1;
uint32 enablePrefixScanDLB : 1;
#if GPURT_CLIENT_INTERFACE_MAJOR_VERSION < 51
uint32 enableAcquireReleaseInterface : 1;
#endif
uint32 enableBuildAccelStructDumping : 1;
uint32 enableBuildAccelStructScratchDumping : 1;
uint32 enableBuildAccelStructStats : 1;
Expand All @@ -779,6 +787,7 @@ struct DeviceSettings

uint32 enableRemapScratchBuffer : 1; // Enable remapping bvh2 data from ScratchBuffer to ResultBuffer
uint32 checkBufferOverlapsInBatch : 1;
uint32 disableCompaction : 1; // Reports and perform copy instead of compaction
};

uint64 accelerationStructureUUID; // Acceleration Structure UUID
Expand Down
8 changes: 4 additions & 4 deletions gpurt/gpurtBuildSettings.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ struct CompileTimeBuildSettings
uint32 enableTopDownBuild;
uint32 useMortonCode30;
uint32 enableMergeSort;
uint32 fastBuildThreshold;
uint32 unused14;
uint32 enableFusedInstanceNode;
float tsPriority;
uint32 numRebraidIterations;
Expand Down Expand Up @@ -99,7 +99,7 @@ struct CompileTimeBuildSettings
uint32 unused11;
uint32 unused12;
uint32 unused13;
uint32 rebuildAccelStruct;
uint32 disableCompaction;
};

#define BUILD_SETTINGS_DATA_TOP_LEVEL_BUILD_ID 0
Expand All @@ -119,7 +119,7 @@ struct CompileTimeBuildSettings
#define BUILD_SETTINGS_DATA_ENABLE_TOP_DOWN_BUILD_ID 14
#define BUILD_SETTINGS_DATA_USE_MORTON_CODE_30_ID 15
#define BUILD_SETTINGS_DATA_ENABLE_MERGE_SORT_ID 16
#define BUILD_SETTINGS_DATA_FAST_BUILD_THRESHOLD_ID 17
// unused14 id 17
#define BUILD_SETTINGS_DATA_ENABLE_FUSED_INSTANCE_NODE_ID 18
#define BUILD_SETTINGS_DATA_TS_PRIORITY_ID 19
#define BUILD_SETTINGS_DATA_NUM_REBRAID_ITERATIONS_ID 20
Expand All @@ -135,7 +135,7 @@ struct CompileTimeBuildSettings
#define BUILD_SETTINGS_DATA_ENCODE_ARRAY_OF_POINTERS_ID 41
#define BUILD_SETTINGS_DATA_SCENE_BOUNDS_CALCULATION_TYPE_ID 42
#define BUILD_SETTINGS_DATA_REBRAID_QUALITY_HEURISTIC_ID 43
#define BUILD_SETTINGS_DATA_REBUILD_ACCELERATION_STRUCTURE_ID 47
#define BUILD_SETTINGS_DATA_DISABLE_COMPACTION_ID 47

#ifdef __cplusplus
} // namespace GpuRt
Expand Down
6 changes: 3 additions & 3 deletions gpurt/gpurtDispatch.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,11 @@ struct DispatchRaysConstantData
uint32 missTableBaseAddressLo; // Miss shader table base address low 32-bits
uint32 missTableBaseAddressHi; // Miss shader table base address high 32-bits
uint32 missTableStrideInBytes; // Miss shader table record byte stride
uint32 reserved0; // Reserved padding
uint32 rayDispatchMaxGroups; // Max groups dispatched if persistent launch is enabled, else 0
uint32 hitGroupTableBaseAddressLo; // Hit group table base address low 32-bits
uint32 hitGroupTableBaseAddressHi; // Hit group table base address high 32-bits
uint32 hitGroupTableStrideInBytes; // Hit group table record byte stride
uint32 reserved1; // Reserved padding
uint32 reserved0; // Reserved padding
uint32 callableTableBaseAddressLo; // Callable shader table base address low 32-bits
uint32 callableTableBaseAddressHi; // Callable shader table base address high 32-bits
uint32 callableTableStrideInBytes; // Callable shader table byte stride
Expand Down Expand Up @@ -146,6 +146,7 @@ struct InitExecuteIndirectConstants
uint32 rtThreadGroupSizeX; // Internal RT threadgroup size X
uint32 rtThreadGroupSizeY; // Internal RT threadgroup size Y
uint32 rtThreadGroupSizeZ; // Internal RT threadgroup size Z
uint32 rayDispatchMaxGroups; // Max groups dispatched if persistent launch is enabled, else 0
uint32 counterMask; // Mask for filtering ray history token
uint32 pipelineCount; // Number of pipelines to launch (1 for indirect launch, raygen count for unified)
uint32 maxIterations; // Max traversal interations for profiling
Expand All @@ -160,7 +161,6 @@ struct InitExecuteIndirectConstants
uint32 counterRayIdRangeEnd; // Counter ray ID range end
uint32 cpsBackendStackSize; // Scratch memory used by a compiler backend, start at offset 0
uint32 padding0; // Padding for 16-byte alignment
uint32 padding1; // Padding for 16-byte alignment

#if __cplusplus
// Internal counter buffer SRDs
Expand Down
2 changes: 1 addition & 1 deletion gpurt/gpurtLib.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ namespace GpuRt
// update their definition of GPURT_CLIENT_INTERFACE_MAJOR_VERSION to indicate that they have made the required changes
// to support a new version. When the client version is updated, the old interface will be compiled out and only the
// new one will remain.
#define GPURT_INTERFACE_MAJOR_VERSION 49
#define GPURT_INTERFACE_MAJOR_VERSION 52

#if GPURT_CLIENT_INTERFACE_MAJOR_VERSION < 44
// Minor interface version. This number is incremented when a compatible interface change is made. Compatible changes
Expand Down
84 changes: 76 additions & 8 deletions src/gpurtBvhBatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,14 @@ void BvhBatcher::BuildRaytracingAccelerationStructureBatch(
}
if ((updaters.IsEmpty() == false) || (builders.IsEmpty() == false))
{
Barrier();
uint32 barrierFlags = BarrierFlagSyncDispatch;
if (updaters.IsEmpty() == false)
{
// Updates can be launched with indirect dispatch. We need to avoid fetching the indirect arguments
// from the header before they are written by a previous build/update/copy.
barrierFlags |= BarrierFlagSyncIndirectArg;
}
Barrier(barrierFlags);
}
RGP_POP_MARKER();

Expand Down Expand Up @@ -304,11 +311,72 @@ void BvhBatcher::BuildMultiDispatch(Util::Span<BvhBuilder> builders)
if (PhaseEnabled(BuildPhaseFlags::MergeSort))
{
Barrier();
const uint32 wavesPerSimd = builders.size() == 1 ? 16U : 2U;
BuildFunction(BuildPhaseFlags::MergeSort, builders, [wavesPerSimd](BvhBuilder& builder)

if (builders.size() > 1)
{
builder.MergeSort(wavesPerSimd);
});
const uint32 wavesPerSimd = 2U;
BuildFunction(BuildPhaseFlags::MergeSort, builders, [wavesPerSimd](BvhBuilder& builder)
{
builder.MergeSort(wavesPerSimd);
});
}
else
{
RGP_PUSH_MARKER("Merge Sort");

// Batch local sorts together.
BuildPhase("Merge Sort (Local)", builders, &BvhBuilder::MergeSortLocal);

Barrier();

// Batch global sort iterations together. Compute max iterations amongst the builder batch
uint32 maxMergeSortTreeLevel = 0;

bool batchNeedsLastLevelCopy = false;

for (const auto& builder : builders)
{
const uint32 mergeSortTreeLevel = builder.GetMaxMergeSortTreeLevel();
maxMergeSortTreeLevel = Util::Max(maxMergeSortTreeLevel, mergeSortTreeLevel);
batchNeedsLastLevelCopy |= ((mergeSortTreeLevel & 1) == 1);
}

if (maxMergeSortTreeLevel > 0)
{
RGP_PUSH_MARKER("Merge Sort (Global Iteration)");
for (uint32 level = 1; level <= maxMergeSortTreeLevel; level++)
{
Barrier();

BuildFunction(nullptr, builders, [level](BvhBuilder& builder)
{
if (level <= builder.GetMaxMergeSortTreeLevel())
{
builder.MergeSortGlobalIteration(level);
}
});
}
RGP_POP_MARKER();

if (batchNeedsLastLevelCopy)
{
Barrier();

RGP_PUSH_MARKER("Merge Sort (Copy Last Level)");
BuildFunction(nullptr, builders, [](BvhBuilder& builder)
{
const uint32 mergeSortTreeLevel = builder.GetMaxMergeSortTreeLevel();
if ((mergeSortTreeLevel & 1) == 1)
{
builder.MergeSortCopyLastLevel();
}
});
RGP_POP_MARKER();
}
}

RGP_POP_MARKER();
}
}
if (PhaseEnabled(BuildPhaseFlags::RadixSort))
{
Expand All @@ -327,13 +395,13 @@ void BvhBatcher::BuildMultiDispatch(Util::Span<BvhBuilder> builders)
Barrier();
BuildPhase(BuildPhaseFlags::BuildFastAgglomerativeLbvh, builders, &BvhBuilder::BuildFastAgglomerativeLbvh);
}
if (PhaseEnabled(BuildPhaseFlags::BuildBVHPLOC))
if (PhaseEnabled(BuildPhaseFlags::BuildPLOC))
{
Barrier();
const uint32 wavesPerSimd = builders.size() == 1 ? 8U : 1U;
BuildFunction(BuildPhaseFlags::BuildBVHPLOC, builders, [wavesPerSimd](BvhBuilder& builder)
BuildFunction(BuildPhaseFlags::BuildPLOC, builders, [wavesPerSimd](BvhBuilder& builder)
{
builder.BuildBVHPLOC(wavesPerSimd);
builder.BuildPLOC(wavesPerSimd);
});
}
if (PhaseEnabled(BuildPhaseFlags::RefitBounds))
Expand Down
Loading

0 comments on commit 95c27c4

Please sign in to comment.