From 95c27c4031b112daaa231b76dee07f7ff05357d0 Mon Sep 17 00:00:00 2001 From: qiaojbao Date: Mon, 30 Sep 2024 11:46:47 +0800 Subject: [PATCH] Update gpurt from commit 80269d10 Remove unused path from BuildParallel Set triangle 1 bits in triangle ID only when it is a pair compressed triangle [Continuations] Change static ID handling for continuations Fix tdr in rayquery apps when rdp attaches Deprecate CmdBarrier() path rename BuildBVHPLOC to BuildPLOC Fix cmake defines and includes for validation Add deviceSetting to disable compaction Enable GPU debugging in all build stages Move cpp-shared raytracingdefs.h into non-shared ShaderDefs.hlsli Reduce CopyAS.hlsl dependencies Move bit-related utils from Math to Bits.hlsli Replace bufferView with typed and untyped BufferView Continuations persistent launch support [Continuations] Fix legacy compilation Initialize parentId to -1 for ray history counter Skip redundant copy in merge-sort iteration Copy Indirect Args in InitExecuteIndirect [Continuations] Remove redundant repacking of constant known ray flags [Continuations] Add options to override TraceRay flags Fix barrier corner cases Softcode validation file extension and directory [Continuations] Remove stack lowering guard Add spirv pass to validation of clean shaders Do not limit number of waves per simd for the encode path Recompute the dispatchID when threadGroupSize != 32 Separate merge sort local/global dispatches --- backends/pal/gpurtPalBackend.cpp | 104 +- gpurt/gpurt.h | 11 +- gpurt/gpurtBuildSettings.h | 8 +- gpurt/gpurtDispatch.h | 6 +- gpurt/gpurtLib.h | 2 +- src/gpurtBvhBatcher.cpp | 84 +- src/gpurtBvhBuilder.cpp | 107 +- src/gpurtBvhBuilder.h | 6 +- src/gpurtBvhBuilderCommon.h | 6 +- src/gpurtDevice.cpp | 13 +- src/gpurtInternal.h | 18 +- src/gpurtInternalShaders.cpp | 5 +- src/options.yaml | 17 + src/shaders/BuildBVHTDTR.hlsl | 138 ++ src/shaders/BuildCommon.hlsl | 31 +- src/shaders/BuildCommonScratch.hlsl | 11 +- 
src/shaders/BuildFastAgglomerativeLbvh.hlsl | 47 +- .../{BuildBVHPLOC.hlsl => BuildPLOC.hlsl} | 8 +- src/shaders/BuildParallel.hlsl | 100 +- src/shaders/BuildQBVH.hlsl | 17 +- src/shaders/BuildSettings.hlsli | 7 +- src/shaders/CMakeLists.txt | 15 +- src/shaders/Common.hlsl | 69 +- src/shaders/Continuations1_1.hlsl | 5 +- src/shaders/Continuations2_0.hlsl | 7 +- src/shaders/CopyAS.hlsl | 5 +- src/shaders/Debug.hlsl | 7 +- src/shaders/EncodeCommon.hlsl | 21 +- src/shaders/EncodeHwBvhCommon.hlsl | 5 +- src/shaders/EncodeNodes.hlsl | 12 +- src/shaders/EncodePairedTriangleImpl.hlsl | 33 +- src/shaders/EncodeTopLevel.hlsl | 2 +- src/shaders/EncodeTopLevelBuild.hlsl | 6 +- src/shaders/Extensions.hlsl | 175 +-- src/shaders/GpuRtLibrary.hlsl | 3 + src/shaders/GpuRtLibraryCont.hlsl | 306 +++- src/shaders/InitExecuteIndirect.hlsl | 20 +- src/shaders/MergeSort.hlsl | 318 +++- src/shaders/PairCompression.hlsl | 12 +- src/shaders/RayQuery.hlsl | 2 +- src/shaders/TaskQueueCounter.hlsl | 7 + src/shaders/TriangleSplitting.hlsl | 32 + src/shaders/Update.hlsl | 2 +- src/shadersClean/common/Bits.hlsli | 166 ++ src/shadersClean/common/BoundingBox.hlsli | 74 + src/shadersClean/common/Extensions.hlsli | 4 - src/shadersClean/common/InstanceDesc.hlsli | 51 + src/shadersClean/common/Math.hlsl | 3 + src/shadersClean/common/Math.hlsli | 142 +- src/shadersClean/common/NodePointers.hlsli | 82 + .../common/ScratchNode.hlsli} | 13 +- src/shadersClean/common/ShaderDefs.hlsli | 451 ++++++ src/shadersClean/common/TempAssert.hlsli | 38 + .../common/gfx10/BoxNode1_0.hlsli | 137 ++ .../common/gfx10/InstanceNode1_0.hlsli | 72 + .../common/gfx10/ProceduralNode1_0.hlsli | 56 + .../common/gfx10/TriangleNode1_0.hlsli | 82 + .../traversal/TraversalDefs.hlsli | 160 ++ src/shared/rayTracingDefs.h | 1389 +---------------- tools/CompileRTShaders.py | 139 +- 60 files changed, 2679 insertions(+), 2190 deletions(-) rename src/shaders/{BuildBVHPLOC.hlsl => BuildPLOC.hlsl} (99%) create mode 100644 
src/shadersClean/common/Bits.hlsli create mode 100644 src/shadersClean/common/BoundingBox.hlsli create mode 100644 src/shadersClean/common/InstanceDesc.hlsli create mode 100644 src/shadersClean/common/NodePointers.hlsli rename src/{shared/scratchNode.h => shadersClean/common/ScratchNode.hlsli} (97%) create mode 100644 src/shadersClean/common/TempAssert.hlsli create mode 100644 src/shadersClean/common/gfx10/BoxNode1_0.hlsli create mode 100644 src/shadersClean/common/gfx10/InstanceNode1_0.hlsli create mode 100644 src/shadersClean/common/gfx10/ProceduralNode1_0.hlsli create mode 100644 src/shadersClean/common/gfx10/TriangleNode1_0.hlsli create mode 100644 src/shadersClean/traversal/TraversalDefs.hlsli diff --git a/backends/pal/gpurtPalBackend.cpp b/backends/pal/gpurtPalBackend.cpp index bbae889..08f4571 100644 --- a/backends/pal/gpurtPalBackend.cpp +++ b/backends/pal/gpurtPalBackend.cpp @@ -167,7 +167,9 @@ uint32 PalBackend::GetMaxDescriptorTableSize( ClientCmdBufferHandle cmdBuffer ) const { - const uint32 bufferSrdSizeDw = m_deviceProperties.gfxipProperties.srdSizes.bufferView / sizeof(uint32); + const uint32 bufferSrdSizeDw = Util::Max(m_deviceProperties.gfxipProperties.srdSizes.typedBufferView, + m_deviceProperties.gfxipProperties.srdSizes.untypedBufferView) / + sizeof(uint32); return GetCmdBuffer(cmdBuffer)->GetLargeEmbeddedDataLimit() / bufferSrdSizeDw; } @@ -239,81 +241,39 @@ void PalBackend::InsertBarrier( const bool syncPostCpWrite = flags & BarrierFlagSyncPostCpWrite; Pal::ICmdBuffer* pCmdBuffer = GetCmdBuffer(cmdBuffer); - if (m_deviceSettings.enableAcquireReleaseInterface) - { - Pal::AcquireReleaseInfo acqRelInfo = {}; - Pal::MemBarrier memoryBarrier = {}; - - if (syncDispatch || syncIndirectArgs) - { - memoryBarrier.srcStageMask = Pal::PipelineStageCs; - memoryBarrier.srcAccessMask = Pal::CoherShader; - } - - if (syncPostCpWrite) - { - memoryBarrier.srcStageMask |= Pal::PipelineStagePostPrefetch; - memoryBarrier.srcAccessMask |= Pal::CoherCp; - } - - if 
(syncDispatch || syncPostCpWrite) - { - memoryBarrier.dstStageMask = Pal::PipelineStageCs; - memoryBarrier.dstAccessMask = Pal::CoherShader; - } - - if (syncIndirectArgs) - { - memoryBarrier.dstStageMask |= Pal::PipelineStageFetchIndirectArgs; - memoryBarrier.dstAccessMask |= Pal::CoherIndirectArgs; - } - - acqRelInfo.memoryBarrierCount = 1; - acqRelInfo.pMemoryBarriers = &memoryBarrier; - acqRelInfo.reason = m_deviceSettings.rgpBarrierReason; - - pCmdBuffer->CmdReleaseThenAcquire(acqRelInfo); - } - else - { - Pal::BarrierInfo barrierInfo = {}; - const uint32 pipePointCount = (syncDispatch || syncIndirectArgs) ? 1 : 0; - Pal::HwPipePoint pipePoint = Pal::HwPipePostCs; + Pal::AcquireReleaseInfo acqRelInfo = {}; + Pal::MemBarrier memoryBarrier = {}; - Pal::BarrierTransition transition = {}; - - if (syncDispatch) - { - transition.srcCacheMask = Pal::CoherShader; - } - - if (syncPostCpWrite) - { - transition.srcCacheMask |= Pal::CoherCp; - } + if (syncDispatch || syncIndirectArgs) + { + memoryBarrier.srcStageMask = Pal::PipelineStageCs; + memoryBarrier.srcAccessMask = Pal::CoherShader; + } - if (syncDispatch || syncPostCpWrite) - { - barrierInfo.waitPoint = Pal::HwPipePreCs; - transition.dstCacheMask = Pal::CoherShader; - } + if (syncPostCpWrite) + { + memoryBarrier.srcStageMask |= Pal::PipelineStagePostPrefetch; + memoryBarrier.srcAccessMask |= Pal::CoherCp; + } - if (syncIndirectArgs) - { - barrierInfo.waitPoint = Pal::HwPipeTop; - transition.dstCacheMask |= Pal::CoherIndirectArgs; - } + if (syncDispatch || syncPostCpWrite) + { + memoryBarrier.dstStageMask = Pal::PipelineStageCs; + memoryBarrier.dstAccessMask = Pal::CoherShader; + } - barrierInfo.pipePointWaitCount = pipePointCount; - barrierInfo.pPipePoints = &pipePoint; - barrierInfo.transitionCount = 1; - barrierInfo.pTransitions = &transition; + if (syncIndirectArgs) + { + memoryBarrier.dstStageMask |= Pal::PipelineStageFetchIndirectArgs; + memoryBarrier.dstAccessMask |= Pal::CoherIndirectArgs; + } - 
barrierInfo.reason = m_deviceSettings.rgpBarrierReason; + acqRelInfo.memoryBarrierCount = 1; + acqRelInfo.pMemoryBarriers = &memoryBarrier; + acqRelInfo.reason = m_deviceSettings.rgpBarrierReason; - pCmdBuffer->CmdBarrier(barrierInfo); - } + pCmdBuffer->CmdReleaseThenAcquire(acqRelInfo); } // ===================================================================================================================== @@ -324,7 +284,11 @@ void PalBackend::CreateBufferViewSrds( bool isTyped ) const { - const uint32 bufferSrdSizeDw = m_deviceProperties.gfxipProperties.srdSizes.bufferView / sizeof(uint32); + const uint32 bufferSrdSizeDw = ((isTyped) ? + m_deviceProperties.gfxipProperties.srdSizes.typedBufferView : + m_deviceProperties.gfxipProperties.srdSizes.untypedBufferView) + / sizeof(uint32); + const Pal::BufferViewInfo palBufferViewInfo = ConvertBufferViewToPalBufferView(bufferViewInfo); const void* pNullBuffer = m_deviceProperties.gfxipProperties.nullSrds.pNullBufferView; diff --git a/gpurt/gpurt.h b/gpurt/gpurt.h index 412b556..68d5ef5 100644 --- a/gpurt/gpurt.h +++ b/gpurt/gpurt.h @@ -285,7 +285,10 @@ enum class InternalRayTracingCsType : uint32 BuildBVH, BuildBVHTD, BuildBVHTDTR, - BuildBVHPLOC, + BuildPLOC, +#if GPURT_CLIENT_INTERFACE_MAJOR_VERSION < 52 + BuildBVHPLOC = BuildPLOC, +#endif UpdateQBVH, UpdateParallel, RefitBounds, @@ -311,6 +314,9 @@ enum class InternalRayTracingCsType : uint32 InitExecuteIndirect, PairCompression, MergeSort, + MergeSortLocal, + MergeSortGlobalIteration, + MergeSortCopyLastLevel, UpdateTriangles, UpdateAabbs, InitAccelerationStructure, @@ -753,7 +759,9 @@ struct DeviceSettings uint32 enableParallelUpdate : 1; uint32 enableParallelBuild : 1; uint32 enablePrefixScanDLB : 1; +#if GPURT_CLIENT_INTERFACE_MAJOR_VERSION < 51 uint32 enableAcquireReleaseInterface : 1; +#endif uint32 enableBuildAccelStructDumping : 1; uint32 enableBuildAccelStructScratchDumping : 1; uint32 enableBuildAccelStructStats : 1; @@ -779,6 +787,7 @@ struct 
DeviceSettings uint32 enableRemapScratchBuffer : 1; // Enable remapping bvh2 data from ScratchBuffer to ResultBuffer uint32 checkBufferOverlapsInBatch : 1; + uint32 disableCompaction : 1; // Reports and perform copy instead of compaction }; uint64 accelerationStructureUUID; // Acceleration Structure UUID diff --git a/gpurt/gpurtBuildSettings.h b/gpurt/gpurtBuildSettings.h index cce62a4..5c73247 100644 --- a/gpurt/gpurtBuildSettings.h +++ b/gpurt/gpurtBuildSettings.h @@ -69,7 +69,7 @@ struct CompileTimeBuildSettings uint32 enableTopDownBuild; uint32 useMortonCode30; uint32 enableMergeSort; - uint32 fastBuildThreshold; + uint32 unused14; uint32 enableFusedInstanceNode; float tsPriority; uint32 numRebraidIterations; @@ -99,7 +99,7 @@ struct CompileTimeBuildSettings uint32 unused11; uint32 unused12; uint32 unused13; - uint32 rebuildAccelStruct; + uint32 disableCompaction; }; #define BUILD_SETTINGS_DATA_TOP_LEVEL_BUILD_ID 0 @@ -119,7 +119,7 @@ struct CompileTimeBuildSettings #define BUILD_SETTINGS_DATA_ENABLE_TOP_DOWN_BUILD_ID 14 #define BUILD_SETTINGS_DATA_USE_MORTON_CODE_30_ID 15 #define BUILD_SETTINGS_DATA_ENABLE_MERGE_SORT_ID 16 -#define BUILD_SETTINGS_DATA_FAST_BUILD_THRESHOLD_ID 17 +// unused14 id 17 #define BUILD_SETTINGS_DATA_ENABLE_FUSED_INSTANCE_NODE_ID 18 #define BUILD_SETTINGS_DATA_TS_PRIORITY_ID 19 #define BUILD_SETTINGS_DATA_NUM_REBRAID_ITERATIONS_ID 20 @@ -135,7 +135,7 @@ struct CompileTimeBuildSettings #define BUILD_SETTINGS_DATA_ENCODE_ARRAY_OF_POINTERS_ID 41 #define BUILD_SETTINGS_DATA_SCENE_BOUNDS_CALCULATION_TYPE_ID 42 #define BUILD_SETTINGS_DATA_REBRAID_QUALITY_HEURISTIC_ID 43 -#define BUILD_SETTINGS_DATA_REBUILD_ACCELERATION_STRUCTURE_ID 47 +#define BUILD_SETTINGS_DATA_DISABLE_COMPACTION_ID 47 #ifdef __cplusplus } // namespace GpuRt diff --git a/gpurt/gpurtDispatch.h b/gpurt/gpurtDispatch.h index b6fb1b9..8f4ce03 100644 --- a/gpurt/gpurtDispatch.h +++ b/gpurt/gpurtDispatch.h @@ -70,11 +70,11 @@ struct DispatchRaysConstantData uint32 
missTableBaseAddressLo; // Miss shader table base address low 32-bits uint32 missTableBaseAddressHi; // Miss shader table base address high 32-bits uint32 missTableStrideInBytes; // Miss shader table record byte stride - uint32 reserved0; // Reserved padding + uint32 rayDispatchMaxGroups; // Max groups dispatched if persistent launch is enabled, else 0 uint32 hitGroupTableBaseAddressLo; // Hit group table base address low 32-bits uint32 hitGroupTableBaseAddressHi; // Hit group table base address high 32-bits uint32 hitGroupTableStrideInBytes; // Hit group table record byte stride - uint32 reserved1; // Reserved padding + uint32 reserved0; // Reserved padding uint32 callableTableBaseAddressLo; // Callable shader table base address low 32-bits uint32 callableTableBaseAddressHi; // Callable shader table base address high 32-bits uint32 callableTableStrideInBytes; // Callable shader table byte stride @@ -146,6 +146,7 @@ struct InitExecuteIndirectConstants uint32 rtThreadGroupSizeX; // Internal RT threadgroup size X uint32 rtThreadGroupSizeY; // Internal RT threadgroup size Y uint32 rtThreadGroupSizeZ; // Internal RT threadgroup size Z + uint32 rayDispatchMaxGroups; // Max groups dispatched if persistent launch is enabled, else 0 uint32 counterMask; // Mask for filtering ray history token uint32 pipelineCount; // Number of pipelines to launch (1 for indirect launch, raygen count for unified) uint32 maxIterations; // Max traversal interations for profiling @@ -160,7 +161,6 @@ struct InitExecuteIndirectConstants uint32 counterRayIdRangeEnd; // Counter ray ID range end uint32 cpsBackendStackSize; // Scratch memory used by a compiler backend, start at offset 0 uint32 padding0; // Padding for 16-byte alignment - uint32 padding1; // Padding for 16-byte alignment #if __cplusplus // Internal counter buffer SRDs diff --git a/gpurt/gpurtLib.h b/gpurt/gpurtLib.h index b607c3e..0fc8001 100644 --- a/gpurt/gpurtLib.h +++ b/gpurt/gpurtLib.h @@ -42,7 +42,7 @@ namespace GpuRt // update 
their definition of GPURT_CLIENT_INTERFACE_MAJOR_VERSION to indicate that they have made the required changes // to support a new version. When the client version is updated, the old interface will be compiled out and only the // new one will remain. -#define GPURT_INTERFACE_MAJOR_VERSION 49 +#define GPURT_INTERFACE_MAJOR_VERSION 52 #if GPURT_CLIENT_INTERFACE_MAJOR_VERSION < 44 // Minor interface version. This number is incrememnted when a compatible interface change is made. Compatible changes diff --git a/src/gpurtBvhBatcher.cpp b/src/gpurtBvhBatcher.cpp index 08ee242..ba70d10 100644 --- a/src/gpurtBvhBatcher.cpp +++ b/src/gpurtBvhBatcher.cpp @@ -200,7 +200,14 @@ void BvhBatcher::BuildRaytracingAccelerationStructureBatch( } if ((updaters.IsEmpty() == false) || (builders.IsEmpty() == false)) { - Barrier(); + uint32 barrierFlags = BarrierFlagSyncDispatch; + if (updaters.IsEmpty() == false) + { + // Updates can be launched with indirect dispatch. We need to avoid fetching the indirect arguments + // from the header before they are written by a previous build/update/copy. + barrierFlags |= BarrierFlagSyncIndirectArg; + } + Barrier(barrierFlags); } RGP_POP_MARKER(); @@ -304,11 +311,72 @@ void BvhBatcher::BuildMultiDispatch(Util::Span builders) if (PhaseEnabled(BuildPhaseFlags::MergeSort)) { Barrier(); - const uint32 wavesPerSimd = builders.size() == 1 ? 16U : 2U; - BuildFunction(BuildPhaseFlags::MergeSort, builders, [wavesPerSimd](BvhBuilder& builder) + + if (builders.size() > 1) { - builder.MergeSort(wavesPerSimd); - }); + const uint32 wavesPerSimd = 2U; + BuildFunction(BuildPhaseFlags::MergeSort, builders, [wavesPerSimd](BvhBuilder& builder) + { + builder.MergeSort(wavesPerSimd); + }); + } + else + { + RGP_PUSH_MARKER("Merge Sort"); + + // Batch local sorts together. + BuildPhase("Merge Sort (Local)", builders, &BvhBuilder::MergeSortLocal); + + Barrier(); + + // Batch global sort iterations together. 
Compute max iterations amongst the builder batch + uint32 maxMergeSortTreeLevel = 0; + + bool batchNeedsLastLevelCopy = false; + + for (const auto& builder : builders) + { + const uint32 mergeSortTreeLevel = builder.GetMaxMergeSortTreeLevel(); + maxMergeSortTreeLevel = Util::Max(maxMergeSortTreeLevel, mergeSortTreeLevel); + batchNeedsLastLevelCopy |= ((mergeSortTreeLevel & 1) == 1); + } + + if (maxMergeSortTreeLevel > 0) + { + RGP_PUSH_MARKER("Merge Sort (Global Iteration)"); + for (uint32 level = 1; level <= maxMergeSortTreeLevel; level++) + { + Barrier(); + + BuildFunction(nullptr, builders, [level](BvhBuilder& builder) + { + if (level <= builder.GetMaxMergeSortTreeLevel()) + { + builder.MergeSortGlobalIteration(level); + } + }); + } + RGP_POP_MARKER(); + + if (batchNeedsLastLevelCopy) + { + Barrier(); + + RGP_PUSH_MARKER("Merge Sort (Copy Last Level)"); + BuildFunction(nullptr, builders, [](BvhBuilder& builder) + { + const uint32 mergeSortTreeLevel = builder.GetMaxMergeSortTreeLevel(); + if ((mergeSortTreeLevel & 1) == 1) + { + builder.MergeSortCopyLastLevel(); + } + }); + RGP_POP_MARKER(); + } + } + + RGP_POP_MARKER(); + } } if (PhaseEnabled(BuildPhaseFlags::RadixSort)) { @@ -327,13 +395,13 @@ void BvhBatcher::BuildMultiDispatch(Util::Span builders) Barrier(); BuildPhase(BuildPhaseFlags::BuildFastAgglomerativeLbvh, builders, &BvhBuilder::BuildFastAgglomerativeLbvh); } - if (PhaseEnabled(BuildPhaseFlags::BuildBVHPLOC)) + if (PhaseEnabled(BuildPhaseFlags::BuildPLOC)) { Barrier(); const uint32 wavesPerSimd = builders.size() == 1 ? 
8U : 1U; - BuildFunction(BuildPhaseFlags::BuildBVHPLOC, builders, [wavesPerSimd](BvhBuilder& builder) + BuildFunction(BuildPhaseFlags::BuildPLOC, builders, [wavesPerSimd](BvhBuilder& builder) { - builder.BuildBVHPLOC(wavesPerSimd); + builder.BuildPLOC(wavesPerSimd); }); } if (PhaseEnabled(BuildPhaseFlags::RefitBounds)) diff --git a/src/gpurtBvhBuilder.cpp b/src/gpurtBvhBuilder.cpp index feb9e0b..47e4043 100644 --- a/src/gpurtBvhBuilder.cpp +++ b/src/gpurtBvhBuilder.cpp @@ -1523,7 +1523,7 @@ void BvhBuilder::InitGeometryConstants() void* pVbvTable = m_pDevice->AllocateDescriptorTable(m_cmdBuffer, geometryCount, &m_geomBufferSrdTable); void* pCbvTable = m_pDevice->AllocateDescriptorTable(m_cmdBuffer, geometryCount, &m_geomConstSrdTable); - const uint32 srdSizeBytes = m_pDevice->GetBufferSrdSizeDw() * sizeof(uint32); + const uint32 srdSizeBytes = m_pDevice->GetUntypedBufferSrdSizeDw() * sizeof(uint32); for (uint32 i = 0; i < geometryCount; i++) { @@ -2201,7 +2201,6 @@ void BvhBuilder::InitBuildSettings() m_buildSettings.rebraidType = static_cast(m_buildConfig.rebraidType); m_buildSettings.enableTopDownBuild = m_buildConfig.topDownBuild; m_buildSettings.useMortonCode30 = m_deviceSettings.enableMortonCode30; - m_buildSettings.fastBuildThreshold = m_deviceSettings.fastBuildThreshold; m_buildSettings.enableFusedInstanceNode = m_deviceSettings.enableFusedInstanceNode; m_buildSettings.enableMergeSort = m_buildConfig.enableMergeSort; @@ -2248,7 +2247,9 @@ void BvhBuilder::InitBuildSettings() m_buildSettings.updateFlags = m_buildArgs.inputs.flags & (AccelStructBuildFlagPerformUpdate | AccelStructBuildFlagAllowUpdate); - m_buildSettings.rebuildAccelStruct = m_buildConfig.rebuildAccelStruct; + + // Rebuilding an updateable acceleration structure need to use the original size and not compacted one. 
+ m_buildSettings.disableCompaction = m_buildConfig.rebuildAccelStruct || m_deviceSettings.disableCompaction; m_buildSettings.isUpdateInPlace = IsUpdateInPlace(); m_buildSettings.encodeArrayOfPointers = @@ -2821,7 +2822,14 @@ void BvhBuilder::EmitAccelerationStructurePostBuildInfo( break; case AccelStructPostBuildInfoType::CompactedSize: - EmitASCompactedType(postBuildInfo); + if (m_deviceSettings.disableCompaction) + { + EmitASCurrentSize(postBuildInfo); + } + else + { + EmitASCompactedType(postBuildInfo); + } break; case AccelStructPostBuildInfoType::ToolsVisualization: @@ -2990,7 +2998,14 @@ void BvhBuilder::CopyAccelerationStructure( break; case AccelStructCopyMode::Compact: - CopyASCompactMode(copyArgs); + if (m_deviceSettings.disableCompaction) + { + CopyASCloneMode(copyArgs); + } + else + { + CopyASCompactMode(copyArgs); + } break; case AccelStructCopyMode::Serialize: @@ -3257,7 +3272,7 @@ BuildPhaseFlags BvhBuilder::EnabledPhases() const } if (m_buildConfig.buildMode == BvhBuildMode::PLOC) { - flags |= BuildPhaseFlags::BuildBVHPLOC; + flags |= BuildPhaseFlags::BuildPLOC; } if (AllowLatePairCompression()) { @@ -3345,6 +3360,78 @@ void BvhBuilder::MergeSort( RGP_POP_MARKER(); } +// ===================================================================================================================== +// Executes merge sort shader to sort the input keys and values +void BvhBuilder::MergeSortLocal() +{ + PAL_ASSERT(m_buildConfig.enableMergeSort); + + BindPipeline(InternalRayTracingCsType::MergeSortLocal); + + WriteBuildBufferBindings(); + + RGP_PUSH_MARKER("Merge Sort Local (maxNumPrimitives %u)", m_buildConfig.maxNumPrimitives); + + const uint32 tgSize = 512; + Dispatch(Util::RoundUpQuotient(m_buildConfig.maxNumPrimitives, tgSize)); + + RGP_POP_MARKER(); +} + +// ===================================================================================================================== +uint32 BvhBuilder::GetMaxMergeSortTreeLevel() const +{ + const uint32 tgSize = 512; 
+ + const uint32 groupSize = tgSize; + const uint32 numKeysPerThread = 2u; + const uint32 groupCapacity = groupSize * numKeysPerThread; + const uint32 numLocalSortedGroups = Util::RoundUpQuotient(m_buildConfig.maxNumPrimitives, groupCapacity); + const uint32 numLevelsOfMergeTree = Util::CeilLog2(numLocalSortedGroups); + + return numLevelsOfMergeTree; +} + +// ===================================================================================================================== +// Executes merge sort shader to sort the input keys and values +void BvhBuilder::MergeSortGlobalIteration( + uint32 level) +{ + PAL_ASSERT(m_buildConfig.enableMergeSort); + + BindPipeline(InternalRayTracingCsType::MergeSortGlobalIteration); + + const BuildShaderRootConstants1 constants = { + .passIndex = level, + }; + WriteBuildBufferBindings(constants); + + RGP_PUSH_MARKER("Merge Sort Global Iteration (maxNumPrimitives %u, level %u)", m_buildConfig.maxNumPrimitives, level); + + const uint32 tgSize = 512; + Dispatch(Util::RoundUpQuotient(m_buildConfig.maxNumPrimitives, tgSize)); + + RGP_POP_MARKER(); +} + +// ===================================================================================================================== +// Executes merge sort shader to sort the input keys and values +void BvhBuilder::MergeSortCopyLastLevel() +{ + PAL_ASSERT(m_buildConfig.enableMergeSort); + + BindPipeline(InternalRayTracingCsType::MergeSortCopyLastLevel); + + WriteBuildBufferBindings(); + + RGP_PUSH_MARKER("Merge Sort Copy (maxNumPrimitives %u)", m_buildConfig.maxNumPrimitives); + + const uint32 tgSize = 512; + Dispatch(Util::RoundUpQuotient(m_buildConfig.maxNumPrimitives, tgSize)); + + RGP_POP_MARKER(); +} + // ===================================================================================================================== // Returns true when the builder uses the Rebraid phase bool BvhBuilder::AllowRebraid() const @@ -3476,10 +3563,10 @@ void BvhBuilder::BuildBVHTD() // 
===================================================================================================================== // Executes the build BVH PLOC shader -void BvhBuilder::BuildBVHPLOC( +void BvhBuilder::BuildPLOC( uint32 wavesPerSimd) { - BindPipeline(InternalRayTracingCsType::BuildBVHPLOC); + BindPipeline(InternalRayTracingCsType::BuildPLOC); const uint32 tgSize = 256u; const uint32 numThreadGroups = GetNumPersistentThreadGroups(m_buildConfig.maxNumPrimitives, tgSize, wavesPerSimd); @@ -3664,9 +3751,7 @@ void BvhBuilder::EncodeHwBvh() } const uint32 nodeCount = GetNumInternalNodeCount(); - const uint32 numThreadGroups = - m_buildSettings.topLevelBuild ? Util::RoundUpQuotient(nodeCount, DefaultThreadGroupSize) : - GetNumPersistentThreadGroups(nodeCount); + const uint32 numThreadGroups = Util::RoundUpQuotient(nodeCount, DefaultThreadGroupSize); BuildShaderRootConstants0 shaderConstants = { diff --git a/src/gpurtBvhBuilder.h b/src/gpurtBvhBuilder.h index 4af762f..806c518 100644 --- a/src/gpurtBvhBuilder.h +++ b/src/gpurtBvhBuilder.h @@ -341,7 +341,7 @@ class BvhBuilder void BuildBVHTD(); - void BuildBVHPLOC(uint32 wavesPerSimd); + void BuildPLOC(uint32 wavesPerSimd); void BuildFastAgglomerativeLbvh(); @@ -374,6 +374,10 @@ class BvhBuilder uint32 numElems); void MergeSort(uint32 wavesPerSimd); + void MergeSortLocal(); + void MergeSortGlobalIteration(uint32 level); + void MergeSortCopyLastLevel(); + uint32 GetMaxMergeSortTreeLevel() const; void SortRadixInt32(); void ScanExclusiveAdd( diff --git a/src/gpurtBvhBuilderCommon.h b/src/gpurtBvhBuilderCommon.h index 5953044..435ca0d 100644 --- a/src/gpurtBvhBuilderCommon.h +++ b/src/gpurtBvhBuilderCommon.h @@ -49,7 +49,7 @@ enum class BuildPhaseFlags : uint32_t MergeSort = 1 << 4, RadixSort = 1 << 5, BuildBVH = 1 << 6, - BuildBVHPLOC = 1 << 7, + BuildPLOC = 1 << 7, RefitBounds = 1 << 8, PairCompression = 1 << 9, SeparateEmitPostBuildInfoPass = 1 << 12, @@ -78,8 +78,8 @@ static const char* BuildPhaseName(BuildPhaseFlags 
phase) return "RadixSort"; case GpuRt::BuildPhaseFlags::BuildBVH: return "BuildBVH"; - case GpuRt::BuildPhaseFlags::BuildBVHPLOC: - return "BuildBVHPLOC"; + case GpuRt::BuildPhaseFlags::BuildPLOC: + return "BuildPLOC"; case GpuRt::BuildPhaseFlags::RefitBounds: return "RefitBounds"; case GpuRt::BuildPhaseFlags::PairCompression: diff --git a/src/gpurtDevice.cpp b/src/gpurtDevice.cpp index b1368f0..6058d33 100644 --- a/src/gpurtDevice.cpp +++ b/src/gpurtDevice.cpp @@ -387,6 +387,8 @@ Device::Device( m_tlasCaptureList(this), m_isTraceActive(false), m_accelStructTraceSource(this), + m_typedBufferSrdSizeDw{}, + m_untypedBufferSrdSizeDw{}, m_rayHistoryTraceSource(this), #if GPURT_ENABLE_GPU_DEBUG m_debugMonitor(this), @@ -421,7 +423,8 @@ Pal::Result Device::Init() Pal::DeviceProperties props = {}; m_info.pPalDevice->GetProperties(&props); - m_bufferSrdSizeDw = props.gfxipProperties.srdSizes.bufferView / sizeof(uint32); + m_typedBufferSrdSizeDw = props.gfxipProperties.srdSizes.typedBufferView / sizeof(uint32); + m_untypedBufferSrdSizeDw = props.gfxipProperties.srdSizes.untypedBufferView / sizeof(uint32); if (m_info.deviceSettings.emulatedRtIpLevel == Pal::RayTracingIpLevel::None) { @@ -812,7 +815,7 @@ void* Device::AllocateDescriptorTable( uint32 count, gpusize* pGpuAddress) const { - const uint32 srdSizeBytes = m_bufferSrdSizeDw * sizeof(uint32); + const uint32 srdSizeBytes = m_typedBufferSrdSizeDw * sizeof(uint32); const uint32 srdBufferSizeBytes = srdSizeBytes * count; return AllocateTemporaryData(cmdBuffer, srdBufferSizeBytes, pGpuAddress); } @@ -828,7 +831,7 @@ uint32 Device::WriteBufferSrdTable( { gpusize tableVa; void* pTable = AllocateDescriptorTable(cmdBuffer, count, &tableVa); - const uint32 srdSizeBytes = m_bufferSrdSizeDw * sizeof(uint32); + const uint32 srdSizeBytes = (typedBuffer ? 
m_typedBufferSrdSizeDw : m_untypedBufferSrdSizeDw) * sizeof(uint32); for (uint32 i = 0; i < count; i++) { @@ -2134,6 +2137,10 @@ const AccelStructBuildInputs Device::OverrideBuildInputs( { buildInputs.flags &= ~(GpuRt::AccelStructBuildFlagAllowUpdate | GpuRt::AccelStructBuildFlagPerformUpdate); } + if (Settings().disableCompaction) + { + buildInputs.flags &= ~(GpuRt::AccelStructBuildFlagAllowCompaction); + } return buildInputs; } diff --git a/src/gpurtInternal.h b/src/gpurtInternal.h index d5f0251..7cf7f2c 100644 --- a/src/gpurtInternal.h +++ b/src/gpurtInternal.h @@ -231,14 +231,6 @@ using InternalPipelineMap = std::unordered_map; -//===================================================================================================================== -// different ways to encode the scene bounds used to generate morton codes -enum class SceneBoundsCalculation : uint32 -{ - BasedOnGeometry = 0, - BasedOnGeometryWithSize -}; - namespace Internal { // ===================================================================================================================== @@ -691,8 +683,11 @@ class Device : public IDevice virtual bool ShouldUseGangedAceForBuild(const AccelStructBuildInputs& inputs) const override; - // Returns size in DWORDs of a buffer view SRD - uint32 GetBufferSrdSizeDw() const { return m_bufferSrdSizeDw; }; + // Returns size in DWORDs of a typed buffer view SRD + uint32 GetTypedBufferSrdSizeDw() const { return m_typedBufferSrdSizeDw; }; + + // Returns size in DWORDs of a untyped buffer view SRD + uint32 GetUntypedBufferSrdSizeDw() const { return m_untypedBufferSrdSizeDw; }; Pal::RayTracingIpLevel GetRtIpLevel() const { return m_rtIpLevel; } @@ -736,7 +731,8 @@ class Device : public IDevice Util::Mutex m_traceBvhLock; bool m_isTraceActive; GpuRt::AccelStructTraceSource m_accelStructTraceSource; - uint32 m_bufferSrdSizeDw; + uint32 m_typedBufferSrdSizeDw; + uint32 m_untypedBufferSrdSizeDw; ClientCallbacks m_clientCb; Pal::RayTracingIpLevel m_rtIpLevel; 
// the actual RTIP level GPURT is using, // is based on emulatedRtIpLevel and the actual device. diff --git a/src/gpurtInternalShaders.cpp b/src/gpurtInternalShaders.cpp index 64bce5f..2f4eec5 100644 --- a/src/gpurtInternalShaders.cpp +++ b/src/gpurtInternalShaders.cpp @@ -70,7 +70,7 @@ const PipelineBuildInfo InternalPipelineBuildInfo[size_t(InternalRayTracingCsTyp PIPELINE_BUILD_BVH_INFO(BuildBVH), PIPELINE_BUILD_BVH_INFO(BuildBVHTD), PIPELINE_BUILD_BVH_INFO(BuildBVHTDTR), - PIPELINE_BUILD_BVH_INFO(BuildBVHPLOC), + PIPELINE_BUILD_BVH_INFO(BuildPLOC), PIPELINE_BUILD_INFO(UpdateQBVH), PIPELINE_BUILD_INFO(UpdateParallel), PIPELINE_BUILD_BVH_INFO(RefitBounds), @@ -96,6 +96,9 @@ const PipelineBuildInfo InternalPipelineBuildInfo[size_t(InternalRayTracingCsTyp PIPELINE_BUILD_INFO(InitExecuteIndirect), PIPELINE_BUILD_BVH_INFO(PairCompression), PIPELINE_BUILD_BVH_INFO(MergeSort), + PIPELINE_BUILD_BVH_INFO(MergeSortLocal), + PIPELINE_BUILD_BVH_INFO(MergeSortGlobalIteration), + PIPELINE_BUILD_BVH_INFO(MergeSortCopyLastLevel), PIPELINE_BUILD_INFO(UpdateTriangles), PIPELINE_BUILD_INFO(UpdateAabbs), PIPELINE_BUILD_INFO(InitAccelerationStructure), diff --git a/src/options.yaml b/src/options.yaml index 80b15d4..4ea170e 100644 --- a/src/options.yaml +++ b/src/options.yaml @@ -54,3 +54,20 @@ Options: cpsCandidatePrimitiveMode: Type: CpsCandidatePrimitiveMode Default: CpsCandidatePrimitiveMode::SuspendLane + + persistentLaunchEnabled: + Type: uint32 + Default: 0 + + rayFlagsOverrideForceEnableMask: + # The incoming TraceRay ray flags are ORed with this mask before use, allowing to force-enable specific flags. + # In case of conflicting bits with the disable mask, the enable mask wins, as it is applied after the disable mask. + # Only supported with continuations. + Type: uint32 + Default: 0 + + rayFlagsOverrideForceDisableMask: + # The incoming TraceRay ray flags are ANDed with the bitwise inverse of this mask before use, allowing to force-disable specific flags. 
+ # Only supported with continuations. + Type: uint32 + Default: 0 diff --git a/src/shaders/BuildBVHTDTR.hlsl b/src/shaders/BuildBVHTDTR.hlsl index ed21d28..7d1d71f 100644 --- a/src/shaders/BuildBVHTDTR.hlsl +++ b/src/shaders/BuildBVHTDTR.hlsl @@ -22,6 +22,8 @@ * SOFTWARE. * **********************************************************************************************************************/ +#include "../shadersClean/common/BoundingBox.hlsli" + #define USE_SAH 1 //===================================================================================================================== // 32 bit constants @@ -46,6 +48,142 @@ struct TDArgs #define INVALID_IDX 0xffffffff #define TD_EPSILON 0.99999 +//===================================================================================================================== +#define REF_SCRATCH_SIDE_LEFT 0 +#define REF_SCRATCH_SIDE_RIGHT 1 +#define REF_SCRATCH_SIDE_LEAF 2 + +struct TDRefScratch +{ + uint primitiveIndex; + uint nodeIndex; + float3 center; + BoundingBox box; + uint side; +#if USE_BVH_REBRAID + uint nodePointer; //rebraid only +#endif +#if USE_BLAS_PRIM_COUNT + uint numPrimitives; +#endif +}; + +#define TD_REF_PRIM_INDEX_OFFSET 0 +#define TD_REF_NODE_INDEX_OFFSET 4 +#define TD_REF_CENTER_OFFSET 8 +#define TD_REF_BOX_OFFSET 20 +#define TD_REF_SIDE_OFFSET (TD_REF_BOX_OFFSET + sizeof(BoundingBox)) +#define TD_REF_NODE_POINTER_OFFSET (TD_REF_SIDE_OFFSET + 4) +#if USE_BLAS_PRIM_COUNT +#define TD_REF_NUM_PRIM_OFFSET (TD_REF_NODE_POINTER_OFFSET + sizeof(uint)) +#endif + +//===================================================================================================================== +#define NUM_SPLIT_BINS 4 + +#define TD_NODE_REBRAID_STATE_OPEN 0 +#define TD_NODE_REBRAID_STATE_CLOSED 1 + +struct TDBins +{ + uint64_t firstRefIndex; + + UintBoundingBox binBoxes[3][NUM_SPLIT_BINS]; + uint binPrimCount[3][NUM_SPLIT_BINS]; + + uint bestAxis; + uint bestSplit; + uint numLeft; + uint numRight; + +#if USE_BLAS_PRIM_COUNT + 
uint binBLASPrimCount[3][NUM_SPLIT_BINS]; +#endif +}; + +#define TD_BINS_FIRST_REF_INDEX_OFFSET 0 +#define TD_BINS_BIN_BOXES_OFFSET (TD_BINS_FIRST_REF_INDEX_OFFSET + 8) +#define TD_BINS_BIN_PRIM_COUNT_OFFSET (TD_BINS_BIN_BOXES_OFFSET + sizeof(UintBoundingBox) * NUM_SPLIT_BINS * 3) +#define TD_BINS_BEST_AXIS_OFFSET (TD_BINS_BIN_PRIM_COUNT_OFFSET + sizeof(uint) * NUM_SPLIT_BINS * 3) +#define TD_BINS_BEST_SPLIT_OFFSET (TD_BINS_BEST_AXIS_OFFSET + 4) +#define TD_BINS_NUM_LEFT_OFFSET (TD_BINS_BEST_SPLIT_OFFSET + 4) +#define TD_BINS_NUM_RIGHT_OFFSET (TD_BINS_NUM_LEFT_OFFSET + 4) +#if USE_BLAS_PRIM_COUNT +#define TD_BINS_BLAS_PRIM_COUNT_OFFSET (TD_BINS_NUM_RIGHT_OFFSET + 4) +#endif + +struct TDNode +{ + UintBoundingBox centroidBox; + uint binsIndex; + uint childCount; + +#if USE_BVH_REBRAID + uint largestAxis; // rebraid only + float largestWidth; // rebraid only + uint rebraidState; // rebraid only + uint primIndex; // rebraid only +#endif +}; + +#define TD_NODE_CENTROID_BOX_OFFSET 0 +#define TD_NODE_BINS_INDEX_OFFSET (TD_NODE_CENTROID_BOX_OFFSET + sizeof(UintBoundingBox)) +#define TD_NODE_CHILD_COUNT_OFFSET (TD_NODE_BINS_INDEX_OFFSET + 4) +#define TD_NODE_LARGEST_AXIS_OFFSET (TD_NODE_CHILD_COUNT_OFFSET + 4) +#define TD_NODE_LARGEST_WIDTH_OFFSET (TD_NODE_LARGEST_AXIS_OFFSET + 4) +#define TD_NODE_REBRAID_STATE_OFFSET (TD_NODE_LARGEST_WIDTH_OFFSET + 4) +#define TD_NODE_PRIM_INDEX_OFFSET (TD_NODE_REBRAID_STATE_OFFSET + 4) + +//===================================================================================================================== + +#define TD_REBRAID_STATE_NO_OPEN 0 +#define TD_REBRAID_STATE_NEED_OPEN 1 +#define TD_REBRAID_STATE_OOM 2 + +#define TD_PHASE_INIT_STATE 0 +#define TD_PHASE_INIT_REFS_TO_LEAVES 1 +#define TD_PHASE_CHECK_NEED_ALLOC 2 +#define TD_PHASE_ALLOC_ROOT_NODE 3 +#define TD_PHASE_REBRAID_COUNT_OPENINGS 4 +#define TD_PHASE_REBRAID_CHECK_TERMINATION 5 +#define TD_PHASE_REBRAID_OPEN 6 +#define TD_PHASE_REBRAID_UPDATE_NODES 7 +#define 
TD_PHASE_BIN_REFS 8 +#define TD_PHASE_FIND_BEST_SPLIT 9 +#define TD_PHASE_SECOND_PASS 10 +#define TD_PHASE_UPDATE_NEW_NODES 11 +#define TD_PHASE_DONE 12 + +struct StateTDBuild +{ + uint numNodes; + uint numProcessedNodes; + uint numNodesAllocated; + uint numRefs; + uint numRefsAllocated; + uint numInactiveInstance; + UintBoundingBox rootCentroidBBox; + uint numLeaves; + uint binsCounter; + +#if USE_BVH_REBRAID + uint rebraidState; + uint leafAllocOffset; +#endif +}; + +#define STATE_TD_NUM_NODES_OFFSET 0 +#define STATE_TD_NUM_PROCESSED_NODES_OFFSET 4 +#define STATE_TD_NUM_NODES_ALLOCATED_OFFSET 8 +#define STATE_TD_NUM_REFS_OFFSET 12 +#define STATE_TD_NUM_REFS_ALLOCATED_OFFSET 16 +#define STATE_TD_NUM_INACTIVE_INSTANCE_OFFSET 20 +#define STATE_TD_CENTROID_BBOX_OFFSET 24 +#define STATE_TD_NUM_LEAVES_OFFSET (STATE_TD_CENTROID_BBOX_OFFSET + sizeof(UintBoundingBox)) +#define STATE_TD_BINS_COUNTER_OFFSET (STATE_TD_NUM_LEAVES_OFFSET + 4) +#define STATE_TD_REBRAID_STATE_OFFSET (STATE_TD_BINS_COUNTER_OFFSET + 4) +#define STATE_TD_LEAF_ALLOC_OFFSET_OFFSET (STATE_TD_REBRAID_STATE_OFFSET + 4) + #if NO_SHADER_ENTRYPOINT == 0 #define USE_LDS 1 diff --git a/src/shaders/BuildCommon.hlsl b/src/shaders/BuildCommon.hlsl index 4ecf131..5746130 100644 --- a/src/shaders/BuildCommon.hlsl +++ b/src/shaders/BuildCommon.hlsl @@ -325,33 +325,6 @@ float3 Uint3ToFloat3(in uint3 v) return asfloat(v); } -//===================================================================================================================== -// Divide uints and round up -uint RoundUpQuotient( - uint dividend, - uint divisor) -{ - return (dividend + divisor - 1) / divisor; -} - -//===================================================================================================================== -// Divide ints and round up -int RoundUpQuotient( - int dividend, - int divisor) -{ - return (dividend + divisor - 1) / divisor; -} - 
-//===================================================================================================================== -// Divide ints and round up -uint64_t RoundUpQuotient( - uint64_t dividend, - uint64_t divisor) -{ - return (dividend + divisor - 1) / divisor; -} - //===================================================================================================================== static uint32_t GetNumInternalNodeCount( in uint32_t primitiveCount) @@ -678,9 +651,9 @@ uint PackInstanceMaskAndNodeFlags( uint PackScratchNodeFlags( uint instanceInclusionMask, uint nodeFlags, - uint triangleId) + uint quadSwizzle) { - const uint packedFlags = (triangleId << 16) | PackInstanceMaskAndNodeFlags(instanceInclusionMask, nodeFlags); + const uint packedFlags = (quadSwizzle << 16) | PackInstanceMaskAndNodeFlags(instanceInclusionMask, nodeFlags); return packedFlags; } diff --git a/src/shaders/BuildCommonScratch.hlsl b/src/shaders/BuildCommonScratch.hlsl index cd54497..b0d3197 100644 --- a/src/shaders/BuildCommonScratch.hlsl +++ b/src/shaders/BuildCommonScratch.hlsl @@ -47,7 +47,7 @@ #ifndef _BUILDCOMMONSCRATCH_HLSL #define _BUILDCOMMONSCRATCH_HLSL -#include "../shared/scratchNode.h" +#include "../shadersClean/common/ScratchNode.hlsli" #include "BuildCommon.hlsl" #include "BuildCommonScratchGlobal.hlsl" #include "TaskCounter.hlsl" @@ -853,16 +853,17 @@ void RefitNode( } //===================================================================================================================== -static TriangleData GetScratchNodeTrianglePairVertices( +static TriangleData GetScratchNodeQuadVertices( in uint scratchNodesOffset, in uint nodeIndex, in uint triangleIndex) { - const uint nodeType = (triangleIndex == 0) ? 
NODE_TYPE_TRIANGLE_0 : NODE_TYPE_TRIANGLE_1; - const uint packedFlags = FETCH_SCRATCH_NODE_DATA(uint, scratchNodesOffset, nodeIndex, SCRATCH_NODE_FLAGS_OFFSET); - uint3 indices = CalcTriangleCompressionVertexIndices(nodeType, ExtractScratchNodeTriangleId(packedFlags)); + const uint quadSwizzle = ExtractScratchNodeQuadSwizzle(packedFlags); + const uint triSwizzle = (quadSwizzle >> (triangleIndex * 4)) & 0xFF; + + uint3 indices = ComputeQuadTriangleVertexIndex(triangleIndex, triSwizzle); TriangleData tri; diff --git a/src/shaders/BuildFastAgglomerativeLbvh.hlsl b/src/shaders/BuildFastAgglomerativeLbvh.hlsl index 5c4a810..526053c 100644 --- a/src/shaders/BuildFastAgglomerativeLbvh.hlsl +++ b/src/shaders/BuildFastAgglomerativeLbvh.hlsl @@ -104,15 +104,14 @@ uint32_t Delta30( const int leftCode = ScratchBuffer.Load(mortonCodesOffset + (left * sizeof(int))); const int rightCode = ScratchBuffer.Load(mortonCodesOffset + (right * sizeof(int))); - // logical xor can be used instead of finding the index of the highest differing bit as we can compare the numbers. - // The higher the index of the differing bit, the larger the number - return (leftCode != rightCode) ? (leftCode ^ rightCode) : (left ^ right); + // returns number of matching bits starting from MSB + return (leftCode != rightCode) ? 
clz(leftCode ^ rightCode) : (32 + clz(left ^ right)); } //===================================================================================================================== // This function indicates a distance metric between the two keys where each internal node splits the hierarchy // Optionally, we can use the squared distance to compute the distance between two centroids -uint64_t Delta64( +uint32_t Delta64( uint mortonCodesOffset, uint id) { @@ -123,9 +122,8 @@ uint64_t Delta64( const uint64_t leftCode = ScratchBuffer.Load(mortonCodesOffset + (left * sizeof(uint64_t))); const uint64_t rightCode = ScratchBuffer.Load(mortonCodesOffset + (right * sizeof(uint64_t))); - // logical xor can be used instead of finding the index of the highest differing bit as we can compare the numbers. - // The higher the index of the differing bit, the larger the number - return (leftCode != rightCode) ? (leftCode ^ rightCode) : (left ^ right); + // returns number of matching bits starting from MSB + return (leftCode != rightCode) ? 
clz64(leftCode ^ rightCode) : (64 + clz64(left ^ right)); } //===================================================================================================================== @@ -137,11 +135,11 @@ bool IsSplitRight( { if (useMortonCode30) { - return (Delta30(mortonCodesOffset, right) < Delta30(mortonCodesOffset, left - 1)); + return (Delta30(mortonCodesOffset, right) > Delta30(mortonCodesOffset, left - 1)); } else { - return (Delta64(mortonCodesOffset, right) < Delta64(mortonCodesOffset, left - 1)); + return (Delta64(mortonCodesOffset, right) > Delta64(mortonCodesOffset, left - 1)); } } @@ -173,6 +171,21 @@ void FastAgglomerativeLbvhImpl( // Total number of internal nodes is N - 1 const uint numInternalNodes = args.numActivePrims - 1; + if (numInternalNodes == 0) + { + if (primitiveIndex == 0) + { + const uint rootIndex = FetchSortedPrimIndex(args.sortedPrimIndicesOffset, 0); + { + // Store invalid index as parent of root + WriteScratchNodeData(args.baseScratchNodesOffset, rootIndex, SCRATCH_NODE_PARENT_OFFSET, 0xffffffff); + } + + WriteRootNodeIndex(args.rootNodeIndexOffset, rootIndex); + } + return; + } + // The root of the tree will be stored in the left child of the n-th internal node, where n represents the size of // the key array @@ -244,8 +257,11 @@ void FastAgglomerativeLbvhImpl( // the root node index and remove this conditional if (parentNodeIndex == numInternalNodes) { - // Store invalid index as parent of root - WriteScratchNodeData(args.baseScratchNodesOffset, currentNodeIndex, SCRATCH_NODE_PARENT_OFFSET, 0xffffffff); + { + // Store invalid index as parent of root + WriteScratchNodeData(args.baseScratchNodesOffset, currentNodeIndex, SCRATCH_NODE_PARENT_OFFSET, 0xffffffff); + } + // Store the index of the root node WriteRootNodeIndex(args.rootNodeIndexOffset, currentNodeIndex); // Do not write the parent node since it's invalid. 
@@ -286,7 +302,14 @@ void BuildFastAgglomerativeLbvh( const uint numActivePrims = ReadAccelStructHeaderField(ACCEL_STRUCT_HEADER_NUM_ACTIVE_PRIMS_OFFSET); const FastLBVHArgs args = GetFastLbvhArgs(numActivePrims); - if (globalId < numActivePrims) + if (numActivePrims == 0) + { + if (globalId == 0) + { + WriteRootNodeIndex(args.rootNodeIndexOffset, 0); + } + } + else if (globalId < numActivePrims) { FastAgglomerativeLbvhImpl(globalId, args); } diff --git a/src/shaders/BuildBVHPLOC.hlsl b/src/shaders/BuildPLOC.hlsl similarity index 99% rename from src/shaders/BuildBVHPLOC.hlsl rename to src/shaders/BuildPLOC.hlsl index effeb80..2c39642 100644 --- a/src/shaders/BuildBVHPLOC.hlsl +++ b/src/shaders/BuildPLOC.hlsl @@ -88,7 +88,7 @@ struct BuildPlocArgs #include "Common.hlsl" //===================================================================================================================== -#include "..\shared\rayTracingDefs.h" +#include "../shared/rayTracingDefs.h" #define GC_DSTMETADATA #define GC_SCRATCHBUFFER @@ -748,7 +748,7 @@ void UpdateClusterCount( } //===================================================================================================================== -void BuildBvhPlocImpl( +void BuildPlocImpl( uint globalId, uint localId, uint groupId, @@ -858,7 +858,7 @@ void BuildBvhPlocImpl( //==================================================================================================================== [RootSignature(RootSig)] [numthreads(BUILD_THREADGROUP_SIZE, 1, 1)] -void BuildBVHPLOC( +void BuildPLOC( uint globalIdIn : SV_DispatchThreadID, uint groupIdIn : SV_GroupID, uint localIdIn : SV_GroupThreadID) @@ -890,7 +890,7 @@ void BuildBVHPLOC( if (numActivePrims > 0) { - BuildBvhPlocImpl(globalId, localId, groupId, numActivePrims, plocArgs); + BuildPlocImpl(globalId, localId, groupId, numActivePrims, plocArgs); } } #endif diff --git a/src/shaders/BuildParallel.hlsl b/src/shaders/BuildParallel.hlsl index e557003..eaf9090 100644 --- 
a/src/shaders/BuildParallel.hlsl +++ b/src/shaders/BuildParallel.hlsl @@ -85,7 +85,7 @@ void WaitForEncodeTasksToFinish( #include "GenerateMortonCodes.hlsl" #include "RadixSort/ScanExclusiveInt4DLBCommon.hlsl" #include "RadixSort/RadixSortParallel.hlsl" -#include "BuildBVHPLOC.hlsl" +#include "BuildPLOC.hlsl" #include "BuildQBVH.hlsl" #include "BuildBVHTDTR.hlsl" #include "BuildBVH.hlsl" @@ -242,7 +242,7 @@ void TriangleSplitting( } //====================================================================================================================== -void BuildBvhPloc( +void BuildPloc( inout uint numTasksWait, inout uint waveId, uint globalId, @@ -270,7 +270,7 @@ void BuildBvhPloc( plocArgs.primIndicesSortedScratchOffset = ShaderConstants.offsets.primIndicesSorted; plocArgs.unsortedBvhLeafNodesOffset = ShaderConstants.offsets.bvhLeafNodeData; - BuildBvhPlocImpl(globalId, localId, groupId, numActivePrims, plocArgs); + BuildPlocImpl(globalId, localId, groupId, numActivePrims, plocArgs); } //====================================================================================================================== @@ -376,8 +376,8 @@ void MergeSort(inout uint numTasksWait, inout uint waveId, uint localId, uint gr numPrimitives, ShaderConstants.offsets.mortonCodes, ShaderConstants.offsets.mortonCodesSorted, - ShaderConstants.offsets.primIndicesSorted, ShaderConstants.offsets.primIndicesSortedSwap, + ShaderConstants.offsets.primIndicesSorted, Settings.useMortonCode30); } @@ -543,75 +543,59 @@ void BuildBvh( { bool needRefit = false; - if ((Settings.fastBuildThreshold) && (numPrimitives <= Settings.fastBuildThreshold) && (numPrimitives <= WaveGetLaneCount())) - { - BEGIN_TASK(1); - - FastBuildBVH(globalId, - numPrimitives, - ShaderConstants.offsets.bvhLeafNodeData, - ShaderConstants.offsets.bvhNodeData); - - END_TASK(1); - needRefit = true; - numActivePrims = ReadAccelStructHeaderField(ACCEL_STRUCT_HEADER_NUM_ACTIVE_PRIMS_OFFSET); - } - else - { - 
BEGIN_TASK(ShaderRootConstants.NumThreadGroups()); + BEGIN_TASK(ShaderRootConstants.NumThreadGroups()); - GenerateMortonCodes(globalId, numPrimitives); + GenerateMortonCodes(globalId, numPrimitives); - END_TASK(ShaderRootConstants.NumThreadGroups()); - WriteDebugCounter(COUNTER_MORTONGEN_OFFSET); - numActivePrims = ReadAccelStructHeaderField(ACCEL_STRUCT_HEADER_NUM_ACTIVE_PRIMS_OFFSET); + END_TASK(ShaderRootConstants.NumThreadGroups()); + WriteDebugCounter(COUNTER_MORTONGEN_OFFSET); + numActivePrims = ReadAccelStructHeaderField(ACCEL_STRUCT_HEADER_NUM_ACTIVE_PRIMS_OFFSET); - if (numActivePrims > 0) + if (numActivePrims > 0) + { + if (Settings.enableMergeSort) { - if (Settings.enableMergeSort) - { - MergeSort(numTasksWait, waveId, localId, groupId, numPrimitives); - } - else - { - RadixSort(numTasksWait, waveId, globalId, localId, groupId, numPrimitives, Settings.radixSortScanLevel, Settings.useMortonCode30); - } - WriteDebugCounter(COUNTER_MORTON_SORT_OFFSET); - // Note there is an implicit sync on the last pass of the sort + MergeSort(numTasksWait, waveId, localId, groupId, numPrimitives); + } + else + { + RadixSort(numTasksWait, waveId, globalId, localId, groupId, numPrimitives, Settings.radixSortScanLevel, Settings.useMortonCode30); + } + WriteDebugCounter(COUNTER_MORTON_SORT_OFFSET); + // Note there is an implicit sync on the last pass of the sort - // If the top down builder is off, the unsorted leaves will stay where the + // If the top down builder is off, the unsorted leaves will stay where the // Encode step put them. On top of that, if TS or Rebraid is also on, // there might be a gap between the last inner node and the first leaf // if we place the root of the tree at ShaderConstants.offsets.bvhNodeData. - // To avoid that gap, the root is moved forward by numLeafNodes - numActivePrims - // nodes from this point onwards. + // To avoid that gap, the root is moved forward by numLeafNodes - numActivePrims + // nodes from this point onwards. 
- if (Settings.buildMode == BUILD_MODE_PLOC) + if (Settings.buildMode == BUILD_MODE_PLOC) + { + BuildPloc(numTasksWait, waveId, globalId, localId, groupId, numActivePrims); + WriteDebugCounter(COUNTER_BUILDPLOC_OFFSET); + } + else + { + if (Settings.enableFastLBVH == false) { - BuildBvhPloc(numTasksWait, waveId, globalId, localId, groupId, numActivePrims); - WriteDebugCounter(COUNTER_BUILDPLOC_OFFSET); + BEGIN_TASK(ShaderRootConstants.NumThreadGroups()); + + BuildBvhLinear(globalId, numActivePrims, numPrimitives); + + END_TASK(ShaderRootConstants.NumThreadGroups()); + WriteDebugCounter(COUNTER_BUILDLBVH_OFFSET); + needRefit = true; } else { - if (Settings.enableFastLBVH == false) - { - BEGIN_TASK(ShaderRootConstants.NumThreadGroups()); - - BuildBvhLinear(globalId, numActivePrims, numPrimitives); - - END_TASK(ShaderRootConstants.NumThreadGroups()); - WriteDebugCounter(COUNTER_BUILDLBVH_OFFSET); - needRefit = true; - } - else - { - BEGIN_TASK(ShaderRootConstants.NumThreadGroups()); + BEGIN_TASK(ShaderRootConstants.NumThreadGroups()); - FastAgglomerativeLbvh(globalId, numActivePrims); + FastAgglomerativeLbvh(globalId, numActivePrims); - END_TASK(ShaderRootConstants.NumThreadGroups()); - WriteDebugCounter(COUNTER_BUILDFASTLBVH_OFFSET); - } + END_TASK(ShaderRootConstants.NumThreadGroups()); + WriteDebugCounter(COUNTER_BUILDFASTLBVH_OFFSET); } } } diff --git a/src/shaders/BuildQBVH.hlsl b/src/shaders/BuildQBVH.hlsl index 9dd9e30..512496a 100644 --- a/src/shaders/BuildQBVH.hlsl +++ b/src/shaders/BuildQBVH.hlsl @@ -306,7 +306,8 @@ uint WritePrimitiveNode( nodeOffset = offsets.leafNodes + (destIndex * primitiveNodeSize); } - const uint triangleId = ExtractScratchNodeTriangleId(scratchNode.packedFlags); + const uint quadSwizzle = ExtractScratchNodeQuadSwizzle(scratchNode.packedFlags); + const uint boxNodeFlags = ExtractScratchNodeBoxFlags(scratchNode.packedFlags); if (nodeType == NODE_TYPE_USER_NODE_PROCEDURAL) { @@ -320,8 +321,6 @@ uint WritePrimitiveNode( } else { - 
DstBuffer.Store(nodeOffset + TRIANGLE_NODE_ID_OFFSET, triangleId); - bool isPairCompressed = (Settings.triangleCompressionMode == PAIR_TRIANGLE_COMPRESSION); if (Settings.enableEarlyPairCompression) { @@ -335,6 +334,18 @@ uint WritePrimitiveNode( // Pair compressed triangles nodes are referenced by triangle 1 nodeType = isPairCompressed ? NODE_TYPE_TRIANGLE_1 : NODE_TYPE_TRIANGLE_0; + uint triangleId = WriteTriangleIdField(0, NODE_TYPE_TRIANGLE_0, (quadSwizzle >> 0) & 0xF, boxNodeFlags); + + // The compaction shader (CompactASImpl1_1) looks at triangleId to determine the node type of a leaf node. + // Hence, we must only set the triangleId fields for NODE_TYPE_TRIANGLE_1 to non-zero for a pair + // compressed triangle. + if (isPairCompressed) + { + triangleId = WriteTriangleIdField(triangleId, NODE_TYPE_TRIANGLE_1, (quadSwizzle >> 4) & 0xF, boxNodeFlags); + } + + DstBuffer.Store(nodeOffset + TRIANGLE_NODE_ID_OFFSET, triangleId); + { DstBuffer.Store(nodeOffset + TRIANGLE_NODE_GEOMETRY_INDEX_AND_FLAGS_OFFSET, geometryIndexAndFlags); DstBuffer.Store(nodeOffset + TRIANGLE_NODE_PRIMITIVE_INDEX0_OFFSET + (nodeType * 4), diff --git a/src/shaders/BuildSettings.hlsli b/src/shaders/BuildSettings.hlsli index 6bb5026..5929195 100644 --- a/src/shaders/BuildSettings.hlsli +++ b/src/shaders/BuildSettings.hlsli @@ -43,7 +43,6 @@ [[vk::constant_id(BUILD_SETTINGS_DATA_ENABLE_TOP_DOWN_BUILD_ID)]] uint enableTopDownBuild = 0; [[vk::constant_id(BUILD_SETTINGS_DATA_USE_MORTON_CODE_30_ID)]] uint useMortonCode30 = 0; [[vk::constant_id(BUILD_SETTINGS_DATA_ENABLE_MERGE_SORT_ID)]] uint enableMergeSort = 0; -[[vk::constant_id(BUILD_SETTINGS_DATA_FAST_BUILD_THRESHOLD_ID)]] uint fastBuildThreshold = 0; [[vk::constant_id(BUILD_SETTINGS_DATA_ENABLE_FUSED_INSTANCE_NODE_ID)]] uint enableFusedInstanceNode = 0; [[vk::constant_id(BUILD_SETTINGS_DATA_TS_PRIORITY_ID)]] float tsPriority = 0; [[vk::constant_id(BUILD_SETTINGS_DATA_NUM_REBRAID_ITERATIONS_ID)]] uint numRebraidIterations = 0; @@ -59,7 +58,7 
@@ [[vk::constant_id(BUILD_SETTINGS_DATA_ENCODE_ARRAY_OF_POINTERS_ID)]] uint encodeArrayOfPointers = 0; [[vk::constant_id(BUILD_SETTINGS_DATA_SCENE_BOUNDS_CALCULATION_TYPE_ID)]] uint sceneBoundsCalculationType = 0; [[vk::constant_id(BUILD_SETTINGS_DATA_REBRAID_QUALITY_HEURISTIC_ID)]] uint rebraidQualityHeuristic = 0; -[[vk::constant_id(BUILD_SETTINGS_DATA_REBUILD_ACCELERATION_STRUCTURE_ID)]] uint rebuildAccelStruct = 0; +[[vk::constant_id(BUILD_SETTINGS_DATA_DISABLE_COMPACTION_ID)]] uint disableCompaction = 0; static const CompileTimeBuildSettings Settings = { topLevelBuild, @@ -79,7 +78,7 @@ static const CompileTimeBuildSettings Settings = { enableTopDownBuild, useMortonCode30, enableMergeSort, - fastBuildThreshold, + 0, enableFusedInstanceNode, tsPriority, numRebraidIterations, @@ -109,7 +108,7 @@ static const CompileTimeBuildSettings Settings = { 0, 0, 0, - rebuildAccelStruct, + disableCompaction, }; #endif diff --git a/src/shaders/CMakeLists.txt b/src/shaders/CMakeLists.txt index 4b5b5dd..54ef25e 100644 --- a/src/shaders/CMakeLists.txt +++ b/src/shaders/CMakeLists.txt @@ -39,7 +39,7 @@ set(gpurtHlsl AccelStructTracker.hlsl BuildRootSignature.hlsl BuildBVH.hlsl - BuildBVHPLOC.hlsl + BuildPLOC.hlsl BuildBVHTDTR.hlsl BuildCommon.hlsl BuildCommonScratch.hlsl @@ -119,11 +119,22 @@ set(otherDeps ../shared/gpurtBuildConstants.h ../shared/hlslTypes.h ../shared/rayTracingDefs.h - ../shared/scratchNode.h + ../shadersClean/common/Bits.hlsli ../shadersClean/common/Math.hlsli ../shadersClean/common/Math.hlsl ../shadersClean/common/Extensions.hlsli + ../shadersClean/common/Extensions.hlsl ../shadersClean/common/ShaderDefs.hlsli + ../shadersClean/common/BoundingBox.hlsli + ../shadersClean/common/InstanceDesc.hlsli + ../shadersClean/common/NodePointers.hlsli + ../shadersClean/common/ScratchNode.hlsli + ../shadersClean/common/TempAssert.hlsli + ../shadersClean/traversal/TraversalDefs.hlsli + ../shadersClean/common/gfx10/BoxNode1_0.hlsli + 
../shadersClean/common/gfx10/InstanceNode1_0.hlsli + ../shadersClean/common/gfx10/ProceduralNode1_0.hlsli + ../shadersClean/common/gfx10/TriangleNode1_0.hlsli ) set(GPURT_SHADER_SOURCE_FILES "${gpurtHlsl}" "${otherDeps}" PARENT_SCOPE) diff --git a/src/shaders/Common.hlsl b/src/shaders/Common.hlsl index d4562fb..1b55ccf 100644 --- a/src/shaders/Common.hlsl +++ b/src/shaders/Common.hlsl @@ -35,7 +35,7 @@ #define _COMMON_HLSL #include "../shared/rayTracingDefs.h" -#include "../shared/scratchNode.h" +#include "../shadersClean/common/ScratchNode.hlsli" typedef AccelStructDataOffsets AccelStructOffsets; @@ -62,16 +62,19 @@ typedef AccelStructDataOffsets AccelStructOffsets; #define INVALID_IDX 0xffffffff // Node pointer values with special meanings -#define INVALID_NODE 0xffffffff -#define TERMINAL_NODE 0xfffffffe -#define SKIP_0_3 0xfffffffd -#define SKIP_4_7 0xfffffffb -#define SKIP_0_7 0xfffffff9 -#define END_SEARCH 0xfffffff8 -#define DEAD_LANE 0xfffffff7 +#define INVALID_NODE 0xffffffff +#define TERMINAL_NODE 0xfffffffe +#define SKIP_0_3 0xfffffffd +#define SKIP_4_7 0xfffffffb +#define SKIP_0_7 0xfffffff9 +#define END_SEARCH 0xfffffff8 +#define DEAD_LANE_WITHOUT_STACK 0xfffffff7 +#define DEAD_LANE_WITH_STACK 0xfffffff6 #include "Extensions.hlsl" #include "../shadersClean/common/Math.hlsli" +#include "../shadersClean/common/BoundingBox.hlsli" +#include "../shadersClean/common/NodePointers.hlsli" #ifdef __cplusplus static const float NaN = std::numeric_limits::quiet_NaN(); @@ -399,29 +402,57 @@ static bool CheckHandleProceduralUserNode(in uint nodePointer) } //===================================================================================================================== -static uint WriteTriangleIdField(uint triangleId, uint nodeType, uint rotation, uint geometryFlags) +static uint3 ComputeQuadTriangleVertexIndex( + uint triangleIndex, // Numeric constant (0 or 1) + uint rotation) +{ + // triangle_0 vertex mapping + // + // rotation 0: t0: v0, v1, v2 + // 
rotation 1: t0: v1, v2, v0 + // rotation 2: t0: v2, v0, v1 + // + + // triangle_1 vertex mapping + // + // rotation 0: t1: v1, v3, v2 + // rotation 1: t1: v3, v2, v1 + // rotation 2: t1: v2, v1, v3 + // + const uint packedVertexMapping = (triangleIndex == 0) ? 0x10210 : 0x31231; + const uint packedMapping = packedVertexMapping >> (rotation * 4); + + return uint3((packedMapping >> 0) & 0xF, + (packedMapping >> 4) & 0xF, + (packedMapping >> 8) & 0xF); +} + +//===================================================================================================================== +static uint WriteTriangleIdField(uint triangleId, uint nodeType, uint rotation, uint boxNodeFlags) { const uint triangleShift = nodeType * TRIANGLE_ID_BIT_STRIDE; + // Hardware triangle ID barycentric mapping indicates the triangle vertex rotation. This maps to triangle vertex + // mapping for triangle index 0 in the quad. + const uint3 index = ComputeQuadTriangleVertexIndex(0, rotation); + // Compute the barycentrics mapping table that is stored in triangle_id for RT IP 1.1 - triangleId |= ((rotation + 1) % 3) << (triangleShift + TRIANGLE_ID_I_SRC_SHIFT); - triangleId |= ((rotation + 2) % 3) << (triangleShift + TRIANGLE_ID_J_SRC_SHIFT); + triangleId |= (index.y) << (triangleShift + TRIANGLE_ID_I_SRC_SHIFT); + triangleId |= (index.z) << (triangleShift + TRIANGLE_ID_J_SRC_SHIFT); // Add in the flags stored in triangle_id for RT IP 2.0 - if (geometryFlags & D3D12_RAYTRACING_GEOMETRY_FLAG_OPAQUE) + if (boxNodeFlags & (1u << BOX_NODE_FLAGS_ONLY_OPAQUE_SHIFT)) { triangleId |= 1u << (triangleShift + TRIANGLE_ID_OPAQUE_SHIFT); } + if (boxNodeFlags & (1u << BOX_NODE_FLAGS_ONLY_PROCEDURAL_SHIFT)) + { + triangleId |= 1u << (triangleShift + TRIANGLE_ID_PROCEDURAL_SHIFT); + } return triangleId; } -//===================================================================================================================== -static uint CalcUncompressedTriangleId(uint geometryFlags) -{ - return
WriteTriangleIdField(0, NODE_TYPE_TRIANGLE_0, 0, geometryFlags); -} - //===================================================================================================================== // Extract the order of the triangle vertices from the node's triangle ID field. static uint3 CalcTriangleCompressionVertexIndices( @@ -810,7 +841,7 @@ static uint32_t GetInstanceSidebandOffset( // Node pointers with all upper bits set are sentinels: INVALID_NODE, TERMINAL_NODE, SKIP_* static bool IsValidNode(uint nodePtr) { - return nodePtr < DEAD_LANE; + return nodePtr < DEAD_LANE_WITH_STACK; } //====================================================================================================================== diff --git a/src/shaders/Continuations1_1.hlsl b/src/shaders/Continuations1_1.hlsl index ee528c4..1d17e9d 100644 --- a/src/shaders/Continuations1_1.hlsl +++ b/src/shaders/Continuations1_1.hlsl @@ -161,7 +161,6 @@ static _AmdTraversalState InitTraversalState1_1( traversal.committed.currNodePtr = INVALID_NODE; // Start traversing from root node - traversal.nextNodePtr = isValid ? 
CreateRootNodePointer1_1() : INVALID_NODE; traversal.reservedNodePtr = INVALID_NODE; traversal.lastInstanceRootNodePtr = INVALID_NODE; @@ -239,7 +238,7 @@ static void TraversalInternal1_1( float2 committedBarycentrics = data.traversal.committedBarycentrics; candidateBarycentrics = float2(0.0f, 0.0f); - uint nextNodePtr = data.traversal.nextNodePtr; + uint nextNodePtr = data.dispatch.nextNodePtr; float3 candidateRayOrigin = topLevelRayOrigin; float3 candidateRayDirection = topLevelRayDirection; state = TRAVERSAL_STATE_COMMITTED_NOTHING; @@ -546,7 +545,7 @@ static void TraversalInternal1_1( data.traversal.stackPtr = stack.Pack(); // Pack traversal results back into traversal state structure - data.traversal.nextNodePtr = nextNodePtr; + data.dispatch.nextNodePtr = nextNodePtr; data.traversal.committed = committed; data.traversal.committedBarycentrics = committedBarycentrics; #if REMAT_INSTANCE_RAY == 0 diff --git a/src/shaders/Continuations2_0.hlsl b/src/shaders/Continuations2_0.hlsl index 19c3cfd..221143c 100644 --- a/src/shaders/Continuations2_0.hlsl +++ b/src/shaders/Continuations2_0.hlsl @@ -46,7 +46,6 @@ static _AmdTraversalState InitTraversalState2_0( traversal.committed.currNodePtr = INVALID_NODE; // Start traversing from root node - traversal.nextNodePtr = isValid ? 
CreateRootNodePointer1_1() : TERMINAL_NODE; traversal.reservedNodePtr = INVALID_NODE; traversal.lastInstanceRootNodePtr = INVALID_NODE; @@ -127,7 +126,7 @@ static void TraversalInternal2_0( instanceFlagsPreserveBits <<= POINTER_FLAGS_HIDWORD_SHIFT; - uint nextNodePtr = data.traversal.nextNodePtr; + uint nextNodePtr = data.dispatch.nextNodePtr; float3 candidateRayOrigin = topLevelRayOrigin; float3 candidateRayDirection = topLevelRayDirection; state = TRAVERSAL_STATE_COMMITTED_NOTHING; @@ -361,7 +360,7 @@ static void TraversalInternal2_0( candidate.currNodePtr = nodePtr; if (Options::getCpsCandidatePrimitiveMode() == CpsCandidatePrimitiveMode::DeferFirst) { - haveCandidate = true; + haveCandidate = true; } else { @@ -552,7 +551,7 @@ static void TraversalInternal2_0( } // Pack traversal results back into traversal state structure - data.traversal.nextNodePtr = nextNodePtr; + data.dispatch.nextNodePtr = nextNodePtr; data.traversal.committed = committed; data.traversal.committedBarycentrics = committedBarycentrics; #if REMAT_INSTANCE_RAY == 0 diff --git a/src/shaders/CopyAS.hlsl b/src/shaders/CopyAS.hlsl index ded9b60..2ca420e 100644 --- a/src/shaders/CopyAS.hlsl +++ b/src/shaders/CopyAS.hlsl @@ -22,7 +22,8 @@ * SOFTWARE. * **********************************************************************************************************************/ -#include "Common.hlsl" +#include "../../gpurt/gpurtAccelStruct.h" +#include "../shared/rayTracingDefs.h" // Note, CBV(b255) must be the last used binding in the root signature. 
#define RootSig "RootConstants(num32BitConstants=3, b0, visibility=SHADER_VISIBILITY_ALL), "\ @@ -78,7 +79,7 @@ void CopyAS(in uint3 globalThreadId : SV_DispatchThreadID) if (globalID == 0) { // Offset to acceleration structure header - uint64_t gpuVa = MakeGpuVirtualAddress(ShaderConstants.AddressLo, ShaderConstants.AddressHi); + uint64_t gpuVa = PackUint64(ShaderConstants.AddressLo, ShaderConstants.AddressHi); gpuVa += metadataSizeInBytes; // Patch metadata header diff --git a/src/shaders/Debug.hlsl b/src/shaders/Debug.hlsl index 00a8724..fd45358 100644 --- a/src/shaders/Debug.hlsl +++ b/src/shaders/Debug.hlsl @@ -29,8 +29,10 @@ #include "Common.hlsl" #include "Extensions.hlsl" +#define GPURT_DEBUG_BUFFER_AVAILABLE (GPURT_ENABLE_GPU_DEBUG && GPURT_BVH_BUILD_SHADER && defined(DEBUG_BUFFER_SLOT)) + #if GPURT_ENABLE_GPU_DEBUG - #if BUILD_PARALLEL || TRIVIAL_BUILDER + #if GPURT_DEBUG_BUFFER_AVAILABLE #define GPU_ASSERT_IMPL(id, cond) DoGpuAssert(id, (cond)) #define GPU_DPF_IMPL(id, msg, ...) \ do \ @@ -44,7 +46,6 @@ { \ if (IsDebugHaltEnabled() && !(cond)) { Halt(); } \ } while (false) - #define GPU_DPF_IMPL(msg, ...) 
#endif @@ -69,7 +70,7 @@ void Halt() AmdExtD3DShaderIntrinsics_Halt(); } -#if GPURT_ENABLE_GPU_DEBUG && (BUILD_PARALLEL || TRIVIAL_BUILDER) +#if GPURT_DEBUG_BUFFER_AVAILABLE globallycoherent RWByteAddressBuffer DebugBuffer : register( DEBUG_BUFFER_SLOT ); diff --git a/src/shaders/EncodeCommon.hlsl b/src/shaders/EncodeCommon.hlsl index 3eca227..fb666c6 100644 --- a/src/shaders/EncodeCommon.hlsl +++ b/src/shaders/EncodeCommon.hlsl @@ -24,6 +24,8 @@ **********************************************************************************************************************/ #include "BuildCommonScratch.hlsl" +#include "../shared/rayTracingDefs.h" + #include "TrianglePrimitive.hlsl" #include "UpdateCommon.hlsl" @@ -62,8 +64,7 @@ void WriteScratchTriangleNode( flags |= SCRATCH_NODE_FLAGS_DISABLE_TRIANGLE_SPLIT_MASK; } - const uint triangleId = CalcUncompressedTriangleId(geometryFlags); - const uint packedFlags = PackScratchNodeFlags(instanceMask, flags, triangleId); + const uint packedFlags = PackScratchNodeFlags(instanceMask, flags, 0); data = uint4(INVALID_IDX, 0, 0, packedFlags); WriteScratchNodeDataAtOffset(offset, SCRATCH_NODE_SPLIT_BOX_INDEX_OFFSET, data); @@ -173,16 +174,15 @@ void EncodeTriangleNode( const uint nodeOffset = metadataSize + ExtractNodePointerOffset(nodePointer); const uint nodeType = GetNodeType(nodePointer); - uint3 vertexOffsets; + triangleId = SrcBuffer.Load(nodeOffset + TRIANGLE_NODE_ID_OFFSET); + uint3 vertexOffsets; if (Settings.triangleCompressionMode != NO_TRIANGLE_COMPRESSION) { - triangleId = SrcBuffer.Load(nodeOffset + TRIANGLE_NODE_ID_OFFSET); vertexOffsets = CalcTriangleCompressionVertexOffsets(nodeType, triangleId); } else { - triangleId = CalcUncompressedTriangleId(geomConstants.geometryFlags); vertexOffsets = CalcTriangleVertexOffsets(nodeType); } @@ -284,11 +284,11 @@ void EncodeTriangleNode( const bool isActiveTriangle = IsActive(tri); if (isActiveTriangle) { - if (Settings.sceneBoundsCalculationType == SceneBoundsBasedOnGeometry) + if 
(Settings.sceneBoundsCalculationType == (uint)SceneBoundsCalculation::BasedOnGeometry) { UpdateSceneBounds(ShaderConstants.offsets.sceneBounds, boundingBox); } - else if (Settings.sceneBoundsCalculationType == SceneBoundsBasedOnGeometryWithSize) + else if (Settings.sceneBoundsCalculationType == (uint)SceneBoundsCalculation::BasedOnGeometryWithSize) { // TODO: with tri splitting, need to not update "size" here UpdateSceneBoundsWithSize(ShaderConstants.offsets.sceneBounds, boundingBox); @@ -411,12 +411,11 @@ void WriteScratchProceduralNode( WriteScratchNodeDataAtOffset(offset, SCRATCH_NODE_V2_OFFSET, data); // type, flags, splitBox, numPrimitivesAndDoCollapse - uint triangleId = 0; // Instance mask is assumed 0 in bottom level acceleration structures const uint flags = CalcProceduralBoxNodeFlags(geometryFlags); - const uint packedFlags = PackScratchNodeFlags(instanceMask, flags, triangleId); + const uint packedFlags = PackScratchNodeFlags(instanceMask, flags, 0); data = uint4(INVALID_IDX, 0, 0, packedFlags); WriteScratchNodeDataAtOffset(offset, SCRATCH_NODE_SPLIT_BOX_INDEX_OFFSET, data); @@ -525,11 +524,11 @@ void EncodeAabbNode( } else { - if (Settings.sceneBoundsCalculationType == SceneBoundsBasedOnGeometry) + if (Settings.sceneBoundsCalculationType == (uint)SceneBoundsCalculation::BasedOnGeometry) { UpdateSceneBounds(ShaderConstants.offsets.sceneBounds, boundingBox); } - else if (Settings.sceneBoundsCalculationType == SceneBoundsBasedOnGeometryWithSize) + else if (Settings.sceneBoundsCalculationType == (uint)SceneBoundsCalculation::BasedOnGeometryWithSize) { UpdateSceneBoundsWithSize(ShaderConstants.offsets.sceneBounds, boundingBox); } diff --git a/src/shaders/EncodeHwBvhCommon.hlsl b/src/shaders/EncodeHwBvhCommon.hlsl index 22e6a0b..b95cd09 100644 --- a/src/shaders/EncodeHwBvhCommon.hlsl +++ b/src/shaders/EncodeHwBvhCommon.hlsl @@ -141,10 +141,9 @@ void PostHwBvhBuild( offsets, metadataSizeInBytes); - // Rebuilding an updateable acceleration structure need to use 
the original size and not compacted one. - if (Settings.rebuildAccelStruct) + if (Settings.disableCompaction) { - compactedSize = ShaderConstants.header.compactedSizeInBytes; + compactedSize = ShaderConstants.header.sizeInBytes; } WriteAccelStructHeaderField(ACCEL_STRUCT_HEADER_COMPACTED_BYTE_SIZE_OFFSET, compactedSize); diff --git a/src/shaders/EncodeNodes.hlsl b/src/shaders/EncodeNodes.hlsl index 2e516b8..2075069 100644 --- a/src/shaders/EncodeNodes.hlsl +++ b/src/shaders/EncodeNodes.hlsl @@ -22,7 +22,7 @@ * SOFTWARE. * **********************************************************************************************************************/ -#include "..\shared\rayTracingDefs.h" +#include "../shared/rayTracingDefs.h" #define GC_DSTBUFFER #define GC_DSTMETADATA @@ -212,11 +212,11 @@ void EncodeQuadNodes( const bool isActive = IsActive(tri); if (isActive) { - if (Settings.sceneBoundsCalculationType == SceneBoundsBasedOnGeometry) + if (Settings.sceneBoundsCalculationType == (uint)SceneBoundsCalculation::BasedOnGeometry) { UpdateSceneBounds(ShaderConstants.offsets.sceneBounds, boundingBox); } - else if (Settings.sceneBoundsCalculationType == SceneBoundsBasedOnGeometryWithSize) + else if (Settings.sceneBoundsCalculationType == (uint)SceneBoundsCalculation::BasedOnGeometryWithSize) { // TODO: with tri splitting, need to not update "size" here UpdateSceneBoundsWithSize(ShaderConstants.offsets.sceneBounds, boundingBox); @@ -295,18 +295,14 @@ void EncodeQuadNodes( if (hasValidQuad) { - const uint triT0Rotation = (pairInfo & 0xF); - const uint triT1Rotation = (pairInfo >> 4) & 0xF; - WriteScratchQuadNode(dstScratchNodeIdx, geomId, geomConstants.geometryFlags, tri1, primId1, - triT1Rotation, tri, primId, - triT0Rotation); + pairInfo & 0xFF); } else if (pairInfo == -1) { diff --git a/src/shaders/EncodePairedTriangleImpl.hlsl b/src/shaders/EncodePairedTriangleImpl.hlsl index 090b544..2fc83b5 100644 --- a/src/shaders/EncodePairedTriangleImpl.hlsl +++ 
b/src/shaders/EncodePairedTriangleImpl.hlsl @@ -48,9 +48,8 @@ void WriteScratchTriangleNode( const BoundingBox box = GenerateTriangleBoundingBox(tri.v0, tri.v1, tri.v2); // Set the instance inclusion mask to 0 for degenerate triangles so that they are culled out. const uint instanceMask = (box.min.x > box.max.x) ? 0 : 0xff; - const uint triangleId = WriteTriangleIdField(0, NODE_TYPE_TRIANGLE_0, 0, geometryFlags); - const uint packedFlags = PackScratchNodeFlags(instanceMask, CalcTriangleBoxNodeFlags(geometryFlags), triangleId); + const uint packedFlags = PackScratchNodeFlags(instanceMask, CalcTriangleBoxNodeFlags(geometryFlags), 0); data = uint4(0, 0, 0, packedFlags); WriteScratchNodeDataAtOffset(offset, SCRATCH_NODE_SPLIT_BOX_INDEX_OFFSET, data); @@ -63,21 +62,12 @@ void WriteScratchQuadNode( uint geometryFlags, TriangleData tri1, uint tri1PrimIdx, - uint triT1Rotation, TriangleData tri0, uint tri0PrimIdx, - uint triT0Rotation) + uint quadSwizzle) { // TODO: For Navi3, we can directly write the scratch node data to the result leaf node data section // - uint triangleId = 0; - - // triT0 - NODE_TYPE_TRIANGLE_0 (2nd to intersect) - triangleId = WriteTriangleIdField(triangleId, NODE_TYPE_TRIANGLE_0, triT0Rotation, geometryFlags); - - // triT1 - NODE_TYPE_TRIANGLE_1 (1st to intersect) - triangleId = WriteTriangleIdField(triangleId, NODE_TYPE_TRIANGLE_1, triT1Rotation, geometryFlags); - uint offset = CalcScratchNodeOffset(ShaderConstants.offsets.bvhLeafNodeData, dstScratchNodeIdx); WriteScratchNodeDataAtOffset(offset, SCRATCH_NODE_PRIMITIVE_ID_OFFSET, tri1PrimIdx); @@ -90,8 +80,8 @@ void WriteScratchQuadNode( WriteScratchNodeDataAtOffset(offset, SCRATCH_NODE_GEOMETRY_INDEX_OFFSET, packedGeomId); WriteScratchNodeDataAtOffset(offset, SCRATCH_NODE_PARENT_OFFSET, INVALID_IDX); - const uint3 t0VtxIndices = CalcTriangleCompressionVertexIndices(NODE_TYPE_TRIANGLE_0, triangleId); - const uint3 t1VtxIndices = CalcTriangleCompressionVertexIndices(NODE_TYPE_TRIANGLE_1, 
triangleId); + const uint3 t0VtxIndices = ComputeQuadTriangleVertexIndex(0, (quadSwizzle >> 0) & 0xF); + const uint3 t1VtxIndices = ComputeQuadTriangleVertexIndex(1, (quadSwizzle >> 4) & 0xF); const uint3 t1VtxOffsets = SCRATCH_NODE_V0_OFFSET + (t1VtxIndices * SCRATCH_NODE_TRIANGLE_VERTEX_STRIDE); WriteScratchNodeDataAtOffset(offset, t1VtxOffsets.x, tri1.v0); @@ -117,7 +107,8 @@ void WriteScratchQuadNode( // Set the instance inclusion mask to 0 for degenerate triangles so that they are culled out. const uint instanceMask = (box.min.x > box.max.x) ? 0 : 0xff; - const uint packedFlags = PackScratchNodeFlags(instanceMask, CalcTriangleBoxNodeFlags(geometryFlags), triangleId); + + const uint packedFlags = PackScratchNodeFlags(instanceMask, CalcTriangleBoxNodeFlags(geometryFlags), quadSwizzle); WriteScratchNodeDataAtOffset(offset, SCRATCH_NODE_FLAGS_OFFSET, packedFlags); } @@ -200,20 +191,20 @@ float ComputeEdgeBoxSurfaceArea( uint rotation) { // triangle v1, v2, v0 - float3 e0 = (vertices[1]); - float3 e1 = (vertices[0]); + float3 e0 = vertices[1]; + float3 e1 = vertices[0]; if (rotation == 0) { // triangle v0, v1, v2 - e0 = (vertices[0]); - e1 = (vertices[2]); + e0 = vertices[0]; + e1 = vertices[2]; } else if (rotation == 1) { // triangle v2, v0, v1 - e0 = (vertices[2]); - e1 = (vertices[1]); + e0 = vertices[2]; + e1 = vertices[1]; } BoundingBox edgeBox = (BoundingBox)0; diff --git a/src/shaders/EncodeTopLevel.hlsl b/src/shaders/EncodeTopLevel.hlsl index 689a4ff..00419bc 100644 --- a/src/shaders/EncodeTopLevel.hlsl +++ b/src/shaders/EncodeTopLevel.hlsl @@ -22,7 +22,7 @@ * SOFTWARE. 
* **********************************************************************************************************************/ -#include "..\shared\rayTracingDefs.h" +#include "../shared/rayTracingDefs.h" #include "BuildRootSignature.hlsl" diff --git a/src/shaders/EncodeTopLevelBuild.hlsl b/src/shaders/EncodeTopLevelBuild.hlsl index 8e1b618..2424f4a 100644 --- a/src/shaders/EncodeTopLevelBuild.hlsl +++ b/src/shaders/EncodeTopLevelBuild.hlsl @@ -22,6 +22,10 @@ * SOFTWARE. * **********************************************************************************************************************/ +#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ScratchNode.hlsli" + +//===================================================================================================================== void WriteScratchInstanceNode( uint offset, uint instanceIndex, @@ -137,7 +141,7 @@ void EncodeInstancesBuild( if (IsRebraidEnabled() == false) { // Update scene bounding box - if (Settings.sceneBoundsCalculationType == SceneBoundsBasedOnGeometryWithSize) + if (Settings.sceneBoundsCalculationType == (uint)SceneBoundsCalculation::BasedOnGeometryWithSize) { UpdateSceneBoundsWithSize(ShaderConstants.offsets.sceneBounds, boundingBox); } diff --git a/src/shaders/Extensions.hlsl b/src/shaders/Extensions.hlsl index bfc0812..e78c92b 100644 --- a/src/shaders/Extensions.hlsl +++ b/src/shaders/Extensions.hlsl @@ -29,11 +29,7 @@ #include "../shadersClean/common/Extensions.hlsli" #include "../shadersClean/common/Math.hlsli" -#if !defined(__cplusplus) - // Dummy implementation for Vulkan build only -__decl uint AmdExtLaneCount() DUMMY_UINT_FUNC - __decl uint AmdExtD3DShaderIntrinsics_LoadDwordAtAddr( uint gpuVaLoBits, uint gpuVaHiBits, uint offset) DUMMY_UINT_FUNC @@ -168,8 +164,6 @@ uint AmdExtD3DShaderIntrinsics_WaveClusterBitOr(uint x, uint dxClusterSize) return spirv_OpGroupNonUniformBitwiseOr_clustered(/* Subgroup */ 3, /* ClusteredReduce */ 3, x, clusterSize); } -#endif - 
//===================================================================================================================== // The following extension functions are driver intrinsic functions // @@ -281,7 +275,13 @@ __decl uint AmdTraceRayGetBoxSortHeuristicMode() DUMMY_UINT_FUNC __decl uint2 AmdTraceRayMakePC(uint pcVaLow) DUMMY_UINT2_FUNC __decl uint AmdTraceRayGetKnownSetRayFlags() DUMMY_UINT_FUNC __decl uint AmdTraceRayGetKnownUnsetRayFlags() DUMMY_UINT_FUNC +#if GPURT_CLIENT_INTERFACE_MAJOR_VERSION >= 50 +__decl uint AmdTraceRayInitStaticId() DUMMY_UINT_FUNC +#else __decl void AmdTraceRayInitStaticId() DUMMY_VOID_FUNC +#endif +__decl uint AmdTraceRayPersistentLdsAtomicAdd(uint offset, uint data) DUMMY_UINT_FUNC +__decl uint AmdTraceRayPersistentLdsWrite(uint offset, uint data) DUMMY_UINT_FUNC //===================================================================================================================== // Ref: GpuRt::Device::GetStaticPipelineFlags @@ -324,11 +324,12 @@ __decl uint AmdExtLoadDwordAtAddrUncached(uint64_t addr, uint offset) DUMMY_UIN __decl void AmdExtStoreDwordAtAddrUncached(uint64_t addr, uint offset, uint value) DUMMY_VOID_FUNC __decl uint3 AmdExtGroupIdCompute() DUMMY_UINT3_FUNC __decl uint3 AmdExtGroupDimCompute() DUMMY_UINT3_FUNC -__decl void AmdExtSleep(uint value) DUMMY_VOID_FUNC +__decl uint AmdExtLaneCount() DUMMY_UINT_FUNC +__decl void AmdExtSleep(uint value) DUMMY_VOID_FUNC #if USE_TEMP_ARRAY_STACK //===================================================================================================================== -// Register based stack (shared with __cplusplus path) +// Register based stack #define SHORT_STACK_SIZE 16 //===================================================================================================================== @@ -358,203 +359,46 @@ __decl uint AmdTraceRayGetStackSize() DUMMY_UINT_FUNC #define ANYHIT_CALLTYPE_NO_DUPLICATE 1 #define ANYHIT_CALLTYPE_DUPLICATE 2 -#ifdef __cplusplus 
-//===================================================================================================================== -static uint LoadDwordAtAddr(GpuVirtualAddress addr) -{ - return *reinterpret_cast(addr); -} -#else //===================================================================================================================== static uint LoadDwordAtAddr(GpuVirtualAddress addr) { return AmdExtD3DShaderIntrinsics_LoadDwordAtAddr(LowPart(addr), HighPart(addr), 0); } -#endif //===================================================================================================================== static uint2 LoadDwordAtAddrx2(GpuVirtualAddress addr) { -#if !defined(__cplusplus) return AmdExtD3DShaderIntrinsics_LoadDwordAtAddrx2(LowPart(addr), HighPart(addr), 0); -#else - uint2 retVal; - retVal.x = LoadDwordAtAddr(addr); - retVal.y = LoadDwordAtAddr(addr + 4); - - return retVal; -#endif } //===================================================================================================================== static uint3 LoadDwordAtAddrx3(GpuVirtualAddress addr) { -#if !defined(__cplusplus) return AmdExtD3DShaderIntrinsics_LoadDwordAtAddrx3(LowPart(addr), HighPart(addr), 0); -#else - uint3 retVal; - retVal.x = LoadDwordAtAddr(addr); - retVal.y = LoadDwordAtAddr(addr + 4); - retVal.z = LoadDwordAtAddr(addr + 8); - - return retVal; -#endif } //===================================================================================================================== static uint4 LoadDwordAtAddrx4(GpuVirtualAddress addr) { -#if !defined(__cplusplus) return AmdExtD3DShaderIntrinsics_LoadDwordAtAddrx4(LowPart(addr), HighPart(addr), 0); -#else - uint4 retVal; - retVal.x = LoadDwordAtAddr(addr); - retVal.y = LoadDwordAtAddr(addr + 4); - retVal.z = LoadDwordAtAddr(addr + 8); - retVal.w = LoadDwordAtAddr(addr + 12); - - return retVal; -#endif } static uint ConstantLoadDwordAtAddr(GpuVirtualAddress addr) { -#if !defined(__cplusplus) return 
AmdExtD3DShaderIntrinsics_ConstantLoadDwordAtAddr(LowPart(addr), HighPart(addr), 0); -#else - return AmdExtConstantLoadDwordAtAddr(addr, 0); -#endif } static uint64_t ConstantLoadDwordAtAddrx2(GpuVirtualAddress addr) { -#if !defined(__cplusplus) uint2 retVal = AmdExtD3DShaderIntrinsics_ConstantLoadDwordAtAddrx2(LowPart(addr), HighPart(addr), 0); return PackUint64(retVal.x, retVal.y); -#else - return AmdExtConstantLoad64AtAddr(addr, 0); -#endif } static uint4 ConstantLoadDwordAtAddrx4(GpuVirtualAddress addr) { -#if !defined(__cplusplus) return AmdExtD3DShaderIntrinsics_ConstantLoadDwordAtAddrx4(LowPart(addr), HighPart(addr), 0); -#else - uint4 retVal; - retVal.xy = SplitUint64(AmdExtConstantLoad64AtAddr(addr, 0)); - retVal.zw = SplitUint64(AmdExtConstantLoad64AtAddr(addr + 8, 0)); - - return retVal; -#endif } -#ifdef __cplusplus -#include -static constexpr uint RoundModeTable[] = -{ - FE_TONEAREST, - FE_UPWARD, - FE_DOWNWARD, - FE_TOWARDZERO, -}; - -//===================================================================================================================== -static float FloatOpWithRoundMode(uint roundMode, uint operation, float src0, float src1) -{ - std::fesetround(RoundModeTable[roundMode]); - - float result; - - switch (operation) - { - case AmdExtD3DShaderIntrinsicsFloatOpWithRoundMode_Add: - result = src0 + src1; - break; - - case AmdExtD3DShaderIntrinsicsFloatOpWithRoundMode_Subtract: - result = src0 - src1; - break; - - case AmdExtD3DShaderIntrinsicsFloatOpWithRoundMode_Multiply: - result = src0 * src1; - break; - - default: - printf("Unknown operation for FloatOpWithRoundMode\n"); - assert(false); - break; - } - - std::fesetround(FE_TONEAREST); - - return result; -} - -//===================================================================================================================== -static float2 FloatOpWithRoundMode(uint roundMode, uint operation, float2 src0, float2 src1) -{ - std::fesetround(RoundModeTable[roundMode]); - - float2 result; - 
- switch (operation) - { - case AmdExtD3DShaderIntrinsicsFloatOpWithRoundMode_Add: - result = src0 + src1; - break; - - case AmdExtD3DShaderIntrinsicsFloatOpWithRoundMode_Subtract: - result = src0 - src1; - break; - - case AmdExtD3DShaderIntrinsicsFloatOpWithRoundMode_Multiply: - result = src0 * src1; - break; - - default: - printf("Unknown operation for FloatOpWithRoundMode\n"); - assert(false); - break; - } - - std::fesetround(FE_TONEAREST); - - return result; -} - -//===================================================================================================================== -static float3 FloatOpWithRoundMode(uint roundMode, uint operation, float3 src0, float3 src1) -{ - std::fesetround(RoundModeTable[roundMode]); - - float3 result; - - switch (operation) - { - case AmdExtD3DShaderIntrinsicsFloatOpWithRoundMode_Add: - result = src0 + src1; - break; - - case AmdExtD3DShaderIntrinsicsFloatOpWithRoundMode_Subtract: - result = src0 - src1; - break; - - case AmdExtD3DShaderIntrinsicsFloatOpWithRoundMode_Multiply: - result = src0 * src1; - break; - - default: - printf("Unknown operation for FloatOpWithRoundMode\n"); - assert(false); - break; - } - - std::fesetround(FE_TONEAREST); - - return result; -} -#else //===================================================================================================================== static float FloatOpWithRoundMode(uint roundMode, uint operation, float src0, float src1) { @@ -572,6 +416,5 @@ static float3 FloatOpWithRoundMode(uint roundMode, uint operation, float3 src0, { return AmdExtD3DShaderIntrinsics_FloatOpWithRoundMode(roundMode, operation, src0, src1); } -#endif #endif diff --git a/src/shaders/GpuRtLibrary.hlsl b/src/shaders/GpuRtLibrary.hlsl index 81bf9fa..879590c 100644 --- a/src/shaders/GpuRtLibrary.hlsl +++ b/src/shaders/GpuRtLibrary.hlsl @@ -26,6 +26,9 @@ #ifndef _GPURT_LIBRARY_HLSL #define _GPURT_LIBRARY_HLSL +#include "../shadersClean/traversal/TraversalDefs.hlsli" +#include 
"../shadersClean/common/InstanceDesc.hlsli" + // Following order matters as AccelStructTracker relies on defines from TraceRayCommon.hlsl #include "TraceRayCommon.hlsl" #include "AccelStructTracker.hlsl" diff --git a/src/shaders/GpuRtLibraryCont.hlsl b/src/shaders/GpuRtLibraryCont.hlsl index 23ed420..de0dc26 100644 --- a/src/shaders/GpuRtLibraryCont.hlsl +++ b/src/shaders/GpuRtLibraryCont.hlsl @@ -32,6 +32,7 @@ #endif #include "../shadersClean/common/Math.hlsli" +#include "../shadersClean/common/InstanceDesc.hlsli" // By default, Gpurt exports both non-continuation and continuation traversal functions. Dxcp picks one based on panel // setting. @@ -251,12 +252,19 @@ struct _AmdDispatchSystemData return dispatchId; } + static _AmdDispatchSystemData MakeDeadLaneWithStack(); + static _AmdDispatchSystemData MakeDeadLaneWithoutStack(); + uint dispatchLinearId; // Packed dispatch linear id. Combine x/y/z into 1 DWORD. uint shaderRecIdx; // Record index for local root parameters. #if DEVELOPER - uint parentId; // Record the parent Id for ray history counter, -1 for RayGen shader. + uint parentId; // Record the parent's dynamic Id for ray history counter, -1 for RayGen shader. + uint staticId; // Record the static Id of current trace ray call site. #endif + + uint nextNodePtr; // Next node pointer (moved here from _AmdTraversalState due to launch kernel VGPR limits). + // Also contains the state of the current lane (e.g. dead with or without valid stack). 
}; //===================================================================================================================== @@ -292,6 +300,10 @@ struct _AmdRaySystemState // Apply known bits common to all TraceRay calls incomingFlags = ((incomingFlags & ~AmdTraceRayGetKnownUnsetRayFlags()) | AmdTraceRayGetKnownSetRayFlags()); #endif + // Apply options overrides + incomingFlags &= ~Options::getRayFlagsOverrideForceDisableMask(); + incomingFlags |= Options::getRayFlagsOverrideForceEnableMask(); + return incomingFlags; } @@ -494,7 +506,6 @@ struct _AmdTraversalState // register space reserved for ray attributes in general float2 committedBarycentrics; - uint nextNodePtr; uint instNodePtr; // Traversal stack state. Note, on some hardware this data represents a packed stack pointer that will @@ -647,14 +658,23 @@ struct _AmdSystemData } #endif - bool IsDeadLane() + bool IsDeadLaneWithoutStack() + { + // This type of dead lane is only possible when the continuations stack is in global memory. + // Explicitly check the compile time setting to help the compiler eliminte unnecessary code at runtime. + return (dispatch.nextNodePtr == DEAD_LANE_WITHOUT_STACK) && _AmdContinuationStackIsGlobal(); + } + + bool IsDeadLaneWithStack() { - return traversal.nextNodePtr == DEAD_LANE; + // This type of dead lane is only possible when persistent launch is enabled. + // Explicitly check the compile time setting to help the compiler eliminte unnecessary code at runtime. 
+ return (dispatch.nextNodePtr == DEAD_LANE_WITH_STACK) && Options::getPersistentLaunchEnabled(); } bool IsTraversal() { - return IsValidNode(traversal.nextNodePtr); + return IsValidNode(dispatch.nextNodePtr); } bool IsChsOrMiss(in uint state) @@ -685,9 +705,14 @@ struct _AmdSystemData return IsChsOrMiss(state) && IsValidNode(traversal.committed.instNodePtr); } - static _AmdSystemData MakeDeadLane(); + static _AmdSystemData MakeDeadLaneWithStack(); + static _AmdSystemData MakeDeadLaneWithoutStack(); + // Note: _AmdDispatchSystemData must be the first member of _AmdSystemData. This allows us to save some VGPRs if + // we need to call a function that takes _AmdSystemData but doesn't actually need ray or traversal data. + // For example, the launch kernel can make a dead lane and enqueue traversal with just dispatch.nextNodePtr. _AmdDispatchSystemData dispatch; + _AmdRaySystemState ray; _AmdTraversalState traversal; #if DEVELOPER @@ -739,6 +764,7 @@ DECLARE_ENQUEUE(, uint64_t returnAddr, _AmdSystemData data) DECLARE_ENQUEUE(Traversal, uint64_t dummyReturnAddr, _AmdSystemData data) DECLARE_ENQUEUE(RayGen, uint64_t dummyReturnAddr, _AmdDispatchSystemData data) DECLARE_WAIT_ENQUEUE(Traversal, uint64_t dummyReturnAddr, _AmdSystemData data) +DECLARE_WAIT_ENQUEUE(Traversal, uint64_t dummyReturnAddr, _AmdDispatchSystemData data) DECLARE_ENQUEUE(AnyHit, uint64_t returnAddr, _AmdAnyHitSystemData data, float2 candidateBarycentrics) DECLARE_ENQUEUE(Intersection, uint64_t returnAddr, _AmdAnyHitSystemData data) @@ -762,19 +788,39 @@ DECLARE_GET_UNINITIALIZED(F32, float) DECLARE_GET_UNINITIALIZED(I32, uint32_t) DECLARE_GET_UNINITIALIZED(I64, uint64_t) DECLARE_GET_UNINITIALIZED(SystemData, _AmdSystemData) +DECLARE_GET_UNINITIALIZED(DispatchSystemData, _AmdDispatchSystemData) -#if CONTINUATIONS_LGC_STACK_LOWERING DECLARE_CONT_STACK_LOAD_LAST_USE(U32, uint32_t) DECLARE_CONT_STACK_STORE(U32, uint32_t value) DECLARE_CONT_STACK_LOAD_LAST_USE(U64, uint64_t) DECLARE_CONT_STACK_STORE(U64, 
uint64_t value) #endif -#endif -inline _AmdSystemData _AmdSystemData::MakeDeadLane() +inline _AmdDispatchSystemData _AmdDispatchSystemData::MakeDeadLaneWithStack() +{ + _AmdDispatchSystemData data = _AmdGetUninitializedDispatchSystemData(); + data.nextNodePtr = DEAD_LANE_WITH_STACK; + return data; +} + +inline _AmdDispatchSystemData _AmdDispatchSystemData::MakeDeadLaneWithoutStack() +{ + _AmdDispatchSystemData data = _AmdGetUninitializedDispatchSystemData(); + data.nextNodePtr = DEAD_LANE_WITHOUT_STACK; + return data; +} + +inline _AmdSystemData _AmdSystemData::MakeDeadLaneWithStack() +{ + _AmdSystemData data = _AmdGetUninitializedSystemData(); + data.dispatch.nextNodePtr = DEAD_LANE_WITH_STACK; + return data; +} + +inline _AmdSystemData _AmdSystemData::MakeDeadLaneWithoutStack() { _AmdSystemData data = _AmdGetUninitializedSystemData(); - data.traversal.nextNodePtr = DEAD_LANE; + data.dispatch.nextNodePtr = DEAD_LANE_WITHOUT_STACK; return data; } @@ -896,6 +942,14 @@ static float4x3 WorldToObject4x3(in uint64_t tlasBaseAddr, in uint instNodePtr) return transpose(WorldToObject3x4(tlasBaseAddr, instNodePtr)); } +//===================================================================================================================== +__decl uint3 AmdExtThreadIdInGroupCompute() DUMMY_UINT3_FUNC +__decl uint AmdExtFlattenedThreadIdInGroupCompute() DUMMY_UINT_FUNC +__decl uint AmdExtLoadDwordAtAddr(uint64_t addr, uint offset) DUMMY_UINT_FUNC +__decl void AmdExtStoreDwordAtAddr(uint64_t addr, uint offset, uint value) DUMMY_VOID_FUNC +__decl void AmdExtDeviceMemoryAcquire() DUMMY_VOID_FUNC +__decl void AmdExtDeviceMemoryRelease() DUMMY_VOID_FUNC + //===================================================================================================================== // Implementation of DispatchRaysIndex. 
export uint3 _cont_DispatchRaysIndex3(in _AmdDispatchSystemData data) @@ -907,13 +961,28 @@ export uint3 _cont_DispatchRaysIndex3(in _AmdDispatchSystemData data) // Load dispatch dimensions from constant buffer. static uint3 GetDispatchRaysDimensions() { - const uint width = DispatchRaysConstBuf.rayDispatchWidth; - const uint height = DispatchRaysConstBuf.rayDispatchHeight; - const uint depth = DispatchRaysConstBuf.rayDispatchDepth; + const uint width = DispatchRaysConstBuf.rayDispatchWidth; + const uint height = DispatchRaysConstBuf.rayDispatchHeight; + const uint depth = DispatchRaysConstBuf.rayDispatchDepth; return uint3(width, height, depth); } +//===================================================================================================================== +// Persistent dispatch size (1D). +static uint3 GetPersistentDispatchSize() +{ + // Groups needed to cover the dispatch if each thread only processes 1 ray + const uint3 rayDispatch = GetDispatchRaysDimensions(); + const uint threadsNeeded = rayDispatch.x * rayDispatch.y * rayDispatch.z; + const uint3 groupDim = AmdExtGroupDimCompute(); + const uint groupsNeeded = RoundUpQuotient(threadsNeeded, groupDim.x * groupDim.y * groupDim.z); + + // Dispatch size is the lesser of rayDispatchMaxGroups and groupsNeeded + // rayDispatchMaxGroups would mean threads handle >= 1 ray, groupsNeeded would mean threads handle <= 1 ray + return min(DispatchRaysConstBuf.rayDispatchMaxGroups, groupsNeeded); +} + //===================================================================================================================== // Implementation of DispatchRaysDimensions(). 
export uint3 _cont_DispatchRaysDimensions3(in _AmdDispatchSystemData data) @@ -987,24 +1056,19 @@ export float _cont_RayTCurrent(in _AmdSystemData data, in _AmdPrimitiveSystemSta #endif //===================================================================================================================== -__decl uint3 AmdExtThreadIdInGroupCompute() DUMMY_UINT3_FUNC -__decl uint AmdExtLoadDwordAtAddr(uint64_t addr, uint offset) DUMMY_UINT_FUNC -__decl void AmdExtStoreDwordAtAddr(uint64_t addr, uint offset, uint value) DUMMY_VOID_FUNC - -//===================================================================================================================== -// Map a thread to a ray, some threads could end up with non-existent (invalid) rays. Assuming numthreads(32, 1, 1). +// Map a thread to a ray, some threads could end up with non-existent (invalid) rays. // Note D3D12_DISPATCH_RAYS_DESC::(w x h x d) are organized to DispatchDims = (?, d, 1). static uint3 GetDispatchId() { const uint3 threadIdInGroup = AmdExtThreadIdInGroupCompute(); const uint3 groupId = AmdExtGroupIdCompute(); const uint3 dims = GetDispatchRaysDimensions(); + const uint threadGroupSize = AmdExtGroupDimCompute().x * AmdExtGroupDimCompute().y * AmdExtGroupDimCompute().z; uint3 dispatchId; dispatchId.z = groupId.y; if ((dims.x > 1) && (dims.y > 1)) { - // Use 8 x 4 tiles. /* Sample: D3D12_DISPATCH_RAYS_DESC::(w x h x d) = (18, 6, 1). Divided into 8x4 tiles(boxes). A number in a box is the group id. @@ -1020,12 +1084,12 @@ static uint3 GetDispatchId() const uint yTile = groupId.x / wTile; dispatchId.x = xTile * 8 + (threadIdInGroup.x % 8); - dispatchId.y = yTile * 4 + (threadIdInGroup.x / 8); + dispatchId.y = yTile * (threadGroupSize / 8) + (threadIdInGroup.x / 8); } else { // Do a naive 1:1 simple map. 
- const uint id = threadIdInGroup.x + 32 * groupId.x; + const uint id = threadIdInGroup.x + threadGroupSize * groupId.x; const uint gridSize = dims.x * dims.y; // width x height dispatchId.y = id / dims.x; dispatchId.x = id - (dispatchId.y * dims.x); @@ -1034,6 +1098,45 @@ static uint3 GetDispatchId() return dispatchId; } +//===================================================================================================================== +// Compute the X/Y/Z ray index based on the dispatch dimensions and a 32-bit dispatch ID +static uint3 GetDispatchId(uint width, uint height, uint dispatchId) +{ + // Progressively work from Z to Y to X, subtracting as we go along + + // Determine the Z index - divide by size of the 2D plane + const uint planeSize = width * height; + const uint z = dispatchId / planeSize; + dispatchId -= z * planeSize; + + // Split the 2D plane into 8 x 64 tiles + const uint TileWidth = 8; + const uint TileHeight = 64; + + // Determine which tile along the Y axis - divide by size of the 2D strip + const uint yTile = dispatchId / TileHeight / width; + dispatchId -= yTile * TileHeight * width; + + // Determine which tile along the X axis - divide by size of the 2D strip + // Take care in case the dispatch height is not a multiple of TileHeight + const uint xStripHeight = min(TileHeight, height - (yTile * TileHeight)); + const uint xStripSize = TileWidth * xStripHeight; + const uint xTile = dispatchId / xStripSize; + dispatchId -= xTile * xStripSize; + + // Determine Y position within the tile - divide by width of the 2D strip + // Take care in case the dispatch width is not a multiple of TileWidth + const uint xStripWidth = min(TileWidth, width - xTile * TileWidth); + const uint y = dispatchId / xStripWidth; + dispatchId -= y * xStripWidth; + + // Remainder is the X position within the tile + const uint x = dispatchId; + + // Return ray index - X/Y based on their respective tiles and position within + return uint3(xTile * TileWidth + x, yTile * 
TileHeight + y, z); +} + //===================================================================================================================== export uint _cont_InstanceIndex(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) { @@ -1144,7 +1247,7 @@ static void AcceptHit(inout_param(_AmdAnyHitSystemData) data, bool endSearch) data.base.traversal.committed = data.candidate; if (endSearch) { - data.base.traversal.nextNodePtr = END_SEARCH; // End search + data.base.dispatch.nextNodePtr = END_SEARCH; // End search } } } @@ -1186,9 +1289,9 @@ export bool _cont_IsEndSearch(in _AmdAnyHitSystemData data) { // If AnyHit shader called AcceptHitAndEndSearch, or RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH was set, nextNodePtr // is END_SEARCH. - // On the other side, the values Traversal function may set to traversal.nextNodePtr on its exit are different: + // On the other side, the values Traversal function may set to dispatch.nextNodePtr on its exit are different: // normal pointers, TERMINAL_NODE or INVALID_NODE. - return (data.base.traversal.nextNodePtr == END_SEARCH); + return (data.base.dispatch.nextNodePtr == END_SEARCH); } //===================================================================================================================== @@ -1202,9 +1305,10 @@ export uint _cont_GetContinuationStackAddr() { const uint3 threadIdInGroup = AmdExtThreadIdInGroupCompute(); const uint3 groupId = AmdExtGroupIdCompute(); + const uint threadGroupSize = AmdExtGroupDimCompute().x * AmdExtGroupDimCompute().y * AmdExtGroupDimCompute().z; - // Do a naive 1:1 simple map. Also for now, assume numthreads(32, 1, 1) - const uint id = threadIdInGroup.x + 32 * groupId.x; + // Do a naive 1:1 simple map. 
+ const uint id = threadIdInGroup.x + threadGroupSize * groupId.x; offset = id * DispatchRaysConstBuf.cpsFrontendStackSize; } @@ -1377,12 +1481,17 @@ static void RayHistorySetCandidateTCurrent(inout_param(_AmdSystemData) data, flo } //===================================================================================================================== -static void RayHistoryInitStaticId() +static void RayHistoryInitStaticId(inout_param(_AmdSystemData) data) { #if DEVELOPER if (EnableTraversalCounter()) { +#if GPURT_CLIENT_INTERFACE_MAJOR_VERSION >= 50 + data.dispatch.staticId = AmdTraceRayInitStaticId(); +#else AmdTraceRayInitStaticId(); + data.dispatch.staticId = AmdTraceRayGetStaticId(); +#endif } #endif } @@ -1436,7 +1545,7 @@ static void RayHistoryWriteBegin(inout_param(_AmdSystemData) data) data.ray.Flags(), data.ray.traceParameters, rayDesc, - AmdTraceRayGetStaticId(), + data.dispatch.staticId, data.counter.dynamicId, data.dispatch.parentId); WriteRayHistoryTokenTimeStamp(rayId, data.counter.timerBegin); @@ -1582,7 +1691,7 @@ static void RayHistoryWriteAnyHitOrProceduralStatus(inout_param(_AmdSystemData) if (EnableTraversalCounter()) { const uint rayId = GetRayId(_cont_DispatchRaysIndex3(data.dispatch)); - const uint status = (data.traversal.nextNodePtr == END_SEARCH) + const uint status = (data.dispatch.nextNodePtr == END_SEARCH) ? HIT_STATUS_ACCEPT_AND_END_SEARCH : (data.ray.AnyHitDidAccept() ? HIT_STATUS_ACCEPT : HIT_STATUS_IGNORE); @@ -1840,24 +1949,97 @@ static HitGroupInfo GetHitGroupInfo( #include "Continuations2_0.hlsl" #if CONTINUATION_ON_GPU +//===================================================================================================================== +static void LaunchRayGen(bool setupStack) +{ + uint3 dispatchId; + bool valid; + + if (Options::getPersistentLaunchEnabled() == false) + { + // Each thread will process <= 1 ray. No need for extra counter logic. 
+ dispatchId = GetDispatchId(); + valid = (dispatchId.x < DispatchRaysConstBuf.rayDispatchWidth && + dispatchId.y < DispatchRaysConstBuf.rayDispatchHeight); + } + else + { + // This is a persistent launch where each thread will process >= 1 ray. + + // This is written in a way that is intended to be correct even if threads don't reconverge after calling into + // the ray generation shader. + uint localWorkId; + const uint popCount = WaveActiveCountBits(true); + + if (WaveIsFirstLane()) + { + localWorkId = AmdTraceRayPersistentLdsAtomicAdd(0, popCount); + } + localWorkId = WaveReadLaneFirst(localWorkId) + WavePrefixCountBits(true); + + const uint3 rayDims = GetDispatchRaysDimensions(); + const uint tgCount = GetPersistentDispatchSize(); + + // Single dimension dispatch so the flattened group ID is the same as the x component of the group ID + const uint tgId = AmdExtGroupIdCompute().x; + + // Interleave waves' worth of work among CUs so that every CU does approximately the same amount of work even + // for dispatches that are smaller than the maximum occupancy of the GPU. This is probably also a bit better + // for memory and shader execution locality, since CUs should tend to stay roughly within the same region of + // the dispatch. Assume numthreads(32, 1, 1). + const uint lowPart = localWorkId & 31; + const uint highPart = localWorkId & ~31; + const uint flatDispatchId = highPart * tgCount + tgId * 32 + lowPart; + + dispatchId = GetDispatchId(rayDims.x, rayDims.y, flatDispatchId); + valid = flatDispatchId < (rayDims.x * rayDims.y * rayDims.z); + } + + // With persistent launch every lane gets a stack + if (setupStack) + { + _AmdContStackSetPtr(_cont_GetContinuationStackAddr()); + } + + if (WaveActiveAllTrue(!valid)) + { + // This wave is done. 
+ _AmdComplete(); + } + + // But only lanes that have a valid dispatch id execute RGS, the others stay dead: + if (valid) + { + _AmdDispatchSystemData systemData; + systemData.PackDispatchId(dispatchId); + systemData.shaderRecIdx = _AmdGetUninitializedI32(); +#if DEVELOPER + systemData.parentId = -1; +#endif + _AmdEnqueueRayGen(GetRayGenVpc(), _AmdGetUninitializedI64(), systemData); + } + else if (Options::getPersistentLaunchEnabled()) + { + _AmdDispatchSystemData systemData = _AmdDispatchSystemData::MakeDeadLaneWithStack(); + _AmdWaitEnqueueTraversal(GetTraversalVpc(), -1, _AmdGetUninitializedI64(), systemData); + } +} + //===================================================================================================================== // KernelEntry is entry function of the RayTracing continuation mode export void _cont_KernelEntry() { - _AmdDispatchSystemData systemData; - uint3 dispatchId = GetDispatchId(); - systemData.PackDispatchId(dispatchId); - systemData.shaderRecIdx = _AmdGetUninitializedI32(); - GPU_ASSERT(dispatchId.z < DispatchRaysConstBuf.rayDispatchDepth); - if (dispatchId.x >= DispatchRaysConstBuf.rayDispatchWidth || - dispatchId.y >= DispatchRaysConstBuf.rayDispatchHeight) + if (Options::getPersistentLaunchEnabled()) { - return; - } + if (AmdExtFlattenedThreadIdInGroupCompute() == 0) + { + AmdTraceRayPersistentLdsWrite(0, 0); + } - _AmdContStackSetPtr(_cont_GetContinuationStackAddr()); + GroupMemoryBarrierWithGroupSync(); + } - _AmdEnqueueRayGen(GetRayGenVpc(), _AmdGetUninitializedI64(), systemData); + LaunchRayGen(true); } //===================================================================================================================== @@ -1934,9 +2116,11 @@ export void _cont_TraceRay( { case RayTracingIpLevel::RtIp1_1: traversal = InitTraversalState1_1(instanceInclusionMask, rayDesc, isValid); + dispatch.nextNodePtr = isValid ? 
CreateRootNodePointer1_1() : INVALID_NODE; break; case RayTracingIpLevel::RtIp2_0: traversal = InitTraversalState2_0(instanceInclusionMask, rayDesc, isValid); + dispatch.nextNodePtr = isValid ? CreateRootNodePointer1_1() : TERMINAL_NODE; break; default: break; @@ -1947,7 +2131,7 @@ export void _cont_TraceRay( data.ray = ray; data.traversal = traversal; - RayHistoryInitStaticId(); + RayHistoryInitStaticId(data); RayHistoryWriteBegin(data); const uint callerShaderRecIdx = dispatch.shaderRecIdx; // 0 if from RayGen. @@ -2028,8 +2212,7 @@ static void TraversalInternal( { switch (_AmdGetRtip()) { -#if (GPURT_RTIP_LEVEL == 20) || (GPURT_RTIP_LEVEL == 0) - // Level 20 is used for legacy variants +#if (GPURT_RTIP_LEVEL == GPURT_RTIP_LEGACY_LEVEL) || (GPURT_RTIP_LEVEL == 0) case RayTracingIpLevel::RtIp1_1: TraversalInternal1_1(data, state, candidate, candidateBarycentrics); break; @@ -2049,7 +2232,8 @@ static void EnqueueNextShader(bool hasWorkToDo, uint64_t nextShaderAddr, uint64_ if (_AmdContinuationStackIsGlobal()) { // No work to do = dead lane, jump to traversal as a synchronization point with an empty system data - _AmdWaitEnqueueTraversal(GetTraversalVpc(), -1, _AmdGetUninitializedI64(), _AmdSystemData::MakeDeadLane()); + _AmdSystemData sysData = _AmdSystemData::MakeDeadLaneWithoutStack(); + _AmdWaitEnqueueTraversal(GetTraversalVpc(), -1, _AmdGetUninitializedI64(), sysData); } else { @@ -2118,11 +2302,6 @@ static void EnterSchedulerSection() export void _cont_Traversal( inout_param(_AmdSystemData) data) { -#if GPURT_CLIENT_INTERFACE_MAJOR_VERSION >= 41 - data.ray.PackAccelStructAndRayflags( - data.ray.AccelStruct(), - (data.ray.IncomingFlags() & ~AmdTraceRayGetKnownUnsetRayFlags()) | AmdTraceRayGetKnownSetRayFlags()); -#endif // Discard data that doesn't need to be kept alive during Traversal data.dispatch.shaderRecIdx = _AmdGetUninitializedI32(); if (!IsBvhRebraid()) @@ -2132,7 +2311,8 @@ export void _cont_Traversal( } // Write AHS/IS returned status - if 
(!data.IsDeadLane()) + bool IsDeadLane = (data.IsDeadLaneWithoutStack() || data.IsDeadLaneWithStack()); + if (!IsDeadLane) { RayHistoryWriteAnyHitOrProceduralStatus(data); } @@ -2278,6 +2458,12 @@ static IntersectionResult TraceRayInternalCPSDebug( const bool isValid = true; // already verified in the caller + _AmdDispatchSystemData dispatch = (_AmdDispatchSystemData)0; + dispatch.PackDispatchId(GetDispatchId()); +#if DEVELOPER + dispatch.parentId = -1; +#endif + // Initialise traversal system state from driver intrinsic _AmdTraversalState traversal = (_AmdTraversalState)0; switch (rtIpLevel) @@ -2286,27 +2472,25 @@ static IntersectionResult TraceRayInternalCPSDebug( traversal = InitTraversalState1_1(0, rayDesc, isValid); + dispatch.nextNodePtr = isValid ? CreateRootNodePointer1_1() : INVALID_NODE; break; case GPURT_RTIP2_0: traversal = InitTraversalState2_0(0, rayDesc, isValid); + dispatch.nextNodePtr = isValid ? CreateRootNodePointer1_1() : TERMINAL_NODE; break; default: break; } _AmdSystemData sysData = (_AmdSystemData)0; - sysData.dispatch = (_AmdDispatchSystemData)0; - sysData.dispatch.PackDispatchId(GetDispatchId()); -#if DEVELOPER - sysData.dispatch.parentId = -1; -#endif - sysData.ray = ray; - sysData.traversal = traversal; + sysData.dispatch = dispatch; + sysData.ray = ray; + sysData.traversal = traversal; // Begin outer while loop - while (sysData.traversal.nextNodePtr < TERMINAL_NODE) + while (sysData.dispatch.nextNodePtr < TERMINAL_NODE) { _AmdTraversalResultData ret = TraversalInternalDebugWrapper(sysData); uint state = ret.state; @@ -2372,7 +2556,7 @@ static IntersectionResult TraceRayInternalCPSDebug( if (status == HIT_STATUS_ACCEPT_AND_END_SEARCH) { - sysData.traversal.nextNodePtr = INVALID_NODE; + sysData.dispatch.nextNodePtr = INVALID_NODE; } } } @@ -2407,7 +2591,7 @@ static IntersectionResult TraceRayInternalCPSDebug( sysData.traversal.committed = ret.candidate; if (status == HIT_STATUS_ACCEPT_AND_END_SEARCH) { - sysData.traversal.nextNodePtr = 
INVALID_NODE; + sysData.dispatch.nextNodePtr = INVALID_NODE; } } } diff --git a/src/shaders/InitExecuteIndirect.hlsl b/src/shaders/InitExecuteIndirect.hlsl index 2980cca..d2d079b 100644 --- a/src/shaders/InitExecuteIndirect.hlsl +++ b/src/shaders/InitExecuteIndirect.hlsl @@ -123,10 +123,13 @@ void InitExecuteIndirect( { DispatchRaysDimensions dispatchRaysDescDim; + // The DispatchRays indirect argument struct follows any resource bindings + const uint dispatchRaysDescOffset = (dispatchIdx * Constants.inputBytesPerDispatch) + Constants.bindingArgsSize; + if (Constants.indirectMode == DispatchDimensions) { // vkCmdTraceRaysIndirectKHR - ray trace query dimensions - const DispatchRaysDimensions dispatchRaysDesc = InputArgBuffer.Load(0); + const DispatchRaysDimensions dispatchRaysDesc = InputArgBuffer.Load(dispatchRaysDescOffset); dispatchRaysDescDim = dispatchRaysDesc; @@ -138,7 +141,7 @@ void InitExecuteIndirect( else { // vkCmdTraceRaysIndirect2KHR- shaderTable + ray trace query dimensions - const DispatchRaysDesc dispatchRaysDesc = InputArgBuffer.Load(0); + const DispatchRaysDesc dispatchRaysDesc = InputArgBuffer.Load(dispatchRaysDescOffset); dispatchRaysDescDim.width = dispatchRaysDesc.width; dispatchRaysDescDim.height = dispatchRaysDesc.height; @@ -164,7 +167,18 @@ void InitExecuteIndirect( OutputConstants[dispatchIdx].callableTableStrideInBytes = uint(dispatchRaysDesc.callableShaderTable.stride); } - uint outputOffset = 0; + uint inputOffset = dispatchIdx * Constants.inputBytesPerDispatch; + uint outputOffset = dispatchIdx * Constants.outputBytesPerDispatch; + + // Directly copy all indirect binding args from the app buffer to our temp internal buffer + for (uint i = 0; i < Constants.bindingArgsSize; i += sizeof(uint)) + { + const uint data = InputArgBuffer.Load(inputOffset); + OutputArgBuffer.Store(outputOffset, data); + outputOffset += sizeof(uint); + inputOffset += sizeof(uint); + } + uint3 dispatchDim = uint3(0, 0, 0); switch 
(Constants.dispatchDimSwizzleMode) diff --git a/src/shaders/MergeSort.hlsl b/src/shaders/MergeSort.hlsl index 1e38736..bd1921a 100644 --- a/src/shaders/MergeSort.hlsl +++ b/src/shaders/MergeSort.hlsl @@ -134,100 +134,149 @@ uint NumElemsLessThanOrEqualTo64(uint64_t val, uint offset, uint offsetNext, uin } //===================================================================================================================== -void GlobalMerge( - inout uint numTasksWait, - inout uint waveId, - uint localId, +void GlobalMergeIteration( uint groupId, - uint numPrimitives, - uint groupCapacity, - uint activeGroups, + uint localId, + uint globalId, uint groupSize, - uint outputKeysOffset, - uint outputValuesOffset, - uint keysOffsetSwap, - uint valuesOffsetSwap, + uint groupCapacity, + uint cmpGap, + uint splitGap, + uint numPrimitives, + uint srcOffsetKey, + uint srcOffsetVal, + uint dstOffsetKey, + uint dstOffsetVal, uint useMortonCode30) { - const uint numLevelsOfMergeTree = ceil(log2(activeGroups)); - activeGroups = RoundUpQuotient(numPrimitives, groupSize); - uint cmpGap = 1; - uint splitGap = 2; + const uint groupIdNew = groupId / 2; + const uint capacity = cmpGap * groupCapacity; + bool leftSubtree = true; + uint subtreeOffset; + uint subtreeEnd; + + // Left Subtree + if (groupIdNew % splitGap < cmpGap) + { + subtreeOffset = capacity * (groupIdNew / cmpGap) + capacity; + subtreeEnd = (subtreeOffset + capacity > numPrimitives) ? 
numPrimitives : subtreeOffset + capacity; + } + // Right Subtree + else + { + subtreeEnd = (groupIdNew / cmpGap) * capacity; + subtreeOffset = subtreeEnd - capacity; + leftSubtree = false; + } - // Level 0 is the sorted partitions - for (uint level = 1; level <= numLevelsOfMergeTree; level++) + if (globalId < numPrimitives) { - BEGIN_TASK(activeGroups); - const uint groupIdNew = groupId / 2; - const uint capacity = cmpGap * groupCapacity; - bool leftSubtree = true; - uint subtreeOffset; - uint subtreeEnd; - - // Left Subtree - if (groupIdNew % splitGap < cmpGap) + if (useMortonCode30) { - subtreeOffset = capacity * (groupIdNew / cmpGap) + capacity; - subtreeEnd = (subtreeOffset + capacity > numPrimitives) ? numPrimitives : subtreeOffset + capacity; + const uint mortonCode = FetchMortonCode(srcOffsetKey, globalId); + const uint index = FetchSortedPrimIndex(srcOffsetVal, globalId); + + uint posInMergedList = localId + (groupId % splitGap) * groupSize + (groupIdNew / splitGap) * capacity * 2; + + posInMergedList += (leftSubtree) ? NumElemsLessThan(mortonCode, subtreeOffset, subtreeEnd, srcOffsetKey) : + NumElemsLessThanOrEqualTo(mortonCode, subtreeOffset, subtreeEnd, srcOffsetKey); + + WriteMortonCode(dstOffsetKey, posInMergedList, mortonCode); + WriteSortedPrimIndex(dstOffsetVal, posInMergedList, index); } - // Right Subtree else { - subtreeEnd = (groupIdNew / cmpGap) * capacity; - subtreeOffset = subtreeEnd - capacity; - leftSubtree = false; + const uint64_t mortonCode = FetchMortonCode64(srcOffsetKey, globalId); + const uint index = FetchSortedPrimIndex(srcOffsetVal, globalId); + + uint posInMergedList = localId + (groupId % splitGap) * groupSize + (groupIdNew / splitGap) * capacity * 2; + + posInMergedList += (leftSubtree) ? 
NumElemsLessThan64(mortonCode, subtreeOffset, subtreeEnd, srcOffsetKey) : + NumElemsLessThanOrEqualTo64(mortonCode, subtreeOffset, subtreeEnd, srcOffsetKey); + + WriteMortonCode64(dstOffsetKey, posInMergedList, mortonCode); + WriteSortedPrimIndex(dstOffsetVal, posInMergedList, index); } + } +} - if (globalId < numPrimitives) - { - if (useMortonCode30) - { - const uint mortonCode = FetchMortonCode(outputKeysOffset, globalId); - const uint index = FetchSortedPrimIndex(outputValuesOffset, globalId); +//===================================================================================================================== +void GlobalMerge( + inout uint numTasksWait, + inout uint waveId, + uint localId, + uint groupId, + uint numPrimitives, + uint groupCapacity, + uint numLocalSortedGroups, + uint groupSize, + uint offsetKeysOutput, + uint offsetValsOutput, + uint offsetKeysInput, + uint offsetValsInput, + uint useMortonCode30) +{ + const uint numLevelsOfMergeTree = ceil(log2(numLocalSortedGroups)); + const uint activeGroups = RoundUpQuotient(numPrimitives, groupSize); - uint posInMergedList = localId + (groupId % splitGap) * groupSize + (groupIdNew / splitGap) * capacity * 2; + // Level 0 is the local sort and always copies the sorted partitions into output buffers. The first iteration of global merge + // phase (i.e. Level 1) always copies from output buffers to swap buffers and then continues to ping-pong between these buffers + // at each iteration. + // + uint level = 1; - posInMergedList += (leftSubtree) ? NumElemsLessThan(mortonCode, subtreeOffset, subtreeEnd, outputKeysOffset) : - NumElemsLessThanOrEqualTo(mortonCode, subtreeOffset, subtreeEnd, outputKeysOffset); + // Level 0 is the sorted partitions + for (; level <= numLevelsOfMergeTree; level++) + { + // Odd levels copy from output buffers to swap buffers, while even levels copy from swap buffers to output + // buffers. + const uint srcOffsetKey = ((level & 1) == 1) ? 
offsetKeysOutput : offsetKeysInput; + const uint srcOffsetVal = ((level & 1) == 1) ? offsetValsOutput : offsetValsInput; - WriteMortonCode(keysOffsetSwap, posInMergedList, mortonCode); - WriteSortedPrimIndex(valuesOffsetSwap, posInMergedList, index); - } - else - { - const uint64_t mortonCode = FetchMortonCode64(outputKeysOffset, globalId); - const uint index = FetchSortedPrimIndex(outputValuesOffset, globalId); + const uint dstOffsetKey = ((level & 1) == 0) ? offsetKeysOutput : offsetKeysInput; + const uint dstOffsetVal = ((level & 1) == 0) ? offsetValsOutput : offsetValsInput; - uint posInMergedList = localId + (groupId % splitGap) * groupSize + (groupIdNew / splitGap) * capacity * 2; + BEGIN_TASK(activeGroups); - posInMergedList += (leftSubtree) ? NumElemsLessThan64(mortonCode, subtreeOffset, subtreeEnd, outputKeysOffset) : - NumElemsLessThanOrEqualTo64(mortonCode, subtreeOffset, subtreeEnd, outputKeysOffset); + const uint cmpGap = 1u << (level - 1); + const uint splitGap = 1u << level; + + GlobalMergeIteration(groupId, + localId, + globalId, + groupSize, + groupCapacity, + cmpGap, + splitGap, + numPrimitives, + srcOffsetKey, + srcOffsetVal, + dstOffsetKey, + dstOffsetVal, + useMortonCode30); - WriteMortonCode64(keysOffsetSwap, posInMergedList, mortonCode); - WriteSortedPrimIndex(valuesOffsetSwap, posInMergedList, index); - } - } END_TASK(activeGroups); + } - splitGap <<= 1; - cmpGap <<= 1; - + // If we ping-ponged to an odd level, we need to copy back the data from swap buffers to output buffers + if ((level & 1) == 0) + { BEGIN_TASK(activeGroups); if (globalId < numPrimitives) { if (useMortonCode30) { - const uint mortonCode = FetchMortonCode(keysOffsetSwap, globalId); - WriteMortonCode( outputKeysOffset, globalId, mortonCode); + const uint mortonCode = FetchMortonCode(offsetKeysInput, globalId); + WriteMortonCode(offsetKeysOutput, globalId, mortonCode); } else { - const uint64_t mortonCode = FetchMortonCode64(keysOffsetSwap, globalId); - WriteMortonCode64( 
outputKeysOffset, globalId, mortonCode); + const uint64_t mortonCode = FetchMortonCode64(offsetKeysInput, globalId); + WriteMortonCode64(offsetKeysOutput, globalId, mortonCode); } - const uint index = FetchSortedPrimIndex(valuesOffsetSwap, globalId); - WriteSortedPrimIndex(outputValuesOffset, globalId, index); + const uint index = FetchSortedPrimIndex(offsetValsInput, globalId); + WriteSortedPrimIndex(offsetValsOutput, globalId, index); } END_TASK(activeGroups); } @@ -464,10 +513,10 @@ void MergeSortImpl( uint localId, uint groupId, uint numPrimitives, - uint inputKeysOffset, - uint outputKeysOffset, - uint outputValuesOffset, - uint valuesOffsetSwap, + uint offsetKeysInput, + uint offsetKeysOutput, + uint offsetValsInput, + uint offsetValsOutput, uint useMortonCode30) { const uint groupCapacity = BUILD_THREADGROUP_SIZE * 2; @@ -480,9 +529,9 @@ void MergeSortImpl( numPrimitives, groupCapacity, BUILD_THREADGROUP_SIZE, - inputKeysOffset, - outputKeysOffset, - outputValuesOffset, + offsetKeysInput, + offsetKeysOutput, + offsetValsOutput, useMortonCode30); END_TASK(activeGroups); @@ -494,10 +543,10 @@ void MergeSortImpl( groupCapacity, activeGroups, BUILD_THREADGROUP_SIZE, - outputKeysOffset, - outputValuesOffset, - inputKeysOffset, - valuesOffsetSwap, + offsetKeysOutput, + offsetValsOutput, + offsetKeysInput, + offsetValsInput, useMortonCode30); // Implicit Global Sync at the end of GlobalMerge(); } @@ -526,8 +575,125 @@ void MergeSort( numPrimitives, ShaderConstants.offsets.mortonCodes, ShaderConstants.offsets.mortonCodesSorted, - ShaderConstants.offsets.primIndicesSorted, ShaderConstants.offsets.primIndicesSortedSwap, + ShaderConstants.offsets.primIndicesSorted, Settings.useMortonCode30); } + +//===================================================================================================================== +// Main Function : MergeSortLocal +//===================================================================================================================== 
+[RootSignature(RootSig)] +[numthreads(BUILD_THREADGROUP_SIZE, 1, 1)] +void MergeSortLocal( + uint globalId : SV_DispatchThreadID, + uint localId : SV_GroupThreadID, + uint groupId : SV_GroupID) +{ + const uint numPrimitives = FetchTaskCounter( + ShaderConstants.offsets.encodeTaskCounter + ENCODE_TASK_COUNTER_PRIM_REFS_OFFSET); + + const uint groupCapacity = BUILD_THREADGROUP_SIZE * 2; + FetchAndLocalSortAndWriteBack(localId, + groupId, + globalId, + numPrimitives, + groupCapacity, + BUILD_THREADGROUP_SIZE, + ShaderConstants.offsets.mortonCodes, + ShaderConstants.offsets.mortonCodesSorted, + ShaderConstants.offsets.primIndicesSorted, + Settings.useMortonCode30); +} + +//===================================================================================================================== +// Main Function : MergeSortGlobalIteration +//===================================================================================================================== +[RootSignature(RootSig)] +[numthreads(BUILD_THREADGROUP_SIZE, 1, 1)] +void MergeSortGlobalIteration( + uint globalId : SV_DispatchThreadID, + uint localId : SV_GroupThreadID, + uint groupId : SV_GroupID) +{ + const uint numPrimitives = FetchTaskCounter( + ShaderConstants.offsets.encodeTaskCounter + ENCODE_TASK_COUNTER_PRIM_REFS_OFFSET); + + const uint groupSize = BUILD_THREADGROUP_SIZE; + const uint groupCapacity = BUILD_THREADGROUP_SIZE * 2; + + const uint offsetKeysOutput = ShaderConstants.offsets.mortonCodesSorted; + const uint offsetValsOutput = ShaderConstants.offsets.primIndicesSorted; + const uint offsetKeysInput = ShaderConstants.offsets.mortonCodes; + const uint offsetValsInput = ShaderConstants.offsets.primIndicesSortedSwap; + + // Level 0 is the local sort and always copies the sorted partitions into output buffers. The first iteration of global merge + // phase (i.e. Level 1) always copies from output buffers to swap buffers and then continues to ping-pong between these buffers + // at each iteration. 
+ // + + // TODO: Fetch from root constants + const uint level = ShaderRootConstants.PassIndex(); + + // Odd levels copy from output buffers to swap buffers, while even levels copy from swap buffers to output + // buffers. + const uint srcOffsetKey = ((level & 1) == 1) ? offsetKeysOutput : offsetKeysInput; + const uint srcOffsetVal = ((level & 1) == 1) ? offsetValsOutput : offsetValsInput; + + const uint dstOffsetKey = ((level & 1) == 0) ? offsetKeysOutput : offsetKeysInput; + const uint dstOffsetVal = ((level & 1) == 0) ? offsetValsOutput : offsetValsInput; + + const uint cmpGap = 1u << (level - 1); + const uint splitGap = 1u << level; + + GlobalMergeIteration(groupId, + localId, + globalId, + groupSize, + groupCapacity, + cmpGap, + splitGap, + numPrimitives, + srcOffsetKey, + srcOffsetVal, + dstOffsetKey, + dstOffsetVal, + Settings.useMortonCode30); +} + +//===================================================================================================================== +// Main Function : MergeSortCopyLastLevel +//===================================================================================================================== +[RootSignature(RootSig)] +[numthreads(BUILD_THREADGROUP_SIZE, 1, 1)] +void MergeSortCopyLastLevel( + uint globalId : SV_DispatchThreadID, + uint localId : SV_GroupThreadID, + uint groupId : SV_GroupID) +{ + const uint numPrimitives = FetchTaskCounter( + ShaderConstants.offsets.encodeTaskCounter + ENCODE_TASK_COUNTER_PRIM_REFS_OFFSET); + + const uint offsetKeysOutput = ShaderConstants.offsets.mortonCodesSorted; + const uint offsetValsOutput = ShaderConstants.offsets.primIndicesSorted; + const uint offsetKeysInput = ShaderConstants.offsets.mortonCodes; + const uint offsetValsInput = ShaderConstants.offsets.primIndicesSortedSwap; + + if (globalId < numPrimitives) + { + if (Settings.useMortonCode30) + { + const uint mortonCode = FetchMortonCode(offsetKeysInput, globalId); + WriteMortonCode(offsetKeysOutput, globalId, mortonCode); + } + 
else + { + const uint64_t mortonCode = FetchMortonCode64(offsetKeysInput, globalId); + WriteMortonCode64(offsetKeysOutput, globalId, mortonCode); + } + + const uint index = FetchSortedPrimIndex(offsetValsInput, globalId); + WriteSortedPrimIndex(offsetValsOutput, globalId, index); + } +} #endif diff --git a/src/shaders/PairCompression.hlsl b/src/shaders/PairCompression.hlsl index 799f3b0..91aac60 100644 --- a/src/shaders/PairCompression.hlsl +++ b/src/shaders/PairCompression.hlsl @@ -142,24 +142,18 @@ void WriteCompressedNodes( const uint packedGeometryInfoData = DstBuffer.Load(geometryInfoOffset + GEOMETRY_INFO_FLAGS_AND_NUM_PRIMS_OFFSET); const uint geometryFlags = ExtractGeometryInfoFlags(packedGeometryInfoData); - uint triangleId = WriteTriangleIdField(0, - NODE_TYPE_TRIANGLE_0, - GetQuadScratchNodeVertexOffset(quad.scratchNodeIndexAndOffset[0]), - geometryFlags); + uint quadSwizzle = GetQuadScratchNodeVertexOffset(quad.scratchNodeIndexAndOffset[0]); // If this quad has another triangle, update triangle ID for the pair and update referenced scratch // triangle node if (quad.scratchNodeIndexAndOffset[1] != INVALID_IDX) { - triangleId = WriteTriangleIdField(triangleId, - NODE_TYPE_TRIANGLE_1, - GetQuadScratchNodeVertexOffset(quad.scratchNodeIndexAndOffset[1]), - geometryFlags); + quadSwizzle |= GetQuadScratchNodeVertexOffset(quad.scratchNodeIndexAndOffset[1]) << 4; const uint scratchNodeOffset = CalcScratchNodeOffset(scratchNodesScratchOffset, keptIndex); // Update triangle ID field in scratch node - const uint packedFlags = (triangleNode.packedFlags & 0x0000ffff) | (triangleId << 16); + const uint packedFlags = (triangleNode.packedFlags & 0x0000ffff) | (quadSwizzle << 16); WriteScratchNodeDataAtOffset(scratchNodeOffset, SCRATCH_NODE_FLAGS_OFFSET, packedFlags); // Repurpose the node pointer for saving the index of the other node in the pair. 
diff --git a/src/shaders/RayQuery.hlsl b/src/shaders/RayQuery.hlsl index 49e0564..ea80508 100644 --- a/src/shaders/RayQuery.hlsl +++ b/src/shaders/RayQuery.hlsl @@ -108,7 +108,7 @@ static bool RayQueryProceedCommon( if (rayQuery.committed.currNodePtr != INVALID_NODE) { - uint instNodeIndex = FetchInstanceIdx(rtIpLevel, GetRayQueryTopBvhAddress(rayQuery), rayQuery.lastInstanceNode); + uint instNodeIndex = FetchInstanceIdx(rtIpLevel, GetRayQueryTopBvhAddress(rayQuery), rayQuery.committed.instNodePtr); WriteRayHistoryTokenEnd( rayId, diff --git a/src/shaders/TaskQueueCounter.hlsl b/src/shaders/TaskQueueCounter.hlsl index 84aa2e5..3e2c5dd 100644 --- a/src/shaders/TaskQueueCounter.hlsl +++ b/src/shaders/TaskQueueCounter.hlsl @@ -22,6 +22,13 @@ * SOFTWARE. * **********************************************************************************************************************/ +#define STATE_TASK_QUEUE_PHASE_OFFSET 0 +#define STATE_TASK_QUEUE_START_PHASE_INDEX_OFFSET 4 +#define STATE_TASK_QUEUE_END_PHASE_INDEX_OFFSET 8 +#define STATE_TASK_QUEUE_TASK_COUNTER_OFFSET 12 +#define STATE_TASK_QUEUE_NUM_TASKS_DONE_OFFSET 16 + +//===================================================================================================================== void AllocTasks(const uint numTasks, const uint phase, uint taskQueueOffset) { // start = end diff --git a/src/shaders/TriangleSplitting.hlsl b/src/shaders/TriangleSplitting.hlsl index 6860380..97cecee 100644 --- a/src/shaders/TriangleSplitting.hlsl +++ b/src/shaders/TriangleSplitting.hlsl @@ -22,6 +22,38 @@ * SOFTWARE. 
* **********************************************************************************************************************/ +#define TS_PHASE_INIT 0 +#define TS_PHASE_CALC_SUM 1 +#define TS_PHASE_ALLOC_REFS 2 +#define TS_PHASE_SPLIT 3 +#define TS_PHASE_DONE 4 + +struct ScratchTSRef +{ + uint leafIndex; + uint numSplits; + + uint splitLeafBaseIndex; + + BoundingBox bbox; +}; + +struct ScratchTSState +{ + uint refListIndex; + uint numRefs; + uint numRefsAlloc; + float sum; + uint mutex; +}; + +#define STATE_TS_REF_LIST_INDEX_OFFSET 0 +#define STATE_TS_NUM_REFS_OFFSET STATE_TS_REF_LIST_INDEX_OFFSET + 4 +#define STATE_TS_NUM_REFS_ALLOC_OFFSET STATE_TS_NUM_REFS_OFFSET + 4 +#define STATE_TS_SUM_OFFSET STATE_TS_NUM_REFS_ALLOC_OFFSET + 4 +#define STATE_TS_MUTEX_OFFSET STATE_TS_SUM_OFFSET + 4 + +//===================================================================================================================== // 32 bit constants struct TriangleSplittingArgs { diff --git a/src/shaders/Update.hlsl b/src/shaders/Update.hlsl index 035ad4b..3818053 100644 --- a/src/shaders/Update.hlsl +++ b/src/shaders/Update.hlsl @@ -133,8 +133,8 @@ void Update( const uint numGroups = ShaderRootConstants.numThreads / BUILD_THREADGROUP_SIZE; - ClearUpdateFlags(globalId); BEGIN_TASK(numGroups); + ClearUpdateFlags(globalId); EncodePrimitives(globalId, GEOMETRY_TYPE_TRIANGLES); END_TASK(numGroups); diff --git a/src/shadersClean/common/Bits.hlsli b/src/shadersClean/common/Bits.hlsli new file mode 100644 index 0000000..857d03e --- /dev/null +++ b/src/shadersClean/common/Bits.hlsli @@ -0,0 +1,166 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#ifndef BITS_HLSLI +#define BITS_HLSLI + +//===================================================================================================================== +static uint LowPart(uint64_t value) +{ + return uint(value); +} + +//===================================================================================================================== +static uint HighPart(uint64_t value) +{ + return uint(value >> 32); +} + +//===================================================================================================================== +// Helper function for producing a 32 bit mask of one bit +inline uint32_t bit(uint32_t index) +{ + return 1u << index; +} + +//===================================================================================================================== +// Helper function for producing a 16 bit mask of one bit +inline uint16_t bit16(uint16_t index) +{ + return uint16_t(1u << index); +} + +//===================================================================================================================== +// Helper function for producing a 64 bit mask of one bit +inline uint64_t bit64(uint32_t index) +{ + return 1ull << index; +} + +//===================================================================================================================== +// Helper function for generating a 32-bit bit mask +inline uint32_t bits(uint32_t bitcount) +{ + return (bitcount == 32) ? 0xFFFFFFFF : ((1u << bitcount) - 1); +} + +//===================================================================================================================== +// Helper function for generating a 16-bit bit mask +inline uint16_t bits16(uint16_t bitcount) +{ + return (bitcount == 16) ? 
uint16_t(0xFFFFu) : uint16_t((1u << bitcount) - 1);
+}
+
+//=====================================================================================================================
+// Helper function for generating a 64-bit bit mask
+inline uint64_t bits64(uint64_t bitcount)
+{
+    return (bitcount == 64) ? 0xFFFFFFFFFFFFFFFFull : ((1ull << bitcount) - 1ull);
+}
+
+//=====================================================================================================================
+// Helper function for inserting data into a src bitfield and returning the output
+static uint32_t bitFieldInsert(
+    in uint32_t src,
+    in uint32_t bitOffset,
+    in uint32_t numBits,
+    in uint32_t data)
+{
+    const uint32_t mask = bits(numBits);
+    src &= ~(mask << bitOffset);
+    return (src | ((data & mask) << bitOffset));
+}
+
+//=====================================================================================================================
+// Helper function for inserting data into a uint16_t src bitfield and returning the output
+static uint16_t bitFieldInsert16(
+    in uint16_t src,
+    in uint16_t bitOffset,
+    in uint16_t numBits,
+    in uint16_t data)
+{
+    const uint16_t mask = bits16(numBits);
+    src &= ~(mask << bitOffset);
+    return (src | ((data & mask) << bitOffset));
+}
+
+//=====================================================================================================================
+// Helper function for inserting data into a uint64_t src bitfield and returning the output
+static uint64_t bitFieldInsert64(
+    in uint64_t src,
+    in uint64_t bitOffset,
+    in uint64_t numBits,
+    in uint64_t data)
+{
+    const uint64_t mask = bits64(numBits);
+    src &= ~(mask << bitOffset);
+    return (src | ((data & mask) << bitOffset));
+}
+
+//=====================================================================================================================
+// Helper function for extracting data from a src bitfield
+static uint32_t bitFieldExtract(
+    in uint32_t src,
+    in uint32_t bitOffset,
+    
in uint32_t numBits) +{ + return (src >> bitOffset) & bits(numBits); +} + +//===================================================================================================================== +// Helper function for extracting data from a src bitfield +static uint16_t bitFieldExtract16( + in uint16_t src, + in uint16_t bitOffset, + in uint16_t numBits) +{ + return (src >> bitOffset) & bits16(numBits); +} + +//===================================================================================================================== +// Helper function for extracting data from a uint64_t src bitfield +static uint64_t bitFieldExtract64( + in uint64_t src, + in uint64_t bitOffset, + in uint64_t numBits) +{ + return (src >> bitOffset) & bits64(numBits); +} + +//===================================================================================================================== +static uint32_t Pow2Align( + uint32_t value, ///< Value to align. + uint32_t alignment) ///< Desired alignment (must be a power of 2). +{ + return ((value + alignment - 1) & ~(alignment - 1)); +} + +//===================================================================================================================== +inline uint countbits64(uint64_t val) +{ + return countbits(LowPart(val)) + countbits(HighPart(val)); +} + +#endif diff --git a/src/shadersClean/common/BoundingBox.hlsli b/src/shadersClean/common/BoundingBox.hlsli new file mode 100644 index 0000000..f47ad82 --- /dev/null +++ b/src/shadersClean/common/BoundingBox.hlsli @@ -0,0 +1,74 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#ifndef BOUNDING_BOX_HLSLI +#define BOUNDING_BOX_HLSLI + +//===================================================================================================================== +struct BoundingBox // matches D3D12_RAYTRACING_AABB +{ + float3 min; + float3 max; +}; + +//===================================================================================================================== +struct BoundingBox4 +{ + float4 min; + float4 max; +}; + +//===================================================================================================================== +// Internal bounding box type for scene bounds. 
+struct UintBoundingBox +{ + uint3 min; + uint3 max; +}; + +struct UintBoundingBox4 +{ + uint4 min; + uint4 max; +}; + +struct PackedUintBoundingBox4 +{ + uint64_t min; + uint64_t max; +}; + +//===================================================================================================================== +static BoundingBox CombineAABB( + BoundingBox b0, + BoundingBox b1) +{ + BoundingBox bbox; + bbox.min = min(b0.min, b1.min); + bbox.max = max(b0.max, b1.max); + return bbox; +} + +#endif diff --git a/src/shadersClean/common/Extensions.hlsli b/src/shadersClean/common/Extensions.hlsli index a3b60ea..b139505 100644 --- a/src/shadersClean/common/Extensions.hlsli +++ b/src/shadersClean/common/Extensions.hlsli @@ -25,12 +25,8 @@ #ifndef EXTENSIONS_HLSLI #define EXTENSIONS_HLSLI -#if !defined(__cplusplus) - #define __decl [noinline] -#endif - #define AmdExtD3DShaderIntrinsicsFloatOpWithRoundMode_TiesToEven 0x0 #define AmdExtD3DShaderIntrinsicsFloatOpWithRoundMode_TowardPositive 0x1 #define AmdExtD3DShaderIntrinsicsFloatOpWithRoundMode_TowardNegative 0x2 diff --git a/src/shadersClean/common/InstanceDesc.hlsli b/src/shadersClean/common/InstanceDesc.hlsli new file mode 100644 index 0000000..09f910c --- /dev/null +++ b/src/shadersClean/common/InstanceDesc.hlsli @@ -0,0 +1,51 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +#ifndef INSTANCE_DESC_HLSLI +#define INSTANCE_DESC_HLSLI + +#include "TempAssert.hlsli" + +//===================================================================================================================== +// 64-byte aligned structure matching D3D12_RAYTRACING_INSTANCE_DESC +struct InstanceDesc +{ + float4 Transform[3]; // Inverse transform for traversal + uint InstanceID_and_Mask; // 24-bit instance ID and 8-bit mask + uint InstanceContributionToHitGroupIndex_and_Flags; // 24-bit instance contribution and 8-bit flags + uint accelStructureAddressLo; // Lower part of acceleration structure base address + uint accelStructureAddressHiAndFlags; // Upper part of acceleration structure base address and + // HW raytracing IP 2.0 flags +}; + +#define INSTANCE_DESC_SIZE 64 +#define INSTANCE_DESC_WORLD_TO_OBJECT_XFORM_OFFSET 0 +#define INSTANCE_DESC_ID_AND_MASK_OFFSET 48 +#define INSTANCE_DESC_CONTRIBUTION_AND_FLAGS_OFFSET 52 +#define INSTANCE_DESC_VA_LO_OFFSET 56 +#define INSTANCE_DESC_VA_HI_OFFSET 60 + +GPURT_STATIC_ASSERT(INSTANCE_DESC_SIZE == sizeof(InstanceDesc), "InstanceDesc structure mismatch"); + +#endif diff --git a/src/shadersClean/common/Math.hlsl b/src/shadersClean/common/Math.hlsl index f8aa8ed..ca0f384 100644 --- a/src/shadersClean/common/Math.hlsl +++ b/src/shadersClean/common/Math.hlsl @@ -23,3 +23,6 @@ * **********************************************************************************************************************/ #include "Math.hlsli" + +#include "Bits.hlsli" + diff --git a/src/shadersClean/common/Math.hlsli b/src/shadersClean/common/Math.hlsli index 981b9b5..ab35f66 100644 --- a/src/shadersClean/common/Math.hlsli +++ b/src/shadersClean/common/Math.hlsli @@ -25,145 +25,33 @@ #ifndef MATH_HLSLI #define MATH_HLSLI -#include "ShaderDefs.hlsli" - #include "Extensions.hlsli" 
//===================================================================================================================== -static uint LowPart(GpuVirtualAddress addr) -{ - return uint(addr); -} - -//===================================================================================================================== -static uint HighPart(GpuVirtualAddress addr) -{ - return uint(addr >> 32); -} - -//===================================================================================================================== -// Helper function for producing a 32 bit mask of one bit -inline uint32_t bit(uint32_t index) -{ - return 1u << index; -} - -//===================================================================================================================== -// Helper function for producing a 16 bit mask of one bit -inline uint16_t bit16(uint16_t index) -{ - return uint16_t(1u << index); -} - -//===================================================================================================================== -// Helper function for producing a 64 bit mask of one bit -inline uint64_t bit64(uint32_t index) -{ - return 1ull << index; -} - -//===================================================================================================================== -// Helper function for generating a 32-bit bit mask -inline uint32_t bits(uint32_t bitcount) -{ - return (bitcount == 32) ? 0xFFFFFFFF : ((1u << bitcount) - 1); -} - -//===================================================================================================================== -// Helper function for generating a 16-bit bit mask -inline uint16_t bits16(uint16_t bitcount) -{ - return (bitcount == 16) ? uint16_t(0xFFFFu) : uint16_t((1u << bitcount) - 1); -} - -//===================================================================================================================== -// Helper function for generating a 32-bit bit mask -inline uint64_t bits64(uint64_t bitcount) -{ - return (bitcount == 64) ? 
0xFFFFFFFFFFFFFFFFull : ((1ull << bitcount) - 1ull); -} - -//===================================================================================================================== -// Helper function for inserting data into a src bitfield and returning the output -static uint32_t bitFieldInsert( - in uint32_t src, - in uint32_t bitOffset, - in uint32_t numBits, - in uint32_t data) -{ - const uint32_t mask = bits(numBits); - src &= ~(mask << bitOffset); - return (src | ((data & mask) << bitOffset)); -} - -//===================================================================================================================== -// Helper function for inserting data into a uint16_t src bitfield and returning the output -static uint16_t bitFieldInsert16( - in uint16_t src, - in uint16_t bitOffset, - in uint16_t numBits, - in uint16_t data) -{ - const uint16_t mask = bits16(numBits); - src &= ~(mask << bitOffset); - return (src | ((data & mask) << bitOffset)); -} - -//===================================================================================================================== -// Helper function for inserting data into a uint64_t src bitfield and returning the output -static uint64_t bitFieldInsert64( - in uint64_t src, - in uint64_t bitOffset, - in uint64_t numBits, - in uint64_t data) -{ - const uint64_t mask = bits64(numBits); - src &= ~(mask << bitOffset); - return (src | ((data & mask) << bitOffset)); -} - -//===================================================================================================================== -// Helper function for extracting data from a src bitfield -static uint32_t bitFieldExtract( - in uint32_t src, - in uint32_t bitOffset, - in uint32_t numBits) -{ - return (src >> bitOffset) & bits(numBits); -} - -//===================================================================================================================== -// Helper function for extracting data from a src bitfield -static uint16_t bitFieldExtract16( - in uint16_t 
src, - in uint16_t bitOffset, - in uint16_t numBits) -{ - return (src >> bitOffset) & bits16(numBits); -} - -//===================================================================================================================== -// Helper function for extracting data from a uint64_t src bitfield -static uint64_t bitFieldExtract64( - in uint64_t src, - in uint64_t bitOffset, - in uint64_t numBits) +// Divide uints and round up +static uint RoundUpQuotient( + uint dividend, + uint divisor) { - return (src >> bitOffset) & bits64(numBits); + return (dividend + divisor - 1) / divisor; } //===================================================================================================================== -static uint32_t Pow2Align( - uint32_t value, ///< Value to align. - uint32_t alignment) ///< Desired alignment (must be a power of 2). +// Divide ints and round up +static int RoundUpQuotient( + int dividend, + int divisor) { - return ((value + alignment - 1) & ~(alignment - 1)); + return (dividend + divisor - 1) / divisor; } //===================================================================================================================== -inline uint countbits64(uint64_t val) +// Divide ints and round up +static uint64_t RoundUpQuotient( + uint64_t dividend, + uint64_t divisor) { - return countbits(LowPart(val)) + countbits(HighPart(val)); + return (dividend + divisor - 1) / divisor; } //===================================================================================================================== diff --git a/src/shadersClean/common/NodePointers.hlsli b/src/shadersClean/common/NodePointers.hlsli new file mode 100644 index 0000000..46e6fa3 --- /dev/null +++ b/src/shadersClean/common/NodePointers.hlsli @@ -0,0 +1,82 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#ifndef NODE_POINTERS_HLSLI +#define NODE_POINTERS_HLSLI + +#include "../common/TempAssert.hlsli" + +//===================================================================================================================== +// Node pointer size in bytes +#define NODE_PTR_SIZE 4 +GPURT_STATIC_ASSERT(NODE_PTR_SIZE == sizeof(uint32_t), "NODE_PTR_SIZE size mismatch"); + +//===================================================================================================================== +// Instance base pointer layout from the HW raytracing IP 2.0 spec: +// Zero [ 2: 0] +// Tree Base Address (64B index)[53: 3] +// Force Opaque [ 54] +// Force Non-Opaque [ 55] +// Disable Triangle Cull [ 56] +// Flip Facedness [ 57] +// Cull Back Facing Triangles [ 58] +// Cull Front Facing Triangles [ 59] +// Cull Opaque [ 60] +// Cull Non-Opaque [ 61] +// Skip Triangles [ 62] +// Skip Procedural [ 63] +// +// Since GPU VAs can only be 48 bits, only 42 bits of the Tree Base Address field are used: +// Used Address [44: 3] +// Unused Address [53:45] +// +#define INSTANCE_BASE_POINTER_ZERO_MASK 0x7ull +#define INSTANCE_BASE_POINTER_ADDRESS_USED_MASK 0x1FFFFFFFFFF8ull +#define INSTANCE_BASE_POINTER_ADDRESS_UNUSED_MASK 0x3FE00000000000ull +#define INSTANCE_BASE_POINTER_ADDRESS_MASK 0x3FFFFFFFFFFFF8ull +#define INSTANCE_BASE_POINTER_FLAGS_MASK 0xFFC0000000000000ull + +#define NODE_POINTER_FLAGS_SHIFT 54 +#define NODE_POINTER_FORCE_OPAQUE_SHIFT 54 +#define NODE_POINTER_FORCE_NON_OPAQUE_SHIFT 55 +#define NODE_POINTER_DISABLE_TRIANGLE_CULL_SHIFT 56 +#define NODE_POINTER_FLIP_FACEDNESS_SHIFT 57 +#define NODE_POINTER_CULL_BACK_FACING_SHIFT 58 +#define NODE_POINTER_CULL_FRONT_FACING_SHIFT 59 +#define NODE_POINTER_CULL_OPAQUE_SHIFT 60 +#define NODE_POINTER_CULL_NON_OPAQUE_SHIFT 61 +#define NODE_POINTER_SKIP_TRIANGLES_SHIFT 62 +#define NODE_POINTER_SKIP_PROCEDURAL_SHIFT 
63 + +#define RAY_FLAG_VALID_MASK 0x3ffu +#define RAY_FLAG_EXCLUDE_MASK (RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH | RAY_FLAG_SKIP_CLOSEST_HIT_SHADER) +#define RAY_FLAG_OVERRIDE_MASK (RAY_FLAG_FORCE_OPAQUE | RAY_FLAG_FORCE_NON_OPAQUE) // 0x3 +#define RAY_FLAG_PRESERVE_MASK (RAY_FLAG_VALID_MASK & (~RAY_FLAG_OVERRIDE_MASK)) // 0x3fc + +#define POINTER_FLAGS_HIDWORD_SHIFT (NODE_POINTER_FORCE_OPAQUE_SHIFT - 32) // 22 +#define POINTER_FLAGS_VALID_MASK (RAY_FLAG_VALID_MASK << POINTER_FLAGS_HIDWORD_SHIFT) // 0x3ff << 22 +#define POINTER_FLAGS_EXCLUDED_MASK ~(POINTER_FLAGS_VALID_MASK) // 0xFFC00000 + +#endif diff --git a/src/shared/scratchNode.h b/src/shadersClean/common/ScratchNode.hlsli similarity index 97% rename from src/shared/scratchNode.h rename to src/shadersClean/common/ScratchNode.hlsli index 1ff9b95..db7c67d 100644 --- a/src/shared/scratchNode.h +++ b/src/shadersClean/common/ScratchNode.hlsli @@ -1,7 +1,7 @@ /* *********************************************************************************************************************** * - * Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All Rights Reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -22,10 +22,11 @@ * SOFTWARE. 
* **********************************************************************************************************************/ + #ifndef _SCRATCHNODE_HLSL #define _SCRATCHNODE_HLSL -#include "rayTracingDefs.h" +#include "gfx10/BoxNode1_0.hlsli" //===================================================================================================================== // The structure is 64-byte aligned @@ -49,7 +50,7 @@ struct ScratchNode // scratch node index of the tri in the pair in PAIR_TRIANGLE_COMPRESSION / // BLAS metadata size for instance nodes uint sortedPrimIndex; // it's the index of the sorted primitive (leaf) or start index of the sorted primitives - uint packedFlags; // flags [0:7], instanceMask [8:15], triangleId [16:31] + uint packedFlags; // flags [0:7], instanceMask [8:15], quadSwizzle [16:23] }; #define SCRATCH_NODE_FLAGS_DISABLE_TRIANGLE_SPLIT_SHIFT 31 @@ -176,11 +177,11 @@ static uint ExtractScratchNodeInstanceMask( } //===================================================================================================================== -// Extract triangle ID from scratch node -static uint ExtractScratchNodeTriangleId( +// Extract quad swizzle from scratch node +static uint ExtractScratchNodeQuadSwizzle( in uint packedFlags) { - return (packedFlags >> 16); + return (packedFlags >> 16) & 0xFF; } //===================================================================================================================== diff --git a/src/shadersClean/common/ShaderDefs.hlsli b/src/shadersClean/common/ShaderDefs.hlsli index 5d254e6..3ca709b 100644 --- a/src/shadersClean/common/ShaderDefs.hlsli +++ b/src/shadersClean/common/ShaderDefs.hlsli @@ -25,8 +25,459 @@ #ifndef SHADERDEFS_HLSLI #define SHADERDEFS_HLSLI +// These DUMMY_*_FUNC postfix stubs must be included at the end of every driver stub (AmdTraceRay*) declaration to +// work around a DXC + Spirv issue where the compiler can't deal with calls to functions that don't have bodies. 
+#define DUMMY_BOOL_FUNC { return false; } +#define DUMMY_VOID_FUNC { } +#define DUMMY_UINT_FUNC { return 0; } +#define DUMMY_UINT2_FUNC { return uint2(0, 0); } +#define DUMMY_UINT3_FUNC { return uint3(0, 0, 0); } +#define DUMMY_UINT4_FUNC { return uint4(0, 0, 0, 0); } +#define DUMMY_FLOAT_FUNC { return 0; } +#define DUMMY_FLOAT2_FUNC { return float2(0, 0); } +#define DUMMY_FLOAT3_FUNC { return float3(0, 0, 0); } + +#include "TempAssert.hlsli" + +// TODO: there are functions that use values from these files, but really +// those functions should be in these files, and then the files that use the functions +// should include that file, instead of ShaderDefs.h +#include "gfx10/BoxNode1_0.hlsli" +#include "gfx10/TriangleNode1_0.hlsli" +#include "gfx10/ProceduralNode1_0.hlsli" +#include "gfx10/BoxNode1_0.hlsli" +#include "gfx10/InstanceNode1_0.hlsli" +#include "NodePointers.hlsli" + +#define SAH_COST_TRIANGLE_INTERSECTION 1.5 +#define SAH_COST_AABBB_INTERSECTION 1 + typedef uint64_t GpuVirtualAddress; +//===================================================================================================================== +enum PrimitiveType : uint +{ + Triangle = 0, + AABB = 1, + Instance = 2, +}; + +//===================================================================================================================== +// BVH node types shared between HW and SW nodes +#define NODE_TYPE_TRIANGLE_0 0 +#define NODE_TYPE_TRIANGLE_1 1 +#define NODE_TYPE_TRIANGLE_2 2 +#define NODE_TYPE_TRIANGLE_3 3 +#define NODE_TYPE_BOX_FLOAT16 4 +#define NODE_TYPE_BOX_FLOAT32 5 +#define NODE_TYPE_USER_NODE_INSTANCE 6 +// From the HW IP 2.0 spec: '7: User Node 1 (processed as a Procedural Node for culling)' +#define NODE_TYPE_USER_NODE_PROCEDURAL 7 + +//===================================================================================================================== +// Acceleration structure type +#define TOP_LEVEL 0 +#define BOTTOM_LEVEL 1 + 
+//===================================================================================================================== +// Triangle Compression Modes +#define NO_TRIANGLE_COMPRESSION 0 +#define RESERVED 1 +#define PAIR_TRIANGLE_COMPRESSION 2 +#define AUTO_TRIANGLE_COMPRESSION 3 + +#define LATE_PAIR_COMP_BATCH_SIZE 8 + +//===================================================================================================================== +// Amount of ULPs(Unit in Last Place) added to Box node when using hardware intersection instruction +#define BOX_EXPANSION_DEFAULT_AMOUNT 6 + +//===================================================================================================================== +// Box sorting heuristic value +// 0: closethit +// 1: LargestFirst +// 2: ClosestMidpoint +// 3: undefined / disabled +// 4: LargestFirstOrClosest (auto select with rayFlag) +// 5: BoxSortLargestFirstOrClosestMidPoint (auto select with rayFlag) +// 6: DisabledOnAcceptFirstHit (disable if bvhNode sort is on, and rayFlag is AcceptFirstHit) +// +// This need to match ILC_BOX_SORT_HEURISTIC_MODE +enum BoxSortHeuristic : uint +{ + Closest = 0x0, + Largest = 0x1, + MidPoint = 0x2, + Disabled = 0x3, + LargestFirstOrClosest = 0x4, + LargestFirstOrClosestMidPoint = 0x5, + DisabledOnAcceptFirstHit = 0x6, +}; + +//===================================================================================================================== +// Options for where FP16 box nodes are created within BLAS for QBVH +#define NO_NODES_IN_BLAS_AS_FP16 0 +#define LEAF_NODES_IN_BLAS_AS_FP16 1 +#define MIXED_NODES_IN_BLAS_AS_FP16 2 +#define ALL_INTERIOR_NODES_IN_BLAS_AS_FP16 3 + +// The highest 3 bits are zero after the right shift in PackNodePointer and may be repurposed. 
+// Mask for MSB within node pointer +#define NODE_POINTER_MASK_MSB 0x80000000u + +//===================================================================================================================== +#define BVH4_NODE_32_STRIDE_SHIFT 7 // Box 32 node +#define BVH4_NODE_16_STRIDE_SHIFT 6 // Box 16 node + +#define INVALID_IDX 0xffffffff +#define INACTIVE_PRIM 0xfffffffe + +static const uint ByteStrideScratchNode = 64; +static const uint ByteStrideU32 = 12; +static const uint IndexFormatInvalid = 0; +static const uint IndexFormatU32 = 1; +static const uint IndexFormatU16 = 2; + +const static uint TILE_WIDTH = 256; +const static uint TILE_SIZE = TILE_WIDTH * TILE_WIDTH; + +#ifndef BUILD_THREADGROUP_SIZE +#define BUILD_THREADGROUP_SIZE 64 +#endif + +//====================================================================================================================== +// matches VkAccelerationStructureBuildRangeInfoKHR +struct IndirectBuildOffset +{ + uint primitiveCount; + uint primitiveOffset; + uint firstVertex; + uint transformOffset; +}; + +//===================================================================================================================== +// Function assumes the type passed in is a valid node type +// +static uint PackNodePointer(uint type, uint address) +{ + uint nodePointer = type; // this assumes that the type is valid + // uint pointer = type & 0x7; + + // The input address is a byte offset, and node_addr is a 64-byte offset that starts at bit 3. 
+ nodePointer |= (address >> 3); // this assumes that the input address is 64-byte aligned + // pointer |= (address >> 6) << 3; + + return nodePointer; +} + +//===================================================================================================================== +static uint GetNodeType(uint nodePointer) +{ + // From the HW raytracing spec: + // node_type = node_pointer[ 2:0] + return nodePointer & 0x7; +} + +//===================================================================================================================== +static uint ClearNodeType(uint nodePointer) +{ + return nodePointer & ~0x7; +} + +//===================================================================================================================== +// NOTE: The highest 3 bits are excluded. They aren't written when building the QBVH and may have been repurposed. See +// NODE_POINTER_MASK_MSB +static uint ExtractNodePointerOffset(uint nodePointer) +{ + // From the HW raytracing spec: + // node_addr[60:0] = node_pointer[63:3] + // Also, based on the following, the node_addr is 64-byte aligned: + // fetch_addr0 = T#.base_address*256+node_addr*64 + return ClearNodeType(nodePointer) << 3; +} + +//===================================================================================================================== +// Removes temp flag (MSB) within node type set by RefitBounds when fp16 nodes mode is LEAF_NODES_IN_BLAS_AS_FP16. 
+static uint GetNodePointerExclMsbFlag(uint nodePointer)
+{
+    return nodePointer & (~NODE_POINTER_MASK_MSB);
+}
+
+//=====================================================================================================================
+// Primitive data structure that includes the unpacked data needed to process a primitive
+struct PrimitiveData
+{
+    uint primitiveIndex; // Primitive index used to indicate what primitive in geometry description
+    uint geometryIndex; // Geometry index used to indicate what geometry description
+    uint geometryFlags; // Geometry flags contains if the geometry is opaque or non opaque
+};
+
+//=====================================================================================================================
+// Extract the geometry index from the bottom 24 bits
+static uint ExtractGeometryIndex(uint geometryIndexAndFlags)
+{
+    return geometryIndexAndFlags & 0xFFFFFF;
+}
+
+//=====================================================================================================================
+// Extract the geometry flags from bits 24-25
+static uint ExtractGeometryFlags(uint geometryIndexAndFlags)
+{
+    return (geometryIndexAndFlags >> 24) & 0x3;
+}
+
+//=====================================================================================================================
+// Extract the geometry index from the bottom 24 bits and geometry flags from bits 24-25
+static uint2 UnpackGeometryIndexAndFlags(uint geometryIndexAndFlags)
+{
+    return uint2(ExtractGeometryIndex(geometryIndexAndFlags), ExtractGeometryFlags(geometryIndexAndFlags));
+}
+
+//=====================================================================================================================
+// Pack the geometry index in the bottom 24 bits and the geometry flags into bits 24-25
+static uint PackGeometryIndexAndFlags(
+    uint geometryIndex,
+    uint geometryFlags)
+{
+    return (geometryFlags << 24) | (geometryIndex & 0xFFFFFF);
+}
+
+//===================================================================================================================== +// Additional geometry information for bottom level acceleration structures primitives +struct GeometryInfo +{ + uint geometryFlagsAndNumPrimitives; + uint geometryBufferOffset; + uint primNodePtrsOffset; // Offset from the base of all prim node ptrs to this geometry's prim node ptrs +}; + +#define DXGI_FORMAT_UNKNOWN 0 +#define DXGI_FORMAT_R32G32B32_FLOAT 6 + +#define DECODE_VERTEX_STRIDE 12 +#define DECODE_PRIMITIVE_STRIDE_TRIANGLE 36 +#define DECODE_PRIMITIVE_STRIDE_AABB 24 +#define GEOMETRY_INFO_SIZE 12 +#define GEOMETRY_INFO_FLAGS_AND_NUM_PRIMS_OFFSET 0 +#define GEOMETRY_INFO_GEOM_BUFFER_OFFSET 4 +#define GEOMETRY_INFO_PRIM_NODE_PTRS_OFFSET 8 + +#define PIPELINE_FLAG_SKIP_TRIANGLES 0x100 +#define PIPELINE_FLAG_SKIP_PROCEDURAL_PRIMITIVES 0x200 + +GPURT_STATIC_ASSERT(GEOMETRY_INFO_SIZE == sizeof(GeometryInfo), "Geometry info structure mismatch"); + +//===================================================================================================================== +static uint ExtractGeometryInfoFlags(uint packedGeometryFlagsAndNumPrimitives) +{ + return (packedGeometryFlagsAndNumPrimitives >> 29); +} + +//===================================================================================================================== +static uint ExtractGeometryInfoNumPrimitives(uint packedGeometryFlagsAndNumPrimitives) +{ + // ((1 << 29) - 1) = 0x1fffffff + return (packedGeometryFlagsAndNumPrimitives & 0x1FFFFFFF); +} + +//===================================================================================================================== +static uint PackGeometryFlagsAndNumPrimitives(uint geometryFlags, uint numPrimitives) +{ + return (geometryFlags << 29) | numPrimitives; +} + +//===================================================================================================================== +static uint64_t PackUint64(uint lowBits, uint 
highBits) +{ + // Note glslang doesn't like uint64_t casts + uint64_t addr = highBits; + addr = (addr << 32) | lowBits; + return addr; +} + +//====================================================================================================================== +// Packs the channels of a uint2 into a single uint64_t. +static uint64_t PackUint64(uint2 lowHigh) +{ + // Note glslang doesn't like uint64_t casts + uint64_t addr = lowHigh.y; + addr = (addr << 32) | lowHigh.x; + return addr; +} + +//===================================================================================================================== +static uint2 SplitUint64(uint64_t x) +{ + return uint2(x, (x >> 32)); +} + +//===================================================================================================================== +// Build Stage Counters (Debug only) +// It starts with the qbvhGlobalCounters offset, i.e., +// qbvhGlobalStack...qbvhGlobalStackPtrs...bvhBuildDebugCounters + +#define COUNTER_MORTONGEN_OFFSET 0x0 +#define COUNTER_MORTON_SORT_OFFSET 0x4 +#define COUNTER_SORTLEAF_OFFSET 0x8 +#define COUNTER_BUILDPLOC_OFFSET 0xC +#define COUNTER_BUILDLBVH_OFFSET 0x10 +#define COUNTER_REFIT_OFFSET 0x14 +#define COUNTER_INITENCODEHWBVH_OFFSET 0x18 +#define COUNTER_ENCODEHWBVH_OFFSET 0x1C +#define COUNTER_EMPTYPRIM_OFFSET 0x20 +#define COUNTER_EMITCOMPACTSIZE_OFFSET 0x24 +#define COUNTER_BUILDFASTLBVH_OFFSET 0x28 + +//===================================================================================================================== +// Get leaf triangle node size in bytes +static uint GetBvhNodeSizeTriangle() +{ + return TRIANGLE_NODE_SIZE; +} + +//===================================================================================================================== +// Get leaf AABB node size in bytes +static uint GetBvhNodeSizeProcedural() +{ + return USER_NODE_PROCEDURAL_SIZE; +} + 
+//===================================================================================================================== +// Get leaf instance node size in bytes +static uint GetBvhNodeSizeInstance(uint enableFusedInstanceNode) +{ + return (enableFusedInstanceNode == 0) ? INSTANCE_NODE_SIZE : FUSED_INSTANCE_NODE_SIZE; +} + +//===================================================================================================================== +// Get internal BVH node size in bytes +static uint GetBvhNodeSizeInternal() +{ + return FLOAT32_BOX_NODE_SIZE; +} + +//===================================================================================================================== +// Get internal BVH node size in bytes +static uint GetBvhNodeSizeLeaf( + uint primitiveType, + uint enableFusedInstanceNode) +{ + uint sizeInBytes = 0; + switch (primitiveType) + { + case PrimitiveType::Triangle: + sizeInBytes = GetBvhNodeSizeTriangle(); + break; + case PrimitiveType::AABB: + sizeInBytes = GetBvhNodeSizeProcedural(); + break; + case PrimitiveType::Instance: + sizeInBytes = GetBvhNodeSizeInstance(enableFusedInstanceNode); + break; + } + + return sizeInBytes; +} + +//===================================================================================================================== +static uint CalcParentPtrOffset(uint nodePtr) +{ + // Subtract 1 from the index to account for negative offset calculations. I.e. 
index 0 is actually at -4 byte + // offset from the end of the parent pointer memory + const uint linkIndex = (nodePtr >> 3) - 1; + return linkIndex * NODE_PTR_SIZE; +} + +//===================================================================================================================== +static uint CalcBottomGeometryInfoSize(uint numGeometries) +{ + return numGeometries * GEOMETRY_INFO_SIZE; +} + +//===================================================================================================================== +struct DataOffsetAndSize +{ + uint offset; + uint size; +}; + +//===================================================================================================================== +struct StateTaskQueueCounter +{ + uint phase; + uint startPhaseIndex; + uint endPhaseIndex; + uint taskCounter; + uint numTasksDone; +}; + +#define USE_BLAS_PRIM_COUNT 0 + +//===================================================================================================================== +struct Flags +{ + uint dataValid; + uint prefixSum; +}; + +#define FLAGS_DATA_VALID_OFFSET 0 +#define FLAGS_PREFIX_SUM_OFFSET 4 + +#define DLB_KEYS_PER_THREAD 4 +#define DLB_KEYS_PER_GROUP (BUILD_THREADGROUP_SIZE * DLB_KEYS_PER_THREAD) + +#define DLB_VALID_SUM 0 +#define DLB_VALID_PREFIX_SUM 1 +#define NUM_DLB_VALID_TYPES 2 + +//===================================================================================================================== + +#define PLOC_PHASE_INIT 0 +#define PLOC_PHASE_FIND_NEAREST_NEIGHBOUR 1 +#define PLOC_PHASE_UPDATE_CLUSTER_COUNT 2 +#define PLOC_PHASE_DONE 3 +struct StatePLOC +{ + uint numClusters; + uint internalNodesIndex; + uint clusterListIndex; + uint numClustersAlloc; +}; + +#define STATE_PLOC_NUM_CLUSTERS_OFFSET 0 +#define STATE_PLOC_INTERNAL_NODES_INDEX_OFFSET 4 +#define STATE_PLOC_CLUSTER_LIST_INDEX_OFFSET 8 +#define STATE_PLOC_NUM_CLUSTERS_ALLOC_OFFSET 12 + 
+//===================================================================================================================== +struct IndexBufferInfo +{ + uint gpuVaLo; + uint gpuVaHi; + uint byteOffset; + uint format; +}; + +//===================================================================================================================== +enum RebraidType : uint +{ + Off = 0, // No Rebraid + V1 = 1, // First version of Rebraid + V2 = 2, // Second version of Rebraid +}; + +#define BUILD_MODE_LINEAR 0 +// BUILD_MODE_AC was 1, but it has been removed. +#define BUILD_MODE_PLOC 2 + +//===================================================================================================================== +struct TriangleData +{ + float3 v0; ///< Vertex 0 + float3 v1; ///< Vertex 1 + float3 v2; ///< Vertex 2 +}; + #ifndef LIBRARY_COMPILATION // This does not include RayTracingDefs.h as the goal is // to eventually have everything in this file alone diff --git a/src/shadersClean/common/TempAssert.hlsli b/src/shadersClean/common/TempAssert.hlsli new file mode 100644 index 0000000..1407fe8 --- /dev/null +++ b/src/shadersClean/common/TempAssert.hlsli @@ -0,0 +1,38 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +// TODO: this is a temporary assert file to allow files with asserts to be "clean" +// while the assert file itself cannot be. We need this as we have to move files out of "shared" +// which use assert.h, but cannot then include assert.h as "clean" inclusion of shared files isn't set up yet, +// *because* there are too many files in shared, and they can't be moved out because +// they use assert.h and... (cyclical issue) + +#ifndef ASSERT_HLSLI +#define ASSERT_HLSLI +#ifndef GPURT_STATIC_ASSERT +// _Static_assert is not supported with -spirv: https://github.com/microsoft/DirectXShaderCompiler/issues/5750 +#define GPURT_STATIC_ASSERT(condition, message) +#endif +#endif diff --git a/src/shadersClean/common/gfx10/BoxNode1_0.hlsli b/src/shadersClean/common/gfx10/BoxNode1_0.hlsli new file mode 100644 index 0000000..6103e61 --- /dev/null +++ b/src/shadersClean/common/gfx10/BoxNode1_0.hlsli @@ -0,0 +1,137 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +#ifndef BOX_NODE_1_1_HLSLI +#define BOX_NODE_1_1_HLSLI + +#include "../TempAssert.hlsli" + +//===================================================================================================================== +// Hardware 32-bit box node format and offsets +#define FLOAT32_BBOX_STRIDE 24 +#define FLOAT32_BOX_NODE_CHILD0_OFFSET 0 +#define FLOAT32_BOX_NODE_CHILD1_OFFSET 4 +#define FLOAT32_BOX_NODE_CHILD2_OFFSET 8 +#define FLOAT32_BOX_NODE_CHILD3_OFFSET 12 +#define FLOAT32_BOX_NODE_BB0_MIN_OFFSET 16 +#define FLOAT32_BOX_NODE_BB0_MAX_OFFSET 28 +#define FLOAT32_BOX_NODE_BB1_MIN_OFFSET 40 +#define FLOAT32_BOX_NODE_BB1_MAX_OFFSET 52 +#define FLOAT32_BOX_NODE_BB2_MIN_OFFSET 64 +#define FLOAT32_BOX_NODE_BB2_MAX_OFFSET 76 +#define FLOAT32_BOX_NODE_BB3_MIN_OFFSET 88 +#define FLOAT32_BOX_NODE_BB3_MAX_OFFSET 100 +#define FLOAT32_BOX_NODE_FLAGS_OFFSET 112 +#define FLOAT32_BOX_NODE_NUM_PRIM_OFFSET 116 +#define FLOAT32_BOX_NODE_UNUSED2_OFFSET 120 +#define FLOAT32_BOX_NODE_UNUSED3_OFFSET 124 +#define FLOAT32_BOX_NODE_SIZE 128 + +//===================================================================================================================== +// Float32 box node flags contains 4 1-byte fields, 1 per child node: +// Child 0 [ 7: 0] +// Child 1 [15: 8] +// Child 2 [23:16] +// Child 3 [31:24] +// +// Each child node's 1-byte field contains these flags: +// Only Opaque [ 0] +// Only Non-Opaque [ 1] +// Only Triangles [ 2] +// Only Procedural [ 3] +// Unused [7:4] +#define BOX_NODE_FLAGS_BIT_STRIDE 8 + +#define BOX_NODE_FLAGS_ONLY_OPAQUE_SHIFT 0 +#define BOX_NODE_FLAGS_ONLY_NON_OPAQUE_SHIFT 1 +#define BOX_NODE_FLAGS_ONLY_TRIANGLES_SHIFT 2 +#define BOX_NODE_FLAGS_ONLY_PROCEDURAL_SHIFT 3 + +//===================================================================================================================== +struct Float32BoxNode +{ + uint child0; /// 
Child node pointer 0 + uint child1; /// Child node pointer 1 + uint child2; /// Child node pointer 2 + uint child3; /// Child node pointer 3 + + float3 bbox0_min; /// Node bounding box 0 minimum bounds + float3 bbox0_max; /// Node bounding box 0 maximum bounds + + float3 bbox1_min; /// Node bounding box 1 minimum bounds + float3 bbox1_max; /// Node bounding box 1 maximum bounds + + float3 bbox2_min; /// Node bounding box 2 minimum bounds + float3 bbox2_max; /// Node bounding box 2 maximum bounds + + float3 bbox3_min; /// Node bounding box 3 minimum bounds + float3 bbox3_max; /// Node bounding box 3 maximum bounds + + uint flags; /// Reserved for RTIP 2.0 + uint numPrimitives; /// Padding for 64-byte alignment + uint padding2; /// Padding for 64-byte alignment + uint padding3; /// Padding for 64-byte alignment + +}; + +GPURT_STATIC_ASSERT(FLOAT32_BOX_NODE_SIZE == sizeof(Float32BoxNode), "Float32BoxNode structure mismatch"); + +//===================================================================================================================== +// Hardware 16-bit box node format and offsets +#define FLOAT16_BBOX_STRIDE 12 +#define FLOAT16_BOX_NODE_CHILD0_OFFSET 0 +#define FLOAT16_BOX_NODE_CHILD1_OFFSET 4 +#define FLOAT16_BOX_NODE_CHILD2_OFFSET 8 +#define FLOAT16_BOX_NODE_CHILD3_OFFSET 12 +#define FLOAT16_BOX_NODE_BB0_OFFSET 16 +#define FLOAT16_BOX_NODE_BB1_OFFSET 28 +#define FLOAT16_BOX_NODE_BB2_OFFSET 40 +#define FLOAT16_BOX_NODE_BB3_OFFSET 52 +#define FLOAT16_BOX_NODE_SIZE 64 + +//===================================================================================================================== +struct Float16BoxNode +{ + uint child0; /// Child node pointer 0 + uint child1; /// Child node pointer 1 + uint child2; /// Child node pointer 2 + uint child3; /// Child node pointer 3 + + uint3 bbox0; /// Node bounding box 0, packed, uses float16: minx, miny | minz, maxx | maxy, maxz + uint3 bbox1; /// Node bounding box 1, packed, uses float16: minx, miny | minz, maxx 
| maxy, maxz + uint3 bbox2; /// Node bounding box 2, packed, uses float16: minx, miny | minz, maxx | maxy, maxz + uint3 bbox3; /// Node bounding box 3, packed, uses float16: minx, miny | minz, maxx | maxy, maxz + + // NOTE: each bounding box is defined as uint3 for simplicity + // Each 32 bits pack 2x float16s. Order above is written as: a, b + // with a located in the lower 16 bits, b in the upper 16 bits + // bbox0.x stores minx, miny + // + // Alternatively, one can define each bbox as a pair of float16_t3 + // similar to FLOAT32_BOX_NODE. Indexing in hlsl would require extra work +}; + +GPURT_STATIC_ASSERT(FLOAT16_BOX_NODE_SIZE == sizeof(Float16BoxNode), "Float16BoxNode structure mismatch"); + +#endif diff --git a/src/shadersClean/common/gfx10/InstanceNode1_0.hlsli b/src/shadersClean/common/gfx10/InstanceNode1_0.hlsli new file mode 100644 index 0000000..ae0280d --- /dev/null +++ b/src/shadersClean/common/gfx10/InstanceNode1_0.hlsli @@ -0,0 +1,72 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +#ifndef INSTANCE_NODE_1_1_HLSLI +#define INSTANCE_NODE_1_1_HLSLI + +#include "BoxNode1_0.hlsli" +#include "../InstanceDesc.hlsli" +#include "../TempAssert.hlsli" + +//===================================================================================================================== +struct InstanceSidebandData1_1 +{ + uint instanceIndex; + uint blasNodePointer; // might not point to root + uint blasMetadataSize; + uint padding0; + float4 Transform[3]; // Non-inverse (original D3D12_RAYTRACING_INSTANCE_DESC.Transform) +}; + +#define RTIP1_1_INSTANCE_SIDEBAND_INSTANCE_INDEX_OFFSET 0 +#define RTIP1_1_INSTANCE_SIDEBAND_CHILD_POINTER_OFFSET 4 +#define RTIP1_1_INSTANCE_SIDEBAND_CHILD_METADATA_SIZE_OFFSET 8 +#define RTIP1_1_INSTANCE_SIDEBAND_OBJECT2WORLD_OFFSET 16 +#define RTIP1_1_INSTANCE_SIDEBAND_SIZE 64 + +GPURT_STATIC_ASSERT(RTIP1_1_INSTANCE_SIDEBAND_SIZE == sizeof(InstanceSidebandData1_1), "Instance sideband structure mismatch"); + +//===================================================================================================================== +struct FusedInstanceNode +{ + InstanceDesc desc; + InstanceSidebandData1_1 sideband; + Float32BoxNode blasRootNode; +}; + +//===================================================================================================================== +struct InstanceNode +{ + InstanceDesc desc; + InstanceSidebandData1_1 sideband; +}; + +#define INSTANCE_NODE_DESC_OFFSET 0 +#define INSTANCE_NODE_EXTRA_OFFSET 64 +#define INSTANCE_NODE_SIZE 128 +#define FUSED_INSTANCE_NODE_ROOT_OFFSET INSTANCE_NODE_SIZE +#define 
FUSED_INSTANCE_NODE_SIZE 256 +GPURT_STATIC_ASSERT(INSTANCE_NODE_SIZE == sizeof(InstanceNode), "InstanceNode structure mismatch"); + +#endif diff --git a/src/shadersClean/common/gfx10/ProceduralNode1_0.hlsli b/src/shadersClean/common/gfx10/ProceduralNode1_0.hlsli new file mode 100644 index 0000000..4431ecd --- /dev/null +++ b/src/shadersClean/common/gfx10/ProceduralNode1_0.hlsli @@ -0,0 +1,56 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ **********************************************************************************************************************/
+#ifndef PROCEDURAL_NODE_1_0_HLSLI
+#define PROCEDURAL_NODE_1_0_HLSLI
+
+#include "../TempAssert.hlsli"
+#include "TriangleNode1_0.hlsli"
+//=====================================================================================================================
+#define USER_NODE_PROCEDURAL_MIN_OFFSET 0
+#define USER_NODE_PROCEDURAL_MAX_OFFSET 12
+#define USER_NODE_PROCEDURAL_SIZE 64
+
+//=====================================================================================================================
+// Procedural node primitive data offsets
+#define USER_NODE_PROCEDURAL_PRIMITIVE_INDEX_OFFSET TRIANGLE_NODE_PRIMITIVE_INDEX1_OFFSET
+#define USER_NODE_PROCEDURAL_GEOMETRY_INDEX_AND_FLAGS_OFFSET TRIANGLE_NODE_GEOMETRY_INDEX_AND_FLAGS_OFFSET
+#define USER_NODE_PROCEDURAL_TRIANGLE_ID_OFFSET TRIANGLE_NODE_ID_OFFSET
+
+//=====================================================================================================================
+// User defined procedural node format
+struct ProceduralNode
+{
+    float3 bbox_min;
+    float3 bbox_max;
+    uint padding1[6];
+    uint geometryIndexAndFlags;
+    uint reserved;
+    uint primitiveIndex;
+    uint triangleId;
+};
+
+GPURT_STATIC_ASSERT(USER_NODE_PROCEDURAL_SIZE == sizeof(ProceduralNode), "ProceduralNode structure mismatch");
+
+#endif
diff --git a/src/shadersClean/common/gfx10/TriangleNode1_0.hlsli b/src/shadersClean/common/gfx10/TriangleNode1_0.hlsli
new file mode 100644
index 0000000..0d9d1eb
--- /dev/null
+++ b/src/shadersClean/common/gfx10/TriangleNode1_0.hlsli
@@ -0,0 +1,82 @@
+/*
+ ***********************************************************************************************************************
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************************************************************************/
+#ifndef TRIANGLE_NODE_1_0_HLSLI
+#define TRIANGLE_NODE_1_0_HLSLI
+
+#include "../TempAssert.hlsli"
+
+//=====================================================================================================================
+// Hardware triangle node format and offsets
+// Note: GPURT limits triangle compression to 2 triangles per node. As a result the remaining bytes in the triangle node
+// are used for sideband data. The geometry index is packed in bottom 24 bits and geometry flags in bits 24-25.
+#define TRIANGLE_NODE_V0_OFFSET 0 +#define TRIANGLE_NODE_V1_OFFSET 12 +#define TRIANGLE_NODE_V2_OFFSET 24 +#define TRIANGLE_NODE_V3_OFFSET 36 +#define TRIANGLE_NODE_GEOMETRY_INDEX_AND_FLAGS_OFFSET 48 +#define TRIANGLE_NODE_PRIMITIVE_INDEX0_OFFSET 52 +#define TRIANGLE_NODE_PRIMITIVE_INDEX1_OFFSET 56 +#define TRIANGLE_NODE_ID_OFFSET 60 +#define TRIANGLE_NODE_SIZE 64 + +//===================================================================================================================== +// Triangle ID contains 4 1-byte fields, 1 per triangle: +// Triangle 0 [ 7: 0] +// Triangle 1 [15: 8] +// Triangle 2 [23:16] +// Triangle 3 [31:24] +// +// Each triangle's 8-bit segment contains these fields: +// I SRC [1:0] Specifies which vertex in triangle 0 corresponds to the I barycentric value +// J SRC [3:2] Specifies which vertex in triangle 0 corresponds to the J barycentric value +// Double Sided [ 4] Specifies whether triangle 0 should be treated as double sided for culling +// Flip Winding [ 5] Specifies whether triangle 0 should have its facedness flipped +// Procedural [ 6] Specifies whether it is a procedural node +// Opaque [ 7] Specifies whether triangle 0 should be considered as opaque +#define TRIANGLE_ID_BIT_STRIDE 8 + +#define TRIANGLE_ID_I_SRC_SHIFT 0 +#define TRIANGLE_ID_J_SRC_SHIFT 2 +#define TRIANGLE_ID_DOUBLE_SIDED_SHIFT 4 +#define TRIANGLE_ID_FLIP_WINDING_SHIFT 5 +#define TRIANGLE_ID_PROCEDURAL_SHIFT 6 +#define TRIANGLE_ID_OPAQUE_SHIFT 7 + +//===================================================================================================================== +struct TriangleNode +{ + float3 v0; // Vertex 0 + float3 v1; // Vertex 1 + float3 v2; // Vertex 2 + float3 v3; // Vertex 3 + uint geometryIndexAndFlags; // Geometry index and flags for pair of triangles + uint primitiveIndex0; // Primitive index for triangle 0 + uint primitiveIndex1; // Primitive index for triangle 1 + uint triangleId; // Triangle ID +}; + +GPURT_STATIC_ASSERT(TRIANGLE_NODE_SIZE == 
sizeof(TriangleNode), "TriangleNode structure mismatch"); + +#endif diff --git a/src/shadersClean/traversal/TraversalDefs.hlsli b/src/shadersClean/traversal/TraversalDefs.hlsli new file mode 100644 index 0000000..8541f35 --- /dev/null +++ b/src/shadersClean/traversal/TraversalDefs.hlsli @@ -0,0 +1,160 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +#ifndef TRAVERSAL_DEFS_HLSLI +#define TRAVERSAL_DEFS_HLSLI + +#include "../common/TempAssert.hlsli" + +#define ENCODE_FLAG_ARRAY_OF_POINTERS 0x00000001 +#define ENCODE_FLAG_UPDATE_IN_PLACE 0x00000002 +#define ENCODE_FLAG_REBRAID_ENABLED 0x00000004 +#define ENCODE_FLAG_ENABLE_FUSED_INSTANCE_NODE 0x00000008 + +//===================================================================================================================== +struct IntersectionResult +{ + float t; // Relative to tMin + uint nodeIndex; + float2 barycentrics; + uint geometryIndex; + uint primitiveIndex; + uint instNodePtr; + uint hitkind; + uint instanceContribution; + +#if DEVELOPER + uint numIterations; + uint maxStackDepth; + uint numRayBoxTest; + uint numCandidateHits; + uint numRayTriangleTest; + uint numAnyHitInvocation; + uint instanceIntersections; +#endif +}; + +//===================================================================================================================== +// Commit status +typedef uint COMMITTED_STATUS; + +#define COMMITTED_NOTHING 0 +#define COMMITTED_TRIANGLE_HIT 1 +#define COMMITTED_PROCEDURAL_PRIMITIVE_HIT 2 + +//===================================================================================================================== +// Candidate type +typedef uint CANDIDATE_STATUS; + +#define CANDIDATE_NON_OPAQUE_TRIANGLE 0 +#define CANDIDATE_PROCEDURAL_PRIMITIVE 1 +#define CANDIDATE_NON_OPAQUE_PROCEDURAL_PRIMITIVE 2 +#define CANDIDATE_EARLY_RAY_TERMINATE 4 + +//===================================================================================================================== +// Data required for system value intrinsics +struct RaySystemData +{ + uint currNodePtr; + float rayTCurrent; + uint instNodePtr; + uint instanceContribution; + uint geometryIndex; + uint primitiveIndex; + float2 barycentrics; + uint frontFace; + float3 origin; + 
float3 direction; +}; + +//===================================================================================================================== +#if DEFINE_RAYDESC +// Ray description matching the D3D12 HLSL header +struct RayDesc +{ + float3 Origin; + float TMin; + float3 Direction; + float TMax; +}; +#endif + +//===================================================================================================================== +// Internal RayQuery structure initialised at TraceRaysInline() +struct RayQueryInternal +{ + // Internal query data holding address of current BVH and stack information. + // Additional data that may be required will be stored here. + uint bvhLo; + uint bvhHi; + uint topLevelBvhLo; + uint topLevelBvhHi; + uint stackPtr; + uint stackPtrTop; + uint stackNumEntries; + uint instNodePtr; + uint currNodePtr; + uint instanceHitContributionAndFlags; + uint prevNodePtr; + uint isGoingDown; + uint lastInstanceNode; + + RayDesc rayDesc; + float rayTMin; + uint rayFlags; + uint instanceInclusionMask; + + // Candidate system data + CANDIDATE_STATUS candidateType; + RaySystemData candidate; + + // Committed system data + COMMITTED_STATUS committedStatus; + RaySystemData committed; + + uint reserved; + + // Counter data + // @note We don't wrap these in DEVELOPER because it would result in mismatch of RayQuery struct size + // on the driver side when we're not using counters. 
+ uint numRayBoxTest; + uint numRayTriangleTest; + uint numIterations; + uint maxStackDepthAndDynamicId; + uint clocks; + uint numCandidateHits; + uint instanceIntersections; + uint rayQueryObjId; +}; + +//===================================================================================================================== +struct HitGroupInfo +{ + uint2 closestHitId; + uint2 anyHitId; + uint2 intersectionId; + uint tableIndex; +}; + +#endif diff --git a/src/shared/rayTracingDefs.h b/src/shared/rayTracingDefs.h index a2c4a8a..6dfec65 100644 --- a/src/shared/rayTracingDefs.h +++ b/src/shared/rayTracingDefs.h @@ -27,6 +27,10 @@ #ifndef _RAYTRACING_DEF_H #define _RAYTRACING_DEF_H +#ifndef __cplusplus +#include "../shadersClean/common/ShaderDefs.hlsli" +#endif + #include "../../gpurt/gpurtAccelStruct.h" #include "../../gpurt/gpurtBuildSettings.h" #include "../../gpurt/gpurtDispatch.h" @@ -51,136 +55,25 @@ static_assert(GPURT_RTIP2_0 == uint32_t(Pal::RayTracingIpLevel::RtIp2_0), "GPURT #endif //===================================================================================================================== -enum PrimitiveType : uint -{ - Triangle = 0, - AABB = 1, - Instance = 2, -}; - -#if defined(__cplusplus) -#define __decl extern -#endif - -// These DUMMY_*_FUNC postfix stubs must be included at the end of every driver stub (AmdTraceRay*) declaration to -// work around a DXC + Spirv issue where the compiler can't deal with calls to functions that don't have bodies. 
-#define DUMMY_BOOL_FUNC { return false; } -#define DUMMY_VOID_FUNC { } -#define DUMMY_UINT_FUNC { return 0; } -#define DUMMY_UINT2_FUNC { return uint2(0, 0); } -#define DUMMY_UINT3_FUNC { return uint3(0, 0, 0); } -#define DUMMY_UINT4_FUNC { return uint4(0, 0, 0, 0); } -#define DUMMY_FLOAT_FUNC { return 0; } -#define DUMMY_FLOAT2_FUNC { return float2(0, 0); } -#define DUMMY_FLOAT3_FUNC { return float3(0, 0, 0); } - -//===================================================================================================================== -// Acceleration structure type -#define TOP_LEVEL 0 -#define BOTTOM_LEVEL 1 - -//===================================================================================================================== -// BVH node types shared between HW and SW nodes -#define NODE_TYPE_TRIANGLE_0 0 -#define NODE_TYPE_TRIANGLE_1 1 -#define NODE_TYPE_TRIANGLE_2 2 -#define NODE_TYPE_TRIANGLE_3 3 -#define NODE_TYPE_BOX_FLOAT16 4 -#define NODE_TYPE_BOX_FLOAT32 5 -#define NODE_TYPE_USER_NODE_INSTANCE 6 -// From the HW IP 2.0 spec: '7: User Node 1 (processed as a Procedural Node for culling)' -#define NODE_TYPE_USER_NODE_PROCEDURAL 7 -//===================================================================================================================== -// Triangle Compression Modes -#define NO_TRIANGLE_COMPRESSION 0 -#define RESERVED 1 -#define PAIR_TRIANGLE_COMPRESSION 2 -#define AUTO_TRIANGLE_COMPRESSION 3 - -#define LATE_PAIR_COMP_BATCH_SIZE 8 - -//===================================================================================================================== -// Amount of ULPs(Unit in Last Place) added to Box node when using hardware intersection instruction -#define BOX_EXPANSION_DEFAULT_AMOUNT 6 - -//===================================================================================================================== -// Box sorting heuristic value -// 0: closethit -// 1: LargestFirst -// 2: ClosestMidpoint -// 3: undefined / disabled -// 4: 
LargestFirstOrClosest (auto select with rayFlag) -// 5: BoxSortLargestFirstOrClosestMidPoint (auto select with rayFlag) -// 6: DisabledOnAcceptFirstHit (disable if bvhNode sort is on, and rayFlag is AcceptFirstHit) -// -// This need to match ILC_BOX_SORT_HEURISTIC_MODE -enum BoxSortHeuristic : uint -{ - Closest = 0x0, - Largest = 0x1, - MidPoint = 0x2, - Disabled = 0x3, - LargestFirstOrClosest = 0x4, - LargestFirstOrClosestMidPoint = 0x5, - DisabledOnAcceptFirstHit = 0x6, -}; +#define REBRAID_PHASE_CALC_SUM 0 +#define REBRAID_PHASE_OPEN 1 +#define REBRAID_PHASE_DONE 2 -enum SceneBoundsCalculation : uint +struct RebraidState { - SceneBoundsBasedOnGeometry = 0x0, - SceneBoundsBasedOnGeometryWithSize = 0x1 + float sumValue[2]; + uint mutex; + uint numLeafIndices; + uint iterationCount; }; -//===================================================================================================================== -// Options for where FP16 box nodes are created within BLAS for QBVH -#define NO_NODES_IN_BLAS_AS_FP16 0 -#define LEAF_NODES_IN_BLAS_AS_FP16 1 -#define MIXED_NODES_IN_BLAS_AS_FP16 2 -#define ALL_INTERIOR_NODES_IN_BLAS_AS_FP16 3 - -// The highest 3 bits are zero after the right shift in PackNodePointer and may be repurposed. 
-// Mask for MSB within node pointer -#define NODE_POINTER_MASK_MSB 0x80000000u - -//===================================================================================================================== -#define BVH4_NODE_32_STRIDE_SHIFT 7 // Box 32 node -#define BVH4_NODE_16_STRIDE_SHIFT 6 // Box 16 node - -#define INVALID_IDX 0xffffffff -#define INACTIVE_PRIM 0xfffffffe - -static const uint ByteStrideScratchNode = 64; -static const uint ByteStrideU32 = 12; -static const uint IndexFormatInvalid = 0; -static const uint IndexFormatU32 = 1; -static const uint IndexFormatU16 = 2; - -const static uint TILE_WIDTH = 256; -const static uint TILE_SIZE = TILE_WIDTH * TILE_WIDTH; - -#ifndef BUILD_THREADGROUP_SIZE -#define BUILD_THREADGROUP_SIZE 64 -#endif - -//===================================================================================================================== -struct BoundingBox // matches D3D12_RAYTRACING_AABB -{ - float3 min; - float3 max; -}; +#define STATE_REBRAID_SUM_VALUE_OFFSET 0 +#define STATE_REBRAID_MUTEX_OFFSET (STATE_REBRAID_SUM_VALUE_OFFSET + 8) +#define STATE_REBRAID_NUM_LEAF_INDICES_OFFSET (STATE_REBRAID_MUTEX_OFFSET + 4) +#define STATE_REBRAID_ITERATION_COUNT_OFFSET (STATE_REBRAID_NUM_LEAF_INDICES_OFFSET + 4) -#ifndef __cplusplus -//===================================================================================================================== -static BoundingBox CombineAABB( - BoundingBox b0, - BoundingBox b1) -{ - BoundingBox bbox; - bbox.min = min(b0.min, b1.min); - bbox.max = max(b0.max, b1.max); - return bbox; -} -#endif +#define REBRAID_KEYS_PER_THREAD 4 +#define REBRAID_KEYS_PER_GROUP (BUILD_THREADGROUP_SIZE * REBRAID_KEYS_PER_THREAD) //====================================================================================================================== // matches VkAccelerationStructureBuildRangeInfoKHR @@ -193,711 +86,6 @@ struct IndirectBuildRangeInfo }; 
//===================================================================================================================== -struct BoundingBox4 -{ - float4 min; - float4 max; -}; - -//===================================================================================================================== -// Internal bounding box type for scene bounds. -struct UintBoundingBox -{ - uint3 min; - uint3 max; -}; - -struct UintBoundingBox4 -{ - uint4 min; - uint4 max; -}; - -struct PackedUintBoundingBox4 -{ - uint64_t min; - uint64_t max; -}; - -//===================================================================================================================== -// Hardware 32-bit box node format and offsets -#define FLOAT32_BBOX_STRIDE 24 -#define FLOAT32_BOX_NODE_CHILD0_OFFSET 0 -#define FLOAT32_BOX_NODE_CHILD1_OFFSET 4 -#define FLOAT32_BOX_NODE_CHILD2_OFFSET 8 -#define FLOAT32_BOX_NODE_CHILD3_OFFSET 12 -#define FLOAT32_BOX_NODE_BB0_MIN_OFFSET 16 -#define FLOAT32_BOX_NODE_BB0_MAX_OFFSET 28 -#define FLOAT32_BOX_NODE_BB1_MIN_OFFSET 40 -#define FLOAT32_BOX_NODE_BB1_MAX_OFFSET 52 -#define FLOAT32_BOX_NODE_BB2_MIN_OFFSET 64 -#define FLOAT32_BOX_NODE_BB2_MAX_OFFSET 76 -#define FLOAT32_BOX_NODE_BB3_MIN_OFFSET 88 -#define FLOAT32_BOX_NODE_BB3_MAX_OFFSET 100 -#define FLOAT32_BOX_NODE_FLAGS_OFFSET 112 -#define FLOAT32_BOX_NODE_NUM_PRIM_OFFSET 116 -#define FLOAT32_BOX_NODE_UNUSED2_OFFSET 120 -#define FLOAT32_BOX_NODE_UNUSED3_OFFSET 124 -#define FLOAT32_BOX_NODE_SIZE 128 - -//===================================================================================================================== -// Float32 box node flags contains 4 1-byte fields, 1 per child node: -// Child 0 [ 7: 0] -// Child 1 [15: 8] -// Child 2 [23:16] -// Child 3 [31:24] -// -// Each child node's 1-byte field contains these flags: -// Only Opaque [ 0] -// Only Non-Opaque [ 1] -// Only Triangles [ 2] -// Only Procedural [ 3] -// Unused [7:4] -#define BOX_NODE_FLAGS_BIT_STRIDE 8 - -#define 
BOX_NODE_FLAGS_ONLY_OPAQUE_SHIFT 0 -#define BOX_NODE_FLAGS_ONLY_NON_OPAQUE_SHIFT 1 -#define BOX_NODE_FLAGS_ONLY_TRIANGLES_SHIFT 2 -#define BOX_NODE_FLAGS_ONLY_PROCEDURAL_SHIFT 3 - -//===================================================================================================================== -struct Float32BoxNode -{ - uint child0; /// Child node pointer 0 - uint child1; /// Child node pointer 1 - uint child2; /// Child node pointer 2 - uint child3; /// Child node pointer 3 - - float3 bbox0_min; /// Node bounding box 0 minimum bounds - float3 bbox0_max; /// Node bounding box 0 maximum bounds - - float3 bbox1_min; /// Node bounding box 1 minimum bounds - float3 bbox1_max; /// Node bounding box 1 maximum bounds - - float3 bbox2_min; /// Node bounding box 2 minimum bounds - float3 bbox2_max; /// Node bounding box 2 maximum bounds - - float3 bbox3_min; /// Node bounding box 3 minimum bounds - float3 bbox3_max; /// Node bounding box 3 maximum bounds - - uint flags; /// Reserved for RTIP 2.0 - uint numPrimitives; /// Padding for 64-byte alignment - uint padding2; /// Padding for 64-byte alignment - uint padding3; /// Padding for 64-byte alignment - -#ifdef __cplusplus - // parameterised constructor for HLSL compatibility - Float32BoxNode(uint val) - { - memset(this, val, sizeof(Float32BoxNode)); - } - - // default constructor - Float32BoxNode() : Float32BoxNode(0) - { - } -#endif -}; - -#ifdef __cplusplus -static_assert(FLOAT32_BOX_NODE_SIZE == sizeof(Float32BoxNode), "Float32BoxNode structure mismatch"); -static_assert(FLOAT32_BOX_NODE_CHILD0_OFFSET == offsetof(Float32BoxNode, child0), ""); -static_assert(FLOAT32_BOX_NODE_CHILD1_OFFSET == offsetof(Float32BoxNode, child1), ""); -static_assert(FLOAT32_BOX_NODE_CHILD2_OFFSET == offsetof(Float32BoxNode, child2), ""); -static_assert(FLOAT32_BOX_NODE_CHILD3_OFFSET == offsetof(Float32BoxNode, child3), ""); -static_assert(FLOAT32_BOX_NODE_BB0_MIN_OFFSET == offsetof(Float32BoxNode, bbox0_min), ""); 
-static_assert(FLOAT32_BOX_NODE_BB0_MAX_OFFSET == offsetof(Float32BoxNode, bbox0_max), ""); -static_assert(FLOAT32_BOX_NODE_BB1_MIN_OFFSET == offsetof(Float32BoxNode, bbox1_min), ""); -static_assert(FLOAT32_BOX_NODE_BB1_MAX_OFFSET == offsetof(Float32BoxNode, bbox1_max), ""); -static_assert(FLOAT32_BOX_NODE_BB2_MIN_OFFSET == offsetof(Float32BoxNode, bbox2_min), ""); -static_assert(FLOAT32_BOX_NODE_BB2_MAX_OFFSET == offsetof(Float32BoxNode, bbox2_max), ""); -static_assert(FLOAT32_BOX_NODE_BB3_MIN_OFFSET == offsetof(Float32BoxNode, bbox3_min), ""); -static_assert(FLOAT32_BOX_NODE_BB3_MAX_OFFSET == offsetof(Float32BoxNode, bbox3_max), ""); -static_assert(FLOAT32_BOX_NODE_FLAGS_OFFSET == offsetof(Float32BoxNode, flags), ""); -static_assert(FLOAT32_BOX_NODE_NUM_PRIM_OFFSET == offsetof(Float32BoxNode, numPrimitives), ""); -static_assert(FLOAT32_BOX_NODE_UNUSED2_OFFSET == offsetof(Float32BoxNode, padding2), ""); -static_assert(FLOAT32_BOX_NODE_UNUSED3_OFFSET == offsetof(Float32BoxNode, padding3), ""); -#endif - -//===================================================================================================================== -// Hardware 16-bit box node format and offsets -#define FLOAT16_BBOX_STRIDE 12 -#define FLOAT16_BOX_NODE_CHILD0_OFFSET 0 -#define FLOAT16_BOX_NODE_CHILD1_OFFSET 4 -#define FLOAT16_BOX_NODE_CHILD2_OFFSET 8 -#define FLOAT16_BOX_NODE_CHILD3_OFFSET 12 -#define FLOAT16_BOX_NODE_BB0_OFFSET 16 -#define FLOAT16_BOX_NODE_BB1_OFFSET 28 -#define FLOAT16_BOX_NODE_BB2_OFFSET 40 -#define FLOAT16_BOX_NODE_BB3_OFFSET 52 -#define FLOAT16_BOX_NODE_SIZE 64 - -//===================================================================================================================== -struct Float16BoxNode -{ - uint child0; /// Child node pointer 0 - uint child1; /// Child node pointer 1 - uint child2; /// Child node pointer 2 - uint child3; /// Child node pointer 3 - - uint3 bbox0; /// Node bounding box 0, packed, uses float16: minx, miny | minz, maxx | maxy, maxz - 
uint3 bbox1; /// Node bounding box 1, packed, uses float16: minx, miny | minz, maxx | maxy, maxz - uint3 bbox2; /// Node bounding box 2, packed, uses float16: minx, miny | minz, maxx | maxy, maxz - uint3 bbox3; /// Node bounding box 3, packed, uses float16: minx, miny | minz, maxx | maxy, maxz - - // NOTE: each bounding box is defined as uint3 for simplicity - // Each 32 bits pack 2x float16s. Order above is written as: a, b - // with a located in the lower 16 bits, b in the upper 16 bits - // bbox0.x stores minx, miny - // - // Alternatively, one can define each bbox as a pair of float16_t3 - // similar to FLOAT32_BOX_NODE. Indexing in hlsl would require extra work -}; - -#ifdef __cplusplus -static_assert(FLOAT16_BOX_NODE_SIZE == sizeof(Float16BoxNode), "Float16BoxNode structure mismatch"); -static_assert(FLOAT16_BOX_NODE_CHILD0_OFFSET == offsetof(Float16BoxNode, child0), ""); -static_assert(FLOAT16_BOX_NODE_CHILD1_OFFSET == offsetof(Float16BoxNode, child1), ""); -static_assert(FLOAT16_BOX_NODE_CHILD2_OFFSET == offsetof(Float16BoxNode, child2), ""); -static_assert(FLOAT16_BOX_NODE_CHILD3_OFFSET == offsetof(Float16BoxNode, child3), ""); -static_assert(FLOAT16_BOX_NODE_BB0_OFFSET == offsetof(Float16BoxNode, bbox0), ""); -static_assert(FLOAT16_BOX_NODE_BB1_OFFSET == offsetof(Float16BoxNode, bbox1), ""); -static_assert(FLOAT16_BOX_NODE_BB2_OFFSET == offsetof(Float16BoxNode, bbox2), ""); -static_assert(FLOAT16_BOX_NODE_BB3_OFFSET == offsetof(Float16BoxNode, bbox3), ""); -#endif - -//===================================================================================================================== -// Hardware triangle node format and offsets -// Note: GPURT limits triangle compression to 2 triangles per node. As a result the remaining bytes in the triangle node -// are used for sideband data. The geometry index is packed in bottom 24 bits and geometry flags in bits 25-26. 
-#define TRIANGLE_NODE_V0_OFFSET 0 -#define TRIANGLE_NODE_V1_OFFSET 12 -#define TRIANGLE_NODE_V2_OFFSET 24 -#define TRIANGLE_NODE_V3_OFFSET 36 -#define TRIANGLE_NODE_GEOMETRY_INDEX_AND_FLAGS_OFFSET 48 -#define TRIANGLE_NODE_PRIMITIVE_INDEX0_OFFSET 52 -#define TRIANGLE_NODE_PRIMITIVE_INDEX1_OFFSET 56 -#define TRIANGLE_NODE_ID_OFFSET 60 -#define TRIANGLE_NODE_SIZE 64 - -//===================================================================================================================== -// Triangle ID contains 4 1-byte fields, 1 per triangle: -// Triangle 0 [ 7: 0] -// Triangle 1 [15: 8] -// Triangle 2 [23:16] -// Triangle 3 [31:24] -// -// Each triangle's 8-bit segment contains these fields: -// I SRC [1:0] Specifies which vertex in triangle 0 corresponds to the I barycentric value -// J SRC [3:2] Specifies which vertex in triangle 0 corresponds to the J barycentric value -// Double Sided [ 4] Specifies whether triangle 0 should be treated as double sided for culling -// Flip Winding [ 5] Specifies whether triangle 0 should have its facedness flipped -// Procedural [ 6] Specifies whether it is a procedural node -// Opaque [ 7] Specifies whether triangle 0 should be considered as opaque -#define TRIANGLE_ID_BIT_STRIDE 8 - -#define TRIANGLE_ID_I_SRC_SHIFT 0 -#define TRIANGLE_ID_J_SRC_SHIFT 2 -#define TRIANGLE_ID_DOUBLE_SIDED_SHIFT 4 -#define TRIANGLE_ID_FLIP_WINDING_SHIFT 5 -#define TRIANGLE_ID_PROCEDURAL_SHIFT 6 -#define TRIANGLE_ID_OPAQUE_SHIFT 7 - -//===================================================================================================================== -struct TriangleNode -{ - float3 v0; // Vertex 0 - float3 v1; // Vertex 1 - float3 v2; // Vertex 2 - float3 v3; // Vertex 3 - uint geometryIndexAndFlags; // Geometry index and flags for pair of triangles - uint primitiveIndex0; // Primitive index for triangle 0 - uint primitiveIndex1; // Primitive index for triangle 1 - uint triangleId; // Triangle ID -}; - -#ifdef __cplusplus 
-static_assert(TRIANGLE_NODE_SIZE == sizeof(TriangleNode), "TriangleNode structure mismatch"); -static_assert(TRIANGLE_NODE_V0_OFFSET == offsetof(TriangleNode, v0), ""); -static_assert(TRIANGLE_NODE_V1_OFFSET == offsetof(TriangleNode, v1), ""); -static_assert(TRIANGLE_NODE_V2_OFFSET == offsetof(TriangleNode, v2), ""); -static_assert(TRIANGLE_NODE_V3_OFFSET == offsetof(TriangleNode, v3), ""); -static_assert(TRIANGLE_NODE_ID_OFFSET == offsetof(TriangleNode, triangleId), ""); -#endif - -//===================================================================================================================== -#define USER_NODE_PROCEDURAL_MIN_OFFSET 0 -#define USER_NODE_PROCEDURAL_MAX_OFFSET 12 -#define USER_NODE_PROCEDURAL_SIZE 64 - -//===================================================================================================================== -// Procedural node primitive data offsets -#define USER_NODE_PROCEDURAL_PRIMITIVE_INDEX_OFFSET TRIANGLE_NODE_PRIMITIVE_INDEX1_OFFSET -#define USER_NODE_PROCEDURAL_GEOMETRY_INDEX_AND_FLAGS_OFFSET TRIANGLE_NODE_GEOMETRY_INDEX_AND_FLAGS_OFFSET -#define USER_NODE_PROCEDURAL_TRIANGLE_ID_OFFSET TRIANGLE_NODE_ID_OFFSET - -//===================================================================================================================== -// User defined procedural node format -struct ProceduralNode -{ - float3 bbox_min; - float3 bbox_max; - uint padding1[6]; - uint geometryIndexAndFlags; - uint reserved; - uint primitiveIndex; - uint triangleId; -}; - -#ifdef __cplusplus -static_assert(USER_NODE_PROCEDURAL_SIZE == sizeof(ProceduralNode), "ProceduralNode structure mismatch"); -static_assert(USER_NODE_PROCEDURAL_MIN_OFFSET == offsetof(ProceduralNode, bbox_min), ""); -static_assert(USER_NODE_PROCEDURAL_MAX_OFFSET == offsetof(ProceduralNode, bbox_max), ""); -static_assert(USER_NODE_PROCEDURAL_GEOMETRY_INDEX_AND_FLAGS_OFFSET == offsetof(ProceduralNode, geometryIndexAndFlags), ""); 
-static_assert(USER_NODE_PROCEDURAL_PRIMITIVE_INDEX_OFFSET == offsetof(ProceduralNode, primitiveIndex), ""); -static_assert(USER_NODE_PROCEDURAL_TRIANGLE_ID_OFFSET == offsetof(ProceduralNode, triangleId), ""); -#endif - -#ifdef __cplusplus -//===================================================================================================================== -union NodePointer32 -{ - struct - { - uint32_t type : 3; // Hardware NODE_TYPE_* - uint32_t aligned_offset_64b : 29; // 64-byte aligned offset - }; - - uint32_t u32; -}; - -//===================================================================================================================== -// Instance base pointer layout from the HW raytracing IP 2.0 spec: -// Zero [ 2: 0] -// Tree Base Address (64B index)[53: 3] -// Force Opaque [ 54] -// Force Non-Opaque [ 55] -// Disable Triangle Cull [ 56] -// Flip Facedness [ 57] -// Cull Back Facing Triangles [ 58] -// Cull Front Facing Triangles [ 59] -// Cull Opaque [ 60] -// Cull Non-Opaque [ 61] -// Skip Triangles [ 62] -// Skip Procedural [ 63] -union NodePointer64 -{ - struct - { - uint64_t type : 3; // Hardware NODE_TYPE_* - uint64_t aligned_addr_64b : 51; // 64-byte aligned address - uint64_t force_opaque : 1; - uint64_t force_non_opaque : 1; - uint64_t disable_triangle_cull : 1; - uint64_t flip_facedness : 1; - uint64_t cull_back_face_triangle : 1; - uint64_t cull_front_face_triangle : 1; - uint64_t cull_opaque : 1; - uint64_t cull_non_opaque : 1; - uint64_t skip_triangles : 1; - uint64_t skip_procedural : 1; - }; - - uint64_t u64; -}; - -//===================================================================================================================== -union HwTriangleFlags -{ - struct - { - uint8_t i : 2; - uint8_t j : 2; - uint8_t double_sided : 1; - uint8_t flip_winding : 1; - uint8_t unused : 1; - uint8_t opaque : 1; - }; - - uint8_t u8; -}; - 
-//===================================================================================================================== -union HwTriangleID -{ - struct - { - HwTriangleFlags triangle0; - HwTriangleFlags triangle1; - uint16_t unused; - }; - - uint32_t u32; -}; - -//===================================================================================================================== -union BoxNodeChildFlags -{ - struct - { - uint8_t only_opaque : 1; - uint8_t only_non_opaque : 1; - uint8_t only_triangles : 1; - uint8_t only_procedural : 1; - uint8_t unused : 4; - }; - - uint8_t u8All; -}; - -//===================================================================================================================== -union BoxNodeFlags -{ - struct - { - BoxNodeChildFlags child0; - BoxNodeChildFlags child1; - BoxNodeChildFlags child2; - BoxNodeChildFlags child3; - }; - - uint32_t u32All; -}; -#endif - -//===================================================================================================================== -// Node pointer size in bytes -#define NODE_PTR_SIZE 4 - -#ifdef __cplusplus -static_assert(NODE_PTR_SIZE == sizeof(NodePointer32), "Node pointer size mismatch"); -#endif - -//===================================================================================================================== -// Function assumes the type passed in is a valid node type -// -static uint PackNodePointer(uint type, uint address) -{ - uint nodePointer = type; // this assumes that the type is valid - // uint pointer = type & 0x7; - - // The input address is a byte offset, and node_addr is a 64-byte offset that starts at bit 3. 
- nodePointer |= (address >> 3); // this assumes that the input address is 64-byte aligned - // pointer |= (address >> 6) << 3; - - return nodePointer; -} - -//===================================================================================================================== -static uint GetNodeType(uint nodePointer) -{ - // From the HW raytracing spec: - // node_type = node_pointer[ 2:0] - return nodePointer & 0x7; -} - -//===================================================================================================================== -static uint ClearNodeType(uint nodePointer) -{ - return nodePointer & ~0x7; -} - -//===================================================================================================================== -// NOTE: The highest 3 bits are excluded. They aren't written when building the QBVH and may have been repurposed. See -// NODE_POINTER_MASK_MSB -static uint ExtractNodePointerOffset(uint nodePointer) -{ - // From the HW raytracing spec: - // node_addr[60:0] = node_pointer[63:3] - // Also, based on the following, the node_addr is 64-byte aligned: - // fetch_addr0 = T#.base_address*256+node_addr*64 - return ClearNodeType(nodePointer) << 3; -} - -//===================================================================================================================== -// Removes temp flag (MSB) within node type set by RefitBounds when fp16 nodes mode is LEAF_NODES_IN_BLAS_AS_FP16. 
-static uint GetNodePointerExclMsbFlag(uint nodePointer) -{ - return nodePointer & (~NODE_POINTER_MASK_MSB); -} - -//===================================================================================================================== -// Primitive data structure that includes the unpacked data needed to process a primitive -struct PrimitiveData -{ - uint primitiveIndex; // Primitive index used to indicate what primitive in geometry description - uint geometryIndex; // Geometry index used to indicate what geometry description - uint geometryFlags; // Geometry flags contains if the geometry is opaque or non opaque -}; - -//===================================================================================================================== -// Extract the geometry index from the bottom 24 bits -static uint ExtractGeometryIndex(uint geometryIndexAndFlags) -{ - return geometryIndexAndFlags & 0xFFFFFF; -} - -//===================================================================================================================== -// Extract the geometry flags from bits 25-26 -static uint ExtractGeometryFlags(uint geometryIndexAndFlags) -{ - return (geometryIndexAndFlags >> 24) & 0x3; -} - -//===================================================================================================================== -// Extract the geometry index from the bottom 24 bits and geometry flags from bits 25-26 -static uint2 UnpackGeometryIndexAndFlags(uint geometryIndexAndFlags) -{ - return uint2(ExtractGeometryIndex(geometryIndexAndFlags), ExtractGeometryFlags(geometryIndexAndFlags)); -} - -//===================================================================================================================== -// Pack the geometry index in the bottom 24 bits and the geometry flags into bits 25-26 -static uint PackGeometryIndexAndFlags( - uint geometryIndex, - uint geometryFlags) -{ - return (geometryFlags << 24) | (geometryIndex & 0xFFFFFF); -} - 
-//===================================================================================================================== -// Additional geometry information for bottom level acceleration structures primitives -struct GeometryInfo -{ - uint geometryFlagsAndNumPrimitives; - uint geometryBufferOffset; - uint primNodePtrsOffset; // Offset from the base of all prim node ptrs to this geometry's prim node ptrs -}; - -#define DXGI_FORMAT_UNKNOWN 0 -#define DXGI_FORMAT_R32G32B32_FLOAT 6 - -#define DECODE_VERTEX_STRIDE 12 -#define DECODE_PRIMITIVE_STRIDE_TRIANGLE 36 -#define DECODE_PRIMITIVE_STRIDE_AABB 24 -#define GEOMETRY_INFO_SIZE 12 -#define GEOMETRY_INFO_FLAGS_AND_NUM_PRIMS_OFFSET 0 -#define GEOMETRY_INFO_GEOM_BUFFER_OFFSET 4 -#define GEOMETRY_INFO_PRIM_NODE_PTRS_OFFSET 8 - -#define PIPELINE_FLAG_SKIP_TRIANGLES 0x100 -#define PIPELINE_FLAG_SKIP_PROCEDURAL_PRIMITIVES 0x200 - -#ifdef __cplusplus -static_assert(GEOMETRY_INFO_SIZE == sizeof(GeometryInfo), "Geometry info structure mismatch"); -static_assert(GEOMETRY_INFO_FLAGS_AND_NUM_PRIMS_OFFSET == offsetof(GeometryInfo, geometryFlagsAndNumPrimitives), ""); -static_assert(GEOMETRY_INFO_GEOM_BUFFER_OFFSET == offsetof(GeometryInfo, geometryBufferOffset), ""); -static_assert(GEOMETRY_INFO_PRIM_NODE_PTRS_OFFSET == offsetof(GeometryInfo, primNodePtrsOffset), ""); -#endif - -//===================================================================================================================== -static uint ExtractGeometryInfoFlags(uint packedGeometryFlagsAndNumPrimitives) -{ - return (packedGeometryFlagsAndNumPrimitives >> 29); -} - -//===================================================================================================================== -static uint ExtractGeometryInfoNumPrimitives(uint packedGeometryFlagsAndNumPrimitives) -{ - // ((1 << 29) - 1) = 0x1fffffff - return (packedGeometryFlagsAndNumPrimitives & 0x1FFFFFFF); -} - 
-//===================================================================================================================== -static uint PackGeometryFlagsAndNumPrimitives(uint geometryFlags, uint numPrimitives) -{ - return (geometryFlags << 29) | numPrimitives; -} - -//===================================================================================================================== -// 64-byte aligned BVH2 node structure -struct BVHNode -{ - float3 bbox_left_min_or_v0; /// Left Node bounding box minimum bounds or vertex 0 - uint left; /// Left child node pointer (Also, primitive ID for leaves, instance ID for instances) - - float3 bbox_left_max_or_v1; /// Left Node bounding box maximum bounds or vertex 1 - uint right; /// Right child node pointer (Also, geometry Index for leaves) - - float3 bbox_right_min_or_v2; /// Right Node bounding box min bounds or vertex 2 - uint flags; /// Bottom: geometry flags OR Top: node[0] this is used to hold num instances - - float3 bbox_right_max; /// Right node bounding box max bounds - uint unused; /// Unused -}; - -#define BVH_NODE_SIZE 64 -#define BVH_NODE_LEFT_MIN_OFFSET 0 -#define BVH_NODE_V0_OFFSET BVH_NODE_LEFT_MIN_OFFSET -#define BVH_NODE_LEFT_OFFSET 12 -#define BVH_NODE_PRIMITIVE_ID_OFFSET BVH_NODE_LEFT_OFFSET -#define BVH_NODE_LEFT_MAX_OFFSET 16 -#define BVH_NODE_V1_OFFSET BVH_NODE_LEFT_MAX_OFFSET -#define BVH_NODE_RIGHT_OFFSET 28 -#define BVH_NODE_GEOMETRY_INDEX_OFFSET BVH_NODE_RIGHT_OFFSET -#define BVH_NODE_RIGHT_MIN_OFFSET 32 -#define BVH_NODE_V2_OFFSET BVH_NODE_RIGHT_MIN_OFFSET -#define BVH_NODE_FLAGS_OFFSET 44 -#define BVH_NODE_RIGHT_MAX_OFFSET 48 - -#ifdef __cplusplus -static_assert(BVH_NODE_SIZE == sizeof(BVHNode), "BVH2Node structure mismatch"); -static_assert(BVH_NODE_LEFT_MIN_OFFSET == offsetof(BVHNode, bbox_left_min_or_v0), ""); -static_assert(BVH_NODE_LEFT_OFFSET == offsetof(BVHNode, left), ""); -static_assert(BVH_NODE_LEFT_MAX_OFFSET == offsetof(BVHNode, bbox_left_max_or_v1), ""); 
-static_assert(BVH_NODE_RIGHT_OFFSET == offsetof(BVHNode, right), ""); -static_assert(BVH_NODE_RIGHT_MIN_OFFSET == offsetof(BVHNode, bbox_right_min_or_v2), ""); -static_assert(BVH_NODE_FLAGS_OFFSET == offsetof(BVHNode, flags), ""); -static_assert(BVH_NODE_RIGHT_MAX_OFFSET == offsetof(BVHNode, bbox_right_max), ""); -#endif - -//===================================================================================================================== -struct InstanceSidebandData1_1 -{ - uint instanceIndex; - uint blasNodePointer; // might not point to root - uint blasMetadataSize; - uint padding0; - float4 Transform[3]; // Non-inverse (original D3D12_RAYTRACING_INSTANCE_DESC.Transform) -}; - -#define RTIP1_1_INSTANCE_SIDEBAND_INSTANCE_INDEX_OFFSET 0 -#define RTIP1_1_INSTANCE_SIDEBAND_CHILD_POINTER_OFFSET 4 -#define RTIP1_1_INSTANCE_SIDEBAND_CHILD_METADATA_SIZE_OFFSET 8 -#define RTIP1_1_INSTANCE_SIDEBAND_OBJECT2WORLD_OFFSET 16 -#define RTIP1_1_INSTANCE_SIDEBAND_SIZE 64 - -//===================================================================================================================== -// 64-byte aligned structure matching D3D12_RAYTRACING_INSTANCE_DESC -struct InstanceDesc -{ - float4 Transform[3]; // Inverse transform for traversal - uint InstanceID_and_Mask; // 24-bit instance ID and 8-bit mask - uint InstanceContributionToHitGroupIndex_and_Flags; // 24-bit instance contribution and 8-bit flags - uint accelStructureAddressLo; // Lower part of acceleration structure base address - uint accelStructureAddressHiAndFlags; // Upper part of acceleration structure base address and - // HW raytracing IP 2.0 flags -}; - -#define INSTANCE_DESC_SIZE 64 -#define INSTANCE_DESC_WORLD_TO_OBJECT_XFORM_OFFSET 0 -#define INSTANCE_DESC_ID_AND_MASK_OFFSET 48 -#define INSTANCE_DESC_CONTRIBUTION_AND_FLAGS_OFFSET 52 -#define INSTANCE_DESC_VA_LO_OFFSET 56 -#define INSTANCE_DESC_VA_HI_OFFSET 60 - -#ifdef __cplusplus -static_assert(INSTANCE_DESC_SIZE == sizeof(InstanceDesc), "InstanceDesc 
structure mismatch"); -static_assert(INSTANCE_DESC_ID_AND_MASK_OFFSET == offsetof(InstanceDesc, InstanceID_and_Mask), ""); -static_assert(INSTANCE_DESC_CONTRIBUTION_AND_FLAGS_OFFSET == offsetof(InstanceDesc, InstanceContributionToHitGroupIndex_and_Flags), ""); -static_assert(INSTANCE_DESC_VA_LO_OFFSET == offsetof(InstanceDesc, accelStructureAddressLo), ""); -static_assert(INSTANCE_DESC_VA_HI_OFFSET == offsetof(InstanceDesc, accelStructureAddressHiAndFlags), ""); -#endif - -#ifdef __cplusplus -static_assert(RTIP1_1_INSTANCE_SIDEBAND_SIZE == sizeof(InstanceSidebandData1_1), "Instance sideband structure mismatch"); -static_assert(RTIP1_1_INSTANCE_SIDEBAND_INSTANCE_INDEX_OFFSET == offsetof(InstanceSidebandData1_1, instanceIndex), ""); -static_assert(RTIP1_1_INSTANCE_SIDEBAND_CHILD_POINTER_OFFSET == offsetof(InstanceSidebandData1_1, blasNodePointer), ""); -static_assert(RTIP1_1_INSTANCE_SIDEBAND_OBJECT2WORLD_OFFSET == offsetof(InstanceSidebandData1_1, Transform[0]), ""); -#endif - -//===================================================================================================================== -struct FusedInstanceNode -{ - InstanceDesc desc; - InstanceSidebandData1_1 sideband; - Float32BoxNode blasRootNode; -}; - -//===================================================================================================================== -struct InstanceNode -{ - InstanceDesc desc; - InstanceSidebandData1_1 sideband; -}; - -#define INSTANCE_NODE_DESC_OFFSET 0 -#define INSTANCE_NODE_EXTRA_OFFSET 64 -#define INSTANCE_NODE_SIZE 128 -#define FUSED_INSTANCE_NODE_ROOT_OFFSET INSTANCE_NODE_SIZE -#define FUSED_INSTANCE_NODE_SIZE 256 - -#ifdef __cplusplus -static_assert(INSTANCE_NODE_SIZE == sizeof(InstanceNode), "InstanceNode structure mismatch"); -static_assert(INSTANCE_NODE_DESC_OFFSET == offsetof(InstanceNode, desc), "InstanceNode structure mismatch"); -static_assert(INSTANCE_NODE_EXTRA_OFFSET == offsetof(InstanceNode, sideband), "InstanceNode structure mismatch"); -#endif 
- -//===================================================================================================================== -static uint64_t PackUint64(uint lowBits, uint highBits) -{ - // Note glslang doesn't like uint64_t casts - uint64_t addr = highBits; - addr = (addr << 32) | lowBits; - return addr; -} - -//====================================================================================================================== -// Packs the channels of a uint2 into a single uint64_t. -static uint64_t PackUint64(uint2 lowHigh) -{ - // Note glslang doesn't like uint64_t casts - uint64_t addr = lowHigh.y; - addr = (addr << 32) | lowHigh.x; - return addr; -} - -//===================================================================================================================== -static uint2 SplitUint64(uint64_t x) -{ - return uint2(x, (x >> 32)); -} - -//===================================================================================================================== -// Instance base pointer layout from the HW raytracing IP 2.0 spec: -// Zero [ 2: 0] -// Tree Base Address (64B index)[53: 3] -// Force Opaque [ 54] -// Force Non-Opaque [ 55] -// Disable Triangle Cull [ 56] -// Flip Facedness [ 57] -// Cull Back Facing Triangles [ 58] -// Cull Front Facing Triangles [ 59] -// Cull Opaque [ 60] -// Cull Non-Opaque [ 61] -// Skip Triangles [ 62] -// Skip Procedural [ 63] -// -// Since GPU VAs can only be 48 bits, only 42 bits of the Tree Base Address field are used: -// Used Address [44: 3] -// Unused Address [53:45] -// -#define INSTANCE_BASE_POINTER_ZERO_MASK 0x7ull -#define INSTANCE_BASE_POINTER_ADDRESS_USED_MASK 0x1FFFFFFFFFF8ull -#define INSTANCE_BASE_POINTER_ADDRESS_UNUSED_MASK 0x3FE00000000000ull -#define INSTANCE_BASE_POINTER_ADDRESS_MASK 0x3FFFFFFFFFFFF8ull -#define INSTANCE_BASE_POINTER_FLAGS_MASK 0xFFC0000000000000ull - -#define NODE_POINTER_FLAGS_SHIFT 54 -#define NODE_POINTER_FORCE_OPAQUE_SHIFT 54 -#define NODE_POINTER_FORCE_NON_OPAQUE_SHIFT 55 -#define 
NODE_POINTER_DISABLE_TRIANGLE_CULL_SHIFT 56 -#define NODE_POINTER_FLIP_FACEDNESS_SHIFT 57 -#define NODE_POINTER_CULL_BACK_FACING_SHIFT 58 -#define NODE_POINTER_CULL_FRONT_FACING_SHIFT 59 -#define NODE_POINTER_CULL_OPAQUE_SHIFT 60 -#define NODE_POINTER_CULL_NON_OPAQUE_SHIFT 61 -#define NODE_POINTER_SKIP_TRIANGLES_SHIFT 62 -#define NODE_POINTER_SKIP_PROCEDURAL_SHIFT 63 - -#define RAY_FLAG_VALID_MASK 0x3ffu -#define RAY_FLAG_EXCLUDE_MASK (RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH | RAY_FLAG_SKIP_CLOSEST_HIT_SHADER) -#define RAY_FLAG_OVERRIDE_MASK (RAY_FLAG_FORCE_OPAQUE | RAY_FLAG_FORCE_NON_OPAQUE) // 0x3 -#define RAY_FLAG_PRESERVE_MASK (RAY_FLAG_VALID_MASK & (~RAY_FLAG_OVERRIDE_MASK)) // 0x3fc - -#define POINTER_FLAGS_HIDWORD_SHIFT (NODE_POINTER_FORCE_OPAQUE_SHIFT - 32) // 22 -#define POINTER_FLAGS_VALID_MASK (RAY_FLAG_VALID_MASK << POINTER_FLAGS_HIDWORD_SHIFT) // 0x3ff << 22 -#define POINTER_FLAGS_EXCLUDED_MASK ~(POINTER_FLAGS_VALID_MASK) // 0xFFC00000 //===================================================================================================================== struct StackPtrs @@ -919,112 +107,6 @@ static_assert(STACK_PTRS_DST_PTR_OFFSET == offsetof(StackPtrs, stackPtrNo static_assert(STACK_PTRS_NUM_LEAFS_DONE_OFFSET == offsetof(StackPtrs, numLeafsDone), ""); #endif -//===================================================================================================================== -// Build Stage Counters (Debug only) -// It starts with the qbvhGlobalCounters offset, i.e., -// qbvhGlobalStack...qbvhGlobalStackPtrs...bvhBuildDebugCounters - -#define COUNTER_MORTONGEN_OFFSET 0x0 -#define COUNTER_MORTON_SORT_OFFSET 0x4 -#define COUNTER_SORTLEAF_OFFSET 0x8 -#define COUNTER_BUILDPLOC_OFFSET 0xC -#define COUNTER_BUILDLBVH_OFFSET 0x10 -#define COUNTER_REFIT_OFFSET 0x14 -#define COUNTER_INITENCODEHWBVH_OFFSET 0x18 -#define COUNTER_ENCODEHWBVH_OFFSET 0x1C -#define COUNTER_EMPTYPRIM_OFFSET 0x20 -#define COUNTER_EMITCOMPACTSIZE_OFFSET 0x24 -#define 
COUNTER_BUILDFASTLBVH_OFFSET 0x28 - -//===================================================================================================================== -// Get leaf triangle node size in bytes -static uint GetBvhNodeSizeTriangle() -{ - return TRIANGLE_NODE_SIZE; -} - -//===================================================================================================================== -// Get leaf AABB node size in bytes -static uint GetBvhNodeSizeProcedural() -{ - return USER_NODE_PROCEDURAL_SIZE; -} - -//===================================================================================================================== -// Get leaf instance node size in bytes -static uint GetBvhNodeSizeInstance(uint enableFusedInstanceNode) -{ - return (enableFusedInstanceNode == 0) ? INSTANCE_NODE_SIZE : FUSED_INSTANCE_NODE_SIZE; -} - -//===================================================================================================================== -// Get internal BVH node size in bytes -static uint GetBvhNodeSizeInternal() -{ - return FLOAT32_BOX_NODE_SIZE; -} - -//===================================================================================================================== -// Get internal BVH node size in bytes -static uint GetBvhNodeSizeLeaf( - uint primitiveType, - uint enableFusedInstanceNode) -{ - uint sizeInBytes = 0; - switch (primitiveType) - { - case PrimitiveType::Triangle: - sizeInBytes = GetBvhNodeSizeTriangle(); - break; - case PrimitiveType::AABB: - sizeInBytes = GetBvhNodeSizeProcedural(); - break; - case PrimitiveType::Instance: - sizeInBytes = GetBvhNodeSizeInstance(enableFusedInstanceNode); - break; - } - - return sizeInBytes; -} - -//===================================================================================================================== -static uint CalcParentPtrOffset(uint nodePtr) -{ - // Subtract 1 from the index to account for negative offset calculations. I.e. 
index 0 is actually at -4 byte - // offset from the end of the parent pointer memory - const uint linkIndex = (nodePtr >> 3) - 1; - return linkIndex * NODE_PTR_SIZE; -} - -//===================================================================================================================== -static uint CalcBottomGeometryInfoSize(uint numGeometries) -{ - return numGeometries * GEOMETRY_INFO_SIZE; -} - -//===================================================================================================================== -struct DataOffsetAndSize -{ - uint offset; - uint size; -}; - -//===================================================================================================================== -struct StateTaskQueueCounter -{ - uint phase; - uint startPhaseIndex; - uint endPhaseIndex; - uint taskCounter; - uint numTasksDone; -}; - -#define STATE_TASK_QUEUE_PHASE_OFFSET 0 -#define STATE_TASK_QUEUE_START_PHASE_INDEX_OFFSET 4 -#define STATE_TASK_QUEUE_END_PHASE_INDEX_OFFSET 8 -#define STATE_TASK_QUEUE_TASK_COUNTER_OFFSET 12 -#define STATE_TASK_QUEUE_NUM_TASKS_DONE_OFFSET 16 - //===================================================================================================================== // Counters used in encode phase @@ -1105,439 +187,20 @@ static_assert(TASK_LOOP_QBVH_TASKS_DONE_OFFSET == offsetof(TaskLoopCou #endif //===================================================================================================================== -#define REF_SCRATCH_SIDE_LEFT 0 -#define REF_SCRATCH_SIDE_RIGHT 1 -#define REF_SCRATCH_SIDE_LEAF 2 - -#define USE_BLAS_PRIM_COUNT 0 -struct TDRefScratch -{ - uint primitiveIndex; - uint nodeIndex; - float3 center; - BoundingBox box; - uint side; -#if USE_BVH_REBRAID - uint nodePointer; //rebraid only -#endif -#if USE_BLAS_PRIM_COUNT - uint numPrimitives; -#endif -}; - -#define TD_REF_PRIM_INDEX_OFFSET 0 -#define TD_REF_NODE_INDEX_OFFSET 4 -#define TD_REF_CENTER_OFFSET 8 -#define TD_REF_BOX_OFFSET 20 -#define 
TD_REF_SIDE_OFFSET (TD_REF_BOX_OFFSET + sizeof(BoundingBox)) -#define TD_REF_NODE_POINTER_OFFSET (TD_REF_SIDE_OFFSET + 4) -#if USE_BLAS_PRIM_COUNT -#define TD_REF_NUM_PRIM_OFFSET (TD_REF_NODE_POINTER_OFFSET + sizeof(uint)) -#endif - -//===================================================================================================================== -#define NUM_SPLIT_BINS 4 - -#define TD_NODE_REBRAID_STATE_OPEN 0 -#define TD_NODE_REBRAID_STATE_CLOSED 1 - -struct TDBins -{ - uint64_t firstRefIndex; - - UintBoundingBox binBoxes[3][NUM_SPLIT_BINS]; - uint binPrimCount[3][NUM_SPLIT_BINS]; - - uint bestAxis; - uint bestSplit; - uint numLeft; - uint numRight; - -#if USE_BLAS_PRIM_COUNT - uint binBLASPrimCount[3][NUM_SPLIT_BINS]; -#endif -}; - -#define TD_BINS_FIRST_REF_INDEX_OFFSET 0 -#define TD_BINS_BIN_BOXES_OFFSET (TD_BINS_FIRST_REF_INDEX_OFFSET + 8) -#define TD_BINS_BIN_PRIM_COUNT_OFFSET (TD_BINS_BIN_BOXES_OFFSET + sizeof(UintBoundingBox) * NUM_SPLIT_BINS * 3) -#define TD_BINS_BEST_AXIS_OFFSET (TD_BINS_BIN_PRIM_COUNT_OFFSET + sizeof(uint) * NUM_SPLIT_BINS * 3) -#define TD_BINS_BEST_SPLIT_OFFSET (TD_BINS_BEST_AXIS_OFFSET + 4) -#define TD_BINS_NUM_LEFT_OFFSET (TD_BINS_BEST_SPLIT_OFFSET + 4) -#define TD_BINS_NUM_RIGHT_OFFSET (TD_BINS_NUM_LEFT_OFFSET + 4) -#if USE_BLAS_PRIM_COUNT -#define TD_BINS_BLAS_PRIM_COUNT_OFFSET (TD_BINS_NUM_RIGHT_OFFSET + 4) -#endif - -struct TDNode -{ - UintBoundingBox centroidBox; - uint binsIndex; - uint childCount; - -#if USE_BVH_REBRAID - uint largestAxis; // rebraid only - float largestWidth; // rebraid only - uint rebraidState; // rebraid only - uint primIndex; // rebraid only -#endif -}; - -#define TD_NODE_CENTROID_BOX_OFFSET 0 -#define TD_NODE_BINS_INDEX_OFFSET (TD_NODE_CENTROID_BOX_OFFSET + sizeof(UintBoundingBox)) -#define TD_NODE_CHILD_COUNT_OFFSET (TD_NODE_BINS_INDEX_OFFSET + 4) -#define TD_NODE_LARGEST_AXIS_OFFSET (TD_NODE_CHILD_COUNT_OFFSET + 4) -#define TD_NODE_LARGEST_WIDTH_OFFSET (TD_NODE_LARGEST_AXIS_OFFSET + 4) -#define 
TD_NODE_REBRAID_STATE_OFFSET (TD_NODE_LARGEST_WIDTH_OFFSET + 4) -#define TD_NODE_PRIM_INDEX_OFFSET (TD_NODE_REBRAID_STATE_OFFSET + 4) - -//===================================================================================================================== - -#define TD_REBRAID_STATE_NO_OPEN 0 -#define TD_REBRAID_STATE_NEED_OPEN 1 -#define TD_REBRAID_STATE_OOM 2 - -#define TD_PHASE_INIT_STATE 0 -#define TD_PHASE_INIT_REFS_TO_LEAVES 1 -#define TD_PHASE_CHECK_NEED_ALLOC 2 -#define TD_PHASE_ALLOC_ROOT_NODE 3 -#define TD_PHASE_REBRAID_COUNT_OPENINGS 4 -#define TD_PHASE_REBRAID_CHECK_TERMINATION 5 -#define TD_PHASE_REBRAID_OPEN 6 -#define TD_PHASE_REBRAID_UPDATE_NODES 7 -#define TD_PHASE_BIN_REFS 8 -#define TD_PHASE_FIND_BEST_SPLIT 9 -#define TD_PHASE_SECOND_PASS 10 -#define TD_PHASE_UPDATE_NEW_NODES 11 -#define TD_PHASE_DONE 12 - -struct StateTDBuild -{ - uint numNodes; - uint numProcessedNodes; - uint numNodesAllocated; - uint numRefs; - uint numRefsAllocated; - uint numInactiveInstance; - UintBoundingBox rootCentroidBBox; - uint numLeaves; - uint binsCounter; - -#if USE_BVH_REBRAID - uint rebraidState; - uint leafAllocOffset; -#endif -}; - -#define STATE_TD_NUM_NODES_OFFSET 0 -#define STATE_TD_NUM_PROCESSED_NODES_OFFSET 4 -#define STATE_TD_NUM_NODES_ALLOCATED_OFFSET 8 -#define STATE_TD_NUM_REFS_OFFSET 12 -#define STATE_TD_NUM_REFS_ALLOCATED_OFFSET 16 -#define STATE_TD_NUM_INACTIVE_INSTANCE_OFFSET 20 -#define STATE_TD_CENTROID_BBOX_OFFSET 24 -#define STATE_TD_NUM_LEAVES_OFFSET (STATE_TD_CENTROID_BBOX_OFFSET + sizeof(UintBoundingBox)) -#define STATE_TD_BINS_COUNTER_OFFSET (STATE_TD_NUM_LEAVES_OFFSET + 4) -#define STATE_TD_REBRAID_STATE_OFFSET (STATE_TD_BINS_COUNTER_OFFSET + 4) -#define STATE_TD_LEAF_ALLOC_OFFSET_OFFSET (STATE_TD_REBRAID_STATE_OFFSET + 4) - -//===================================================================================================================== -struct Flags -{ - uint dataValid; - uint prefixSum; -}; - -#define FLAGS_DATA_VALID_OFFSET 0 
-#define FLAGS_PREFIX_SUM_OFFSET 4 - -#define DLB_KEYS_PER_THREAD 4 -#define DLB_KEYS_PER_GROUP (BUILD_THREADGROUP_SIZE * DLB_KEYS_PER_THREAD) - -#define DLB_VALID_SUM 0 -#define DLB_VALID_PREFIX_SUM 1 -#define NUM_DLB_VALID_TYPES 2 - -//===================================================================================================================== - -#define PLOC_PHASE_INIT 0 -#define PLOC_PHASE_FIND_NEAREST_NEIGHBOUR 1 -#define PLOC_PHASE_UPDATE_CLUSTER_COUNT 2 -#define PLOC_PHASE_DONE 3 - -struct StatePLOC -{ - uint numClusters; - uint internalNodesIndex; - uint clusterListIndex; - uint numClustersAlloc; -}; - -#define STATE_PLOC_NUM_CLUSTERS_OFFSET 0 -#define STATE_PLOC_INTERNAL_NODES_INDEX_OFFSET 4 -#define STATE_PLOC_CLUSTER_LIST_INDEX_OFFSET 8 -#define STATE_PLOC_NUM_CLUSTERS_ALLOC_OFFSET 12 - -//===================================================================================================================== -#define REBRAID_PHASE_CALC_SUM 0 -#define REBRAID_PHASE_OPEN 1 -#define REBRAID_PHASE_DONE 2 - -struct RebraidState -{ - float sumValue[2]; - uint mutex; - uint numLeafIndices; - uint iterationCount; -}; - -#define STATE_REBRAID_SUM_VALUE_OFFSET 0 -#define STATE_REBRAID_MUTEX_OFFSET (STATE_REBRAID_SUM_VALUE_OFFSET + 8) -#define STATE_REBRAID_NUM_LEAF_INDICES_OFFSET (STATE_REBRAID_MUTEX_OFFSET + 4) -#define STATE_REBRAID_ITERATION_COUNT_OFFSET (STATE_REBRAID_NUM_LEAF_INDICES_OFFSET + 4) - -#define REBRAID_KEYS_PER_THREAD 4 -#define REBRAID_KEYS_PER_GROUP (BUILD_THREADGROUP_SIZE * REBRAID_KEYS_PER_THREAD) - -//===================================================================================================================== -#define TS_PHASE_INIT 0 -#define TS_PHASE_CALC_SUM 1 -#define TS_PHASE_ALLOC_REFS 2 -#define TS_PHASE_SPLIT 3 -#define TS_PHASE_DONE 4 - -struct ScratchTSRef -{ - uint leafIndex; - uint numSplits; - - uint splitLeafBaseIndex; - - BoundingBox bbox; -}; - -struct ScratchTSState -{ - uint refListIndex; - uint numRefs; - uint 
numRefsAlloc; - float sum; - uint mutex; -}; - -#define STATE_TS_REF_LIST_INDEX_OFFSET 0 -#define STATE_TS_NUM_REFS_OFFSET STATE_TS_REF_LIST_INDEX_OFFSET + 4 -#define STATE_TS_NUM_REFS_ALLOC_OFFSET STATE_TS_NUM_REFS_OFFSET + 4 -#define STATE_TS_SUM_OFFSET STATE_TS_NUM_REFS_ALLOC_OFFSET + 4 -#define STATE_TS_MUTEX_OFFSET STATE_TS_SUM_OFFSET + 4 - -//===================================================================================================================== -struct IndexBufferInfo -{ - uint gpuVaLo; - uint gpuVaHi; - uint byteOffset; - uint format; -}; - -#define INDEX_BUFFER_INFO_GPU_VA_LO_OFFSET 0 -#define INDEX_BUFFER_INFO_GPU_VA_HI_OFFSET 4 -#define INDEX_BUFFER_INFO_BYTE_OFFSET_OFFSET 8 -#define INDEX_BUFFER_INFO_FORMAT_OFFSET 12 - -//===================================================================================================================== -enum RebraidType : uint -{ - Off = 0, // No Rebraid - V1 = 1, // First version of Rebraid - V2 = 2, // Second version of Rebraid -}; - -#define BUILD_MODE_LINEAR 0 -// BUILD_MODE_AC was 1, but it has been removed. 
-#define BUILD_MODE_PLOC 2 - -#define SAH_COST_TRIANGLE_INTERSECTION 1.5 -#define SAH_COST_AABBB_INTERSECTION 1 - -#define ENCODE_FLAG_ARRAY_OF_POINTERS 0x00000001 -#define ENCODE_FLAG_UPDATE_IN_PLACE 0x00000002 -#define ENCODE_FLAG_REBRAID_ENABLED 0x00000004 -#define ENCODE_FLAG_ENABLE_FUSED_INSTANCE_NODE 0x00000008 - -//===================================================================================================================== -struct IntersectionResult -{ -#if defined(__cplusplus) - IntersectionResult(int val) - { - memset(this, val, sizeof(IntersectionResult)); - } -#endif - float t; // Relative to tMin - uint nodeIndex; - float2 barycentrics; - uint geometryIndex; - uint primitiveIndex; - uint instNodePtr; - uint hitkind; - uint instanceContribution; - -#if DEVELOPER - uint numIterations; - uint maxStackDepth; - uint numRayBoxTest; - uint numCandidateHits; - uint numRayTriangleTest; - uint numAnyHitInvocation; - uint instanceIntersections; -#endif -}; - -//===================================================================================================================== -// Commit status -typedef uint COMMITTED_STATUS; - -#define COMMITTED_NOTHING 0 -#define COMMITTED_TRIANGLE_HIT 1 -#define COMMITTED_PROCEDURAL_PRIMITIVE_HIT 2 - -//===================================================================================================================== -// Candidate type -typedef uint CANDIDATE_STATUS; - -#define CANDIDATE_NON_OPAQUE_TRIANGLE 0 -#define CANDIDATE_PROCEDURAL_PRIMITIVE 1 -#define CANDIDATE_NON_OPAQUE_PROCEDURAL_PRIMITIVE 2 -#define CANDIDATE_EARLY_RAY_TERMINATE 4 - -#define INIT_LDS_STATE 0xFFFFFFFF - -//===================================================================================================================== -// Data required for system value intrinsics -struct RaySystemData -{ - uint currNodePtr; - float rayTCurrent; - uint instNodePtr; - uint instanceContribution; - uint geometryIndex; - uint primitiveIndex; - float2 
barycentrics; - uint frontFace; - float3 origin; - float3 direction; -}; +struct LutData {}; //===================================================================================================================== -#if DEFINE_RAYDESC || __cplusplus -// Ray description matching the D3D12 HLSL header -struct RayDesc -{ - float3 Origin; - float TMin; - float3 Direction; - float TMax; -#if __cplusplus - RayDesc() - : - Origin(float3(0, 0, 0)), - TMin(0.f), - Direction(float3(0, 0, 0)), - TMax(0.f) - {} +// different ways to encode the scene bounds used to generate morton codes - RayDesc(uint val) - { - memset(this, val, sizeof(RayDesc)); - } -#endif -}; +#ifdef __cplusplus +enum SceneBoundsCalculation : uint +#else +enum class SceneBoundsCalculation : uint32 #endif - -//===================================================================================================================== -// Internal RayQuery structure initialised at TraceRaysInline() -struct RayQueryInternal { -#if __cplusplus - RayQueryInternal(int val) { - memset(this, val, sizeof(RayQueryInternal)); - } -#endif - - // Internal query data holding address of current BVH and stack information. - // Additional data that may be required will be stored here. - uint bvhLo; - uint bvhHi; - uint topLevelBvhLo; - uint topLevelBvhHi; - uint stackPtr; - uint stackPtrTop; - uint stackNumEntries; - uint instNodePtr; - uint currNodePtr; - uint instanceHitContributionAndFlags; - uint prevNodePtr; - uint isGoingDown; - uint lastInstanceNode; - - RayDesc rayDesc; - float rayTMin; - uint rayFlags; - uint instanceInclusionMask; - - // Candidate system data - CANDIDATE_STATUS candidateType; - RaySystemData candidate; - - // Committed system data - COMMITTED_STATUS committedStatus; - RaySystemData committed; - - uint reserved; - - // Counter data - // @note We don't wrap these in DEVELOPER because it would result in mismatch of RayQuery struct size - // on the driver side when we're not using counters. 
- uint numRayBoxTest; - uint numRayTriangleTest; - uint numIterations; - uint maxStackDepthAndDynamicId; - uint clocks; - uint numCandidateHits; - uint instanceIntersections; - uint rayQueryObjId; + BasedOnGeometry = 0x0, + BasedOnGeometryWithSize = 0x1 }; -//===================================================================================================================== -struct HitGroupInfo -{ - uint2 closestHitId; - uint2 anyHitId; - uint2 intersectionId; - uint tableIndex; -}; - -//===================================================================================================================== -struct TriangleData -{ -#if __cplusplus - TriangleData(int val) - { - memset(this, val, sizeof(TriangleData)); - } - - TriangleData() : TriangleData(0) - {} -#endif - - float3 v0; ///< Vertex 0 - float3 v1; ///< Vertex 1 - float3 v2; ///< Vertex 2 -}; - -//===================================================================================================================== -struct LutData {}; - #endif diff --git a/tools/CompileRTShaders.py b/tools/CompileRTShaders.py index 67cd973..a967fdf 100644 --- a/tools/CompileRTShaders.py +++ b/tools/CompileRTShaders.py @@ -39,7 +39,6 @@ import shutil import glob import pathlib -from typing import List DWORDS_PER_LINE = 8 @@ -92,15 +91,18 @@ def getName(self): def isBVH(self): return not self.isLibrary() +# Explicitly pass the legacy RtIp level as a separate define so HLSL code can determine whether its GPURT_RTIP_LEVEL is the legacy one. 
+commonTraceDefines = f"GPURT_RTIP_LEGACY_LEVEL={maxLegacyRtIpLevel}" + traceShaderConfigs = [ - ShaderConfig(path="GpuRtLibrary.hlsl", outputName="GpuRtLibrarySw", defines="GPURT_RTIP_LEVEL=0"), - ShaderConfig(path="GpuRtLibrary.hlsl", outputName="GpuRtLibrarySwDev", defines="DEVELOPER=1,GPURT_RTIP_LEVEL=0"), + ShaderConfig(path="GpuRtLibrary.hlsl", outputName="GpuRtLibrarySw", defines=f"GPURT_RTIP_LEVEL=0,{commonTraceDefines}"), + ShaderConfig(path="GpuRtLibrary.hlsl", outputName="GpuRtLibrarySwDev", defines=f"DEVELOPER=1,GPURT_RTIP_LEVEL=0,{commonTraceDefines}"), # Below 2 lines will be removed after GPURT_MINIMUM_INTERFACE_MAJOR_VERSION is bumped to 48 - ShaderConfig(path="GpuRtLibrary.hlsl", outputName="GpuRtLibrary", defines="USE_HW_INTRINSIC=1,GPURT_RTIP_LEVEL=0"), - ShaderConfig(path="GpuRtLibrary.hlsl", outputName="GpuRtLibraryDev", defines="USE_HW_INTRINSIC=1,DEVELOPER=1,GPURT_RTIP_LEVEL=0"), + ShaderConfig(path="GpuRtLibrary.hlsl", outputName="GpuRtLibrary", defines=f"USE_HW_INTRINSIC=1,GPURT_RTIP_LEVEL=0,{commonTraceDefines}"), + ShaderConfig(path="GpuRtLibrary.hlsl", outputName="GpuRtLibraryDev", defines=f"USE_HW_INTRINSIC=1,DEVELOPER=1,GPURT_RTIP_LEVEL=0,{commonTraceDefines}"), - ShaderConfig(path="GpuRtLibrary.hlsl", outputName="GpuRtLibraryLegacy", defines=f"USE_HW_INTRINSIC=1,GPURT_RTIP_LEVEL={maxLegacyRtIpLevel}"), - ShaderConfig(path="GpuRtLibrary.hlsl", outputName="GpuRtLibraryDevLegacy", defines=f"USE_HW_INTRINSIC=1,DEVELOPER=1,GPURT_RTIP_LEVEL={maxLegacyRtIpLevel}"), + ShaderConfig(path="GpuRtLibrary.hlsl", outputName="GpuRtLibraryLegacy", defines=f"USE_HW_INTRINSIC=1,GPURT_RTIP_LEVEL={maxLegacyRtIpLevel},{commonTraceDefines}"), + ShaderConfig(path="GpuRtLibrary.hlsl", outputName="GpuRtLibraryDevLegacy", defines=f"USE_HW_INTRINSIC=1,DEVELOPER=1,GPURT_RTIP_LEVEL={maxLegacyRtIpLevel},{commonTraceDefines}"), ] bvhShaderConfigs = [ @@ -117,7 +119,7 @@ def isBVH(self): ShaderConfig(path="GenerateMortonCodes.hlsl", entryPoint="GenerateMortonCodes"), 
ShaderConfig(path="Rebraid.hlsl", entryPoint="Rebraid"), ShaderConfig(path="BuildBVH.hlsl", entryPoint="BuildBVH", defines="USE_BUILD_LBVH=1"), - ShaderConfig(path="BuildBVHPLOC.hlsl", entryPoint="BuildBVHPLOC"), + ShaderConfig(path="BuildPLOC.hlsl", entryPoint="BuildPLOC"), ShaderConfig(path="UpdateQBVH.hlsl", entryPoint="UpdateQBVH"), ShaderConfig(path="RefitBounds.hlsl", entryPoint="RefitBounds"), ShaderConfig(path="ClearBuffer.hlsl", entryPoint="ClearBuffer"), @@ -142,6 +144,9 @@ def isBVH(self): ShaderConfig(path="InitExecuteIndirect.hlsl", entryPoint="InitExecuteIndirect", outputName="InitExecuteIndirect"), ShaderConfig(path="PairCompression.hlsl", entryPoint="PairCompression"), ShaderConfig(path="MergeSort.hlsl", entryPoint="MergeSort"), + ShaderConfig(path="MergeSort.hlsl", entryPoint="MergeSortLocal"), + ShaderConfig(path="MergeSort.hlsl", entryPoint="MergeSortGlobalIteration"), + ShaderConfig(path="MergeSort.hlsl", entryPoint="MergeSortCopyLastLevel"), ShaderConfig(path="InitAccelerationStructure.hlsl", entryPoint="InitAccelerationStructure"), ShaderConfig(path="InitAccelerationStructure.hlsl", entryPoint="InitAccelerationStructure", defines="IS_UPDATE=1", outputName="InitUpdateAccelerationStructure"), ShaderConfig(path="BuildFastAgglomerativeLbvh.hlsl", entryPoint="BuildFastAgglomerativeLbvh"), @@ -179,8 +184,6 @@ def getBaseDxcCommandArgs(isBvh:bool, isLibrary:bool, isSpirv:bool): return dxcOptions -validationSpecialCaseDefines = {x.path:x.defines for x in list(filter(lambda a : a.defines is not None, bvhShaderConfigs))} - """ Combines args into an array of strings that can be used as compilation command by InvokeSubprocess. 
Output command lacks: filename, -M flag for listing includes and entrypoint-specific defines like USE_HW_INTRINSIC @@ -192,8 +195,7 @@ def getValidationCmdArgs(args) -> [str]: compilerPath = FixExePath(compilerPath) validateCommand = [compilerPath] - - validateCommand += getBaseDxcCommandArgs(True, True, False) + validateCommand += getBaseDxcCommandArgs(True, True, args.spirv) validateCommand += ["-Wno-misplaced-attributes"] # -Wmisplaced-attributes is triggered by [RootSignature()] # used by entrypoint code and compiled as library validateCommand += ['-Fo', 'temp.bin'] @@ -201,7 +203,7 @@ def getValidationCmdArgs(args) -> [str]: validateCommand += ['-DLIBRARY_COMPILATION'] #use defines from cmake - for d in args.defines.split(';'): + for d in args.defines.split(' '): d = d.strip() if d != '': validateCommand += ['-D' + d] @@ -212,33 +214,37 @@ def getValidationCmdArgs(args) -> [str]: validateCommand += ['-DUSE_HW_INTRINSIC=1'] #use include pathes from cmake - for p in args.includePaths.split(';'): + for p in args.includePaths.split(' '): p = p.strip() if p != '': validateCommand += ['-I', p] return validateCommand +def removeSuffix(path: pathlib.Path, suffix: str) -> pathlib.Path: + return pathlib.Path(path.as_posix()[:-len(suffix)]) + """ -Finds all hlsl-hlsli pairs of files under basePath (recursively). -Outputs dict of filenames (without extension) to pair of bools meaning (has_hlsl_implementation, has_hlsli_header) +Finds all implementation-interface pairs of files under basePath (recursively). 
+Outputs dict of filenames (without extension) to pair of bools meaning (has_implementation, has_interface) """ -def getHlslHlsliPairs(basePath: str) -> {str: (bool, bool)}: - # pairs -> {hlsl_hlsli_pair_path_without_extension: (has_hlsl, has_hlsli)} +def getImplInterfacePairs(directory: pathlib.Path, implementationSuffix: str, interfaceSuffix: str) -> {pathlib.Path, (bool, bool)}: + # pairs -> {pair_path_without_extension: (has_implementation, has_interface)} pairs = {} - # insert hlsl part of pairs - for hlslfile in glob.glob(basePath+"/**/*.hlsl", recursive=True): - withoutExtension = pathlib.Path(hlslfile).with_suffix("") - pairs[withoutExtension] = (True, False) + # insert implementation part of pairs + for implPath in directory.rglob("*" + implementationSuffix): + pairs[removeSuffix(implPath.resolve(), implementationSuffix)] = (True, False) - #insert hlsli part of pairs - for hlslifile in glob.glob(basePath+"/**/*.hlsli", recursive=True): - withoutExtension = pathlib.Path(hlslifile).with_suffix("") - hasHlslFile = pairs.get(withoutExtension, (False, False))[0] - pairs[withoutExtension] = (hasHlslFile, True) + # insert interface part of pairs + for interfacePath in directory.rglob("*" + interfaceSuffix): + withoutSuffix = removeSuffix(interfacePath.resolve(), interfaceSuffix) + hasImplFile = pairs.get(withoutSuffix, (False, False))[0] + pairs[withoutSuffix] = (hasImplFile, True) return pairs +validationSpecialCaseDefines = {x.path:x.defines for x in list(filter(lambda a : a.defines is not None, bvhShaderConfigs))} + """ Some files/functions can be included conditionally behind ifdefs. This function combines defines, so that we can test compilation with different combinations of defines. @@ -261,38 +267,34 @@ def getDefineCombos(path: pathlib.Path) -> [[str]]: """ shaderClean's hlsl-hlsli pair is considered clean when: -1. it does not include anything else than .hlsli files; -2. it does not include anything from outside of shaderClean directory. +1. 
let [(dir, suffix)] = allowedDirSuffix, it includes only -suffix files from dir/ directory, and +2. it does not include any other files except its own .hlsl file. """ -def validateIncludes(cmd: List[str], path: pathlib.Path, shadersCleanStr: str) -> bool: +def validateIncludes(cmd: [str], path: pathlib.Path, implSuffix: str, interfaceSuffix: str, + allowedDirSuffix: [(pathlib.Path, str)]) -> bool: + allowedDirSuffix = [(dirPath.as_posix(), suffix) for (dirPath, suffix) in allowedDirSuffix] listIncludesCmd = cmd + ["-M"] threadOutput = [] retVal = InvokeSubprocess(listIncludesCmd, None, threadOutput, linuxLibraryPath=listIncludesCmd[0], expectNoOutput=False) - assert retVal == 0, "Could not list includes of {0} with cmd {1} because:\n {2}".format(path, listIncludesCmd, threadOutput) + assert retVal == 0, "Could not list includes of {0} with cmd {1} because:\n {2}".format(path, listIncludesCmd, "\n".join(threadOutput)) - includedPaths = set() + includedFilesStr = set() for line in threadOutput[0].split("\n")[1:]: - includedPaths |= {pathlib.Path(line.strip(" \n\r\t\\/"))} - includedPaths -= {path.with_suffix(".hlsl")} - includedPaths -= {path.with_suffix(".hlsli")} - - # On windows, make sure that shadersCleanPath is also interpreted in the same way as hlsiStr via as_posix() otherwise - # use of a drive mapping may cause errors. 
- shadersCleanPath = pathlib.Path(shadersCleanStr) - shadersCleanStrPosix = str(shadersCleanPath.resolve().as_posix()) - - for hlsli in includedPaths: - hlsliStr = str(hlsli.resolve().as_posix()) - if hlsli.suffix != ".hlsli": - print("GPURT clean shader validation failed:") - print("\tIncluding non-hlsli files is not allowed.") - print("\t{0} includes {1}".format(path, hlsliStr)) - return False - - if shadersCleanStrPosix not in hlsliStr: + # use resolve() + as_posix() to avoid path mismatches when using drive mapping + includedFilesStr |= {pathlib.Path(line.strip(" \n\r\t\\/")).resolve().as_posix()} + includedFilesStr -= {path.as_posix() + implSuffix} + includedFilesStr -= {path.as_posix() + interfaceSuffix} + + for includedFileStr in includedFilesStr: + isAllowed = False + for (dirStr, suffix) in allowedDirSuffix: + if (includedFileStr.endswith(suffix)) and (dirStr in includedFileStr): + isAllowed = True + break + + if not isAllowed: print("GPURT clean shader validation failed:") - print("\tIncluding non-clean files is not allowed.") - print("\t{0} includes {1}".format(path, hlsliStr)) + print("\t{0} includes {1} which is not allowed.".format(path, includedFileStr)) return False return True @@ -301,12 +303,12 @@ def validateIncludes(cmd: List[str], path: pathlib.Path, shadersCleanStr: str) - hlsl-hlsli pairs must compile on its own. It tests whether pairs contain or include everything needed. If they do it allows including them anywhere in any order, except for some macros. 
""" -def validateCompilation(cmd: List[str], path: pathlib.Path, shadersCleanStr: str) -> bool: +def validateCompilation(cmd: [str], path: pathlib.Path) -> bool: threadOutput = [] retVal = InvokeSubprocess(cmd, None, threadOutput, linuxLibraryPath=cmd[0], expectNoOutput=False) if retVal != 0: print("GPURT clean shader validation failed:") - print("\tCould not compile {0} as library with cmd {1} because:\n {2}".format(path, cmd, threadOutput)) + print("\tCould not compile {0} as library with cmd {1} because:\n {2}".format(path, cmd, threadOutput[-1])) return False return True @@ -318,17 +320,20 @@ def validateCompilation(cmd: List[str], path: pathlib.Path, shadersCleanStr: str """ def validateShadersClean(args) -> bool: cmdBase = getValidationCmdArgs(args) - shadersCleanPath = pathlib.Path(FixInputPath(args.basepath)).parent.as_posix() + "/shadersClean" - shadersCleanStr = str(shadersCleanPath) + # use resolve() + as_posix() to avoid path mismatches when using drive mapping + srcPath = pathlib.Path(FixInputPath(args.basepath)).parent.resolve() + shadersCleanPath = srcPath / "shadersClean" - for path, (hasImpl, hasHeader) in getHlslHlsliPairs(shadersCleanPath).items(): + implExt = ".hlsl" + headerExt = ".hlsli" + for path, (hasImpl, hasHeader) in getImplInterfacePairs(shadersCleanPath, implExt, headerExt).items(): assert (hasImpl or hasHeader), "There should not be files without impl nor header." 
- fullPath = path.with_suffix(".hlsl" if hasImpl else ".hlsli") + fullPath = path.with_suffix(path.suffix + (implExt if hasImpl else headerExt)) for defines in getDefineCombos(fullPath): - compileCmd = cmdBase + defines + [str(fullPath.as_posix())] - if not validateIncludes(compileCmd, fullPath, shadersCleanStr): + compileCmd = cmdBase + defines + [fullPath.as_posix()] + if not validateIncludes(compileCmd, path, implExt, headerExt, [(shadersCleanPath, headerExt)]): return False - if not validateCompilation(compileCmd, fullPath, shadersCleanStr): + if not validateCompilation(compileCmd, fullPath): return False return True @@ -760,12 +765,20 @@ def main() -> int: args = parser.parse_args() if args.validateShadersClean: - print("Validating shadersClean directory") + print("Validating shadersClean directory.") tBegin = time.perf_counter() + validIncludes = validateShadersClean(args) + # For vulkan, we validate SPIR-V shaders in the same run instead of running the script again. + if args.vulkan and not args.spirv: + print("Now doing SPIR-V validation...") + args.spirv = True + validIncludes &= validateShadersClean(args) + tDuration = time.perf_counter() - tBegin if validIncludes: - print("Validated shadersClean directory in ", round(tDuration, 4)) + tDuration = round(time.perf_counter() - tBegin, 4) + print(f"Validated shadersClean directory in {tDuration}s.") else: print("Some files are not clean. See errors above.") return 0 if validIncludes else -1