diff --git a/CMakeLists.txt b/CMakeLists.txt
index ea92db7..7332192 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -99,6 +99,11 @@ option(GPURT_BUILD_CONTINUATION "GpuRt uses continuation traversal" ON)
 if (GPURT_BUILD_CONTINUATION)
     gpurt_add_compile_definitions(GPURT_BUILD_CONTINUATION=1)
 endif()
+
+cmake_dependent_option(GPURT_DEBUG_CONTINUATION_TRAVERSAL "Debug continuation traversal on legacy indirect path" OFF "GPURT_BUILD_CONTINUATION" OFF)
+if (GPURT_DEBUG_CONTINUATION_TRAVERSAL)
+    gpurt_add_compile_definitions(GPURT_DEBUG_CONTINUATION_TRAVERSAL=1)
+endif()
 #endif

 # Disable run time type information
diff --git a/backends/pal/gpurtPalBackend.cpp b/backends/pal/gpurtPalBackend.cpp
index d06394c..e4868dd 100644
--- a/backends/pal/gpurtPalBackend.cpp
+++ b/backends/pal/gpurtPalBackend.cpp
@@ -28,25 +28,6 @@
 namespace GpuRt
 {

-// =====================================================================================================================
-// GPURT to PAL enum conversions without undefined behavior.
-static Pal::HwPipePoint GpuRtToPalHwPipePoint(
-    HwPipePoint gpurtHwPipePoint)
-{
-#define HWPIPEPOINTCASE(x) case static_cast<uint32>(Pal::HwPipePoint::x): return Pal::HwPipePoint::x
-    switch (static_cast<uint32>(gpurtHwPipePoint))
-    {
-        HWPIPEPOINTCASE(HwPipeTop);
-        HWPIPEPOINTCASE(HwPipePreCs);
-        HWPIPEPOINTCASE(HwPipeBottom);
-    default:
-        PAL_ASSERT_ALWAYS_MSG("Unhandled HwPipePoint value in conversion: %u\n",
-                              static_cast<uint32>(gpurtHwPipePoint));
-        return Pal::HwPipePoint::HwPipeTop;
-    }
-#undef HWPIPEPOINTCASE
-}
-
 // =====================================================================================================================
 static Pal::ImmediateDataWidth GpuRtToPalImmediateDataWidth(
     ImmediateDataWidth gpurtImmediateDataWidth)
@@ -132,7 +113,11 @@ void PalBackend::Dispatch(
     uint32 z
     ) const
 {
+#if PAL_INTERFACE_MAJOR_VERSION >= 909
+    GetCmdBuffer(cmdBuffer)->CmdDispatch({ x, y, z }, {});
+#else
     GetCmdBuffer(cmdBuffer)->CmdDispatch({ x, y, z });
+#endif
 }

 // =====================================================================================================================
@@ -238,6 +223,7 @@ void PalBackend::InsertBarrier(
 {
     const bool syncDispatch     = flags & BarrierFlagSyncDispatch;
     const bool syncIndirectArgs = flags & BarrierFlagSyncIndirectArg;
+    const bool syncPreCpWrite   = flags & BarrierFlagSyncPreCpWrite;
     const bool syncPostCpWrite  = flags & BarrierFlagSyncPostCpWrite;

     Pal::ICmdBuffer* pCmdBuffer = GetCmdBuffer(cmdBuffer);
@@ -247,8 +233,16 @@ void PalBackend::InsertBarrier(
     if (syncDispatch || syncIndirectArgs)
     {
-        memoryBarrier.srcStageMask  = Pal::PipelineStageCs;
-        memoryBarrier.srcAccessMask = Pal::CoherShader;
+        memoryBarrier.srcStageMask  |= Pal::PipelineStageCs;
+        memoryBarrier.srcAccessMask |= Pal::CoherShader;
+    }
+
+    if (syncPreCpWrite)
+    {
+        memoryBarrier.srcStageMask  |= Pal::PipelineStagePostPrefetch;
+        memoryBarrier.srcAccessMask |= Pal::CoherShader;
+        memoryBarrier.dstStageMask  |= Pal::PipelineStagePostPrefetch;
+        memoryBarrier.dstAccessMask |= Pal::CoherCp;
     }

     if (syncPostCpWrite)
@@ -359,12 +353,11 @@ void PalBackend::UpdateMemory(
 // =====================================================================================================================
 void PalBackend::WriteTimestamp(
     ClientCmdBufferHandle  cmdBuffer,
-    HwPipePoint            hwPipePoint,
     const Pal::IGpuMemory& timeStampVidMem,
     uint64                 offset
     ) const
 {
-    GetCmdBuffer(cmdBuffer)->CmdWriteTimestamp(GpuRtToPalHwPipePoint(hwPipePoint), timeStampVidMem, offset);
+    GetCmdBuffer(cmdBuffer)->CmdWriteTimestamp(Pal::PipelineStageBottomOfPipe, timeStampVidMem, offset);
 }

 // =====================================================================================================================
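
Note: the new pre/post CP-write flags are meant to bracket a command-processor write between shader work, as the CopyASDeserializeMode change later in this patch does. A minimal usage sketch, assuming an `InsertBarrier(cmdBuffer, flags)` entry point like the backend's; `WriteCounterSafely` and `ResetTaskCounter` here are hypothetical stand-ins:

```cpp
// Sketch only: bracket a CP-side write so it neither races prior shader work
// nor is consumed by later dispatches before it lands.
void WriteCounterSafely(const IBackend& backend, ClientCmdBufferHandle cmdBuf, uint64 counterVa)
{
    backend.InsertBarrier(cmdBuf, BarrierFlagSyncPreCpWrite);  // drain prior shader writes before the CP write
    ResetTaskCounter(counterVa);                               // command-processor write
    backend.InsertBarrier(cmdBuf, BarrierFlagSyncPostCpWrite); // make the CP write visible to later dispatches
}
```
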
diff --git a/backends/pal/gpurtPalBackend.h b/backends/pal/gpurtPalBackend.h
index 7da603e..a5fc976 100644
--- a/backends/pal/gpurtPalBackend.h
+++ b/backends/pal/gpurtPalBackend.h
@@ -121,7 +121,6 @@ class PalBackend : public IBackend
     virtual void WriteTimestamp(
         ClientCmdBufferHandle  cmdBuffer,
-        HwPipePoint            hwPipePoint,
         const Pal::IGpuMemory& timeStampVidMem,
         uint64                 offset
         ) const override;
diff --git a/gpurt/gpurt.h b/gpurt/gpurt.h
index 5d9d8e2..ca70f8f 100644
--- a/gpurt/gpurt.h
+++ b/gpurt/gpurt.h
@@ -737,7 +737,7 @@ struct DeviceSettings
     uint32 numRebraidIterations;
     uint32 rebraidQualityHeuristic;
-    uint32 plocRadius;                  // PLOC Radius
+    uint32 plocRadius;                  // PLOC nearest neighbor search radius
     uint32 maxTopDownBuildInstances;    // Max instances allowed for top down build
     uint32 parallelBuildWavesPerSimd;   // Waves per SIMD to launch for parallel build
diff --git a/gpurt/gpurtBackend.h b/gpurt/gpurtBackend.h
index 00152dc..7463254 100644
--- a/gpurt/gpurtBackend.h
+++ b/gpurt/gpurtBackend.h
@@ -75,21 +75,13 @@ struct BufferViewInfo
     BufferViewSwizzle swizzle;
 };

-// =====================================================================================================================
-// Copy of Pal::HwPipePoint with values we use.
-enum class HwPipePoint : uint32
-{
-    HwPipeTop    = 0x0,
-    HwPipePreCs  = 0x1,
-    HwPipeBottom = 0x7,
-};
-
 // =====================================================================================================================
 enum BarrierFlags : uint32
 {
     BarrierFlagSyncDispatch    = 0x1, // Stall the following dispatch until all previous dispatches are done
     BarrierFlagSyncIndirectArg = 0x2, // Prepare previous shader output for indirect argument use
-    BarrierFlagSyncPostCpWrite = 0x4, // Prepare data set by CP for shader use
+    BarrierFlagSyncPreCpWrite  = 0x4, // Prepare for CP write
+    BarrierFlagSyncPostCpWrite = 0x8, // Prepare data set by CP for shader use
 };

 // =====================================================================================================================
@@ -185,7 +177,6 @@ class IBackend
     // Will eventually be replaced with a callback or other abstraction to avoid referencing video memory.
     virtual void WriteTimestamp(
         ClientCmdBufferHandle  cmdBuffer,
-        HwPipePoint            hwPipePoint,
         const Pal::IGpuMemory& timeStampVidMem,
         uint64                 offset) const = 0;
diff --git a/gpurt/gpurtBuildSettings.h b/gpurt/gpurtBuildSettings.h
index 5c73247..413f719 100644
--- a/gpurt/gpurtBuildSettings.h
+++ b/gpurt/gpurtBuildSettings.h
@@ -62,7 +62,7 @@ struct CompileTimeBuildSettings
     uint32 radixSortScanLevel;
     uint32 emitCompactSize;
     uint32 enableBVHBuildDebugCounters;
-    uint32 plocRadius;
+    uint32 nnSearchRadius;
     uint32 enablePairCostCheck;
     uint32 enableVariableBitsMortonCode;
     uint32 rebraidType;
@@ -112,7 +112,7 @@ struct CompileTimeBuildSettings
 #define BUILD_SETTINGS_DATA_RADIX_SORT_SCAN_LEVEL_ID 7
 #define BUILD_SETTINGS_DATA_EMIT_COMPACT_SIZE_ID 8
 #define BUILD_SETTINGS_DATA_ENABLE_BVH_BUILD_DEBUG_COUNTERS_ID 9
-#define BUILD_SETTINGS_DATA_PLOC_RADIUS_ID 10
+#define BUILD_SETTINGS_DATA_NN_SEARCH_RADIUS_ID 10
#define BUILD_SETTINGS_DATA_ENABLE_PAIR_COST_CHECK_ID 11
 #define BUILD_SETTINGS_DATA_ENABLE_VARIABLE_BITS_MC_ID 12
 #define BUILD_SETTINGS_DATA_REBRAID_TYPE_ID 13
diff --git a/gpurt/gpurtInlineFuncs.h b/gpurt/gpurtInlineFuncs.h
index ff377df..72a02f7 100644
--- a/gpurt/gpurtInlineFuncs.h
+++ b/gpurt/gpurtInlineFuncs.h
@@ -156,23 +156,6 @@ inline BufferViewFormat GetSingleComponentFormatForFormat(BufferViewFormat forma
     }
 }

-//=====================================================================================================================
-// Converts the value of a Pal::HwPipePoint into a GpuRt::HwPipePoint without undefined behavior.
-inline HwPipePoint PalToGpuRtHwPipePoint(uint32 palHwPipePoint)
-{
-#define HWPIPEPOINTCASE(x) case static_cast<uint32>(HwPipePoint::x): return HwPipePoint::x
-    switch (palHwPipePoint)
-    {
-        HWPIPEPOINTCASE(HwPipeTop);
-        HWPIPEPOINTCASE(HwPipePreCs);
-        HWPIPEPOINTCASE(HwPipeBottom);
-    default:
-        PAL_ASSERT_ALWAYS_MSG("Unhandled HwPipePoint value in conversion: %u\n", palHwPipePoint);
-        return HwPipePoint::HwPipeTop;
-    }
-#undef HWPIPEPOINTCASE
-}
-
 //=====================================================================================================================
 // Return the number of components for a buffer view format when it's used as a vertex format.
 inline uint8 GetNumComponentsForVertexFormat(VertexFormat format)
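
For context, the HWPIPEPOINTCASE pattern deleted above (in both files) is a value-checked enum conversion: only inputs matching a listed enumerator are ever converted, so an out-of-range value takes the checked fallback instead of being cast into an enum it was never declared for. A generic sketch of the idiom, with hypothetical names:

```cpp
#include <cstdint>

// Generic form of the removed idiom. Casting an arbitrary integer into an enum
// with a fixed underlying type is legal but can yield a value matching no
// enumerator; matching explicitly avoids ever producing such a value.
enum class Dst : uint32_t { A = 0, B = 1, C = 7 };

Dst ToDst(uint32_t raw)
{
    switch (raw)
    {
    case static_cast<uint32_t>(Dst::A): return Dst::A;
    case static_cast<uint32_t>(Dst::B): return Dst::B;
    case static_cast<uint32_t>(Dst::C): return Dst::C;
    default:                            return Dst::A; // checked fallback
    }
}
```

With the HwPipePoint parameter gone from WriteTimestamp, both conversion helpers lose their only caller, which is why the patch can delete them outright.
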
diff --git a/src/gpurtBvhBatcher.cpp b/src/gpurtBvhBatcher.cpp
index ba70d10..ae15781 100644
--- a/src/gpurtBvhBatcher.cpp
+++ b/src/gpurtBvhBatcher.cpp
@@ -104,7 +104,7 @@ void BvhBatcher::BuildAccelerationStructureBatch(
     // but otherwise do not participate in the rest of the build.
     if (isUpdate)
     {
-        builder.EmitPostBuildInfo();
+        builder.EmitPostBuildInfoDispatch();
     }
     else
     {
@@ -146,7 +146,11 @@ void BvhBatcher::BuildAccelerationStructureBatch(
     {
         RGP_PUSH_MARKER("Process Empty BVH builds");
         DispatchInitAccelerationStructure(emptyBuilders);
-        BuildPhase(emptyBuilders, &BvhBuilder::EmitPostBuildInfo);
+        if (PhaseEnabled(BuildPhaseFlags::SeparateEmitPostBuildInfoPass))
+        {
+            Barrier();
+            BuildPhase(emptyBuilders, &BvhBuilder::EmitPostBuildInfoDispatch);
+        }
         RGP_POP_MARKER();
     }

@@ -264,17 +268,10 @@ void BvhBatcher::BuildRaytracingAccelerationStructureBatch(
     {
         RGP_PUSH_MARKER("EmitPostBuildInfo");
         Barrier();
-        BuildPhase("Updates", updaters, &BvhBuilder::EmitPostBuildInfo);
-        BuildPhase("Builds", builders, &BvhBuilder::EmitPostBuildInfo);
-
+        BuildPhase(BuildPhaseFlags::SeparateEmitPostBuildInfoPass, updaters, &BvhBuilder::EmitPostBuildInfoDispatch);
+        BuildPhase(BuildPhaseFlags::SeparateEmitPostBuildInfoPass, builders, &BvhBuilder::EmitPostBuildInfoDispatch);
         RGP_POP_MARKER();
     }
-    else
-    {
-        // Execute EmitPostBuildInfo without any RGP markers
-        BuildPhase(updaters, &BvhBuilder::EmitPostBuildInfo);
-        BuildPhase(builders, &BvhBuilder::EmitPostBuildInfo);
-    }

     if (PhaseEnabled(BuildPhaseFlags::BuildDumpEvents))
     {
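
BuildPhase itself is not part of this diff; from the call sites above it presumably iterates a builder list and invokes one phase member function on each. A hypothetical sketch of the calling convention implied, using a pointer-to-member:

```cpp
// Hypothetical sketch only: the shape of BuildPhase implied by call sites like
// BuildPhase(emptyBuilders, &BvhBuilder::EmitPostBuildInfoDispatch).
template <typename BuilderSpan>
void BvhBatcher::BuildPhase(
    BuilderSpan builders,
    void (BvhBuilder::*phase)())
{
    for (BvhBuilder& builder : builders)
    {
        (builder.*phase)(); // e.g. invokes EmitPostBuildInfoDispatch on each builder
    }
}
```
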
diff --git a/src/gpurtBvhBuilder.cpp b/src/gpurtBvhBuilder.cpp
index 59287b6..898b172 100644
--- a/src/gpurtBvhBuilder.cpp
+++ b/src/gpurtBvhBuilder.cpp
@@ -1462,6 +1462,31 @@ void BvhBuilder::InitBuildConfig(
 #endif
     ;

+    // The builder supports one compacted size emit during the build itself. Additional postbuild info requires
+    // extra dispatches or CP writes.
+    uint32 emitCompactCount = 0;
+    for (uint32 i = 0; i < m_buildArgs.postBuildInfoDescCount; ++i)
+    {
+        AccelStructPostBuildInfo args = m_clientCb.pfnConvertAccelStructPostBuildInfo(m_buildArgs, i);
+        if (args.desc.infoType == AccelStructPostBuildInfoType::CompactedSize)
+        {
+            // Cache emit destination GPU VA for inlined emit from build shaders
+            m_emitCompactDstGpuVa = args.desc.postBuildBufferAddr.gpu;
+            emitCompactCount++;
+        }
+        else
+        {
+            m_buildConfig.nonInlinePostBuildEmits = true;
+        }
+    }
+
+    // If maxNumPrimitives == 0, we never execute a BVH build, so we always need a separate emit pass.
+    if ((emitCompactCount > 1) || (m_buildConfig.maxNumPrimitives == 0))
+    {
+        m_emitCompactDstGpuVa = 0;
+        m_buildConfig.nonInlinePostBuildEmits = true;
+        m_buildConfig.enableEmitCompactSizeDispatch = true;
+    }
 }

 // =====================================================================================================================
@@ -2194,7 +2219,10 @@ void BvhBuilder::InitBuildSettings()
         static_cast<uint32>(m_buildConfig.fp16BoxNodesInBlasMode);
     m_buildSettings.fp16BoxModeMixedSaThreshold = m_deviceSettings.fp16BoxModeMixedSaThresh;
     m_buildSettings.enableBVHBuildDebugCounters = m_deviceSettings.enableBVHBuildDebugCounters;
-    m_buildSettings.plocRadius                  = m_deviceSettings.plocRadius;
+    if (buildMode == BvhBuildMode::PLOC)
+    {
+        m_buildSettings.nnSearchRadius = m_deviceSettings.plocRadius;
+    }
     m_buildSettings.enablePairCostCheck          = m_deviceSettings.enablePairCompressionCostCheck;
     m_buildSettings.enableVariableBitsMortonCode = m_deviceSettings.enableVariableBitsMortonCodes;

@@ -2222,24 +2250,7 @@ void BvhBuilder::InitBuildSettings()
     m_buildSettings.rtIpLevel = static_cast<uint32>(m_pDevice->GetRtIpLevel());

-    uint32 emitBufferCount = 0;
-    for (uint32 i = 0; i < m_buildArgs.postBuildInfoDescCount; ++i)
-    {
-        AccelStructPostBuildInfo args = m_clientCb.pfnConvertAccelStructPostBuildInfo(m_buildArgs, i);
-        if (args.desc.infoType == AccelStructPostBuildInfoType::CompactedSize)
-        {
-            // Cache emit destination GPU VA for inlined emit from build shaders
-            m_emitCompactDstGpuVa = args.desc.postBuildBufferAddr.gpu;
-            emitBufferCount++;
-        }
-    }
-
-    if (emitBufferCount == 1)
-    {
-        // We only support one compacted emit size from the build shaders. If we have more than one emit
-        // destination buffers, we use the compute shader path
-        m_buildSettings.emitCompactSize = 1;
-    }
+    m_buildSettings.emitCompactSize = (m_emitCompactDstGpuVa != 0);

     m_buildSettings.doEncode = (m_buildConfig.needEncodeDispatch == false);

@@ -2313,8 +2324,10 @@ void BvhBuilder::GetAccelerationStructurePrebuildInfo(
     // the build when performing the update causing page faults.
     scratchDataSize = Util::Max(scratchDataSize, updateDataSize);

-    // Some applications crash when the driver reports 0 scratch size. Use 1 DWORD instead.
-    scratchDataSize = Util::Max(static_cast<uint32>(sizeof(uint32)), scratchDataSize);
+    // Some applications crash when the driver reports 0 scratch size.
+    // Additionally, the D3D12 debug layer does not like a scratch buffer
+    // that's only 4 bytes, so we pass back 8 bytes instead.
+    scratchDataSize = Util::Max(static_cast<uint32>(sizeof(uint64)), scratchDataSize);

     prebuildInfo.scratchDataSizeInBytes       = scratchDataSize;
     prebuildInfo.updateScratchDataSizeInBytes = updateDataSize;
@@ -2432,7 +2445,7 @@ void BvhBuilder::BuildRaytracingAccelerationStructure()

     if (m_buildArgs.postBuildInfoDescCount > 0)
     {
-        if (NeedsPostBuildEmitPass())
+        if (m_buildConfig.enableEmitCompactSizeDispatch)
         {
             // Make sure build is complete before emitting
             Barrier();
@@ -2513,7 +2526,6 @@ void BvhBuilder::PreBuildDumpEvents()
     if (result == Pal::Result::Success)
     {
         m_backend.WriteTimestamp(m_cmdBuffer,
-                                 HwPipePoint::HwPipeBottom,
                                  *m_dumpInfo.pTimeStampVidMem,
                                  m_dumpInfo.timeStampVidMemoffset);
     }
@@ -2530,7 +2542,6 @@ void BvhBuilder::PostBuildDumpEvents()
     if (m_dumpInfo.pTimeStampVidMem != nullptr)
     {
         m_backend.WriteTimestamp(m_cmdBuffer,
-                                 HwPipePoint::HwPipeBottom,
                                  *m_dumpInfo.pTimeStampVidMem,
                                  m_dumpInfo.timeStampVidMemoffset + sizeof(uint64));
     }
@@ -2739,23 +2750,17 @@ void BvhBuilder::EncodePrimitives()
 // Handles writing any requested postbuild information.
 void BvhBuilder::EmitPostBuildInfo()
 {
-    if (m_buildArgs.postBuildInfoDescCount == 0)
-    {
-        return;
-    }
-
     const uint32 resultDataSize = m_resultBufferInfo.dataSize;

     const bool isBottomLevel = (m_buildArgs.inputs.type == AccelStructType::BottomLevel);
-    const bool useSeparateEmitPass = NeedsPostBuildEmitPass();
+
     for (uint32 i = 0; i < m_buildArgs.postBuildInfoDescCount; i++)
     {
         const AccelStructPostBuildInfo args = m_clientCb.pfnConvertAccelStructPostBuildInfo(m_buildArgs, i);

         switch (args.desc.infoType)
         {
         case AccelStructPostBuildInfoType::CompactedSize:
-            // If maxNumPrimitives == 0, we never execute a BVH build, so we always need a separateEmitPass
-            if (useSeparateEmitPass || (m_buildConfig.maxNumPrimitives == 0))
+            if (m_buildConfig.enableEmitCompactSizeDispatch)
             {
                 EmitAccelerationStructurePostBuildInfo(args);
             }
@@ -2808,6 +2813,22 @@ void BvhBuilder::EmitPostBuildInfo()
     }
 }

+// =====================================================================================================================
+// Handles writing any requested postbuild information via dispatch (not CP writes).
+void BvhBuilder::EmitPostBuildInfoDispatch()
+{
+    for (uint32 i = 0; i < m_buildArgs.postBuildInfoDescCount; i++)
+    {
+        const AccelStructPostBuildInfo args = m_clientCb.pfnConvertAccelStructPostBuildInfo(m_buildArgs, i);
+
+        if ((args.desc.infoType != AccelStructPostBuildInfoType::CompactedSize) ||
+            m_buildConfig.enableEmitCompactSizeDispatch)
+        {
+            EmitAccelerationStructurePostBuildInfo(args);
+        }
+    }
+}
+
 // =====================================================================================================================
 // Emits post-build properties for a set of acceleration structures.
 // This enables applications to know the output resource requirements for performing acceleration structure
@@ -3137,6 +3158,7 @@ void BvhBuilder::CopyASDeserializeMode(
     };

     // Reset the task counter in destination buffer.
+    Barrier(BarrierFlagSyncPreCpWrite);
     ResetTaskCounter(copyArgs.dstAccelStructAddr.gpu);
     Barrier(BarrierFlagSyncPostCpWrite);

@@ -3195,7 +3217,7 @@ BuildPhaseFlags BvhBuilder::EnabledPhases() const
 {
     BuildPhaseFlags flags{};

-    if (NeedsPostBuildEmitPass())
+    if (m_buildConfig.nonInlinePostBuildEmits)
     {
         flags |= BuildPhaseFlags::SeparateEmitPostBuildInfoPass;
     }
@@ -3451,15 +3473,6 @@ bool BvhBuilder::AllowLatePairCompression() const
     return enableLatePairCompression;
 }

-// =====================================================================================================================
-// Returns true when the builder will require a separate dispatch for emitting build info
-bool BvhBuilder::NeedsPostBuildEmitPass() const
-{
-    const bool usesSeparateEmitPass = (m_buildArgs.postBuildInfoDescCount == 0) &&
-                                      (m_emitCompactDstGpuVa != 0) && (m_buildSettings.emitCompactSize == 0);
-    return usesSeparateEmitPass;
-}
-
 // =====================================================================================================================
 // Returns true when the builder has dumping events
 bool BvhBuilder::HasBuildDumpEvents() const
diff --git a/src/gpurtBvhBuilder.h b/src/gpurtBvhBuilder.h
index 806c518..c025041 100644
--- a/src/gpurtBvhBuilder.h
+++ b/src/gpurtBvhBuilder.h
@@ -224,6 +224,8 @@ class BvhBuilder
         bool enableMergeSort;
         bool enableInstanceRebraid;
         bool rebuildAccelStruct;
+        bool enableEmitCompactSizeDispatch;
+        bool nonInlinePostBuildEmits;
     };

     BvhBuilder(
@@ -321,6 +323,7 @@ class BvhBuilder
     void UpdateAccelerationStructure();

     void EmitPostBuildInfo();
+    void EmitPostBuildInfoDispatch();

     void EncodeUpdate();

@@ -413,7 +416,6 @@ class BvhBuilder
     // Optional phase checks
     bool AllowRebraid() const;
     bool AllowLatePairCompression() const;
-    bool NeedsPostBuildEmitPass() const;
     bool HasBuildDumpEvents() const;

     // Helper functions
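
The net effect of the new InitBuildConfig logic, condensed into one hypothetical helper for readability (the conditions mirror the patch; the struct and function names are illustrative only):

```cpp
#include <cstdint>

// Hypothetical condensation of the InitBuildConfig rules above; the field
// comments name the real flags each one corresponds to.
struct EmitConfig
{
    bool inlineCompactEmit;  // emitCompactSize: emitted inline by the build shaders
    bool compactViaDispatch; // enableEmitCompactSizeDispatch
    bool separateEmitPass;   // nonInlinePostBuildEmits
};

EmitConfig SelectEmitConfig(uint32_t compactQueries, uint32_t otherQueries, uint32_t maxNumPrimitives)
{
    EmitConfig cfg = {};
    // More than one compacted-size destination, or no build dispatch at all
    // (maxNumPrimitives == 0), forces the compacted size onto a dispatch.
    const bool needDispatch = (compactQueries > 1) || (maxNumPrimitives == 0);
    cfg.inlineCompactEmit  = (compactQueries > 0) && !needDispatch;
    cfg.compactViaDispatch = needDispatch;
    cfg.separateEmitPass   = (otherQueries > 0) || needDispatch;
    return cfg;
}
```

In the common case (exactly one CompactedSize query on a non-empty build) no separate emit pass is scheduled at all, which is what lets the batcher drop the unconditional EmitPostBuildInfo calls.
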
diff --git a/src/gpurtTraceSource.h b/src/gpurtTraceSource.h
index 3262892..1c609f9 100644
--- a/src/gpurtTraceSource.h
+++ b/src/gpurtTraceSource.h
@@ -71,7 +71,11 @@ class AccelStructTraceSource : public GpuUtil::ITraceSource
     }

     // Using this notification to do any preparation work that might be required before the trace begins.
+#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 908
+    virtual void OnTraceAccepted(uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf) override
+#else
     virtual void OnTraceAccepted() override
+#endif
     {
     }

@@ -134,7 +138,11 @@ class RayHistoryTraceSource : public GpuUtil::ITraceSource
     }

     // Using this notification to do any preparation work that might be required before the trace begins.
+#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 908
+    virtual void OnTraceAccepted(uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf) override
+#else
     virtual void OnTraceAccepted() override
+#endif
     {
     }
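
This is the same versioned-interface pattern as the CmdDispatch guard earlier in the patch: both sides of a PAL interface change are kept compilable, selected by the interface version macro. A sketch of how a caller might stay version-agnostic; the wrapper name is hypothetical:

```cpp
// Hypothetical wrapper: callers pass the richer argument set unconditionally,
// and the version guard decides which override signature actually exists.
void NotifyTraceAccepted(GpuUtil::ITraceSource* pSource, uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf)
{
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 908
    pSource->OnTraceAccepted(gpuIndex, pCmdBuf); // new signature
#else
    pSource->OnTraceAccepted();                  // legacy signature
#endif
}
```
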
diff --git a/src/shaders/BuildCommonScratch.hlsl b/src/shaders/BuildCommonScratch.hlsl
index b0d3197..648a11e 100644
--- a/src/shaders/BuildCommonScratch.hlsl
+++ b/src/shaders/BuildCommonScratch.hlsl
@@ -682,6 +682,32 @@ bool IsLeafOrIsCollapsed(
     return result;
 }

+//=====================================================================================================================
+uint GetMinimumNumOfTriangles()
+{
+    uint minNumOfTris = 2;
+    {
+        {
+            minNumOfTris = 0;
+        }
+    }
+
+    return minNumOfTris;
+}
+
+//=====================================================================================================================
+float GetTriangleIntersectionCost(uint numTris)
+{
+    float Ct;
+    {
+        {
+            Ct = SAH_COST_TRIANGLE_INTERSECTION * numTris;
+        }
+    }
+
+    return Ct;
+}
+
 //=====================================================================================================================
 void MergeScratchNodes(
     uint scratchNodesOffset,
@@ -724,18 +750,17 @@ void MergeScratchNodes(
     const uint numRight = FetchScratchNodeNumPrimitives(rightNode, IsLeafNode(rightNodeIndex, numActivePrims));

     const uint numTris = numLeft + numRight;

-    const float Ct =
-        SAH_COST_TRIANGLE_INTERSECTION;
-
     const float Ci = SAH_COST_AABBB_INTERSECTION;

     const float leftCost = IsLeafNode(leftNodeIndex, numActivePrims) ?
-        (Ct * ComputeBoxSurfaceArea(leftBounds)) : FetchScratchNodeCost(scratchNodesOffset, leftNodeIndex);
+        (GetTriangleIntersectionCost(numLeft) * ComputeBoxSurfaceArea(leftBounds)) :
+        FetchScratchNodeCost(scratchNodesOffset, leftNodeIndex);

     const float rightCost = IsLeafNode(rightNodeIndex, numActivePrims) ?
-        (Ct * ComputeBoxSurfaceArea(rightBounds)) : FetchScratchNodeCost(scratchNodesOffset, rightNodeIndex);
+        (GetTriangleIntersectionCost(numRight) * ComputeBoxSurfaceArea(rightBounds)) :
+        FetchScratchNodeCost(scratchNodesOffset, rightNodeIndex);

     const bool leftCollapse = (leftNode.numPrimitivesAndDoCollapse & 0x1) ||
                               IsLeafNode(leftNodeIndex, numActivePrims);
@@ -745,7 +770,7 @@ void MergeScratchNodes(
     float bestCost = leftCost + rightCost + Ci * mergedBoxSurfaceArea;

-    const float collapseCost = Ct * numTris;
+    const float collapseCost = GetTriangleIntersectionCost(numTris);
     const float splitCost = Ci + leftCost / mergedBoxSurfaceArea + rightCost / mergedBoxSurfaceArea;
diff --git a/src/shaders/BuildPLOC.hlsl b/src/shaders/BuildPLOC.hlsl
index 079f1bc..4c2f7b1 100644
--- a/src/shaders/BuildPLOC.hlsl
+++ b/src/shaders/BuildPLOC.hlsl
@@ -882,7 +882,7 @@ void BuildPLOC(
     plocArgs.baseBatchIndicesScratchOffset  = ShaderConstants.offsets.batchIndices;
     plocArgs.fp16BoxNodesInBlasMode         = Settings.fp16BoxNodesMode;
     plocArgs.fp16BoxModeMixedSaThresh       = Settings.fp16BoxModeMixedSaThreshold;
-    plocArgs.plocRadius                     = Settings.plocRadius;
+    plocArgs.plocRadius                     = Settings.nnSearchRadius;
     plocArgs.splitBoxesByteOffset           = ShaderConstants.offsets.triangleSplitBoxes;
     plocArgs.primIndicesSortedScratchOffset = ShaderConstants.offsets.primIndicesSorted;
     plocArgs.unsortedBvhLeafNodesOffset     = ShaderConstants.offsets.bvhLeafNodeData;
diff --git a/src/shaders/BuildParallel.hlsl b/src/shaders/BuildParallel.hlsl
index 8a3df86..15e197d 100644
--- a/src/shaders/BuildParallel.hlsl
+++ b/src/shaders/BuildParallel.hlsl
@@ -265,7 +265,7 @@ void BuildPloc(
     plocArgs.baseBatchIndicesScratchOffset  = ShaderConstants.offsets.batchIndices;
     plocArgs.fp16BoxNodesInBlasMode         = Settings.fp16BoxNodesMode;
     plocArgs.fp16BoxModeMixedSaThresh       = Settings.fp16BoxModeMixedSaThreshold;
-    plocArgs.plocRadius                     = Settings.plocRadius;
+    plocArgs.plocRadius                     = Settings.nnSearchRadius;
     plocArgs.splitBoxesByteOffset           = ShaderConstants.offsets.triangleSplitBoxes;
     plocArgs.primIndicesSortedScratchOffset = ShaderConstants.offsets.primIndicesSorted;
     plocArgs.unsortedBvhLeafNodesOffset     = ShaderConstants.offsets.bvhLeafNodeData;
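
The cost model in MergeScratchNodes compares collapsing a subtree into a single leaf against keeping the split, using the formulas visible above: `collapseCost = Ct * numTris` versus `splitCost = Ci + leftCost/SA + rightCost/SA`. A worked numeric example with hypothetical constants (the real SAH_COST_* values are defined elsewhere in GPURT):

```cpp
#include <cstdio>

// Hypothetical constants for illustration only.
constexpr float kCi = 1.0f; // cost of one box intersection (SAH_COST_AABBB_INTERSECTION)
constexpr float kCt = 1.5f; // cost per triangle intersection (SAH_COST_TRIANGLE_INTERSECTION)

int main()
{
    const float    mergedSa  = 4.0f; // surface area of the merged bounds
    const float    leftCost  = 3.0f; // child subtree costs, already SA-weighted
    const float    rightCost = 2.5f;
    const unsigned numTris   = 3;

    const float collapseCost = kCt * numTris;                                    // 4.5: one leaf, 3 triangle tests
    const float splitCost    = kCi + leftCost / mergedSa + rightCost / mergedSa; // 1.0 + 0.75 + 0.625 = 2.375

    // splitCost < collapseCost here, so the builder keeps the interior node.
    std::printf("collapse=%.3f split=%.3f\n", collapseCost, splitCost);
    return 0;
}
```

Making the leaf cost scale with the triangle count (GetTriangleIntersectionCost) rather than a flat Ct is what this hunk changes; the collapse decision now accounts for how many primitives a leaf would actually contain.
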
diff --git a/src/shaders/BuildQBVH.hlsl b/src/shaders/BuildQBVH.hlsl
index 60e527f..1fc9b0f 100644
--- a/src/shaders/BuildQBVH.hlsl
+++ b/src/shaders/BuildQBVH.hlsl
@@ -279,25 +279,19 @@ uint WritePrimitiveNode(
     const uint geometryIndexAndFlags = PackGeometryIndexAndFlags(geometryIndex, geometryFlags);

     const uint geometryPrimNodePtrsOffset = offsets.primNodePtrs + geometryInfo.primNodePtrsOffset;

-    const uint flattenedPrimIndex =
-        (geometryInfo.primNodePtrsOffset / sizeof(uint)) + scratchNode.left_or_primIndex_or_instIndex;
-
     uint numLeafsDone;
     ScratchGlobal.InterlockedAdd(ShaderConstants.offsets.qbvhGlobalStackPtrs + STACK_PTRS_NUM_LEAFS_DONE_OFFSET,
                                  1,
                                  numLeafsDone);

     {
-        uint destIndex;
-        if (IsTrianglePrimitiveBuild() &&
-            ((Settings.triangleCompressionMode != NO_TRIANGLE_COMPRESSION) || Settings.doTriangleSplitting))
-        {
-            destIndex = numLeafsDone;
-        }
-        else
-        {
-            destIndex = flattenedPrimIndex;
-        }
+        // Use 'numLeafsDone' as the destination index. This packs all leaf nodes together
+        // without any holes (invalid nodes) in between.
+        // Note: Packing the triangle nodes this way causes the primNodePtrs to access the
+        // triangle nodes in random order, which results in perf drops in some Rayperf scenes
+        // when built/updated using 'asb'. Since 'asb' is a synthetic app, we ignore this perf
+        // drop for now, but this change needs to be revisited if an actual game/benchmark
+        // shows the perf drop.
+        uint destIndex = numLeafsDone;

         const uint primitiveNodeSize = (nodeType == NODE_TYPE_USER_NODE_PROCEDURAL) ?
                                        USER_NODE_PROCEDURAL_SIZE :
diff --git a/src/shaders/BuildSettings.hlsli b/src/shaders/BuildSettings.hlsli
index ac6e315..2e5ff10 100644
--- a/src/shaders/BuildSettings.hlsli
+++ b/src/shaders/BuildSettings.hlsli
@@ -38,7 +38,7 @@
 [[vk::constant_id(BUILD_SETTINGS_DATA_RADIX_SORT_SCAN_LEVEL_ID)]]           uint radixSortScanLevel           = 0;
 [[vk::constant_id(BUILD_SETTINGS_DATA_EMIT_COMPACT_SIZE_ID)]]               uint emitCompactSize              = 0;
 [[vk::constant_id(BUILD_SETTINGS_DATA_ENABLE_BVH_BUILD_DEBUG_COUNTERS_ID)]] uint enableBVHBuildDebugCounters  = 0;
-[[vk::constant_id(BUILD_SETTINGS_DATA_PLOC_RADIUS_ID)]]                     uint plocRadius                   = 0;
+[[vk::constant_id(BUILD_SETTINGS_DATA_NN_SEARCH_RADIUS_ID)]]                uint nnSearchRadius               = 0;
 [[vk::constant_id(BUILD_SETTINGS_DATA_ENABLE_PAIR_COST_CHECK_ID)]]          uint enablePairCostCheck          = 0;
 [[vk::constant_id(BUILD_SETTINGS_DATA_ENABLE_VARIABLE_BITS_MC_ID)]]         uint enableVariableBitsMortonCode = 0;
 [[vk::constant_id(BUILD_SETTINGS_DATA_REBRAID_TYPE_ID)]]                    uint rebraidType                  = 0;
@@ -73,7 +73,7 @@ static const CompileTimeBuildSettings Settings = {
     radixSortScanLevel,
     emitCompactSize,
     enableBVHBuildDebugCounters,
-    plocRadius,
+    nnSearchRadius,
     enablePairCostCheck,
     enableVariableBitsMortonCode,
     rebraidType,
diff --git a/src/shaders/CompactAS1_1.hlsl b/src/shaders/CompactAS1_1.hlsl
index d9a165e..31af7df 100644
--- a/src/shaders/CompactAS1_1.hlsl
+++ b/src/shaders/CompactAS1_1.hlsl
@@ -391,40 +391,51 @@ void CompactASImpl1_1(
     // Copy leaf nodes
     if (type == TOP_LEVEL)
     {
-        for (uint nodeIndex = globalId; nodeIndex < srcHeader.numLeafNodes; nodeIndex += ShaderConstants.numThreads)
+        // Need to loop over all the prims, not just numLeafNodes.
+        for (uint nodeIndex = globalId; nodeIndex < srcHeader.numPrimitives; nodeIndex += ShaderConstants.numThreads)
         {
-            const uint nodeOffset
-                = nodeIndex * GetBvhNodeSizeLeaf(PrimitiveType::Instance, Settings.enableFusedInstanceNode);
-            const uint srcNodeDataOffset = srcOffsetDataLeafNodes + nodeOffset;
-            const uint dstNodeDataOffset = dstOffsetDataLeafNodes + nodeOffset;
+            // Since there could be invalid instance nodes, we need to skip over them. Invalid instance nodes
+            // will have corresponding prim node pointers as -1. So check for this and skip the node if invalid.
+            // Note: We don't need to skip invalid nodes for BLASs because their leaf nodes will be packed one
+            // after another, ie: no holes -> no invalid nodes.
+            const uint primNodePtrOffset = srcOffsetDataPrimNodePtrs + (nodeIndex * NODE_PTR_SIZE);

-            // Copy instance node
-            // Note, fused instance nodes are twice the size of normal instance nodes. We need to copy it correspondingly.
-            if (Settings.enableFusedInstanceNode)
-            {
-                const FusedInstanceNode node = SrcBuffer.Load<FusedInstanceNode>(srcNodeDataOffset);
-                DstMetadata.Store<FusedInstanceNode>(dstNodeDataOffset, node);
-            }
-            else
+            if (SrcBuffer.Load(primNodePtrOffset) != INVALID_IDX)
             {
-                const InstanceNode node = SrcBuffer.Load<InstanceNode>(srcNodeDataOffset);
-                DstMetadata.Store<InstanceNode>(dstNodeDataOffset, node);
-            }
+                const uint nodeOffset
+                    = nodeIndex * GetBvhNodeSizeLeaf(PrimitiveType::Instance, Settings.enableFusedInstanceNode);
+                const uint srcNodeDataOffset = srcOffsetDataLeafNodes + nodeOffset;
+                const uint dstNodeDataOffset = dstOffsetDataLeafNodes + nodeOffset;
+
+                // Copy instance node
+                // Note, fused instance nodes are twice the size of normal instance nodes. We need to copy it correspondingly.
+                if (Settings.enableFusedInstanceNode)
+                {
+                    const FusedInstanceNode node = SrcBuffer.Load<FusedInstanceNode>(srcNodeDataOffset);
+                    DstMetadata.Store<FusedInstanceNode>(dstNodeDataOffset, node);
+                }
+                else
+                {
+                    const InstanceNode node = SrcBuffer.Load<InstanceNode>(srcNodeDataOffset);
+                    DstMetadata.Store<InstanceNode>(dstNodeDataOffset, node);
+                }

-            // Top level acceleration structures do not have geometry info.
+                // Top level acceleration structures do not have geometry info.

-            const uint srcNodePointer = PackNodePointer(NODE_TYPE_USER_NODE_INSTANCE, srcOffsets.leafNodes + nodeOffset);
-            const uint dstNodePointer = PackNodePointer(NODE_TYPE_USER_NODE_INSTANCE, dstOffsets.leafNodes + nodeOffset);
+                const uint srcNodePointer = PackNodePointer(NODE_TYPE_USER_NODE_INSTANCE, srcOffsets.leafNodes + nodeOffset);
+                const uint dstNodePointer = PackNodePointer(NODE_TYPE_USER_NODE_INSTANCE, dstOffsets.leafNodes + nodeOffset);

-            // Update the parent pointer and fix up the child pointer in the parent node
-            UpdateParentPointerAndChildPointer(srcMetadataSizeInBytes,
-                                               srcNodePointer,
-                                               dstMetadataSizeInBytes,
-                                               dstNodePointer);
+                // Update the parent pointer and fix up the child pointer in the parent node
+                UpdateParentPointerAndChildPointer(srcMetadataSizeInBytes,
+                                                   srcNodePointer,
+                                                   dstMetadataSizeInBytes,
+                                                   dstNodePointer);
+            }
         }
     }
     else if (srcHeader.geometryType == GEOMETRY_TYPE_TRIANGLES)
     {
+        // Unlike TOP_LEVEL, this assumes that all leaf nodes are packed contiguously without any holes in between.
         for (uint nodeIndex = globalId; nodeIndex < srcHeader.numLeafNodes; nodeIndex += ShaderConstants.numThreads)
         {
             const uint nodeOffset = (nodeIndex * sizeof(TriangleNode));
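
The asymmetry handled here: TLAS leaf slots are addressed by instance index, so inactive instances leave holes that the prim node pointers flag with -1 (INVALID_IDX), while BLAS leaves are packed contiguously by the numLeafsDone path in BuildQBVH above. A small host-side sketch of the hole test the compaction loop performs per slot (names hypothetical):

```cpp
#include <cstdint>
#include <vector>

constexpr uint32_t INVALID_IDX = 0xFFFFFFFFu;

// Walk TLAS leaf slots the way the compaction loop above does: one slot per
// primitive (instance), skipping holes flagged in the prim node pointer array.
uint32_t CountLiveInstanceSlots(const std::vector<uint32_t>& primNodePtrs)
{
    uint32_t live = 0;
    for (uint32_t ptr : primNodePtrs)
    {
        if (ptr != INVALID_IDX) // hole: inactive/culled instance, nothing to copy
        {
            live++;
        }
    }
    return live;
}
```
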
diff --git a/src/shaders/Continuations1_1.hlsl b/src/shaders/Continuations1_1.hlsl
index 1d17e9d..09fb6b5 100644
--- a/src/shaders/Continuations1_1.hlsl
+++ b/src/shaders/Continuations1_1.hlsl
@@ -158,7 +158,7 @@ static _AmdTraversalState InitTraversalState1_1(
     uint schedulerState = TRAVERSAL_STATE_COMMITTED_NOTHING;
     traversal.committed.PackState(schedulerState);
-    traversal.committed.currNodePtr = INVALID_NODE;
+    traversal.committed.SetCurrNodePtr(INVALID_NODE);

     // Start traversing from root node
     traversal.reservedNodePtr = INVALID_NODE;
@@ -173,7 +173,7 @@ static _AmdTraversalState InitTraversalState1_1(
     traversal.stackPtr = stack.Pack();
     traversal.PackStackPtrTop(INVALID_NODE);

-#if GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP
+#if GPURT_DEBUG_CONTINUATION_TRAVERSAL
     traversal.committed.PackAnyHitCallType(0);
 #endif

@@ -354,7 +354,7 @@ static void TraversalInternal1_1(
                     candidate.PackInstanceContribution(instanceContributionToHitGroupIndex, hitKind);
                     candidate.PackGeometryIndex(primitiveData.geometryIndex);
                     candidate.PackIsOpaque(isOpaque);
-                    candidate.currNodePtr = nodePtr;
+                    candidate.SetCurrNodePtr(nodePtr);

                     bool hasAnyHit = false;
                     if ((rayForceOpaque == false) && (isOpaque == false))
@@ -416,9 +416,9 @@ static void TraversalInternal1_1(
                 candidate.PackGeometryIndex(primitiveData.geometryIndex);
                 candidate.PackIsOpaque(isOpaque);
                 candidate.PackInstanceContribution(instanceContributionToHitGroupIndex);
-                candidate.currNodePtr = nodePtr;
+                candidate.SetCurrNodePtr(nodePtr);

-#if GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP
+#if GPURT_DEBUG_CONTINUATION_TRAVERSAL
                 uint anyHitCallType = rayForceOpaque ? ANYHIT_CALLTYPE_SKIP : ANYHIT_CALLTYPE_DUPLICATE;
                 const bool noDuplicateAnyHit = (geometryFlags & D3D12_RAYTRACING_GEOMETRY_FLAG_NO_DUPLICATE_ANYHIT_INVOCATION);
                 anyHitCallType = noDuplicateAnyHit ? ANYHIT_CALLTYPE_NO_DUPLICATE : anyHitCallType;
diff --git a/src/shaders/Continuations2_0.hlsl b/src/shaders/Continuations2_0.hlsl
index 73293dc..283fe20 100644
--- a/src/shaders/Continuations2_0.hlsl
+++ b/src/shaders/Continuations2_0.hlsl
@@ -43,7 +43,7 @@ static _AmdTraversalState InitTraversalState2_0(
     uint schedulerState = TRAVERSAL_STATE_COMMITTED_NOTHING;
     traversal.committed.PackState(schedulerState);
-    traversal.committed.currNodePtr = INVALID_NODE;
+    traversal.committed.SetCurrNodePtr(INVALID_NODE);

     // Start traversing from root node
     traversal.reservedNodePtr = INVALID_NODE;
@@ -58,7 +58,7 @@ static _AmdTraversalState InitTraversalState2_0(
     traversal.PackStackPtrTop(0);

-#if GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP
+#if GPURT_DEBUG_CONTINUATION_TRAVERSAL
     traversal.committed.PackAnyHitCallType(0);
 #endif

@@ -72,14 +72,8 @@ static void TraversalInternal2_0(
     inout_param(_AmdPrimitiveSystemState) candidate,
     inout_param(float2)                   candidateBarycentrics)
 {
-    uint rayFlags = data.ray.Flags();
-
-    uint boxHeuristicMode = AmdTraceRayGetBoxSortHeuristicMode();
-    if ((boxHeuristicMode == BoxSortHeuristic::LargestFirstOrClosest) ||
-        (boxHeuristicMode == BoxSortHeuristic::LargestFirstOrClosestMidPoint))
-    {
-        boxHeuristicMode = GetBoxSortingHeuristicFromRayFlags(rayFlags, boxHeuristicMode);
-    }
+    const uint rayFlags = data.ray.Flags();
+    const uint boxHeuristicMode = GetBoxHeuristicMode();

     // Root bvh address for reuse
     const GpuVirtualAddress topBvhAddress = data.ray.AccelStruct();
@@ -322,7 +316,8 @@ static void TraversalInternal2_0(
                     committed.PackInstanceContribution(instanceContributionToHitGroupIndex, hitKind);
                     committed.PackGeometryIndex(primitiveData.geometryIndex,
                                                 TRAVERSAL_STATE_COMMITTED_TRIANGLE_HIT, false);
-                    committed.currNodePtr = nodePtr;
+                    committed.SetCurrNodePtr(nodePtr);
+
                     state = TRAVERSAL_STATE_COMMITTED_TRIANGLE_HIT;

                     // Exit traversal early if ray flags indicate end search after first hit
@@ -357,7 +352,8 @@ static void TraversalInternal2_0(
                         candidate.PackGeometryIndex(primitiveData.geometryIndex,
                                                     // This #ifdef is required until the legacy GPURT_RTIP_LEVEL == 0 lib has been removed:
                                                     TRAVERSAL_STATE_COMMITTED_TRIANGLE_HIT, isOpaque);
-                        candidate.currNodePtr = nodePtr;
+                        candidate.SetCurrNodePtr(nodePtr);
+
                         if (Options::getCpsCandidatePrimitiveMode() == CpsCandidatePrimitiveMode::DeferFirst)
                         {
                             haveCandidate = true;
@@ -412,10 +408,10 @@ static void TraversalInternal2_0(
                 candidate.PackGeometryIndex(primitiveData.geometryIndex);
                 candidate.PackIsOpaque(isOpaque);
                 candidate.PackInstanceContribution(instanceContributionToHitGroupIndex);
-                candidate.currNodePtr = nodePtr;
+                candidate.SetCurrNodePtr(nodePtr);
                 candidate.instNodePtr = data.traversal.instNodePtr;

-#if GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP
+#if GPURT_DEBUG_CONTINUATION_TRAVERSAL
                 // Determine anyHit shader call type
                 uint anyHitCallType = rayForceOpaque ? ANYHIT_CALLTYPE_SKIP : ANYHIT_CALLTYPE_DUPLICATE;
diff --git a/src/shaders/DecodeAS.hlsl b/src/shaders/DecodeAS.hlsl
index fb8ca1a..64cb0fd 100644
--- a/src/shaders/DecodeAS.hlsl
+++ b/src/shaders/DecodeAS.hlsl
@@ -170,7 +170,7 @@ void DecodeAS(in uint3 globalThreadId : SV_DispatchThreadID)
     }
     else // GEOMETRY_TYPE_AABBS
     {
-        DstBuffer.Store(dstGeometryDescOffset + GEOMETRY_DESC_AABB_COUNT_OFFSET,  geometryNumPrimitives);
+        DstBuffer.Store(dstGeometryDescOffset + GEOMETRY_DESC_AABB_COUNT_OFFSET, geometryNumPrimitives);
         DstBuffer.Store4(dstGeometryDescOffset + GEOMETRY_DESC_AABBS_OFFSET,
                          uint4(addressLo, addressHi, DECODE_PRIMITIVE_STRIDE_AABB, 0));
     }
diff --git a/src/shaders/GpuRtLibrary.hlsl b/src/shaders/GpuRtLibrary.hlsl
index 879590c..de4900c 100644
--- a/src/shaders/GpuRtLibrary.hlsl
+++ b/src/shaders/GpuRtLibrary.hlsl
@@ -33,6 +33,39 @@
 #include "TraceRayCommon.hlsl"
 #include "AccelStructTracker.hlsl"

+#ifdef __cplusplus
+extern uint g_rtIpLevel; // defined in cputraversal
+void _AmdSetRtip(uint rtIpLevel); // defined in cputraversal
+#endif
+
+// Only the default path (Continuation) provides _AmdGetRtip().
+static RayTracingIpLevel GetRtIpLevel()
+{
+#ifdef __cplusplus
+    switch (g_rtIpLevel)
+    {
+    case GPURT_RTIP1_1:
+        return RayTracingIpLevel::RtIp1_1;
+    case GPURT_RTIP2_0:
+        return RayTracingIpLevel::RtIp2_0;
+    default:
+        // Should never be called
+        GPU_ASSERT(false);
+        return RayTracingIpLevel::_None;
+    }
+#else // __cplusplus
+#if GPURT_DEBUG_CONTINUATION_TRAVERSAL
+    if (GPURT_RTIP_LEVEL == (uint)RayTracingIpLevel::_None)
+    {
+        return RayTracingIpLevel::_None;
+    }
+    return RayTracingIpLevel::RtIp2_0; // Default to RTIP 2.0
+#else // GPURT_DEBUG_CONTINUATION_TRAVERSAL
+    return _AmdGetRtip(); // Continuation path
+#endif
+#endif
+}
+
 #if GPURT_BUILD_CONTINUATION && LLPC_CLIENT_INTERFACE_MAJOR_VERSION
 // Include the continuations library
 #include "GpuRtLibraryCont.hlsl"
@@ -294,8 +327,13 @@ export void TraceRayInline2_0(
 export uint GetInstanceID(
     in uint64_t instanceNodePtr) // 64-bit instance node address
 {
-    const uint instanceIdAndMask = LoadDwordAtAddr(instanceNodePtr + INSTANCE_DESC_ID_AND_MASK_OFFSET);
-    return (instanceIdAndMask & 0x00ffffff);
+    uint instanceId = 0;
+    if (instanceNodePtr != 0)
+    {
+        const uint instanceIdAndMask = LoadDwordAtAddr(instanceNodePtr + INSTANCE_DESC_ID_AND_MASK_OFFSET);
+        instanceId = (instanceIdAndMask & 0x00ffffff);
+    }
+    return instanceId;
 }

 //=====================================================================================================================
@@ -303,7 +341,13 @@ export uint GetInstanceIndex(
     in uint64_t instanceNodePtr) // 64-bit instance node address
 {
-    return LoadDwordAtAddr(instanceNodePtr + sizeof(InstanceDesc) + RTIP1_1_INSTANCE_SIDEBAND_INSTANCE_INDEX_OFFSET);
+    uint instanceIndex = 0;
+    if (instanceNodePtr != 0)
+    {
+        instanceIndex = LoadDwordAtAddr(instanceNodePtr + sizeof(InstanceDesc) +
+                                        RTIP1_1_INSTANCE_SIDEBAND_INSTANCE_INDEX_OFFSET);
+    }
+    return instanceIndex;
 }

 //=====================================================================================================================
@@ -313,11 +357,16 @@ export float GetObjectToWorldTransform(
     in uint32_t row, // row index
     in uint32_t col) // column index
 {
-    const uint32_t elementOffset = (row * sizeof(float4)) + (col * sizeof(float));
-    return asfloat(LoadDwordAtAddr(instanceNodePtr +
-                                   sizeof(InstanceDesc) +
-                                   RTIP1_1_INSTANCE_SIDEBAND_OBJECT2WORLD_OFFSET +
-                                   elementOffset));
+    float transform = 0;
+    if (instanceNodePtr != 0)
+    {
+        const uint32_t elementOffset = (row * sizeof(float4)) + (col * sizeof(float));
+        transform = asfloat(LoadDwordAtAddr(instanceNodePtr +
+                                            sizeof(InstanceDesc) +
+                                            RTIP1_1_INSTANCE_SIDEBAND_OBJECT2WORLD_OFFSET +
+                                            elementOffset));
+    }
+    return transform;
 }

 //=====================================================================================================================
@@ -327,8 +376,14 @@ export float GetWorldToObjectTransform(
     in uint32_t row, // row index
     in uint32_t col) // column index
 {
-    const uint32_t elementOffset = (row * sizeof(float4)) + (col * sizeof(float));
-    return asfloat(LoadDwordAtAddr(instanceNodePtr + INSTANCE_DESC_WORLD_TO_OBJECT_XFORM_OFFSET + elementOffset));
+    float transform = 0;
+    if (instanceNodePtr != 0)
+    {
+        const uint32_t elementOffset = (row * sizeof(float4)) + (col * sizeof(float));
+        transform = asfloat(LoadDwordAtAddr(instanceNodePtr + INSTANCE_DESC_WORLD_TO_OBJECT_XFORM_OFFSET +
+                                            elementOffset));
+    }
+    return transform;
 }

 //=====================================================================================================================
@@ -336,17 +391,21 @@ export float GetWorldToObjectTransform(
 static float3x4 GetObjectToWorld3x4(
     in uint64_t instanceNodePtr)
 {
-    float3x4 transform;
-    switch (_AmdGetRtip())
-    {
-    default:
+    float3x4 transform = (float3x4)0;
+
+    if (instanceNodePtr != 0)
     {
-        const uint offset = RTIP1_1_INSTANCE_SIDEBAND_OBJECT2WORLD_OFFSET;
-        transform[0] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + sizeof(InstanceDesc) + offset + 0));
-        transform[1] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + sizeof(InstanceDesc) + offset + 16));
-        transform[2] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + sizeof(InstanceDesc) + offset + 32));
-        break;
-    }
+        switch (GetRtIpLevel())
+        {
+        default:
+        {
+            const uint offset = RTIP1_1_INSTANCE_SIDEBAND_OBJECT2WORLD_OFFSET;
+            transform[0] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + sizeof(InstanceDesc) + offset + 0));
+            transform[1] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + sizeof(InstanceDesc) + offset + 16));
+            transform[2] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + sizeof(InstanceDesc) + offset + 32));
+            break;
+        }
+        }
     }

     return transform;
@@ -357,20 +416,23 @@ static float3x4 GetObjectToWorld3x4(
 static float3x4 GetWorldToObject3x4(
     in uint64_t instanceNodePtr)
 {
-    float3x4 transform;
+    float3x4 transform = (float3x4)0;

-    switch (_AmdGetRtip())
+    if (instanceNodePtr != 0)
     {
-    default:
-    {
-        const uint offset = INSTANCE_DESC_WORLD_TO_OBJECT_XFORM_OFFSET;
-
-        transform[0] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + offset + 0));
-        transform[1] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + offset + 16));
-        transform[2] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + offset + 32));
-
-        break;
-    }
+        switch (GetRtIpLevel())
+        {
+        default:
+        {
+            const uint offset = INSTANCE_DESC_WORLD_TO_OBJECT_XFORM_OFFSET;
+
+            transform[0] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + offset + 0));
+            transform[1] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + offset + 16));
+            transform[2] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + offset + 32));
+
+            break;
+        }
+        }
     }

     return transform;
@@ -398,7 +460,12 @@ export uint64_t GetRayQuery64BitInstanceNodePtr(
     in uint64_t tlasBaseAddr,    // 64-bit TLAS base address
     in uint32_t instanceNodePtr) // Instance node pointer
 {
-    return CalculateNodeAddr64(tlasBaseAddr, instanceNodePtr);
+    uint64_t nodeAddr = 0;
+    if (instanceNodePtr != 0)
+    {
+        nodeAddr = CalculateNodeAddr64(tlasBaseAddr, instanceNodePtr);
+    }
+    return nodeAddr;
 }

 //=====================================================================================================================
@@ -429,7 +496,7 @@ static uint GetGeneralInstanceID(
     in uint64_t instNodeAddr) // 64-bit instance node address
 {
     uint id = 0;
-    switch (_AmdGetRtip())
+    switch (GetRtIpLevel())
     {
     default:
     {
@@ -447,7 +514,7 @@ static uint GetGeneralInstanceIndex(
     in uint64_t instNodeAddr) // 64-bit instance node address
 {
     uint index = 0;
-    RayTracingIpLevel rtip = _AmdGetRtip();
+    RayTracingIpLevel rtip = GetRtIpLevel();
     switch (rtip)
     {
     default:
@@ -467,7 +534,7 @@ static uint64_t GetRayQueryInstanceNodePtr(
     in uint32_t instanceNodePtr) // Instance node pointer
 {
     uint64_t instNodePtr = 0;
-    RayTracingIpLevel rtip = _AmdGetRtip();
+    RayTracingIpLevel rtip = GetRtIpLevel();
     switch (rtip)
     {
     default:
@@ -490,7 +557,7 @@ export RayQueryInternal _RayQuery_Allocate()
 export void _RayQuery_Abort(
     inout_param(RayQueryInternal) rayQuery)
 {
-    uint rtIp = (uint)_AmdGetRtip();
+    uint rtIp = (uint)GetRtIpLevel();
     if (rtIp >= (uint)RayTracingIpLevel::RtIp2_0)
     {
         rayQuery.currNodePtr = TERMINAL_NODE;
@@ -1011,7 +1078,7 @@ export TriangleData _RayQuery_FetchTrianglePosition(
     in bool committed) // Node pointer
 {
     TriangleData tdata;
-    RayTracingIpLevel rtip = _AmdGetRtip();
+    RayTracingIpLevel rtip = GetRtIpLevel();
     switch (rtip)
     {
     default:
@@ -1030,7 +1097,7 @@ export bool _RayQuery_Proceed(
     in uint  constRayFlags,
     in uint3 dispatchThreadId)
 {
-    uint rtIpLevel = ConvertRtIpLevel(_AmdGetRtip());
+    uint rtIpLevel = ConvertRtIpLevel(GetRtIpLevel());
     return RayQueryProceedCommon(
         rayQuery,
         constRayFlags,
@@ -1051,7 +1118,7 @@ export void _RayQuery_TraceRayInline(
     in RayDesc rayDesc,
     in uint3   dispatchThreadId)
 {
-    uint rtIpLevel = ConvertRtIpLevel(_AmdGetRtip());
+    uint rtIpLevel = ConvertRtIpLevel(GetRtIpLevel());
     TraceRayInlineCommon(rayQuery,
                          accelStructLo,
                          accelStructHi,
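
On the `& 0x00ffffff` in GetInstanceID above: the D3D12 instance descriptor packs a 24-bit InstanceID and an 8-bit InstanceMask into one dword, so the low 24 bits are the ID and the high 8 bits the mask. A self-contained sketch of the packing:

```cpp
#include <cassert>
#include <cstdint>

constexpr uint32_t kInstanceIdBits = 24;

// Unpack the dword at INSTANCE_DESC_ID_AND_MASK_OFFSET per the D3D12
// instance descriptor layout: InstanceID : 24, InstanceMask : 8.
uint32_t UnpackInstanceId(uint32_t idAndMask)   { return idAndMask & 0x00FFFFFFu; }
uint32_t UnpackInstanceMask(uint32_t idAndMask) { return idAndMask >> kInstanceIdBits; }

int main()
{
    const uint32_t packed = (0xFFu << kInstanceIdBits) | 0x123456u;
    assert(UnpackInstanceId(packed) == 0x123456u);
    assert(UnpackInstanceMask(packed) == 0xFFu);
    return 0;
}
```

The new `instanceNodePtr != 0` guards in these exports make a null instance node yield benign defaults (zero ID/index, zero transforms) instead of reading from address zero.
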
diff --git a/src/shaders/GpuRtLibraryCont.hlsl b/src/shaders/GpuRtLibraryCont.hlsl
index 293fae0..f706b3d 100644
--- a/src/shaders/GpuRtLibraryCont.hlsl
+++ b/src/shaders/GpuRtLibraryCont.hlsl
@@ -25,8 +25,6 @@
 // Include intrinsics and defines from the compiler
 #include "llpc/GpurtIntrinsics.h"

-#ifndef __cplusplus
-#endif
 #if DEVELOPER
 #include "../../gpurt/gpurtCounter.h"
 #endif

@@ -34,20 +32,21 @@
 #include "../shadersClean/common/Math.hlsli"
 #include "../shadersClean/common/InstanceDesc.hlsli"

-// By default, Gpurt exports both non-continuation and continuation traversal functions. Dxcp picks one based on panel
-// setting.
-// GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP = GPURT_RTIP1_1/GPURT_RTIP2_0
-// is only used for a debug purpose.
-// It supports DxcpRt (non-continuation) to use Continuation traversal. In this config, the pure continuation model does
-// not work.
-#ifndef GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP
-#define GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP 0
-#endif
+// Do not use ~0 as an invalid stack pointer, to leave it free to use as a sentinel value
+#define CPS_STACK_PTR_STACKLESS_DEAD_LANE (~uint32_t(1))
+// CPS stack pointers are dword-aligned, so we can use up to 2 bits. Use the second bit
+// to flag a dead lane, so in particular CPS_STACK_PTR_STACKLESS_DEAD_LANE identifies a dead lane
+#define CPS_STACK_PTR_DEAD_LANE_FLAG (2)
+#define CPS_STACK_PTR_INVALID (CPS_STACK_PTR_STACKLESS_DEAD_LANE & ~CPS_STACK_PTR_DEAD_LANE_FLAG)

-#if ((GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP == 0) && (!defined(__cplusplus)))
-#define CONTINUATION_ON_GPU 1
-#else
-#define CONTINUATION_ON_GPU 0
+#define DEAD_SHADER_ADDR (~uint32_t(0))
+
+static bool RtIpIsAtLeast(RayTracingIpLevel level)
+{
+    return ((uint32_t)GetRtIpLevel()) >= ((uint32_t)level);
+}
+
+#ifndef __cplusplus
 #endif

 #define REMAT_INSTANCE_RAY 1
@@ -96,37 +95,6 @@
 #define SCHEDULING_PRIORITY_CALLABLE 6
 // Maximum supported value (3 bits): 7

-#if CONTINUATION_ON_GPU == 0
-#ifdef __cplusplus
-extern uint g_rtIpLevel; // defined in cputraversal
-void _AmdSetRtip(uint rtIpLevel); // defined in cputraversal
-#endif
-static RayTracingIpLevel _AmdGetRtip()
-{
-    RayTracingIpLevel rtIpLevel = RayTracingIpLevel::_None;
-#ifdef __cplusplus
-    switch (g_rtIpLevel)
-#else
-    switch (GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP)
-#endif
-    {
-    case GPURT_RTIP1_1:
-        rtIpLevel = RayTracingIpLevel::RtIp1_1;
-        break;
-    case GPURT_RTIP2_0:
-        rtIpLevel = RayTracingIpLevel::RtIp2_0;
-        break;
-    }
-
-    return rtIpLevel;
-}
-#endif
-
-static bool RtIpIsAtLeast(RayTracingIpLevel level)
-{
-    return ((uint32_t)_AmdGetRtip()) >= ((uint32_t)level);
-}
-
 //=====================================================================================================================
 static uint GetPriorityForShaderType(
     DXILShaderKind shaderKind)
@@ -146,6 +114,63 @@ static uint GetPriorityForShaderType(
 // Forward declaration for _AmdDispatchSystemData.PackDispatchId() and _AmdDispatchSystemData.DispatchId()
 static uint3 GetDispatchRaysDimensions();

+//=====================================================================================================================
+// Apply the known set/unset bits
+static uint ApplyKnownFlags(
+    uint incomingFlags)
+{
+    uint flags = incomingFlags;
+
+#if GPURT_CLIENT_INTERFACE_MAJOR_VERSION >= 41
+    // Apply known bits common to all TraceRay calls
+    flags = ((flags & ~AmdTraceRayGetKnownUnsetRayFlags()) | AmdTraceRayGetKnownSetRayFlags());
+#endif
+
+    // Apply options overrides
+    flags &= ~Options::getRayFlagsOverrideForceDisableMask();
+    flags |= Options::getRayFlagsOverrideForceEnableMask();
+
+    return flags;
+}
+
+//=====================================================================================================================
+// Apply compile time pipeline config flags only, it does not apply known common flags from TraceRay call sites
+static uint ApplyCompileTimePipelineConfigFlags(
+    uint incomingFlags)
+{
+    uint flags = incomingFlags;
+
+    flags |= (AmdTraceRayGetStaticFlags() & (PIPELINE_FLAG_SKIP_PROCEDURAL_PRIMITIVES | PIPELINE_FLAG_SKIP_TRIANGLES));
+#if DEVELOPER
+    flags |= DispatchRaysConstBuf.profileRayFlags;
+#endif
+
+    return flags;
+}
+
+//=====================================================================================================================
+// Apply all static known flags, include both compile time pipeline config flags and known set/unset bits
+static uint ApplyAllStaticallyKnownFlags(
+    uint incomingFlags) // The flags from TraceRay call sites,
+                        // 0 means get Pipeline flags for all shaders in this pipeline
+{
+    return ApplyCompileTimePipelineConfigFlags(ApplyKnownFlags(incomingFlags));
+}
+
+//=====================================================================================================================
+// Get the box sort heuristic mode according to the pipeline flags
+static uint GetBoxHeuristicMode()
+{
+    uint boxHeuristicMode = AmdTraceRayGetBoxSortHeuristicMode();
+    if ((boxHeuristicMode == BoxSortHeuristic::LargestFirstOrClosest) ||
+        (boxHeuristicMode == BoxSortHeuristic::LargestFirstOrClosestMidPoint))
+    {
+        boxHeuristicMode = GetBoxSortingHeuristicFromRayFlags(ApplyAllStaticallyKnownFlags(0), boxHeuristicMode);
+    }
+
+    return boxHeuristicMode;
+}
+
 //=====================================================================================================================
 struct Vpc64 {
@@ -181,6 +206,7 @@ struct Vpc64 {
         const uint firstMetadataBit = 32;
         const uint firstPriorityBitInMetadata = 16;
         GPU_ASSERT((vpc & 0xFFFF000000000000) == 0);
+        vpc &= 0x0000FFFFFFFFFFFF;
         vpc |= (prio64 << (firstMetadataBit + firstPriorityBitInMetadata));
         return Vpc64(vpc);
     }
@@ -216,24 +242,37 @@ struct Vpc32 {
     bool IsValid()
     {
-        return GetFunctionAddr() != 0;
+        return vpc != 0;
     }

-    void SetPriority(uint priority)
+    Vpc32 SetPriority(uint priority)
     {
+        if (_AmdIsLlpc())
+        {
+            return Vpc32(vpc);
+        }
+
+        vpc &= ~0x7;
         vpc |= priority;
+
+        return Vpc32(vpc);
     }

     uint GetPriority()
     {
         return (uint)(vpc & 0x7);
     }
+
+    static Vpc32 MakeWithPriority(Vpc32 vpc32, uint priority)
+    {
+        return vpc32.SetPriority(priority);
+    }
 };

 //=====================================================================================================================
 // 32-bit function pointer packing/unpacking
 //
-static Vpc64 Vpc32ToVpc64(Vpc32 vpc32, bool unpackPriority)
+static Vpc64 Vpc32ToVpc64(Vpc32 vpc32)
 {
     if (_AmdIsLlpc())
     {
@@ -242,10 +281,7 @@ static Vpc64 Vpc32ToVpc64(Vpc32 vpc32, bool unpackPriority)

     Vpc64 vpc64 = Vpc64((uint64_t)(vpc32.GetFunctionAddr()));

-    if (unpackPriority)
-    {
-        vpc64.SetPriority(vpc32.GetPriority());
-    }
+    vpc64.SetPriority(vpc32.GetPriority());

     return vpc64;
 }
@@ -315,8 +351,10 @@ struct _AmdDispatchSystemData
         return dispatchId;
     }

-    static _AmdDispatchSystemData MakeDeadLaneWithStack();
-    static _AmdDispatchSystemData MakeDeadLaneWithoutStack();
+    void SetDead(bool withStack)
+    {
+        nextNodePtr = withStack ? DEAD_LANE_WITH_STACK : DEAD_LANE_WITHOUT_STACK;
+    }

     uint dispatchLinearId; // Packed dispatch linear id. Combine x/y/z into 1 DWORD.
@@ -358,27 +396,12 @@ struct _AmdRaySystemState
     // Incoming flags are the flags passed by TraceRay call
     uint IncomingFlags()
     {
-        uint incomingFlags = uint(bitFieldExtract64(packedAccelStruct, 48, 12));
-#if GPURT_CLIENT_INTERFACE_MAJOR_VERSION >= 41
-        // Apply known bits common to all TraceRay calls
-        incomingFlags = ((incomingFlags & ~AmdTraceRayGetKnownUnsetRayFlags()) | AmdTraceRayGetKnownSetRayFlags());
-#endif
-        // Apply options overrides
-        incomingFlags &= ~Options::getRayFlagsOverrideForceDisableMask();
-        incomingFlags |= Options::getRayFlagsOverrideForceEnableMask();
-
-        return incomingFlags;
+        return uint(bitFieldExtract64(packedAccelStruct, 48, 12));
     }

     uint Flags()
     {
-        uint rayFlags = IncomingFlags();
-        // Apply compile time pipeline config flags into the ray flags
-        rayFlags |= (AmdTraceRayGetStaticFlags() & (PIPELINE_FLAG_SKIP_PROCEDURAL_PRIMITIVES | PIPELINE_FLAG_SKIP_TRIANGLES));
-#if DEVELOPER
-        rayFlags |= DispatchRaysConstBuf.profileRayFlags;
-#endif
-        return rayFlags;
+        return ApplyAllStaticallyKnownFlags(IncomingFlags());
     }

     void SetAnyHitDidAccept(bool value)
@@ -421,7 +444,7 @@ struct _AmdPrimitiveSystemState
         packedGeometryIndex(0),
         packedInstanceContribution(0)
         , currNodePtr(INVALID_IDX)
-#if GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP
+#if GPURT_DEBUG_CONTINUATION_TRAVERSAL
         , packedType(0)
 #endif
     {
@@ -441,6 +464,10 @@ struct _AmdPrimitiveSystemState
                                        // hitKind [31 : 24]
     uint currNodePtr;
+    void SetCurrNodePtr(uint p)
+    {
+        currNodePtr = p;
+    }

     uint GeometryIndex()
     {
@@ -519,7 +546,7 @@ struct _AmdPrimitiveSystemState
         packedInstanceContribution = bitFieldInsert(packedInstanceContribution, 24, 8, hitKind);
     }

-#if GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP
+#if GPURT_DEBUG_CONTINUATION_TRAVERSAL
     // The following member data are only used in DEBUG
     uint packedType; // IsProcedural:   [31]    - 1 bit
                      // AnyhitCallType: [1 : 0] - 2 bits
@@ -598,9 +625,7 @@ struct _AmdTraversalState
     // field becomes re-used for something else in non-rebraid mode.
     uint reservedNodePtr; // RTIPv2.0 (lastNodePtr)

-#if GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP == 0
     uint32_t packedReturnAddr; // The address of the function to return to, packed into 32 bits.
-#endif

     uint InstanceContribution()
     {
@@ -629,16 +654,16 @@ struct _AmdTraversalState

     void PackStackPtrTop(uint ptr)
     {
-        GPU_ASSERT((_AmdGetRtip() == RayTracingIpLevel::RtIp1_1) ||
-                   (_AmdGetRtip() == RayTracingIpLevel::RtIp2_0));
+        GPU_ASSERT((GetRtIpLevel() == RayTracingIpLevel::RtIp1_1) ||
+                   (GetRtIpLevel() == RayTracingIpLevel::RtIp2_0));

         packedStackTopOrParentPointer = ptr;
     }

     uint StackPtrTop()
     {
-        GPU_ASSERT((_AmdGetRtip() == RayTracingIpLevel::RtIp1_1) ||
-                   (_AmdGetRtip() == RayTracingIpLevel::RtIp2_0));
+        GPU_ASSERT((GetRtIpLevel() == RayTracingIpLevel::RtIp1_1) ||
+                   (GetRtIpLevel() == RayTracingIpLevel::RtIp2_0));

         return packedStackTopOrParentPointer;
     }
@@ -659,14 +684,14 @@ struct _AmdTraversalState
         return committed.State();
     }

-    void PackReturnAddress(Vpc64 returnAddr)
+    void SetReturnAddress(Vpc32 returnAddr)
     {
-        packedReturnAddr = Vpc64ToVpc32(returnAddr).GetU32();
+        packedReturnAddr = returnAddr.GetU32();
     }

-    Vpc64 ReturnAddress()
+    Vpc32 ReturnAddress()
     {
-        return Vpc32ToVpc64(Vpc32(packedReturnAddr), true);
+        return Vpc32(packedReturnAddr);
     }
 };

@@ -712,6 +737,26 @@ struct _AmdRayHistoryCounter
 };
 #endif

+namespace Traits
+{
+
+static bool HasStacklessDeadLanes()
+{
+    return false;
+}
+
+static bool HasStackfulDeadLanes()
+{
+    return Options::getPersistentLaunchEnabled();
+}
+
+static bool HasDeadLanes()
+{
+    return HasStackfulDeadLanes() || HasStacklessDeadLanes();
+}
+
+} // namespace Traits
+
 //=====================================================================================================================
 struct _AmdSystemData
 {
@@ -723,52 +768,56 @@ struct _AmdSystemData

     bool IsDeadLaneWithoutStack()
     {
-        // This type of dead lane is only possible when the continuations stack is in global memory.
-        // Explicitly check the compile time setting to help the compiler eliminate unnecessary code at runtime.
-        return (dispatch.nextNodePtr == DEAD_LANE_WITHOUT_STACK) && _AmdContinuationStackIsGlobal();
+        return Traits::HasStacklessDeadLanes() && dispatch.nextNodePtr == DEAD_LANE_WITHOUT_STACK;
     }

     bool IsDeadLaneWithStack()
     {
-        // This type of dead lane is only possible when persistent launch is enabled.
-        // Explicitly check the compile time setting to help the compiler eliminate unnecessary code at runtime.
-        return (dispatch.nextNodePtr == DEAD_LANE_WITH_STACK) && Options::getPersistentLaunchEnabled();
+        return Traits::HasStackfulDeadLanes() && dispatch.nextNodePtr == DEAD_LANE_WITH_STACK;
     }
+
+    bool IsDeadLane()
+    {
+        return IsDeadLaneWithoutStack() || IsDeadLaneWithStack();
+    }

     bool IsTraversal()
     {
+        GPU_ASSERT(!IsDeadLane());
         return IsValidNode(dispatch.nextNodePtr);
     }

     bool IsChsOrMiss(in uint state)
     {
+        GPU_ASSERT(!IsDeadLane());
         return (state >= TRAVERSAL_STATE_COMMITTED_NOTHING);
     }

     bool IsMiss(in uint state)
     {
+        GPU_ASSERT(!IsDeadLane());
         return IsChsOrMiss(state) && !IsValidNode(traversal.committed.instNodePtr);
     }

     bool IsAhs(in uint state)
     {
+        GPU_ASSERT(!IsDeadLane());
         return (state == TRAVERSAL_STATE_CANDIDATE_NON_OPAQUE_TRIANGLE);
     }

     bool IsIs(in uint state)
     {
+        GPU_ASSERT(!IsDeadLane());
         return ((state == TRAVERSAL_STATE_CANDIDATE_PROCEDURAL_PRIMITIVE) ||
                 (state == TRAVERSAL_STATE_CANDIDATE_NON_OPAQUE_PROCEDURAL_PRIMITIVE));
     }

     bool IsChs(in uint state)
     {
+        GPU_ASSERT(!IsDeadLane());
         return IsChsOrMiss(state) && IsValidNode(traversal.committed.instNodePtr);
     }

-    static _AmdSystemData MakeDeadLaneWithStack();
-    static _AmdSystemData MakeDeadLaneWithoutStack();
-
     // Note: _AmdDispatchSystemData must be the first member of _AmdSystemData. This allows us to save some VGPRs if
     // we need to call a function that takes _AmdSystemData but doesn't actually need ray or traversal data.
     // For example, the launch kernel can make a dead lane and enqueue traversal with just dispatch.nextNodePtr.
@@ -816,24 +865,30 @@ struct _AmdTraversalResultData
           // 2) otherwise the first hit non-opaque primitive.
 };

-#if GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP == 0
+#if ((GPURT_DEBUG_CONTINUATION_TRAVERSAL == 0) && (!defined(__cplusplus)))
 // Define specialized intrinsics.
 // We use macros because HLSL does not have varargs or generics.
 // The macros and intrinsics are defined by llpc.
-DECLARE_ENQUEUE(, uint64_t returnAddr, _AmdSystemData data)
+DECLARE_ENQUEUE(, uint32_t returnAddr, _AmdSystemData data)

-DECLARE_ENQUEUE(Traversal, uint64_t dummyReturnAddr, _AmdSystemData data)
-DECLARE_ENQUEUE(TraversalDead, uint64_t dummyReturnAddr, _AmdDispatchSystemData data)
-DECLARE_ENQUEUE(RayGen, uint64_t dummyReturnAddr, _AmdDispatchSystemData data)
+DECLARE_ENQUEUE(Traversal, uint32_t dummyReturnAddr, _AmdSystemData data)
+DECLARE_ENQUEUE(TraversalDead, uint32_t dummyReturnAddr, _AmdDispatchSystemData data)
+DECLARE_ENQUEUE(RayGen, uint32_t dummyReturnAddr, _AmdDispatchSystemData data)

-DECLARE_ENQUEUE(AnyHit, uint64_t returnAddr, _AmdAnyHitSystemData data, float2 candidateBarycentrics)
-DECLARE_ENQUEUE(Intersection, uint64_t returnAddr, _AmdAnyHitSystemData data)
+DECLARE_ENQUEUE(AnyHit, uint32_t returnAddr, _AmdAnyHitSystemData data, float2 candidateBarycentrics)
+DECLARE_ENQUEUE(Intersection, uint32_t returnAddr, _AmdAnyHitSystemData data)

-DECLARE_AWAIT(AnyHit, _AmdAnyHitSystemData, uint64_t returnAddr, _AmdAnyHitSystemData data)
-DECLARE_AWAIT(CallShader, _AmdDispatchSystemData, uint64_t returnAddr, _AmdDispatchSystemData data)
+DECLARE_AWAIT(AnyHit, _AmdAnyHitSystemData, uint32_t returnAddr, _AmdAnyHitSystemData data)
+DECLARE_AWAIT(CallShader, _AmdDispatchSystemData, uint32_t returnAddr, _AmdDispatchSystemData data)

+#ifndef PASS_DUMMY_RET_ADDR
 // No returnAddr argument. The return address is instead included in the passed system data.
 DECLARE_AWAIT(Traversal, _AmdDispatchSystemData, _AmdSystemData data)
+#else // PASS_DUMMY_RET_ADDR
+// Pass a dummy return address for consistency reasons.
+// The actual return address is included in the passed system data.
+DECLARE_AWAIT(Traversal, _AmdDispatchSystemData, VpcIntTy dummyReturnAddr, _AmdSystemData data) +#endif DECLARE_RESTORE_SYSTEM_DATA(, _AmdDispatchSystemData data) DECLARE_RESTORE_SYSTEM_DATA(AnyHit, _AmdAnyHitSystemData data) @@ -853,64 +908,37 @@ DECLARE_CONT_STACK_LOAD_LAST_USE(U32, uint32_t) DECLARE_CONT_STACK_STORE(U32, uint32_t value) DECLARE_CONT_STACK_LOAD_LAST_USE(U64, uint64_t) DECLARE_CONT_STACK_STORE(U64, uint64_t value) -#endif - -inline _AmdDispatchSystemData _AmdDispatchSystemData::MakeDeadLaneWithStack() -{ - _AmdDispatchSystemData data = _AmdGetUninitializedDispatchSystemData(); - data.nextNodePtr = DEAD_LANE_WITH_STACK; - return data; -} - -inline _AmdDispatchSystemData _AmdDispatchSystemData::MakeDeadLaneWithoutStack() -{ - _AmdDispatchSystemData data = _AmdGetUninitializedDispatchSystemData(); - data.nextNodePtr = DEAD_LANE_WITHOUT_STACK; - return data; -} - -inline _AmdSystemData _AmdSystemData::MakeDeadLaneWithStack() -{ - _AmdSystemData data = _AmdGetUninitializedSystemData(); - data.dispatch.nextNodePtr = DEAD_LANE_WITH_STACK; - return data; -} - -inline _AmdSystemData _AmdSystemData::MakeDeadLaneWithoutStack() +#else // ((GPURT_DEBUG_CONTINUATION_TRAVERSAL == 0) && (!defined(__cplusplus))) +//===================================================================================================================== +inline _AmdDispatchSystemData _AmdGetUninitializedDispatchSystemData() { - _AmdSystemData data = _AmdGetUninitializedSystemData(); - data.dispatch.nextNodePtr = DEAD_LANE_WITHOUT_STACK; - return data; + return (_AmdDispatchSystemData)0; } //===================================================================================================================== -// Return the argument. -static Vpc64 GetVpc64FromShaderId(Vpc32 shaderId, uint priority) +inline _AmdSystemData _AmdGetUninitializedSystemData() { - Vpc64 vpc64 = Vpc32ToVpc64(shaderId, /* unpackPriority = */ false); - vpc64.SetPriority(priority); - return vpc64; + return (_AmdSystemData)0; } +#endif //===================================================================================================================== -static Vpc64 GetVpc64FromShaderIdAddr(GpuVirtualAddress addr, uint priority) +static Vpc32 GetVpcFromShaderIdAddr(GpuVirtualAddress addr) { #ifdef __cplusplus return 1; #else - Vpc32 shaderId = Vpc32(ConstantLoadDwordAtAddr(addr)); - return GetVpc64FromShaderId(shaderId, priority); + return Vpc32(ConstantLoadDwordAtAddr(addr)); #endif } //===================================================================================================================== -static Vpc64 GetVpc64FromShaderIdTable( +static Vpc32 GetVpcFromShaderIdTable( GpuVirtualAddress tableAddress, uint index, - uint stride, - uint priority) + uint stride) { - return GetVpc64FromShaderIdAddr(tableAddress + stride * index, priority); + return GetVpcFromShaderIdAddr(tableAddress + stride * index); } //===================================================================================================================== @@ -929,15 +957,6 @@ static Vpc32 GetAnyHit32BitShaderId( return Vpc32(ConstantLoadDwordAtAddr(tableVa + offset + 8)); } -//===================================================================================================================== -// Returns the 64-bit VPC for the given AHS by loading its shader address, and setting the AHS priority. 
-static Vpc64 GetAnyHitAddr( - uint hitGroupRecordIndex) -{ - Vpc32 shaderId = GetAnyHit32BitShaderId(hitGroupRecordIndex); - return GetVpc64FromShaderId(shaderId, SCHEDULING_PRIORITY_AHS); -} - //===================================================================================================================== // Returns whether the corresponding AHS is non-null. static bool AnyHitIsNonNull( @@ -1002,13 +1021,6 @@ static float4x3 WorldToObject4x3(in uint64_t tlasBaseAddr, in uint instNodePtr) return transpose(WorldToObject3x4(tlasBaseAddr, instNodePtr)); } -//===================================================================================================================== -// Implementation of DispatchRaysIndex. -export uint3 _cont_DispatchRaysIndex3(in _AmdDispatchSystemData data) -{ - return data.DispatchId(); -} - //===================================================================================================================== // Load dispatch dimensions from constant buffer. static uint3 GetDispatchRaysDimensions() @@ -1035,78 +1047,6 @@ static uint GetPersistentDispatchSize() return min(DispatchRaysConstBuf.rayDispatchMaxGroups, groupsNeeded); } -//===================================================================================================================== -// Implementation of DispatchRaysDimensions(). -export uint3 _cont_DispatchRaysDimensions3(in _AmdDispatchSystemData data) -{ - return GetDispatchRaysDimensions(); -} - -#if CONTINUATION_ON_GPU -//===================================================================================================================== -// Return the hit state for AnyHit and Intersection -export _AmdPrimitiveSystemState _cont_GetCandidateState(in _AmdAnyHitSystemData data) -{ - return data.candidate; -} - -//===================================================================================================================== -// Return the hit state for ClosestHit -export _AmdPrimitiveSystemState _cont_GetCommittedState(in _AmdSystemData data) -{ - return data.traversal.committed; -} - -//===================================================================================================================== -export float3 _cont_WorldRayOrigin3(in _AmdSystemData state) -{ - return state.ray.origin; -} - -//===================================================================================================================== -export float3 _cont_WorldRayDirection3(in _AmdSystemData state) -{ - return state.ray.direction; -} - -//===================================================================================================================== -export float _cont_RayTMin(in _AmdSystemData state) -{ - return state.ray.tMin; -} - -//===================================================================================================================== -export uint _cont_RayFlags(in _AmdSystemData state) -{ - return state.ray.IncomingFlags(); -} - -//===================================================================================================================== -export uint _cont_InstanceInclusionMask(in _AmdSystemData data) -{ - return ExtractInstanceInclusionMask(data.ray.traceParameters); -} - -//===================================================================================================================== -export float _cont_RayTCurrent(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) -{ - if (_AmdGetShaderKind() == DXILShaderKind::Intersection) - { - // The intersection shader is an exception. 
While the system data is usually about the candidate hit, the - // current t must be from the committed hit. - primitive = _cont_GetCommittedState(data); - } - - float tCurrentHw = 0.f; - { - tCurrentHw = primitive.rayTCurrent; - } - - // AMD Gpu shifts the origin, so rayTCurrent is between 0 and (tMaxApp - tMinApp). Add tMinApp back for App's use. - return tCurrentHw + data.ray.tMin; -} -#endif - //===================================================================================================================== // Map a thread to a ray, some threads could end up with non-existent (invalid) rays. // Note D3D12_DISPATCH_RAYS_DESC::(w x h x d) are organized to DispatchDims = (?, d, 1). @@ -1190,77 +1130,156 @@ static uint3 GetDispatchId(uint width, uint height, uint dispatchId) return uint3(xTile * TileWidth + x, yTile * TileHeight + y, z); } +#ifdef __cplusplus //===================================================================================================================== -export uint _cont_InstanceIndex(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +// Helper function for cpp only +static float3 mul(in float3 v, in float4x3 m) { - - return ConstantLoadDwordAtAddr( - GetInstanceNodeAddr(data.ray.AccelStruct(), primitive.instNodePtr) + - INSTANCE_NODE_EXTRA_OFFSET + RTIP1_1_INSTANCE_SIDEBAND_INSTANCE_INDEX_OFFSET); + float3 r; + r.x = dot(m[0], v); + r.y = dot(m[1], v); + r.z = dot(m[2], v); + return r; } +#endif +#if ((GPURT_DEBUG_CONTINUATION_TRAVERSAL == 0) && (!defined(__cplusplus))) //===================================================================================================================== -export uint _cont_InstanceID(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +// Implementation of DispatchRaysIndex. +export uint3 _cont_DispatchRaysIndex3(in _AmdDispatchSystemData data) { - - return ConstantLoadDwordAtAddr( - GetInstanceNodeAddr(data.ray.AccelStruct(), primitive.instNodePtr) + INSTANCE_DESC_ID_AND_MASK_OFFSET) & 0x00ffffff; + return data.DispatchId(); } //===================================================================================================================== -export uint _cont_GeometryIndex(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +// Implementation of DispatchRaysDimensions(). 
+export uint3 _cont_DispatchRaysDimensions3(in _AmdDispatchSystemData data) { - return primitive.GeometryIndex(); + return GetDispatchRaysDimensions(); } //===================================================================================================================== -export uint _cont_PrimitiveIndex(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +// Return the hit state for AnyHit and Intersection +export _AmdPrimitiveSystemState _cont_GetCandidateState(in _AmdAnyHitSystemData data) { - return primitive.primitiveIndex; + return data.candidate; } //===================================================================================================================== -export float4x3 _cont_ObjectToWorld4x3(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +// Return the hit state for ClosestHit +export _AmdPrimitiveSystemState _cont_GetCommittedState(in _AmdSystemData data) { - return ObjectToWorld4x3(data.ray.AccelStruct(), primitive.instNodePtr); + return data.traversal.committed; } //===================================================================================================================== -export float4x3 _cont_WorldToObject4x3(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +export float3 _cont_WorldRayOrigin3(in _AmdSystemData state) { - return WorldToObject4x3(data.ray.AccelStruct(), primitive.instNodePtr); + return state.ray.origin; } //===================================================================================================================== -export TriangleData _cont_TriangleVertexPositions(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +export float3 _cont_WorldRayDirection3(in _AmdSystemData state) { - const GpuVirtualAddress instanceAddr = GetInstanceNodeAddr(data.ray.AccelStruct(), primitive.instNodePtr); - { - return FetchTriangleFromNode(GetInstanceAddr(FetchInstanceDescAddr(instanceAddr)), primitive.currNodePtr); - } + return state.ray.direction; } -#ifdef __cplusplus //===================================================================================================================== -// Helper function for cpp only -static float3 mul(in float3 v, in float4x3 m) +export float _cont_RayTMin(in _AmdSystemData state) { - float3 r; - r.x = dot(m[0], v); - r.y = dot(m[1], v); - r.z = dot(m[2], v); - return r; + return state.ray.tMin; } -#endif //===================================================================================================================== -export float3 _cont_ObjectRayOrigin3(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +export uint _cont_RayFlags(in _AmdSystemData state) { - return mul(float4(data.ray.origin, 1.0), WorldToObject4x3(data.ray.AccelStruct(), primitive.instNodePtr)); + // Get the flags passed by TraceRay call and apply the known set/unset bits. 
+ return ApplyKnownFlags(state.ray.IncomingFlags()); } //===================================================================================================================== -export float3 _cont_ObjectRayDirection3(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +export uint _cont_InstanceInclusionMask(in _AmdSystemData data) +{ + return ExtractInstanceInclusionMask(data.ray.traceParameters); +} + +//===================================================================================================================== +export float _cont_RayTCurrent(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +{ + if (_AmdGetShaderKind() == DXILShaderKind::Intersection) + { + // The intersection shader is an exception. While the system data is usually about the candidate hit, the + // current t must be from the committed hit. + primitive = _cont_GetCommittedState(data); + } + + float tCurrentHw = 0.f; + { + tCurrentHw = primitive.rayTCurrent; + } + + // AMD Gpu shifts the origin, so rayTCurrent is between 0 and (tMaxApp - tMinApp). Add tMinApp back for App's use. + return tCurrentHw + data.ray.tMin; +} + +//===================================================================================================================== +export uint _cont_InstanceIndex(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +{ + + return ConstantLoadDwordAtAddr( + GetInstanceNodeAddr(data.ray.AccelStruct(), primitive.instNodePtr) + + INSTANCE_NODE_EXTRA_OFFSET + RTIP1_1_INSTANCE_SIDEBAND_INSTANCE_INDEX_OFFSET); +} + +//===================================================================================================================== +export uint _cont_InstanceID(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +{ + + return ConstantLoadDwordAtAddr( + GetInstanceNodeAddr(data.ray.AccelStruct(), primitive.instNodePtr) + INSTANCE_DESC_ID_AND_MASK_OFFSET) & 0x00ffffff; +} + +//===================================================================================================================== +export uint _cont_GeometryIndex(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +{ + return primitive.GeometryIndex(); +} + +//===================================================================================================================== +export uint _cont_PrimitiveIndex(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +{ + return primitive.primitiveIndex; +} + +//===================================================================================================================== +export float4x3 _cont_ObjectToWorld4x3(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +{ + return ObjectToWorld4x3(data.ray.AccelStruct(), primitive.instNodePtr); +} + +//===================================================================================================================== +export float4x3 _cont_WorldToObject4x3(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +{ + return WorldToObject4x3(data.ray.AccelStruct(), primitive.instNodePtr); +} + +//===================================================================================================================== +export TriangleData _cont_TriangleVertexPositions(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +{ + const GpuVirtualAddress instanceAddr = GetInstanceNodeAddr(data.ray.AccelStruct(), primitive.instNodePtr); + { + return FetchTriangleFromNode(GetInstanceAddr(FetchInstanceDescAddr(instanceAddr)), primitive.currNodePtr); + } +} + 
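The _cont_ObjectRayOrigin3 and _cont_ObjectRayDirection3 exports that follow transform the world-space ray into object space via mul with WorldToObject4x3, using w = 1 for the origin (so the translation row applies) and w = 0 for the direction (so it drops out). A self-contained C++ sketch of that affine math, using stand-in vector types and HLSL's row-vector mul convention:

#include <cstdio>

struct Float3 { float x, y, z; };

// Row-major 4x3: rows 0..2 hold the linear part, row 3 the translation.
struct Float4x3 { Float3 row[4]; };

static Float3 TransformPoint(const Float4x3& m, const Float3& p)
{
    // w = 1: the translation row participates, as in float4(origin, 1.0).
    return Float3{
        m.row[0].x * p.x + m.row[1].x * p.y + m.row[2].x * p.z + m.row[3].x,
        m.row[0].y * p.x + m.row[1].y * p.y + m.row[2].y * p.z + m.row[3].y,
        m.row[0].z * p.x + m.row[1].z * p.y + m.row[2].z * p.z + m.row[3].z };
}

static Float3 TransformVector(const Float4x3& m, const Float3& v)
{
    // w = 0: directions ignore translation, as in float4(direction, 0.0).
    return Float3{
        m.row[0].x * v.x + m.row[1].x * v.y + m.row[2].x * v.z,
        m.row[0].y * v.x + m.row[1].y * v.y + m.row[2].y * v.z,
        m.row[0].z * v.x + m.row[1].z * v.y + m.row[2].z * v.z };
}

int main()
{
    // World-to-object for an instance translated +5 on X in world space.
    const Float4x3 worldToObject = {{ {1,0,0}, {0,1,0}, {0,0,1}, {-5,0,0} }};
    const Float3 o = TransformPoint(worldToObject, {7, 0, 0});
    const Float3 d = TransformVector(worldToObject, {0, 0, 1});
    std::printf("o=(%g %g %g) d=(%g %g %g)\n", o.x, o.y, o.z, d.x, d.y, d.z);
    return 0;
}

With the sample matrix, the origin (7, 0, 0) maps to (2, 0, 0) while the direction is left untouched, which is exactly the w = 1 versus w = 0 distinction the two exports encode.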
+//===================================================================================================================== +export float3 _cont_ObjectRayOrigin3(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +{ + return mul(float4(data.ray.origin, 1.0), WorldToObject4x3(data.ray.AccelStruct(), primitive.instNodePtr)); +} + +//===================================================================================================================== +export float3 _cont_ObjectRayDirection3(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) { return mul(float4(data.ray.direction, 0.0), WorldToObject4x3(data.ray.AccelStruct(), primitive.instNodePtr)); } @@ -1353,7 +1372,6 @@ export uint _cont_GetContinuationStackAddr() { uint offset = 0; -#if GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP == 0 if (_AmdContinuationStackIsGlobal()) { const uint3 threadIdInGroup = AmdExtThreadIdInGroupCompute(); @@ -1366,7 +1384,6 @@ export uint _cont_GetContinuationStackAddr() offset = id * DispatchRaysConstBuf.cpsFrontendStackSize; } else -#endif { offset = #if GPURT_CLIENT_INTERFACE_MAJOR_VERSION < 36 @@ -1387,27 +1404,26 @@ export uint64_t _cont_GetContinuationStackGlobalMemBase() } //===================================================================================================================== -static Vpc64 GetTraversalVpc64() +static Vpc32 GetTraversalVpc32() { // NOTE: DXCP uses a table for TraceRay, thus a load to traceRayGpuVa retrieves the actual traversal function // address. But Vulkan does not use the table so far, traceRayGpuVa is already the traversal function address. - return Vpc64(PackUint64(DispatchRaysConstBuf.traceRayGpuVaLo, - DispatchRaysConstBuf.traceRayGpuVaHi)); + return Vpc64ToVpc32(Vpc64(PackUint64(DispatchRaysConstBuf.traceRayGpuVaLo, + DispatchRaysConstBuf.traceRayGpuVaHi))); } //===================================================================================================================== -static Vpc64 GetTraversalVpc64PwgDead() +static Vpc32 GetTraversalVpc32PwgDead() { - return Vpc64(PackUint64(DispatchRaysConstBuf.traceRayGpuVaLo, - DispatchRaysConstBuf.traceRayGpuVaHi)); + return Vpc64ToVpc32(Vpc64(PackUint64(DispatchRaysConstBuf.traceRayGpuVaLo, + DispatchRaysConstBuf.traceRayGpuVaHi))); } //===================================================================================================================== -static Vpc64 GetRayGenVpc64() +static Vpc32 GetRayGenVpc32() { - return GetVpc64FromShaderIdAddr(PackUint64(DispatchRaysConstBuf.rayGenerationTableAddressLo, - DispatchRaysConstBuf.rayGenerationTableAddressHi), - SCHEDULING_PRIORITY_RGS); + return GetVpcFromShaderIdAddr(PackUint64(DispatchRaysConstBuf.rayGenerationTableAddressLo, + DispatchRaysConstBuf.rayGenerationTableAddressHi)); } //===================================================================================================================== @@ -1460,6 +1476,162 @@ export uint _cont_GetSbtStride() } } +//===================================================================================================================== +// ReportHit implementation that is called from the intersection shader. +// May call the AnyHit shader. 
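The _cont_ReportHit implementation below first shifts the app-provided THit by ray.tMin into the hardware range (the inverse of _cont_RayTCurrent above, which adds tMin back) and rejects candidates that fall outside [0, committed t]. A small C++ sketch of that window test, with illustrative values:

#include <cstdio>

// Shift an app-space THit into the hardware range and test the window.
static bool HitIsWithinWindow(float tHitApp, float tMinApp, float tCommittedHw)
{
    const float tHitHw = tHitApp - tMinApp; // mirrors 'THit -= data.base.ray.tMin'
    return (tHitHw >= 0.0f) && (tHitHw <= tCommittedHw);
}

int main()
{
    const float tMin      = 0.5f;
    const float committed = 9.5f; // hardware-range committed t (app-space 10.0)

    std::printf("%d\n", HitIsWithinWindow(3.0f,  tMin, committed)); // 1: inside window
    std::printf("%d\n", HitIsWithinWindow(0.25f, tMin, committed)); // 0: behind tMin
    std::printf("%d\n", HitIsWithinWindow(12.0f, tMin, committed)); // 0: beyond committed hit
    return 0;
}

The boundary cases mirror the HLSL: a THit exactly at tMin or exactly at the committed t is accepted, because the rejection test uses strict comparisons.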
+export bool _cont_ReportHit(inout_param(_AmdAnyHitSystemData) data, float THit, uint HitKind) +{ + // TODO Reuse shader record index computed in Traversal + // TODO Check for closest hit and duplicate anyHit calling + + THit -= data.base.ray.tMin; + float tCurrentCommitted = 0.f; + { + tCurrentCommitted = data.base.traversal.committed.rayTCurrent; + } + + if ((THit < 0.f) || (THit > tCurrentCommitted)) + { + // Discard the hit candidate and hint the compiler to not keep the + // values alive, which will remove redundant moves. + data.candidate.rayTCurrent = _AmdGetUninitializedF32(); + // Don't discard the hit kind as it is bit packed and cannot be discarded partially. + return false; + } + + data.candidate.rayTCurrent = THit; + data.candidate.PackHitKind(HitKind); + + uint isOpaque = true; + { + PrimitiveData primitiveData; + InstanceDesc desc; + + { + // Get primitive nodes to process based on candidate or committed hit + const uint tlasNodePtr = data.candidate.instNodePtr; + + const GpuVirtualAddress tlasAddr = data.base.ray.AccelStruct() + ExtractNodePointerOffset(tlasNodePtr); + desc = FetchInstanceDescAddr(tlasAddr); + isOpaque = data.candidate.IsOpaque(); + } + } + + if (!isOpaque) + { + uint hitGroupRecordIndex = 0; + { + hitGroupRecordIndex = data.base.dispatch.shaderRecIdx; + } + // Compute hit group address and fetch shader identifiers + const Vpc32 anyHitAddr = GetAnyHit32BitShaderId(hitGroupRecordIndex); + + if (anyHitAddr.IsValid()) + { + // Call AnyHit + // Hit attributes are added as an additional argument by the compiler + Vpc32 resumeAddr = Vpc32::MakeWithPriority(Vpc32(_AmdGetResumePointAddr()), SCHEDULING_PRIORITY_IS); + data = _AmdAwaitAnyHit(anyHitAddr.GetU32(), resumeAddr.GetU32(), data); + _AmdRestoreSystemDataAnyHit(data); + return data.base.ray.AnyHitDidAccept(); + } + else + { + _cont_AcceptHit(data); + _AmdAcceptHitAttributes(data); // changes data.base.traversal.committedBarycentrics plus up-to-6 DW data in payload + return true; + } + } + else + { + _cont_AcceptHit(data); + _AmdAcceptHitAttributes(data); + return true; + } +} + +//===================================================================================================================== +// CallShader implementation +export void _cont_CallShader(inout_param(_AmdDispatchSystemData) data, uint index) +{ + const uint64_t callableTableBaseAddress = + PackUint64(DispatchRaysConstBuf.callableTableBaseAddressLo, DispatchRaysConstBuf.callableTableBaseAddressHi); + + if (callableTableBaseAddress == 0) + { + // TODO: It might be better to AwaitSelf here, adding an artificial suspend point. + // For the common case of non-null callable shaders, this would reduce + // the size of compiled shaders, as the post-CallShader() part is unreachable, + // also simplifying manual testing with suspend points. + // For null callable shaders, it has the advantage of allowing + // to reconverge on the resume function if implemented in a way that yields only + // a single resume function. + return; + } + + const Vpc32 addr = GetVpcFromShaderIdTable(callableTableBaseAddress, + index, + DispatchRaysConstBuf.callableTableStrideInBytes); + + if (!addr.IsValid()) + { + // See TODO above on how to handle this case better. 
+ return; + } + + const uint callerShaderRecIdx = data.shaderRecIdx; + data.shaderRecIdx = index; // the record index used by the callable shader + + const DXILShaderKind enclosingShaderType = _AmdGetShaderKind(); + const uint resumePrio = GetPriorityForShaderType(enclosingShaderType); + const Vpc32 resumeAddr = Vpc32::MakeWithPriority(Vpc32(_AmdGetResumePointAddr()), resumePrio); + + data = _AmdAwaitCallShader(addr.GetU32(), resumeAddr.GetU32(), data); + + // for the resume part. + data.shaderRecIdx = callerShaderRecIdx; // restores callerShaderRecIdx + _AmdRestoreSystemData(data); // llvm inserts amd.dx.setLocalRootIndex(data.shaderRecIdx) +} + +//===================================================================================================================== +// Returns the low part of the miss shader address and sets up the dispatch data to have the correct shader record +// index. +static Vpc32 SetupMissShader(inout_param(_AmdSystemData) data, out_param(uint) shaderRecIdx) +{ + const uint64_t missTableBaseAddress = + PackUint64(DispatchRaysConstBuf.missTableBaseAddressLo, DispatchRaysConstBuf.missTableBaseAddressHi); + if (missTableBaseAddress == 0) + { + shaderRecIdx = 0; + return Vpc32(0); + } + + shaderRecIdx = ExtractMissShaderIndex(data.ray.traceParameters); + + // Calculate miss shader record address + return GetVpcFromShaderIdTable(missTableBaseAddress, + shaderRecIdx, + DispatchRaysConstBuf.missTableStrideInBytes); +} + +//===================================================================================================================== +static HitGroupInfo GetHitGroupInfo( + in _AmdSystemData data, + in uint state, + in _AmdPrimitiveSystemState candidate) +{ + uint geometryIndex = (state < TRAVERSAL_STATE_COMMITTED_NOTHING) ? + candidate.GeometryIndex() : data.traversal.committed.GeometryIndex(); + uint instanceContribution = (state < TRAVERSAL_STATE_COMMITTED_NOTHING) ? 
+ candidate.InstanceContribution() : data.traversal.committed.InstanceContribution(); + + return GetHitGroupInfo(ExtractRayContributionToHitIndex(data.ray.traceParameters), + ExtractMultiplierForGeometryContributionToHitIndex(data.ray.traceParameters), + geometryIndex, + instanceContribution); +} +#endif + //===================================================================================================================== // Ray History helper functions //===================================================================================================================== @@ -1523,7 +1695,7 @@ static void RayHistoryWriteTopLevel(inout_param(_AmdSystemData) data) #if DEVELOPER if (EnableTraversalCounter() && data.counter.WriteTokenTopLevel()) { - WriteRayHistoryTokenTopLevel(GetRayId(_cont_DispatchRaysIndex3(data.dispatch)), data.ray.AccelStruct()); + WriteRayHistoryTokenTopLevel(GetRayId(data.dispatch.DispatchId()), data.ray.AccelStruct()); data.counter.SetWriteTokenTopLevel(false); } #endif @@ -1588,7 +1760,7 @@ static void RayHistoryWriteBegin(inout_param(_AmdSystemData) data) #if DEVELOPER if (EnableTraversalCounter()) { - const uint rayId = GetRayId(_cont_DispatchRaysIndex3(data.dispatch)); + const uint rayId = GetRayId(data.dispatch.DispatchId()); RayDesc rayDesc = (RayDesc)0; rayDesc.Origin = data.ray.origin; rayDesc.Direction = data.ray.direction; @@ -1600,7 +1772,7 @@ static void RayHistoryWriteBegin(inout_param(_AmdSystemData) data) data.counter.SetCallerShaderType(_AmdGetShaderKind()); WriteRayHistoryTokenBegin(rayId, - _cont_DispatchRaysIndex3(data.dispatch), + data.dispatch.DispatchId(), data.ray.AccelStruct(), data.ray.Flags(), data.ray.traceParameters, @@ -1619,7 +1791,7 @@ static void RayHistoryWriteEnd(inout_param(_AmdSystemData) data, uint state) #if DEVELOPER WriteDispatchCounters(data.counter.numIterations); - const uint rayId = GetRayId(_cont_DispatchRaysIndex3(data.dispatch)); + const uint rayId = GetRayId(data.dispatch.DispatchId()); const uint64_t timerEnd = AmdTraceRaySampleGpuTimer(); WriteRayHistoryTokenTimeStamp(rayId, timerEnd); @@ -1635,7 +1807,7 @@ static void RayHistoryWriteEnd(inout_param(_AmdSystemData) data, uint state) if (data.IsChs(state)) { // For CHS, get candidate and barycentrics from traversal. 
- const uint instNodeIndex = FetchInstanceIdx(ConvertRtIpLevel(_AmdGetRtip()), + const uint instNodeIndex = FetchInstanceIdx(ConvertRtIpLevel(GetRtIpLevel()), data.ray.AccelStruct(), data.traversal.committed.instNodePtr); WriteRayHistoryTokenEnd(rayId, @@ -1661,16 +1833,10 @@ static void RayHistoryWriteEnd(inout_param(_AmdSystemData) data, uint state) } //===================================================================================================================== -static uint2 RayHistoryGetIdentifierFromVPC(uint64_t vpc) +static uint2 RayHistoryGetIdentifierFromVPC(Vpc32 vpc) { // Zero out the metadata bits - return uint2(SplitUint64(vpc).x & 0xFFFFFFC0, 0); -} - -//===================================================================================================================== -static uint2 RayHistoryGetIdentifierFromShaderId(uint2 shaderId) -{ - return uint2(shaderId.x & 0xFFFFFFC0, 0); + return uint2(vpc.GetFunctionAddr(), 0); } //===================================================================================================================== @@ -1679,7 +1845,7 @@ static void RayHistoryWriteTriangleHitResult(_AmdSystemData data, bool accept) #if DEVELOPER if (EnableTraversalCounter()) { - WriteRayHistoryTokenTriangleHitResult(GetRayId(_cont_DispatchRaysIndex3(data.dispatch)), + WriteRayHistoryTokenTriangleHitResult(GetRayId(data.dispatch.DispatchId()), uint(accept), data.counter.candidateTCurrent); } @@ -1695,7 +1861,7 @@ static void RayHistoryWriteFunctionCall(inout_param(_AmdSystemData) data, #if DEVELOPER if (EnableTraversalCounter()) { - const uint rayId = GetRayId(_cont_DispatchRaysIndex3(data.dispatch)); + const uint rayId = GetRayId(data.dispatch.DispatchId()); switch(shaderKind) { @@ -1749,7 +1915,7 @@ static void RayHistoryWriteAnyHitOrProceduralStatus(inout_param(_AmdSystemData) #if DEVELOPER if (EnableTraversalCounter()) { - const uint rayId = GetRayId(_cont_DispatchRaysIndex3(data.dispatch)); + const uint rayId = GetRayId(data.dispatch.DispatchId()); const uint status = (data.dispatch.nextNodePtr == END_SEARCH) ? HIT_STATUS_ACCEPT_AND_END_SEARCH : (data.ray.AnyHitDidAccept() ? 
HIT_STATUS_ACCEPT : HIT_STATUS_IGNORE); @@ -1779,225 +1945,64 @@ static void RayHistoryWriteAnyHitOrProceduralStatus(inout_param(_AmdSystemData) { data.counter.numCandidateHits++; } - break; - - default: - break; - } - data.counter.SetCallerShaderType(DXILShaderKind::Invalid); - } -#endif -} - -//===================================================================================================================== -static void RayHistoryHandleIteration(inout_param(_AmdSystemData) data, uint nextNodePtr) -{ -#if DEVELOPER - if (EnableTraversalCounter()) - { - WriteRayHistoryTokenNodePtr(GetRayId(_cont_DispatchRaysIndex3(data.dispatch)), nextNodePtr); - UpdateWaveTraversalStatistics(ConvertRtIpLevel(_AmdGetRtip()), nextNodePtr); - - data.counter.numIterations++; - } -#endif -} - -//===================================================================================================================== -static void RayHistoryWriteBottomLevel(_AmdSystemData data, GpuVirtualAddress bvhAddress) -{ -#if DEVELOPER - if (EnableTraversalCounter()) - { - WriteRayHistoryTokenBottomLevel(GetRayId(_cont_DispatchRaysIndex3(data.dispatch)), bvhAddress); - } -#endif -} - -//===================================================================================================================== -static void TraversalCounterWriteCounter(_AmdSystemData data) -{ -#if DEVELOPER - if (EnableTraversalCounter()) - { - TraversalCounter counter = (TraversalCounter)0; - counter.data[TCID_NUM_RAY_BOX_TEST] = data.counter.numRayBoxTest; - counter.data[TCID_NUM_RAY_TRIANGLE_TEST] = data.counter.numRayTriangleTest; - counter.data[TCID_NUM_ITERATION] = data.counter.numIterations; - counter.data[TCID_MAX_TRAVERSAL_DEPTH] = data.counter.maxStackDepth; - counter.data[TCID_NUM_ANYHIT_INVOCATION] = data.counter.numAnyHitInvocation; - counter.data[TCID_SHADER_ID] = data.counter.shaderIdLow; - counter.data[TCID_SHADER_RECORD_INDEX] = data.counter.shaderRecIdx; - counter.data[TCID_TIMING_DATA] = data.counter.timer; - counter.data[TCID_WAVE_ID] = AmdTraceRayGetHwWaveId(); - counter.data[TCID_NUM_CANDIDATE_HITS] = data.counter.numCandidateHits; - counter.data[TCID_INSTANCE_INTERSECTIONS] = data.counter.instanceIntersections; - - WriteTraversalCounter(GetRayId(_cont_DispatchRaysIndex3(data.dispatch)), counter); - } -#endif -} - -#if CONTINUATION_ON_GPU -//===================================================================================================================== -// ReportHit implementation that is called from the intersection shader. -// May call the AnyHit shader. -export bool _cont_ReportHit(inout_param(_AmdAnyHitSystemData) data, float THit, uint HitKind) -{ - // TODO Reuse shader record index computed in Traversal - // TODO Check for closest hit and duplicate anyHit calling - - THit -= data.base.ray.tMin; - float tCurrentCommitted = 0.f; - { - tCurrentCommitted = data.base.traversal.committed.rayTCurrent; - } - - if ((THit < 0.f) || (THit > tCurrentCommitted)) - { - // Discard the hit candidate and hint the compiler to not keep the - // values alive, which will remove redundant moves. - data.candidate.rayTCurrent = _AmdGetUninitializedF32(); - // Don't discard the hit kind as it is bit packed and cannot be discarded partially. 
- return false; - } - - data.candidate.rayTCurrent = THit; - data.candidate.PackHitKind(HitKind); - - uint isOpaque = true; - { - PrimitiveData primitiveData; - InstanceDesc desc; - - { - // Get primitive nodes to process based on candidate or committed hit - const uint tlasNodePtr = data.candidate.instNodePtr; - - const GpuVirtualAddress tlasAddr = data.base.ray.AccelStruct() + ExtractNodePointerOffset(tlasNodePtr); - desc = FetchInstanceDescAddr(tlasAddr); - isOpaque = data.candidate.IsOpaque(); - } - } - - if (!isOpaque) - { - uint hitGroupRecordIndex = 0; - { - hitGroupRecordIndex = data.base.dispatch.shaderRecIdx; - } - // Compute hit group address and fetch shader identifiers - const Vpc64 anyHitAddr = GetAnyHitAddr(hitGroupRecordIndex); - - if (anyHitAddr.IsValid()) - { - // Call AnyHit - // Hit attributes are added as an additional argument by the compiler - Vpc64 resumeAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetResumePointAddr()), SCHEDULING_PRIORITY_IS); - data = _AmdAwaitAnyHit(anyHitAddr.GetU64(), resumeAddr.GetU64(), data); - _AmdRestoreSystemDataAnyHit(data); - return data.base.ray.AnyHitDidAccept(); - } - else - { - _cont_AcceptHit(data); - _AmdAcceptHitAttributes(data); // changes data.base.traversal.committedBarycentrics plus up-to-6 DW data in payload - return true; - } - } - else - { - _cont_AcceptHit(data); - _AmdAcceptHitAttributes(data); - return true; + break; + + default: + break; + } + data.counter.SetCallerShaderType(DXILShaderKind::Invalid); } +#endif } //===================================================================================================================== -// CallShader implementation -export void _cont_CallShader(inout_param(_AmdDispatchSystemData) data, uint index) +static void RayHistoryHandleIteration(inout_param(_AmdSystemData) data, uint nextNodePtr) { - const uint64_t callableTableBaseAddress = - PackUint64(DispatchRaysConstBuf.callableTableBaseAddressLo, DispatchRaysConstBuf.callableTableBaseAddressHi); - - if (callableTableBaseAddress == 0) +#if DEVELOPER + if (EnableTraversalCounter()) { - // TODO: It might be better to AwaitSelf here, adding an artificial suspend point. - // For the common case of non-null callable shaders, this would reduce - // the size of compiled shaders, as the post-CallShader() part is unreachable, - // also simplifying manual testing with suspend points. - // For null callable shaders, it has the advantage of allowing - // to reconverge on the resume function if implemented in a way that yields only - // a single resume function. - return; - } - - const Vpc64 addr = GetVpc64FromShaderIdTable(callableTableBaseAddress, - index, - DispatchRaysConstBuf.callableTableStrideInBytes, - SCHEDULING_PRIORITY_CALLABLE); + WriteRayHistoryTokenNodePtr(GetRayId(data.dispatch.DispatchId()), nextNodePtr); + UpdateWaveTraversalStatistics(ConvertRtIpLevel(GetRtIpLevel()), nextNodePtr); - if (!addr.IsValid()) - { - // See TODO above on how to handle this case better. - return; + data.counter.numIterations++; } - - const uint callerShaderRecIdx = data.shaderRecIdx; - data.shaderRecIdx = index; // the record index used by the callable shader - - const DXILShaderKind enclosingShaderType = _AmdGetShaderKind(); - const uint resumePrio = GetPriorityForShaderType(enclosingShaderType); - const Vpc64 resumeAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetResumePointAddr()), resumePrio); - - data = _AmdAwaitCallShader(addr.GetU64(), resumeAddr.GetU64(), data); - - // for the resume part. 
- data.shaderRecIdx = callerShaderRecIdx; // restores callerShaderRecIdx - _AmdRestoreSystemData(data); // llvm inserts amd.dx.setLocalRootIndex(data.shaderRecIdx) +#endif } //===================================================================================================================== -// Returns the low part of the miss shader address and sets up the dispatch data to have the correct shader record -// index. -static Vpc64 SetupMissShader(inout_param(_AmdSystemData) data, out_param(uint) shaderRecIdx) +static void RayHistoryWriteBottomLevel(_AmdSystemData data, GpuVirtualAddress bvhAddress) { - const uint64_t missTableBaseAddress = - PackUint64(DispatchRaysConstBuf.missTableBaseAddressLo, DispatchRaysConstBuf.missTableBaseAddressHi); - if (missTableBaseAddress == 0) +#if DEVELOPER + if (EnableTraversalCounter()) { - shaderRecIdx = 0; - return Vpc64(0); + WriteRayHistoryTokenBottomLevel(GetRayId(data.dispatch.DispatchId()), bvhAddress); } - - shaderRecIdx = ExtractMissShaderIndex(data.ray.traceParameters); - - // Calculate miss shader record address - const Vpc64 shaderAddr = GetVpc64FromShaderIdTable(missTableBaseAddress, - shaderRecIdx, - DispatchRaysConstBuf.missTableStrideInBytes, - SCHEDULING_PRIORITY_MISS); - - return shaderAddr; +#endif } //===================================================================================================================== -static HitGroupInfo GetHitGroupInfo( - in _AmdSystemData data, - in uint state, - in _AmdPrimitiveSystemState candidate) +static void TraversalCounterWriteCounter(_AmdSystemData data) { - uint geometryIndex = (state < TRAVERSAL_STATE_COMMITTED_NOTHING) ? - candidate.GeometryIndex() : data.traversal.committed.GeometryIndex(); - uint instanceContribution = (state < TRAVERSAL_STATE_COMMITTED_NOTHING) ? - candidate.InstanceContribution() : data.traversal.committed.InstanceContribution(); +#if DEVELOPER + if (EnableTraversalCounter()) + { + TraversalCounter counter = (TraversalCounter)0; + counter.data[TCID_NUM_RAY_BOX_TEST] = data.counter.numRayBoxTest; + counter.data[TCID_NUM_RAY_TRIANGLE_TEST] = data.counter.numRayTriangleTest; + counter.data[TCID_NUM_ITERATION] = data.counter.numIterations; + counter.data[TCID_MAX_TRAVERSAL_DEPTH] = data.counter.maxStackDepth; + counter.data[TCID_NUM_ANYHIT_INVOCATION] = data.counter.numAnyHitInvocation; + counter.data[TCID_SHADER_ID] = data.counter.shaderIdLow; + counter.data[TCID_SHADER_RECORD_INDEX] = data.counter.shaderRecIdx; + counter.data[TCID_TIMING_DATA] = data.counter.timer; + counter.data[TCID_WAVE_ID] = AmdTraceRayGetHwWaveId(); + counter.data[TCID_NUM_CANDIDATE_HITS] = data.counter.numCandidateHits; + counter.data[TCID_INSTANCE_INTERSECTIONS] = data.counter.instanceIntersections; - return GetHitGroupInfo(ExtractRayContributionToHitIndex(data.ray.traceParameters), - ExtractMultiplierForGeometryContributionToHitIndex(data.ray.traceParameters), - geometryIndex, - instanceContribution); -} + WriteTraversalCounter(GetRayId(data.dispatch.DispatchId()), counter); + } #endif +} //===================================================================================================================== // Order matters, the following HLSL reference the functions and structs defined above. 
TODO: refactor these into a @@ -2005,7 +2010,30 @@ static HitGroupInfo GetHitGroupInfo( #include "Continuations1_1.hlsl" #include "Continuations2_0.hlsl" -#if CONTINUATION_ON_GPU +//===================================================================================================================== +// Calls traversal for the current rtip. +static void TraversalInternal( + inout_param(_AmdSystemData) data, + inout_param(uint) state, + inout_param(_AmdPrimitiveSystemState) candidate, + inout_param(float2) candidateBarycentrics) +{ + switch (GetRtIpLevel()) + { +#if (GPURT_RTIP_LEVEL == GPURT_RTIP_LEGACY_LEVEL) || (GPURT_RTIP_LEVEL == 0) + case RayTracingIpLevel::RtIp1_1: + TraversalInternal1_1(data, state, candidate, candidateBarycentrics); + break; + case RayTracingIpLevel::RtIp2_0: + TraversalInternal2_0(data, state, candidate, candidateBarycentrics); + break; +#endif + default: + break; + } +} + +#if ((GPURT_DEBUG_CONTINUATION_TRAVERSAL == 0) && (!defined(__cplusplus))) static uint64_t GetDispatchIdAddr() { return PackUint64(DispatchRaysConstBuf.cpsDispatchIdAddressLo, DispatchRaysConstBuf.cpsDispatchIdAddressHi); @@ -2066,12 +2094,13 @@ static void LaunchRayGen(bool setupStack) #if DEVELOPER systemData.parentId = -1; #endif - _AmdEnqueueRayGen(GetRayGenVpc64().GetU64(), _AmdGetUninitializedI64(), systemData); + _AmdEnqueueRayGen(GetRayGenVpc32().GetU32(), _AmdGetUninitializedI32(), systemData); } else if (Options::getPersistentLaunchEnabled()) { - _AmdDispatchSystemData systemData = _AmdDispatchSystemData::MakeDeadLaneWithStack(); - _AmdEnqueueTraversalDead(GetTraversalVpc64PwgDead().GetU64(), _AmdGetUninitializedI64(), systemData); + _AmdDispatchSystemData systemData = _AmdGetUninitializedDispatchSystemData(); + systemData.SetDead(true); + _AmdEnqueueTraversalDead(GetTraversalVpc32PwgDead().GetU32(), _AmdGetUninitializedI32(), systemData); } } @@ -2152,7 +2181,7 @@ export void _cont_TraceRay( } // Initialise traversal system state _AmdTraversalState traversal = (_AmdTraversalState)0; - switch (_AmdGetRtip()) + switch (GetRtIpLevel()) { case RayTracingIpLevel::RtIp1_1: traversal = InitTraversalState1_1(instanceInclusionMask, rayDesc, isValid); @@ -2176,16 +2205,19 @@ export void _cont_TraceRay( const uint callerShaderRecIdx = dispatch.shaderRecIdx; // 0 if from RayGen. const uint parentId = RayHistoryGetParentId(dispatch); - const Vpc64 traversalAddr = GetTraversalVpc64(); // The type of the shader containing this TraceRay call, i.e. the shader we are inlined into. const DXILShaderKind enclosingShaderType = _AmdGetShaderKind(); const uint resumePrio = GetPriorityForShaderType(enclosingShaderType); // NO control flow is allowed between _AmdGetResumePointAddr() and _AmdAwaitTraversal(). - const Vpc64 resumeAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetResumePointAddr()), resumePrio); - data.traversal.PackReturnAddress(resumeAddr); - dispatch = _AmdAwaitTraversal(traversalAddr.GetU64(), data); + const Vpc32 resumeAddr = Vpc32::MakeWithPriority(Vpc32(_AmdGetResumePointAddr()), resumePrio); + data.traversal.SetReturnAddress(resumeAddr); +#ifndef PASS_DUMMY_RET_ADDR + dispatch = _AmdAwaitTraversal(GetTraversalVpc32().GetU32(), data); +#else // PASS_DUMMY_RET_ADDR + dispatch = _AmdAwaitTraversal(GetTraversalVpc32().GetU32(), _AmdGetUninitializedI32(), data); +#endif // for the resume part. 
dispatch.shaderRecIdx = callerShaderRecIdx; // restores callerShaderRecIdx @@ -2196,26 +2228,23 @@ export void _cont_TraceRay( } //===================================================================================================================== -// Get the address of the function that should be called next, either a closest hit or a miss shader. If no hit or miss -// shader should be called, this method returns false (and in that case it should return to -// data.traversal.ReturnAddress()), otherwise it returns true. -static bool GetNextHitMissPc( +// Get the address of the function that should be called next, either a closest hit or a miss shader. +// If no hit or miss shader should be called, this method returns DEAD_SHADER_ADDR. +static Vpc32 GetNextHitMissPc( inout_param(_AmdSystemData) data, uint state, - _AmdPrimitiveSystemState candidate, - out_param(Vpc64) nextShaderAddr) + _AmdPrimitiveSystemState candidate) { // MS if (data.IsMiss(state)) { uint shaderRecIdx; - const Vpc64 missShaderAddr = SetupMissShader(data, shaderRecIdx); + const Vpc32 missShaderAddr = SetupMissShader(data, shaderRecIdx); if (missShaderAddr.IsValid()) { // Valid MS data.dispatch.shaderRecIdx = shaderRecIdx; - nextShaderAddr = missShaderAddr; - return true; + return missShaderAddr; } } @@ -2230,98 +2259,79 @@ static bool GetNextHitMissPc( if ((data.ray.Flags() & RAY_FLAG_SKIP_CLOSEST_HIT_SHADER) == 0) { - if (hitInfo.closestHitId.x != 0) + Vpc32 closestHitId = Vpc32(hitInfo.closestHitId.x); + if (closestHitId.IsValid()) { - // Valid CHS - nextShaderAddr = GetVpc64FromShaderId(Vpc32(hitInfo.closestHitId.x), SCHEDULING_PRIORITY_CHS); - return true; + return closestHitId; } } } - return false; + return Vpc32(DEAD_SHADER_ADDR); } //===================================================================================================================== -// Calls traversal for the current rtip. -static void TraversalInternal( - inout_param(_AmdSystemData) data, - inout_param(uint) state, - inout_param(_AmdPrimitiveSystemState) candidate, - inout_param(float2) candidateBarycentrics) +// Helper to handle enqueueing CHS, MS. 
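EnqueueHitMiss, whose HLSL follows, makes a single decision: if no closest-hit or miss shader should run (GetNextHitMissPc returned DEAD_SHADER_ADDR), it jumps back to the return address _cont_TraceRay stashed via SetReturnAddress; otherwise it enqueues the selected shader with that return address. A host-side C++ sketch of that decision, where kDeadShaderAddr is a stand-in value (the real DEAD_SHADER_ADDR constant is defined outside this hunk) and the enqueue stubs are hypothetical:

#include <cstdint>
#include <cstdio>

// Stand-in sentinel; the actual DEAD_SHADER_ADDR value is not shown here.
constexpr std::uint32_t kDeadShaderAddr = 0xFFFFFFFFu;

// Hypothetical stubs for the noreturn enqueue intrinsics.
static void EnqueueRayGen(std::uint32_t addr)
{
    std::printf("resume caller @0x%08X\n", addr);
}
static void Enqueue(std::uint32_t addr, std::uint32_t ret)
{
    std::printf("run CHS/MS @0x%08X, return to 0x%08X\n", addr, ret);
}

// Decision mirrored from EnqueueHitMiss: no CHS/MS to run means jumping
// straight back to the return address stored by _cont_TraceRay.
static void EnqueueHitMissModel(std::uint32_t nextShaderAddr, std::uint32_t returnAddr)
{
    if (nextShaderAddr == kDeadShaderAddr)
    {
        EnqueueRayGen(returnAddr); // noreturn tail call in the real code
        return;
    }
    Enqueue(nextShaderAddr, returnAddr);
}

int main()
{
    EnqueueHitMissModel(kDeadShaderAddr, 0x00001040u); // no shader: resume caller
    EnqueueHitMissModel(0x00002080u,     0x00001040u); // run the selected CHS/MS
    return 0;
}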
+static void EnqueueHitMiss(_AmdSystemData data, Vpc32 nextShaderAddr) { - switch (_AmdGetRtip()) - { -#if (GPURT_RTIP_LEVEL == GPURT_RTIP_LEGACY_LEVEL) || (GPURT_RTIP_LEVEL == 0) - case RayTracingIpLevel::RtIp1_1: - TraversalInternal1_1(data, state, candidate, candidateBarycentrics); - break; - case RayTracingIpLevel::RtIp2_0: - TraversalInternal2_0(data, state, candidate, candidateBarycentrics); - break; -#endif - default: - break; - } -} + GPU_ASSERT(nextShaderAddr.GetU32() != DEAD_SHADER_ADDR && !data.IsDeadLane()); + const uint state = data.traversal.committed.State(); + RayHistoryWriteEnd(data, state); -static void EnqueueNextShader(bool hasWorkToDo, Vpc64 nextShaderAddr, Vpc64 returnAddr, _AmdSystemData data) -{ - if (!hasWorkToDo) + const Vpc32 returnAddr = data.traversal.ReturnAddress(); + + if (nextShaderAddr.GetU32() == DEAD_SHADER_ADDR) { - if (_AmdContinuationStackIsGlobal()) - { - // No work to do = dead lane, jump to traversal as a synchronization point with an empty system data - _AmdSystemData sysData = _AmdSystemData::MakeDeadLaneWithoutStack(); - _AmdEnqueueTraversal(GetTraversalVpc64().GetU64(), _AmdGetUninitializedI64(), sysData); - } - else - { - GPU_ASSERT(false); - } + // We do not have an address to jump to, retrieve the return address and return to RGS + _AmdEnqueueRayGen(returnAddr.GetU32(), _AmdGetUninitializedI32(), data.dispatch); } - const uint newState = data.traversal.committed.State(); - RayHistoryWriteEnd(data, newState); + // Enqueue the selected shader + const DXILShaderKind shaderKind = (DXILShaderKind)(data.IsMiss(state) + ? (int)DXILShaderKind::Miss // convert to int to fix linux build error + : (int)DXILShaderKind::ClosestHit + ); - if (nextShaderAddr.GetU64() != returnAddr.GetU64()) - { - const DXILShaderKind shaderKind = (DXILShaderKind)(data.IsMiss(newState) ? - (int)DXILShaderKind::Miss : // convert to int to fix linux build error - (int)DXILShaderKind::ClosestHit); - RayHistoryWriteFunctionCall(data, - RayHistoryGetIdentifierFromVPC(nextShaderAddr.GetU64()), - data.dispatch.shaderRecIdx, - shaderKind); - - _AmdEnqueue(nextShaderAddr.GetU64(), returnAddr.GetU64(), data); - } + RayHistoryWriteFunctionCall(data, + RayHistoryGetIdentifierFromVPC(nextShaderAddr), + data.dispatch.shaderRecIdx, + shaderKind); - // Return to RayGen. No need to set a priority, as it is already set in the stored return address. - _AmdEnqueueRayGen(returnAddr.GetU64(), _AmdGetUninitializedI64(), data.dispatch); + _AmdEnqueue(nextShaderAddr.GetU32(), returnAddr.GetU32(), data); } //===================================================================================================================== -// Convenience helper calling Traversal on the debug/emulation path that returns _AmdTraversalResultData. -static _AmdTraversalResultData TraversalInternalDebugWrapper( - inout_param(_AmdSystemData) data) + +export void _cont_ExitRayGen(in _AmdDispatchSystemData data) { - uint state = TRAVERSAL_STATE_COMMITTED_NOTHING; - _AmdPrimitiveSystemState candidate = (_AmdPrimitiveSystemState)0; - float2 candidateBarycentrics = float2(0.0f, 0.0f); + if (Options::getPersistentLaunchEnabled() + ) { + // Lanes that exit raygen own a stack. 
Return them to traversal for scheduling + _AmdDispatchSystemData sysData = _AmdGetUninitializedDispatchSystemData(); + sysData.SetDead(true); + _AmdEnqueueTraversalDead(GetTraversalVpc32PwgDead().GetU32(), _AmdGetUninitializedI32(), sysData); + } + // In all other cases, exit the wave + _AmdComplete(); +} - TraversalInternal(data, state, candidate, candidateBarycentrics); +//===================================================================================================================== - _AmdTraversalResultData result = (_AmdTraversalResultData)0; - result.state = state; - result.candidate = candidate; - result.candidateBarycentrics = candidateBarycentrics; +//===================================================================================================================== - return result; -} +namespace ThreadTrace +{ -//===================================================================================================================== -// Wrapper to ensure the following shader section is marked as "Scheduler" in TTV (if thread traces are enabled). -static void EnterSchedulerSection() +enum struct Section +{ + Scheduler = 8, + Traversal = 6 +}; + +//================================================================================================================= +// Wrapper to ensure the subsequent shader section is correctly identified in TTV. +// If thread traces are disabled, this does nothing. Otherwise, it issues a return token and a new shader data token +// of the type specified by `section`. +static void EnterSection(uint section) { if (Options::getThreadTraceEnabled()) { @@ -2331,8 +2341,44 @@ static void EnterSchedulerSection() // Emit a function call token to start the scheduler function. AmdExtD3DShaderIntrinsics_ShaderMarker(0x11 | - (/* scheduler */ 8 << 8) | - (/* exec */ WaveActiveCountBits(true) << 13)); + (/* section */ section << 8) | + (/* exec */ WaveActiveCountBits(true) << 13)); + } +} + +} // namespace ThreadTrace + +//===================================================================================================================== +// Scheduler for dead lanes. +// Some lanes may return from this function. All lanes that return are guaranteed to be dead and are supposed to enqueue +// traversal for subsequent processing. If the full wave is dead and persistent launch is on, new work will be started. +// If persistent work is off, and all lanes are dead (potentially less than a full wave), and no work could be obtained, +// then the lanes are terminated. 
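Before the HLSL below, a host-side C++ reduction of that contract, with the _Amd* enqueue intrinsics replaced by hypothetical stubs and the wave-wide queries collapsed to plain booleans:

#include <cstdio>

// Hypothetical host-side stubs for the _Amd* intrinsics.
static void LaunchRayGen()         { std::puts("start a fresh dispatch"); }
static void EnqueueTraversalDead() { std::puts("re-enqueue dead lane to Traversal"); }
static void Complete()             { std::puts("terminate the wave"); }

// Effective control flow of ScheduleDeadWave, with the wave-wide queries
// (WaveActiveCountBits, AmdExtLaneCount) reduced to a single boolean.
static void ScheduleDeadWaveModel(bool persistentLaunch, bool wholeWaveDead)
{
    if (persistentLaunch)
    {
        if (wholeWaveDead)
        {
            LaunchRayGen();     // whole wave dead: try to pick up new work
        }
        EnqueueTraversalDead(); // stackful dead lanes stay in the scheduling loop
    }
    else
    {
        Complete();             // all dead and no persistent work: end the lanes
    }
}

int main()
{
    ScheduleDeadWaveModel(/*persistentLaunch=*/true,  /*wholeWaveDead=*/true);
    ScheduleDeadWaveModel(/*persistentLaunch=*/false, /*wholeWaveDead=*/false);
    return 0;
}

Note the real function also distinguishes per-lane stack state (IsDeadLaneWithStack) and re-checks Options::getPersistentLaunchEnabled(); the sketch keeps only the effective branches.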
+static void ScheduleDeadWave(_AmdSystemData data, Vpc32 traversalAddr) +{ + GPU_ASSERT(WaveActiveAllTrue(data.IsDeadLane())); + + if (Options::getPersistentLaunchEnabled()) + { + if (data.IsDeadLaneWithStack()) + { + if (WaveActiveCountBits(true) == AmdExtLaneCount()) + { + // If the whole wave is dead, get ready to start a new dispatch + LaunchRayGen(false); + } + // Passthrough these stackful dead lanes + _AmdEnqueueTraversalDead(traversalAddr.GetU32(), _AmdGetUninitializedI32(), data.dispatch); + } + } + + if (Options::getPersistentLaunchEnabled()) + { + _AmdEnqueueTraversalDead(traversalAddr.GetU32(), _AmdGetUninitializedI32(), data.dispatch); + } + else + { + _AmdComplete(); } } @@ -2341,6 +2387,13 @@ static void EnterSchedulerSection() export void _cont_Traversal( inout_param(_AmdSystemData) data) { + bool IsDead = data.IsDeadLane(); + const bool IsTraversal = !IsDead && data.IsTraversal(); + + // TRAVERSAL: BVH ------------------------------------------------------------------------------------------------- + uint state = TRAVERSAL_STATE_COMMITTED_NOTHING; + _AmdPrimitiveSystemState candidate = (_AmdPrimitiveSystemState)0; + float2 candidateBarycentrics = float2(0.0f, 0.0f); // Discard data that doesn't need to be kept alive during Traversal data.dispatch.shaderRecIdx = _AmdGetUninitializedI32(); if (!IsBvhRebraid()) @@ -2349,23 +2402,18 @@ export void _cont_Traversal( data.traversal.lastInstanceRootNodePtr = _AmdGetUninitializedI32(); } - // Write AHS/IS returned status - bool IsDeadLane = (data.IsDeadLaneWithoutStack() || data.IsDeadLaneWithStack()); - if (!IsDeadLane) + if (!IsDead) { + // Write AHS/IS returned status RayHistoryWriteAnyHitOrProceduralStatus(data); } // Execute traversal for active lanes. - uint state = TRAVERSAL_STATE_COMMITTED_NOTHING; - _AmdPrimitiveSystemState candidate = (_AmdPrimitiveSystemState)0; - float2 candidateBarycentrics = float2(0.0f, 0.0f); - - if (data.IsTraversal()) + if (IsTraversal) { TraversalInternal(data, state, candidate, candidateBarycentrics); } - else + else if (!IsDead) { // This branch is hit when the traversal for a lane is done: // a) AHS/IS enqueued _cont_Traversal(), for the very last time. @@ -2380,41 +2428,32 @@ export void _cont_Traversal( // For CHS, get candidate and barycentrics from traversal. if (data.IsChs(state)) { - candidate = data.traversal.committed; - candidateBarycentrics = data.traversal.committedBarycentrics; + candidate = data.traversal.committed; + candidateBarycentrics = data.traversal.committedBarycentrics; } } - // Result used on the CPU path. This is an unused dummy return value on the GPU path. - _AmdTraversalResultData result = (_AmdTraversalResultData)0; + // ALIASES AND CACHED VARIABLES ----------------------------------------------------------------------------------- - bool IsChsOrMiss = data.IsChsOrMiss(state); - // Re-enqueue Traversal until all lanes are done with BVH Traversal. - // Only then enqueue CHS/Miss to ensure other lanes that are not yet done with Traversal - // converge on these CHS/Miss invocations. - // This is necessary because Traversal has lower scheduling priority. - if (WaveActiveAllTrue(IsChsOrMiss)) - { - EnterSchedulerSection(); + // Cache Traversal's own address + const Vpc32 traversalAddr = Vpc32::MakeWithPriority(Vpc32(_AmdGetCurrentFuncAddr()), SCHEDULING_PRIORITY_TRAVERSAL); - Vpc64 nextShaderAddr = Vpc64(0); - GetNextHitMissPc(data, state, candidate, nextShaderAddr); + // Some aliases for variable state. 
Help the compiler figure out these are mutually exclusive in all modes. + bool IsChsOrMiss = false; + bool IsAhsOrIs = false; + if (!IsDead) + { + IsChsOrMiss = data.IsChsOrMiss(state); + IsAhsOrIs = (data.IsAhs(state) || data.IsIs(state)); + } + bool AllDead = Traits::HasDeadLanes() && WaveActiveAllTrue(IsDead); + bool AnyIsAhsOrIs = WaveActiveAnyTrue(IsAhsOrIs); - bool hasWorkToDo = true; - if (_AmdContinuationStackIsGlobal() && nextShaderAddr.IsValid()) - { - } + // TRAVERSAL: AHS AND IS ------------------------------------------------------------------------------------------ - const Vpc64 returnAddr = data.traversal.ReturnAddress(); - if (!nextShaderAddr.IsValid()) - { - nextShaderAddr = returnAddr; - } - EnqueueNextShader(hasWorkToDo, nextShaderAddr, returnAddr, data); - } - else + if (AnyIsAhsOrIs) { - if (data.IsAhs(state) || data.IsIs(state)) + if (IsAhsOrIs) { HitGroupInfo hitInfo = (HitGroupInfo)0; { @@ -2429,45 +2468,78 @@ export void _cont_Traversal( // AHS and IS re-enqueue SchedulerInternal when finished. if (data.IsAhs(state)) { + const Vpc32 anyHitAddr = Vpc32(hitInfo.anyHitId.x); RayHistoryWriteFunctionCall(anyHitData.base, - RayHistoryGetIdentifierFromShaderId(hitInfo.anyHitId), + RayHistoryGetIdentifierFromVPC(anyHitAddr), hitInfo.tableIndex, DXILShaderKind::AnyHit); - const Vpc64 addr = GetVpc64FromShaderId(Vpc32(hitInfo.anyHitId.x), SCHEDULING_PRIORITY_AHS); - const Vpc64 returnAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetCurrentFuncAddr()), SCHEDULING_PRIORITY_TRAVERSAL); - _AmdEnqueueAnyHit(addr.GetU64(), returnAddr.GetU64(), anyHitData, candidateBarycentrics); + const Vpc32 returnAddr = Vpc32::MakeWithPriority(Vpc32(_AmdGetCurrentFuncAddr()), SCHEDULING_PRIORITY_TRAVERSAL); + _AmdEnqueueAnyHit(anyHitAddr.GetU32(), returnAddr.GetU32(), anyHitData, candidateBarycentrics); } else { // Intersection shader GPU_ASSERT(data.IsIs(state)); + const Vpc32 isAddr = Vpc32(hitInfo.intersectionId.x); RayHistoryWriteFunctionCall(anyHitData.base, - RayHistoryGetIdentifierFromShaderId(hitInfo.intersectionId), + RayHistoryGetIdentifierFromVPC(isAddr), hitInfo.tableIndex, DXILShaderKind::Intersection); - const Vpc64 addr = GetVpc64FromShaderId(Vpc32(hitInfo.intersectionId.x), SCHEDULING_PRIORITY_IS); - const Vpc64 returnAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetCurrentFuncAddr()), SCHEDULING_PRIORITY_TRAVERSAL); - _AmdEnqueueIntersection(addr.GetU64(), returnAddr.GetU64(), anyHitData); + const Vpc32 returnAddr = Vpc32::MakeWithPriority(Vpc32(_AmdGetCurrentFuncAddr()), SCHEDULING_PRIORITY_TRAVERSAL); + _AmdEnqueueIntersection(isAddr.GetU32(), returnAddr.GetU32(), anyHitData); } } - else + _AmdEnqueueTraversal(traversalAddr.GetU32(), _AmdGetUninitializedI32(), data); + } + + // FULL WAVE OF DEAD LANES ---------------------------------------------------------------------------------------- + else if (AllDead) + { + ScheduleDeadWave(data, traversalAddr); + // this is unreachable, ScheduleDeadWave guarantees to end with an enqueue + } + + // CHS, MISS AND POSSIBLY DEAD LANES ------------------------------------------------------------------------------ + else + { + GPU_ASSERT(IsChsOrMiss || IsDead); + ThreadTrace::EnterSection(ThreadTrace::Section::Scheduler); + + Vpc32 nextShaderAddr = Vpc32(IsDead ? 
DEAD_SHADER_ADDR : GetNextHitMissPc(data, state, candidate).GetU32()); + + if (!IsDead) { - // - // Everything else needs to go back through scheduling/traversal, regardless of state - // Note we don't need "Wait" here because priorities run AHS and IS first - const Vpc64 traversalAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetCurrentFuncAddr()), SCHEDULING_PRIORITY_TRAVERSAL); - _AmdEnqueueTraversal(traversalAddr.GetU64(), _AmdGetUninitializedI64(), data); + EnqueueHitMiss(data, nextShaderAddr); } + _AmdEnqueueTraversalDead(traversalAddr.GetU32(), _AmdGetUninitializedI32(), data.dispatch); } - // This is unreachable } -#endif + +#elif GPURT_DEBUG_CONTINUATION_TRAVERSAL // ((GPURT_DEBUG_CONTINUATION_TRAVERSAL == 0) && (!defined(__cplusplus))) + +//===================================================================================================================== +// For debug. Convenience helper calling Traversal on the debug/emulation path that returns _AmdTraversalResultData. +static _AmdTraversalResultData TraversalInternalDebugWrapper( + inout_param(_AmdSystemData) data) +{ + uint state = TRAVERSAL_STATE_COMMITTED_NOTHING; + _AmdPrimitiveSystemState candidate = (_AmdPrimitiveSystemState)0; + float2 candidateBarycentrics = float2(0.0f, 0.0f); + + TraversalInternal(data, state, candidate, candidateBarycentrics); + + _AmdTraversalResultData result = (_AmdTraversalResultData)0; + result.state = state; + result.candidate = candidate; + result.candidateBarycentrics = candidateBarycentrics; + + return result; +} //===================================================================================================================== -#if GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP // For debug. Support DxcpRt (non-continuation) to use Continuation traversal. static IntersectionResult TraceRayInternalCPSDebug( in GpuVirtualAddress topLevelBvh, // Top-level acceleration structure to use @@ -2476,26 +2548,24 @@ static IntersectionResult TraceRayInternalCPSDebug( in RayDesc rayDesc, // Ray to be traced in uint rayId, // Ray ID for profiling in uint rtIpLevel // HW version to determine TraceRay implementation +#if DEVELOPER + , in uint dynamicId // dynamic ID +#endif ) { -#if GPURT_CLIENT_INTERFACE_MAJOR_VERSION >= 41 - rayFlags = (rayFlags & ~AmdTraceRayGetKnownUnsetRayFlags()) | AmdTraceRayGetKnownSetRayFlags(); -#endif - // Initialise ray system state from TraceRay parameters _AmdRaySystemState ray = (_AmdRaySystemState)0; - ray.accelStruct = topLevelBvh; + ray.PackAccelStructAndRayflags(topLevelBvh, rayFlags); ray.direction = rayDesc.Direction; ray.origin = rayDesc.Origin; ray.tMin = rayDesc.TMin; ray.tMax = rayDesc.TMax; - ray.flags = rayFlags; ray.traceParameters = traceRayParameters; const bool isValid = true; // already verified in the caller _AmdDispatchSystemData dispatch = (_AmdDispatchSystemData)0; - dispatch.PackDispatchId(GetDispatchId()); + dispatch.PackDispatchId(AmdTraceRayDispatchRaysIndex()); #if DEVELOPER dispatch.parentId = -1; #endif @@ -2525,6 +2595,10 @@ static IntersectionResult TraceRayInternalCPSDebug( sysData.ray = ray; sysData.traversal = traversal; +#if DEVELOPER + sysData.counter.dynamicId = dynamicId; +#endif + // Begin outer while loop while (sysData.dispatch.nextNodePtr < TERMINAL_NODE) { @@ -2564,10 +2638,20 @@ static IntersectionResult TraceRayInternalCPSDebug( const uint64_t instNodePtr64 = CalculateInstanceNodePtr64(rtIpLevel, topLevelBvh, tlasNodePtr); if (state == TRAVERSAL_STATE_CANDIDATE_NON_OPAQUE_TRIANGLE) { + uint status = HIT_STATUS_ACCEPT; + // This test 
diff --git a/src/shaders/TrianglePrimitive.hlsl b/src/shaders/TrianglePrimitive.hlsl
index e2975dc..b32357e 100644
--- a/src/shaders/TrianglePrimitive.hlsl
+++ b/src/shaders/TrianglePrimitive.hlsl
@@ -224,7 +224,7 @@ TriangleData FetchTransformedTriangleData(
 //======================================================================================================================
 bool IsActive(TriangleData tri)
 {
-    return ((isnan(tri.v0.x) == false) && (isnan(tri.v1.x) == false) && (isnan(tri.v2.x) == false));
+    return (any(isnan(tri.v0)) == false) && (any(isnan(tri.v1)) == false) && (any(isnan(tri.v2)) == false);
 }

 //=====================================================================================================================
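// [Editorial note, not part of the patch] isnan() in HLSL is component-wise, so on a float3 it yields a bool3, and
// any() reduces that to a single bool. The change above therefore flags a triangle as inactive when any component of
// any vertex is NaN, instead of testing only the .x components:
//
//     bool vertexHasNaN = any(isnan(tri.v0)); // true if v0.x, v0.y, or v0.z is NaN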
diff --git a/src/shadersClean/traversal/TraversalDefs.hlsli b/src/shadersClean/traversal/TraversalDefs.hlsli
index 28f9999..ea8ea10 100644
--- a/src/shadersClean/traversal/TraversalDefs.hlsli
+++ b/src/shadersClean/traversal/TraversalDefs.hlsli
@@ -160,6 +160,15 @@ struct RayQueryInternal
 //=====================================================================================================================
 struct HitGroupInfo
 {
+#ifdef __cplusplus
+    HitGroupInfo(uint val)
+    {
+        memset(this, val, sizeof(HitGroupInfo));
+    }
+
+    HitGroupInfo() : HitGroupInfo(0)
+    {}
+#endif
     uint2 closestHitId;
     uint2 anyHitId;
     uint2 intersectionId;
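// [Editorial note, not part of the patch] The __cplusplus-only constructors make shared HLSL/C++ code such as
// "HitGroupInfo hitInfo = (HitGroupInfo)0;" valid on the C++ emulation path as well: HLSL reads it as a
// zero-initializing cast, while C++ resolves the cast to the converting constructor HitGroupInfo(0), which memsets
// the whole struct:
//
//     HitGroupInfo hitInfo = (HitGroupInfo)0; // C++: memset(this, 0, sizeof(HitGroupInfo))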
diff --git a/src/shared/rayTracingDefs.h b/src/shared/rayTracingDefs.h
index 5778dfd..636df92 100644
--- a/src/shared/rayTracingDefs.h
+++ b/src/shared/rayTracingDefs.h
@@ -115,8 +115,14 @@ struct EncodeTaskCountersCommon
 };

 //=====================================================================================================================
-struct EncodeTaskCountersBuild : EncodeTaskCountersCommon
+// There is a DXC bug that doesn't properly compile structure inheritance on the HLSL->SPIR-V path.
+// Once it is fixed, EncodeTaskCountersBuild and EncodeTaskCountersUpdate can inherit from EncodeTaskCountersCommon.
+// https://github.com/microsoft/DirectXShaderCompiler/issues/6986
+struct EncodeTaskCountersBuild
 {
+    uint numPrimitives;
+    uint primRefs;
+
     // The following indirect arguments are only used in the multi-dispatch path. Note, currently only HPLOC dispatch
     // uses these, but it will be extended to other passes when early pair compression is enabled.
     uint groupCountX;
@@ -135,8 +141,10 @@ static_assert(ENCODE_TASK_COUNTER_PRIM_REFS_OFFSET == offsetof(EncodeTaskCounter

 //=====================================================================================================================
 // Update scratch memory fields
-struct EncodeTaskCountersUpdate : EncodeTaskCountersCommon
+struct EncodeTaskCountersUpdate
 {
+    uint numPrimitives;
+    uint primRefs;
     uint refitTaskCounter;
     uint taskCount;
     uint tasksDone;
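// [Editorial note, not part of the patch] While the common counter fields are duplicated into both structs as a
// workaround, a C++-side guard could keep the flattened layouts from drifting apart until the DXC fix lands.
// A minimal sketch (hypothetical, placed next to the existing static_asserts):
//
//     static_assert(offsetof(EncodeTaskCountersBuild, primRefs) ==
//                   offsetof(EncodeTaskCountersUpdate, primRefs),
//                   "EncodeTaskCounters layouts must stay in sync");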
diff --git a/tools/CompileRTShaders.py b/tools/CompileRTShaders.py
index 95a90d3..5c691cd 100644
--- a/tools/CompileRTShaders.py
+++ b/tools/CompileRTShaders.py
@@ -43,7 +43,7 @@ DWORDS_PER_LINE = 8

 FILE_STANDARD_HEADER = """
-/* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. */
+/* Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. */
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 //
diff --git a/tools/DebugPreprocessShaders.py b/tools/DebugPreprocessShaders.py
index 4793b96..cc4f16a 100644
--- a/tools/DebugPreprocessShaders.py
+++ b/tools/DebugPreprocessShaders.py
@@ -29,7 +29,7 @@ import argparse

 cpp_file_header = """
-/* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. */
+/* Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. */

 namespace GpuRt {