From cf316364f35099647196a81fa9f882d564358a20 Mon Sep 17 00:00:00 2001 From: qiaojbao Date: Wed, 28 Aug 2024 18:04:13 +0800 Subject: [PATCH] Update gpurt from commit 5fa9a2c2 Add RayQuery entry functions Fix bug with calculating update scratch size Set default value for header.compactedSizeInBytes Fix update scratch size allocation for force rebuilds Fix remapping scratchBuffer issues Use _AmdIsLlpc() helper Add PAL interface number to compiler definition Disable shader include validation Enhance triangle pairing heuristic --- CMakeLists.txt | 4 + cmake/GpuRtGenerateShaders.cmake | 13 +- cmake/GpurtOptionsCodegen.cmake | 1 + gpurt/gpurtAccelStruct.h | 7 +- gpurt/gpurtBuildSettings.h | 2 + src/gpurtBvhBatcher.cpp | 2 + src/gpurtBvhBuilder.cpp | 267 +++++++++++++++------- src/gpurtBvhBuilder.h | 40 +++- src/gpurtDevice.cpp | 11 +- src/gpurtInternalShaderBindings.h | 10 + src/shaders/BuildBVHPLOC.hlsl | 2 - src/shaders/BuildBVHTDTR.hlsl | 14 +- src/shaders/BuildCommonScratch.hlsl | 22 ++ src/shaders/BuildParallel.hlsl | 70 +++--- src/shaders/BuildQBVH.hlsl | 8 +- src/shaders/BuildSettings.hlsli | 2 + src/shaders/Common.hlsl | 10 +- src/shaders/CompactCommon.hlsl | 17 +- src/shaders/EncodeCommon.hlsl | 7 +- src/shaders/EncodeHwBvhCommon.hlsl | 6 + src/shaders/EncodePairedTriangleImpl.hlsl | 63 +++-- src/shaders/EncodeTopLevel.hlsl | 1 + src/shaders/EncodeTopLevelUpdate.hlsl | 5 +- src/shaders/Extensions.hlsl | 19 ++ src/shaders/GenerateMortonCodes.hlsl | 2 +- src/shaders/GpuRtLibrary.hlsl | 60 ++++- src/shaders/GpuRtLibraryCont.hlsl | 90 ++++---- src/shaders/LaneGroup.hlsl | 16 ++ src/shaders/TaskCounter.hlsl | 24 +- src/shaders/TaskQueueCounter.hlsl | 10 - src/shaders/TraceRayCommon.hlsl | 22 ++ src/shaders/TrianglePrimitive.hlsl | 16 +- src/shaders/Update.hlsl | 21 +- src/shadersClean/common/Math.hlsli | 7 + tools/CompileRTShaders.py | 2 +- 35 files changed, 592 insertions(+), 281 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5be2de9..ea92db7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -115,6 +115,10 @@ if (DEFINED GPURT_CLIENT_INTERFACE_MAJOR_VERSION) gpurt_add_compile_definitions(GPURT_CLIENT_INTERFACE_MAJOR_VERSION=${GPURT_CLIENT_INTERFACE_MAJOR_VERSION}) endif() +if (DEFINED PAL_CLIENT_INTERFACE_MAJOR_VERSION) + gpurt_add_compile_definitions(PAL_CLIENT_INTERFACE_MAJOR_VERSION=${PAL_CLIENT_INTERFACE_MAJOR_VERSION}) +endif() + ### Add Source Directories target_include_directories(gpurt PUBLIC .) target_include_directories(gpurt_internal PUBLIC .) 
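//=====================================================================================================================
// A minimal usage sketch for the new PAL interface definition above (illustrative only, not part of this patch):
// defining PAL_CLIENT_INTERFACE_MAJOR_VERSION for the gpurt targets lets sources gate PAL-version-dependent code
// the same way the existing GPURT_CLIENT_INTERFACE_MAJOR_VERSION guards do (e.g. the
// "#if GPURT_CLIENT_INTERFACE_MAJOR_VERSION < 46" check later in this patch). The version number below is a
// placeholder, not a real PAL interface version.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 800
    // Take the newer PAL interface path.
#else
    // Fall back to the older PAL interface path.
#endif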
diff --git a/cmake/GpuRtGenerateShaders.cmake b/cmake/GpuRtGenerateShaders.cmake index faf053c..4654fa0 100644 --- a/cmake/GpuRtGenerateShaders.cmake +++ b/cmake/GpuRtGenerateShaders.cmake @@ -99,6 +99,8 @@ list(APPEND gpurtSharedDependencies ${gpurtCompileScript} ) +set(RT_SHADER_VALIDATION_COMMAND "") + # Create custom command that outputs the generated BVH shaders # The generated shaders depend on all the above mentioned files if(GPURT_CLIENT_API STREQUAL "VULKAN") @@ -130,16 +132,7 @@ if(GPURT_CLIENT_API STREQUAL "VULKAN") ${gpurtStripWhitelist} ${gpurtDxcCompiler} ${gpurtSpirvRemap} - - COMMAND Python3::Interpreter "${gpurtCompileScript}" - --outputDir "${gpurtOutputDir}" - --validateShadersClean - ${COMPILER_ARGUMENT} - --defines "\"${gpurtDefines}\"" - --includePaths "\"${gpurtIncludeDirectories}\"" - "${gpurtDxilBvhShader}" - "${gpurtShadersSourceDir}" - "${gpurtSscStrict}" + COMMAND ${RT_SHADER_VALIDATION_COMMAND} COMMAND Python3::Interpreter "${gpurtCompileScript}" --vulkan diff --git a/cmake/GpurtOptionsCodegen.cmake b/cmake/GpurtOptionsCodegen.cmake index 1910f65..9d6d87e 100644 --- a/cmake/GpurtOptionsCodegen.cmake +++ b/cmake/GpurtOptionsCodegen.cmake @@ -53,4 +53,5 @@ add_custom_target(generate_gpurtOptions_h ) target_include_directories(gpurt PUBLIC ${OUTDIR}) +target_sources(gpurt INTERFACE ${GPURTOPTIONS_OUTPUT}) diff --git a/gpurt/gpurtAccelStruct.h b/gpurt/gpurtAccelStruct.h index 6468547..3b35b30 100644 --- a/gpurt/gpurtAccelStruct.h +++ b/gpurt/gpurtAccelStruct.h @@ -94,6 +94,8 @@ struct AccelStructMetadataHeader // numTasksDone can be reset in one 64 bit CP write. uint32 numTasksDone; // Number of tasks done uint32 reserved0[16]; // Reserved + uint32 reserved1[3]; // Reserved + uint32 reserved2[3]; // Reserved }; #define ACCEL_STRUCT_METADATA_VA_LO_OFFSET 0 @@ -102,7 +104,9 @@ struct AccelStructMetadataHeader #define ACCEL_STRUCT_METADATA_TASK_COUNTER_OFFSET 12 #define ACCEL_STRUCT_METADATA_NUM_TASKS_DONE_OFFSET 16 #define ACCEL_STRUCT_METADATA_RESERVED_0 20 -#define ACCEL_STRUCT_METADATA_HEADER_SIZE 84 +#define ACCEL_STRUCT_METADATA_RESERVED_1 84 +#define ACCEL_STRUCT_METADATA_RESERVED_2 96 +#define ACCEL_STRUCT_METADATA_HEADER_SIZE 108 GPURT_STATIC_ASSERT(ACCEL_STRUCT_METADATA_HEADER_SIZE == sizeof(AccelStructMetadataHeader), "Acceleration structure header mismatch"); #ifdef __cplusplus @@ -110,6 +114,7 @@ GPURT_STATIC_ASSERT(ACCEL_STRUCT_METADATA_VA_LO_OFFSET == offsetof(AccelStructMe GPURT_STATIC_ASSERT(ACCEL_STRUCT_METADATA_VA_HI_OFFSET == offsetof(AccelStructMetadataHeader, addressHi), ""); GPURT_STATIC_ASSERT(ACCEL_STRUCT_METADATA_SIZE_OFFSET == offsetof(AccelStructMetadataHeader, sizeInBytes), ""); GPURT_STATIC_ASSERT(ACCEL_STRUCT_METADATA_TASK_COUNTER_OFFSET == offsetof(AccelStructMetadataHeader, taskCounter), ""); +GPURT_STATIC_ASSERT(ACCEL_STRUCT_METADATA_NUM_TASKS_DONE_OFFSET == offsetof(AccelStructMetadataHeader, numTasksDone), ""); #endif #ifdef __cplusplus diff --git a/gpurt/gpurtBuildSettings.h b/gpurt/gpurtBuildSettings.h index 8482537..cce62a4 100644 --- a/gpurt/gpurtBuildSettings.h +++ b/gpurt/gpurtBuildSettings.h @@ -99,6 +99,7 @@ struct CompileTimeBuildSettings uint32 unused11; uint32 unused12; uint32 unused13; + uint32 rebuildAccelStruct; }; #define BUILD_SETTINGS_DATA_TOP_LEVEL_BUILD_ID 0 @@ -134,6 +135,7 @@ struct CompileTimeBuildSettings #define BUILD_SETTINGS_DATA_ENCODE_ARRAY_OF_POINTERS_ID 41 #define BUILD_SETTINGS_DATA_SCENE_BOUNDS_CALCULATION_TYPE_ID 42 #define BUILD_SETTINGS_DATA_REBRAID_QUALITY_HEURISTIC_ID 43 +#define 
BUILD_SETTINGS_DATA_REBUILD_ACCELERATION_STRUCTURE_ID 47 #ifdef __cplusplus } // namespace GpuRt diff --git a/src/gpurtBvhBatcher.cpp b/src/gpurtBvhBatcher.cpp index 674eb65..08ee242 100644 --- a/src/gpurtBvhBatcher.cpp +++ b/src/gpurtBvhBatcher.cpp @@ -24,12 +24,14 @@ **********************************************************************************************************************/ #include "palCmdBuffer.h" +#include "palHashMapImpl.h" #include "palMetroHash.h" #include "palVectorImpl.h" #include "gpurt/gpurt.h" #include "gpurt/gpurtLib.h" #include "gpurt/gpurtAccelStruct.h" +#include "gpurt/gpurtInlineFuncs.h" #include "gpurtInternal.h" #include "gpurtInternalShaderBindings.h" #include "gpurtBvhBatcher.h" diff --git a/src/gpurtBvhBuilder.cpp b/src/gpurtBvhBuilder.cpp index 060ee74..feb9e0b 100644 --- a/src/gpurtBvhBuilder.cpp +++ b/src/gpurtBvhBuilder.cpp @@ -100,18 +100,9 @@ static VertexFormatInfo VertexFormatInfoTable[] = { Pal::ChNumFormat::X8Y8_Unorm, Pal::ChNumFormat::X8_Unorm, 2 }, }; -// ===================================================================================================================== -// Helper structure for encapsulating triangle index buffer information -struct IndexBufferInfo -{ - uint32 format; - uint64 byteOffset; - uint64 gpuVa; -}; - // ===================================================================================================================== // Helper function to convert triangle geometry information into index buffer info -IndexBufferInfo GetIndexBufferInfo( +IndexBufferInfo BvhBuilder::GetIndexBufferInfo( const GeometryTriangles& geometry) { IndexBufferInfo indexBuffer = {}; @@ -149,7 +140,7 @@ static uint32 DispatchSize( // ===================================================================================================================== // Helper function that calculates the block count for input number of triangles -static uint32 TrianglePairBlockCount( +uint32 BvhBuilder::TrianglePairBlockCount( uint32 numTriangles) { constexpr uint32 TrianglePairBlockSize = 64; @@ -454,19 +445,27 @@ BvhBuilder::BvhBuilder( const AccelStructBuildInfo& buildInfo) // Build args : m_pDevice(pDevice), - m_deviceSettings(deviceSettings), m_clientCb(clientCb), + m_deviceSettings(deviceSettings), + m_buildConfig({}), + m_resultOffsets({}), m_buildArgs(buildInfo), m_deviceProps(deviceProps), + m_metadataSizeInBytes(0), m_cmdBuffer(cmdBuffer), + m_scratchOffsets({}), m_backend(backend), m_buildSettings({}), + m_shaderConstantsGpuVa(0ull), + m_geomConstSrdTable(0ull), + m_geomBufferSrdTable(0ull), m_radixSortConfig(GetRadixSortConfig(deviceSettings)), m_emitCompactDstGpuVa(0ull), - m_buildSettingsHash(0) + m_buildSettingsHash(0u), + m_resultBufferInfo({}), + m_scratchBufferInfo({}), + m_dumpInfo({}) { - // Determine if the flags have to be overriden based on the build inputs. 
- m_buildArgs.inputs = m_pDevice->OverrideBuildInputs(m_buildArgs.inputs); InitializeBuildConfigs(); { @@ -491,16 +490,26 @@ BvhBuilder::BvhBuilder( const DeviceSettings& deviceSettings) // Device settings : m_pDevice(pDevice), - m_deviceSettings(deviceSettings), m_clientCb(clientCb), + m_deviceSettings(deviceSettings), + m_buildConfig({}), + m_resultOffsets({}), m_buildArgs(AccelStructBuildInfo{}), m_deviceProps(deviceProps), + m_metadataSizeInBytes(0), m_cmdBuffer(cmdBuffer), + m_scratchOffsets({}), m_backend(backend), m_buildSettings({}), + m_shaderConstantsGpuVa(0ull), + m_geomConstSrdTable(0ull), + m_geomBufferSrdTable(0ull), m_radixSortConfig(GetRadixSortConfig(deviceSettings)), m_emitCompactDstGpuVa(0ull), - m_buildSettingsHash(0) + m_buildSettingsHash(0u), + m_resultBufferInfo({}), + m_scratchBufferInfo({}), + m_dumpInfo({}) { InitCopySettings(); } @@ -549,10 +558,16 @@ BvhBuildMode BvhBuilder::OverrideBuildMode( // Remapped scratch buffer base address bool BvhBuilder::AllowRemappingScratchBuffer() const { + bool encodeQuadPrimitives = m_buildConfig.enableEarlyPairCompression; + + bool usePrimIndicesArray = false; + return (m_deviceSettings.enableRemapScratchBuffer == true) && (IsUpdate() == false) && - (m_deviceSettings.enableBuildAccelStructScratchDumping == false); + (m_deviceSettings.enableBuildAccelStructScratchDumping == false) && + (encodeQuadPrimitives == false) && + (usePrimIndicesArray == false); } // ===================================================================================================================== @@ -587,11 +602,41 @@ uint32 BvhBuilder::CalculateScratchBufferSize( return size; } +// ===================================================================================================================== +// Calculates the result buffer's metadata size +uint32 BvhBuilder::CalculateMetadataSize( + const uint32 internalNodeSize, + const uint32 leafNodeSize, + uint32* const pRunningOffset) +{ + uint metadataSizeInBytes; + { + metadataSizeInBytes = CalcMetadataSizeInBytes(internalNodeSize, leafNodeSize); + // Align metadata size to cache line + metadataSizeInBytes = Util::Pow2Align(metadataSizeInBytes, 128); + + *pRunningOffset += metadataSizeInBytes; + } + + return metadataSizeInBytes; +} + // ===================================================================================================================== // Calculates the result buffer offsets and returns the total result memory size BvhBuilder::ResultBufferInfo BvhBuilder::CalculateResultBufferInfo( AccelStructDataOffsets* pOffsets, - uint32* pMetadataSizeInBytes) + uint32* pMetadataSizeInBytes, + uint32 remapScratchBufferSize) +{ + return CalculateResultBufferInfoDefault(pOffsets, pMetadataSizeInBytes, remapScratchBufferSize); +} + +// ===================================================================================================================== +// Calculates the result buffer offsets and returns the total result memory size +BvhBuilder::ResultBufferInfo BvhBuilder::CalculateResultBufferInfoDefault( + AccelStructDataOffsets* pOffsets, + uint32* pMetadataSizeInBytes, + uint32 remapScratchBufferSize) { ResultBufferInfo info = {}; @@ -623,37 +668,40 @@ BvhBuilder::ResultBufferInfo BvhBuilder::CalculateResultBufferInfo( uint32 internalNodeSize = 0; uint32 leafNodeSize = 0; + uint32 nodeSize = 0; if (m_buildConfig.maxNumPrimitives > 0) { internalNodeSize = CalculateInternalNodesSize(); leafNodeSize = CalculateLeafNodesSize(); + nodeSize = internalNodeSize + leafNodeSize; offsets.internalNodes = 
ReserveBytes(internalNodeSize, &runningOffset); offsets.leafNodes = ReserveBytes(leafNodeSize, &runningOffset); + if (AllowRemappingScratchBuffer() && (remapScratchBufferSize > nodeSize)) + { + ReserveBytes(remapScratchBufferSize - nodeSize, &runningOffset); + + nodeSize = remapScratchBufferSize; + } + if (m_buildConfig.topLevelBuild == false) { const uint32 geometryInfoSize = CalculateGeometryInfoSize(m_buildArgs.inputs.inputElemCount); offsets.geometryInfo = ReserveBytes(geometryInfoSize, &runningOffset); } - offsets.primNodePtrs = ReserveBytes(m_buildConfig.maxNumPrimitives * sizeof(uint32), &runningOffset); + { + offsets.primNodePtrs = ReserveBytes(m_buildConfig.maxNumPrimitives * sizeof(uint32), &runningOffset); + } } uint32 totalSizeInBytes = runningOffset; // Metadata section is at the beginning of the acceleration structure buffer - uint32 metadataSizeInBytes; - { - metadataSizeInBytes = CalcMetadataSizeInBytes(internalNodeSize, leafNodeSize); - // Align metadata size to cache line - metadataSizeInBytes = Util::Pow2Align(metadataSizeInBytes, 128); - - totalSizeInBytes += metadataSizeInBytes; - } - + const uint32 metadataSizeInBytes = CalculateMetadataSize(internalNodeSize, leafNodeSize, &totalSizeInBytes); if (pOffsets != nullptr) { memcpy(pOffsets, &offsets, sizeof(offsets)); @@ -664,8 +712,8 @@ BvhBuilder::ResultBufferInfo BvhBuilder::CalculateResultBufferInfo( *pMetadataSizeInBytes = metadataSizeInBytes; } - info.baseOffset = metadataSizeInBytes + sizeof(AccelStructHeader); - info.nodeSize = internalNodeSize + leafNodeSize; + info.baseOffset = sizeof(AccelStructHeader); + info.nodeSize = nodeSize; info.dataSize = totalSizeInBytes; return info; } @@ -1082,7 +1130,6 @@ BvhBuilder::ScratchBufferInfo BvhBuilder::CalculateScratchBufferInfoDefault( neighbourIndices = ReserveBytes(aabbCount * sizeof(uint32), &runningOffset); // TODO: calculate number of blocks based on KEYS_PER_THREAD atomicFlagsPloc = ReserveBytes(aabbCount * RayTracingPLOCFlags, &runningOffset); - clusterOffsets = ReserveBytes(aabbCount * sizeof(uint32), &runningOffset); } } bvh2PhaseMaxSize = Util::Max(bvh2PhaseMaxSize, runningOffset); @@ -1204,6 +1251,24 @@ GeometryType BvhBuilder::GetGeometryType( return type; } +// ===================================================================================================================== +bool BvhBuilder::ForceRebuild( + const Internal::Device* pDevice, + const AccelStructBuildInputs inputs) +{ + const DeviceSettings settings = pDevice->Settings(); + const bool rebuildTopLevel = + Util::TestAnyFlagSet(settings.forceRebuildForUpdates, ForceRebuildForUpdatesMode::TopLevel) && + (inputs.type == GpuRt::AccelStructType::TopLevel); + const bool rebuildBottomLevel = + Util::TestAnyFlagSet(settings.forceRebuildForUpdates, ForceRebuildForUpdatesMode::BottomLevel) && + (inputs.type == GpuRt::AccelStructType::BottomLevel); + + bool rebuildAS = rebuildBottomLevel || rebuildTopLevel; + + return rebuildAS; +} + // ===================================================================================================================== // Initialize buildConfig void BvhBuilder::InitBuildConfig( @@ -1211,6 +1276,13 @@ void BvhBuilder::InitBuildConfig( { m_buildConfig = {}; + if (ForceRebuild(m_pDevice, m_buildArgs.inputs)) + { + // Determine if the flags have to be overridden based on the build inputs. 
+ m_buildArgs.inputs = m_pDevice->OverrideBuildInputs(m_buildArgs.inputs); + m_buildConfig.rebuildAccelStruct = true; + } + // For top-level acceleration structure, inputElementCount represents the number of instances uint32 primitiveCount = (buildArgs.inputs.type == AccelStructType::BottomLevel) ? 0 : buildArgs.inputs.inputElemCount; @@ -1272,8 +1344,7 @@ void BvhBuilder::InitBuildConfig( m_buildConfig.topDownBuild = m_buildConfig.allowTopDownBuild && (buildArgs.inputs.inputElemCount <= m_deviceSettings.maxTopDownBuildInstances); - if ((Util::TestAnyFlagSet(m_buildArgs.inputs.flags, AccelStructBuildFlagAllowUpdate) == false) && - m_buildConfig.topLevelBuild) + if ((UpdateAllowed() == false) && m_buildConfig.topLevelBuild) { if (m_buildConfig.rebraidType == RebraidType::V1) { @@ -1300,8 +1371,7 @@ void BvhBuilder::InitBuildConfig( m_buildConfig.triangleSplitting = (m_deviceSettings.enableParallelBuild) && m_deviceSettings.enableTriangleSplitting && (buildArgs.inputs.type == AccelStructType::BottomLevel) && - (Util::TestAnyFlagSet(buildArgs.inputs.flags, AccelStructBuildFlagAllowUpdate) == false) && - Util::TestAnyFlagSet(buildArgs.inputs.flags, AccelStructBuildFlagPreferFastTrace); + (UpdateAllowed() == false) && Util::TestAnyFlagSet(buildArgs.inputs.flags, AccelStructBuildFlagPreferFastTrace); m_buildConfig.buildMode = OverrideBuildMode(buildArgs); @@ -1384,7 +1454,8 @@ void BvhBuilder::InitBuildConfig( (IsUpdate() && (m_deviceSettings.enableMergedEncodeUpdate == 0)) || ((IsUpdate() == false) && (m_deviceSettings.enableMergedEncodeBuild == 0)) || - (buildArgs.inputs.type == AccelStructType::TopLevel) || + ((buildArgs.inputs.type == AccelStructType::TopLevel) + ) || ((IsUpdate() == false) && (m_buildConfig.geometryType == GeometryType::Aabbs)) #if GPURT_CLIENT_INTERFACE_MAJOR_VERSION < 46 || (buildArgs.inputs.inputElemCount > maxDescriptorTableSize) @@ -1552,6 +1623,7 @@ AccelStructHeader BvhBuilder::InitAccelStructHeader() const header.accelStructVersion = GPURT_ACCEL_STRUCT_VERSION; header.metadataSizeInBytes = m_metadataSizeInBytes; header.sizeInBytes = accelStructSize; + header.compactedSizeInBytes = accelStructSize; header.numPrimitives = m_buildConfig.maxNumPrimitives; // Is this correct? header.numDescs = m_buildArgs.inputs.inputElemCount; header.geometryType = static_cast(m_buildConfig.geometryType); @@ -1823,9 +1895,9 @@ void BvhBuilder::InitAccelerationStructure() } } - // Merged encode/build writes the header using the shader. However, we don't launch the build shader in the case - // of an empty BVH. - if (m_buildConfig.needEncodeDispatch || (m_buildConfig.maxNumPrimitives == 0)) + // Merged encode/build and BuildParallel write the header using the shader. + // However, we don't launch the build shader in the case of an empty BVH. 
+ if ((m_buildConfig.maxNumPrimitives == 0) || (m_deviceSettings.enableParallelBuild == false)) { WriteImmediateData(HeaderBufferBaseVa(), InitAccelStructMetadataHeader()); WriteImmediateData(ResultBufferBaseVa(), InitAccelStructHeader()); @@ -2176,6 +2248,7 @@ void BvhBuilder::InitBuildSettings() m_buildSettings.updateFlags = m_buildArgs.inputs.flags & (AccelStructBuildFlagPerformUpdate | AccelStructBuildFlagAllowUpdate); + m_buildSettings.rebuildAccelStruct = m_buildConfig.rebuildAccelStruct; m_buildSettings.isUpdateInPlace = IsUpdateInPlace(); m_buildSettings.encodeArrayOfPointers = @@ -2215,19 +2288,26 @@ void BvhBuilder::GetAccelerationStructurePrebuildInfo( AccelStructPrebuildInfo prebuildInfo = {}; - // Calculate the amount of space needed to store the result. - const ResultBufferInfo resultBufferInfo = CalculateResultBufferInfo(nullptr, nullptr); - const uint32 resultDataSize = resultBufferInfo.dataSize; - // Calculate the amount of scratch space needed during the construction process. const ScratchBufferInfo scratchBufferInfo = CalculateScratchBufferInfo(nullptr); + + // Calculate the amount of space needed to store the result. + const ResultBufferInfo resultBufferInfo = CalculateResultBufferInfo(nullptr, nullptr, scratchBufferInfo.bvh2PhaseSize); + uint32 scratchDataSize = CalculateScratchBufferSize(resultBufferInfo, scratchBufferInfo); + const uint32 resultDataSize = resultBufferInfo.dataSize; uint32 updateDataSize = 0; - if (UpdateAllowed()) + if (m_buildConfig.rebuildAccelStruct) + { + // When we force a rebuild, the acceleration structure should use the scratch data size for updates + updateDataSize = Util::Max(1u, scratchDataSize); + } + else if (UpdateAllowed()) { updateDataSize = Util::Max(1u, CalculateUpdateScratchBufferInfo(nullptr)); } + // Scratch size for builds may be smaller than updates, some apps will still try to use the scratch size from // the build when performing the update causing page faults. 
scratchDataSize = Util::Max(scratchDataSize, updateDataSize); @@ -2380,8 +2460,26 @@ void BvhBuilder::PreBuildDumpEvents() { PAL_ASSERT(HasBuildDumpEvents()); - const ResultBufferInfo resultBufferInfo = CalculateResultBufferInfo(nullptr, nullptr); - const uint32 resultDataSize = resultBufferInfo.dataSize; + uint32 scratchDataSize; + uint32 resultDataSize; + + if (IsUpdate() == false) + { + const ScratchBufferInfo scratchBufferInfo = CalculateScratchBufferInfo(nullptr); + + const ResultBufferInfo resultBufferInfo = CalculateResultBufferInfo(nullptr, nullptr, scratchBufferInfo.bvh2PhaseSize); + + scratchDataSize = CalculateScratchBufferSize(resultBufferInfo, scratchBufferInfo); + + resultDataSize = resultBufferInfo.dataSize; + } + else + { + scratchDataSize = CalculateUpdateScratchBufferInfo(nullptr); + + const ResultBufferInfo resultBufferInfo = CalculateResultBufferInfo(nullptr, nullptr, 0); + resultDataSize = resultBufferInfo.dataSize; + } // Initialise acceleration structure information for dump purposes m_dumpInfo = {}; @@ -2397,19 +2495,10 @@ void BvhBuilder::PreBuildDumpEvents() m_dumpInfo.gpuVa = HeaderBufferBaseVa(); m_dumpInfo.sizeInBytes = resultDataSize; m_dumpInfo.scratchGpuVa = ScratchBufferBaseVa(); + m_dumpInfo.scratchSizeInBytes = scratchDataSize; m_dumpInfo.pTimeStampVidMem = nullptr; m_dumpInfo.timeStampVidMemoffset = 0; - if (IsUpdate() == false) - { - const ScratchBufferInfo scratchBufferInfo = CalculateScratchBufferInfo(nullptr); - m_dumpInfo.scratchSizeInBytes = CalculateScratchBufferSize(resultBufferInfo, scratchBufferInfo); - } - else - { - m_dumpInfo.scratchSizeInBytes = CalculateUpdateScratchBufferInfo(nullptr); - } - if (m_deviceSettings.enableBuildAccelStructStats) { #if GPURT_CLIENT_INTERFACE_MAJOR_VERSION >= 39 @@ -2509,18 +2598,20 @@ void BvhBuilder::InitializeBuildConfigs() InitBuildConfig(m_buildArgs); InitBuildSettings(); - m_resultBufferInfo = CalculateResultBufferInfo(&m_resultOffsets, &m_metadataSizeInBytes); - - m_scratchBufferInfo = {}; if (IsUpdate() == false) { // Compute the offsets into the scratch buffer for all of our scratch resources. m_scratchBufferInfo = CalculateScratchBufferInfo(&m_scratchOffsets); + + m_resultBufferInfo = CalculateResultBufferInfo(&m_resultOffsets, &m_metadataSizeInBytes, m_scratchBufferInfo.bvh2PhaseSize); } else { // Compute the offsets into the scratch buffer for all of our scratch resources. + m_scratchBufferInfo = {}; CalculateUpdateScratchBufferInfo(&m_scratchOffsets); + + m_resultBufferInfo = CalculateResultBufferInfo(&m_resultOffsets, &m_metadataSizeInBytes, 0); } // Add tlas to m_tlas @@ -3487,46 +3578,46 @@ void BvhBuilder::UpdateParallel() // Perform geometry encoding and update in one dispatch void BvhBuilder::EncodeUpdate() { - BindPipeline((m_buildConfig.geometryType == GeometryType::Triangles) ? - InternalRayTracingCsType::UpdateTriangles : InternalRayTracingCsType::UpdateAabbs); + uint32 numThreadGroups = 0; - const uint32 threadGroupSize = DefaultThreadGroupSize; - const uint32 wavesPerSimd = 8; - uint32 numThreadGroups = 0; { - const uint32 numWorkItems = Util::Max(1u, m_buildConfig.numPrimitives); - numThreadGroups = GetNumPersistentThreadGroups(numWorkItems, threadGroupSize, wavesPerSimd); - } - const uint32 numThreads = numThreadGroups * threadGroupSize; + BindPipeline((m_buildConfig.geometryType == GeometryType::Triangles) ? 
+ InternalRayTracingCsType::UpdateTriangles : InternalRayTracingCsType::UpdateAabbs); - uint32 entryOffset = 0; + const uint32 threadGroupSize = DefaultThreadGroupSize; + const uint32 wavesPerSimd = 8; + const uint32 numWorkItems = Util::Max(1u, m_buildConfig.numPrimitives); + const uint32 numThreadGroups = GetNumPersistentThreadGroups(numWorkItems, threadGroupSize, wavesPerSimd); + const uint32 numThreads = numThreadGroups * threadGroupSize; - const Update::Constants shaderConstants = - { - .numThreads = numThreads, - }; + uint32 entryOffset = 0; - // Set shader constants - entryOffset = WriteUserDataEntries(&shaderConstants, Update::NumEntries, entryOffset); + const Update::Constants shaderConstants = + { + .numThreads = numThreads, + }; - entryOffset = WriteBuildShaderConstantBuffer(entryOffset); + // Set shader constants + entryOffset = WriteUserDataEntries(&shaderConstants, Update::NumEntries, entryOffset); - // Set result/scratch/source buffers - entryOffset = WriteUpdateBuffers(entryOffset); + entryOffset = WriteBuildShaderConstantBuffer(entryOffset); - const uint32 cbvSrdTableGpuVaLo = Util::LowPart(m_geomConstSrdTable); - entryOffset = WriteUserDataEntries(&cbvSrdTableGpuVaLo, 1, entryOffset); + // Set result/scratch/source buffers + entryOffset = WriteUpdateBuffers(entryOffset); - const uint32 vbvSrdTableGpuVaLo = Util::LowPart(m_geomBufferSrdTable); - entryOffset = WriteUserDataEntries(&vbvSrdTableGpuVaLo, 1, entryOffset); + const uint32 cbvSrdTableGpuVaLo = Util::LowPart(m_geomConstSrdTable); + entryOffset = WriteUserDataEntries(&cbvSrdTableGpuVaLo, 1, entryOffset); - // NullBuffer binding - entryOffset = WriteBufferVa(0, entryOffset); + const uint32 vbvSrdTableGpuVaLo = Util::LowPart(m_geomBufferSrdTable); + entryOffset = WriteUserDataEntries(&vbvSrdTableGpuVaLo, 1, entryOffset); - RGP_PUSH_MARKER("Update (NumPrimitives=%u)", m_buildConfig.maxNumPrimitives); - Dispatch(numThreadGroups); + // NullBuffer binding + entryOffset = WriteBufferVa(0, entryOffset); - RGP_POP_MARKER(); + RGP_PUSH_MARKER("Update (NumPrimitives=%u)", m_buildConfig.maxNumPrimitives); + Dispatch(numThreadGroups); + RGP_POP_MARKER(); + } } // ===================================================================================================================== diff --git a/src/gpurtBvhBuilder.h b/src/gpurtBvhBuilder.h index 876a7ec..4af762f 100644 --- a/src/gpurtBvhBuilder.h +++ b/src/gpurtBvhBuilder.h @@ -36,6 +36,15 @@ namespace EncodeNodes struct Constants; } +// ===================================================================================================================== +// Helper structure for encapsulating triangle index buffer information +struct IndexBufferInfo +{ + uint32 format; + uint64 byteOffset; + uint64 gpuVa; +}; + // ===================================================================================================================== // Helper class used by GPURT to perform various BVH operations like building, copying, etc. 
class BvhBuilder { @@ -66,6 +75,11 @@ class BvhBuilder static uint32 CalculateGeometryInfoSize( uint32 numGeometryDescs); + // Helper function that reports whether a forced rebuild is required + static bool ForceRebuild( + const Internal::Device* pDevice, + const AccelStructBuildInputs inputs); + // Builds or updates an acceleration structure and stores it in a result buffer void BuildRaytracingAccelerationStructure(); @@ -106,14 +120,12 @@ class BvhBuilder ResultBufferInfo CalculateResultBufferInfo( AccelStructDataOffsets* pOffsets, - uint32* pMetadataSizeInBytes); + uint32* pMetadataSizeInBytes, + uint remapScratchBufferSize); ScratchBufferInfo CalculateScratchBufferInfo( RayTracingScratchDataOffsets* pOffsets); - ScratchBufferInfo CalculateScratchBufferInfoDefault( - RayTracingScratchDataOffsets* pOffsets); - uint32 CalculateUpdateScratchBufferInfo( RayTracingScratchDataOffsets* pOffsets); @@ -169,6 +181,12 @@ class BvhBuilder static uint32 GetGeometryPrimCount( const Geometry& geometry); + static IndexBufferInfo GetIndexBufferInfo( + const GeometryTriangles& geometry); + + static uint32 TrianglePairBlockCount( + uint32 numTriangles); + private: // Configs that change within build calls, private to the bvh builder. struct { @@ -205,6 +223,7 @@ class BvhBuilder bool enableFastLBVH; bool enableMergeSort; bool enableInstanceRebraid; + bool rebuildAccelStruct; }; BvhBuilder( @@ -213,6 +232,19 @@ class BvhBuilder ClientCallbacks clientCb, const DeviceSettings& deviceSettings); + uint32 CalculateMetadataSize( + const uint32 internalNodeSize, + const uint32 leafNodeSize, + uint32* const pRunningOffset); + + ResultBufferInfo CalculateResultBufferInfoDefault( + AccelStructDataOffsets* pOffsets, + uint32* pMetadataSizeInBytes, + uint remapScratchBufferSize); + + ScratchBufferInfo CalculateScratchBufferInfoDefault( + RayTracingScratchDataOffsets* pOffsets); + uint32 CalculateInternalNodesSize()const; uint32 CalculateLeafNodesSize() const; uint32 CalculateNodesSize() const; diff --git a/src/gpurtDevice.cpp b/src/gpurtDevice.cpp index ce6f940..b1368f0 100644 --- a/src/gpurtDevice.cpp +++ b/src/gpurtDevice.cpp @@ -2128,16 +2128,7 @@ const AccelStructBuildInputs Device::OverrideBuildInputs( ) const { AccelStructBuildInputs buildInputs = inputs; - - const bool rebuildTopLevel = - ( - Util::TestAnyFlagSet(Settings().forceRebuildForUpdates, ForceRebuildForUpdatesMode::TopLevel)) && - (buildInputs.type == GpuRt::AccelStructType::TopLevel); - const bool rebuildBottomLevel = - Util::TestAnyFlagSet(Settings().forceRebuildForUpdates, ForceRebuildForUpdatesMode::BottomLevel) && - (buildInputs.type == GpuRt::AccelStructType::BottomLevel); - - bool rebuildAS = rebuildBottomLevel || rebuildTopLevel; + const bool rebuildAS = BvhBuilder::ForceRebuild(this, inputs); if (rebuildAS) { diff --git a/src/gpurtInternalShaderBindings.h b/src/gpurtInternalShaderBindings.h index 4b26c92..1785233 100644 --- a/src/gpurtInternalShaderBindings.h +++ b/src/gpurtInternalShaderBindings.h @@ -82,6 +82,16 @@ namespace CopyAS constexpr uint32 NumEntries = (sizeof(Constants) / sizeof(uint32)); } +namespace BuildTrivialBvh +{ + struct Constants + { + uint32 maxGeometryCount; + }; + + constexpr uint32 NumEntries = (sizeof(Constants) / sizeof(uint32)); +} + namespace CompactAS { struct Constants diff --git a/src/shaders/BuildBVHPLOC.hlsl b/src/shaders/BuildBVHPLOC.hlsl index abb971e..effeb80 100644 --- a/src/shaders/BuildBVHPLOC.hlsl +++ b/src/shaders/BuildBVHPLOC.hlsl @@ -56,7 +56,6 @@ struct BuildPlocArgs uint currentStateScratchOffset; uint 
taskQueueCounterScratchOffset; uint atomicFlagsScratchOffset; - uint offsetsScratchOffset; uint dynamicBlockIndexScratchOffset; uint numBatchesScratchOffset; uint baseBatchIndicesScratchOffset; @@ -878,7 +877,6 @@ void BuildBVHPLOC( plocArgs.currentStateScratchOffset = ShaderConstants.offsets.currentState; plocArgs.taskQueueCounterScratchOffset = ShaderConstants.offsets.plocTaskQueueCounter; plocArgs.atomicFlagsScratchOffset = ShaderConstants.offsets.atomicFlagsPloc; - plocArgs.offsetsScratchOffset = ShaderConstants.offsets.clusterOffsets; plocArgs.dynamicBlockIndexScratchOffset = ShaderConstants.offsets.dynamicBlockIndex; plocArgs.numBatchesScratchOffset = ShaderConstants.offsets.numBatches; plocArgs.baseBatchIndicesScratchOffset = ShaderConstants.offsets.batchIndices; diff --git a/src/shaders/BuildBVHTDTR.hlsl b/src/shaders/BuildBVHTDTR.hlsl index e9d568d..ed21d28 100644 --- a/src/shaders/BuildBVHTDTR.hlsl +++ b/src/shaders/BuildBVHTDTR.hlsl @@ -984,23 +984,13 @@ void BuildBVHTDImpl( uint numRefsAllocated = ScratchBuffer.Load(numRefsAllocatedOffset); if (globalId == 0) { - UintBoundingBox sceneBounds; - - uint4 data; - data = ScratchBuffer.Load4(args.SceneBoundsOffset); // todo: recalc based on ACTIVE nodes - sceneBounds.min = data.xyz; - data.xy = ScratchBuffer.Load2(args.SceneBoundsOffset + 0x10); - sceneBounds.max = data.wxy; - - BoundingBox bbox; - bbox.min = Uint3ToFloat3(sceneBounds.min); - bbox.max = Uint3ToFloat3(sceneBounds.max); + BoundingBox bbox = FetchSceneBounds(args.SceneBoundsOffset); // todo: recalc based on ACTIVE nodes BoundingBox bboxCentroid; UintBoundingBox boxCentroidUint; - data = ScratchBuffer.Load4(args.CurrentStateScratchOffset + STATE_TD_CENTROID_BBOX_OFFSET); + uint4 data = ScratchBuffer.Load4(args.CurrentStateScratchOffset + STATE_TD_CENTROID_BBOX_OFFSET); boxCentroidUint.min = data.xyz; data.xy = ScratchBuffer.Load2(args.CurrentStateScratchOffset + STATE_TD_CENTROID_BBOX_OFFSET + 0x10); boxCentroidUint.max = data.wxy; diff --git a/src/shaders/BuildCommonScratch.hlsl b/src/shaders/BuildCommonScratch.hlsl index 579920d..cd54497 100644 --- a/src/shaders/BuildCommonScratch.hlsl +++ b/src/shaders/BuildCommonScratch.hlsl @@ -591,6 +591,28 @@ float2 FetchSceneSize(uint sceneBoundsOffset) return minMax; } +//===================================================================================================================== +void InitSceneBounds(uint sceneBoundsOffset) +{ + // Initialize scene bounds + const uint maxVal = FloatToUint(FLT_MAX); + const uint minVal = FloatToUint(-FLT_MAX); + + ScratchBuffer.Store3(sceneBoundsOffset, maxVal.xxx); + sceneBoundsOffset += sizeof(uint3); + ScratchBuffer.Store3(sceneBoundsOffset, minVal.xxx); + sceneBoundsOffset += sizeof(uint3); + ScratchBuffer.Store2(sceneBoundsOffset, uint2(maxVal, minVal)); + sceneBoundsOffset += sizeof(uint2); + + if (Settings.rebraidType == RebraidType::V2) + { + ScratchBuffer.Store3(sceneBoundsOffset, maxVal.xxx); + sceneBoundsOffset += sizeof(uint3); + ScratchBuffer.Store3(sceneBoundsOffset, minVal.xxx); + } +} + //====================================================================================================================== uint GetBvhNodesOffset( uint numActivePrims, diff --git a/src/shaders/BuildParallel.hlsl b/src/shaders/BuildParallel.hlsl index 73084ba..e557003 100644 --- a/src/shaders/BuildParallel.hlsl +++ b/src/shaders/BuildParallel.hlsl @@ -260,7 +260,6 @@ void BuildBvhPloc( plocArgs.currentStateScratchOffset = ShaderConstants.offsets.currentState; 
plocArgs.taskQueueCounterScratchOffset = ShaderConstants.offsets.plocTaskQueueCounter; plocArgs.atomicFlagsScratchOffset = ShaderConstants.offsets.atomicFlagsPloc; - plocArgs.offsetsScratchOffset = ShaderConstants.offsets.clusterOffsets; plocArgs.dynamicBlockIndexScratchOffset = ShaderConstants.offsets.dynamicBlockIndex; plocArgs.numBatchesScratchOffset = ShaderConstants.offsets.numBatches; plocArgs.baseBatchIndicesScratchOffset = ShaderConstants.offsets.batchIndices; @@ -401,44 +400,31 @@ void InitAccelerationStructure() DstBuffer.Store(0, ShaderConstants.header); - // Initialise encode counters - WriteTaskCounterData( - ShaderConstants.offsets.encodeTaskCounter, ENCODE_TASK_COUNTER_NUM_PRIMITIVES_OFFSET, 0); - - // Early triangle pairing and triangle splitting dynamically increment primitive reference counter. Initialise - // counters to 0 when these features are enabled - - const bool dynamicallyIncrementsPrimRefCount = - Settings.enableEarlyPairCompression || Settings.doTriangleSplitting || Settings.isIndirectBuild; - const uint primRefInitCount = - (dynamicallyIncrementsPrimRefCount) ? 0 : ShaderConstants.numPrimitives; + if (Settings.doEncode) + { + // Initialise encode counters + WriteTaskCounterData( + ShaderConstants.offsets.encodeTaskCounter, ENCODE_TASK_COUNTER_NUM_PRIMITIVES_OFFSET, 0); - WriteTaskCounterData( - ShaderConstants.offsets.encodeTaskCounter, ENCODE_TASK_COUNTER_PRIM_REFS_OFFSET, primRefInitCount); + // Early triangle pairing and triangle splitting dynamically increment primitive reference counter. Initialise + // counters to 0 when these features are enabled - // Initialize valid scratch buffer counters to 0 - InitScratchCounter(ShaderConstants.offsets.plocTaskQueueCounter); - InitScratchCounter(ShaderConstants.offsets.tdTaskQueueCounter); - InitScratchCounter(CurrentSplitTaskQueueCounter()); - ClearNumBatches(ShaderConstants.offsets.numBatches); + const bool dynamicallyIncrementsPrimRefCount = + Settings.enableEarlyPairCompression || Settings.doTriangleSplitting || Settings.isIndirectBuild; + const uint primRefInitCount = + (dynamicallyIncrementsPrimRefCount) ? 
0 : ShaderConstants.numPrimitives; - // Initialize scene bounds - const uint maxVal = FloatToUint(FLT_MAX); - const uint minVal = FloatToUint(-FLT_MAX); + WriteTaskCounterData( + ShaderConstants.offsets.encodeTaskCounter, ENCODE_TASK_COUNTER_PRIM_REFS_OFFSET, primRefInitCount); - uint offset = ShaderConstants.offsets.sceneBounds; - ScratchBuffer.Store3(offset, maxVal.xxx); - offset += sizeof(uint3); - ScratchBuffer.Store3(offset, minVal.xxx); - offset += sizeof(uint3); - ScratchBuffer.Store2(offset, uint2(maxVal, minVal)); - offset += sizeof(uint2); + // Initialize valid scratch buffer counters to 0 + InitScratchCounter(ShaderConstants.offsets.plocTaskQueueCounter); + InitScratchCounter(ShaderConstants.offsets.tdTaskQueueCounter); + InitScratchCounter(CurrentSplitTaskQueueCounter()); + ClearNumBatches(ShaderConstants.offsets.numBatches); - if (Settings.rebraidType == RebraidType::V2) - { - ScratchBuffer.Store3(offset, maxVal.xxx); - offset += sizeof(uint3); - ScratchBuffer.Store3(offset, minVal.xxx); + // Initialize scene bounds + InitSceneBounds(ShaderConstants.offsets.sceneBounds); } } @@ -503,17 +489,17 @@ void BuildBvh( INIT_TASK; - if (Settings.doEncode) - { - BEGIN_TASK(1); + BEGIN_TASK(1); - if (globalId == 0) - { - InitAccelerationStructure(); - } + if (globalId == 0) + { + InitAccelerationStructure(); + } - END_TASK(1); + END_TASK(1); + if (Settings.doEncode) + { BEGIN_TASK(ShaderRootConstants.NumThreadGroups()); EncodePrimitives(globalId, localId); diff --git a/src/shaders/BuildQBVH.hlsl b/src/shaders/BuildQBVH.hlsl index 2f03134..9dd9e30 100644 --- a/src/shaders/BuildQBVH.hlsl +++ b/src/shaders/BuildQBVH.hlsl @@ -604,11 +604,9 @@ static void PullUpChildren( } else { - { - // Note, box node flags are combined together by using an AND operation. Thus, we need to initialise - // invalid child flags as 0xff - boxNodeFlags = SetBoxNodeFlagsField(boxNodeFlags, 0xff, i); - } + // Note, box node flags are combined together by using an AND operation. Thus, we need to initialise + // invalid child flags as 0xff + boxNodeFlags = SetBoxNodeFlagsField(boxNodeFlags, 0xff, i); } } diff --git a/src/shaders/BuildSettings.hlsli b/src/shaders/BuildSettings.hlsli index 4d72cb5..6bb5026 100644 --- a/src/shaders/BuildSettings.hlsli +++ b/src/shaders/BuildSettings.hlsli @@ -59,6 +59,7 @@ [[vk::constant_id(BUILD_SETTINGS_DATA_ENCODE_ARRAY_OF_POINTERS_ID)]] uint encodeArrayOfPointers = 0; [[vk::constant_id(BUILD_SETTINGS_DATA_SCENE_BOUNDS_CALCULATION_TYPE_ID)]] uint sceneBoundsCalculationType = 0; [[vk::constant_id(BUILD_SETTINGS_DATA_REBRAID_QUALITY_HEURISTIC_ID)]] uint rebraidQualityHeuristic = 0; +[[vk::constant_id(BUILD_SETTINGS_DATA_REBUILD_ACCELERATION_STRUCTURE_ID)]] uint rebuildAccelStruct = 0; static const CompileTimeBuildSettings Settings = { topLevelBuild, @@ -108,6 +109,7 @@ static const CompileTimeBuildSettings Settings = { 0, 0, 0, + rebuildAccelStruct, }; #endif diff --git a/src/shaders/Common.hlsl b/src/shaders/Common.hlsl index b8c594b..d4562fb 100644 --- a/src/shaders/Common.hlsl +++ b/src/shaders/Common.hlsl @@ -270,12 +270,10 @@ static uint64_t PackInstanceBasePointer(GpuVirtualAddress instanceVa, uint insta instanceBasePointer |= (instanceFlags & D3D12_RAYTRACING_INSTANCE_FLAG_FORCE_NON_OPAQUE) ? (1ull << NODE_POINTER_FORCE_NON_OPAQUE_SHIFT) : 0; - { - // Set 'Skip Procedural' for triangles and 'Skip Triangles' for procedural geometry - instanceBasePointer |= (geometryType == GEOMETRY_TYPE_TRIANGLES) - ? 
(1ull << NODE_POINTER_SKIP_PROCEDURAL_SHIFT) - : (1ull << NODE_POINTER_SKIP_TRIANGLES_SHIFT); - } + // Set 'Skip Procedural' for triangles and 'Skip Triangles' for procedural geometry + instanceBasePointer |= (geometryType == GEOMETRY_TYPE_TRIANGLES) + ? (1ull << NODE_POINTER_SKIP_PROCEDURAL_SHIFT) + : (1ull << NODE_POINTER_SKIP_TRIANGLES_SHIFT); instanceBasePointer |= (geometryType == GEOMETRY_TYPE_AABBS) ? (1ull << NODE_POINTER_DISABLE_TRIANGLE_CULL_SHIFT) : 0; diff --git a/src/shaders/CompactCommon.hlsl b/src/shaders/CompactCommon.hlsl index 8d41575..b504f2a 100644 --- a/src/shaders/CompactCommon.hlsl +++ b/src/shaders/CompactCommon.hlsl @@ -35,7 +35,7 @@ uint CalcCompactedSize( // Acceleration structure data starts with the header (not including the metadata) uint runningOffset = sizeof(AccelStructHeader); - AccelStructOffsets offsets; + AccelStructOffsets offsets = (AccelStructOffsets)0; offsets.internalNodes = runningOffset; uint internalNodeSize = 0; @@ -63,8 +63,12 @@ uint CalcCompactedSize( offsets.geometryInfo = runningOffset; runningOffset += srcHeader.numDescs * sizeof(GeometryInfo); - offsets.primNodePtrs = runningOffset; - runningOffset += srcHeader.numPrimitives * sizeof(uint); + { + offsets.primNodePtrs = runningOffset; + runningOffset += srcHeader.numPrimitives * sizeof(uint); + + } + } else { @@ -82,8 +86,11 @@ uint CalcCompactedSize( offsets.geometryInfo = 0; } - offsets.primNodePtrs = runningOffset; - runningOffset += srcHeader.numPrimitives * sizeof(uint); + { + offsets.primNodePtrs = runningOffset; + runningOffset += srcHeader.numPrimitives * sizeof(uint); + + } } { diff --git a/src/shaders/EncodeCommon.hlsl b/src/shaders/EncodeCommon.hlsl index 462fc0f..3eca227 100644 --- a/src/shaders/EncodeCommon.hlsl +++ b/src/shaders/EncodeCommon.hlsl @@ -343,7 +343,12 @@ void EncodeTriangleNode( //===================================================================================================================== // Fetch API bounding box from source buffer which is a typed R32G32 buffer. -BoundingBox FetchBoundingBoxData(RWBuffer buffer, uint index, uint offsetInElements, uint boxStrideInElements) +template +BoundingBox FetchBoundingBoxData( + Float3Buffer buffer, + uint index, + uint offsetInElements, + uint boxStrideInElements) { const uint baseElementIndex = index * boxStrideInElements + offsetInElements; diff --git a/src/shaders/EncodeHwBvhCommon.hlsl b/src/shaders/EncodeHwBvhCommon.hlsl index 3552a3c..22e6a0b 100644 --- a/src/shaders/EncodeHwBvhCommon.hlsl +++ b/src/shaders/EncodeHwBvhCommon.hlsl @@ -141,12 +141,18 @@ void PostHwBvhBuild( offsets, metadataSizeInBytes); + // Rebuilding an updateable acceleration structure needs to use the original size and not the compacted one. 
+ if (Settings.rebuildAccelStruct) + { + compactedSize = ShaderConstants.header.compactedSizeInBytes; + } WriteAccelStructHeaderField(ACCEL_STRUCT_HEADER_COMPACTED_BYTE_SIZE_OFFSET, compactedSize); if (Settings.emitCompactSize != 0) { EmitBuffer.Store2(0, uint2(compactedSize, 0)); } + } } diff --git a/src/shaders/EncodePairedTriangleImpl.hlsl b/src/shaders/EncodePairedTriangleImpl.hlsl index 7cb366d..090b544 100644 --- a/src/shaders/EncodePairedTriangleImpl.hlsl +++ b/src/shaders/EncodePairedTriangleImpl.hlsl @@ -46,11 +46,10 @@ void WriteScratchTriangleNode( WriteScratchNodeDataAtOffset(offset, SCRATCH_NODE_V2_OFFSET, data); const BoundingBox box = GenerateTriangleBoundingBox(tri.v0, tri.v1, tri.v2); - - const uint triangleId = WriteTriangleIdField(0, NODE_TYPE_TRIANGLE_0, 0, geometryFlags); - // Set the instance inclusion mask to 0 for degenerate triangles so that they are culled out. const uint instanceMask = (box.min.x > box.max.x) ? 0 : 0xff; + const uint triangleId = WriteTriangleIdField(0, NODE_TYPE_TRIANGLE_0, 0, geometryFlags); + const uint packedFlags = PackScratchNodeFlags(instanceMask, CalcTriangleBoxNodeFlags(geometryFlags), triangleId); data = uint4(0, 0, 0, packedFlags); @@ -118,7 +117,6 @@ void WriteScratchQuadNode( // Set the instance inclusion mask to 0 for degenerate triangles so that they are culled out. const uint instanceMask = (box.min.x > box.max.x) ? 0 : 0xff; - const uint packedFlags = PackScratchNodeFlags(instanceMask, CalcTriangleBoxNodeFlags(geometryFlags), triangleId); WriteScratchNodeDataAtOffset(offset, SCRATCH_NODE_FLAGS_OFFSET, packedFlags); } @@ -196,11 +194,40 @@ float ComputePairAreaRatio( return ratio; } +//====================================================================================================================== +float ComputeEdgeBoxSurfaceArea( + float3x3 vertices, + uint rotation) +{ + // triangle v1, v2, v0 + float3 e0 = (vertices[1]); + float3 e1 = (vertices[0]); + + if (rotation == 0) + { + // triangle v0, v1, v2 + e0 = (vertices[0]); + e1 = (vertices[2]); + } + else if (rotation == 1) + { + // triangle v2, v0, v1 + e0 = (vertices[2]); + e1 = (vertices[1]); + } + + BoundingBox edgeBox = (BoundingBox)0; + edgeBox.min = min(e0, e1); + edgeBox.max = max(e0, e1); + + return ComputeBoxSurfaceArea(edgeBox); +} + //====================================================================================================================== template int PairTrianglesOptimal( T tri, - BoundingBox bbox, + float3x3 vertices, bool isActive) { bool valid = isActive; @@ -208,6 +235,8 @@ int PairTrianglesOptimal( // Initialise to unpaired triangle int pairInfo = -1; + const BoundingBox bbox = GenerateTriangleBoundingBox(vertices[0], vertices[1], vertices[2]); + while (valid) { const bool isBroadcastLane = WaveIsFirstLane(); @@ -230,7 +259,13 @@ int PairTrianglesOptimal( WaveReadLaneFirst(bbox.max), }; - const float ratio = (packedOffset == -1) ? FLT_MAX : ComputePairAreaRatio(broadcastTriBounds, bbox); + const uint tri1Rotation = (packedOffset >> 4) & 0xF; + const float edgeBoxSa = ComputeEdgeBoxSurfaceArea(vertices, tri1Rotation); + + // Skip unpaired triangles and pairs with perpendicular shared edges (i.e. edge box area = 0) + const float ratio = + ((packedOffset == -1) || (edgeBoxSa == 0.0f)) ? 
+ FLT_MAX : ComputePairAreaRatio(broadcastTriBounds, bbox); const float waveMinRatio = WaveActiveMin(ratio); @@ -325,13 +360,17 @@ int PairTriangles( const bool isActiveTriangle = IsActive(tri); + float3x3 faceVertices; + faceVertices[0] = tri.v0; + faceVertices[1] = tri.v1; + faceVertices[2] = tri.v2; + // Indexed triangles can always be paired as their connectivity cannot change on updates. if (isIndexed) { if (Settings.enablePairCostCheck) { - const BoundingBox bbox = GenerateTriangleBoundingBox(tri.v0, tri.v1, tri.v2); - pairInfo = PairTrianglesOptimal(faceIndices, bbox, isActiveTriangle); + pairInfo = PairTrianglesOptimal(faceIndices, faceVertices, isActiveTriangle); } else { @@ -341,15 +380,9 @@ int PairTriangles( // Only pair non-indexed triangles for non-updateable as the triangle positions can change on updates else if (IsUpdateAllowed() == false) { - float3x3 faceVertices; - faceVertices[0] = tri.v0; - faceVertices[1] = tri.v1; - faceVertices[2] = tri.v2; - if (Settings.enablePairCostCheck) { - const BoundingBox bbox = GenerateTriangleBoundingBox(tri.v0, tri.v1, tri.v2); - pairInfo = PairTrianglesOptimal(faceVertices, bbox, isActiveTriangle); + pairInfo = PairTrianglesOptimal(faceVertices, faceVertices, isActiveTriangle); } else { diff --git a/src/shaders/EncodeTopLevel.hlsl b/src/shaders/EncodeTopLevel.hlsl index bd03de0..689a4ff 100644 --- a/src/shaders/EncodeTopLevel.hlsl +++ b/src/shaders/EncodeTopLevel.hlsl @@ -136,6 +136,7 @@ void EncodeInstances( EncodeInstancesUpdate(index, desc, tlasMetadataSize, + offsets, primNodePointerOffset, baseAddrAccelStructHeader, numActivePrims, diff --git a/src/shaders/EncodeTopLevelUpdate.hlsl b/src/shaders/EncodeTopLevelUpdate.hlsl index 48277fe..7a93c31 100644 --- a/src/shaders/EncodeTopLevelUpdate.hlsl +++ b/src/shaders/EncodeTopLevelUpdate.hlsl @@ -28,9 +28,9 @@ void WriteInstanceDescriptor( in InstanceDesc instanceDesc, in uint geometryType, - in uint boxNodeFlags, in uint instanceIndex, in uint instNodePtr, + in AccelStructOffsets offsets, in uint blasRootNodePointer, in uint blasMetadataSize, in uint tlasMetadataSize) @@ -51,6 +51,7 @@ void EncodeInstancesUpdate( uint index, InstanceDesc desc, uint tlasMetadataSize, + AccelStructOffsets offsets, uint primNodePointerOffset, uint64_t baseAddrAccelStructHeader, uint numActivePrims, @@ -159,9 +160,9 @@ void EncodeInstancesUpdate( WriteInstanceDescriptor(desc, geometryType, - boxNodeFlags, index, nodePointer, + offsets, CreateRootNodePointer(), blasMetadataSize, tlasMetadataSize); diff --git a/src/shaders/Extensions.hlsl b/src/shaders/Extensions.hlsl index baf4630..bfc0812 100644 --- a/src/shaders/Extensions.hlsl +++ b/src/shaders/Extensions.hlsl @@ -117,6 +117,7 @@ __decl float3 AmdExtD3DShaderIntrinsics_FloatOpWithRoundMode( //===================================================================================================================== // Sub-group wave reductions +// Ref: https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#_instructions [[vk::ext_capability(/* GroupNonUniform */ 61)]] [[vk::ext_capability(/* GroupNonUniformArithmetic */ 63)]] @@ -149,6 +150,24 @@ float AmdExtD3DShaderIntrinsics_WaveClusterMax(float x, uint dxClusterSize) return spirv_OpGroupNonUniformFMax_clustered(/* Subgroup */ 3, /* ClusteredReduce */ 3, x, clusterSize); } +[[vk::ext_instruction(359)]] +uint spirv_OpGroupNonUniformBitwiseAnd_clustered(uint scope, [[vk::ext_literal]] uint op, uint value, uint clusterSize); + +uint AmdExtD3DShaderIntrinsics_WaveClusterBitAnd(uint x, uint dxClusterSize) 
+{ + const uint clusterSize = (1u << (dxClusterSize - 1)); + return spirv_OpGroupNonUniformBitwiseAnd_clustered(/* Subgroup */ 3, /* ClusteredReduce */ 3, x, clusterSize); +} + +[[vk::ext_instruction(360)]] +uint spirv_OpGroupNonUniformBitwiseOr_clustered(uint scope, [[vk::ext_literal]] uint op, uint value, uint clusterSize); + +uint AmdExtD3DShaderIntrinsics_WaveClusterBitOr(uint x, uint dxClusterSize) +{ + const uint clusterSize = (1u << (dxClusterSize - 1)); + return spirv_OpGroupNonUniformBitwiseOr_clustered(/* Subgroup */ 3, /* ClusteredReduce */ 3, x, clusterSize); +} + #endif //===================================================================================================================== diff --git a/src/shaders/GenerateMortonCodes.hlsl b/src/shaders/GenerateMortonCodes.hlsl index bc43899..6cd8bbd 100644 --- a/src/shaders/GenerateMortonCodes.hlsl +++ b/src/shaders/GenerateMortonCodes.hlsl @@ -107,7 +107,7 @@ void GenerateMortonCodesImpl( // Clear refit propagation flags for each leaf node in BVH2. const uint initValue = (Settings.enableFastLBVH ? 0xffffffffu : 0); const uint flagOffset = ShaderConstants.offsets.propagationFlags + (primitiveIndex * sizeof(uint)); - ScratchGlobal.Store(flagOffset, initValue); + ScratchBuffer.Store(flagOffset, initValue); } #if NO_SHADER_ENTRYPOINT == 0 diff --git a/src/shaders/GpuRtLibrary.hlsl b/src/shaders/GpuRtLibrary.hlsl index 3d829e4..81bf9fa 100644 --- a/src/shaders/GpuRtLibrary.hlsl +++ b/src/shaders/GpuRtLibrary.hlsl @@ -29,7 +29,6 @@ // Following order matters as AccelStructTracker relies on defines from TraceRayCommon.hlsl #include "TraceRayCommon.hlsl" #include "AccelStructTracker.hlsl" -#include "llpc/GpurtIntrinsics.h" #if GPURT_BUILD_CONTINUATION && LLPC_CLIENT_INTERFACE_MAJOR_VERSION // Include the continuations library @@ -1002,6 +1001,65 @@ export uint _RayQuery_InstanceIndex(in RayQueryInternal rayQuery, bool committed } } +//===================================================================================================================== +// Fetch triangle position +export TriangleData _RayQuery_FetchTrianglePosition( + inout_param(RayQueryInternal) rayQuery, // RayQuery object + in bool committed) // True for committed hit, false for candidate +{ + TriangleData tdata; + RayTracingIpLevel rtip = _AmdGetRtip(); + switch (rtip) + { + default: + { + tdata = FetchTrianglePositionFromRayQuery(rayQuery, committed); + break; + } + } + return tdata; +} + +//===================================================================================================================== +// RayQuery::Proceed() entry point +export bool _RayQuery_Proceed( + inout_param(RayQueryInternal) rayQuery, + in uint constRayFlags, + in uint3 dispatchThreadId) +{ + uint rtIpLevel = ConvertRtIpLevel(_AmdGetRtip()); + return RayQueryProceedCommon( + rayQuery, + constRayFlags, + dispatchThreadId, + rtIpLevel + ); +} + +//===================================================================================================================== +// TraceRayInline() entry point +export void _RayQuery_TraceRayInline( + inout_param(RayQueryInternal) rayQuery, + in uint accelStructLo, + in uint accelStructHi, + in uint constRayFlags, + in uint rayFlags, + in uint instanceMask, + in RayDesc rayDesc, + in uint3 dispatchThreadId) +{ + uint rtIpLevel = ConvertRtIpLevel(_AmdGetRtip()); + TraceRayInlineCommon(rayQuery, + accelStructLo, + accelStructHi, + constRayFlags, + rayFlags, + instanceMask, + rayDesc, + dispatchThreadId, + rtIpLevel); +} + export void _RayQuery_SetObjId(in RayQueryInternal rayQuery, int 
objId) { rayQuery.rayQueryObjId = objId; diff --git a/src/shaders/GpuRtLibraryCont.hlsl b/src/shaders/GpuRtLibraryCont.hlsl index 5973394..23ed420 100644 --- a/src/shaders/GpuRtLibraryCont.hlsl +++ b/src/shaders/GpuRtLibraryCont.hlsl @@ -131,26 +131,6 @@ static bool RtIpIsAtLeast(RayTracingIpLevel level) return ((uint32_t)_AmdGetRtip()) >= ((uint32_t)level); } -//===================================================================================================================== -static uint ConvertRtIpLevel(RayTracingIpLevel rtIpLevel) -{ - uint level = 0; - - switch (rtIpLevel) - { - case RayTracingIpLevel::RtIp1_1: - level = GPURT_RTIP1_1; - break; - case RayTracingIpLevel::RtIp2_0: - level = GPURT_RTIP2_0; - break; - default: - break; - } - - return level; -} - //===================================================================================================================== static uint GetPriorityForShaderType( DXILShaderKind shaderKind) @@ -170,19 +150,62 @@ static uint GetPriorityForShaderType( // Forward declaration for _AmdDispatchSystemData.PackDispatchId() and _AmdDispatchSystemData.DispatchId() static uint3 GetDispatchRaysDimensions(); +//===================================================================================================================== + static uint64_t GetVpcWithPriority(uint64_t vpc, uint priority) { - return vpc; + if (_AmdIsLlpc()) + { + return vpc; + } + + const uint64_t prio64 = priority; + const uint firstMetadataBit = 32; + const uint firstPriorityBitInMetadata = 16; + GPU_ASSERT((vpc & 0xFFFF000000000000) == 0); + return vpc | (prio64 << (firstMetadataBit + firstPriorityBitInMetadata)); } -static uint64_t Unpack32BitVpcTo64BitVpc(uint32_t vpc32, bool /*unpackPriority*/) +//===================================================================================================================== +// 32-bit function pointer packing/unpacking +// +static uint64_t Unpack32BitVpcTo64BitVpc(uint32_t vpc32, bool unpackPriority) { - return vpc32; + if (_AmdIsLlpc()) + { + return vpc32; + } + + uint64_t vpc = (vpc32 & 0xFFFFFFC0); + + if (unpackPriority) + { + // The priority is stored in bits 0..2. + uint32_t priority = (vpc32 & 0x7); + vpc = GetVpcWithPriority(vpc, priority); + } + + return vpc; } static uint32_t Pack64BitVpcTo32Bits(uint64_t vpc) { - return (vpc & 0xFFFFFFFF); + if (_AmdIsLlpc()) + { + return (vpc & 0xFFFFFFFF); + } + + // Incoming metadata is in the high dword + uint32_t inMetadata = (uint32_t)(vpc >> 32); + uint32_t prio = (inMetadata >> 16); + // We only have three bits for the priority: + GPU_ASSERT(prio <= 7); + + // Outgoing metadata is in the low 6 bits + uint32_t outMetadata = prio; + + GPU_ASSERT((vpc & 0x2F) == 0); + return SplitUint64(vpc).x | outMetadata; } //===================================================================================================================== @@ -2030,7 +2053,6 @@ static void EnqueueNextShader(bool hasWorkToDo, uint64_t nextShaderAddr, uint64_ } else { - // This case should only occur in sorting mode. GPU_ASSERT(false); } } @@ -2038,7 +2060,6 @@ static void EnqueueNextShader(bool hasWorkToDo, uint64_t nextShaderAddr, uint64_ const uint newState = data.traversal.committed.State(); RayHistoryWriteEnd(data, newState); - // Finished sorting, previously dead lanes may now have CHS|MS to execute and vice-versa if (nextShaderAddr != returnAddr) { const DXILShaderKind shaderKind = (DXILShaderKind)(data.IsMiss(newState) ? 
@@ -2116,7 +2137,6 @@ export void _cont_Traversal( RayHistoryWriteAnyHitOrProceduralStatus(data); } - // Handle reordering of rays/threads before processing since dead lanes may become alive after sorting. // Execute traversal for active lanes. uint state = TRAVERSAL_STATE_COMMITTED_NOTHING; _AmdPrimitiveSystemState candidate = (_AmdPrimitiveSystemState)0; @@ -2150,14 +2170,6 @@ export void _cont_Traversal( _AmdTraversalResultData result = (_AmdTraversalResultData)0; bool IsChsOrMiss = data.IsChsOrMiss(state); - // For sorting-enabled global mem mode, we only enqueue CHS/Miss once all - // lanes have arrived in this state. - // In non-sorting mode, we immediately enqueue CHS/Miss. This is mostly - // to replicate the old ProcessContinuation() behavior for now. - // We might want to consider also waiting for all lanes here in the non-global - // mem mode for consistency, and potentially also to have a common place - // in between Traversal and CHS/Miss where extra work can be done just once - // for all lanes, e.g. preparing system data for CHS/Miss. if ((_AmdContinuationStackIsGlobal() && WaveActiveAllTrue(IsChsOrMiss)) || (!_AmdContinuationStackIsGlobal() && IsChsOrMiss)) { @@ -2167,11 +2179,6 @@ export void _cont_Traversal( GetNextHitMissPc(data, state, candidate, nextShaderAddr); bool hasWorkToDo = true; - // Avoid sorting on return addresses to RayGen (the case nextShaderValid == false), as it may create - // unexpected behavior and might increase execution divergence. For example, we might have multiple resume - // points due to divergent control flow in the TraceRay call, but those resume points are all copies of the same - // code. If we sort and re-read only from one bin, we might prevent future TraceRay calls from reconverging - // on traversal. if (_AmdContinuationStackIsGlobal() && (nextShaderAddr != 0)) { } @@ -2231,9 +2238,6 @@ export void _cont_Traversal( } else { - // The last remaining case is that we need to re-enqueue Traversal, because we are waiting for - // other lanes to finish BVH traversal before sorting, or to resume suspended lanes that wait for - // other lanes to run IS/AHS in early-is-ahs mode. 
         //
         // Everything else needs to go back through scheduling/traversal, regardless of state
         // Note we don't need "Wait" here because priorities run AHS and IS first
diff --git a/src/shaders/LaneGroup.hlsl b/src/shaders/LaneGroup.hlsl
index 5e69227..3274c59 100644
--- a/src/shaders/LaneGroup.hlsl
+++ b/src/shaders/LaneGroup.hlsl
@@ -124,6 +124,22 @@ struct LaneGroup
         return AmdExtD3DShaderIntrinsics_WaveClusterMin(val, clusterSize);
     }
 
+    template<typename T>
+    T BitOr(T val)
+    {
+        const uint clusterSize = log2(groupSize) + 1;
+
+        return AmdExtD3DShaderIntrinsics_WaveClusterBitOr(val, clusterSize);
+    }
+
+    template<typename T>
+    T BitAnd(T val)
+    {
+        const uint clusterSize = log2(groupSize) + 1;
+
+        return AmdExtD3DShaderIntrinsics_WaveClusterBitAnd(val, clusterSize);
+    }
+
     template<typename T>
     T Broadcast(T val, uint targetLane)
     {
diff --git a/src/shaders/TaskCounter.hlsl b/src/shaders/TaskCounter.hlsl
index 034a897..30d5531 100644
--- a/src/shaders/TaskCounter.hlsl
+++ b/src/shaders/TaskCounter.hlsl
@@ -26,16 +26,32 @@
 #include "BuildSettings.hlsli"
 #endif
 
+//======================================================================================================================
+// Set a scratch buffer counter to 0 if it has a valid offset
+void InitScratchCounter(uint offset)
+{
+    if (offset != INVALID_IDX)
+    {
+        ScratchGlobal.Store(offset, 0);
+    }
+}
+
+//======================================================================================================================
+// Increment a scratch buffer counter and return its original value
+uint IncrementScratchCounter(uint offset, uint value)
+{
+    uint originalVal = 0;
+    ScratchGlobal.InterlockedAdd(offset, value, originalVal);
+    return originalVal;
+}
+
 //=====================================================================================================================
 // Increment task counter to mark a task / primitive as done
 uint IncrementTaskCounter(uint offset, uint value)
 {
     DeviceMemoryBarrier();
 
-    uint originalVal = 0;
-    ScratchGlobal.InterlockedAdd(offset, value, originalVal);
-
-    return originalVal;
+    return IncrementScratchCounter(offset, value);
 }
 
 //=====================================================================================================================
diff --git a/src/shaders/TaskQueueCounter.hlsl b/src/shaders/TaskQueueCounter.hlsl
index fd3303d..84aa2e5 100644
--- a/src/shaders/TaskQueueCounter.hlsl
+++ b/src/shaders/TaskQueueCounter.hlsl
@@ -86,13 +86,3 @@ bool EndTask(const uint localId, uint taskQueueOffset)
 
     return returnValue;
 }
-
-//======================================================================================================================
-// Set a scratch buffer counter to 0 if it has a valid index
-void InitScratchCounter(uint offset)
-{
-    if (offset != INVALID_IDX)
-    {
-        ScratchGlobal.Store(offset, 0);
-    }
-}
diff --git a/src/shaders/TraceRayCommon.hlsl b/src/shaders/TraceRayCommon.hlsl
index 3736e40..c22f9eb 100644
--- a/src/shaders/TraceRayCommon.hlsl
+++ b/src/shaders/TraceRayCommon.hlsl
@@ -30,6 +30,8 @@
 #endif
 #include "../../gpurt/gpurtDispatch.h"
 
+#include "llpc/GpurtIntrinsics.h"
+
 // Driver reserved space ID and resource bindings
 #define SPACEID space93
 
@@ -90,6 +92,26 @@ static uint CalculateHitGroupRecordAddress(
     );
 }
 
+//=====================================================================================================================
+static uint ConvertRtIpLevel(RayTracingIpLevel rtIpLevel)
+{
+    uint level = 0;
+
+    switch (rtIpLevel)
+    {
+    case RayTracingIpLevel::RtIp1_1:
+        level = GPURT_RTIP1_1;
+        break;
+    case RayTracingIpLevel::RtIp2_0:
+        level = GPURT_RTIP2_0;
+        break;
+    default:
+        break;
+    }
+
+    return level;
+}
+
 //=====================================================================================================================
 static HitGroupInfo FetchHitGroupInfo(
     uint hitGroupRecordIndex)
diff --git a/src/shaders/TrianglePrimitive.hlsl b/src/shaders/TrianglePrimitive.hlsl
index 25e02a1..e2975dc 100644
--- a/src/shaders/TrianglePrimitive.hlsl
+++ b/src/shaders/TrianglePrimitive.hlsl
@@ -113,10 +113,11 @@ uint3 FetchFaceIndices(
 // Vertex buffers only require an address and stride alignment of the format component size not the entire element size.
 // If the input data is not naturally aligned, we cannot use a single typed fetch for the 2-3 components. In this case,
 // we need to fetch each component separately.
+template<typename Float3Buffer>
 float3 FetchVertexPerComponent(
-    RWBuffer<float3> buffer,
-    uint             firstComponentIndex,
-    uint             numComponents)
+    Float3Buffer buffer,
+    uint         firstComponentIndex,
+    uint         numComponents)
 {
     float3 vertex;
     vertex.x = buffer[firstComponentIndex+0].x;
@@ -134,8 +135,9 @@ float3 FetchVertexPerComponent(
 }
 
 //=====================================================================================================================
+template<typename Float3Buffer>
 TriangleData FetchTriangleData(
-    RWBuffer<float3> buffer,
+    Float3Buffer     buffer,
     uint             vertexOffsetInComponents,
     uint3            index,
     uint             strideInComponents,
@@ -181,8 +183,9 @@ uint CalcTriangleBoxNodeFlags(
 }
 
 //======================================================================================================================
+template<typename Float3Buffer>
 TriangleData FetchTransformedTriangleData(
-    in RWBuffer<float3> geometryBuffer,
+    in Float3Buffer     geometryBuffer,
    in uint3            faceIndices,
    in uint             geometryStride,
    in uint             vertexOffsetInComponents,
@@ -226,10 +229,11 @@ bool IsActive(TriangleData tri)
 
 //=====================================================================================================================
 // Helper function to fetch triangle data. Returns false if the vertex indices are out of bounds.
+template<typename Float3Buffer>
 bool FetchTrianglePrimitive(
     in BuildShaderGeometryConstants geomConstants,
     in NumPrimAndInputOffset        inputOffsets,
-    in RWBuffer<float3>             geometryBuffer,
+    in Float3Buffer                 geometryBuffer,
     in uint                         geomId,
     in uint                         primId,
     inout_param(TriangleData)       tri,
diff --git a/src/shaders/Update.hlsl b/src/shaders/Update.hlsl
index 0e06e40..035ad4b 100644
--- a/src/shaders/Update.hlsl
+++ b/src/shaders/Update.hlsl
@@ -133,18 +133,15 @@ void Update(
 
     const uint numGroups = ShaderRootConstants.numThreads / BUILD_THREADGROUP_SIZE;
 
-    {
-        ClearUpdateFlags(globalId);
-        BEGIN_TASK(numGroups);
-        EncodePrimitives(globalId, GEOMETRY_TYPE_TRIANGLES);
-        END_TASK(numGroups);
-
-        const uint numWorkItems = ScratchBuffer.Load(UPDATE_SCRATCH_STACK_NUM_ENTRIES_OFFSET);
-        UpdateQBVHImpl(globalId,
-                       numWorkItems,
-                       ShaderRootConstants.numThreads);
-    }
-
+    ClearUpdateFlags(globalId);
+    BEGIN_TASK(numGroups);
+    EncodePrimitives(globalId, GEOMETRY_TYPE_TRIANGLES);
+    END_TASK(numGroups);
+
+    const uint numWorkItems = ScratchBuffer.Load(UPDATE_SCRATCH_STACK_NUM_ENTRIES_OFFSET);
+    UpdateQBVHImpl(globalId,
+                   numWorkItems,
+                   ShaderRootConstants.numThreads);
 }
 
 //======================================================================================================================
diff --git a/src/shadersClean/common/Math.hlsli b/src/shadersClean/common/Math.hlsli
index 5c8356b..981b9b5 100644
--- a/src/shadersClean/common/Math.hlsli
+++ b/src/shadersClean/common/Math.hlsli
@@ -48,6 +48,13 @@ inline uint32_t bit(uint32_t index)
     return 1u << index;
 }
 
+//=====================================================================================================================
+// Helper function for producing a 16 bit mask of one bit
+inline uint16_t bit16(uint16_t index)
+{
+    return uint16_t(1u << index);
+}
+
 //=====================================================================================================================
 // Helper function for producing a 64 bit mask of one bit
 inline uint64_t bit64(uint32_t index)
diff --git a/tools/CompileRTShaders.py b/tools/CompileRTShaders.py
index 9ce4d59..67cd973 100644
--- a/tools/CompileRTShaders.py
+++ b/tools/CompileRTShaders.py
@@ -193,7 +193,7 @@ def getValidationCmdArgs(args) -> [str]:
 
     validateCommand = [compilerPath]
 
-    validateCommand += getBaseDxcCommandArgs(True, True, True)
+    validateCommand += getBaseDxcCommandArgs(True, True, False)
     validateCommand += ["-Wno-misplaced-attributes"] # -Wmisplaced-attributes is triggered by [RootSignature()]
                                                      # used by entrypoint code and compiled as library
     validateCommand += ['-Fo', 'temp.bin']
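As context for the Float3Buffer templating threaded through TrianglePrimitive.hlsl above: the FetchVertexPerComponent path exists because vertex buffers only guarantee component-size alignment, so a vertex may have to be read one scalar component at a time. A standalone C++ sketch of that indexing follows (illustration only; the scalar component view and the names here are assumptions, not GPURT code):

    // Sketch of per-component vertex fetching for inputs aligned only to the
    // 4-byte component size rather than the full float3 element size.
    #include <cstdint>
    #include <cstdio>

    struct float3 { float x, y, z; };

    // Fetch one vertex from a buffer viewed as scalar float components.
    static float3 FetchVertexPerComponent(const float* components,
                                          uint32_t     firstComponentIndex,
                                          uint32_t     numComponents)
    {
        float3 v = {};
        v.x = components[firstComponentIndex + 0];
        if (numComponents > 1) { v.y = components[firstComponentIndex + 1]; }
        if (numComponents > 2) { v.z = components[firstComponentIndex + 2]; }
        return v;
    }

    int main()
    {
        // Two R32G32B32_FLOAT vertices packed with a stride of 3 components.
        const float    buffer[]           = { 0.f, 1.f, 2.f, 3.f, 4.f, 5.f };
        const uint32_t strideInComponents = 3;
        const uint32_t vertexIndex        = 1;

        const float3 v =
            FetchVertexPerComponent(buffer, vertexIndex * strideInComponents, 3);
        printf("%f %f %f\n", v.x, v.y, v.z); // prints 3 4 5
        return 0;
    }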