diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5be2de9..ea92db7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -115,6 +115,10 @@ if (DEFINED GPURT_CLIENT_INTERFACE_MAJOR_VERSION)
     gpurt_add_compile_definitions(GPURT_CLIENT_INTERFACE_MAJOR_VERSION=${GPURT_CLIENT_INTERFACE_MAJOR_VERSION})
 endif()
 
+if (DEFINED PAL_CLIENT_INTERFACE_MAJOR_VERSION)
+    gpurt_add_compile_definitions(PAL_CLIENT_INTERFACE_MAJOR_VERSION=${PAL_CLIENT_INTERFACE_MAJOR_VERSION})
+endif()
+
 ### Add Source Directories
 target_include_directories(gpurt PUBLIC .)
 target_include_directories(gpurt_internal PUBLIC .)
diff --git a/cmake/GpuRtGenerateShaders.cmake b/cmake/GpuRtGenerateShaders.cmake
index faf053c..4654fa0 100644
--- a/cmake/GpuRtGenerateShaders.cmake
+++ b/cmake/GpuRtGenerateShaders.cmake
@@ -99,6 +99,8 @@ list(APPEND gpurtSharedDependencies
     ${gpurtCompileScript}
 )
 
+set(RT_SHADER_VALIDATION_COMMAND "")
+
 # Create custom command that outputs the generated BVH shaders
 # The generated shaders depend on all the above mentioned files
 if(GPURT_CLIENT_API STREQUAL "VULKAN")
@@ -130,16 +132,7 @@ if(GPURT_CLIENT_API STREQUAL "VULKAN")
         ${gpurtStripWhitelist}
         ${gpurtDxcCompiler}
         ${gpurtSpirvRemap}
-
-        COMMAND Python3::Interpreter "${gpurtCompileScript}"
-            --outputDir "${gpurtOutputDir}"
-            --validateShadersClean
-            ${COMPILER_ARGUMENT}
-            --defines "\"${gpurtDefines}\""
-            --includePaths "\"${gpurtIncludeDirectories}\""
-            "${gpurtDxilBvhShader}"
-            "${gpurtShadersSourceDir}"
-            "${gpurtSscStrict}"
+        COMMAND ${RT_SHADER_VALIDATION_COMMAND}
 
         COMMAND Python3::Interpreter "${gpurtCompileScript}"
             --vulkan
diff --git a/cmake/GpurtOptionsCodegen.cmake b/cmake/GpurtOptionsCodegen.cmake
index 1910f65..9d6d87e 100644
--- a/cmake/GpurtOptionsCodegen.cmake
+++ b/cmake/GpurtOptionsCodegen.cmake
@@ -53,4 +53,5 @@ add_custom_target(generate_gpurtOptions_h
 )
 
 target_include_directories(gpurt PUBLIC ${OUTDIR})
+target_sources(gpurt INTERFACE ${GPURTOPTIONS_OUTPUT})
 
diff --git a/gpurt/gpurtAccelStruct.h b/gpurt/gpurtAccelStruct.h
index 6468547..3b35b30 100644
--- a/gpurt/gpurtAccelStruct.h
+++ b/gpurt/gpurtAccelStruct.h
@@ -94,6 +94,8 @@ struct AccelStructMetadataHeader
     // numTasksDone can be reset in one 64 bit CP write.
     uint32 numTasksDone;   // Number of tasks done
     uint32 reserved0[16];  // Reserved
+    uint32 reserved1[3];   // Reserved
+    uint32 reserved2[3];   // Reserved
 };
 
 #define ACCEL_STRUCT_METADATA_VA_LO_OFFSET          0
@@ -102,7 +104,9 @@ struct AccelStructMetadataHeader
 #define ACCEL_STRUCT_METADATA_TASK_COUNTER_OFFSET   12
 #define ACCEL_STRUCT_METADATA_NUM_TASKS_DONE_OFFSET 16
 #define ACCEL_STRUCT_METADATA_RESERVED_0            20
-#define ACCEL_STRUCT_METADATA_HEADER_SIZE           84
+#define ACCEL_STRUCT_METADATA_RESERVED_1            84
+#define ACCEL_STRUCT_METADATA_RESERVED_2            96
+#define ACCEL_STRUCT_METADATA_HEADER_SIZE           108
 GPURT_STATIC_ASSERT(ACCEL_STRUCT_METADATA_HEADER_SIZE == sizeof(AccelStructMetadataHeader),
     "Acceleration structure header mismatch");
 
 #ifdef __cplusplus
@@ -110,6 +114,7 @@ GPURT_STATIC_ASSERT(ACCEL_STRUCT_METADATA_VA_LO_OFFSET == offsetof(AccelStructMe
 GPURT_STATIC_ASSERT(ACCEL_STRUCT_METADATA_VA_HI_OFFSET == offsetof(AccelStructMetadataHeader, addressHi), "");
 GPURT_STATIC_ASSERT(ACCEL_STRUCT_METADATA_SIZE_OFFSET == offsetof(AccelStructMetadataHeader, sizeInBytes), "");
 GPURT_STATIC_ASSERT(ACCEL_STRUCT_METADATA_TASK_COUNTER_OFFSET == offsetof(AccelStructMetadataHeader, taskCounter), "");
+GPURT_STATIC_ASSERT(ACCEL_STRUCT_METADATA_NUM_TASKS_DONE_OFFSET == offsetof(AccelStructMetadataHeader, numTasksDone), "");
 #endif
 
 #ifdef __cplusplus
diff --git a/gpurt/gpurtBuildSettings.h b/gpurt/gpurtBuildSettings.h
index 8482537..cce62a4 100644
--- a/gpurt/gpurtBuildSettings.h
+++ b/gpurt/gpurtBuildSettings.h
@@ -99,6 +99,7 @@ struct CompileTimeBuildSettings
     uint32 unused11;
     uint32 unused12;
     uint32 unused13;
+    uint32 rebuildAccelStruct;
 };
 
 #define BUILD_SETTINGS_DATA_TOP_LEVEL_BUILD_ID 0
@@ -134,6 +135,7 @@ struct CompileTimeBuildSettings
 #define BUILD_SETTINGS_DATA_ENCODE_ARRAY_OF_POINTERS_ID 41
 #define BUILD_SETTINGS_DATA_SCENE_BOUNDS_CALCULATION_TYPE_ID 42
 #define BUILD_SETTINGS_DATA_REBRAID_QUALITY_HEURISTIC_ID 43
+#define BUILD_SETTINGS_DATA_REBUILD_ACCELERATION_STRUCTURE_ID 47
 
 #ifdef __cplusplus
 } // namespace GpuRt
diff --git a/src/gpurtBvhBatcher.cpp b/src/gpurtBvhBatcher.cpp
index 674eb65..08ee242 100644
--- a/src/gpurtBvhBatcher.cpp
+++ b/src/gpurtBvhBatcher.cpp
@@ -24,12 +24,14 @@
  **********************************************************************************************************************/
 
 #include "palCmdBuffer.h"
+#include "palHashMapImpl.h"
 #include "palMetroHash.h"
 #include "palVectorImpl.h"
 
 #include "gpurt/gpurt.h"
 #include "gpurt/gpurtLib.h"
 #include "gpurt/gpurtAccelStruct.h"
+#include "gpurt/gpurtInlineFuncs.h"
 #include "gpurtInternal.h"
 #include "gpurtInternalShaderBindings.h"
 #include "gpurtBvhBatcher.h"
diff --git a/src/gpurtBvhBuilder.cpp b/src/gpurtBvhBuilder.cpp
index 060ee74..feb9e0b 100644
--- a/src/gpurtBvhBuilder.cpp
+++ b/src/gpurtBvhBuilder.cpp
@@ -100,18 +100,9 @@ static VertexFormatInfo VertexFormatInfoTable[] =
     { Pal::ChNumFormat::X8Y8_Unorm, Pal::ChNumFormat::X8_Unorm, 2 },
 };
 
-// =====================================================================================================================
-// Helper structure for encapsulating triangle index buffer information
-struct IndexBufferInfo
-{
-    uint32 format;
-    uint64 byteOffset;
-    uint64 gpuVa;
-};
-
 // =====================================================================================================================
 // Helper function to convert triangle geometry information into index buffer info
-IndexBufferInfo GetIndexBufferInfo(
+IndexBufferInfo BvhBuilder::GetIndexBufferInfo(
     const GeometryTriangles& geometry)
 {
     IndexBufferInfo indexBuffer = {};
@@ -149,7 +140,7 @@ static uint32 DispatchSize(
 
 // =====================================================================================================================
 // Helper function that calculates the block count for input number of triangles
-static uint32 TrianglePairBlockCount(
+uint32 BvhBuilder::TrianglePairBlockCount(
     uint32 numTriangles)
 {
     constexpr uint32 TrianglePairBlockSize = 64;
@@ -454,19 +445,27 @@ BvhBuilder::BvhBuilder(
     const AccelStructBuildInfo& buildInfo)  // Build args
     :
     m_pDevice(pDevice),
-    m_deviceSettings(deviceSettings),
     m_clientCb(clientCb),
+    m_deviceSettings(deviceSettings),
+    m_buildConfig({}),
+    m_resultOffsets({}),
     m_buildArgs(buildInfo),
     m_deviceProps(deviceProps),
+    m_metadataSizeInBytes(0),
     m_cmdBuffer(cmdBuffer),
+    m_scratchOffsets({}),
     m_backend(backend),
     m_buildSettings({}),
+    m_shaderConstantsGpuVa(0ull),
+    m_geomConstSrdTable(0ull),
+    m_geomBufferSrdTable(0ull),
     m_radixSortConfig(GetRadixSortConfig(deviceSettings)),
     m_emitCompactDstGpuVa(0ull),
-    m_buildSettingsHash(0)
+    m_buildSettingsHash(0u),
+    m_resultBufferInfo({}),
+    m_scratchBufferInfo({}),
+    m_dumpInfo({})
 {
-    // Determine if the flags have to be overriden based on the build inputs.
-    m_buildArgs.inputs = m_pDevice->OverrideBuildInputs(m_buildArgs.inputs);
     InitializeBuildConfigs();
 
     {
@@ -491,16 +490,26 @@ BvhBuilder::BvhBuilder(
     const DeviceSettings& deviceSettings)  // Device settings
     :
     m_pDevice(pDevice),
-    m_deviceSettings(deviceSettings),
     m_clientCb(clientCb),
+    m_deviceSettings(deviceSettings),
+    m_buildConfig({}),
+    m_resultOffsets({}),
     m_buildArgs(AccelStructBuildInfo{}),
     m_deviceProps(deviceProps),
+    m_metadataSizeInBytes(0),
     m_cmdBuffer(cmdBuffer),
+    m_scratchOffsets({}),
     m_backend(backend),
     m_buildSettings({}),
+    m_shaderConstantsGpuVa(0ull),
+    m_geomConstSrdTable(0ull),
+    m_geomBufferSrdTable(0ull),
     m_radixSortConfig(GetRadixSortConfig(deviceSettings)),
     m_emitCompactDstGpuVa(0ull),
-    m_buildSettingsHash(0)
+    m_buildSettingsHash(0u),
+    m_resultBufferInfo({}),
+    m_scratchBufferInfo({}),
+    m_dumpInfo({})
 {
     InitCopySettings();
 }
@@ -549,10 +558,16 @@ BvhBuildMode BvhBuilder::OverrideBuildMode(
 // Remapped scratch buffer base address
 bool BvhBuilder::AllowRemappingScratchBuffer() const
 {
+    bool encodeQuadPrimitives = m_buildConfig.enableEarlyPairCompression;
+
+    bool usePrimIndicesArray = false;
+
     return (m_deviceSettings.enableRemapScratchBuffer == true) &&
            (IsUpdate() == false) &&
-           (m_deviceSettings.enableBuildAccelStructScratchDumping == false);
+           (m_deviceSettings.enableBuildAccelStructScratchDumping == false) &&
+           (encodeQuadPrimitives == false) &&
+           (usePrimIndicesArray == false);
 }
 
 // =====================================================================================================================
@@ -587,11 +602,41 @@ uint32 BvhBuilder::CalculateScratchBufferSize(
     return size;
 }
 
+// =====================================================================================================================
+// Calculates the result buffer's metadata size
+uint32 BvhBuilder::CalculateMetadataSize(
+    const uint32 internalNodeSize,
+    const uint32 leafNodeSize,
+    uint32* const pRunningOffset)
+{
+    uint metadataSizeInBytes;
+    {
+        metadataSizeInBytes = CalcMetadataSizeInBytes(internalNodeSize, leafNodeSize);
+        // Align metadata size to cache line
+        metadataSizeInBytes = Util::Pow2Align(metadataSizeInBytes, 128);
+
+        *pRunningOffset += metadataSizeInBytes;
+    }
+
+    return metadataSizeInBytes;
+}
+
 // =====================================================================================================================
 // Calculates the result buffer offsets and returns the total result memory size
 BvhBuilder::ResultBufferInfo BvhBuilder::CalculateResultBufferInfo(
     AccelStructDataOffsets* pOffsets,
-    uint32* pMetadataSizeInBytes)
+    uint32* pMetadataSizeInBytes,
+    uint32 remapScratchBufferSize)
+{
+    return CalculateResultBufferInfoDefault(pOffsets, pMetadataSizeInBytes, remapScratchBufferSize);
+}
+
+// =====================================================================================================================
+// Calculates the result buffer offsets and returns the total result memory size
+BvhBuilder::ResultBufferInfo BvhBuilder::CalculateResultBufferInfoDefault(
+    AccelStructDataOffsets* pOffsets,
+    uint32* pMetadataSizeInBytes,
+    uint32 remapScratchBufferSize)
 {
     ResultBufferInfo info = {};
@@ -623,37 +668,40 @@ BvhBuilder::ResultBufferInfo BvhBuilder::CalculateResultBufferInfo(
     uint32 internalNodeSize = 0;
     uint32 leafNodeSize = 0;
+    uint32 nodeSize = 0;
 
     if (m_buildConfig.maxNumPrimitives > 0)
     {
         internalNodeSize = CalculateInternalNodesSize();
         leafNodeSize = CalculateLeafNodesSize();
+        nodeSize = internalNodeSize + leafNodeSize;
 
         offsets.internalNodes = ReserveBytes(internalNodeSize, &runningOffset);
         offsets.leafNodes = ReserveBytes(leafNodeSize, &runningOffset);
 
+        if (AllowRemappingScratchBuffer() && (remapScratchBufferSize > nodeSize))
+        {
+            ReserveBytes(remapScratchBufferSize - nodeSize, &runningOffset);
+
+            nodeSize = remapScratchBufferSize;
+        }
+
         if (m_buildConfig.topLevelBuild == false)
         {
             const uint32 geometryInfoSize = CalculateGeometryInfoSize(m_buildArgs.inputs.inputElemCount);
 
             offsets.geometryInfo = ReserveBytes(geometryInfoSize, &runningOffset);
         }
 
-        offsets.primNodePtrs = ReserveBytes(m_buildConfig.maxNumPrimitives * sizeof(uint32), &runningOffset);
+        {
+            offsets.primNodePtrs = ReserveBytes(m_buildConfig.maxNumPrimitives * sizeof(uint32), &runningOffset);
+        }
     }
 
     uint32 totalSizeInBytes = runningOffset;
 
     // Metadata section is at the beginning of the acceleration structure buffer
-    uint32 metadataSizeInBytes;
-    {
-        metadataSizeInBytes = CalcMetadataSizeInBytes(internalNodeSize, leafNodeSize);
-        // Align metadata size to cache line
-        metadataSizeInBytes = Util::Pow2Align(metadataSizeInBytes, 128);
-
-        totalSizeInBytes += metadataSizeInBytes;
-    }
-
+    const uint32 metadataSizeInBytes = CalculateMetadataSize(internalNodeSize, leafNodeSize, &totalSizeInBytes);
     if (pOffsets != nullptr)
     {
         memcpy(pOffsets, &offsets, sizeof(offsets));
@@ -664,8 +712,8 @@ BvhBuilder::ResultBufferInfo BvhBuilder::CalculateResultBufferInfo(
         *pMetadataSizeInBytes = metadataSizeInBytes;
     }
 
-    info.baseOffset = metadataSizeInBytes + sizeof(AccelStructHeader);
-    info.nodeSize = internalNodeSize + leafNodeSize;
+    info.baseOffset = sizeof(AccelStructHeader);
+    info.nodeSize = nodeSize;
     info.dataSize = totalSizeInBytes;
     return info;
 }
@@ -1082,7 +1130,6 @@ BvhBuilder::ScratchBufferInfo BvhBuilder::CalculateScratchBufferInfoDefault(
                 neighbourIndices = ReserveBytes(aabbCount * sizeof(uint32), &runningOffset);
                 // TODO: calculate number of blocks based on KEYS_PER_THREAD
                 atomicFlagsPloc = ReserveBytes(aabbCount * RayTracingPLOCFlags, &runningOffset);
-                clusterOffsets = ReserveBytes(aabbCount * sizeof(uint32), &runningOffset);
             }
         }
         bvh2PhaseMaxSize = Util::Max(bvh2PhaseMaxSize, runningOffset);
@@ -1204,6 +1251,24 @@ GeometryType BvhBuilder::GetGeometryType(
     return type;
 }
 
+// =====================================================================================================================
+bool BvhBuilder::ForceRebuild(
+    const Internal::Device* pDevice,
+    const AccelStructBuildInputs inputs)
+{
+    const DeviceSettings settings = pDevice->Settings();
+    const bool rebuildTopLevel =
+        Util::TestAnyFlagSet(settings.forceRebuildForUpdates, ForceRebuildForUpdatesMode::TopLevel) &&
+        (inputs.type == GpuRt::AccelStructType::TopLevel);
+    const bool rebuildBottomLevel =
+        Util::TestAnyFlagSet(settings.forceRebuildForUpdates, ForceRebuildForUpdatesMode::BottomLevel) &&
+        (inputs.type == GpuRt::AccelStructType::BottomLevel);
+
+    bool rebuildAS = rebuildBottomLevel || rebuildTopLevel;
+
+    return rebuildAS;
+}
+
 // =====================================================================================================================
 // Initialize buildConfig
 void BvhBuilder::InitBuildConfig(
@@ -1211,6 +1276,13 @@ void BvhBuilder::InitBuildConfig(
 {
     m_buildConfig = {};
 
+    if (ForceRebuild(m_pDevice, m_buildArgs.inputs))
+    {
+        // Determine if the flags have to be overridden based on the build inputs.
+        m_buildArgs.inputs = m_pDevice->OverrideBuildInputs(m_buildArgs.inputs);
+        m_buildConfig.rebuildAccelStruct = true;
+    }
+
     // For top-level acceleration structure, inputElementCount represents the number of instances
     uint32 primitiveCount = (buildArgs.inputs.type == AccelStructType::BottomLevel) ? 0 : buildArgs.inputs.inputElemCount;
@@ -1272,8 +1344,7 @@ void BvhBuilder::InitBuildConfig(
     m_buildConfig.topDownBuild = m_buildConfig.allowTopDownBuild &&
                                  (buildArgs.inputs.inputElemCount <= m_deviceSettings.maxTopDownBuildInstances);
 
-    if ((Util::TestAnyFlagSet(m_buildArgs.inputs.flags, AccelStructBuildFlagAllowUpdate) == false) &&
-        m_buildConfig.topLevelBuild)
+    if ((UpdateAllowed() == false) && m_buildConfig.topLevelBuild)
     {
         if (m_buildConfig.rebraidType == RebraidType::V1)
         {
@@ -1300,8 +1371,7 @@ void BvhBuilder::InitBuildConfig(
     m_buildConfig.triangleSplitting = (m_deviceSettings.enableParallelBuild) &&
         m_deviceSettings.enableTriangleSplitting &&
         (buildArgs.inputs.type == AccelStructType::BottomLevel) &&
-        (Util::TestAnyFlagSet(buildArgs.inputs.flags, AccelStructBuildFlagAllowUpdate) == false) &&
-        Util::TestAnyFlagSet(buildArgs.inputs.flags, AccelStructBuildFlagPreferFastTrace);
+        (UpdateAllowed() == false) && Util::TestAnyFlagSet(buildArgs.inputs.flags, AccelStructBuildFlagPreferFastTrace);
 
     m_buildConfig.buildMode = OverrideBuildMode(buildArgs);
 
@@ -1384,7 +1454,8 @@ void BvhBuilder::InitBuildConfig(
         (IsUpdate() && (m_deviceSettings.enableMergedEncodeUpdate == 0)) ||
         ((IsUpdate() == false) && (m_deviceSettings.enableMergedEncodeBuild == 0)) ||
-        (buildArgs.inputs.type == AccelStructType::TopLevel) ||
+        ((buildArgs.inputs.type == AccelStructType::TopLevel)
+        ) ||
         ((IsUpdate() == false) && (m_buildConfig.geometryType == GeometryType::Aabbs))
 #if GPURT_CLIENT_INTERFACE_MAJOR_VERSION < 46
         || (buildArgs.inputs.inputElemCount > maxDescriptorTableSize)
@@ -1552,6 +1623,7 @@ AccelStructHeader BvhBuilder::InitAccelStructHeader() const
     header.accelStructVersion = GPURT_ACCEL_STRUCT_VERSION;
     header.metadataSizeInBytes = m_metadataSizeInBytes;
     header.sizeInBytes = accelStructSize;
+    header.compactedSizeInBytes = accelStructSize;
     header.numPrimitives = m_buildConfig.maxNumPrimitives; // Is this correct?
     header.numDescs = m_buildArgs.inputs.inputElemCount;
     header.geometryType = static_cast<uint32>(m_buildConfig.geometryType);
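Reviewer note: the new ForceRebuild helper reduces to a per-type flag test against one settings bitmask. A minimal standalone sketch of that logic, assuming illustrative flag values (the real enum lives in GPURT's settings headers):

```cpp
#include <cstdint>

// Hypothetical stand-ins for the GPURT types; the bit values are assumptions.
enum ForceRebuildForUpdatesMode : uint32_t { TopLevel = 0x1, BottomLevel = 0x2 };
enum class AccelStructType { TopLevel, BottomLevel };

static bool TestAnyFlagSet(uint32_t flags, uint32_t mask) { return (flags & mask) != 0; }

// Mirrors the helper: rebuild when the mode bit matching the AS type is set.
static bool ForceRebuild(uint32_t forceRebuildForUpdates, AccelStructType type)
{
    const bool top    = TestAnyFlagSet(forceRebuildForUpdates, TopLevel) &&
                        (type == AccelStructType::TopLevel);
    const bool bottom = TestAnyFlagSet(forceRebuildForUpdates, BottomLevel) &&
                        (type == AccelStructType::BottomLevel);
    return top || bottom;
}
```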
@@ -1823,9 +1895,9 @@ void BvhBuilder::InitAccelerationStructure()
         }
     }
 
-    // Merged encode/build writes the header using the shader. However, we don't launch the build shader in the case
-    // of an empty BVH.
-    if (m_buildConfig.needEncodeDispatch || (m_buildConfig.maxNumPrimitives == 0))
+    // Merged encode/build and BuildParallel write the header using the shader.
+    // However, we don't launch the build shader in the case of an empty BVH.
+    if ((m_buildConfig.maxNumPrimitives == 0) || (m_deviceSettings.enableParallelBuild == false))
     {
         WriteImmediateData(HeaderBufferBaseVa(), InitAccelStructMetadataHeader());
         WriteImmediateData(ResultBufferBaseVa(), InitAccelStructHeader());
@@ -2176,6 +2248,7 @@ void BvhBuilder::InitBuildSettings()
     m_buildSettings.updateFlags = m_buildArgs.inputs.flags &
                                   (AccelStructBuildFlagPerformUpdate | AccelStructBuildFlagAllowUpdate);
+    m_buildSettings.rebuildAccelStruct = m_buildConfig.rebuildAccelStruct;
     m_buildSettings.isUpdateInPlace = IsUpdateInPlace();
 
     m_buildSettings.encodeArrayOfPointers =
@@ -2215,19 +2288,26 @@ void BvhBuilder::GetAccelerationStructurePrebuildInfo(
 
     AccelStructPrebuildInfo prebuildInfo = {};
 
-    // Calculate the amount of space needed to store the result.
-    const ResultBufferInfo resultBufferInfo = CalculateResultBufferInfo(nullptr, nullptr);
-    const uint32 resultDataSize = resultBufferInfo.dataSize;
-
     // Calculate the amount of scratch space needed during the construction process.
     const ScratchBufferInfo scratchBufferInfo = CalculateScratchBufferInfo(nullptr);
+
+    // Calculate the amount of space needed to store the result.
+    const ResultBufferInfo resultBufferInfo = CalculateResultBufferInfo(nullptr, nullptr, scratchBufferInfo.bvh2PhaseSize);
+
     uint32 scratchDataSize = CalculateScratchBufferSize(resultBufferInfo, scratchBufferInfo);
+    const uint32 resultDataSize = resultBufferInfo.dataSize;
 
     uint32 updateDataSize = 0;
-    if (UpdateAllowed())
+    if (m_buildConfig.rebuildAccelStruct)
+    {
+        // When we force a rebuild, updates of the acceleration structure should use the build scratch data size
+        updateDataSize = Util::Max(1u, scratchDataSize);
+    }
+    else if (UpdateAllowed())
     {
         updateDataSize = Util::Max(1u, CalculateUpdateScratchBufferInfo(nullptr));
     }
+
     // Scratch size for builds may be smaller than updates, some apps will still try to use the scratch size from
     // the build when performing the update causing page faults.
     scratchDataSize = Util::Max(scratchDataSize, updateDataSize);
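Reviewer note: the Util::Max above is the crux of the prebuild contract — the reported build scratch size must never be smaller than the update scratch size, because some applications bind the build-time scratch allocation when updating. A reduced sketch of the sizing rule, with simplified types that are not the actual GPURT signature:

```cpp
#include <algorithm>
#include <cstdint>
#include <utility>

// Returns {buildScratch, updateScratch}, honoring the forced-rebuild path.
static std::pair<uint32_t, uint32_t> SizeScratch(uint32_t buildScratch,
                                                 uint32_t updateScratch,
                                                 bool rebuildAccelStruct,
                                                 bool updateAllowed)
{
    uint32_t updateSize = 0;
    if (rebuildAccelStruct)
    {
        updateSize = std::max<uint32_t>(1u, buildScratch); // rebuilds update with build scratch
    }
    else if (updateAllowed)
    {
        updateSize = std::max<uint32_t>(1u, updateScratch);
    }

    // Guard against apps reusing the build scratch size for updates.
    return { std::max(buildScratch, updateSize), updateSize };
}
```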
@@ -2380,8 +2460,26 @@ void BvhBuilder::PreBuildDumpEvents()
 {
     PAL_ASSERT(HasBuildDumpEvents());
 
-    const ResultBufferInfo resultBufferInfo = CalculateResultBufferInfo(nullptr, nullptr);
-    const uint32 resultDataSize = resultBufferInfo.dataSize;
+    uint32 scratchDataSize;
+    uint32 resultDataSize;
+
+    if (IsUpdate() == false)
+    {
+        const ScratchBufferInfo scratchBufferInfo = CalculateScratchBufferInfo(nullptr);
+
+        const ResultBufferInfo resultBufferInfo = CalculateResultBufferInfo(nullptr, nullptr, scratchBufferInfo.bvh2PhaseSize);
+
+        scratchDataSize = CalculateScratchBufferSize(resultBufferInfo, scratchBufferInfo);
+
+        resultDataSize = resultBufferInfo.dataSize;
+    }
+    else
+    {
+        scratchDataSize = CalculateUpdateScratchBufferInfo(nullptr);
+
+        const ResultBufferInfo resultBufferInfo = CalculateResultBufferInfo(nullptr, nullptr, 0);
+        resultDataSize = resultBufferInfo.dataSize;
+    }
 
     // Initialise acceleration structure information for dump purposes
     m_dumpInfo = {};
@@ -2397,19 +2495,10 @@ void BvhBuilder::PreBuildDumpEvents()
     m_dumpInfo.gpuVa = HeaderBufferBaseVa();
     m_dumpInfo.sizeInBytes = resultDataSize;
     m_dumpInfo.scratchGpuVa = ScratchBufferBaseVa();
+    m_dumpInfo.scratchSizeInBytes = scratchDataSize;
     m_dumpInfo.pTimeStampVidMem = nullptr;
     m_dumpInfo.timeStampVidMemoffset = 0;
 
-    if (IsUpdate() == false)
-    {
-        const ScratchBufferInfo scratchBufferInfo = CalculateScratchBufferInfo(nullptr);
-        m_dumpInfo.scratchSizeInBytes = CalculateScratchBufferSize(resultBufferInfo, scratchBufferInfo);
-    }
-    else
-    {
-        m_dumpInfo.scratchSizeInBytes = CalculateUpdateScratchBufferInfo(nullptr);
-    }
-
     if (m_deviceSettings.enableBuildAccelStructStats)
     {
 #if GPURT_CLIENT_INTERFACE_MAJOR_VERSION >= 39
@@ -2509,18 +2598,20 @@ void BvhBuilder::InitializeBuildConfigs()
     InitBuildConfig(m_buildArgs);
     InitBuildSettings();
 
-    m_resultBufferInfo = CalculateResultBufferInfo(&m_resultOffsets, &m_metadataSizeInBytes);
-
-    m_scratchBufferInfo = {};
     if (IsUpdate() == false)
     {
         // Compute the offsets into the scratch buffer for all of our scratch resources.
         m_scratchBufferInfo = CalculateScratchBufferInfo(&m_scratchOffsets);
+
+        m_resultBufferInfo = CalculateResultBufferInfo(&m_resultOffsets, &m_metadataSizeInBytes, m_scratchBufferInfo.bvh2PhaseSize);
     }
     else
     {
         // Compute the offsets into the scratch buffer for all of our scratch resources.
+        m_scratchBufferInfo = {};
         CalculateUpdateScratchBufferInfo(&m_scratchOffsets);
+
+        m_resultBufferInfo = CalculateResultBufferInfo(&m_resultOffsets, &m_metadataSizeInBytes, 0);
     }
 
     // Add tlas to m_tlas
@@ -3487,46 +3578,46 @@ void BvhBuilder::UpdateParallel()
 // Perform geometry encoding and update in one dispatch
 void BvhBuilder::EncodeUpdate()
 {
-    BindPipeline((m_buildConfig.geometryType == GeometryType::Triangles) ?
-        InternalRayTracingCsType::UpdateTriangles : InternalRayTracingCsType::UpdateAabbs);
+    uint32 numThreadGroups = 0;
 
-    const uint32 threadGroupSize = DefaultThreadGroupSize;
-    const uint32 wavesPerSimd = 8;
-    uint32 numThreadGroups = 0;
-    {
-        const uint32 numWorkItems = Util::Max(1u, m_buildConfig.numPrimitives);
-        numThreadGroups = GetNumPersistentThreadGroups(numWorkItems, threadGroupSize, wavesPerSimd);
-    }
-    const uint32 numThreads = numThreadGroups * threadGroupSize;
+    {
+        BindPipeline((m_buildConfig.geometryType == GeometryType::Triangles) ?
+            InternalRayTracingCsType::UpdateTriangles : InternalRayTracingCsType::UpdateAabbs);
 
-    uint32 entryOffset = 0;
+        const uint32 threadGroupSize = DefaultThreadGroupSize;
+        const uint32 wavesPerSimd = 8;
+        const uint32 numWorkItems = Util::Max(1u, m_buildConfig.numPrimitives);
+        const uint32 numThreadGroups = GetNumPersistentThreadGroups(numWorkItems, threadGroupSize, wavesPerSimd);
+        const uint32 numThreads = numThreadGroups * threadGroupSize;
 
-    const Update::Constants shaderConstants =
-    {
-        .numThreads = numThreads,
-    };
+        uint32 entryOffset = 0;
 
-    // Set shader constants
-    entryOffset = WriteUserDataEntries(&shaderConstants, Update::NumEntries, entryOffset);
+        const Update::Constants shaderConstants =
+        {
+            .numThreads = numThreads,
+        };
 
-    entryOffset = WriteBuildShaderConstantBuffer(entryOffset);
+        // Set shader constants
+        entryOffset = WriteUserDataEntries(&shaderConstants, Update::NumEntries, entryOffset);
 
-    // Set result/scratch/source buffers
-    entryOffset = WriteUpdateBuffers(entryOffset);
+        entryOffset = WriteBuildShaderConstantBuffer(entryOffset);
 
-    const uint32 cbvSrdTableGpuVaLo = Util::LowPart(m_geomConstSrdTable);
-    entryOffset = WriteUserDataEntries(&cbvSrdTableGpuVaLo, 1, entryOffset);
+        // Set result/scratch/source buffers
+        entryOffset = WriteUpdateBuffers(entryOffset);
 
-    const uint32 vbvSrdTableGpuVaLo = Util::LowPart(m_geomBufferSrdTable);
-    entryOffset = WriteUserDataEntries(&vbvSrdTableGpuVaLo, 1, entryOffset);
+        const uint32 cbvSrdTableGpuVaLo = Util::LowPart(m_geomConstSrdTable);
+        entryOffset = WriteUserDataEntries(&cbvSrdTableGpuVaLo, 1, entryOffset);
 
-    // NullBuffer binding
-    entryOffset = WriteBufferVa(0, entryOffset);
+        const uint32 vbvSrdTableGpuVaLo = Util::LowPart(m_geomBufferSrdTable);
+        entryOffset = WriteUserDataEntries(&vbvSrdTableGpuVaLo, 1, entryOffset);
 
-    RGP_PUSH_MARKER("Update (NumPrimitives=%u)", m_buildConfig.maxNumPrimitives);
-    Dispatch(numThreadGroups);
+        // NullBuffer binding
+        entryOffset = WriteBufferVa(0, entryOffset);
 
-    RGP_POP_MARKER();
+        RGP_PUSH_MARKER("Update (NumPrimitives=%u)", m_buildConfig.maxNumPrimitives);
+        Dispatch(numThreadGroups);
+        RGP_POP_MARKER();
+    }
 }
 
 // =====================================================================================================================
diff --git a/src/gpurtBvhBuilder.h b/src/gpurtBvhBuilder.h
index 876a7ec..4af762f 100644
--- a/src/gpurtBvhBuilder.h
+++ b/src/gpurtBvhBuilder.h
@@ -36,6 +36,15 @@ namespace EncodeNodes
     struct Constants;
 }
 
+// =====================================================================================================================
+// Helper structure for encapsulating triangle index buffer information
+struct IndexBufferInfo
+{
+    uint32 format;
+    uint64 byteOffset;
+    uint64 gpuVa;
+};
+
 // =====================================================================================================================
 // Helper class used by GPURT to perform various BVH operations like building, copying, etc.
 class BvhBuilder
 {
@@ -66,6 +75,11 @@ class BvhBuilder
     static uint32 CalculateGeometryInfoSize(
         uint32 numGeometryDescs);
 
+    // Helper function for when to perform a rebuild
+    static bool ForceRebuild(
+        const Internal::Device* pDevice,
+        const AccelStructBuildInputs inputs);
+
     // Builds or updates an acceleration structure and stores it in a result buffer
     void BuildRaytracingAccelerationStructure();
 
@@ -106,14 +120,12 @@ class BvhBuilder
 
     ResultBufferInfo CalculateResultBufferInfo(
         AccelStructDataOffsets* pOffsets,
-        uint32* pMetadataSizeInBytes);
+        uint32* pMetadataSizeInBytes,
+        uint remapScratchBufferSize);
 
     ScratchBufferInfo CalculateScratchBufferInfo(
         RayTracingScratchDataOffsets* pOffsets);
 
-    ScratchBufferInfo CalculateScratchBufferInfoDefault(
-        RayTracingScratchDataOffsets* pOffsets);
-
     uint32 CalculateUpdateScratchBufferInfo(
         RayTracingScratchDataOffsets* pOffsets);
 
@@ -169,6 +181,12 @@ class BvhBuilder
     static uint32 GetGeometryPrimCount(
         const Geometry& geometry);
 
+    static IndexBufferInfo GetIndexBufferInfo(
+        const GeometryTriangles& geometry);
+
+    static uint32 TrianglePairBlockCount(
+        uint32 numTriangles);
+
 private:
 
     // Configs that change within build calls, private to the bvh builder.
@@ -205,6 +223,7 @@ class BvhBuilder
         bool enableFastLBVH;
         bool enableMergeSort;
         bool enableInstanceRebraid;
+        bool rebuildAccelStruct;
     };
 
     BvhBuilder(
@@ -213,6 +232,19 @@ class BvhBuilder
         ClientCallbacks clientCb,
        const DeviceSettings& deviceSettings);
 
+    uint32 CalculateMetadataSize(
+        const uint32 internalNodeSize,
+        const uint32 leafNodeSize,
+        uint32* const pRunningOffset);
+
+    ResultBufferInfo CalculateResultBufferInfoDefault(
+        AccelStructDataOffsets* pOffsets,
+        uint32* pMetadataSizeInBytes,
+        uint remapScratchBufferSize);
+
+    ScratchBufferInfo CalculateScratchBufferInfoDefault(
+        RayTracingScratchDataOffsets* pOffsets);
+
     uint32 CalculateInternalNodesSize()const;
     uint32 CalculateLeafNodesSize() const;
     uint32 CalculateNodesSize() const;
diff --git a/src/gpurtDevice.cpp b/src/gpurtDevice.cpp
index ce6f940..b1368f0 100644
--- a/src/gpurtDevice.cpp
+++ b/src/gpurtDevice.cpp
@@ -2128,16 +2128,7 @@ const AccelStructBuildInputs Device::OverrideBuildInputs(
     ) const
 {
     AccelStructBuildInputs buildInputs = inputs;
-
-    const bool rebuildTopLevel =
-        (
-        Util::TestAnyFlagSet(Settings().forceRebuildForUpdates, ForceRebuildForUpdatesMode::TopLevel)) &&
-        (buildInputs.type == GpuRt::AccelStructType::TopLevel);
-    const bool rebuildBottomLevel =
-        Util::TestAnyFlagSet(Settings().forceRebuildForUpdates, ForceRebuildForUpdatesMode::BottomLevel) &&
-        (buildInputs.type == GpuRt::AccelStructType::BottomLevel);
-
-    bool rebuildAS = rebuildBottomLevel || rebuildTopLevel;
+    const bool rebuildAS = BvhBuilder::ForceRebuild(this, inputs);
 
     if (rebuildAS)
     {
diff --git a/src/gpurtInternalShaderBindings.h b/src/gpurtInternalShaderBindings.h
index 4b26c92..1785233 100644
--- a/src/gpurtInternalShaderBindings.h
+++ b/src/gpurtInternalShaderBindings.h
@@ -82,6 +82,16 @@ namespace CopyAS
     constexpr uint32 NumEntries = (sizeof(Constants) / sizeof(uint32));
 }
 
+namespace BuildTrivialBvh
+{
+    struct Constants
+    {
+        uint32 maxGeometryCount;
+    };
+
+    constexpr uint32 NumEntries = (sizeof(Constants) / sizeof(uint32));
+}
+
 namespace CompactAS
 {
     struct Constants
diff --git a/src/shaders/BuildBVHPLOC.hlsl b/src/shaders/BuildBVHPLOC.hlsl
index abb971e..effeb80 100644
--- a/src/shaders/BuildBVHPLOC.hlsl
+++ b/src/shaders/BuildBVHPLOC.hlsl
@@ -56,7 +56,6 @@ struct BuildPlocArgs
     uint currentStateScratchOffset;
     uint taskQueueCounterScratchOffset;
     uint atomicFlagsScratchOffset;
-    uint offsetsScratchOffset;
     uint dynamicBlockIndexScratchOffset;
     uint numBatchesScratchOffset;
     uint baseBatchIndicesScratchOffset;
@@ -878,7 +877,6 @@ void BuildBVHPLOC(
     plocArgs.currentStateScratchOffset = ShaderConstants.offsets.currentState;
     plocArgs.taskQueueCounterScratchOffset = ShaderConstants.offsets.plocTaskQueueCounter;
     plocArgs.atomicFlagsScratchOffset = ShaderConstants.offsets.atomicFlagsPloc;
-    plocArgs.offsetsScratchOffset = ShaderConstants.offsets.clusterOffsets;
     plocArgs.dynamicBlockIndexScratchOffset = ShaderConstants.offsets.dynamicBlockIndex;
     plocArgs.numBatchesScratchOffset = ShaderConstants.offsets.numBatches;
     plocArgs.baseBatchIndicesScratchOffset = ShaderConstants.offsets.batchIndices;
diff --git a/src/shaders/BuildBVHTDTR.hlsl b/src/shaders/BuildBVHTDTR.hlsl
index e9d568d..ed21d28 100644
--- a/src/shaders/BuildBVHTDTR.hlsl
+++ b/src/shaders/BuildBVHTDTR.hlsl
@@ -984,23 +984,13 @@ void BuildBVHTDImpl(
     uint numRefsAllocated = ScratchBuffer.Load(numRefsAllocatedOffset);
 
     if (globalId == 0)
     {
-        UintBoundingBox sceneBounds;
-
-        uint4 data;
-        data = ScratchBuffer.Load4(args.SceneBoundsOffset); // todo: recalc based on ACTIVE nodes
-        sceneBounds.min = data.xyz;
-        data.xy = ScratchBuffer.Load2(args.SceneBoundsOffset + 0x10);
-        sceneBounds.max = data.wxy;
-
-        BoundingBox bbox;
-        bbox.min = Uint3ToFloat3(sceneBounds.min);
-        bbox.max = Uint3ToFloat3(sceneBounds.max);
+        BoundingBox bbox = FetchSceneBounds(args.SceneBoundsOffset); // todo: recalc based on ACTIVE nodes
 
         BoundingBox bboxCentroid;
         UintBoundingBox boxCentroidUint;
-        data = ScratchBuffer.Load4(args.CurrentStateScratchOffset + STATE_TD_CENTROID_BBOX_OFFSET);
+        uint4 data = ScratchBuffer.Load4(args.CurrentStateScratchOffset + STATE_TD_CENTROID_BBOX_OFFSET);
         boxCentroidUint.min = data.xyz;
         data.xy = ScratchBuffer.Load2(args.CurrentStateScratchOffset + STATE_TD_CENTROID_BBOX_OFFSET + 0x10);
         boxCentroidUint.max = data.wxy;
diff --git a/src/shaders/BuildCommonScratch.hlsl b/src/shaders/BuildCommonScratch.hlsl
index 579920d..cd54497 100644
--- a/src/shaders/BuildCommonScratch.hlsl
+++ b/src/shaders/BuildCommonScratch.hlsl
@@ -591,6 +591,28 @@ float2 FetchSceneSize(uint sceneBoundsOffset)
     return minMax;
 }
 
+//=====================================================================================================================
+void InitSceneBounds(uint sceneBoundsOffset)
+{
+    // Initialize scene bounds
+    const uint maxVal = FloatToUint(FLT_MAX);
+    const uint minVal = FloatToUint(-FLT_MAX);
+
+    ScratchBuffer.Store3(sceneBoundsOffset, maxVal.xxx);
+    sceneBoundsOffset += sizeof(uint3);
+    ScratchBuffer.Store3(sceneBoundsOffset, minVal.xxx);
+    sceneBoundsOffset += sizeof(uint3);
+    ScratchBuffer.Store2(sceneBoundsOffset, uint2(maxVal, minVal));
+    sceneBoundsOffset += sizeof(uint2);
+
+    if (Settings.rebraidType == RebraidType::V2)
+    {
+        ScratchBuffer.Store3(sceneBoundsOffset, maxVal.xxx);
+        sceneBoundsOffset += sizeof(uint3);
+        ScratchBuffer.Store3(sceneBoundsOffset, minVal.xxx);
+    }
+}
+
 //======================================================================================================================
 uint GetBvhNodesOffset(
     uint numActivePrims,
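Reviewer note on the new InitSceneBounds helper: the bounds are stored as uints so later passes can shrink/grow them with plain integer atomics, which only works if FloatToUint is order-preserving. The usual encoding (an assumption here; GPURT's exact mapping may differ in detail) flips negative values:

```cpp
#include <cstdint>
#include <cstring>

// Map a float to a uint whose unsigned ordering matches the float ordering,
// so InterlockedMin/Max on the uint behaves like min/max on the float.
static uint32_t FloatToSortableUint(float f)
{
    uint32_t u;
    std::memcpy(&u, &f, sizeof(u));
    // Negative floats sort in reverse as raw bits; invert them, and set the
    // sign bit on non-negatives so they land above all negatives.
    return (u & 0x80000000u) ? ~u : (u | 0x80000000u);
}
```

With such an encoding, seeding the min slots with +FLT_MAX and the max slots with -FLT_MAX (as the shader does) guarantees the first atomic update wins.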
diff --git a/src/shaders/BuildParallel.hlsl b/src/shaders/BuildParallel.hlsl
index 73084ba..e557003 100644
--- a/src/shaders/BuildParallel.hlsl
+++ b/src/shaders/BuildParallel.hlsl
@@ -260,7 +260,6 @@ void BuildBvhPloc(
     plocArgs.currentStateScratchOffset = ShaderConstants.offsets.currentState;
     plocArgs.taskQueueCounterScratchOffset = ShaderConstants.offsets.plocTaskQueueCounter;
     plocArgs.atomicFlagsScratchOffset = ShaderConstants.offsets.atomicFlagsPloc;
-    plocArgs.offsetsScratchOffset = ShaderConstants.offsets.clusterOffsets;
     plocArgs.dynamicBlockIndexScratchOffset = ShaderConstants.offsets.dynamicBlockIndex;
     plocArgs.numBatchesScratchOffset = ShaderConstants.offsets.numBatches;
     plocArgs.baseBatchIndicesScratchOffset = ShaderConstants.offsets.batchIndices;
@@ -401,44 +400,31 @@ void InitAccelerationStructure()
 
     DstBuffer.Store(0, ShaderConstants.header);
 
-    // Initialise encode counters
-    WriteTaskCounterData(
-        ShaderConstants.offsets.encodeTaskCounter, ENCODE_TASK_COUNTER_NUM_PRIMITIVES_OFFSET, 0);
-
-    // Early triangle pairing and triangle splitting dynamically increment primitive reference counter. Initialise
-    // counters to 0 when these features are enabled
-
-    const bool dynamicallyIncrementsPrimRefCount =
-        Settings.enableEarlyPairCompression || Settings.doTriangleSplitting || Settings.isIndirectBuild;
-    const uint primRefInitCount =
-        (dynamicallyIncrementsPrimRefCount) ? 0 : ShaderConstants.numPrimitives;
+    if (Settings.doEncode)
+    {
+        // Initialise encode counters
+        WriteTaskCounterData(
+            ShaderConstants.offsets.encodeTaskCounter, ENCODE_TASK_COUNTER_NUM_PRIMITIVES_OFFSET, 0);
 
-    WriteTaskCounterData(
-        ShaderConstants.offsets.encodeTaskCounter, ENCODE_TASK_COUNTER_PRIM_REFS_OFFSET, primRefInitCount);
+        // Early triangle pairing and triangle splitting dynamically increment primitive reference counter. Initialise
+        // counters to 0 when these features are enabled
 
-    // Initialize valid scratch buffer counters to 0
-    InitScratchCounter(ShaderConstants.offsets.plocTaskQueueCounter);
-    InitScratchCounter(ShaderConstants.offsets.tdTaskQueueCounter);
-    InitScratchCounter(CurrentSplitTaskQueueCounter());
-    ClearNumBatches(ShaderConstants.offsets.numBatches);
+        const bool dynamicallyIncrementsPrimRefCount =
+            Settings.enableEarlyPairCompression || Settings.doTriangleSplitting || Settings.isIndirectBuild;
+        const uint primRefInitCount =
+            (dynamicallyIncrementsPrimRefCount) ? 0 : ShaderConstants.numPrimitives;
 
-    // Initialize scene bounds
-    const uint maxVal = FloatToUint(FLT_MAX);
-    const uint minVal = FloatToUint(-FLT_MAX);
+        WriteTaskCounterData(
+            ShaderConstants.offsets.encodeTaskCounter, ENCODE_TASK_COUNTER_PRIM_REFS_OFFSET, primRefInitCount);
 
-    uint offset = ShaderConstants.offsets.sceneBounds;
-    ScratchBuffer.Store3(offset, maxVal.xxx);
-    offset += sizeof(uint3);
-    ScratchBuffer.Store3(offset, minVal.xxx);
-    offset += sizeof(uint3);
-    ScratchBuffer.Store2(offset, uint2(maxVal, minVal));
-    offset += sizeof(uint2);
+        // Initialize valid scratch buffer counters to 0
+        InitScratchCounter(ShaderConstants.offsets.plocTaskQueueCounter);
+        InitScratchCounter(ShaderConstants.offsets.tdTaskQueueCounter);
+        InitScratchCounter(CurrentSplitTaskQueueCounter());
+        ClearNumBatches(ShaderConstants.offsets.numBatches);
 
-    if (Settings.rebraidType == RebraidType::V2)
-    {
-        ScratchBuffer.Store3(offset, maxVal.xxx);
-        offset += sizeof(uint3);
-        ScratchBuffer.Store3(offset, minVal.xxx);
+        // Initialize scene bounds
+        InitSceneBounds(ShaderConstants.offsets.sceneBounds);
     }
 }
 
@@ -503,17 +489,17 @@ void BuildBvh(
 
     INIT_TASK;
 
-    if (Settings.doEncode)
-    {
-        BEGIN_TASK(1);
+    BEGIN_TASK(1);
 
-        if (globalId == 0)
-        {
-            InitAccelerationStructure();
-        }
+    if (globalId == 0)
+    {
+        InitAccelerationStructure();
+    }
 
-        END_TASK(1);
+    END_TASK(1);
 
+    if (Settings.doEncode)
+    {
         BEGIN_TASK(ShaderRootConstants.NumThreadGroups());
 
         EncodePrimitives(globalId, localId);
diff --git a/src/shaders/BuildQBVH.hlsl b/src/shaders/BuildQBVH.hlsl
index 2f03134..9dd9e30 100644
--- a/src/shaders/BuildQBVH.hlsl
+++ b/src/shaders/BuildQBVH.hlsl
@@ -604,11 +604,9 @@ static void PullUpChildren(
         }
         else
         {
-            {
-                // Note, box node flags are combined together by using an AND operation. Thus, we need to initialise
-                // invalid child flags as 0xff
-                boxNodeFlags = SetBoxNodeFlagsField(boxNodeFlags, 0xff, i);
-            }
+            // Note, box node flags are combined together by using an AND operation. Thus, we need to initialise
+            // invalid child flags as 0xff
+            boxNodeFlags = SetBoxNodeFlagsField(boxNodeFlags, 0xff, i);
         }
     }
 
diff --git a/src/shaders/BuildSettings.hlsli b/src/shaders/BuildSettings.hlsli
index 4d72cb5..6bb5026 100644
--- a/src/shaders/BuildSettings.hlsli
+++ b/src/shaders/BuildSettings.hlsli
@@ -59,6 +59,7 @@
 [[vk::constant_id(BUILD_SETTINGS_DATA_ENCODE_ARRAY_OF_POINTERS_ID)]] uint encodeArrayOfPointers = 0;
 [[vk::constant_id(BUILD_SETTINGS_DATA_SCENE_BOUNDS_CALCULATION_TYPE_ID)]] uint sceneBoundsCalculationType = 0;
 [[vk::constant_id(BUILD_SETTINGS_DATA_REBRAID_QUALITY_HEURISTIC_ID)]] uint rebraidQualityHeuristic = 0;
+[[vk::constant_id(BUILD_SETTINGS_DATA_REBUILD_ACCELERATION_STRUCTURE_ID)]] uint rebuildAccelStruct = 0;
 
 static const CompileTimeBuildSettings Settings = {
     topLevelBuild,
@@ -108,6 +109,7 @@ static const CompileTimeBuildSettings Settings = {
     0,
     0,
     0,
+    rebuildAccelStruct,
 };
 
 #endif
diff --git a/src/shaders/Common.hlsl b/src/shaders/Common.hlsl
index b8c594b..d4562fb 100644
--- a/src/shaders/Common.hlsl
+++ b/src/shaders/Common.hlsl
@@ -270,12 +270,10 @@ static uint64_t PackInstanceBasePointer(GpuVirtualAddress instanceVa, uint insta
     instanceBasePointer |= (instanceFlags & D3D12_RAYTRACING_INSTANCE_FLAG_FORCE_NON_OPAQUE) ?
                            (1ull << NODE_POINTER_FORCE_NON_OPAQUE_SHIFT) : 0;
 
-    {
-        // Set 'Skip Procedural' for triangles and 'Skip Triangles' for procedural geometry
-        instanceBasePointer |= (geometryType == GEOMETRY_TYPE_TRIANGLES)
-            ? (1ull << NODE_POINTER_SKIP_PROCEDURAL_SHIFT)
-            : (1ull << NODE_POINTER_SKIP_TRIANGLES_SHIFT);
-    }
+    // Set 'Skip Procedural' for triangles and 'Skip Triangles' for procedural geometry
+    instanceBasePointer |= (geometryType == GEOMETRY_TYPE_TRIANGLES)
+                           ? (1ull << NODE_POINTER_SKIP_PROCEDURAL_SHIFT)
+                           : (1ull << NODE_POINTER_SKIP_TRIANGLES_SHIFT);
 
     instanceBasePointer |= (geometryType == GEOMETRY_TYPE_AABBS) ?
                            (1ull << NODE_POINTER_DISABLE_TRIANGLE_CULL_SHIFT) : 0;
diff --git a/src/shaders/CompactCommon.hlsl b/src/shaders/CompactCommon.hlsl
index 8d41575..b504f2a 100644
--- a/src/shaders/CompactCommon.hlsl
+++ b/src/shaders/CompactCommon.hlsl
@@ -35,7 +35,7 @@ uint CalcCompactedSize(
     // Acceleration structure data starts with the header (not including the metadata)
     uint runningOffset = sizeof(AccelStructHeader);
 
-    AccelStructOffsets offsets;
+    AccelStructOffsets offsets = (AccelStructOffsets)0;
     offsets.internalNodes = runningOffset;
 
     uint internalNodeSize = 0;
@@ -63,8 +63,12 @@ uint CalcCompactedSize(
         offsets.geometryInfo = runningOffset;
         runningOffset += srcHeader.numDescs * sizeof(GeometryInfo);
 
-        offsets.primNodePtrs = runningOffset;
-        runningOffset += srcHeader.numPrimitives * sizeof(uint);
+        {
+            offsets.primNodePtrs = runningOffset;
+            runningOffset += srcHeader.numPrimitives * sizeof(uint);
+
+        }
+
     }
     else
     {
@@ -82,8 +86,11 @@ uint CalcCompactedSize(
             offsets.geometryInfo = 0;
         }
 
-        offsets.primNodePtrs = runningOffset;
-        runningOffset += srcHeader.numPrimitives * sizeof(uint);
+        {
+            offsets.primNodePtrs = runningOffset;
+            runningOffset += srcHeader.numPrimitives * sizeof(uint);
+
+        }
     }
 
     {
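Reviewer note: CalcCompactedSize above is a straight bump allocator over the destination buffer — each section records the current running offset and advances it. The same ReserveBytes pattern appears on the C++ side of this change; a distilled sketch of that idiom:

```cpp
#include <cstdint>

// Record the start of a section and advance the running offset past it.
static uint32_t ReserveBytes(uint32_t sizeInBytes, uint32_t* pRunningOffset)
{
    const uint32_t sectionOffset = *pRunningOffset;
    *pRunningOffset += sizeInBytes;
    return sectionOffset;
}

// Usage mirrors the shader: header, then internal nodes, then leaf nodes, ...
// uint32_t runningOffset = sizeof(AccelStructHeader);
// offsets.internalNodes = ReserveBytes(internalNodeSize, &runningOffset);
// offsets.leafNodes     = ReserveBytes(leafNodeSize, &runningOffset);
```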
diff --git a/src/shaders/EncodeCommon.hlsl b/src/shaders/EncodeCommon.hlsl
index 462fc0f..3eca227 100644
--- a/src/shaders/EncodeCommon.hlsl
+++ b/src/shaders/EncodeCommon.hlsl
@@ -343,7 +343,12 @@ void EncodeTriangleNode(
 
 //=====================================================================================================================
 // Fetch API bounding box from source buffer which is a typed R32G32 buffer.
-BoundingBox FetchBoundingBoxData(RWBuffer<float3> buffer, uint index, uint offsetInElements, uint boxStrideInElements)
+template<typename Float3Buffer>
+BoundingBox FetchBoundingBoxData(
+    Float3Buffer buffer,
+    uint index,
+    uint offsetInElements,
+    uint boxStrideInElements)
 {
     const uint baseElementIndex = index * boxStrideInElements + offsetInElements;
 
diff --git a/src/shaders/EncodeHwBvhCommon.hlsl b/src/shaders/EncodeHwBvhCommon.hlsl
index 3552a3c..22e6a0b 100644
--- a/src/shaders/EncodeHwBvhCommon.hlsl
+++ b/src/shaders/EncodeHwBvhCommon.hlsl
@@ -141,12 +141,18 @@ void PostHwBvhBuild(
                    offsets,
                    metadataSizeInBytes);
 
+    // Rebuilding an updateable acceleration structure needs to use the original size, not the compacted one.
+    if (Settings.rebuildAccelStruct)
+    {
+        compactedSize = ShaderConstants.header.compactedSizeInBytes;
+    }
     WriteAccelStructHeaderField(ACCEL_STRUCT_HEADER_COMPACTED_BYTE_SIZE_OFFSET, compactedSize);
 
     if (Settings.emitCompactSize != 0)
     {
         EmitBuffer.Store2(0, uint2(compactedSize, 0));
     }
+    }
 }
diff --git a/src/shaders/EncodePairedTriangleImpl.hlsl b/src/shaders/EncodePairedTriangleImpl.hlsl
index 7cb366d..090b544 100644
--- a/src/shaders/EncodePairedTriangleImpl.hlsl
+++ b/src/shaders/EncodePairedTriangleImpl.hlsl
@@ -46,11 +46,10 @@ void WriteScratchTriangleNode(
     WriteScratchNodeDataAtOffset(offset, SCRATCH_NODE_V2_OFFSET, data);
 
     const BoundingBox box = GenerateTriangleBoundingBox(tri.v0, tri.v1, tri.v2);
-
-    const uint triangleId = WriteTriangleIdField(0, NODE_TYPE_TRIANGLE_0, 0, geometryFlags);
-
     // Set the instance inclusion mask to 0 for degenerate triangles so that they are culled out.
     const uint instanceMask = (box.min.x > box.max.x) ? 0 : 0xff;
+    const uint triangleId = WriteTriangleIdField(0, NODE_TYPE_TRIANGLE_0, 0, geometryFlags);
+
     const uint packedFlags = PackScratchNodeFlags(instanceMask, CalcTriangleBoxNodeFlags(geometryFlags), triangleId);
 
     data = uint4(0, 0, 0, packedFlags);
@@ -118,7 +117,6 @@ void WriteScratchQuadNode(
 
     // Set the instance inclusion mask to 0 for degenerate triangles so that they are culled out.
     const uint instanceMask = (box.min.x > box.max.x) ? 0 : 0xff;
-
     const uint packedFlags = PackScratchNodeFlags(instanceMask, CalcTriangleBoxNodeFlags(geometryFlags), triangleId);
 
     WriteScratchNodeDataAtOffset(offset, SCRATCH_NODE_FLAGS_OFFSET, packedFlags);
 }
@@ -196,11 +194,40 @@ float ComputePairAreaRatio(
     return ratio;
 }
 
+//======================================================================================================================
+float ComputeEdgeBoxSurfaceArea(
+    float3x3 vertices,
+    uint rotation)
+{
+    // triangle v1, v2, v0
+    float3 e0 = (vertices[1]);
+    float3 e1 = (vertices[0]);
+
+    if (rotation == 0)
+    {
+        // triangle v0, v1, v2
+        e0 = (vertices[0]);
+        e1 = (vertices[2]);
+    }
+    else if (rotation == 1)
+    {
+        // triangle v2, v0, v1
+        e0 = (vertices[2]);
+        e1 = (vertices[1]);
+    }
+
+    BoundingBox edgeBox = (BoundingBox)0;
+    edgeBox.min = min(e0, e1);
+    edgeBox.max = max(e0, e1);
+
+    return ComputeBoxSurfaceArea(edgeBox);
+}
+
 //======================================================================================================================
 template<typename T>
 int PairTrianglesOptimal(
     T tri,
-    BoundingBox bbox,
+    float3x3 vertices,
     bool isActive)
 {
     bool valid = isActive;
@@ -208,6 +235,8 @@ int PairTrianglesOptimal(
     // Initialise to unpaired triangle
     int pairInfo = -1;
 
+    const BoundingBox bbox = GenerateTriangleBoundingBox(vertices[0], vertices[1], vertices[2]);
+
     while (valid)
     {
         const bool isBroadcastLane = WaveIsFirstLane();
@@ -230,7 +259,13 @@ int PairTrianglesOptimal(
             WaveReadLaneFirst(bbox.max),
         };
 
-        const float ratio = (packedOffset == -1) ? FLT_MAX : ComputePairAreaRatio(broadcastTriBounds, bbox);
+        const uint tri1Rotation = (packedOffset >> 4) & 0xF;
+        const float edgeBoxSa = ComputeEdgeBoxSurfaceArea(vertices, tri1Rotation);
+
+        // Skip unpaired triangles and pairs with perpendicular shared edges (i.e. edge box area = 0)
+        const float ratio =
+            ((packedOffset == -1) || (edgeBoxSa == 0.0f)) ?
+                FLT_MAX : ComputePairAreaRatio(broadcastTriBounds, bbox);
 
         const float waveMinRatio = WaveActiveMin(ratio);
@@ -325,13 +360,17 @@ int PairTriangles(
 
     const bool isActiveTriangle = IsActive(tri);
 
+    float3x3 faceVertices;
+    faceVertices[0] = tri.v0;
+    faceVertices[1] = tri.v1;
+    faceVertices[2] = tri.v2;
+
     // Indexed triangles can always be paired as their connectivity cannot change on updates.
     if (isIndexed)
     {
         if (Settings.enablePairCostCheck)
         {
-            const BoundingBox bbox = GenerateTriangleBoundingBox(tri.v0, tri.v1, tri.v2);
-            pairInfo = PairTrianglesOptimal(faceIndices, bbox, isActiveTriangle);
+            pairInfo = PairTrianglesOptimal(faceIndices, faceVertices, isActiveTriangle);
         }
         else
         {
@@ -341,15 +380,9 @@ int PairTriangles(
     // Only pair non-indexed triangles for non-updateable as the triangle positions can change on updates
     else if (IsUpdateAllowed() == false)
     {
-        float3x3 faceVertices;
-        faceVertices[0] = tri.v0;
-        faceVertices[1] = tri.v1;
-        faceVertices[2] = tri.v2;
-
         if (Settings.enablePairCostCheck)
         {
-            const BoundingBox bbox = GenerateTriangleBoundingBox(tri.v0, tri.v1, tri.v2);
-            pairInfo = PairTrianglesOptimal(faceVertices, bbox, isActiveTriangle);
+            pairInfo = PairTrianglesOptimal(faceVertices, faceVertices, isActiveTriangle);
         }
         else
         {
diff --git a/src/shaders/EncodeTopLevel.hlsl b/src/shaders/EncodeTopLevel.hlsl
index bd03de0..689a4ff 100644
--- a/src/shaders/EncodeTopLevel.hlsl
+++ b/src/shaders/EncodeTopLevel.hlsl
@@ -136,6 +136,7 @@ void EncodeInstances(
             EncodeInstancesUpdate(index,
                                   desc,
                                   tlasMetadataSize,
+                                  offsets,
                                   primNodePointerOffset,
                                   baseAddrAccelStructHeader,
                                   numActivePrims,
diff --git a/src/shaders/EncodeTopLevelUpdate.hlsl b/src/shaders/EncodeTopLevelUpdate.hlsl
index 48277fe..7a93c31 100644
--- a/src/shaders/EncodeTopLevelUpdate.hlsl
+++ b/src/shaders/EncodeTopLevelUpdate.hlsl
@@ -28,9 +28,9 @@ void WriteInstanceDescriptor(
     in InstanceDesc instanceDesc,
     in uint geometryType,
-    in uint boxNodeFlags,
     in uint instanceIndex,
     in uint instNodePtr,
+    in AccelStructOffsets offsets,
     in uint blasRootNodePointer,
     in uint blasMetadataSize,
     in uint tlasMetadataSize)
@@ -51,6 +51,7 @@ void EncodeInstancesUpdate(
     uint index,
     InstanceDesc desc,
     uint tlasMetadataSize,
+    AccelStructOffsets offsets,
     uint primNodePointerOffset,
     uint64_t baseAddrAccelStructHeader,
     uint numActivePrims,
@@ -159,9 +160,9 @@ void EncodeInstancesUpdate(
             WriteInstanceDescriptor(desc,
                                     geometryType,
-                                    boxNodeFlags,
                                     index,
                                     nodePointer,
+                                    offsets,
                                     CreateRootNodePointer(),
                                     blasMetadataSize,
                                     tlasMetadataSize);
diff --git a/src/shaders/Extensions.hlsl b/src/shaders/Extensions.hlsl
index baf4630..bfc0812 100644
--- a/src/shaders/Extensions.hlsl
+++ b/src/shaders/Extensions.hlsl
@@ -117,6 +117,7 @@ __decl float3 AmdExtD3DShaderIntrinsics_FloatOpWithRoundMode(
 
 //=====================================================================================================================
 // Sub-group wave reductions
+// Ref: https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#_instructions
 
 [[vk::ext_capability(/* GroupNonUniform */ 61)]]
 [[vk::ext_capability(/* GroupNonUniformArithmetic */ 63)]]
@@ -149,6 +150,24 @@ float AmdExtD3DShaderIntrinsics_WaveClusterMax(float x, uint dxClusterSize)
     return spirv_OpGroupNonUniformFMax_clustered(/* Subgroup */ 3, /* ClusteredReduce */ 3, x, clusterSize);
 }
 
+[[vk::ext_instruction(359)]]
+uint spirv_OpGroupNonUniformBitwiseAnd_clustered(uint scope, [[vk::ext_literal]] uint op, uint value, uint clusterSize);
+
+uint AmdExtD3DShaderIntrinsics_WaveClusterBitAnd(uint x, uint dxClusterSize)
+{
+    const uint clusterSize = (1u << (dxClusterSize - 1));
+    return spirv_OpGroupNonUniformBitwiseAnd_clustered(/* Subgroup */ 3, /* ClusteredReduce */ 3, x, clusterSize);
+}
+
+[[vk::ext_instruction(360)]]
+uint spirv_OpGroupNonUniformBitwiseOr_clustered(uint scope, [[vk::ext_literal]] uint op, uint value, uint clusterSize);
+
+uint AmdExtD3DShaderIntrinsics_WaveClusterBitOr(uint x, uint dxClusterSize)
+{
+    const uint clusterSize = (1u << (dxClusterSize - 1));
+    return spirv_OpGroupNonUniformBitwiseOr_clustered(/* Subgroup */ 3, /* ClusteredReduce */ 3, x, clusterSize);
+}
+
 #endif
 
 //=====================================================================================================================
diff --git a/src/shaders/GenerateMortonCodes.hlsl b/src/shaders/GenerateMortonCodes.hlsl
index bc43899..6cd8bbd 100644
--- a/src/shaders/GenerateMortonCodes.hlsl
+++ b/src/shaders/GenerateMortonCodes.hlsl
@@ -107,7 +107,7 @@ void GenerateMortonCodesImpl(
         // Clear refit propagation flags for each leaf node in BVH2.
         const uint initValue = (Settings.enableFastLBVH ? 0xffffffffu : 0);
         const uint flagOffset = ShaderConstants.offsets.propagationFlags + (primitiveIndex * sizeof(uint));
-        ScratchGlobal.Store(flagOffset, initValue);
+        ScratchBuffer.Store(flagOffset, initValue);
     }
 
 #if NO_SHADER_ENTRYPOINT == 0
diff --git a/src/shaders/GpuRtLibrary.hlsl b/src/shaders/GpuRtLibrary.hlsl
index 3d829e4..81bf9fa 100644
--- a/src/shaders/GpuRtLibrary.hlsl
+++ b/src/shaders/GpuRtLibrary.hlsl
@@ -29,7 +29,6 @@
 // Following order matters as AccelStructTracker relies on defines from TraceRayCommon.hlsl
 #include "TraceRayCommon.hlsl"
 #include "AccelStructTracker.hlsl"
-#include "llpc/GpurtIntrinsics.h"
 
 #if GPURT_BUILD_CONTINUATION && LLPC_CLIENT_INTERFACE_MAJOR_VERSION
 // Include the continuations library
@@ -1002,6 +1001,65 @@ export uint _RayQuery_InstanceIndex(in RayQueryInternal rayQuery, bool committed
     }
 }
 
+//=====================================================================================================================
+// Fetch triangle position
+export TriangleData _RayQuery_FetchTrianglePosition(
+    inout_param(RayQueryInternal) rayQuery, // RayQuery object
+    in bool committed)                      // Committed or candidate hit
+{
+    TriangleData tdata;
+    RayTracingIpLevel rtip = _AmdGetRtip();
+    switch (rtip)
+    {
+    default:
+    {
+        tdata = FetchTrianglePositionFromRayQuery(rayQuery, committed);
+        break;
+    }
+    }
+    return tdata;
+}
+
+//=====================================================================================================================
+// RayQuery::Proceed() entry point
+export bool _RayQuery_Proceed(
+    inout_param(RayQueryInternal) rayQuery,
+    in uint constRayFlags,
+    in uint3 dispatchThreadId)
+{
+    uint rtIpLevel = ConvertRtIpLevel(_AmdGetRtip());
+    return RayQueryProceedCommon(
+        rayQuery,
+        constRayFlags,
+        dispatchThreadId,
+        rtIpLevel
+    );
+}
+
+//=====================================================================================================================
+// TraceRayInline() entry point
+export void _RayQuery_TraceRayInline(
+    inout_param(RayQueryInternal) rayQuery,
+    in uint accelStructLo,
+    in uint accelStructHi,
+    in uint constRayFlags,
+    in uint rayFlags,
+    in uint instanceMask,
+    in RayDesc rayDesc,
+    in uint3 dispatchThreadId)
+{
+    uint rtIpLevel = ConvertRtIpLevel(_AmdGetRtip());
+    TraceRayInlineCommon(rayQuery,
+                         accelStructLo,
+                         accelStructHi,
+                         constRayFlags,
+                         rayFlags,
+                         instanceMask,
+                         rayDesc,
+                         dispatchThreadId,
+                         rtIpLevel);
+}
+
 export void _RayQuery_SetObjId(in RayQueryInternal rayQuery, int objId)
 {
     rayQuery.rayQueryObjId = objId;
 }
diff --git a/src/shaders/GpuRtLibraryCont.hlsl b/src/shaders/GpuRtLibraryCont.hlsl
index 5973394..23ed420 100644
--- a/src/shaders/GpuRtLibraryCont.hlsl
+++ b/src/shaders/GpuRtLibraryCont.hlsl
@@ -131,26 +131,6 @@ static bool RtIpIsAtLeast(RayTracingIpLevel level)
     return ((uint32_t)_AmdGetRtip()) >= ((uint32_t)level);
 }
 
-//=====================================================================================================================
-static uint ConvertRtIpLevel(RayTracingIpLevel rtIpLevel)
-{
-    uint level = 0;
-
-    switch (rtIpLevel)
-    {
-    case RayTracingIpLevel::RtIp1_1:
-        level = GPURT_RTIP1_1;
-        break;
-    case RayTracingIpLevel::RtIp2_0:
-        level = GPURT_RTIP2_0;
-        break;
-    default:
-        break;
-    }
-
-    return level;
-}
-
 //=====================================================================================================================
 static uint GetPriorityForShaderType(
     DXILShaderKind shaderKind)
@@ -170,19 +150,62 @@
 // Forward declaration for _AmdDispatchSystemData.PackDispatchId() and _AmdDispatchSystemData.DispatchId()
 static uint3 GetDispatchRaysDimensions();
 
+//=====================================================================================================================
+
 static uint64_t GetVpcWithPriority(uint64_t vpc, uint priority)
 {
-    return vpc;
+    if (_AmdIsLlpc())
+    {
+        return vpc;
+    }
+
+    const uint64_t prio64 = priority;
+    const uint firstMetadataBit = 32;
+    const uint firstPriorityBitInMetadata = 16;
+    GPU_ASSERT((vpc & 0xFFFF000000000000) == 0);
+    return vpc | (prio64 << (firstMetadataBit + firstPriorityBitInMetadata));
 }
 
-static uint64_t Unpack32BitVpcTo64BitVpc(uint32_t vpc32, bool /*unpackPriority*/)
+//=====================================================================================================================
+// 32-bit function pointer packing/unpacking
+//
+static uint64_t Unpack32BitVpcTo64BitVpc(uint32_t vpc32, bool unpackPriority)
 {
-    return vpc32;
+    if (_AmdIsLlpc())
+    {
+        return vpc32;
+    }
+
+    uint64_t vpc = (vpc32 & 0xFFFFFFC0);
+
+    if (unpackPriority)
+    {
+        // The priority is stored in bits 0..2.
+        uint32_t priority = (vpc32 & 0x7);
+        vpc = GetVpcWithPriority(vpc, priority);
+    }
+
+    return vpc;
 }
 
 static uint32_t Pack64BitVpcTo32Bits(uint64_t vpc)
 {
-    return (vpc & 0xFFFFFFFF);
+    if (_AmdIsLlpc())
+    {
+        return (vpc & 0xFFFFFFFF);
+    }
+
+    // Incoming metadata is in the high dword
+    uint32_t inMetadata = (uint32_t)(vpc >> 32);
+    uint32_t prio = (inMetadata >> 16);
+    // We only have three bits for the priority:
+    GPU_ASSERT(prio <= 7);
+
+    // Outgoing metadata is in the low 6 bits
+    uint32_t outMetadata = prio;
+
+    GPU_ASSERT((vpc & 0x2F) == 0);
+    return SplitUint64(vpc).x | outMetadata;
 }
 
 //=====================================================================================================================
@@ -2030,7 +2053,6 @@ static void EnqueueNextShader(bool hasWorkToDo, uint64_t nextShaderAddr, uint64_
     }
     else
     {
-        // This case should only occur in sorting mode.
        GPU_ASSERT(false);
     }
 }
@@ -2038,7 +2060,6 @@
     const uint newState = data.traversal.committed.State();
     RayHistoryWriteEnd(data, newState);
 
-    // Finished sorting, previously dead lanes may now have CHS|MS to execute and vice-versa
     if (nextShaderAddr != returnAddr)
     {
         const DXILShaderKind shaderKind = (DXILShaderKind)(data.IsMiss(newState) ?
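Reviewer note: a quick round trip through the new VPC packing helpers, using the bit positions stated in the code (priority in bits 0..2 of the 32-bit form; metadata starting at bit 16 of the high dword in the 64-bit form). This is an illustrative check, not GPURT code:

```cpp
#include <cassert>
#include <cstdint>

int main()
{
    const uint32_t vpc32 = 0x00001040u | 0x5u;  // 64-byte-aligned address, priority 5

    // Unpack32BitVpcTo64BitVpc: address keeps bits 6+, priority moves to bits 48..50.
    const uint64_t addr  = vpc32 & 0xFFFFFFC0u;
    const uint64_t vpc64 = addr | (uint64_t(vpc32 & 0x7u) << (32 + 16));

    // Pack64BitVpcTo32Bits: pull the priority back out of the high-dword metadata.
    const uint32_t prio     = uint32_t(vpc64 >> 32) >> 16;
    const uint32_t repacked = uint32_t(vpc64 & 0xFFFFFFFFu) | prio;

    assert(repacked == vpc32);  // lossless round trip
    return 0;
}
```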
@@ -2116,7 +2137,6 @@ export void _cont_Traversal(
         RayHistoryWriteAnyHitOrProceduralStatus(data);
     }
 
-    // Handle reordering of rays/threads before processing since dead lanes may become alive after sorting.
     // Execute traversal for active lanes.
     uint state = TRAVERSAL_STATE_COMMITTED_NOTHING;
     _AmdPrimitiveSystemState candidate = (_AmdPrimitiveSystemState)0;
@@ -2150,14 +2170,6 @@ export void _cont_Traversal(
     _AmdTraversalResultData result = (_AmdTraversalResultData)0;
 
     bool IsChsOrMiss = data.IsChsOrMiss(state);
-    // For sorting-enabled global mem mode, we only enqueue CHS/Miss once all
-    // lanes have arrived in this state.
-    // In non-sorting mode, we immediately enqueue CHS/Miss. This is mostly
-    // to replicate the old ProcessContinuation() behavior for now.
-    // We might want to consider also waiting for all lanes here in the non-global
-    // mem mode for consistency, and potentially also to have a common place
-    // in between Traversal and CHS/Miss where extra work can be done just once
-    // for all lanes, e.g. preparing system data for CHS/Miss.
     if ((_AmdContinuationStackIsGlobal() && WaveActiveAllTrue(IsChsOrMiss)) ||
         (!_AmdContinuationStackIsGlobal() && IsChsOrMiss))
     {
@@ -2167,11 +2179,6 @@ export void _cont_Traversal(
         GetNextHitMissPc(data, state, candidate, nextShaderAddr);
 
         bool hasWorkToDo = true;
-        // Avoid sorting on return addresses to RayGen (the case nextShaderValid == false), as it may create
-        // unexpected behavior and might increase execution divergence. For example, we might have multiple resume
-        // points due to divergent control flow in the TraceRay call, but those resume points are all copies of the same
-        // code. If we sort and re-read only from one bin, we might prevent future TraceRay calls from reconverging
-        // on traversal.
         if (_AmdContinuationStackIsGlobal() && (nextShaderAddr != 0))
         {
         }
@@ -2231,9 +2238,6 @@ export void _cont_Traversal(
     }
     else
     {
-        // The last remaining case is that we need to re-enqueue Traversal, because we are waiting for
-        // other lanes to finish BVH traversal before sorting, or to resume suspended lanes that wait for
-        // other lanes to run IS/AHS in early-is-ahs mode.
         //
         // Everything else needs to go back through scheduling/traversal, regardless of state
         // Note we don't need "Wait" here because priorities run AHS and IS first
diff --git a/src/shaders/LaneGroup.hlsl b/src/shaders/LaneGroup.hlsl
index 5e69227..3274c59 100644
--- a/src/shaders/LaneGroup.hlsl
+++ b/src/shaders/LaneGroup.hlsl
@@ -124,6 +124,22 @@ struct LaneGroup
         return AmdExtD3DShaderIntrinsics_WaveClusterMin(val, clusterSize);
     }
 
+    template<typename T>
+    T BitOr(T val)
+    {
+        const uint clusterSize = log2(groupSize) + 1;
+
+        return AmdExtD3DShaderIntrinsics_WaveClusterBitOr(val, clusterSize);
+    }
+
+    template<typename T>
+    T BitAnd(T val)
+    {
+        const uint clusterSize = log2(groupSize) + 1;
+
+        return AmdExtD3DShaderIntrinsics_WaveClusterBitAnd(val, clusterSize);
+    }
+
     template<typename T>
     T Broadcast(T val, uint targetLane)
     {
diff --git a/src/shaders/TaskCounter.hlsl b/src/shaders/TaskCounter.hlsl
index 034a897..30d5531 100644
--- a/src/shaders/TaskCounter.hlsl
+++ b/src/shaders/TaskCounter.hlsl
@@ -26,16 +26,32 @@
 #include "BuildSettings.hlsli"
 #endif
 
+//======================================================================================================================
+// Set a scratch buffer counter to 0 if it has a valid offset
+void InitScratchCounter(uint offset)
+{
+    if (offset != INVALID_IDX)
+    {
+        ScratchGlobal.Store(offset, 0);
+    }
+}
+
+//======================================================================================================================
+// Increase a scratch buffer counter and return its original value
+uint IncrementScratchCounter(uint offset, uint value)
+{
+    uint originalVal = 0;
+    ScratchGlobal.InterlockedAdd(offset, value, originalVal);
+    return originalVal;
+}
+
 //=====================================================================================================================
 // Increment task counter to mark a task / primitive as done
 uint IncrementTaskCounter(uint offset, uint value)
 {
     DeviceMemoryBarrier();
 
-    uint originalVal = 0;
-    ScratchGlobal.InterlockedAdd(offset, value, originalVal);
-
-    return originalVal;
+    return IncrementScratchCounter(offset, value);
 }
 
 //=====================================================================================================================
diff --git a/src/shaders/TaskQueueCounter.hlsl b/src/shaders/TaskQueueCounter.hlsl
index fd3303d..84aa2e5 100644
--- a/src/shaders/TaskQueueCounter.hlsl
+++ b/src/shaders/TaskQueueCounter.hlsl
@@ -86,13 +86,3 @@ bool EndTask(const uint localId, uint taskQueueOffset)
 
     return returnValue;
 }
-
-//======================================================================================================================
-// Set a scratch buffer counter to 0 if it has a valid index
-void InitScratchCounter(uint offset)
-{
-    if (offset != INVALID_IDX)
-    {
-        ScratchGlobal.Store(offset, 0);
-    }
-}
diff --git a/src/shaders/TraceRayCommon.hlsl b/src/shaders/TraceRayCommon.hlsl
index 3736e40..c22f9eb 100644
--- a/src/shaders/TraceRayCommon.hlsl
+++ b/src/shaders/TraceRayCommon.hlsl
@@ -30,6 +30,8 @@
 #endif
 
 #include "../../gpurt/gpurtDispatch.h"
+#include "llpc/GpurtIntrinsics.h"
+
 // Driver reserved space ID and resource bindings
 #define SPACEID space93
 
@@ -90,6 +92,26 @@ static uint CalculateHitGroupRecordAddress(
     );
 }
 
+//=====================================================================================================================
+static uint ConvertRtIpLevel(RayTracingIpLevel rtIpLevel)
+{
+    uint level = 0;
+
+    switch (rtIpLevel)
+    {
+    case RayTracingIpLevel::RtIp1_1:
+        level = GPURT_RTIP1_1;
+        break;
diff --git a/src/shaders/TraceRayCommon.hlsl b/src/shaders/TraceRayCommon.hlsl
index 3736e40..c22f9eb 100644
--- a/src/shaders/TraceRayCommon.hlsl
+++ b/src/shaders/TraceRayCommon.hlsl
@@ -30,6 +30,8 @@
 #endif
 
 #include "../../gpurt/gpurtDispatch.h"
+#include "llpc/GpurtIntrinsics.h"
+
 // Driver reserved space ID and resource bindings
 #define SPACEID space93
 
@@ -90,6 +92,26 @@ static uint CalculateHitGroupRecordAddress(
         );
 }
 
+//=====================================================================================================================
+static uint ConvertRtIpLevel(RayTracingIpLevel rtIpLevel)
+{
+    uint level = 0;
+
+    switch (rtIpLevel)
+    {
+    case RayTracingIpLevel::RtIp1_1:
+        level = GPURT_RTIP1_1;
+        break;
+    case RayTracingIpLevel::RtIp2_0:
+        level = GPURT_RTIP2_0;
+        break;
+    default:
+        break;
+    }
+
+    return level;
+}
+
 //=====================================================================================================================
 static HitGroupInfo FetchHitGroupInfo(
     uint hitGroupRecordIndex)
diff --git a/src/shaders/TrianglePrimitive.hlsl b/src/shaders/TrianglePrimitive.hlsl
index 25e02a1..e2975dc 100644
--- a/src/shaders/TrianglePrimitive.hlsl
+++ b/src/shaders/TrianglePrimitive.hlsl
@@ -113,10 +113,11 @@ uint3 FetchFaceIndices(
 // Vertex buffers only require an address and stride alignment of the format component size not the entire element size.
 // If the input data is not naturally aligned, we cannot use a single typed fetch for the 2-3 components. In this case,
 // we need to fetch each component separately.
+template<typename Float3Buffer>
 float3 FetchVertexPerComponent(
-    RWBuffer<float3> buffer,
-    uint             firstComponentIndex,
-    uint             numComponents)
+    Float3Buffer buffer,
+    uint         firstComponentIndex,
+    uint         numComponents)
 {
     float3 vertex;
     vertex.x = buffer[firstComponentIndex+0].x;
@@ -134,8 +135,9 @@ float3 FetchVertexPerComponent(
 }
 
 //=====================================================================================================================
+template<typename Float3Buffer>
 TriangleData FetchTriangleData(
-    RWBuffer<float3> buffer,
+    Float3Buffer buffer,
     uint vertexOffsetInComponents,
     uint3 index,
     uint strideInComponents,
@@ -181,8 +183,9 @@ uint CalcTriangleBoxNodeFlags(
 }
 
 //======================================================================================================================
+template<typename Float3Buffer>
 TriangleData FetchTransformedTriangleData(
-    in RWBuffer<float3> geometryBuffer,
+    in Float3Buffer     geometryBuffer,
    in uint3            faceIndices,
    in uint             geometryStride,
    in uint             vertexOffsetInComponents,
@@ -226,10 +229,11 @@ bool IsActive(TriangleData tri)
 
 //=====================================================================================================================
 // Helper function to fetch triangle data. Returns false if the vertex indices are out of bounds.
+template<typename Float3Buffer>
 bool FetchTrianglePrimitive(
     in BuildShaderGeometryConstants geomConstants,
     in NumPrimAndInputOffset        inputOffsets,
-    in RWBuffer<float3>             geometryBuffer,
+    in Float3Buffer                 geometryBuffer,
     in uint                         geomId,
     in uint                         primId,
     inout_param(TriangleData)       tri,
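[Editor note] Templating the triangle-fetch path over the buffer type lets one implementation serve both read-only and read-write vertex bindings. A sketch of the intended instantiation, with hypothetical resource names (`RoVertexBuffer`, `RwVertexBuffer`); HLSL deduces `Float3Buffer` from the argument type:

    Buffer<float3>   RoVertexBuffer; // hypothetical SRV binding
    RWBuffer<float3> RwVertexBuffer; // hypothetical UAV binding

    // Same per-component fetch body, two template instantiations.
    float3 v0 = FetchVertexPerComponent(RoVertexBuffer, firstComponentIndex, 3);
    float3 v1 = FetchVertexPerComponent(RwVertexBuffer, firstComponentIndex, 3);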
diff --git a/src/shaders/Update.hlsl b/src/shaders/Update.hlsl
index 0e06e40..035ad4b 100644
--- a/src/shaders/Update.hlsl
+++ b/src/shaders/Update.hlsl
@@ -133,18 +133,15 @@ void Update(
 
     const uint numGroups = ShaderRootConstants.numThreads / BUILD_THREADGROUP_SIZE;
 
-    {
-        ClearUpdateFlags(globalId);
-        BEGIN_TASK(numGroups);
-        EncodePrimitives(globalId, GEOMETRY_TYPE_TRIANGLES);
-        END_TASK(numGroups);
-
-        const uint numWorkItems = ScratchBuffer.Load(UPDATE_SCRATCH_STACK_NUM_ENTRIES_OFFSET);
-        UpdateQBVHImpl(globalId,
-                       numWorkItems,
-                       ShaderRootConstants.numThreads);
-    }
-
+    ClearUpdateFlags(globalId);
+    BEGIN_TASK(numGroups);
+    EncodePrimitives(globalId, GEOMETRY_TYPE_TRIANGLES);
+    END_TASK(numGroups);
+
+    const uint numWorkItems = ScratchBuffer.Load(UPDATE_SCRATCH_STACK_NUM_ENTRIES_OFFSET);
+    UpdateQBVHImpl(globalId,
+                   numWorkItems,
+                   ShaderRootConstants.numThreads);
 }
 
 //======================================================================================================================
diff --git a/src/shadersClean/common/Math.hlsli b/src/shadersClean/common/Math.hlsli
index 5c8356b..981b9b5 100644
--- a/src/shadersClean/common/Math.hlsli
+++ b/src/shadersClean/common/Math.hlsli
@@ -48,6 +48,13 @@ inline uint32_t bit(uint32_t index)
     return 1u << index;
 }
 
+//=====================================================================================================================
+// Helper function for producing a 16 bit mask of one bit
+inline uint16_t bit16(uint16_t index)
+{
+    return uint16_t(1u << index);
+}
+
 //=====================================================================================================================
 // Helper function for producing a 64 bit mask of one bit
 inline uint64_t bit64(uint32_t index)
diff --git a/tools/CompileRTShaders.py b/tools/CompileRTShaders.py
index 9ce4d59..67cd973 100644
--- a/tools/CompileRTShaders.py
+++ b/tools/CompileRTShaders.py
@@ -193,7 +193,7 @@ def getValidationCmdArgs(args) -> [str]:
 
     validateCommand = [compilerPath]
 
-    validateCommand += getBaseDxcCommandArgs(True, True, True)
+    validateCommand += getBaseDxcCommandArgs(True, True, False)
     validateCommand += ["-Wno-misplaced-attributes"] # -Wmisplaced-attributes is triggered by [RootSignature()]
                                                      # used by entrypoint code and compiled as library
     validateCommand += ['-Fo', 'temp.bin']
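[Editor note] The new `bit16` helper mirrors the existing `bit`/`bit64` helpers so callers get a 16-bit typed mask without manual casts. A small usage sketch (the flag index 3 is arbitrary, chosen only for illustration):

    uint16_t flags = 0;
    flags |= bit16(3);                           // sets bit 3 -> 0x0008
    const bool isSet = (flags & bit16(3)) != 0;  // mask math stays in uint16_t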