diff --git a/backends/pal/gpurtPalBackend.cpp b/backends/pal/gpurtPalBackend.cpp index 08f4571..d06394c 100644 --- a/backends/pal/gpurtPalBackend.cpp +++ b/backends/pal/gpurtPalBackend.cpp @@ -152,11 +152,11 @@ void PalBackend::WriteImmediateSingle( ImmediateDataWidth width ) const { - // We want to use HwPipePreCs (ME) so that the writes do not occur before UAV barriers are done waiting. + // We want to use StagePostPrefetch (ME) so that the writes do not occur before UAV barriers are done waiting. // Both internal barriers during the build and application barriers synchronizing access to acceleration - // structure memory wait at HwPipePreCs. + // structure memory wait at StagePostPrefetch. GetCmdBuffer(cmdBuffer)->CmdWriteImmediate( - Pal::HwPipePoint::HwPipePreCs, + Pal::PipelineStageFlag::PipelineStagePostPrefetch, value, GpuRtToPalImmediateDataWidth(width), destVa); diff --git a/cmake/GpuRtGenerateShaders.cmake b/cmake/GpuRtGenerateShaders.cmake index 4654fa0..42a779b 100644 --- a/cmake/GpuRtGenerateShaders.cmake +++ b/cmake/GpuRtGenerateShaders.cmake @@ -76,6 +76,7 @@ if (GPURT_ENABLE_GPU_DEBUG) set(debugShaderDirectory "${CMAKE_CURRENT_BINARY_DIR}/debugShaders/src/shaders/") set(gpurtShaderSource ${GPURT_SHADER_SOURCE_FILES}) set(gpurtShadersSourceDir ${debugShaderDirectory}) + set(gpurtShadersPreprocessInputFile "${CMAKE_CURRENT_BINARY_DIR}/debugShaders/DebugPreprocessShadersInput.txt") list(TRANSFORM gpurtShaderSource PREPEND "${debugShaderDirectory}") set(preprocessArgs "") foreach(originalSourceFile ${GPURT_SHADER_SOURCE_FILES}) @@ -84,10 +85,13 @@ if (GPURT_ENABLE_GPU_DEBUG) list(APPEND preprocessArgs "${originalSourcePath}" "${newSourceFilePath}") endforeach() set(gpurtDebugPreprocessorScript "${gpurtToolsDir}/DebugPreprocessShaders.py") + configure_file("${gpurtToolsDir}/DebugPreprocessShadersInput.txt.in" + ${gpurtShadersPreprocessInputFile} + ) add_custom_command( OUTPUT ${gpurtShaderSource} ${gpurtDebugInfoFile} - DEPENDS ${originalShaderSource} ${gpurtDebugPreprocessorScript} - COMMAND Python3::Interpreter ${gpurtDebugPreprocessorScript} ${preprocessArgs} ${gpurtDebugInfoFile} + DEPENDS ${originalShaderSource} ${gpurtDebugPreprocessorScript} ${gpurtShadersPreprocessInputFile} + COMMAND Python3::Interpreter ${gpurtDebugPreprocessorScript} -i ${gpurtShadersPreprocessInputFile} -o ${gpurtDebugInfoFile} ) else() set(gpurtShaderSource "${originalShaderSource}") diff --git a/gpurt/gpurt.h b/gpurt/gpurt.h index 68d5ef5..5d9d8e2 100644 --- a/gpurt/gpurt.h +++ b/gpurt/gpurt.h @@ -1471,13 +1471,21 @@ class IDevice // @param pDispatchRaysConstants (in/out) Non-null pointer to a DispatchRaysConstants // @param cpsMemoryGpuAddr (in) GPU address pointing to the beginning of cps memory // @param cpsMemoryBytes (in) Cps allocated memory size in bytes - // - // @return the required global memory allocation size in bytes virtual void PatchDispatchRaysConstants( DispatchRaysConstants* pDispatchRaysConstants, const gpusize cpsMemoryGpuAddr, const gpusize cpsMemoryBytes) = 0; + // Populates the GPU addresses in the InitExecuteIndirectConstants structure + // + // @param pInitExecuteIndirectConstants (in/out) Non-null pointer to an InitExecuteIndirectConstants + // @param cpsMemoryGpuAddr (in) GPU address pointing to the beginning of cps memory + // @param cpsMemoryBytes (in) Cps allocated memory size in bytes + virtual void PatchInitExecuteIndirectConstants( + GpuRt::InitExecuteIndirectConstants* pInitExecuteIndirectConstants, + const gpusize cpsMemoryGpuAddr, + const gpusize cpsMemoryBytes)
= 0; + // // @param cpsVideoMem [in] Cps video memory // @param cpsMemoryBytes [in] Cps allocated memory size in bytes @@ -1630,6 +1638,8 @@ class IDevice // Check if a build is a good candidate for ACE offload (typically barrier-free cases) virtual bool ShouldUseGangedAceForBuild(const AccelStructBuildInputs& inputs) const = 0; + virtual uint32 CalculateBvhPrimitiveCount(const AccelStructBuildInputs& inputs) const = 0; + protected: /// Client must create objects by explicitly calling CreateDevice method diff --git a/gpurt/gpurtDispatch.h b/gpurt/gpurtDispatch.h index 8f4ce03..fee7757 100644 --- a/gpurt/gpurtDispatch.h +++ b/gpurt/gpurtDispatch.h @@ -55,6 +55,8 @@ struct DispatchRaysTopLevelData uint32 accelStructTrackerSrd[MaxBufferSrdSize]; // Structured buffer SRD pointing to the accel struct tracker }; +#define DISPATCHRAYSCONSTANTDATA_STRUCT_OFFSET_DISPATCHID 48 + // Dispatch rays constant buffer data (GPU structure). Note, using unaligned uint64_t in HLSL constant buffers requires // -no-legacy-cbuf-layout for cpp style structure alignment to work. But currently that support is incomplete in DXC // and until that is resolved we need to use uint32's explicitly. @@ -74,7 +76,8 @@ struct DispatchRaysConstantData uint32 hitGroupTableBaseAddressLo; // Hit group table base address low 32-bits uint32 hitGroupTableBaseAddressHi; // Hit group table base address high 32-bits uint32 hitGroupTableStrideInBytes; // Hit group table record byte stride - uint32 reserved0; // Reserved padding + uint32 cpsDispatchId; // Continuations DispatchId, written in the persistent mode. + // This value should not be read via constant buffer. uint32 callableTableBaseAddressLo; // Callable shader table base address low 32-bits uint32 callableTableBaseAddressHi; // Callable shader table base address high 32-bits uint32 callableTableStrideInBytes; // Callable shader table byte stride @@ -96,6 +99,8 @@ struct DispatchRaysConstantData uint32 cpsGlobalMemoryAddressLo; // Separate CPS stack memory base address low 32-bits uint32 cpsGlobalMemoryAddressHi; // Separate CPS stack memory base address high 32-bits uint32 counterMask; // Mask for filtering ray history token + uint32 cpsDispatchIdAddressLo; // Continuations cpsDispatchId address low 32-bits + uint32 cpsDispatchIdAddressHi; // Continuations cpsDispatchId address high 32-bits }; #pragma pack(pop) @@ -109,6 +114,8 @@ struct DispatchRaysConstants #if __cplusplus static_assert((sizeof(DispatchRaysConstants) % sizeof(uint32)) == 0, "DispatchRaysConstants is not dword-aligned"); +static_assert(DISPATCHRAYSCONSTANTDATA_STRUCT_OFFSET_DISPATCHID == offsetof(DispatchRaysConstantData, cpsDispatchId), + "DISPATCHRAYSCONSTANTDATA_STRUCT_OFFSET_DISPATCHID mismatches to cpsDispatchId"); constexpr uint32 DispatchRaysConstantsDw = sizeof(DispatchRaysConstants) / sizeof(uint32); #endif @@ -132,6 +139,17 @@ struct InitExecuteIndirectUserData // Constants for InitExecuteIndirect shader struct InitExecuteIndirectConstants { +#if __cplusplus + // Internal counter buffer SRDs + uint32 internalUavSrd[MaxSupportedIndirectCounters][MaxBufferSrdSize]; + + // Internal acceleration structure tracker buffer SRD. 
+ uint32 accelStructTrackerSrd[MaxBufferSrdSize]; +#else + uint4 internalUavSrd[MaxSupportedIndirectCounters][MaxBufferSrdSize / 4]; + uint4 accelStructTrackerSrd[MaxBufferSrdSize / 4]; +#endif + uint32 inputBytesPerDispatch; // Size of application indirect arguments uint32 outputBytesPerDispatch; // Size of resulting driver internal arguments uint32 bindingArgsSize; // Size of binding arguments in the app buffer preceding the dispatch @@ -160,18 +178,10 @@ struct InitExecuteIndirectConstants uint32 counterRayIdRangeBegin; // Counter ray ID range begin uint32 counterRayIdRangeEnd; // Counter ray ID range end uint32 cpsBackendStackSize; // Scratch memory used by a compiler backend, start at offset 0 - uint32 padding0; // Padding for 16-byte alignment + uint32 cpsFrontendStackSize; // Scratch memory used by IR (Intermediate Representation), for a continuation passing shader -#if __cplusplus - // Internal counter buffer SRDs - uint32 internalUavSrd[MaxSupportedIndirectCounters][MaxBufferSrdSize]; - - // Internal acceleration structure tracker buffer SRD. - uint32 accelStructTrackerSrd[MaxBufferSrdSize]; -#else - uint4 internalUavSrd[MaxSupportedIndirectCounters][MaxBufferSrdSize / 4]; - uint4 accelStructTrackerSrd[MaxBufferSrdSize / 4]; -#endif + uint32 cpsGlobalMemoryAddressLo; // Separate CPS stack memory base address low 32-bits + uint32 cpsGlobalMemoryAddressHi; // Separate CPS stack memory base address high 32-bits }; constexpr uint32 InitExecuteIndirectConstantsDw = sizeof(InitExecuteIndirectConstants) / sizeof(uint32); @@ -184,7 +194,7 @@ static_assert((MaxBufferSrdSize == 4), "Buffer SRD size changed, affected shader #endif static_assert((sizeof(InitExecuteIndirectConstants) % sizeof(uint32)) == 0, "InitExecuteIndirectConstants is not dword-aligned"); -} +} // namespace GpuRt #endif #endif diff --git a/src/gpurtBvhBuilder.cpp b/src/gpurtBvhBuilder.cpp index 47e4043..59287b6 100644 --- a/src/gpurtBvhBuilder.cpp +++ b/src/gpurtBvhBuilder.cpp @@ -1629,7 +1629,7 @@ AccelStructHeader BvhBuilder::InitAccelStructHeader() const header.geometryType = static_cast<uint32>(m_buildConfig.geometryType); header.uuidLo = Util::LowPart(m_deviceSettings.accelerationStructureUUID); header.uuidHi = Util::HighPart(m_deviceSettings.accelerationStructureUUID); - header.rtIpLevel = uint32(m_pDevice->GetRtIpLevel()); + header.rtIpLevel = static_cast<uint32>(PalToGpuRtIpLevel(m_pDevice->GetRtIpLevel())); if (m_buildConfig.topLevelBuild) { @@ -2313,8 +2313,8 @@ void BvhBuilder::GetAccelerationStructurePrebuildInfo( // the build when performing the update causing page faults. scratchDataSize = Util::Max(scratchDataSize, updateDataSize); - // Some applications crash when the driver reports 0 scratch size. Use 1 instead. - scratchDataSize = Util::Max(1u, scratchDataSize); + // Some applications crash when the driver reports 0 scratch size. Use 1 DWORD instead.
+ scratchDataSize = Util::Max(static_cast<uint32>(sizeof(uint32)), scratchDataSize); prebuildInfo.scratchDataSizeInBytes = scratchDataSize; prebuildInfo.updateScratchDataSizeInBytes = updateDataSize; diff --git a/src/gpurtDevice.cpp b/src/gpurtDevice.cpp index 6058d33..f3b2f7a 100644 --- a/src/gpurtDevice.cpp +++ b/src/gpurtDevice.cpp @@ -467,6 +467,18 @@ Pal::Result Device::InitializeCpsMemory( return result; } +//===================================================================================================================== +// Populates the GPU addresses in the Constant structure +template<typename ConstantsType> +void Device::PatchConstants(ConstantsType* pConstant, + const gpusize cpsMemoryGpuAddr, + const gpusize cpsMemoryBytes) +{ + pConstant->cpsGlobalMemoryAddressLo = Util::LowPart(cpsMemoryGpuAddr); + pConstant->cpsGlobalMemoryAddressHi = Util::HighPart(cpsMemoryGpuAddr); + +} + //===================================================================================================================== // Populates the GPU addresses in the DispatchRaysConstants structure void Device::PatchDispatchRaysConstants( @@ -474,9 +486,17 @@ void Device::PatchDispatchRaysConstants( const gpusize cpsMemoryGpuAddr, const gpusize cpsMemoryBytes) { - pDispatchRaysConstants->constData.cpsGlobalMemoryAddressLo = Util::LowPart(cpsMemoryGpuAddr); - pDispatchRaysConstants->constData.cpsGlobalMemoryAddressHi = Util::HighPart(cpsMemoryGpuAddr); + PatchConstants(&pDispatchRaysConstants->constData, cpsMemoryGpuAddr, cpsMemoryBytes); +} +//===================================================================================================================== +// Populates the GPU addresses in the InitExecuteIndirectConstants structure +void Device::PatchInitExecuteIndirectConstants( + GpuRt::InitExecuteIndirectConstants* pInitExecuteIndirectConstants, + const gpusize cpsMemoryGpuAddr, + const gpusize cpsMemoryBytes) +{ + PatchConstants(pInitExecuteIndirectConstants, cpsMemoryGpuAddr, cpsMemoryBytes); } //===================================================================================================================== @@ -2125,6 +2145,27 @@ bool Device::ShouldUseGangedAceForBuild( return shouldUseGangedAce; } +// ===================================================================================================================== +uint32 Device::CalculateBvhPrimitiveCount( + const AccelStructBuildInputs& inputs + ) const +{ + // For top-level acceleration structure, inputElementCount represents the number of instances + uint32 primitiveCount = (inputs.type == AccelStructType::TopLevel) ?
inputs.inputElemCount : 0; + + if (inputs.type == AccelStructType::BottomLevel) + { + for (uint32 i = 0; i < inputs.inputElemCount; ++i) + { + const Geometry geometry = m_clientCb.pfnConvertAccelStructBuildGeometry(inputs, i); + const uint32 geometryPrimCount = BvhBuilder::GetGeometryPrimCount(geometry); + primitiveCount += geometryPrimCount; + } + } + + return primitiveCount; +} + // ===================================================================================================================== const AccelStructBuildInputs Device::OverrideBuildInputs( const AccelStructBuildInputs& inputs diff --git a/src/gpurtInternal.h b/src/gpurtInternal.h index 7cf7f2c..f59b164 100644 --- a/src/gpurtInternal.h +++ b/src/gpurtInternal.h @@ -106,6 +106,42 @@ enum EncodeFlags : uint32 EncodeFlagFusedInstanceNode = 0x00000008, }; +// Values should remain stable for RRA binary-compatibility (PAL equivalents do not guarantee stability) +enum RtIpLevel : uint32 +{ + RtIpNone = 0x0, ///< The device does not have a RayTracing IP level + RtIp1_0 = 0x1, ///< First Implementation of HW RT + RtIp1_1 = 0x2, ///< Added computation of triangle barycentrics into HW + RtIp2_0 = 0x3, ///< Added more Hardware RayTracing features, such as BoxSort, PointerFlag, etc. + RtIpReserved = 0x5, ///< Special value, should not be used +}; + +// ===================================================================================================================== +// Convert PAL RtIpLevel values to their GpuRT equivalent +static RtIpLevel PalToGpuRtIpLevel(Pal::RayTracingIpLevel palRtIpLevel) +{ + RtIpLevel gpuRtIpLevel = RtIpLevel::RtIpNone; + + switch (palRtIpLevel) + { + case Pal::RayTracingIpLevel::RtIp1_0: + gpuRtIpLevel = RtIpLevel::RtIp1_0; + break; + case Pal::RayTracingIpLevel::RtIp1_1: + gpuRtIpLevel = RtIpLevel::RtIp1_1; + break; + case Pal::RayTracingIpLevel::RtIp2_0: + gpuRtIpLevel = RtIpLevel::RtIp2_0; + break; + case Pal::RayTracingIpLevel::None: + default: + gpuRtIpLevel = RtIpLevel::RtIpNone; + break; + } + + return gpuRtIpLevel; +} + struct RadixSortConfig { uint32 workGroupSize; @@ -336,13 +372,21 @@ class Device : public IDevice // @param pDispatchRaysConstants (in/out) Non-null pointer to a DispatchRaysConstants // @param cpsMemoryGpuAddr (in) GPU address pointing to the beginning of cps memory // @param cpsMemoryBytes (in) Cps allocated memory size in bytes - // - // @return the required global memory allocation size in bytes virtual void PatchDispatchRaysConstants( DispatchRaysConstants* pDispatchRaysConstants, const gpusize cpsMemoryGpuAddr, const gpusize cpsMemoryBytes) override; + // Populates the GPU addresses in the InitExecuteIndirectConstants structure + // + // @param pInitExecuteIndirectConstants (in/out) Non-null pointer to an InitExecuteIndirectConstants + // @param cpsMemoryGpuAddr (in) GPU address pointing to the beginning of cps memory + // @param cpsMemoryBytes (in) Cps allocated memory size in bytes + virtual void PatchInitExecuteIndirectConstants( + GpuRt::InitExecuteIndirectConstants* pInitExecuteIndirectConstants, + const gpusize cpsMemoryGpuAddr, + const gpusize cpsMemoryBytes) override; + // // @param cpsVideoMem [in] Cps video memory // @param cpsMemoryBytes [in] Cps allocated memory size in bytes @@ -683,6 +727,8 @@ class Device : public IDevice virtual bool ShouldUseGangedAceForBuild(const AccelStructBuildInputs& inputs) const override; + virtual uint32 CalculateBvhPrimitiveCount(const AccelStructBuildInputs& inputs) const override; + // Returns size in DWORDs of a typed buffer view
SRD uint32 GetTypedBufferSrdSizeDw() const { return m_typedBufferSrdSizeDw; }; @@ -722,6 +768,12 @@ class Device : public IDevice virtual ~Device() override; + template<typename ConstantsType> + void PatchConstants( + ConstantsType* pConstant, + const gpusize cpsMemoryGpuAddr, + const gpusize cpsMemoryBytes); + DeviceInitInfo m_info; Util::GenericAllocatorTracked m_allocator; diff --git a/src/gpurtTraceSource.cpp b/src/gpurtTraceSource.cpp index 36c5b3e..209a92c 100644 --- a/src/gpurtTraceSource.cpp +++ b/src/gpurtTraceSource.cpp @@ -51,7 +51,7 @@ void AccelStructTraceSource::OnTraceBegin( if (m_pDevice->AccelStructTrackerGpuAddr() != 0) { // Before starting the trace set tracking to enabled. - pCmdBuf->CmdWriteImmediate(Pal::HwPipeBottom, 1, Pal::ImmediateDataWidth::ImmediateData32Bit, + pCmdBuf->CmdWriteImmediate(Pal::PipelineStageBottomOfPipe, 1, Pal::ImmediateDataWidth::ImmediateData32Bit, m_pDevice->AccelStructTrackerGpuAddr() + offsetof(AccelStructTracker, enabled)); m_pDevice->RaytracingBarrier(pCmdBuf, BarrierFlagSyncPostCpWrite); } @@ -67,7 +67,7 @@ void AccelStructTraceSource::OnTraceEnd( if (m_pDevice->AccelStructTrackerGpuAddr() != 0) { // Disable tracking. - pCmdBuf->CmdWriteImmediate(Pal::HwPipeBottom, 0, Pal::ImmediateDataWidth::ImmediateData32Bit, + pCmdBuf->CmdWriteImmediate(Pal::PipelineStageBottomOfPipe, 0, Pal::ImmediateDataWidth::ImmediateData32Bit, m_pDevice->AccelStructTrackerGpuAddr() + offsetof(AccelStructTracker, enabled)); m_pDevice->RaytracingBarrier(pCmdBuf, BarrierFlagSyncPostCpWrite); } diff --git a/src/options.yaml b/src/options.yaml index 4ea170e..859c740 100644 --- a/src/options.yaml +++ b/src/options.yaml @@ -32,12 +32,12 @@ enum CpsCandidatePrimitiveMode: # Controls how candidate primitives are handled in the continuations (CPS) software Traversal loop. SuspendLane: # Suspend a lane upon candidate hits and wait for other lanes to end the Traversal loop. # This is the default. Other modes are experimental and might not be implemented on all RtIps. - SuspendWave: # On each Traversal iteration, check whether any lane has a candidate, and break if so. - # Only implemented for RtIp 2.0, all other cases use SuspendLane. DeferFirst: # When finding the first candidate, record it and ignore it for the time being. At the end of the # Traversal loop, process pending candidates. When finding the second candidate, immediately break # out of the loop to first process the first one. - # Only implemented for triangle primitives on RtIp 2.0, all other cases use SuspendLane. + # Implementation status: + # * RtIp 1.1: Not supported, SuspendLane is always used. + # * RtIp 2.0: DeferFirst is supported, but only for triangle primitives. # ------------------------------------------------------------------------------------------------------------------ # This is the definition of the single options struct.
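The DeferFirst mode described in the options.yaml hunk above is easiest to see as control flow. The following is an illustrative HLSL-style sketch of the scheme the option text describes, not code from this change; `pendingCandidate`, `hasPendingCandidate`, `ProcessCandidate`, and the loop locals are hypothetical names:

    // Sketch of CpsCandidatePrimitiveMode::DeferFirst (hypothetical names).
    bool hasPendingCandidate = false;
    while (IsValidNode(nextNodePtr))
    {
        // ... intersect the current node, possibly producing a candidate ...
        if (laneHasCandidate)
        {
            if (hasPendingCandidate == false)
            {
                // First candidate: record it and keep traversing for now.
                pendingCandidate    = candidate;
                hasPendingCandidate = true;
            }
            else
            {
                // Second candidate: break out so the first one is processed first.
                break;
            }
        }
    }
    if (hasPendingCandidate)
    {
        ProcessCandidate(pendingCandidate); // run AHS/IS for the deferred candidate
    }
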
diff --git a/src/shaders/BuildBVH.hlsl b/src/shaders/BuildBVH.hlsl index 202f0f9..8ec5c77 100644 --- a/src/shaders/BuildBVH.hlsl +++ b/src/shaders/BuildBVH.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #include "BuildRootSignature.hlsl" diff --git a/src/shaders/BuildBVHTDTR.hlsl b/src/shaders/BuildBVHTDTR.hlsl index 7d1d71f..2614a34 100644 --- a/src/shaders/BuildBVHTDTR.hlsl +++ b/src/shaders/BuildBVHTDTR.hlsl @@ -188,7 +188,7 @@ struct StateTDBuild #define USE_LDS 1 //===================================================================================================================== -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #define GC_DSTMETADATA #define GC_SCRATCHBUFFER diff --git a/src/shaders/BuildFastAgglomerativeLbvh.hlsl b/src/shaders/BuildFastAgglomerativeLbvh.hlsl index 526053c..56328b6 100644 --- a/src/shaders/BuildFastAgglomerativeLbvh.hlsl +++ b/src/shaders/BuildFastAgglomerativeLbvh.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #define GC_SCRATCHBUFFER #include "BuildRootSignature.hlsl" diff --git a/src/shaders/BuildPLOC.hlsl b/src/shaders/BuildPLOC.hlsl index 2c39642..079f1bc 100644 --- a/src/shaders/BuildPLOC.hlsl +++ b/src/shaders/BuildPLOC.hlsl @@ -88,7 +88,7 @@ struct BuildPlocArgs #include "Common.hlsl" //===================================================================================================================== -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #define GC_DSTMETADATA #define GC_SCRATCHBUFFER diff --git a/src/shaders/BuildParallel.hlsl b/src/shaders/BuildParallel.hlsl index eaf9090..8a3df86 100644 --- a/src/shaders/BuildParallel.hlsl +++ b/src/shaders/BuildParallel.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #define BUILD_PARALLEL 1 -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #define TASK_COUNTER_BUFFER ScratchGlobal #define TASK_COUNTER_OFFSET (ShaderConstants.offsets.taskLoopCounters + TASK_LOOP_BUILD_PARALLEL_COUNTER_OFFSET) diff --git a/src/shaders/BuildQBVH.hlsl b/src/shaders/BuildQBVH.hlsl index 512496a..60e527f 100644 --- a/src/shaders/BuildQBVH.hlsl +++ b/src/shaders/BuildQBVH.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #if NO_SHADER_ENTRYPOINT == 0 -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #define GC_DSTBUFFER #define GC_DSTMETADATA diff --git a/src/shaders/BuildRootSignature.hlsl b/src/shaders/BuildRootSignature.hlsl index 2df19b0..15c48e2 100644 --- a/src/shaders/BuildRootSignature.hlsl +++ b/src/shaders/BuildRootSignature.hlsl @@ -22,7 +22,7 @@ * SOFTWARE. 
* **********************************************************************************************************************/ -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" // DebugBuffer #if GPURT_ENABLE_GPU_DEBUG diff --git a/src/shaders/BuildSettings.hlsli b/src/shaders/BuildSettings.hlsli index 5929195..ac6e315 100644 --- a/src/shaders/BuildSettings.hlsli +++ b/src/shaders/BuildSettings.hlsli @@ -26,6 +26,8 @@ #ifndef _BUILDSETTINGS_HLSLI #define _BUILDSETTINGS_HLSLI +#include "../shadersClean/common/ShaderDefs.hlsli" + [[vk::constant_id(BUILD_SETTINGS_DATA_TOP_LEVEL_BUILD_ID)]] uint topLevelBuild = 0; [[vk::constant_id(BUILD_SETTINGS_DATA_BUILD_MODE_ID)]] uint buildMode = 0; [[vk::constant_id(BUILD_SETTINGS_DATA_TRIANGLE_COMPRESSION_MODE_ID)]] uint triangleCompressionMode = 0; diff --git a/src/shaders/CMakeLists.txt b/src/shaders/CMakeLists.txt index 54ef25e..ac75c24 100644 --- a/src/shaders/CMakeLists.txt +++ b/src/shaders/CMakeLists.txt @@ -59,7 +59,7 @@ set(gpurtHlsl CompactAS.hlsl CompactAS1_1.hlsl CompactCommon.hlsl - CopyAS.hlsl + ../shadersClean/build/CopyAS.hlsl ../shadersClean/build/CopyBufferRaw.hlsl DecodeAS.hlsl DecodeCommon.hlsl @@ -129,7 +129,6 @@ set(otherDeps ../shadersClean/common/InstanceDesc.hlsli ../shadersClean/common/NodePointers.hlsli ../shadersClean/common/ScratchNode.hlsli - ../shadersClean/common/TempAssert.hlsli ../shadersClean/traversal/TraversalDefs.hlsli ../shadersClean/common/gfx10/BoxNode1_0.hlsli ../shadersClean/common/gfx10/InstanceNode1_0.hlsli diff --git a/src/shaders/Common.hlsl b/src/shaders/Common.hlsl index 1b55ccf..92f72d9 100644 --- a/src/shaders/Common.hlsl +++ b/src/shaders/Common.hlsl @@ -34,7 +34,7 @@ #ifndef _COMMON_HLSL #define _COMMON_HLSL -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #include "../shadersClean/common/ScratchNode.hlsli" typedef AccelStructDataOffsets AccelStructOffsets; diff --git a/src/shaders/Continuations2_0.hlsl b/src/shaders/Continuations2_0.hlsl index 221143c..73293dc 100644 --- a/src/shaders/Continuations2_0.hlsl +++ b/src/shaders/Continuations2_0.hlsl @@ -509,39 +509,10 @@ static void TraversalInternal2_0( } bool laneHasCandidate = (state < TRAVERSAL_STATE_COMMITTED_NOTHING); - if (Options::getCpsCandidatePrimitiveMode() == CpsCandidatePrimitiveMode::SuspendWave) + if (laneHasCandidate) { - - // Stopping the Traversal loop for the whole wave on the first AHS/IS might be too aggressive. - // We implement this basic version here as basis for further experiments. 
- // Delaying it a bit could have potential benefits: - // * avoid overhead of wave-intrinsic in every iteration (depending on the implementation of delaying) - // * letting more lanes join the IS/AHS work - if (WaveActiveAnyTrue(laneHasCandidate)) - { - if (laneHasCandidate) - { - // Break out of traversal to run AHS/IS - } - else if (IsValidNode(nextNodePtr)) - { - // Break out of traversal so other lanes can run AHS/IS and re-join traversal - state = TRAVERSAL_STATE_SUSPEND_TRAVERSAL; - } - else - { - // The lane is done with Traversal, and wants to run CHS or Miss - } - break; - } - } - else - { - if (laneHasCandidate) - { - // Break out of traversal to run AHS/IS - break; - } + // Break out of traversal to run AHS/IS + break; } } diff --git a/src/shaders/EncodeCommon.hlsl b/src/shaders/EncodeCommon.hlsl index fb666c6..0aa00b0 100644 --- a/src/shaders/EncodeCommon.hlsl +++ b/src/shaders/EncodeCommon.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #include "BuildCommonScratch.hlsl" -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #include "TrianglePrimitive.hlsl" #include "UpdateCommon.hlsl" diff --git a/src/shaders/EncodeNodes.hlsl b/src/shaders/EncodeNodes.hlsl index 2075069..3ee98e6 100644 --- a/src/shaders/EncodeNodes.hlsl +++ b/src/shaders/EncodeNodes.hlsl @@ -22,7 +22,7 @@ * SOFTWARE. * **********************************************************************************************************************/ -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #define GC_DSTBUFFER #define GC_DSTMETADATA diff --git a/src/shaders/EncodeTopLevel.hlsl b/src/shaders/EncodeTopLevel.hlsl index 00419bc..90c1954 100644 --- a/src/shaders/EncodeTopLevel.hlsl +++ b/src/shaders/EncodeTopLevel.hlsl @@ -22,7 +22,7 @@ * SOFTWARE. * **********************************************************************************************************************/ -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #include "BuildRootSignature.hlsl" diff --git a/src/shaders/EncodeTopLevelBuild.hlsl b/src/shaders/EncodeTopLevelBuild.hlsl index 2424f4a..097c3ac 100644 --- a/src/shaders/EncodeTopLevelBuild.hlsl +++ b/src/shaders/EncodeTopLevelBuild.hlsl @@ -22,7 +22,7 @@ * SOFTWARE. 
* **********************************************************************************************************************/ -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #include "../shadersClean/common/ScratchNode.hlsli" //===================================================================================================================== diff --git a/src/shaders/Extensions.hlsl b/src/shaders/Extensions.hlsl index e78c92b..176dff2 100644 --- a/src/shaders/Extensions.hlsl +++ b/src/shaders/Extensions.hlsl @@ -29,6 +29,13 @@ #include "../shadersClean/common/Extensions.hlsli" #include "../shadersClean/common/Math.hlsli" +#define AmdExtD3DShaderIntrinsicsWaveOp_MinF 0x07 +#define AmdExtD3DShaderIntrinsicsWaveOp_MaxF 0x0a +#define AmdExtD3DShaderIntrinsicsWaveOp_Inclusive 0x01 + +#define AmdExtClusteredSubgroup 3 +#define AmdExtClusteredReduce 3 + // Dummy implementation for Vulkan build only __decl uint AmdExtD3DShaderIntrinsics_LoadDwordAtAddr( uint gpuVaLoBits, uint gpuVaHiBits, uint offset) DUMMY_UINT_FUNC @@ -57,10 +64,6 @@ __decl uint2 AmdExtD3DShaderIntrinsics_AtomicMinU64( __decl uint2 AmdExtD3DShaderIntrinsics_ShaderClock() DUMMY_UINT2_FUNC __decl uint2 AmdExtD3DShaderIntrinsics_ShaderRealtimeClock() DUMMY_UINT2_FUNC -#define AmdExtD3DShaderIntrinsicsWaveOp_MinF 0x07 -#define AmdExtD3DShaderIntrinsicsWaveOp_MaxF 0x0a -#define AmdExtD3DShaderIntrinsicsWaveOp_Inclusive 0x01 - __decl float3 AmdExtD3DShaderIntrinsics_WaveScan( uint waveOp, uint flags, float3 src) DUMMY_FLOAT3_FUNC @@ -112,56 +115,115 @@ __decl float3 AmdExtD3DShaderIntrinsics_FloatOpWithRoundMode( uint roundMode, uint operation, float3 src0, float3 src1) DUMMY_FLOAT3_FUNC //===================================================================================================================== -// Sub-group wave reductions +// Sub-group wave reductions spirv ops // Ref: https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#_instructions [[vk::ext_capability(/* GroupNonUniform */ 61)]] [[vk::ext_capability(/* GroupNonUniformArithmetic */ 63)]] [[vk::ext_capability(/* GroupNonUniformClustered */ 67)]] - [[vk::ext_instruction(350)]] float spirv_OpGroupNonUniformFAdd_clustered(uint scope, [[vk::ext_literal]] uint op, float value, uint clusterSize); +[[vk::ext_capability(/* GroupNonUniform */ 61)]] +[[vk::ext_capability(/* GroupNonUniformArithmetic */ 63)]] +[[vk::ext_capability(/* GroupNonUniformClustered */ 67)]] +[[vk::ext_instruction(355)]] +float spirv_OpGroupNonUniformFMin_clustered(uint scope, [[vk::ext_literal]] uint op, float value, uint clusterSize); + +[[vk::ext_capability(/* GroupNonUniform */ 61)]] +[[vk::ext_capability(/* GroupNonUniformArithmetic */ 63)]] +[[vk::ext_capability(/* GroupNonUniformClustered */ 67)]] +[[vk::ext_instruction(358)]] +float spirv_OpGroupNonUniformFMax_clustered(uint scope, [[vk::ext_literal]] uint op, float value, uint clusterSize); + +[[vk::ext_capability(/* GroupNonUniform */ 61)]] +[[vk::ext_capability(/* GroupNonUniformArithmetic */ 63)]] +[[vk::ext_capability(/* GroupNonUniformClustered */ 67)]] +[[vk::ext_instruction(359)]] +uint spirv_OpGroupNonUniformBitwiseAnd_clustered(uint scope, [[vk::ext_literal]] uint op, uint value, uint clusterSize); + +[[vk::ext_capability(/* GroupNonUniform */ 61)]] +[[vk::ext_capability(/* GroupNonUniformArithmetic */ 63)]] +[[vk::ext_capability(/* GroupNonUniformClustered */ 67)]] +[[vk::ext_instruction(360)]] +uint spirv_OpGroupNonUniformBitwiseOr_clustered(uint scope, [[vk::ext_literal]] uint op, uint 
value, uint clusterSize); + +//===================================================================================================================== +// GpuRt WaveClusterSum Intrinsics float AmdExtD3DShaderIntrinsics_WaveClusterSum(float x, uint dxClusterSize) { const uint clusterSize = (1u << (dxClusterSize - 1)); - return spirv_OpGroupNonUniformFAdd_clustered(/* Subgroup */ 3, /* ClusteredReduce */ 3, x, clusterSize); + return spirv_OpGroupNonUniformFAdd_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, x, clusterSize); } -[[vk::ext_instruction(355)]] -float spirv_OpGroupNonUniformFMin_clustered(uint scope, [[vk::ext_literal]] uint op, float value, uint clusterSize); - +//===================================================================================================================== +// GpuRt WaveClusterMin Intrinsics float AmdExtD3DShaderIntrinsics_WaveClusterMin(float x, uint dxClusterSize) { const uint clusterSize = (1u << (dxClusterSize - 1)); - return spirv_OpGroupNonUniformFMin_clustered(/* Subgroup */ 3, /* ClusteredReduce */ 3, x, clusterSize); + return spirv_OpGroupNonUniformFMin_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, x, clusterSize); } -[[vk::ext_instruction(358)]] -float spirv_OpGroupNonUniformFMax_clustered(uint scope, [[vk::ext_literal]] uint op, float value, uint clusterSize); +float2 AmdExtD3DShaderIntrinsics_WaveClusterMin(float2 val, uint dxClusterSize) +{ + float2 result; + const uint clusterSize = (1u << (dxClusterSize - 1)); + result.x = spirv_OpGroupNonUniformFMin_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val.x, clusterSize); + result.y = spirv_OpGroupNonUniformFMin_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val.y, clusterSize); + return result; +} -float AmdExtD3DShaderIntrinsics_WaveClusterMax(float x, uint dxClusterSize) +float3 AmdExtD3DShaderIntrinsics_WaveClusterMin(float3 val, uint dxClusterSize) { + float3 result; const uint clusterSize = (1u << (dxClusterSize - 1)); - return spirv_OpGroupNonUniformFMax_clustered(/* Subgroup */ 3, /* ClusteredReduce */ 3, x, clusterSize); + result.x = spirv_OpGroupNonUniformFMin_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val.x, clusterSize); + result.y = spirv_OpGroupNonUniformFMin_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val.y, clusterSize); + result.z = spirv_OpGroupNonUniformFMin_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val.z, clusterSize); + return result; } -[[vk::ext_instruction(359)]] -uint spirv_OpGroupNonUniformBitwiseAnd_clustered(uint scope, [[vk::ext_literal]] uint op, uint value, uint clusterSize); +//===================================================================================================================== +// GpuRt WaveClusterMax Intrinsics +float AmdExtD3DShaderIntrinsics_WaveClusterMax(float val, uint dxClusterSize) +{ + const uint clusterSize = (1u << (dxClusterSize - 1)); + return spirv_OpGroupNonUniformFMax_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val, clusterSize); +} -uint AmdExtD3DShaderIntrinsics_WaveClusterBitAnd(uint x, uint dxClusterSize) +float2 AmdExtD3DShaderIntrinsics_WaveClusterMax(float2 val, uint dxClusterSize) { + float2 result; const uint clusterSize = (1u << (dxClusterSize - 1)); - return spirv_OpGroupNonUniformBitwiseAnd_clustered(/* Subgroup */ 3, /* ClusteredReduce */ 3, x, clusterSize); + result.x = spirv_OpGroupNonUniformFMax_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val.x, clusterSize); + result.y = 
spirv_OpGroupNonUniformFMax_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val.y, clusterSize); + return result; } -[[vk::ext_instruction(360)]] -uint spirv_OpGroupNonUniformBitwiseOr_clustered(uint scope, [[vk::ext_literal]] uint op, uint value, uint clusterSize); +float3 AmdExtD3DShaderIntrinsics_WaveClusterMax(float3 val, uint dxClusterSize) +{ + float3 result; + const uint clusterSize = (1u << (dxClusterSize - 1)); + result.x = spirv_OpGroupNonUniformFMax_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val.x, clusterSize); + result.y = spirv_OpGroupNonUniformFMax_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val.y, clusterSize); + result.z = spirv_OpGroupNonUniformFMax_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val.z, clusterSize); + return result; +} +//===================================================================================================================== +// GpuRt WaveClusterBitAnd Intrinsics +uint AmdExtD3DShaderIntrinsics_WaveClusterBitAnd(uint x, uint dxClusterSize) +{ + const uint clusterSize = (1u << (dxClusterSize - 1)); + return spirv_OpGroupNonUniformBitwiseAnd_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, x, clusterSize); +} + +//===================================================================================================================== +// GpuRt WaveClusterBitOr Intrinsics uint AmdExtD3DShaderIntrinsics_WaveClusterBitOr(uint x, uint dxClusterSize) { const uint clusterSize = (1u << (dxClusterSize - 1)); - return spirv_OpGroupNonUniformBitwiseOr_clustered(/* Subgroup */ 3, /* ClusteredReduce */ 3, x, clusterSize); + return spirv_OpGroupNonUniformBitwiseOr_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, x, clusterSize); } //===================================================================================================================== @@ -317,6 +379,7 @@ __decl uint64_t AmdExtConstantLoad64AtAddr(GpuVirtualAddress addr, uint offset) __decl uint AmdExtDispatchThreadIdFlat() DUMMY_UINT_FUNC; //===================================================================================================================== +__decl uint AmdExtAtomicAddAtAddr(uint64_t gpuVa, uint offset, uint value) DUMMY_UINT_FUNC; __decl uint64_t AmdExtAtomic64AddAtAddr(uint64_t gpuVa, uint offset, uint64_t value) DUMMY_UINT_FUNC __decl uint64_t AmdExtAtomic64CmpXchgAtAddr(uint64_t gpuVa, uint offset, uint64_t compare_value, uint64_t value) DUMMY_UINT_FUNC __decl uint64_t AmdExtLoad64AtAddrUncached(uint64_t gpuVa, uint offset) DUMMY_UINT_FUNC @@ -324,6 +387,12 @@ __decl uint AmdExtLoadDwordAtAddrUncached(uint64_t addr, uint offset) DUMMY_UIN __decl void AmdExtStoreDwordAtAddrUncached(uint64_t addr, uint offset, uint value) DUMMY_VOID_FUNC __decl uint3 AmdExtGroupIdCompute() DUMMY_UINT3_FUNC __decl uint3 AmdExtGroupDimCompute() DUMMY_UINT3_FUNC +__decl uint3 AmdExtThreadIdInGroupCompute() DUMMY_UINT3_FUNC +__decl uint AmdExtFlattenedThreadIdInGroupCompute() DUMMY_UINT_FUNC +__decl uint AmdExtLoadDwordAtAddr(uint64_t addr, uint offset) DUMMY_UINT_FUNC +__decl void AmdExtStoreDwordAtAddr(uint64_t addr, uint offset, uint value) DUMMY_VOID_FUNC +__decl void AmdExtDeviceMemoryAcquire() DUMMY_VOID_FUNC +__decl void AmdExtDeviceMemoryRelease() DUMMY_VOID_FUNC __decl uint AmdExtLaneCount() DUMMY_UINT_FUNC __decl void AmdExtSleep(uint value) DUMMY_VOID_FUNC
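A note on the wave-cluster wrappers above: `dxClusterSize` follows the DX intrinsic encoding, so the effective lane-cluster width is `1u << (dxClusterSize - 1)`, not `dxClusterSize` itself. A minimal usage sketch (illustrative only; the constant 3 is an arbitrary example value):

    // dxClusterSize = 3 selects clusters of (1u << (3 - 1)) = 4 lanes.
    const float laneValue  = (float)WaveGetLaneIndex();
    const float clusterMin = AmdExtD3DShaderIntrinsics_WaveClusterMin(laneValue, 3);
    // Lanes 0..3 all observe 0.0, lanes 4..7 observe 4.0, and so on.
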
diff --git a/src/shaders/GenerateMortonCodes.hlsl b/src/shaders/GenerateMortonCodes.hlsl index 6cd8bbd..79df409 100644 --- a/src/shaders/GenerateMortonCodes.hlsl +++ b/src/shaders/GenerateMortonCodes.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #include "BuildRootSignature.hlsl" #endif diff --git a/src/shaders/GpuRtLibraryCont.hlsl b/src/shaders/GpuRtLibraryCont.hlsl index de0dc26..293fae0 100644 --- a/src/shaders/GpuRtLibraryCont.hlsl +++ b/src/shaders/GpuRtLibraryCont.hlsl @@ -81,18 +81,13 @@ #define TRAVERSAL_STATE_COMMITTED_TRIANGLE_HIT 5 #define TRAVERSAL_STATE_COMMITTED_PROCEDURAL_PRIMITIVE_HIT 6 -// This state implies Traversal was stopped to run AHS/IS for other lanes. This lane wants to resume Traversal. -#define TRAVERSAL_STATE_SUSPEND_TRAVERSAL 7 - // Shader priorities for continuation scheduling. Higher values mean higher scheduling precedence. -// Reserve priority 0 as invalid value. This way, 0-initialized priorities in metadata-annotated -// function pointers (e.g. from relocations) can be detected. // Note: For 32-bit packing of function pointers, we require the scheduling priority to fit into 3 bits. -#define SCHEDULING_PRIORITY_INVALID 0 -#define SCHEDULING_PRIORITY_RGS 1 -#define SCHEDULING_PRIORITY_CHS 2 -#define SCHEDULING_PRIORITY_MISS 2 -#define SCHEDULING_PRIORITY_TRAVERSAL 3 +#define SCHEDULING_PRIORITY_PWG_DEAD 0 +#define SCHEDULING_PRIORITY_TRAVERSAL 1 +#define SCHEDULING_PRIORITY_RGS 2 +#define SCHEDULING_PRIORITY_CHS 3 +#define SCHEDULING_PRIORITY_MISS 3 // Give IS higher prio than AHS so AHS called by ReportHit // have a chance to run together with AHS called by Traversal.
#define SCHEDULING_PRIORITY_AHS 4 @@ -144,7 +139,7 @@ static uint GetPriorityForShaderType( case DXILShaderKind::AnyHit: return SCHEDULING_PRIORITY_AHS; case DXILShaderKind::Intersection: return SCHEDULING_PRIORITY_IS; case DXILShaderKind::RayGeneration: return SCHEDULING_PRIORITY_RGS; - default: return SCHEDULING_PRIORITY_INVALID; + default: GPU_ASSERT(false); return 0; } } @@ -153,60 +148,128 @@ static uint3 GetDispatchRaysDimensions(); //===================================================================================================================== -static uint64_t GetVpcWithPriority(uint64_t vpc, uint priority) -{ - if (_AmdIsLlpc()) +struct Vpc64 { + uint64_t vpc; + +#if defined(__cplusplus) + Vpc64(uint64_t value) : vpc(value) {} +#endif + + uint64_t GetU64() { return vpc; } - const uint64_t prio64 = priority; - const uint firstMetadataBit = 32; - const uint firstPriorityBitInMetadata = 16; - GPU_ASSERT((vpc & 0xFFFF000000000000) == 0); - return vpc | (prio64 << (firstMetadataBit + firstPriorityBitInMetadata)); -} + uint GetFunctionAddr() + { + return (vpc & 0xFFFFFFFF); + } + + bool IsValid() + { + return GetFunctionAddr() != 0; + } + + Vpc64 SetPriority(uint priority) + { + if (_AmdIsLlpc()) + { + return Vpc64(vpc); + } + + const uint64_t prio64 = (uint64_t)(priority); + const uint firstMetadataBit = 32; + const uint firstPriorityBitInMetadata = 16; + GPU_ASSERT((vpc & 0xFFFF000000000000) == 0); + vpc |= (prio64 << (firstMetadataBit + firstPriorityBitInMetadata)); + return Vpc64(vpc); + } + + uint GetPriority() + { + uint inMetadata = (uint)(vpc >> 32); + return (uint)(inMetadata >> 16); + } + + static Vpc64 MakeWithPriority(Vpc64 vpc64, uint priority) + { + return vpc64.SetPriority(priority); + } +}; + +struct Vpc32 { + uint32_t vpc; + +#if defined(__cplusplus) + Vpc32(uint32_t value) : vpc(value) {} +#endif + + uint32_t GetU32() + { + return vpc; + } + + uint32_t GetFunctionAddr() + { + return (uint32_t)(vpc & 0xFFFFFFC0); + } + + bool IsValid() + { + return GetFunctionAddr() != 0; + } + + void SetPriority(uint priority) + { + vpc |= priority; + } + + uint GetPriority() + { + return (uint)(vpc & 0x7); + } +}; //===================================================================================================================== // 32-bit function pointer packing/unpacking // -static uint64_t Unpack32BitVpcTo64BitVpc(uint32_t vpc32, bool unpackPriority) +static Vpc64 Vpc32ToVpc64(Vpc32 vpc32, bool unpackPriority) { if (_AmdIsLlpc()) { - return vpc32; + return Vpc64(vpc32.GetU32()); } - uint64_t vpc = (vpc32 & 0xFFFFFFC0); + Vpc64 vpc64 = Vpc64((uint64_t)(vpc32.GetFunctionAddr())); if (unpackPriority) { - // The priority is stored in bits 0..2. 
- uint32_t priority = (vpc32 & 0x7); - vpc = GetVpcWithPriority(vpc, priority); + vpc64.SetPriority(vpc32.GetPriority()); } - return vpc; + return vpc64; } -static uint32_t Pack64BitVpcTo32Bits(uint64_t vpc) +static Vpc32 Vpc64ToVpc32(Vpc64 vpc64) { + Vpc32 vpc32 = Vpc32((uint32_t)(vpc64.GetFunctionAddr())); + if (_AmdIsLlpc()) { - return (vpc & 0xFFFFFFFF); + return vpc32; } + GPU_ASSERT((vpc32.GetU32() & 0x2F) == 0); + // Incoming metadata is in the high dword - uint32_t inMetadata = (uint32_t)(vpc >> 32); - uint32_t prio = (inMetadata >> 16); + uint prio = vpc64.GetPriority(); + // We only have three bits for the priority: GPU_ASSERT(prio <= 7); - // Outgoing metadata is in the low 6 bits - uint32_t outMetadata = prio; + vpc32.SetPriority(prio); - GPU_ASSERT((vpc & 0x2F) == 0); - return SplitUint64(vpc).x | outMetadata; + return vpc32; } //===================================================================================================================== @@ -596,14 +659,14 @@ struct _AmdTraversalState return committed.State(); } - void PackReturnAddress(uint64_t returnAddr) + void PackReturnAddress(Vpc64 returnAddr) { - packedReturnAddr = Pack64BitVpcTo32Bits(returnAddr); + packedReturnAddr = Vpc64ToVpc32(returnAddr).GetU32(); } - uint64_t ReturnAddress() + Vpc64 ReturnAddress() { - return Unpack32BitVpcTo64BitVpc(packedReturnAddr, true); + return Vpc32ToVpc64(Vpc32(packedReturnAddr), true); } }; @@ -679,9 +742,7 @@ struct _AmdSystemData bool IsChsOrMiss(in uint state) { - return (state >= TRAVERSAL_STATE_COMMITTED_NOTHING) && - ((Options::getCpsCandidatePrimitiveMode() != CpsCandidatePrimitiveMode::SuspendWave) || - (state < TRAVERSAL_STATE_SUSPEND_TRAVERSAL)); + return (state >= TRAVERSAL_STATE_COMMITTED_NOTHING); } bool IsMiss(in uint state) @@ -762,19 +823,17 @@ struct _AmdTraversalResultData DECLARE_ENQUEUE(, uint64_t returnAddr, _AmdSystemData data) DECLARE_ENQUEUE(Traversal, uint64_t dummyReturnAddr, _AmdSystemData data) +DECLARE_ENQUEUE(TraversalDead, uint64_t dummyReturnAddr, _AmdDispatchSystemData data) DECLARE_ENQUEUE(RayGen, uint64_t dummyReturnAddr, _AmdDispatchSystemData data) -DECLARE_WAIT_ENQUEUE(Traversal, uint64_t dummyReturnAddr, _AmdSystemData data) -DECLARE_WAIT_ENQUEUE(Traversal, uint64_t dummyReturnAddr, _AmdDispatchSystemData data) DECLARE_ENQUEUE(AnyHit, uint64_t returnAddr, _AmdAnyHitSystemData data, float2 candidateBarycentrics) DECLARE_ENQUEUE(Intersection, uint64_t returnAddr, _AmdAnyHitSystemData data) -DECLARE_WAIT_ENQUEUE(, uint64_t returnAddr, _AmdSystemData data) DECLARE_AWAIT(AnyHit, _AmdAnyHitSystemData, uint64_t returnAddr, _AmdAnyHitSystemData data) DECLARE_AWAIT(CallShader, _AmdDispatchSystemData, uint64_t returnAddr, _AmdDispatchSystemData data) // No returnAddr argument. The return address is instead included in the passed system data. -DECLARE_WAIT_AWAIT(Traversal, _AmdDispatchSystemData, _AmdSystemData data) +DECLARE_AWAIT(Traversal, _AmdDispatchSystemData, _AmdSystemData data) DECLARE_RESTORE_SYSTEM_DATA(, _AmdDispatchSystemData data) DECLARE_RESTORE_SYSTEM_DATA(AnyHit, _AmdAnyHitSystemData data) @@ -826,36 +885,37 @@ inline _AmdSystemData _AmdSystemData::MakeDeadLaneWithoutStack() //===================================================================================================================== // Return the argument. 
-static uint64_t GetVpcFromShaderId(uint32_t shaderId, uint priority) +static Vpc64 GetVpc64FromShaderId(Vpc32 shaderId, uint priority) { - uint64_t vpc = Unpack32BitVpcTo64BitVpc(shaderId, /* unpackPriority = */ false); - return GetVpcWithPriority(vpc, priority); + Vpc64 vpc64 = Vpc32ToVpc64(shaderId, /* unpackPriority = */ false); + vpc64.SetPriority(priority); + return vpc64; } //===================================================================================================================== -static uint64_t GetVpcFromShaderIdAddr(GpuVirtualAddress addr, uint priority) +static Vpc64 GetVpc64FromShaderIdAddr(GpuVirtualAddress addr, uint priority) { #ifdef __cplusplus return 1; #else - uint32_t shaderId = ConstantLoadDwordAtAddr(addr); - return GetVpcFromShaderId(shaderId, priority); + Vpc32 shaderId = Vpc32(ConstantLoadDwordAtAddr(addr)); + return GetVpc64FromShaderId(shaderId, priority); #endif } //===================================================================================================================== -static uint64_t GetVpcFromShaderIdTable( +static Vpc64 GetVpc64FromShaderIdTable( GpuVirtualAddress tableAddress, uint index, uint stride, uint priority) { - return GetVpcFromShaderIdAddr(tableAddress + stride * index, priority); + return GetVpc64FromShaderIdAddr(tableAddress + stride * index, priority); } //===================================================================================================================== // Returns the 32-bit part of the hit group shader id containing the AHS shader id. -static uint32_t GetAnyHit32BitShaderId( +static Vpc32 GetAnyHit32BitShaderId( uint hitGroupRecordIndex) { const uint offset = DispatchRaysConstBuf.hitGroupTableStrideInBytes * hitGroupRecordIndex; @@ -864,18 +924,18 @@ static uint32_t GetAnyHit32BitShaderId( PackUint64(DispatchRaysConstBuf.hitGroupTableBaseAddressLo, DispatchRaysConstBuf.hitGroupTableBaseAddressHi); if (tableVa == 0) { - return 0; + return Vpc32(0); } - return ConstantLoadDwordAtAddr(tableVa + offset + 8); + return Vpc32(ConstantLoadDwordAtAddr(tableVa + offset + 8)); } //===================================================================================================================== // Returns the 64-bit VPC for the given AHS by loading its shader address, and setting the AHS priority. 
-static uint64_t GetAnyHitAddr( +static Vpc64 GetAnyHitAddr( uint hitGroupRecordIndex) { - uint32_t shaderId = GetAnyHit32BitShaderId(hitGroupRecordIndex); - return GetVpcFromShaderId(shaderId, SCHEDULING_PRIORITY_AHS); + Vpc32 shaderId = GetAnyHit32BitShaderId(hitGroupRecordIndex); + return GetVpc64FromShaderId(shaderId, SCHEDULING_PRIORITY_AHS); } //===================================================================================================================== @@ -891,7 +951,7 @@ static bool AnyHitIsNonNull( geometryContributionToHitGroupIndex, instanceContributionToHitGroupIndex); - return GetAnyHit32BitShaderId(hitGroupRecordIndex) != 0; + return GetAnyHit32BitShaderId(hitGroupRecordIndex).IsValid(); } //===================================================================================================================== @@ -942,14 +1002,6 @@ static float4x3 WorldToObject4x3(in uint64_t tlasBaseAddr, in uint instNodePtr) return transpose(WorldToObject3x4(tlasBaseAddr, instNodePtr)); } -//===================================================================================================================== -__decl uint3 AmdExtThreadIdInGroupCompute() DUMMY_UINT3_FUNC -__decl uint AmdExtFlattenedThreadIdInGroupCompute() DUMMY_UINT_FUNC -__decl uint AmdExtLoadDwordAtAddr(uint64_t addr, uint offset) DUMMY_UINT_FUNC -__decl void AmdExtStoreDwordAtAddr(uint64_t addr, uint offset, uint value) DUMMY_VOID_FUNC -__decl void AmdExtDeviceMemoryAcquire() DUMMY_VOID_FUNC -__decl void AmdExtDeviceMemoryRelease() DUMMY_VOID_FUNC - //===================================================================================================================== // Implementation of DispatchRaysIndex. export uint3 _cont_DispatchRaysIndex3(in _AmdDispatchSystemData data) @@ -970,7 +1022,7 @@ static uint3 GetDispatchRaysDimensions() //===================================================================================================================== // Persistent dispatch size (1D). -static uint3 GetPersistentDispatchSize() +static uint GetPersistentDispatchSize() { // Groups needed to cover the dispatch if each thread only processes 1 ray const uint3 rayDispatch = GetDispatchRaysDimensions(); @@ -1069,6 +1121,7 @@ static uint3 GetDispatchId() dispatchId.z = groupId.y; if ((dims.x > 1) && (dims.y > 1)) { + // Use 8 x (threadGroupSize / 8) tiles. /* Sample: D3D12_DISPATCH_RAYS_DESC::(w x h x d) = (18, 6, 1). Divided into 8x4 tiles(boxes). A number in a box is the group id. @@ -1334,20 +1387,27 @@ export uint64_t _cont_GetContinuationStackGlobalMemBase() } //===================================================================================================================== -static uint64_t GetTraversalVpc() +static Vpc64 GetTraversalVpc64() { // NOTE: DXCP uses a table for TraceRay, thus a load to traceRayGpuVa retrieves the actual traversal function // address. But Vulkan does not use the table so far, traceRayGpuVa is already the traversal function address. 
- return PackUint64(DispatchRaysConstBuf.traceRayGpuVaLo, - DispatchRaysConstBuf.traceRayGpuVaHi); + return Vpc64(PackUint64(DispatchRaysConstBuf.traceRayGpuVaLo, + DispatchRaysConstBuf.traceRayGpuVaHi)); } //===================================================================================================================== -static uint64_t GetRayGenVpc() +static Vpc64 GetTraversalVpc64PwgDead() { - return GetVpcFromShaderIdAddr(PackUint64(DispatchRaysConstBuf.rayGenerationTableAddressLo, - DispatchRaysConstBuf.rayGenerationTableAddressHi), - SCHEDULING_PRIORITY_RGS); + return Vpc64(PackUint64(DispatchRaysConstBuf.traceRayGpuVaLo, + DispatchRaysConstBuf.traceRayGpuVaHi)); +} + +//===================================================================================================================== +static Vpc64 GetRayGenVpc64() +{ + return GetVpc64FromShaderIdAddr(PackUint64(DispatchRaysConstBuf.rayGenerationTableAddressLo, + DispatchRaysConstBuf.rayGenerationTableAddressHi), + SCHEDULING_PRIORITY_RGS); } //===================================================================================================================== @@ -1610,7 +1670,6 @@ static uint2 RayHistoryGetIdentifierFromVPC(uint64_t vpc) //===================================================================================================================== static uint2 RayHistoryGetIdentifierFromShaderId(uint2 shaderId) { - // Zero out the dVGPR bits and the higher dWord return uint2(shaderId.x & 0xFFFFFFC0, 0); } @@ -1828,15 +1887,14 @@ export bool _cont_ReportHit(inout_param(_AmdAnyHitSystemData) data, float THit, hitGroupRecordIndex = data.base.dispatch.shaderRecIdx; } // Compute hit group address and fetch shader identifiers - const uint64_t anyHitAddr = GetAnyHitAddr(hitGroupRecordIndex); + const Vpc64 anyHitAddr = GetAnyHitAddr(hitGroupRecordIndex); - if (SplitUint64(anyHitAddr).x != 0) + if (anyHitAddr.IsValid()) { // Call AnyHit // Hit attributes are added as an additional argument by the compiler - const uint64_t resumeAddr = _AmdGetResumePointAddr(); - const uint64_t resumeAddrWithPrio = GetVpcWithPriority(resumeAddr, SCHEDULING_PRIORITY_IS); - data = _AmdAwaitAnyHit(anyHitAddr, resumeAddrWithPrio, data); + Vpc64 resumeAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetResumePointAddr()), SCHEDULING_PRIORITY_IS); + data = _AmdAwaitAnyHit(anyHitAddr.GetU64(), resumeAddr.GetU64(), data); _AmdRestoreSystemDataAnyHit(data); return data.base.ray.AnyHitDidAccept(); } @@ -1874,12 +1932,12 @@ export void _cont_CallShader(inout_param(_AmdDispatchSystemData) data, uint inde return; } - const uint64_t addr = GetVpcFromShaderIdTable(callableTableBaseAddress, - index, - DispatchRaysConstBuf.callableTableStrideInBytes, - SCHEDULING_PRIORITY_CALLABLE); + const Vpc64 addr = GetVpc64FromShaderIdTable(callableTableBaseAddress, + index, + DispatchRaysConstBuf.callableTableStrideInBytes, + SCHEDULING_PRIORITY_CALLABLE); - if (SplitUint64(addr).x == 0) + if (!addr.IsValid()) { // See TODO above on how to handle this case better. 
return; @@ -1890,10 +1948,9 @@ export void _cont_CallShader(inout_param(_AmdDispatchSystemData) data, uint inde const DXILShaderKind enclosingShaderType = _AmdGetShaderKind(); const uint resumePrio = GetPriorityForShaderType(enclosingShaderType); - const uint64_t resumeAddr = _AmdGetResumePointAddr(); - const uint64_t resumeAddrWithPrio = GetVpcWithPriority(resumeAddr, resumePrio); + const Vpc64 resumeAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetResumePointAddr()), resumePrio); - data = _AmdAwaitCallShader(addr, resumeAddrWithPrio, data); + data = _AmdAwaitCallShader(addr.GetU64(), resumeAddr.GetU64(), data); // for the resume part. data.shaderRecIdx = callerShaderRecIdx; // restores callerShaderRecIdx @@ -1903,23 +1960,23 @@ export void _cont_CallShader(inout_param(_AmdDispatchSystemData) data, uint inde //===================================================================================================================== // Returns the low part of the miss shader address and sets up the dispatch data to have the correct shader record // index. -static uint64_t SetupMissShader(inout_param(_AmdSystemData) data, out_param(uint) shaderRecIdx) +static Vpc64 SetupMissShader(inout_param(_AmdSystemData) data, out_param(uint) shaderRecIdx) { const uint64_t missTableBaseAddress = PackUint64(DispatchRaysConstBuf.missTableBaseAddressLo, DispatchRaysConstBuf.missTableBaseAddressHi); if (missTableBaseAddress == 0) { shaderRecIdx = 0; - return 0; + return Vpc64(0); } shaderRecIdx = ExtractMissShaderIndex(data.ray.traceParameters); // Calculate miss shader record address - const uint64_t shaderAddr = GetVpcFromShaderIdTable(missTableBaseAddress, - shaderRecIdx, - DispatchRaysConstBuf.missTableStrideInBytes, - SCHEDULING_PRIORITY_MISS); + const Vpc64 shaderAddr = GetVpc64FromShaderIdTable(missTableBaseAddress, + shaderRecIdx, + DispatchRaysConstBuf.missTableStrideInBytes, + SCHEDULING_PRIORITY_MISS); return shaderAddr; } @@ -1949,6 +2006,11 @@ static HitGroupInfo GetHitGroupInfo( #include "Continuations2_0.hlsl" #if CONTINUATION_ON_GPU +static uint64_t GetDispatchIdAddr() +{ + return PackUint64(DispatchRaysConstBuf.cpsDispatchIdAddressLo, DispatchRaysConstBuf.cpsDispatchIdAddressHi); +} + //===================================================================================================================== static void LaunchRayGen(bool setupStack) { @@ -1968,28 +2030,16 @@ static void LaunchRayGen(bool setupStack) // This is written in a way that is intended to be correct even if threads don't reconverge after calling into // the ray generation shader. - uint localWorkId; const uint popCount = WaveActiveCountBits(true); + uint flatDispatchId = 0; if (WaveIsFirstLane()) { - localWorkId = AmdTraceRayPersistentLdsAtomicAdd(0, popCount); + flatDispatchId = AmdExtAtomicAddAtAddr(GetDispatchIdAddr(), 0, popCount); } - localWorkId = WaveReadLaneFirst(localWorkId) + WavePrefixCountBits(true); + flatDispatchId = WaveReadLaneFirst(flatDispatchId) + WavePrefixCountBits(true); const uint3 rayDims = GetDispatchRaysDimensions(); - const uint tgCount = GetPersistentDispatchSize(); - - // Single dimension dispatch so the flattened group ID is the same as the x component of the group ID - const uint tgId = AmdExtGroupIdCompute().x; - - // Interleave waves' worth of work among CUs so that every CU does approximately the same amount of work even - // for dispatches that are smaller than the maximum occupancy of the GPU. 
This is probably also a bit better - // for memory and shader execution locality, since CUs should tend to stay roughly within the same region of - // the dispatch. Assume numthreads(32, 1, 1). - const uint lowPart = localWorkId & 31; - const uint highPart = localWorkId & ~31; - const uint flatDispatchId = highPart * tgCount + tgId * 32 + lowPart; dispatchId = GetDispatchId(rayDims.x, rayDims.y, flatDispatchId); valid = flatDispatchId < (rayDims.x * rayDims.y * rayDims.z); @@ -2016,12 +2066,12 @@ static void LaunchRayGen(bool setupStack) #if DEVELOPER systemData.parentId = -1; #endif - _AmdEnqueueRayGen(GetRayGenVpc(), _AmdGetUninitializedI64(), systemData); + _AmdEnqueueRayGen(GetRayGenVpc64().GetU64(), _AmdGetUninitializedI64(), systemData); } else if (Options::getPersistentLaunchEnabled()) { _AmdDispatchSystemData systemData = _AmdDispatchSystemData::MakeDeadLaneWithStack(); - _AmdWaitEnqueueTraversal(GetTraversalVpc(), -1, _AmdGetUninitializedI64(), systemData); + _AmdEnqueueTraversalDead(GetTraversalVpc64PwgDead().GetU64(), _AmdGetUninitializedI64(), systemData); } } @@ -2029,16 +2079,6 @@ static void LaunchRayGen(bool setupStack) // KernelEntry is entry function of the RayTracing continuation mode export void _cont_KernelEntry() { - if (Options::getPersistentLaunchEnabled()) - { - if (AmdExtFlattenedThreadIdInGroupCompute() == 0) - { - AmdTraceRayPersistentLdsWrite(0, 0); - } - - GroupMemoryBarrierWithGroupSync(); - } - LaunchRayGen(true); } @@ -2136,17 +2176,16 @@ export void _cont_TraceRay( const uint callerShaderRecIdx = dispatch.shaderRecIdx; // 0 if from RayGen. const uint parentId = RayHistoryGetParentId(dispatch); - const uint64_t traversalAddrWithPrio = GetTraversalVpc(); + const Vpc64 traversalAddr = GetTraversalVpc64(); // The type of the shader containing this TraceRay call, i.e. the shader we are inlined into. const DXILShaderKind enclosingShaderType = _AmdGetShaderKind(); const uint resumePrio = GetPriorityForShaderType(enclosingShaderType); - // NO control flow is allowed between _AmdGetResumePointAddr() and _AmdWaitAwaitTraversal(). - const uint64_t resumeAddr = _AmdGetResumePointAddr(); - const uint64_t resumeAddrWithPrio = GetVpcWithPriority(resumeAddr, resumePrio); - data.traversal.PackReturnAddress(resumeAddrWithPrio); - dispatch = _AmdWaitAwaitTraversal(traversalAddrWithPrio, -1, data); + // NO control flow is allowed between _AmdGetResumePointAddr() and _AmdAwaitTraversal(). + const Vpc64 resumeAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetResumePointAddr()), resumePrio); + data.traversal.PackReturnAddress(resumeAddr); + dispatch = _AmdAwaitTraversal(traversalAddr.GetU64(), data); // for the resume part. 
dispatch.shaderRecIdx = callerShaderRecIdx; // restores callerShaderRecIdx @@ -2164,14 +2203,14 @@ static bool GetNextHitMissPc( inout_param(_AmdSystemData) data, uint state, _AmdPrimitiveSystemState candidate, - out_param(uint64_t) nextShaderAddr) + out_param(Vpc64) nextShaderAddr) { // MS if (data.IsMiss(state)) { uint shaderRecIdx; - const uint64_t missShaderAddr = SetupMissShader(data, shaderRecIdx); - if (SplitUint64(missShaderAddr).x != 0) + const Vpc64 missShaderAddr = SetupMissShader(data, shaderRecIdx); + if (missShaderAddr.IsValid()) { // Valid MS data.dispatch.shaderRecIdx = shaderRecIdx; @@ -2194,7 +2233,7 @@ static bool GetNextHitMissPc( if (hitInfo.closestHitId.x != 0) { // Valid CHS - nextShaderAddr = GetVpcFromShaderId(hitInfo.closestHitId.x, SCHEDULING_PRIORITY_CHS); + nextShaderAddr = GetVpc64FromShaderId(Vpc32(hitInfo.closestHitId.x), SCHEDULING_PRIORITY_CHS); return true; } } @@ -2225,7 +2264,7 @@ static void TraversalInternal( } } -static void EnqueueNextShader(bool hasWorkToDo, uint64_t nextShaderAddr, uint64_t returnAddr, _AmdSystemData data) +static void EnqueueNextShader(bool hasWorkToDo, Vpc64 nextShaderAddr, Vpc64 returnAddr, _AmdSystemData data) { if (!hasWorkToDo) { @@ -2233,7 +2272,7 @@ static void EnqueueNextShader(bool hasWorkToDo, uint64_t nextShaderAddr, uint64_ { // No work to do = dead lane, jump to traversal as a synchronization point with an empty system data _AmdSystemData sysData = _AmdSystemData::MakeDeadLaneWithoutStack(); - _AmdWaitEnqueueTraversal(GetTraversalVpc(), -1, _AmdGetUninitializedI64(), sysData); + _AmdEnqueueTraversal(GetTraversalVpc64().GetU64(), _AmdGetUninitializedI64(), sysData); } else { @@ -2244,21 +2283,21 @@ static void EnqueueNextShader(bool hasWorkToDo, uint64_t nextShaderAddr, uint64_ const uint newState = data.traversal.committed.State(); RayHistoryWriteEnd(data, newState); - if (nextShaderAddr != returnAddr) + if (nextShaderAddr.GetU64() != returnAddr.GetU64()) { const DXILShaderKind shaderKind = (DXILShaderKind)(data.IsMiss(newState) ? (int)DXILShaderKind::Miss : // convert to int to fix linux build error (int)DXILShaderKind::ClosestHit); RayHistoryWriteFunctionCall(data, - RayHistoryGetIdentifierFromVPC(nextShaderAddr), + RayHistoryGetIdentifierFromVPC(nextShaderAddr.GetU64()), data.dispatch.shaderRecIdx, shaderKind); - _AmdEnqueue(nextShaderAddr, returnAddr, data); + _AmdEnqueue(nextShaderAddr.GetU64(), returnAddr.GetU64(), data); } // Return to RayGen. No need to set a priority, as it is already set in the stored return address. - _AmdEnqueueRayGen(returnAddr, _AmdGetUninitializedI64(), data.dispatch); + _AmdEnqueueRayGen(returnAddr.GetU64(), _AmdGetUninitializedI64(), data.dispatch); } //===================================================================================================================== @@ -2350,21 +2389,24 @@ export void _cont_Traversal( _AmdTraversalResultData result = (_AmdTraversalResultData)0; bool IsChsOrMiss = data.IsChsOrMiss(state); - if ((_AmdContinuationStackIsGlobal() && WaveActiveAllTrue(IsChsOrMiss)) || - (!_AmdContinuationStackIsGlobal() && IsChsOrMiss)) + // Re-enqueue Traversal until all lanes are done with BVH Traversal. + // Only then enqueue CHS/Miss to ensure other lanes that are not yet done with Traversal + // converge on these CHS/Miss invocations. + // This is necessary because Traversal has lower scheduling priority. 
+ if (WaveActiveAllTrue(IsChsOrMiss)) { EnterSchedulerSection(); - uint64_t nextShaderAddr = 0; + Vpc64 nextShaderAddr = Vpc64(0); GetNextHitMissPc(data, state, candidate, nextShaderAddr); bool hasWorkToDo = true; - if (_AmdContinuationStackIsGlobal() && (nextShaderAddr != 0)) + if (_AmdContinuationStackIsGlobal() && nextShaderAddr.IsValid()) { } - const uint64_t returnAddr = data.traversal.ReturnAddress(); - if (nextShaderAddr == 0) + const Vpc64 returnAddr = data.traversal.ReturnAddress(); + if (!nextShaderAddr.IsValid()) { nextShaderAddr = returnAddr; } @@ -2372,10 +2414,7 @@ export void _cont_Traversal( } else { - bool mayEnqueueTraversal = (_AmdContinuationStackIsGlobal() || - (Options::getCpsCandidatePrimitiveMode() == CpsCandidatePrimitiveMode::SuspendWave)); - // If we cannot re-enqueue Traversal, then we already know that we are in AHS or IS state. - if (!mayEnqueueTraversal || data.IsAhs(state) || data.IsIs(state)) + if (data.IsAhs(state) || data.IsIs(state)) { HitGroupInfo hitInfo = (HitGroupInfo)0; { @@ -2395,10 +2434,9 @@ export void _cont_Traversal( hitInfo.tableIndex, DXILShaderKind::AnyHit); - const uint64_t addr = GetVpcFromShaderId(hitInfo.anyHitId.x, SCHEDULING_PRIORITY_AHS); - const uint64_t returnAddr = _AmdGetCurrentFuncAddr(); - const uint64_t returnAddrWithPrio = GetVpcWithPriority(returnAddr, SCHEDULING_PRIORITY_TRAVERSAL); - _AmdEnqueueAnyHit(addr, returnAddrWithPrio, anyHitData, candidateBarycentrics); + const Vpc64 addr = GetVpc64FromShaderId(Vpc32(hitInfo.anyHitId.x), SCHEDULING_PRIORITY_AHS); + const Vpc64 returnAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetCurrentFuncAddr()), SCHEDULING_PRIORITY_TRAVERSAL); + _AmdEnqueueAnyHit(addr.GetU64(), returnAddr.GetU64(), anyHitData, candidateBarycentrics); } else { @@ -2410,10 +2448,9 @@ export void _cont_Traversal( hitInfo.tableIndex, DXILShaderKind::Intersection); - const uint64_t addr = GetVpcFromShaderId(hitInfo.intersectionId.x, SCHEDULING_PRIORITY_IS); - const uint64_t returnAddr = _AmdGetCurrentFuncAddr(); - const uint64_t returnAddrWithPrio = GetVpcWithPriority(returnAddr, SCHEDULING_PRIORITY_TRAVERSAL); - _AmdEnqueueIntersection(addr, returnAddrWithPrio, anyHitData); + const Vpc64 addr = GetVpc64FromShaderId(Vpc32(hitInfo.intersectionId.x), SCHEDULING_PRIORITY_IS); + const Vpc64 returnAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetCurrentFuncAddr()), SCHEDULING_PRIORITY_TRAVERSAL); + _AmdEnqueueIntersection(addr.GetU64(), returnAddr.GetU64(), anyHitData); } } else @@ -2421,9 +2458,8 @@ export void _cont_Traversal( // // Everything else needs to go back through scheduling/traversal, regardless of state // Note we don't need "Wait" here because priorities run AHS and IS first - const uint64_t traversalAddr = _AmdGetCurrentFuncAddr(); - const uint64_t traversalAddrWithPrio = GetVpcWithPriority(traversalAddr, SCHEDULING_PRIORITY_TRAVERSAL); - _AmdEnqueueTraversal(traversalAddrWithPrio, _AmdGetUninitializedI64(), data); + const Vpc64 traversalAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetCurrentFuncAddr()), SCHEDULING_PRIORITY_TRAVERSAL); + _AmdEnqueueTraversal(traversalAddr.GetU64(), _AmdGetUninitializedI64(), data); } } // This is unreachable diff --git a/src/shaders/IndirectArgBufferUtils.hlsl b/src/shaders/IndirectArgBufferUtils.hlsl index 73a627d..952c6a5 100644 --- a/src/shaders/IndirectArgBufferUtils.hlsl +++ b/src/shaders/IndirectArgBufferUtils.hlsl @@ -31,7 +31,7 @@ #ifndef _INDIRECTARGBUFFER_HLSL #define _INDIRECTARGBUFFER_HLSL -#include "../shared/rayTracingDefs.h" +#include 
"../shadersClean/common/ShaderDefs.hlsli" #include "BuildSettings.hlsli" //====================================================================================================================== diff --git a/src/shaders/MergeSort.hlsl b/src/shaders/MergeSort.hlsl index bd1921a..50d6882 100644 --- a/src/shaders/MergeSort.hlsl +++ b/src/shaders/MergeSort.hlsl @@ -26,7 +26,7 @@ #define BUILD_THREADGROUP_SIZE 512 //===================================================================================================================== -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #define GC_DSTMETADATA #define GC_SCRATCHBUFFER diff --git a/src/shaders/PairCompression.hlsl b/src/shaders/PairCompression.hlsl index 91aac60..da86963 100644 --- a/src/shaders/PairCompression.hlsl +++ b/src/shaders/PairCompression.hlsl @@ -25,7 +25,7 @@ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #define GC_DSTBUFFER #define GC_DSTMETADATA diff --git a/src/shaders/RadixSort/BitHistogram.hlsl b/src/shaders/RadixSort/BitHistogram.hlsl index f2b3fb2..5b5d4a3 100644 --- a/src/shaders/RadixSort/BitHistogram.hlsl +++ b/src/shaders/RadixSort/BitHistogram.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../../shared/rayTracingDefs.h" +#include "../../shadersClean/common/ShaderDefs.hlsli" #include "../BuildRootSignature.hlsl" diff --git a/src/shaders/RadixSort/DistributePartSumInt4.hlsl b/src/shaders/RadixSort/DistributePartSumInt4.hlsl index 1c86e7c..8d1aaf2 100644 --- a/src/shaders/RadixSort/DistributePartSumInt4.hlsl +++ b/src/shaders/RadixSort/DistributePartSumInt4.hlsl @@ -25,7 +25,7 @@ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../../shared/rayTracingDefs.h" +#include "../../shadersClean/common/ShaderDefs.hlsli" #include "../BuildRootSignature.hlsl" diff --git a/src/shaders/RadixSort/ScanCommon.hlsli b/src/shaders/RadixSort/ScanCommon.hlsli index d43217f..edd8f35 100644 --- a/src/shaders/RadixSort/ScanCommon.hlsli +++ b/src/shaders/RadixSort/ScanCommon.hlsli @@ -22,7 +22,7 @@ * SOFTWARE. 
* **********************************************************************************************************************/ -#include "../../shared/rayTracingDefs.h" +#include "../../shadersClean/common/ShaderDefs.hlsli" #define NUMBER_OF_BLOCKS_PER_GROUP 1 #define NUM_BINS 16 diff --git a/src/shaders/RadixSort/ScanExclusiveInt4.hlsl b/src/shaders/RadixSort/ScanExclusiveInt4.hlsl index 4fd23ed..6c9ff45 100644 --- a/src/shaders/RadixSort/ScanExclusiveInt4.hlsl +++ b/src/shaders/RadixSort/ScanExclusiveInt4.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../../shared/rayTracingDefs.h" +#include "../../shadersClean/common/ShaderDefs.hlsli" #include "../BuildRootSignature.hlsl" diff --git a/src/shaders/RadixSort/ScanExclusiveInt4DLB.hlsl b/src/shaders/RadixSort/ScanExclusiveInt4DLB.hlsl index 91ff455..fe50439 100644 --- a/src/shaders/RadixSort/ScanExclusiveInt4DLB.hlsl +++ b/src/shaders/RadixSort/ScanExclusiveInt4DLB.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../../shared/rayTracingDefs.h" +#include "../../shadersClean/common/ShaderDefs.hlsli" #define GC_SCRATCHBUFFER #include "../BuildRootSignature.hlsl" diff --git a/src/shaders/RadixSort/ScanExclusivePartInt4.hlsl b/src/shaders/RadixSort/ScanExclusivePartInt4.hlsl index 34e53bf..40f8620 100644 --- a/src/shaders/RadixSort/ScanExclusivePartInt4.hlsl +++ b/src/shaders/RadixSort/ScanExclusivePartInt4.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../../shared/rayTracingDefs.h" +#include "../../shadersClean/common/ShaderDefs.hlsli" #include "../BuildRootSignature.hlsl" diff --git a/src/shaders/RadixSort/ScatterKeysAndValues.hlsl b/src/shaders/RadixSort/ScatterKeysAndValues.hlsl index dede6b2..03bb570 100644 --- a/src/shaders/RadixSort/ScatterKeysAndValues.hlsl +++ b/src/shaders/RadixSort/ScatterKeysAndValues.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../../shared/rayTracingDefs.h" +#include "../../shadersClean/common/ShaderDefs.hlsli" #include "../BuildRootSignature.hlsl" diff --git a/src/shaders/Rebraid.hlsl b/src/shaders/Rebraid.hlsl index 4aee6d9..48d6edc 100644 --- a/src/shaders/Rebraid.hlsl +++ b/src/shaders/Rebraid.hlsl @@ -30,7 +30,7 @@ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #define GC_DSTBUFFER #define GC_DSTMETADATA diff --git a/src/shaders/RefitBounds.hlsl b/src/shaders/RefitBounds.hlsl index d86fc26..c500419 100644 --- 
a/src/shaders/RefitBounds.hlsl +++ b/src/shaders/RefitBounds.hlsl @@ -22,7 +22,7 @@ * SOFTWARE. * **********************************************************************************************************************/ -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #define GC_DSTBUFFER #define GC_SCRATCHBUFFER diff --git a/src/shaders/TraceRay.hlsl b/src/shaders/TraceRay.hlsl index 334e2dc..5b9f06c 100644 --- a/src/shaders/TraceRay.hlsl +++ b/src/shaders/TraceRay.hlsl @@ -265,11 +265,17 @@ static bool TraceRayCommon( { if ((rayFlags & RAY_FLAG_SKIP_CLOSEST_HIT_SHADER) == 0) { - const uint instanceContribution = (result.instanceContribution & 0x00ffffff); - const HitGroupInfo hitInfo = GetHitGroupInfo(rayContributionToHitGroupIndex, - multiplierForGeometryContributionToShaderIndex, - result.geometryIndex, - instanceContribution); + uint instanceContribution = 0; + HitGroupInfo hitInfo = (HitGroupInfo)0; + + { + instanceContribution = (result.instanceContribution & 0x00ffffff); + hitInfo = GetHitGroupInfo(rayContributionToHitGroupIndex, + multiplierForGeometryContributionToShaderIndex, + result.geometryIndex, + instanceContribution); + } + uint64_t instNodePtr64 = 0; { instNodePtr64 = CalculateInstanceNodePtr64(rtIpLevel, accelStruct, result.instNodePtr); diff --git a/src/shaders/Update.hlsl b/src/shaders/Update.hlsl index 3818053..3c091ad 100644 --- a/src/shaders/Update.hlsl +++ b/src/shaders/Update.hlsl @@ -47,7 +47,7 @@ struct RootConstants uint numThreads; }; -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" [[vk::push_constant]] ConstantBuffer ShaderRootConstants : register(b0); [[vk::binding(1, 1)]] ConstantBuffer ShaderConstants : register(b1); diff --git a/src/shaders/UpdateParallel.hlsl b/src/shaders/UpdateParallel.hlsl index 56c782a..7af9953 100644 --- a/src/shaders/UpdateParallel.hlsl +++ b/src/shaders/UpdateParallel.hlsl @@ -44,7 +44,7 @@ struct RootConstants uint numThreads; }; -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" [[vk::push_constant]] ConstantBuffer ShaderRootConstants : register(b0); [[vk::binding(1, 1)]] ConstantBuffer ShaderConstants : register(b1); diff --git a/src/shaders/UpdateQBVH.hlsl b/src/shaders/UpdateQBVH.hlsl index ae818c8..d0d253c 100644 --- a/src/shaders/UpdateQBVH.hlsl +++ b/src/shaders/UpdateQBVH.hlsl @@ -44,7 +44,7 @@ struct RootConstants uint numThreads; }; -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" [[vk::push_constant]] ConstantBuffer ShaderRootConstants : register(b0); [[vk::binding(1, 1)]] ConstantBuffer ShaderConstants : register(b1); diff --git a/src/shaders/CopyAS.hlsl b/src/shadersClean/build/CopyAS.hlsl similarity index 98% rename from src/shaders/CopyAS.hlsl rename to src/shadersClean/build/CopyAS.hlsl index 2ca420e..dd1354c 100644 --- a/src/shaders/CopyAS.hlsl +++ b/src/shadersClean/build/CopyAS.hlsl @@ -22,8 +22,8 @@ * SOFTWARE. * **********************************************************************************************************************/ -#include "../../gpurt/gpurtAccelStruct.h" -#include "../shared/rayTracingDefs.h" +#include "../../../gpurt/gpurtAccelStruct.h" +#include "../common/ShaderDefs.hlsli" // Note, CBV(b255) must be the last used binding in the root signature. 
#define RootSig "RootConstants(num32BitConstants=3, b0, visibility=SHADER_VISIBILITY_ALL), "\ diff --git a/src/shadersClean/common/InstanceDesc.hlsli b/src/shadersClean/common/InstanceDesc.hlsli index 09f910c..35cbe1c 100644 --- a/src/shadersClean/common/InstanceDesc.hlsli +++ b/src/shadersClean/common/InstanceDesc.hlsli @@ -25,7 +25,7 @@ #ifndef INSTANCE_DESC_HLSLI #define INSTANCE_DESC_HLSLI -#include "TempAssert.hlsli" +#include "../../shared/assert.h" //===================================================================================================================== // 64-byte aligned structure matching D3D12_RAYTRACING_INSTANCE_DESC diff --git a/src/shadersClean/common/NodePointers.hlsli b/src/shadersClean/common/NodePointers.hlsli index 46e6fa3..9e690e8 100644 --- a/src/shadersClean/common/NodePointers.hlsli +++ b/src/shadersClean/common/NodePointers.hlsli @@ -26,7 +26,7 @@ #ifndef NODE_POINTERS_HLSLI #define NODE_POINTERS_HLSLI -#include "../common/TempAssert.hlsli" +#include "../../shared/assert.h" //===================================================================================================================== // Node pointer size in bytes diff --git a/src/shadersClean/common/ShaderDefs.hlsli b/src/shadersClean/common/ShaderDefs.hlsli index 3ca709b..f552f78 100644 --- a/src/shadersClean/common/ShaderDefs.hlsli +++ b/src/shadersClean/common/ShaderDefs.hlsli @@ -37,7 +37,7 @@ #define DUMMY_FLOAT2_FUNC { return float2(0, 0); } #define DUMMY_FLOAT3_FUNC { return float3(0, 0, 0); } -#include "TempAssert.hlsli" +#include "../../shared/assert.h" // TODO: there are functions that use values from these files, but really // those functions should be in these files, and then the files that use the functions @@ -49,6 +49,8 @@ #include "gfx10/InstanceNode1_0.hlsli" #include "NodePointers.hlsli" +#include "../../shared/rayTracingDefs.h" + #define SAH_COST_TRIANGLE_INTERSECTION 1.5 #define SAH_COST_AABBB_INTERSECTION 1 @@ -473,14 +475,22 @@ enum RebraidType : uint //===================================================================================================================== struct TriangleData { +#ifdef __cplusplus + TriangleData(uint val) + { + memset(this, val, sizeof(TriangleData)); + } + + TriangleData() : TriangleData(0) + {} +#endif float3 v0; ///< Vertex 0 float3 v1; ///< Vertex 1 float3 v2; ///< Vertex 2 }; #ifndef LIBRARY_COMPILATION -// This does not include RayTracingDefs.h as the goal is -// to eventually have everything in this file alone + #endif #endif diff --git a/src/shadersClean/common/TempAssert.hlsli b/src/shadersClean/common/TempAssert.hlsli deleted file mode 100644 index 1407fe8..0000000 --- a/src/shadersClean/common/TempAssert.hlsli +++ /dev/null @@ -1,38 +0,0 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ - -// TODO: this is a temporary assert file to allow files with asserts to be "clean" -// while the assert file itself cannot be. We need this as we have to move files out of "shared" -// which use assert.h, but cannot then include assert.h as "clean" inclusion of shared files isn't set up yet, -// *because* there are too many files in shared, and they can't be moved out because -// they use assert.h and... (cyclical issue) - -#ifndef ASSERT_HLSLI -#define ASSERT_HLSLI -#ifndef GPURT_STATIC_ASSERT -// _Static_assert is not supported with -spirv: https://github.com/microsoft/DirectXShaderCompiler/issues/5750 -#define GPURT_STATIC_ASSERT(condition, message) -#endif -#endif diff --git a/src/shadersClean/common/gfx10/BoxNode1_0.hlsli b/src/shadersClean/common/gfx10/BoxNode1_0.hlsli index 6103e61..6623b47 100644 --- a/src/shadersClean/common/gfx10/BoxNode1_0.hlsli +++ b/src/shadersClean/common/gfx10/BoxNode1_0.hlsli @@ -25,7 +25,7 @@ #ifndef BOX_NODE_1_1_HLSLI #define BOX_NODE_1_1_HLSLI -#include "../TempAssert.hlsli" +#include "../../../shared/assert.h" //===================================================================================================================== // Hardware 32-bit box node format and offsets diff --git a/src/shadersClean/common/gfx10/InstanceNode1_0.hlsli b/src/shadersClean/common/gfx10/InstanceNode1_0.hlsli index ae0280d..e615089 100644 --- a/src/shadersClean/common/gfx10/InstanceNode1_0.hlsli +++ b/src/shadersClean/common/gfx10/InstanceNode1_0.hlsli @@ -27,7 +27,7 @@ #include "BoxNode1_0.hlsli" #include "../InstanceDesc.hlsli" -#include "../TempAssert.hlsli" +#include "../../../shared/assert.h" //===================================================================================================================== struct InstanceSidebandData1_1 diff --git a/src/shadersClean/common/gfx10/ProceduralNode1_0.hlsli b/src/shadersClean/common/gfx10/ProceduralNode1_0.hlsli index 4431ecd..b8e01ec 100644 --- a/src/shadersClean/common/gfx10/ProceduralNode1_0.hlsli +++ b/src/shadersClean/common/gfx10/ProceduralNode1_0.hlsli @@ -25,7 +25,7 @@ #ifndef PROCEDURAL_NODE_1_1_HLSLI #define PROCEDURAL_NODE_1_1_HLSLI -#include "../TempAssert.hlsli" +#include "../../../shared/assert.h" //===================================================================================================================== #define USER_NODE_PROCEDURAL_MIN_OFFSET 0 diff --git a/src/shadersClean/common/gfx10/TriangleNode1_0.hlsli b/src/shadersClean/common/gfx10/TriangleNode1_0.hlsli index 0d9d1eb..7e618be 100644 --- a/src/shadersClean/common/gfx10/TriangleNode1_0.hlsli +++ b/src/shadersClean/common/gfx10/TriangleNode1_0.hlsli @@ -25,7 +25,7 @@ #ifndef TRIANGLE_NODE_1_0_HLSLI #define TRIANGLE_NODE_1_0_HLSLI -#include "../TempAssert.hlsli" +#include "../../../shared/assert.h" //===================================================================================================================== // Hardware 
triangle node format and offsets
diff --git a/src/shadersClean/traversal/TraversalDefs.hlsli b/src/shadersClean/traversal/TraversalDefs.hlsli
index 8541f35..28f9999 100644
--- a/src/shadersClean/traversal/TraversalDefs.hlsli
+++ b/src/shadersClean/traversal/TraversalDefs.hlsli
@@ -25,7 +25,7 @@
 #ifndef TRAVERSAL_DEFS_HLSLI
 #define TRAVERSAL_DEFS_HLSLI
 
-#include "../common/TempAssert.hlsli"
+#include "../../shared/assert.h"
 
 #define ENCODE_FLAG_ARRAY_OF_POINTERS           0x00000001
 #define ENCODE_FLAG_UPDATE_IN_PLACE             0x00000002
@@ -93,6 +93,15 @@ struct RaySystemData
 // Ray description matching the D3D12 HLSL header
 struct RayDesc
 {
+#ifdef __cplusplus
+    RayDesc(uint val)
+    {
+        memset(this, val, sizeof(RayDesc));
+    }
+
+    RayDesc() : RayDesc(0)
+    {}
+#endif
     float3 Origin;
     float  TMin;
    float3 Direction;
diff --git a/src/shared/rayTracingDefs.h b/src/shared/rayTracingDefs.h
index 6dfec65..5778dfd 100644
--- a/src/shared/rayTracingDefs.h
+++ b/src/shared/rayTracingDefs.h
@@ -27,10 +27,6 @@
 #ifndef _RAYTRACING_DEF_H
 #define _RAYTRACING_DEF_H
 
-#ifndef __cplusplus
-#include "../shadersClean/common/ShaderDefs.hlsli"
-#endif
-
 #include "../../gpurt/gpurtAccelStruct.h"
 #include "../../gpurt/gpurtBuildSettings.h"
 #include "../../gpurt/gpurtDispatch.h"
diff --git a/tools/CompileRTShaders.py b/tools/CompileRTShaders.py
index a967fdf..95a90d3 100644
--- a/tools/CompileRTShaders.py
+++ b/tools/CompileRTShaders.py
@@ -136,7 +136,7 @@ def isBVH(self):
     ShaderConfig(path="EmitAS.hlsl", entryPoint="EmitCompactSize"),
     ShaderConfig(path="EmitAS.hlsl", entryPoint="EmitSerializeDesc"),
     ShaderConfig(path="EmitAS.hlsl", entryPoint="EmitToolVisDesc"),
-    ShaderConfig(path="CopyAS.hlsl", entryPoint="CopyAS"),
+    ShaderConfig(path="../shadersClean/build/CopyAS.hlsl", entryPoint="CopyAS"),
     ShaderConfig(path="CompactAS.hlsl", entryPoint="CompactAS"),
     ShaderConfig(path="DecodeAS.hlsl", entryPoint="DecodeAS"),
     ShaderConfig(path="SerializeAS.hlsl", entryPoint="SerializeAS"),
@@ -313,6 +313,37 @@ def validateCompilation(cmd: [str], path: pathlib.Path) -> bool:
 
     return True
 
+"""
+Validates the organization of files in the shared folder, enforcing a cpp/h (source/header) style structure.
+This helps keep the shader library untangled and easier to maintain.
+#define'ing LIBRARY_COMPILATION allows including files in any order without pulling in implementation dependencies.
+"""
+def validateShared(args) -> bool:
+    cmdBase = getValidationCmdArgs(args)
+    # use resolve() + as_posix() to avoid path mismatches when using drive mapping
+    srcPath = pathlib.Path(FixInputPath(args.basepath)).parent.resolve()
+
+    gpurtInterfacePath = (srcPath / "../gpurt").resolve()
+    sharedPath = srcPath / "shared"
+    generatedFilepath = pathlib.Path(args.g_FilePath)
+    implExt = "._unused_"
+    headerExt = ".h"
+
+    # Shared files need to be able to include the gpurt interface files due to the requirements of the interface;
+    # we treat this as an exception to the rules about which files may be included.
+
+    for path, (hasImpl, hasHeader) in getImplInterfacePairs(sharedPath, implExt, headerExt).items():
+        assert (hasHeader and not hasImpl), "Shared files should be header only."
+        fullPath = path.with_suffix(path.suffix + (implExt if hasImpl else headerExt))
+        for defines in getDefineCombos(fullPath):
+            compileCmd = cmdBase + defines + [fullPath.as_posix()]
+            if not validateIncludes(compileCmd, path, implExt, headerExt, [(sharedPath, headerExt), (gpurtInterfacePath, ".h"), (generatedFilepath, ".h")]):
+                return False
+            if not validateCompilation(compileCmd, fullPath):
+                return False
+
+    return True
+
 """
 Validates the organization of shaders to enforce cpp/h a src/header sort of structure
 This helps keep the shader library untangled and easier to maintain.
@@ -322,6 +353,11 @@ def validateShadersClean(args) -> bool:
     cmdBase = getValidationCmdArgs(args)
     # use resolve() + as_posix() to avoid path mismatches when using drive mapping
     srcPath = pathlib.Path(FixInputPath(args.basepath)).parent.resolve()
+
+    gpurtInterfacePath = (srcPath / "../gpurt").resolve()
+    sharedPath = srcPath / "shared"
+    generatedFilepath = pathlib.Path(args.g_FilePath)
+    # Validation of the shadersClean folder
     shadersCleanPath = srcPath / "shadersClean"
 
     implExt = ".hlsl"
@@ -331,11 +367,10 @@ def validateShadersClean(args) -> bool:
         fullPath = path.with_suffix(path.suffix + (implExt if hasImpl else headerExt))
         for defines in getDefineCombos(fullPath):
             compileCmd = cmdBase + defines + [fullPath.as_posix()]
-            if not validateIncludes(compileCmd, path, implExt, headerExt, [(shadersCleanPath, headerExt)]):
+            if not validateIncludes(compileCmd, path, implExt, headerExt, [(shadersCleanPath, headerExt), (sharedPath, ".hlsli"), (gpurtInterfacePath, ".h"), (generatedFilepath, ".h")]):
                 return False
             if not validateCompilation(compileCmd, fullPath):
                 return False
-
     return True
 
 def isSpirvShader(shaderConfig, args):
@@ -751,6 +786,7 @@ def main() -> int:
     parser.add_argument('--verbose', action='store_true', help='Output verbose information', default=False)
     parser.add_argument('--defines', help='Defines for the shader compiler, separated by ; or ,.', default="")
    parser.add_argument('--includePaths', help='Include paths for the shader compiler, separated by ; or ,.', default="")
+    parser.add_argument('--g_FilePath', help='Path to the build destination where generated headers are written', default="")
     parser.add_argument('--compilerPath', help='Path to standalone compiler.', default='./dxc.exe')
     parser.add_argument('--dxcompilerLibPath', help='Path to dxcompiler.dll/libdxcompiler.so', default='./dxcompiler.dll')
     parser.add_argument('--spirvRemapPath', help='Path to spirv-remap executable', default='./spirv-remap.exe')
@@ -769,11 +805,14 @@ def main() -> int:
     tBegin = time.perf_counter()
 
     validIncludes = validateShadersClean(args)
+    validIncludes &= validateShared(args)
+
     # For vulkan, we validate SPIR-V shaders in the same run instead of running the script again.
     if args.vulkan and not args.spirv:
         print("Now doing SPIR-V validation...")
         args.spirv = True
         validIncludes &= validateShadersClean(args)
+        validIncludes &= validateShared(args)
 
     tDuration = time.perf_counter() - tBegin
 
     if validIncludes:
diff --git a/tools/DebugPreprocessShaders.py b/tools/DebugPreprocessShaders.py
index b51c1bd..4793b96 100644
--- a/tools/DebugPreprocessShaders.py
+++ b/tools/DebugPreprocessShaders.py
@@ -26,6 +26,7 @@
 import sys
 import os
 import re
+import argparse
 
 cpp_file_header = """
 /* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
 */
@@ -62,7 +63,7 @@ def process_file(src_path, dst_path):
     for line in src_file:
         # Find something that looks like a GPU_ASSERT macro invocation (GPU_ASSERT + '(' or space)
         if line.find('#define') == -1:
-            m = re.match('.*GPU_ASSERT\s*(\()', line)
+            m = re.match('.*GPU_ASSERT\\s*(\\()', line)
             if m is not None:
                 open_paren = m.start(1)
                 assert_id = add_assert(src_name, line_num, line)
@@ -71,7 +72,7 @@ def process_file(src_path, dst_path):
                 line = line.replace('GPU_ASSERT', 'GPU_ASSERT_IMPL', 1)
         else:
             # Find something that looks like a GPU_DPF macro invocation (GPU_DPF + '(' or space)
-            m = re.match('.*GPU_DPF\s*(\().*"(.*)"', line)
+            m = re.match('.*GPU_DPF\\s*(\\().*"(.*)"', line)
             if m is not None:
                 open_paren = m.start(1)
                 msg_id = add_print_msg(src_name, line_num, m.group(2))
@@ -91,13 +92,26 @@ def generate_cpp_file(output_file_path):
     output_str += cpp_file_footer
     output_file.write(output_str)
 
-def main():
-    # Process each file in the argument list
-    # The argments are pairs of input and ouput files then the path to the output file
-    for i in range(1, len(sys.argv) - 1, 2):
-        process_file(sys.argv[i], sys.argv[i+1])
-    generate_cpp_file(sys.argv[-1])
+def main(cpp_file, input_pair_list):
+    # Process each input/output file pair, then generate the C++ lookup table.
+    # input_pair_list holds alternating input and output paths.
+    for i in range(0, len(input_pair_list), 2):
+        process_file(input_pair_list[i], input_pair_list[i+1])
+    generate_cpp_file(cpp_file)
     return 0
 
 if __name__ == '__main__':
-    sys.exit(main())
+    parser = argparse.ArgumentParser(
+        prog='DebugPreprocessShaders',
+        description='Preprocesses shaders for GPU_ASSERT/GPU_DPF lines and generates a lookup table to match their text with their ID'
+    )
+    parser.add_argument('-i', '--input', help='File containing a list of input shader/output processed shader path pairs, semicolon delimited', required=True)
+    parser.add_argument('-o', '--output', help='Path to output cpp header', required=True)
+    args = parser.parse_args()
+
+    # Strip any newlines or whitespace from the beginning/end, and split by ';'
+    with open(args.input, 'r') as input_file:
+        input_pair_list = input_file.read().strip().split(';')
+
+    sys.exit(main(args.output, input_pair_list))
+
diff --git a/tools/DebugPreprocessShadersInput.txt.in b/tools/DebugPreprocessShadersInput.txt.in
new file mode 100644
index 0000000..3f15488
--- /dev/null
+++ b/tools/DebugPreprocessShadersInput.txt.in
@@ -0,0 +1,2 @@
+${preprocessArgs}
+
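
Note on the DebugPreprocessShaders handoff above: ${preprocessArgs} is a CMake list, which configure_file() renders into the generated input file as a single semicolon-delimited string of alternating input/output shader paths, exactly the format the script's --input option expects. A minimal Python sketch of the consuming side, under the assumption of that format; the helper name and file name below are illustrative only, not part of the patch:

    # read_pairs: hypothetical helper mirroring the parsing done by
    # DebugPreprocessShaders.py. It recovers (source, destination) path
    # pairs from a configure_file()-expanded CMake list.
    def read_pairs(input_file_path):
        # A CMake list serializes as "a;b;c", so strip() + split(';') recovers it.
        with open(input_file_path, 'r') as f:
            items = f.read().strip().split(';')
        # Alternating entries form (original shader, preprocessed shader) pairs.
        return list(zip(items[0::2], items[1::2]))

    if __name__ == '__main__':
        for src, dst in read_pairs('DebugPreprocessShadersInput.txt'):
            print(src, '->', dst)

Because a CMake list is already a semicolon-joined string, the template file needs no extra formatting logic beyond the single ${preprocessArgs} expansion.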
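
Note on the LaunchRayGen change earlier in this patch: the per-group LDS counter (AmdTraceRayPersistentLdsAtomicAdd) is replaced by a single global counter at the cpsDispatchIdAddress location. One lane per wave reserves a contiguous block of flat dispatch IDs with one atomic add, and every active lane takes base + its rank within the wave. A sketch of that allocation pattern, with the HLSL wave intrinsics modeled in plain Python (names are illustrative; 'counter' stands in for the GPU memory behind GetDispatchIdAddr()):

    # allocate_flat_ids: models WaveActiveCountBits / WaveReadLaneFirst /
    # WavePrefixCountBits over a list of per-lane 'active' flags.
    def allocate_flat_ids(counter, active):
        pop_count = sum(active)       # WaveActiveCountBits(true)
        base = counter[0]             # first active lane performs the atomic add...
        counter[0] += pop_count       # ...and broadcasts the old value to the wave
        ids, rank = [], 0
        for is_active in active:
            # base + WavePrefixCountBits(true): this lane's rank among active lanes
            ids.append(base + rank if is_active else None)
            rank += 1 if is_active else 0
        return ids

    counter = [0]
    print(allocate_flat_ids(counter, [True, True, False, True]))  # [0, 1, None, 2]
    print(allocate_flat_ids(counter, [True, False, True, True]))  # [3, None, 4, 5]

One atomic per wave rather than per thread keeps contention on the shared counter low, and consecutive lanes receive consecutive IDs, which preserves locality when the flat ID is unpacked into a 3D dispatch coordinate by GetDispatchId().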