From f2d96f116cbbe34b2784c589b63d851c8b31b2e0 Mon Sep 17 00:00:00 2001 From: qiaojbao Date: Wed, 30 Oct 2024 16:30:28 +0800 Subject: [PATCH] Update gpurt from commit b0e4674a Support cpsGlobal in DispatchRaysIndirect First attempt at shared file validation Move CopyAS to shadersClean Use global atomic to get dispatch id Fix line length limits for GPURT_ENABLE_GPU_DEBUG=ON Updated stages for CmdWriteImmediate gpurtDevice: Add public numprims calc function [Continuations] Introduce helper structs for packed and unpacked Vpc values Defines an RT IP enumeration inside GpuRT that is more stable (across driver versions) than the PAL equivalent [Continuations] Remove SuspendWave mode [Continuations] Revamp scheduling to not use wait masks Add float3 versions of AmdExtD3DShaderIntrinsics_WaveClusterMax / AmdExtD3DShaderIntrinsics_WaveClusterMin --- backends/pal/gpurtPalBackend.cpp | 6 +- cmake/GpuRtGenerateShaders.cmake | 8 +- gpurt/gpurt.h | 14 +- gpurt/gpurtDispatch.h | 36 +- src/gpurtBvhBuilder.cpp | 6 +- src/gpurtDevice.cpp | 45 ++- src/gpurtInternal.h | 56 ++- src/gpurtTraceSource.cpp | 4 +- src/options.yaml | 6 +- src/shaders/BuildBVH.hlsl | 2 +- src/shaders/BuildBVHTDTR.hlsl | 2 +- src/shaders/BuildFastAgglomerativeLbvh.hlsl | 2 +- src/shaders/BuildPLOC.hlsl | 2 +- src/shaders/BuildParallel.hlsl | 2 +- src/shaders/BuildQBVH.hlsl | 2 +- src/shaders/BuildRootSignature.hlsl | 2 +- src/shaders/BuildSettings.hlsli | 2 + src/shaders/CMakeLists.txt | 3 +- src/shaders/Common.hlsl | 2 +- src/shaders/Continuations2_0.hlsl | 35 +- src/shaders/EncodeCommon.hlsl | 2 +- src/shaders/EncodeNodes.hlsl | 2 +- src/shaders/EncodeTopLevel.hlsl | 2 +- src/shaders/EncodeTopLevelBuild.hlsl | 2 +- src/shaders/Extensions.hlsl | 113 ++++-- src/shaders/GenerateMortonCodes.hlsl | 2 +- src/shaders/GpuRtLibraryCont.hlsl | 358 ++++++++++-------- src/shaders/IndirectArgBufferUtils.hlsl | 2 +- src/shaders/MergeSort.hlsl | 2 +- src/shaders/PairCompression.hlsl | 2 +- 
src/shaders/RadixSort/BitHistogram.hlsl | 2 +- .../RadixSort/DistributePartSumInt4.hlsl | 2 +- src/shaders/RadixSort/ScanCommon.hlsli | 2 +- src/shaders/RadixSort/ScanExclusiveInt4.hlsl | 2 +- .../RadixSort/ScanExclusiveInt4DLB.hlsl | 2 +- .../RadixSort/ScanExclusivePartInt4.hlsl | 2 +- .../RadixSort/ScatterKeysAndValues.hlsl | 2 +- src/shaders/Rebraid.hlsl | 2 +- src/shaders/RefitBounds.hlsl | 2 +- src/shaders/TraceRay.hlsl | 16 +- src/shaders/Update.hlsl | 2 +- src/shaders/UpdateParallel.hlsl | 2 +- src/shaders/UpdateQBVH.hlsl | 2 +- .../build}/CopyAS.hlsl | 4 +- src/shadersClean/common/InstanceDesc.hlsli | 2 +- src/shadersClean/common/NodePointers.hlsli | 2 +- src/shadersClean/common/ShaderDefs.hlsli | 16 +- src/shadersClean/common/TempAssert.hlsli | 38 -- .../common/gfx10/BoxNode1_0.hlsli | 2 +- .../common/gfx10/InstanceNode1_0.hlsli | 2 +- .../common/gfx10/ProceduralNode1_0.hlsli | 2 +- .../common/gfx10/TriangleNode1_0.hlsli | 2 +- .../traversal/TraversalDefs.hlsli | 11 +- src/shared/rayTracingDefs.h | 4 - tools/CompileRTShaders.py | 45 ++- tools/DebugPreprocessShaders.py | 28 +- tools/DebugPreprocessShadersInput.txt.in | 2 + 57 files changed, 578 insertions(+), 346 deletions(-) rename src/{shaders => shadersClean/build}/CopyAS.hlsl (98%) delete mode 100644 src/shadersClean/common/TempAssert.hlsli create mode 100644 tools/DebugPreprocessShadersInput.txt.in diff --git a/backends/pal/gpurtPalBackend.cpp b/backends/pal/gpurtPalBackend.cpp index 08f4571..d06394c 100644 --- a/backends/pal/gpurtPalBackend.cpp +++ b/backends/pal/gpurtPalBackend.cpp @@ -152,11 +152,11 @@ void PalBackend::WriteImmediateSingle( ImmediateDataWidth width ) const { - // We want to use HwPipePreCs (ME) so that the writes do not occur before UAV barriers are done waiting. + // We want to use StagePostPrefetch (ME) so that the writes do not occur before UAV barriers are done waiting. 
// Both internal barriers during the build and application barriers synchronizing access to acceleration - // structure memory wait at HwPipePreCs. + // structure memory wait at StagePostPrefetch. GetCmdBuffer(cmdBuffer)->CmdWriteImmediate( - Pal::HwPipePoint::HwPipePreCs, + Pal::PipelineStageFlag::PipelineStagePostPrefetch, value, GpuRtToPalImmediateDataWidth(width), destVa); diff --git a/cmake/GpuRtGenerateShaders.cmake b/cmake/GpuRtGenerateShaders.cmake index 4654fa0..42a779b 100644 --- a/cmake/GpuRtGenerateShaders.cmake +++ b/cmake/GpuRtGenerateShaders.cmake @@ -76,6 +76,7 @@ if (GPURT_ENABLE_GPU_DEBUG) set(debugShaderDirectory "${CMAKE_CURRENT_BINARY_DIR}/debugShaders/src/shaders/") set(gpurtShaderSource ${GPURT_SHADER_SOURCE_FILES}) set(gpurtShadersSourceDir ${debugShaderDirectory}) + set(gpurtShadersPreprocessInputFile "${CMAKE_CURRENT_BINARY_DIR}/debugShaders/DebugPreprocessShadersInput.txt") list(TRANSFORM gpurtShaderSource PREPEND "${debugShaderDirectory}") set(preprocessArgs "") foreach(originalSourceFile ${GPURT_SHADER_SOURCE_FILES}) @@ -84,10 +85,13 @@ if (GPURT_ENABLE_GPU_DEBUG) list(APPEND preprocessArgs "${originalSourcePath}" "${newSourceFilePath}") endforeach() set(gpurtDebugPreprocessorScript "${gpurtToolsDir}/DebugPreprocessShaders.py") + configure_file("${gpurtToolsDir}/DebugPreprocessShadersInput.txt.in" + ${gpurtShadersPreprocessInputFile} + ) add_custom_command( OUTPUT ${gpurtShaderSource} ${gpurtDebugInfoFile} - DEPENDS ${originalShaderSource} ${gpurtDebugPreprocessorScript} - COMMAND Python3::Interpreter ${gpurtDebugPreprocessorScript} ${preprocessArgs} ${gpurtDebugInfoFile} + DEPENDS ${originalShaderSource} ${gpurtDebugPreprocessorScript} ${gpurtShadersPreprocessInputFile} + COMMAND Python3::Interpreter ${gpurtDebugPreprocessorScript} -i ${gpurtShadersPreprocessInputFile} -o ${gpurtDebugInfoFile} ) else() set(gpurtShaderSource "${originalShaderSource}") diff --git a/gpurt/gpurt.h b/gpurt/gpurt.h index 68d5ef5..5d9d8e2 100644 --- 
a/gpurt/gpurt.h +++ b/gpurt/gpurt.h @@ -1471,13 +1471,21 @@ class IDevice // @param pDispatchRaysConstants (in/out) Non-null pointer to a DispatchRaysConstants // @param cpsMemoryGpuAddr (in) GPU address pointing to the beginning of cps memory // @param cpsMemoryBytes (in) Cps allocated memory size in bytes - // - // @return the required global memory allocation size in bytes virtual void PatchDispatchRaysConstants( DispatchRaysConstants* pDispatchRaysConstants, const gpusize cpsMemoryGpuAddr, const gpusize cpsMemoryBytes) = 0; + // Populates the GPU addresses in the InitExecuteIndirectConstants structure + // + // @param pInitExecuteIndirectConstants (in/out) Non-null pointer to a InitExecuteIndirectConstants + // @param cpsMemoryGpuAddr (in) GPU address pointing to the beginning of cps memory + // @param cpsMemoryBytes (in) Cps allocated memory size in bytes + virtual void PatchInitExecuteIndirectConstants( + GpuRt::InitExecuteIndirectConstants* pInitExecuteIndirectConstants, + const gpusize cpsMemoryGpuAddr, + const gpusize cpsMemoryBytes) = 0; + // // @param cpsVideoMem [in] Cps video memory // @param cpsMemoryBytes [in] Cps allocated memory size in bytes @@ -1630,6 +1638,8 @@ class IDevice // Check if a build is a good candidate for ACE offload (typically barrier-free cases) virtual bool ShouldUseGangedAceForBuild(const AccelStructBuildInputs& inputs) const = 0; + virtual uint32 CalculateBvhPrimitiveCount(const AccelStructBuildInputs& inputs) const = 0; + protected: /// Client must create objects by explicitly calling CreateDevice method diff --git a/gpurt/gpurtDispatch.h b/gpurt/gpurtDispatch.h index 8f4ce03..fee7757 100644 --- a/gpurt/gpurtDispatch.h +++ b/gpurt/gpurtDispatch.h @@ -55,6 +55,8 @@ struct DispatchRaysTopLevelData uint32 accelStructTrackerSrd[MaxBufferSrdSize]; // Structured buffer SRD pointing to the accel struct tracker }; +#define DISPATCHRAYSCONSTANTDATA_STRUCT_OFFSET_DISPATCHID 48 + // Dispatch rays constant buffer data (GPU structure). 
Note, using unaligned uint64_t in HLSL constant buffers requires // -no-legacy-cbuf-layout for cpp style structure alignment to work. But currently that support is incomplete in DXC // and until that is resolved we need to use uint32's explicitly. @@ -74,7 +76,8 @@ struct DispatchRaysConstantData uint32 hitGroupTableBaseAddressLo; // Hit group table base address low 32-bits uint32 hitGroupTableBaseAddressHi; // Hit group table base address high 32-bits uint32 hitGroupTableStrideInBytes; // Hit group table record byte stride - uint32 reserved0; // Reserved padding + uint32 cpsDispatchId; // Continuations DispatchId, written in the persistent mode. + // This value should not be read via constant buffer. uint32 callableTableBaseAddressLo; // Callable shader table base address low 32-bits uint32 callableTableBaseAddressHi; // Callable shader table base address high 32-bits uint32 callableTableStrideInBytes; // Callable shader table byte stride @@ -96,6 +99,8 @@ struct DispatchRaysConstantData uint32 cpsGlobalMemoryAddressLo; // Separate CPS stack memory base address low 32-bits uint32 cpsGlobalMemoryAddressHi; // Separate CPS stack memory base address high 32-bits uint32 counterMask; // Mask for filtering ray history token + uint32 cpsDispatchIdAddressLo; // Continuations cpsDispatchId address low 32-bits + uint32 cpsDispatchIdAddressHi; // Continuations cpsDispatchId address high 32-bits }; #pragma pack(pop) @@ -109,6 +114,8 @@ struct DispatchRaysConstants #if __cplusplus static_assert((sizeof(DispatchRaysConstants) % sizeof(uint32)) == 0, "DispatchRaysConstants is not dword-aligned"); +static_assert(DISPATCHRAYSCONSTANTDATA_STRUCT_OFFSET_DISPATCHID == offsetof(DispatchRaysConstantData, cpsDispatchId), + "DISPATCHRAYSCONSTANTDATA_STRUCT_OFFSET_DISPATCHID mismatches to cpsDispatchId"); constexpr uint32 DispatchRaysConstantsDw = sizeof(DispatchRaysConstants) / sizeof(uint32); #endif @@ -132,6 +139,17 @@ struct InitExecuteIndirectUserData // Constants for 
InitExecuteIndirect shader struct InitExecuteIndirectConstants { +#if __cplusplus + // Internal counter buffer SRDs + uint32 internalUavSrd[MaxSupportedIndirectCounters][MaxBufferSrdSize]; + + // Internal acceleration structure tracker buffer SRD. + uint32 accelStructTrackerSrd[MaxBufferSrdSize]; +#else + uint4 internalUavSrd[MaxSupportedIndirectCounters][MaxBufferSrdSize / 4]; + uint4 accelStructTrackerSrd[MaxBufferSrdSize / 4]; +#endif + uint32 inputBytesPerDispatch; // Size of application indirect arguments uint32 outputBytesPerDispatch; // Size of resulting driver internal arguments uint32 bindingArgsSize; // Size of binding arguments in the app buffer preceeding the dispatch @@ -160,18 +178,10 @@ struct InitExecuteIndirectConstants uint32 counterRayIdRangeBegin; // Counter ray ID range begin uint32 counterRayIdRangeEnd; // Counter ray ID range end uint32 cpsBackendStackSize; // Scratch memory used by a compiler backend, start at offset 0 - uint32 padding0; // Padding for 16-byte alignment + uint32 cpsFrontendStackSize; // Scratch memory used by IR (Intermediate Representation), for a continuation passing shader -#if __cplusplus - // Internal counter buffer SRDs - uint32 internalUavSrd[MaxSupportedIndirectCounters][MaxBufferSrdSize]; - - // Internal acceleration structure tracker buffer SRD. 
- uint32 accelStructTrackerSrd[MaxBufferSrdSize]; -#else - uint4 internalUavSrd[MaxSupportedIndirectCounters][MaxBufferSrdSize / 4]; - uint4 accelStructTrackerSrd[MaxBufferSrdSize / 4]; -#endif + uint32 cpsGlobalMemoryAddressLo; // Separate CPS stack memory base address low 32-bits + uint32 cpsGlobalMemoryAddressHi; // Separate CPS stack memory base address high 32-bits }; constexpr uint32 InitExecuteIndirectConstantsDw = sizeof(InitExecuteIndirectConstants) / sizeof(uint32); @@ -184,7 +194,7 @@ static_assert((MaxBufferSrdSize == 4), "Buffer SRD size changed, affected shader #endif static_assert((sizeof(InitExecuteIndirectConstants) % sizeof(uint32)) == 0, "InitExecuteIndirectConstants is not dword-aligned"); -} +} // namespace GpuRt #endif #endif diff --git a/src/gpurtBvhBuilder.cpp b/src/gpurtBvhBuilder.cpp index 47e4043..59287b6 100644 --- a/src/gpurtBvhBuilder.cpp +++ b/src/gpurtBvhBuilder.cpp @@ -1629,7 +1629,7 @@ AccelStructHeader BvhBuilder::InitAccelStructHeader() const header.geometryType = static_cast(m_buildConfig.geometryType); header.uuidLo = Util::LowPart(m_deviceSettings.accelerationStructureUUID); header.uuidHi = Util::HighPart(m_deviceSettings.accelerationStructureUUID); - header.rtIpLevel = uint32(m_pDevice->GetRtIpLevel()); + header.rtIpLevel = static_cast(PalToGpuRtIpLevel(m_pDevice->GetRtIpLevel())); if (m_buildConfig.topLevelBuild) { @@ -2313,8 +2313,8 @@ void BvhBuilder::GetAccelerationStructurePrebuildInfo( // the build when performing the update causing page faults. scratchDataSize = Util::Max(scratchDataSize, updateDataSize); - // Some applications crash when the driver reports 0 scratch size. Use 1 instead. - scratchDataSize = Util::Max(1u, scratchDataSize); + // Some applications crash when the driver reports 0 scratch size. Use 1 DWORD instead. 
+ scratchDataSize = Util::Max(static_cast(sizeof(uint32)), scratchDataSize); prebuildInfo.scratchDataSizeInBytes = scratchDataSize; prebuildInfo.updateScratchDataSizeInBytes = updateDataSize; diff --git a/src/gpurtDevice.cpp b/src/gpurtDevice.cpp index 6058d33..f3b2f7a 100644 --- a/src/gpurtDevice.cpp +++ b/src/gpurtDevice.cpp @@ -467,6 +467,18 @@ Pal::Result Device::InitializeCpsMemory( return result; } +//===================================================================================================================== +// Populates the GPU addresses in the Constant structure +template +void Device::PatchConstants(ConstantsType* pConstant, + const gpusize cpsMemoryGpuAddr, + const gpusize cpsMemoryBytes) +{ + pConstant->cpsGlobalMemoryAddressLo = Util::LowPart(cpsMemoryGpuAddr); + pConstant->cpsGlobalMemoryAddressHi = Util::HighPart(cpsMemoryGpuAddr); + +} + //===================================================================================================================== // Populates the GPU addresses in the DispatchRaysConstants structure void Device::PatchDispatchRaysConstants( @@ -474,9 +486,17 @@ void Device::PatchDispatchRaysConstants( const gpusize cpsMemoryGpuAddr, const gpusize cpsMemoryBytes) { - pDispatchRaysConstants->constData.cpsGlobalMemoryAddressLo = Util::LowPart(cpsMemoryGpuAddr); - pDispatchRaysConstants->constData.cpsGlobalMemoryAddressHi = Util::HighPart(cpsMemoryGpuAddr); + PatchConstants(&pDispatchRaysConstants->constData, cpsMemoryGpuAddr, cpsMemoryBytes); +} +//===================================================================================================================== +// Populates the GPU addresses in the InitExecuteIndirectConstants structure +void Device::PatchInitExecuteIndirectConstants( + GpuRt::InitExecuteIndirectConstants* pInitExecuteIndirectConstants, + const gpusize cpsMemoryGpuAddr, + const gpusize cpsMemoryBytes) +{ + PatchConstants(pInitExecuteIndirectConstants, cpsMemoryGpuAddr, cpsMemoryBytes); } 
//===================================================================================================================== @@ -2125,6 +2145,27 @@ bool Device::ShouldUseGangedAceForBuild( return shouldUseGangedAce; } +// ===================================================================================================================== +uint32 Device::CalculateBvhPrimitiveCount( + const AccelStructBuildInputs& inputs + ) const +{ + // For top-level acceleration structure, inputElementCount represents the number of instances + uint32 primitiveCount = (inputs.type == AccelStructType::TopLevel) ? inputs.inputElemCount : 0; + + if (inputs.type == AccelStructType::BottomLevel) + { + for (uint32 i = 0; i < inputs.inputElemCount; ++i) + { + const Geometry geometry = m_clientCb.pfnConvertAccelStructBuildGeometry(inputs, i); + const uint32 geometryPrimCount = BvhBuilder::GetGeometryPrimCount(geometry); + primitiveCount += geometryPrimCount; + } + } + + return primitiveCount; +} + // ===================================================================================================================== const AccelStructBuildInputs Device::OverrideBuildInputs( const AccelStructBuildInputs& inputs diff --git a/src/gpurtInternal.h b/src/gpurtInternal.h index 7cf7f2c..f59b164 100644 --- a/src/gpurtInternal.h +++ b/src/gpurtInternal.h @@ -106,6 +106,42 @@ enum EncodeFlags : uint32 EncodeFlagFusedInstanceNode = 0x00000008, }; +// Values should remain stable for RRA binary-compatibility (PAL equivalents do not guarantee stability) +enum RtIpLevel : uint32 +{ + RtIpNone = 0x0, ///< The device does not have an RayTracing Ip Level + RtIp1_0 = 0x1, ///< First Implementation of HW RT + RtIp1_1 = 0x2, ///< Added computation of triangle barycentrics into HW + RtIp2_0 = 0x3, ///< Added more Hardware RayTracing features, such as BoxSort, PointerFlag, etc + RtIpReserved = 0x5, ///< Special value, should not be used +}; + +// 
===================================================================================================================== +// Convert PAL RtIpLevel values to their GpuRT equivalent +static RtIpLevel PalToGpuRtIpLevel(Pal::RayTracingIpLevel palRtIpLevel) +{ + RtIpLevel gpuRtIpLevel = RtIpLevel::RtIpNone; + + switch (palRtIpLevel) + { + case Pal::RayTracingIpLevel::RtIp1_0: + gpuRtIpLevel = RtIpLevel::RtIp1_0; + break; + case Pal::RayTracingIpLevel::RtIp1_1: + gpuRtIpLevel = RtIpLevel::RtIp1_1; + break; + case Pal::RayTracingIpLevel::RtIp2_0: + gpuRtIpLevel = RtIpLevel::RtIp2_0; + break; + case Pal::RayTracingIpLevel::None: + default: + gpuRtIpLevel = RtIpLevel::RtIpNone; + break; + } + + return gpuRtIpLevel; +} + struct RadixSortConfig { uint32 workGroupSize; @@ -336,13 +372,21 @@ class Device : public IDevice // @param pDispatchRaysConstants (in/out) Non-null pointer to a DispatchRaysConstants // @param cpsMemoryGpuAddr (in) GPU address pointing to the beginning of cps memory // @param cpsMemoryBytes (in) Cps allocated memory size in bytes - // - // @return the required global memory allocation size in bytes virtual void PatchDispatchRaysConstants( DispatchRaysConstants* pDispatchRaysConstants, const gpusize cpsMemoryGpuAddr, const gpusize cpsMemoryBytes) override; + // Populates the GPU addresses in the InitExecuteIndirectConstants structure + // + // @param pInitExecuteIndirectConstants (in/out) Non-null pointer to a InitExecuteIndirectConstants + // @param cpsMemoryGpuAddr (in) GPU address pointing to the beginning of cps memory + // @param cpsMemoryBytes (in) Cps allocated memory size in bytes + virtual void PatchInitExecuteIndirectConstants( + GpuRt::InitExecuteIndirectConstants* pInitExecuteIndirectConstants, + const gpusize cpsMemoryGpuAddr, + const gpusize cpsMemoryBytes) override; + // // @param cpsVideoMem [in] Cps video memory // @param cpsMemoryBytes [in] Cps allocated memory size in bytes @@ -683,6 +727,8 @@ class Device : public IDevice virtual bool 
ShouldUseGangedAceForBuild(const AccelStructBuildInputs& inputs) const override; + virtual uint32 CalculateBvhPrimitiveCount(const AccelStructBuildInputs& inputs) const override; + // Returns size in DWORDs of a typed buffer view SRD uint32 GetTypedBufferSrdSizeDw() const { return m_typedBufferSrdSizeDw; }; @@ -722,6 +768,12 @@ class Device : public IDevice virtual ~Device() override; + template + void PatchConstants( + ConstantsType* pConstant, + const gpusize cpsMemoryGpuAddr, + const gpusize cpsMemoryBytes); + DeviceInitInfo m_info; Util::GenericAllocatorTracked m_allocator; diff --git a/src/gpurtTraceSource.cpp b/src/gpurtTraceSource.cpp index 36c5b3e..209a92c 100644 --- a/src/gpurtTraceSource.cpp +++ b/src/gpurtTraceSource.cpp @@ -51,7 +51,7 @@ void AccelStructTraceSource::OnTraceBegin( if (m_pDevice->AccelStructTrackerGpuAddr() != 0) { // Before starting the trace set tracking to enabled. - pCmdBuf->CmdWriteImmediate(Pal::HwPipeBottom, 1, Pal::ImmediateDataWidth::ImmediateData32Bit, + pCmdBuf->CmdWriteImmediate(Pal::PipelineStageBottomOfPipe, 1, Pal::ImmediateDataWidth::ImmediateData32Bit, m_pDevice->AccelStructTrackerGpuAddr() + offsetof(AccelStructTracker, enabled)); m_pDevice->RaytracingBarrier(pCmdBuf, BarrierFlagSyncPostCpWrite); } @@ -67,7 +67,7 @@ void AccelStructTraceSource::OnTraceEnd( if (m_pDevice->AccelStructTrackerGpuAddr() != 0) { // Disable tracking. 
- pCmdBuf->CmdWriteImmediate(Pal::HwPipeBottom, 0, Pal::ImmediateDataWidth::ImmediateData32Bit, + pCmdBuf->CmdWriteImmediate(Pal::PipelineStageBottomOfPipe, 0, Pal::ImmediateDataWidth::ImmediateData32Bit, m_pDevice->AccelStructTrackerGpuAddr() + offsetof(AccelStructTracker, enabled)); m_pDevice->RaytracingBarrier(pCmdBuf, BarrierFlagSyncPostCpWrite); } diff --git a/src/options.yaml b/src/options.yaml index 4ea170e..859c740 100644 --- a/src/options.yaml +++ b/src/options.yaml @@ -32,12 +32,12 @@ enum CpsCandidatePrimitiveMode: # Controls how candidate primitives are handled in the continuations (CPS) software Traversal loop. SuspendLane: # Suspend a lane upon candidate hits and wait for other lanes to end the Traversal loop. # This is the default. Other modes are experimental and might not be implemented on all RtIps. - SuspendWave: # On each Traversal iteration, check whether any lane has a candidate, and break if so. - # Only implemented for RtIp 2.0, all other cases use SuspendLane. DeferFirst: # When finding the first candidate, record it and ignore it for the time being. At the end of the # Traversal loop, process pending candidates. When finding the second candidate, immediately break # out of the loop to first process the first one. - # Only implemented for triangle primitives on RtIp 2.0, all other cases use SuspendLane. + # Implementation status: + # * RtIp 1.1: Not supported, SuspendLane is always used. + # * RtIp 2.0: DeferFirst is supported, but only for triangle primitives. # ------------------------------------------------------------------------------------------------------------------ # This is the definition of the single options struct. 
diff --git a/src/shaders/BuildBVH.hlsl b/src/shaders/BuildBVH.hlsl index 202f0f9..8ec5c77 100644 --- a/src/shaders/BuildBVH.hlsl +++ b/src/shaders/BuildBVH.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #include "BuildRootSignature.hlsl" diff --git a/src/shaders/BuildBVHTDTR.hlsl b/src/shaders/BuildBVHTDTR.hlsl index 7d1d71f..2614a34 100644 --- a/src/shaders/BuildBVHTDTR.hlsl +++ b/src/shaders/BuildBVHTDTR.hlsl @@ -188,7 +188,7 @@ struct StateTDBuild #define USE_LDS 1 //===================================================================================================================== -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #define GC_DSTMETADATA #define GC_SCRATCHBUFFER diff --git a/src/shaders/BuildFastAgglomerativeLbvh.hlsl b/src/shaders/BuildFastAgglomerativeLbvh.hlsl index 526053c..56328b6 100644 --- a/src/shaders/BuildFastAgglomerativeLbvh.hlsl +++ b/src/shaders/BuildFastAgglomerativeLbvh.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #define GC_SCRATCHBUFFER #include "BuildRootSignature.hlsl" diff --git a/src/shaders/BuildPLOC.hlsl b/src/shaders/BuildPLOC.hlsl index 2c39642..079f1bc 100644 --- a/src/shaders/BuildPLOC.hlsl +++ b/src/shaders/BuildPLOC.hlsl @@ -88,7 +88,7 @@ struct BuildPlocArgs #include "Common.hlsl" 
//===================================================================================================================== -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #define GC_DSTMETADATA #define GC_SCRATCHBUFFER diff --git a/src/shaders/BuildParallel.hlsl b/src/shaders/BuildParallel.hlsl index eaf9090..8a3df86 100644 --- a/src/shaders/BuildParallel.hlsl +++ b/src/shaders/BuildParallel.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #define BUILD_PARALLEL 1 -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #define TASK_COUNTER_BUFFER ScratchGlobal #define TASK_COUNTER_OFFSET (ShaderConstants.offsets.taskLoopCounters + TASK_LOOP_BUILD_PARALLEL_COUNTER_OFFSET) diff --git a/src/shaders/BuildQBVH.hlsl b/src/shaders/BuildQBVH.hlsl index 512496a..60e527f 100644 --- a/src/shaders/BuildQBVH.hlsl +++ b/src/shaders/BuildQBVH.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #if NO_SHADER_ENTRYPOINT == 0 -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #define GC_DSTBUFFER #define GC_DSTMETADATA diff --git a/src/shaders/BuildRootSignature.hlsl b/src/shaders/BuildRootSignature.hlsl index 2df19b0..15c48e2 100644 --- a/src/shaders/BuildRootSignature.hlsl +++ b/src/shaders/BuildRootSignature.hlsl @@ -22,7 +22,7 @@ * SOFTWARE. 
* **********************************************************************************************************************/ -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" // DebugBuffer #if GPURT_ENABLE_GPU_DEBUG diff --git a/src/shaders/BuildSettings.hlsli b/src/shaders/BuildSettings.hlsli index 5929195..ac6e315 100644 --- a/src/shaders/BuildSettings.hlsli +++ b/src/shaders/BuildSettings.hlsli @@ -26,6 +26,8 @@ #ifndef _BUILDSETTINGS_HLSLI #define _BUILDSETTINGS_HLSLI +#include "../shadersClean/common/ShaderDefs.hlsli" + [[vk::constant_id(BUILD_SETTINGS_DATA_TOP_LEVEL_BUILD_ID)]] uint topLevelBuild = 0; [[vk::constant_id(BUILD_SETTINGS_DATA_BUILD_MODE_ID)]] uint buildMode = 0; [[vk::constant_id(BUILD_SETTINGS_DATA_TRIANGLE_COMPRESSION_MODE_ID)]] uint triangleCompressionMode = 0; diff --git a/src/shaders/CMakeLists.txt b/src/shaders/CMakeLists.txt index 54ef25e..ac75c24 100644 --- a/src/shaders/CMakeLists.txt +++ b/src/shaders/CMakeLists.txt @@ -59,7 +59,7 @@ set(gpurtHlsl CompactAS.hlsl CompactAS1_1.hlsl CompactCommon.hlsl - CopyAS.hlsl + ../shadersClean/build/CopyAS.hlsl ../shadersClean/build/CopyBufferRaw.hlsl DecodeAS.hlsl DecodeCommon.hlsl @@ -129,7 +129,6 @@ set(otherDeps ../shadersClean/common/InstanceDesc.hlsli ../shadersClean/common/NodePointers.hlsli ../shadersClean/common/ScratchNode.hlsli - ../shadersClean/common/TempAssert.hlsli ../shadersClean/traversal/TraversalDefs.hlsli ../shadersClean/common/gfx10/BoxNode1_0.hlsli ../shadersClean/common/gfx10/InstanceNode1_0.hlsli diff --git a/src/shaders/Common.hlsl b/src/shaders/Common.hlsl index 1b55ccf..92f72d9 100644 --- a/src/shaders/Common.hlsl +++ b/src/shaders/Common.hlsl @@ -34,7 +34,7 @@ #ifndef _COMMON_HLSL #define _COMMON_HLSL -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #include "../shadersClean/common/ScratchNode.hlsli" typedef AccelStructDataOffsets AccelStructOffsets; diff --git 
a/src/shaders/Continuations2_0.hlsl b/src/shaders/Continuations2_0.hlsl index 221143c..73293dc 100644 --- a/src/shaders/Continuations2_0.hlsl +++ b/src/shaders/Continuations2_0.hlsl @@ -509,39 +509,10 @@ static void TraversalInternal2_0( } bool laneHasCandidate = (state < TRAVERSAL_STATE_COMMITTED_NOTHING); - if (Options::getCpsCandidatePrimitiveMode() == CpsCandidatePrimitiveMode::SuspendWave) + if (laneHasCandidate) { - - // Stopping the Traversal loop for the whole wave on the first AHS/IS might be too aggressive. - // We implement this basic version here as basis for further experiments. - // Delaying it a bit could have potential benefits: - // * avoid overhead of wave-intrinsic in every iteration (depending on the implementation of delaying) - // * letting more lanes join the IS/AHS work - if (WaveActiveAnyTrue(laneHasCandidate)) - { - if (laneHasCandidate) - { - // Break out of traversal to run AHS/IS - } - else if (IsValidNode(nextNodePtr)) - { - // Break out of traversal so other lanes can run AHS/IS and re-join traversal - state = TRAVERSAL_STATE_SUSPEND_TRAVERSAL; - } - else - { - // The lane is done with Traversal, and wants to run CHS or Miss - } - break; - } - } - else - { - if (laneHasCandidate) - { - // Break out of traversal to run AHS/IS - break; - } + // Break out of traversal to run AHS/IS + break; } } diff --git a/src/shaders/EncodeCommon.hlsl b/src/shaders/EncodeCommon.hlsl index fb666c6..0aa00b0 100644 --- a/src/shaders/EncodeCommon.hlsl +++ b/src/shaders/EncodeCommon.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #include "BuildCommonScratch.hlsl" -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #include "TrianglePrimitive.hlsl" #include "UpdateCommon.hlsl" diff --git a/src/shaders/EncodeNodes.hlsl b/src/shaders/EncodeNodes.hlsl index 2075069..3ee98e6 100644 --- a/src/shaders/EncodeNodes.hlsl +++ 
b/src/shaders/EncodeNodes.hlsl @@ -22,7 +22,7 @@ * SOFTWARE. * **********************************************************************************************************************/ -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #define GC_DSTBUFFER #define GC_DSTMETADATA diff --git a/src/shaders/EncodeTopLevel.hlsl b/src/shaders/EncodeTopLevel.hlsl index 00419bc..90c1954 100644 --- a/src/shaders/EncodeTopLevel.hlsl +++ b/src/shaders/EncodeTopLevel.hlsl @@ -22,7 +22,7 @@ * SOFTWARE. * **********************************************************************************************************************/ -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #include "BuildRootSignature.hlsl" diff --git a/src/shaders/EncodeTopLevelBuild.hlsl b/src/shaders/EncodeTopLevelBuild.hlsl index 2424f4a..097c3ac 100644 --- a/src/shaders/EncodeTopLevelBuild.hlsl +++ b/src/shaders/EncodeTopLevelBuild.hlsl @@ -22,7 +22,7 @@ * SOFTWARE. 
* **********************************************************************************************************************/ -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #include "../shadersClean/common/ScratchNode.hlsli" //===================================================================================================================== diff --git a/src/shaders/Extensions.hlsl b/src/shaders/Extensions.hlsl index e78c92b..176dff2 100644 --- a/src/shaders/Extensions.hlsl +++ b/src/shaders/Extensions.hlsl @@ -29,6 +29,13 @@ #include "../shadersClean/common/Extensions.hlsli" #include "../shadersClean/common/Math.hlsli" +#define AmdExtD3DShaderIntrinsicsWaveOp_MinF 0x07 +#define AmdExtD3DShaderIntrinsicsWaveOp_MaxF 0x0a +#define AmdExtD3DShaderIntrinsicsWaveOp_Inclusive 0x01 + +#define AmdExtClusteredSubgroup 3 +#define AmdExtClusteredReduce 3 + // Dummy implementation for Vulkan build only __decl uint AmdExtD3DShaderIntrinsics_LoadDwordAtAddr( uint gpuVaLoBits, uint gpuVaHiBits, uint offset) DUMMY_UINT_FUNC @@ -57,10 +64,6 @@ __decl uint2 AmdExtD3DShaderIntrinsics_AtomicMinU64( __decl uint2 AmdExtD3DShaderIntrinsics_ShaderClock() DUMMY_UINT2_FUNC __decl uint2 AmdExtD3DShaderIntrinsics_ShaderRealtimeClock() DUMMY_UINT2_FUNC -#define AmdExtD3DShaderIntrinsicsWaveOp_MinF 0x07 -#define AmdExtD3DShaderIntrinsicsWaveOp_MaxF 0x0a -#define AmdExtD3DShaderIntrinsicsWaveOp_Inclusive 0x01 - __decl float3 AmdExtD3DShaderIntrinsics_WaveScan( uint waveOp, uint flags, float3 src) DUMMY_FLOAT3_FUNC @@ -112,56 +115,115 @@ __decl float3 AmdExtD3DShaderIntrinsics_FloatOpWithRoundMode( uint roundMode, uint operation, float3 src0, float3 src1) DUMMY_FLOAT3_FUNC //===================================================================================================================== -// Sub-group wave reductions +// Sub-group wave reductions spirv ops // Ref: https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#_instructions 
[[vk::ext_capability(/* GroupNonUniform */ 61)]] [[vk::ext_capability(/* GroupNonUniformArithmetic */ 63)]] [[vk::ext_capability(/* GroupNonUniformClustered */ 67)]] - [[vk::ext_instruction(350)]] float spirv_OpGroupNonUniformFAdd_clustered(uint scope, [[vk::ext_literal]] uint op, float value, uint clusterSize); +[[vk::ext_capability(/* GroupNonUniform */ 61)]] +[[vk::ext_capability(/* GroupNonUniformArithmetic */ 63)]] +[[vk::ext_capability(/* GroupNonUniformClustered */ 67)]] +[[vk::ext_instruction(355)]] +float spirv_OpGroupNonUniformFMin_clustered(uint scope, [[vk::ext_literal]] uint op, float value, uint clusterSize); + +[[vk::ext_capability(/* GroupNonUniform */ 61)]] +[[vk::ext_capability(/* GroupNonUniformArithmetic */ 63)]] +[[vk::ext_capability(/* GroupNonUniformClustered */ 67)]] +[[vk::ext_instruction(358)]] +float spirv_OpGroupNonUniformFMax_clustered(uint scope, [[vk::ext_literal]] uint op, float value, uint clusterSize); + +[[vk::ext_capability(/* GroupNonUniform */ 61)]] +[[vk::ext_capability(/* GroupNonUniformArithmetic */ 63)]] +[[vk::ext_capability(/* GroupNonUniformClustered */ 67)]] +[[vk::ext_instruction(359)]] +uint spirv_OpGroupNonUniformBitwiseAnd_clustered(uint scope, [[vk::ext_literal]] uint op, uint value, uint clusterSize); + +[[vk::ext_capability(/* GroupNonUniform */ 61)]] +[[vk::ext_capability(/* GroupNonUniformArithmetic */ 63)]] +[[vk::ext_capability(/* GroupNonUniformClustered */ 67)]] +[[vk::ext_instruction(360)]] +uint spirv_OpGroupNonUniformBitwiseOr_clustered(uint scope, [[vk::ext_literal]] uint op, uint value, uint clusterSize); + +//===================================================================================================================== +// GpuRt WaveClusterSum Intrinsics float AmdExtD3DShaderIntrinsics_WaveClusterSum(float x, uint dxClusterSize) { const uint clusterSize = (1u << (dxClusterSize - 1)); - return spirv_OpGroupNonUniformFAdd_clustered(/* Subgroup */ 3, /* ClusteredReduce */ 3, x, clusterSize); + 
return spirv_OpGroupNonUniformFAdd_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, x, clusterSize); } -[[vk::ext_instruction(355)]] -float spirv_OpGroupNonUniformFMin_clustered(uint scope, [[vk::ext_literal]] uint op, float value, uint clusterSize); - +//===================================================================================================================== +// GpuRt WaveClusterMin Intrinsics float AmdExtD3DShaderIntrinsics_WaveClusterMin(float x, uint dxClusterSize) { const uint clusterSize = (1u << (dxClusterSize - 1)); - return spirv_OpGroupNonUniformFMin_clustered(/* Subgroup */ 3, /* ClusteredReduce */ 3, x, clusterSize); + return spirv_OpGroupNonUniformFMin_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, x, clusterSize); } -[[vk::ext_instruction(358)]] -float spirv_OpGroupNonUniformFMax_clustered(uint scope, [[vk::ext_literal]] uint op, float value, uint clusterSize); +float2 AmdExtD3DShaderIntrinsics_WaveClusterMin(float2 val, uint dxClusterSize) +{ + float2 result; + const uint clusterSize = (1u << (dxClusterSize - 1)); + result.x = spirv_OpGroupNonUniformFMin_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val.x, clusterSize); + result.y = spirv_OpGroupNonUniformFMin_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val.y, clusterSize); + return result; +} -float AmdExtD3DShaderIntrinsics_WaveClusterMax(float x, uint dxClusterSize) +float3 AmdExtD3DShaderIntrinsics_WaveClusterMin(float3 val, uint dxClusterSize) { + float3 result; const uint clusterSize = (1u << (dxClusterSize - 1)); - return spirv_OpGroupNonUniformFMax_clustered(/* Subgroup */ 3, /* ClusteredReduce */ 3, x, clusterSize); + result.x = spirv_OpGroupNonUniformFMin_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val.x, clusterSize); + result.y = spirv_OpGroupNonUniformFMin_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val.y, clusterSize); + result.z = spirv_OpGroupNonUniformFMin_clustered(AmdExtClusteredSubgroup, 
AmdExtClusteredReduce, val.z, clusterSize); + return result; } -[[vk::ext_instruction(359)]] -uint spirv_OpGroupNonUniformBitwiseAnd_clustered(uint scope, [[vk::ext_literal]] uint op, uint value, uint clusterSize); +//===================================================================================================================== +// GpuRt WaveClusterMax Intrinsics +float AmdExtD3DShaderIntrinsics_WaveClusterMax(float val, uint dxClusterSize) +{ + const uint clusterSize = (1u << (dxClusterSize - 1)); + return spirv_OpGroupNonUniformFMax_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val, clusterSize); +} -uint AmdExtD3DShaderIntrinsics_WaveClusterBitAnd(uint x, uint dxClusterSize) +float2 AmdExtD3DShaderIntrinsics_WaveClusterMax(float2 val, uint dxClusterSize) { + float2 result; const uint clusterSize = (1u << (dxClusterSize - 1)); - return spirv_OpGroupNonUniformBitwiseAnd_clustered(/* Subgroup */ 3, /* ClusteredReduce */ 3, x, clusterSize); + result.x = spirv_OpGroupNonUniformFMax_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val.x, clusterSize); + result.y = spirv_OpGroupNonUniformFMax_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val.y, clusterSize); + return result; } -[[vk::ext_instruction(360)]] -uint spirv_OpGroupNonUniformBitwiseOr_clustered(uint scope, [[vk::ext_literal]] uint op, uint value, uint clusterSize); +float3 AmdExtD3DShaderIntrinsics_WaveClusterMax(float3 val, uint dxClusterSize) +{ + float3 result; + const uint clusterSize = (1u << (dxClusterSize - 1)); + result.x = spirv_OpGroupNonUniformFMax_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val.x, clusterSize); + result.y = spirv_OpGroupNonUniformFMax_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val.y, clusterSize); + result.z = spirv_OpGroupNonUniformFMax_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val.z, clusterSize); + return result; +} 
+//===================================================================================================================== +// GpuRt WaveClusterBitAnd Intrinsics +uint AmdExtD3DShaderIntrinsics_WaveClusterBitAnd(uint x, uint dxClusterSize) +{ + const uint clusterSize = (1u << (dxClusterSize - 1)); + return spirv_OpGroupNonUniformBitwiseAnd_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, x, clusterSize); +} + +//===================================================================================================================== +// GpuRt WaveClusterBitOr Intrinsics uint AmdExtD3DShaderIntrinsics_WaveClusterBitOr(uint x, uint dxClusterSize) { const uint clusterSize = (1u << (dxClusterSize - 1)); - return spirv_OpGroupNonUniformBitwiseOr_clustered(/* Subgroup */ 3, /* ClusteredReduce */ 3, x, clusterSize); + return spirv_OpGroupNonUniformBitwiseOr_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, x, clusterSize); } //===================================================================================================================== @@ -317,6 +379,7 @@ __decl uint64_t AmdExtConstantLoad64AtAddr(GpuVirtualAddress addr, uint offset) __decl uint AmdExtDispatchThreadIdFlat() DUMMY_UINT_FUNC; //===================================================================================================================== +__decl uint AmdExtAtomicAddAtAddr(uint64_t gpuVa, uint offset, uint value) DUMMY_UINT_FUNC; __decl uint64_t AmdExtAtomic64AddAtAddr(uint64_t gpuVa, uint offset, uint64_t value) DUMMY_UINT_FUNC __decl uint64_t AmdExtAtomic64CmpXchgAtAddr(uint64_t gpuVa, uint offset, uint64_t compare_value, uint64_t value) DUMMY_UINT_FUNC __decl uint64_t AmdExtLoad64AtAddrUncached(uint64_t gpuVa, uint offset) DUMMY_UINT_FUNC @@ -324,6 +387,12 @@ __decl uint AmdExtLoadDwordAtAddrUncached(uint64_t addr, uint offset) DUMMY_UIN __decl void AmdExtStoreDwordAtAddrUncached(uint64_t addr, uint offset, uint value) DUMMY_VOID_FUNC __decl uint3 AmdExtGroupIdCompute() 
DUMMY_UINT3_FUNC __decl uint3 AmdExtGroupDimCompute() DUMMY_UINT3_FUNC +__decl uint3 AmdExtThreadIdInGroupCompute() DUMMY_UINT3_FUNC +__decl uint AmdExtFlattenedThreadIdInGroupCompute() DUMMY_UINT_FUNC +__decl uint AmdExtLoadDwordAtAddr(uint64_t addr, uint offset) DUMMY_UINT_FUNC +__decl void AmdExtStoreDwordAtAddr(uint64_t addr, uint offset, uint value) DUMMY_VOID_FUNC +__decl void AmdExtDeviceMemoryAcquire() DUMMY_VOID_FUNC +__decl void AmdExtDeviceMemoryRelease() DUMMY_VOID_FUNC __decl uint AmdExtLaneCount() DUMMY_UINT_FUNC __decl void AmdExtSleep(uint value) DUMMY_VOID_FUNC diff --git a/src/shaders/GenerateMortonCodes.hlsl b/src/shaders/GenerateMortonCodes.hlsl index 6cd8bbd..79df409 100644 --- a/src/shaders/GenerateMortonCodes.hlsl +++ b/src/shaders/GenerateMortonCodes.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #include "BuildRootSignature.hlsl" #endif diff --git a/src/shaders/GpuRtLibraryCont.hlsl b/src/shaders/GpuRtLibraryCont.hlsl index de0dc26..293fae0 100644 --- a/src/shaders/GpuRtLibraryCont.hlsl +++ b/src/shaders/GpuRtLibraryCont.hlsl @@ -81,18 +81,13 @@ #define TRAVERSAL_STATE_COMMITTED_TRIANGLE_HIT 5 #define TRAVERSAL_STATE_COMMITTED_PROCEDURAL_PRIMITIVE_HIT 6 -// This state implies Traversal was stopped to run AHS/IS for other lanes. This lane wants to resume Traversal. -#define TRAVERSAL_STATE_SUSPEND_TRAVERSAL 7 - // Shader priorities for continuation scheduling. Higher values mean higher scheduling precedence. -// Reserve priority 0 as invalid value. This way, 0-initialized priorities in metadata-annotated -// function pointers (e.g. from relocations) can be detected. 
// Note: For 32-bit packing of function pointers, we require the scheduling priority to fit into 3 bits. -#define SCHEDULING_PRIORITY_INVALID 0 -#define SCHEDULING_PRIORITY_RGS 1 -#define SCHEDULING_PRIORITY_CHS 2 -#define SCHEDULING_PRIORITY_MISS 2 -#define SCHEDULING_PRIORITY_TRAVERSAL 3 +#define SCHEDULING_PRIORITY_PWG_DEAD 0 +#define SCHEDULING_PRIORITY_TRAVERSAL 1 +#define SCHEDULING_PRIORITY_RGS 2 +#define SCHEDULING_PRIORITY_CHS 3 +#define SCHEDULING_PRIORITY_MISS 3 // Give IS higher prio than AHS so AHS called by ReportHit // have a chance to run together with AHS called by Traversal. #define SCHEDULING_PRIORITY_AHS 4 @@ -144,7 +139,7 @@ static uint GetPriorityForShaderType( case DXILShaderKind::AnyHit: return SCHEDULING_PRIORITY_AHS; case DXILShaderKind::Intersection: return SCHEDULING_PRIORITY_IS; case DXILShaderKind::RayGeneration: return SCHEDULING_PRIORITY_RGS; - default: return SCHEDULING_PRIORITY_INVALID; + default: GPU_ASSERT(false); return 0; } } @@ -153,60 +148,128 @@ static uint3 GetDispatchRaysDimensions(); //===================================================================================================================== -static uint64_t GetVpcWithPriority(uint64_t vpc, uint priority) -{ - if (_AmdIsLlpc()) +struct Vpc64 { + uint64_t vpc; + +#if defined(__cplusplus) + Vpc64(uint64_t value) : vpc(value) {} +#endif + + uint64_t GetU64() { return vpc; } - const uint64_t prio64 = priority; - const uint firstMetadataBit = 32; - const uint firstPriorityBitInMetadata = 16; - GPU_ASSERT((vpc & 0xFFFF000000000000) == 0); - return vpc | (prio64 << (firstMetadataBit + firstPriorityBitInMetadata)); -} + uint GetFunctionAddr() + { + return (vpc & 0xFFFFFFFF); + } + + bool IsValid() + { + return GetFunctionAddr() != 0; + } + + Vpc64 SetPriority(uint priority) + { + if (_AmdIsLlpc()) + { + return Vpc64(vpc); + } + + const uint64_t prio64 = (uint64_t)(priority); + const uint firstMetadataBit = 32; + const uint firstPriorityBitInMetadata = 16; + 
GPU_ASSERT((vpc & 0xFFFF000000000000) == 0); + vpc |= (prio64 << (firstMetadataBit + firstPriorityBitInMetadata)); + return Vpc64(vpc); + } + + uint GetPriority() + { + uint inMetadata = (uint)(vpc >> 32); + return (uint)(inMetadata >> 16); + } + + static Vpc64 MakeWithPriority(Vpc64 vpc64, uint priority) + { + return vpc64.SetPriority(priority); + } +}; + +struct Vpc32 { + uint32_t vpc; + +#if defined(__cplusplus) + Vpc32(uint32_t value) : vpc(value) {} +#endif + + uint32_t GetU32() + { + return vpc; + } + + uint32_t GetFunctionAddr() + { + return (uint32_t)(vpc & 0xFFFFFFC0); + } + + bool IsValid() + { + return GetFunctionAddr() != 0; + } + + void SetPriority(uint priority) + { + vpc |= priority; + } + + uint GetPriority() + { + return (uint)(vpc & 0x7); + } +}; //===================================================================================================================== // 32-bit function pointer packing/unpacking // -static uint64_t Unpack32BitVpcTo64BitVpc(uint32_t vpc32, bool unpackPriority) +static Vpc64 Vpc32ToVpc64(Vpc32 vpc32, bool unpackPriority) { if (_AmdIsLlpc()) { - return vpc32; + return Vpc64(vpc32.GetU32()); } - uint64_t vpc = (vpc32 & 0xFFFFFFC0); + Vpc64 vpc64 = Vpc64((uint64_t)(vpc32.GetFunctionAddr())); if (unpackPriority) { - // The priority is stored in bits 0..2. 
- uint32_t priority = (vpc32 & 0x7); - vpc = GetVpcWithPriority(vpc, priority); + vpc64.SetPriority(vpc32.GetPriority()); } - return vpc; + return vpc64; } -static uint32_t Pack64BitVpcTo32Bits(uint64_t vpc) +static Vpc32 Vpc64ToVpc32(Vpc64 vpc64) { + Vpc32 vpc32 = Vpc32((uint32_t)(vpc64.GetFunctionAddr())); + if (_AmdIsLlpc()) { - return (vpc & 0xFFFFFFFF); + return vpc32; } + GPU_ASSERT((vpc32.GetU32() & 0x2F) == 0); + // Incoming metadata is in the high dword - uint32_t inMetadata = (uint32_t)(vpc >> 32); - uint32_t prio = (inMetadata >> 16); + uint prio = vpc64.GetPriority(); + // We only have three bits for the priority: GPU_ASSERT(prio <= 7); - // Outgoing metadata is in the low 6 bits - uint32_t outMetadata = prio; + vpc32.SetPriority(prio); - GPU_ASSERT((vpc & 0x2F) == 0); - return SplitUint64(vpc).x | outMetadata; + return vpc32; } //===================================================================================================================== @@ -596,14 +659,14 @@ struct _AmdTraversalState return committed.State(); } - void PackReturnAddress(uint64_t returnAddr) + void PackReturnAddress(Vpc64 returnAddr) { - packedReturnAddr = Pack64BitVpcTo32Bits(returnAddr); + packedReturnAddr = Vpc64ToVpc32(returnAddr).GetU32(); } - uint64_t ReturnAddress() + Vpc64 ReturnAddress() { - return Unpack32BitVpcTo64BitVpc(packedReturnAddr, true); + return Vpc32ToVpc64(Vpc32(packedReturnAddr), true); } }; @@ -679,9 +742,7 @@ struct _AmdSystemData bool IsChsOrMiss(in uint state) { - return (state >= TRAVERSAL_STATE_COMMITTED_NOTHING) && - ((Options::getCpsCandidatePrimitiveMode() != CpsCandidatePrimitiveMode::SuspendWave) || - (state < TRAVERSAL_STATE_SUSPEND_TRAVERSAL)); + return (state >= TRAVERSAL_STATE_COMMITTED_NOTHING); } bool IsMiss(in uint state) @@ -762,19 +823,17 @@ struct _AmdTraversalResultData DECLARE_ENQUEUE(, uint64_t returnAddr, _AmdSystemData data) DECLARE_ENQUEUE(Traversal, uint64_t dummyReturnAddr, _AmdSystemData data) +DECLARE_ENQUEUE(TraversalDead, 
uint64_t dummyReturnAddr, _AmdDispatchSystemData data) DECLARE_ENQUEUE(RayGen, uint64_t dummyReturnAddr, _AmdDispatchSystemData data) -DECLARE_WAIT_ENQUEUE(Traversal, uint64_t dummyReturnAddr, _AmdSystemData data) -DECLARE_WAIT_ENQUEUE(Traversal, uint64_t dummyReturnAddr, _AmdDispatchSystemData data) DECLARE_ENQUEUE(AnyHit, uint64_t returnAddr, _AmdAnyHitSystemData data, float2 candidateBarycentrics) DECLARE_ENQUEUE(Intersection, uint64_t returnAddr, _AmdAnyHitSystemData data) -DECLARE_WAIT_ENQUEUE(, uint64_t returnAddr, _AmdSystemData data) DECLARE_AWAIT(AnyHit, _AmdAnyHitSystemData, uint64_t returnAddr, _AmdAnyHitSystemData data) DECLARE_AWAIT(CallShader, _AmdDispatchSystemData, uint64_t returnAddr, _AmdDispatchSystemData data) // No returnAddr argument. The return address is instead included in the passed system data. -DECLARE_WAIT_AWAIT(Traversal, _AmdDispatchSystemData, _AmdSystemData data) +DECLARE_AWAIT(Traversal, _AmdDispatchSystemData, _AmdSystemData data) DECLARE_RESTORE_SYSTEM_DATA(, _AmdDispatchSystemData data) DECLARE_RESTORE_SYSTEM_DATA(AnyHit, _AmdAnyHitSystemData data) @@ -826,36 +885,37 @@ inline _AmdSystemData _AmdSystemData::MakeDeadLaneWithoutStack() //===================================================================================================================== // Return the argument. 
-static uint64_t GetVpcFromShaderId(uint32_t shaderId, uint priority) +static Vpc64 GetVpc64FromShaderId(Vpc32 shaderId, uint priority) { - uint64_t vpc = Unpack32BitVpcTo64BitVpc(shaderId, /* unpackPriority = */ false); - return GetVpcWithPriority(vpc, priority); + Vpc64 vpc64 = Vpc32ToVpc64(shaderId, /* unpackPriority = */ false); + vpc64.SetPriority(priority); + return vpc64; } //===================================================================================================================== -static uint64_t GetVpcFromShaderIdAddr(GpuVirtualAddress addr, uint priority) +static Vpc64 GetVpc64FromShaderIdAddr(GpuVirtualAddress addr, uint priority) { #ifdef __cplusplus return 1; #else - uint32_t shaderId = ConstantLoadDwordAtAddr(addr); - return GetVpcFromShaderId(shaderId, priority); + Vpc32 shaderId = Vpc32(ConstantLoadDwordAtAddr(addr)); + return GetVpc64FromShaderId(shaderId, priority); #endif } //===================================================================================================================== -static uint64_t GetVpcFromShaderIdTable( +static Vpc64 GetVpc64FromShaderIdTable( GpuVirtualAddress tableAddress, uint index, uint stride, uint priority) { - return GetVpcFromShaderIdAddr(tableAddress + stride * index, priority); + return GetVpc64FromShaderIdAddr(tableAddress + stride * index, priority); } //===================================================================================================================== // Returns the 32-bit part of the hit group shader id containing the AHS shader id. 
-static uint32_t GetAnyHit32BitShaderId( +static Vpc32 GetAnyHit32BitShaderId( uint hitGroupRecordIndex) { const uint offset = DispatchRaysConstBuf.hitGroupTableStrideInBytes * hitGroupRecordIndex; @@ -864,18 +924,18 @@ static uint32_t GetAnyHit32BitShaderId( PackUint64(DispatchRaysConstBuf.hitGroupTableBaseAddressLo, DispatchRaysConstBuf.hitGroupTableBaseAddressHi); if (tableVa == 0) { - return 0; + return Vpc32(0); } - return ConstantLoadDwordAtAddr(tableVa + offset + 8); + return Vpc32(ConstantLoadDwordAtAddr(tableVa + offset + 8)); } //===================================================================================================================== // Returns the 64-bit VPC for the given AHS by loading its shader address, and setting the AHS priority. -static uint64_t GetAnyHitAddr( +static Vpc64 GetAnyHitAddr( uint hitGroupRecordIndex) { - uint32_t shaderId = GetAnyHit32BitShaderId(hitGroupRecordIndex); - return GetVpcFromShaderId(shaderId, SCHEDULING_PRIORITY_AHS); + Vpc32 shaderId = GetAnyHit32BitShaderId(hitGroupRecordIndex); + return GetVpc64FromShaderId(shaderId, SCHEDULING_PRIORITY_AHS); } //===================================================================================================================== @@ -891,7 +951,7 @@ static bool AnyHitIsNonNull( geometryContributionToHitGroupIndex, instanceContributionToHitGroupIndex); - return GetAnyHit32BitShaderId(hitGroupRecordIndex) != 0; + return GetAnyHit32BitShaderId(hitGroupRecordIndex).IsValid(); } //===================================================================================================================== @@ -942,14 +1002,6 @@ static float4x3 WorldToObject4x3(in uint64_t tlasBaseAddr, in uint instNodePtr) return transpose(WorldToObject3x4(tlasBaseAddr, instNodePtr)); } -//===================================================================================================================== -__decl uint3 AmdExtThreadIdInGroupCompute() DUMMY_UINT3_FUNC -__decl uint 
AmdExtFlattenedThreadIdInGroupCompute() DUMMY_UINT_FUNC -__decl uint AmdExtLoadDwordAtAddr(uint64_t addr, uint offset) DUMMY_UINT_FUNC -__decl void AmdExtStoreDwordAtAddr(uint64_t addr, uint offset, uint value) DUMMY_VOID_FUNC -__decl void AmdExtDeviceMemoryAcquire() DUMMY_VOID_FUNC -__decl void AmdExtDeviceMemoryRelease() DUMMY_VOID_FUNC - //===================================================================================================================== // Implementation of DispatchRaysIndex. export uint3 _cont_DispatchRaysIndex3(in _AmdDispatchSystemData data) @@ -970,7 +1022,7 @@ static uint3 GetDispatchRaysDimensions() //===================================================================================================================== // Persistent dispatch size (1D). -static uint3 GetPersistentDispatchSize() +static uint GetPersistentDispatchSize() { // Groups needed to cover the dispatch if each thread only processes 1 ray const uint3 rayDispatch = GetDispatchRaysDimensions(); @@ -1069,6 +1121,7 @@ static uint3 GetDispatchId() dispatchId.z = groupId.y; if ((dims.x > 1) && (dims.y > 1)) { + // Use 8 x (threadGroupSize / 8) tiles. /* Sample: D3D12_DISPATCH_RAYS_DESC::(w x h x d) = (18, 6, 1). Divided into 8x4 tiles(boxes). A number in a box is the group id. @@ -1334,20 +1387,27 @@ export uint64_t _cont_GetContinuationStackGlobalMemBase() } //===================================================================================================================== -static uint64_t GetTraversalVpc() +static Vpc64 GetTraversalVpc64() { // NOTE: DXCP uses a table for TraceRay, thus a load to traceRayGpuVa retrieves the actual traversal function // address. But Vulkan does not use the table so far, traceRayGpuVa is already the traversal function address. 
- return PackUint64(DispatchRaysConstBuf.traceRayGpuVaLo, - DispatchRaysConstBuf.traceRayGpuVaHi); + return Vpc64(PackUint64(DispatchRaysConstBuf.traceRayGpuVaLo, + DispatchRaysConstBuf.traceRayGpuVaHi)); } //===================================================================================================================== -static uint64_t GetRayGenVpc() +static Vpc64 GetTraversalVpc64PwgDead() { - return GetVpcFromShaderIdAddr(PackUint64(DispatchRaysConstBuf.rayGenerationTableAddressLo, - DispatchRaysConstBuf.rayGenerationTableAddressHi), - SCHEDULING_PRIORITY_RGS); + return Vpc64(PackUint64(DispatchRaysConstBuf.traceRayGpuVaLo, + DispatchRaysConstBuf.traceRayGpuVaHi)); +} + +//===================================================================================================================== +static Vpc64 GetRayGenVpc64() +{ + return GetVpc64FromShaderIdAddr(PackUint64(DispatchRaysConstBuf.rayGenerationTableAddressLo, + DispatchRaysConstBuf.rayGenerationTableAddressHi), + SCHEDULING_PRIORITY_RGS); } //===================================================================================================================== @@ -1610,7 +1670,6 @@ static uint2 RayHistoryGetIdentifierFromVPC(uint64_t vpc) //===================================================================================================================== static uint2 RayHistoryGetIdentifierFromShaderId(uint2 shaderId) { - // Zero out the dVGPR bits and the higher dWord return uint2(shaderId.x & 0xFFFFFFC0, 0); } @@ -1828,15 +1887,14 @@ export bool _cont_ReportHit(inout_param(_AmdAnyHitSystemData) data, float THit, hitGroupRecordIndex = data.base.dispatch.shaderRecIdx; } // Compute hit group address and fetch shader identifiers - const uint64_t anyHitAddr = GetAnyHitAddr(hitGroupRecordIndex); + const Vpc64 anyHitAddr = GetAnyHitAddr(hitGroupRecordIndex); - if (SplitUint64(anyHitAddr).x != 0) + if (anyHitAddr.IsValid()) { // Call AnyHit // Hit attributes are added as an additional argument by the 
compiler - const uint64_t resumeAddr = _AmdGetResumePointAddr(); - const uint64_t resumeAddrWithPrio = GetVpcWithPriority(resumeAddr, SCHEDULING_PRIORITY_IS); - data = _AmdAwaitAnyHit(anyHitAddr, resumeAddrWithPrio, data); + Vpc64 resumeAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetResumePointAddr()), SCHEDULING_PRIORITY_IS); + data = _AmdAwaitAnyHit(anyHitAddr.GetU64(), resumeAddr.GetU64(), data); _AmdRestoreSystemDataAnyHit(data); return data.base.ray.AnyHitDidAccept(); } @@ -1874,12 +1932,12 @@ export void _cont_CallShader(inout_param(_AmdDispatchSystemData) data, uint inde return; } - const uint64_t addr = GetVpcFromShaderIdTable(callableTableBaseAddress, - index, - DispatchRaysConstBuf.callableTableStrideInBytes, - SCHEDULING_PRIORITY_CALLABLE); + const Vpc64 addr = GetVpc64FromShaderIdTable(callableTableBaseAddress, + index, + DispatchRaysConstBuf.callableTableStrideInBytes, + SCHEDULING_PRIORITY_CALLABLE); - if (SplitUint64(addr).x == 0) + if (!addr.IsValid()) { // See TODO above on how to handle this case better. return; @@ -1890,10 +1948,9 @@ export void _cont_CallShader(inout_param(_AmdDispatchSystemData) data, uint inde const DXILShaderKind enclosingShaderType = _AmdGetShaderKind(); const uint resumePrio = GetPriorityForShaderType(enclosingShaderType); - const uint64_t resumeAddr = _AmdGetResumePointAddr(); - const uint64_t resumeAddrWithPrio = GetVpcWithPriority(resumeAddr, resumePrio); + const Vpc64 resumeAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetResumePointAddr()), resumePrio); - data = _AmdAwaitCallShader(addr, resumeAddrWithPrio, data); + data = _AmdAwaitCallShader(addr.GetU64(), resumeAddr.GetU64(), data); // for the resume part. 
data.shaderRecIdx = callerShaderRecIdx; // restores callerShaderRecIdx @@ -1903,23 +1960,23 @@ export void _cont_CallShader(inout_param(_AmdDispatchSystemData) data, uint inde //===================================================================================================================== // Returns the low part of the miss shader address and sets up the dispatch data to have the correct shader record // index. -static uint64_t SetupMissShader(inout_param(_AmdSystemData) data, out_param(uint) shaderRecIdx) +static Vpc64 SetupMissShader(inout_param(_AmdSystemData) data, out_param(uint) shaderRecIdx) { const uint64_t missTableBaseAddress = PackUint64(DispatchRaysConstBuf.missTableBaseAddressLo, DispatchRaysConstBuf.missTableBaseAddressHi); if (missTableBaseAddress == 0) { shaderRecIdx = 0; - return 0; + return Vpc64(0); } shaderRecIdx = ExtractMissShaderIndex(data.ray.traceParameters); // Calculate miss shader record address - const uint64_t shaderAddr = GetVpcFromShaderIdTable(missTableBaseAddress, - shaderRecIdx, - DispatchRaysConstBuf.missTableStrideInBytes, - SCHEDULING_PRIORITY_MISS); + const Vpc64 shaderAddr = GetVpc64FromShaderIdTable(missTableBaseAddress, + shaderRecIdx, + DispatchRaysConstBuf.missTableStrideInBytes, + SCHEDULING_PRIORITY_MISS); return shaderAddr; } @@ -1949,6 +2006,11 @@ static HitGroupInfo GetHitGroupInfo( #include "Continuations2_0.hlsl" #if CONTINUATION_ON_GPU +static uint64_t GetDispatchIdAddr() +{ + return PackUint64(DispatchRaysConstBuf.cpsDispatchIdAddressLo, DispatchRaysConstBuf.cpsDispatchIdAddressHi); +} + //===================================================================================================================== static void LaunchRayGen(bool setupStack) { @@ -1968,28 +2030,16 @@ static void LaunchRayGen(bool setupStack) // This is written in a way that is intended to be correct even if threads don't reconverge after calling into // the ray generation shader. 
- uint localWorkId; const uint popCount = WaveActiveCountBits(true); + uint flatDispatchId = 0; if (WaveIsFirstLane()) { - localWorkId = AmdTraceRayPersistentLdsAtomicAdd(0, popCount); + flatDispatchId = AmdExtAtomicAddAtAddr(GetDispatchIdAddr(), 0, popCount); } - localWorkId = WaveReadLaneFirst(localWorkId) + WavePrefixCountBits(true); + flatDispatchId = WaveReadLaneFirst(flatDispatchId) + WavePrefixCountBits(true); const uint3 rayDims = GetDispatchRaysDimensions(); - const uint tgCount = GetPersistentDispatchSize(); - - // Single dimension dispatch so the flattened group ID is the same as the x component of the group ID - const uint tgId = AmdExtGroupIdCompute().x; - - // Interleave waves' worth of work among CUs so that every CU does approximately the same amount of work even - // for dispatches that are smaller than the maximum occupancy of the GPU. This is probably also a bit better - // for memory and shader execution locality, since CUs should tend to stay roughly within the same region of - // the dispatch. Assume numthreads(32, 1, 1). 
- const uint lowPart = localWorkId & 31; - const uint highPart = localWorkId & ~31; - const uint flatDispatchId = highPart * tgCount + tgId * 32 + lowPart; dispatchId = GetDispatchId(rayDims.x, rayDims.y, flatDispatchId); valid = flatDispatchId < (rayDims.x * rayDims.y * rayDims.z); @@ -2016,12 +2066,12 @@ static void LaunchRayGen(bool setupStack) #if DEVELOPER systemData.parentId = -1; #endif - _AmdEnqueueRayGen(GetRayGenVpc(), _AmdGetUninitializedI64(), systemData); + _AmdEnqueueRayGen(GetRayGenVpc64().GetU64(), _AmdGetUninitializedI64(), systemData); } else if (Options::getPersistentLaunchEnabled()) { _AmdDispatchSystemData systemData = _AmdDispatchSystemData::MakeDeadLaneWithStack(); - _AmdWaitEnqueueTraversal(GetTraversalVpc(), -1, _AmdGetUninitializedI64(), systemData); + _AmdEnqueueTraversalDead(GetTraversalVpc64PwgDead().GetU64(), _AmdGetUninitializedI64(), systemData); } } @@ -2029,16 +2079,6 @@ static void LaunchRayGen(bool setupStack) // KernelEntry is entry function of the RayTracing continuation mode export void _cont_KernelEntry() { - if (Options::getPersistentLaunchEnabled()) - { - if (AmdExtFlattenedThreadIdInGroupCompute() == 0) - { - AmdTraceRayPersistentLdsWrite(0, 0); - } - - GroupMemoryBarrierWithGroupSync(); - } - LaunchRayGen(true); } @@ -2136,17 +2176,16 @@ export void _cont_TraceRay( const uint callerShaderRecIdx = dispatch.shaderRecIdx; // 0 if from RayGen. const uint parentId = RayHistoryGetParentId(dispatch); - const uint64_t traversalAddrWithPrio = GetTraversalVpc(); + const Vpc64 traversalAddr = GetTraversalVpc64(); // The type of the shader containing this TraceRay call, i.e. the shader we are inlined into. const DXILShaderKind enclosingShaderType = _AmdGetShaderKind(); const uint resumePrio = GetPriorityForShaderType(enclosingShaderType); - // NO control flow is allowed between _AmdGetResumePointAddr() and _AmdWaitAwaitTraversal(). 
- const uint64_t resumeAddr = _AmdGetResumePointAddr(); - const uint64_t resumeAddrWithPrio = GetVpcWithPriority(resumeAddr, resumePrio); - data.traversal.PackReturnAddress(resumeAddrWithPrio); - dispatch = _AmdWaitAwaitTraversal(traversalAddrWithPrio, -1, data); + // NO control flow is allowed between _AmdGetResumePointAddr() and _AmdAwaitTraversal(). + const Vpc64 resumeAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetResumePointAddr()), resumePrio); + data.traversal.PackReturnAddress(resumeAddr); + dispatch = _AmdAwaitTraversal(traversalAddr.GetU64(), data); // for the resume part. dispatch.shaderRecIdx = callerShaderRecIdx; // restores callerShaderRecIdx @@ -2164,14 +2203,14 @@ static bool GetNextHitMissPc( inout_param(_AmdSystemData) data, uint state, _AmdPrimitiveSystemState candidate, - out_param(uint64_t) nextShaderAddr) + out_param(Vpc64) nextShaderAddr) { // MS if (data.IsMiss(state)) { uint shaderRecIdx; - const uint64_t missShaderAddr = SetupMissShader(data, shaderRecIdx); - if (SplitUint64(missShaderAddr).x != 0) + const Vpc64 missShaderAddr = SetupMissShader(data, shaderRecIdx); + if (missShaderAddr.IsValid()) { // Valid MS data.dispatch.shaderRecIdx = shaderRecIdx; @@ -2194,7 +2233,7 @@ static bool GetNextHitMissPc( if (hitInfo.closestHitId.x != 0) { // Valid CHS - nextShaderAddr = GetVpcFromShaderId(hitInfo.closestHitId.x, SCHEDULING_PRIORITY_CHS); + nextShaderAddr = GetVpc64FromShaderId(Vpc32(hitInfo.closestHitId.x), SCHEDULING_PRIORITY_CHS); return true; } } @@ -2225,7 +2264,7 @@ static void TraversalInternal( } } -static void EnqueueNextShader(bool hasWorkToDo, uint64_t nextShaderAddr, uint64_t returnAddr, _AmdSystemData data) +static void EnqueueNextShader(bool hasWorkToDo, Vpc64 nextShaderAddr, Vpc64 returnAddr, _AmdSystemData data) { if (!hasWorkToDo) { @@ -2233,7 +2272,7 @@ static void EnqueueNextShader(bool hasWorkToDo, uint64_t nextShaderAddr, uint64_ { // No work to do = dead lane, jump to traversal as a synchronization point with an empty 
system data _AmdSystemData sysData = _AmdSystemData::MakeDeadLaneWithoutStack(); - _AmdWaitEnqueueTraversal(GetTraversalVpc(), -1, _AmdGetUninitializedI64(), sysData); + _AmdEnqueueTraversal(GetTraversalVpc64().GetU64(), _AmdGetUninitializedI64(), sysData); } else { @@ -2244,21 +2283,21 @@ static void EnqueueNextShader(bool hasWorkToDo, uint64_t nextShaderAddr, uint64_ const uint newState = data.traversal.committed.State(); RayHistoryWriteEnd(data, newState); - if (nextShaderAddr != returnAddr) + if (nextShaderAddr.GetU64() != returnAddr.GetU64()) { const DXILShaderKind shaderKind = (DXILShaderKind)(data.IsMiss(newState) ? (int)DXILShaderKind::Miss : // convert to int to fix linux build error (int)DXILShaderKind::ClosestHit); RayHistoryWriteFunctionCall(data, - RayHistoryGetIdentifierFromVPC(nextShaderAddr), + RayHistoryGetIdentifierFromVPC(nextShaderAddr.GetU64()), data.dispatch.shaderRecIdx, shaderKind); - _AmdEnqueue(nextShaderAddr, returnAddr, data); + _AmdEnqueue(nextShaderAddr.GetU64(), returnAddr.GetU64(), data); } // Return to RayGen. No need to set a priority, as it is already set in the stored return address. - _AmdEnqueueRayGen(returnAddr, _AmdGetUninitializedI64(), data.dispatch); + _AmdEnqueueRayGen(returnAddr.GetU64(), _AmdGetUninitializedI64(), data.dispatch); } //===================================================================================================================== @@ -2350,21 +2389,24 @@ export void _cont_Traversal( _AmdTraversalResultData result = (_AmdTraversalResultData)0; bool IsChsOrMiss = data.IsChsOrMiss(state); - if ((_AmdContinuationStackIsGlobal() && WaveActiveAllTrue(IsChsOrMiss)) || - (!_AmdContinuationStackIsGlobal() && IsChsOrMiss)) + // Re-enqueue Traversal until all lanes are done with BVH Traversal. + // Only then enqueue CHS/Miss to ensure other lanes that are not yet done with Traversal + // converge on these CHS/Miss invocations. + // This is necessary because Traversal has lower scheduling priority. 
+ if (WaveActiveAllTrue(IsChsOrMiss)) { EnterSchedulerSection(); - uint64_t nextShaderAddr = 0; + Vpc64 nextShaderAddr = Vpc64(0); GetNextHitMissPc(data, state, candidate, nextShaderAddr); bool hasWorkToDo = true; - if (_AmdContinuationStackIsGlobal() && (nextShaderAddr != 0)) + if (_AmdContinuationStackIsGlobal() && nextShaderAddr.IsValid()) { } - const uint64_t returnAddr = data.traversal.ReturnAddress(); - if (nextShaderAddr == 0) + const Vpc64 returnAddr = data.traversal.ReturnAddress(); + if (!nextShaderAddr.IsValid()) { nextShaderAddr = returnAddr; } @@ -2372,10 +2414,7 @@ export void _cont_Traversal( } else { - bool mayEnqueueTraversal = (_AmdContinuationStackIsGlobal() || - (Options::getCpsCandidatePrimitiveMode() == CpsCandidatePrimitiveMode::SuspendWave)); - // If we cannot re-enqueue Traversal, then we already know that we are in AHS or IS state. - if (!mayEnqueueTraversal || data.IsAhs(state) || data.IsIs(state)) + if (data.IsAhs(state) || data.IsIs(state)) { HitGroupInfo hitInfo = (HitGroupInfo)0; { @@ -2395,10 +2434,9 @@ export void _cont_Traversal( hitInfo.tableIndex, DXILShaderKind::AnyHit); - const uint64_t addr = GetVpcFromShaderId(hitInfo.anyHitId.x, SCHEDULING_PRIORITY_AHS); - const uint64_t returnAddr = _AmdGetCurrentFuncAddr(); - const uint64_t returnAddrWithPrio = GetVpcWithPriority(returnAddr, SCHEDULING_PRIORITY_TRAVERSAL); - _AmdEnqueueAnyHit(addr, returnAddrWithPrio, anyHitData, candidateBarycentrics); + const Vpc64 addr = GetVpc64FromShaderId(Vpc32(hitInfo.anyHitId.x), SCHEDULING_PRIORITY_AHS); + const Vpc64 returnAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetCurrentFuncAddr()), SCHEDULING_PRIORITY_TRAVERSAL); + _AmdEnqueueAnyHit(addr.GetU64(), returnAddr.GetU64(), anyHitData, candidateBarycentrics); } else { @@ -2410,10 +2448,9 @@ export void _cont_Traversal( hitInfo.tableIndex, DXILShaderKind::Intersection); - const uint64_t addr = GetVpcFromShaderId(hitInfo.intersectionId.x, SCHEDULING_PRIORITY_IS); - const uint64_t returnAddr = 
_AmdGetCurrentFuncAddr(); - const uint64_t returnAddrWithPrio = GetVpcWithPriority(returnAddr, SCHEDULING_PRIORITY_TRAVERSAL); - _AmdEnqueueIntersection(addr, returnAddrWithPrio, anyHitData); + const Vpc64 addr = GetVpc64FromShaderId(Vpc32(hitInfo.intersectionId.x), SCHEDULING_PRIORITY_IS); + const Vpc64 returnAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetCurrentFuncAddr()), SCHEDULING_PRIORITY_TRAVERSAL); + _AmdEnqueueIntersection(addr.GetU64(), returnAddr.GetU64(), anyHitData); } } else @@ -2421,9 +2458,8 @@ export void _cont_Traversal( // // Everything else needs to go back through scheduling/traversal, regardless of state // Note we don't need "Wait" here because priorities run AHS and IS first - const uint64_t traversalAddr = _AmdGetCurrentFuncAddr(); - const uint64_t traversalAddrWithPrio = GetVpcWithPriority(traversalAddr, SCHEDULING_PRIORITY_TRAVERSAL); - _AmdEnqueueTraversal(traversalAddrWithPrio, _AmdGetUninitializedI64(), data); + const Vpc64 traversalAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetCurrentFuncAddr()), SCHEDULING_PRIORITY_TRAVERSAL); + _AmdEnqueueTraversal(traversalAddr.GetU64(), _AmdGetUninitializedI64(), data); } } // This is unreachable diff --git a/src/shaders/IndirectArgBufferUtils.hlsl b/src/shaders/IndirectArgBufferUtils.hlsl index 73a627d..952c6a5 100644 --- a/src/shaders/IndirectArgBufferUtils.hlsl +++ b/src/shaders/IndirectArgBufferUtils.hlsl @@ -31,7 +31,7 @@ #ifndef _INDIRECTARGBUFFER_HLSL #define _INDIRECTARGBUFFER_HLSL -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #include "BuildSettings.hlsli" //====================================================================================================================== diff --git a/src/shaders/MergeSort.hlsl b/src/shaders/MergeSort.hlsl index bd1921a..50d6882 100644 --- a/src/shaders/MergeSort.hlsl +++ b/src/shaders/MergeSort.hlsl @@ -26,7 +26,7 @@ #define BUILD_THREADGROUP_SIZE 512 
//===================================================================================================================== -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #define GC_DSTMETADATA #define GC_SCRATCHBUFFER diff --git a/src/shaders/PairCompression.hlsl b/src/shaders/PairCompression.hlsl index 91aac60..da86963 100644 --- a/src/shaders/PairCompression.hlsl +++ b/src/shaders/PairCompression.hlsl @@ -25,7 +25,7 @@ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #define GC_DSTBUFFER #define GC_DSTMETADATA diff --git a/src/shaders/RadixSort/BitHistogram.hlsl b/src/shaders/RadixSort/BitHistogram.hlsl index f2b3fb2..5b5d4a3 100644 --- a/src/shaders/RadixSort/BitHistogram.hlsl +++ b/src/shaders/RadixSort/BitHistogram.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../../shared/rayTracingDefs.h" +#include "../../shadersClean/common/ShaderDefs.hlsli" #include "../BuildRootSignature.hlsl" diff --git a/src/shaders/RadixSort/DistributePartSumInt4.hlsl b/src/shaders/RadixSort/DistributePartSumInt4.hlsl index 1c86e7c..8d1aaf2 100644 --- a/src/shaders/RadixSort/DistributePartSumInt4.hlsl +++ b/src/shaders/RadixSort/DistributePartSumInt4.hlsl @@ -25,7 +25,7 @@ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../../shared/rayTracingDefs.h" +#include "../../shadersClean/common/ShaderDefs.hlsli" #include "../BuildRootSignature.hlsl" diff --git a/src/shaders/RadixSort/ScanCommon.hlsli 
b/src/shaders/RadixSort/ScanCommon.hlsli index d43217f..edd8f35 100644 --- a/src/shaders/RadixSort/ScanCommon.hlsli +++ b/src/shaders/RadixSort/ScanCommon.hlsli @@ -22,7 +22,7 @@ * SOFTWARE. * **********************************************************************************************************************/ -#include "../../shared/rayTracingDefs.h" +#include "../../shadersClean/common/ShaderDefs.hlsli" #define NUMBER_OF_BLOCKS_PER_GROUP 1 #define NUM_BINS 16 diff --git a/src/shaders/RadixSort/ScanExclusiveInt4.hlsl b/src/shaders/RadixSort/ScanExclusiveInt4.hlsl index 4fd23ed..6c9ff45 100644 --- a/src/shaders/RadixSort/ScanExclusiveInt4.hlsl +++ b/src/shaders/RadixSort/ScanExclusiveInt4.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../../shared/rayTracingDefs.h" +#include "../../shadersClean/common/ShaderDefs.hlsli" #include "../BuildRootSignature.hlsl" diff --git a/src/shaders/RadixSort/ScanExclusiveInt4DLB.hlsl b/src/shaders/RadixSort/ScanExclusiveInt4DLB.hlsl index 91ff455..fe50439 100644 --- a/src/shaders/RadixSort/ScanExclusiveInt4DLB.hlsl +++ b/src/shaders/RadixSort/ScanExclusiveInt4DLB.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../../shared/rayTracingDefs.h" +#include "../../shadersClean/common/ShaderDefs.hlsli" #define GC_SCRATCHBUFFER #include "../BuildRootSignature.hlsl" diff --git a/src/shaders/RadixSort/ScanExclusivePartInt4.hlsl b/src/shaders/RadixSort/ScanExclusivePartInt4.hlsl index 34e53bf..40f8620 100644 --- 
a/src/shaders/RadixSort/ScanExclusivePartInt4.hlsl +++ b/src/shaders/RadixSort/ScanExclusivePartInt4.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../../shared/rayTracingDefs.h" +#include "../../shadersClean/common/ShaderDefs.hlsli" #include "../BuildRootSignature.hlsl" diff --git a/src/shaders/RadixSort/ScatterKeysAndValues.hlsl b/src/shaders/RadixSort/ScatterKeysAndValues.hlsl index dede6b2..03bb570 100644 --- a/src/shaders/RadixSort/ScatterKeysAndValues.hlsl +++ b/src/shaders/RadixSort/ScatterKeysAndValues.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../../shared/rayTracingDefs.h" +#include "../../shadersClean/common/ShaderDefs.hlsli" #include "../BuildRootSignature.hlsl" diff --git a/src/shaders/Rebraid.hlsl b/src/shaders/Rebraid.hlsl index 4aee6d9..48d6edc 100644 --- a/src/shaders/Rebraid.hlsl +++ b/src/shaders/Rebraid.hlsl @@ -30,7 +30,7 @@ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #define GC_DSTBUFFER #define GC_DSTMETADATA diff --git a/src/shaders/RefitBounds.hlsl b/src/shaders/RefitBounds.hlsl index d86fc26..c500419 100644 --- a/src/shaders/RefitBounds.hlsl +++ b/src/shaders/RefitBounds.hlsl @@ -22,7 +22,7 @@ * SOFTWARE. 
* **********************************************************************************************************************/ -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #define GC_DSTBUFFER #define GC_SCRATCHBUFFER diff --git a/src/shaders/TraceRay.hlsl b/src/shaders/TraceRay.hlsl index 334e2dc..5b9f06c 100644 --- a/src/shaders/TraceRay.hlsl +++ b/src/shaders/TraceRay.hlsl @@ -265,11 +265,17 @@ static bool TraceRayCommon( { if ((rayFlags & RAY_FLAG_SKIP_CLOSEST_HIT_SHADER) == 0) { - const uint instanceContribution = (result.instanceContribution & 0x00ffffff); - const HitGroupInfo hitInfo = GetHitGroupInfo(rayContributionToHitGroupIndex, - multiplierForGeometryContributionToShaderIndex, - result.geometryIndex, - instanceContribution); + uint instanceContribution = 0; + HitGroupInfo hitInfo = (HitGroupInfo)0; + + { + instanceContribution = (result.instanceContribution & 0x00ffffff); + hitInfo = GetHitGroupInfo(rayContributionToHitGroupIndex, + multiplierForGeometryContributionToShaderIndex, + result.geometryIndex, + instanceContribution); + } + uint64_t instNodePtr64 = 0; { instNodePtr64 = CalculateInstanceNodePtr64(rtIpLevel, accelStruct, result.instNodePtr); diff --git a/src/shaders/Update.hlsl b/src/shaders/Update.hlsl index 3818053..3c091ad 100644 --- a/src/shaders/Update.hlsl +++ b/src/shaders/Update.hlsl @@ -47,7 +47,7 @@ struct RootConstants uint numThreads; }; -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" [[vk::push_constant]] ConstantBuffer ShaderRootConstants : register(b0); [[vk::binding(1, 1)]] ConstantBuffer ShaderConstants : register(b1); diff --git a/src/shaders/UpdateParallel.hlsl b/src/shaders/UpdateParallel.hlsl index 56c782a..7af9953 100644 --- a/src/shaders/UpdateParallel.hlsl +++ b/src/shaders/UpdateParallel.hlsl @@ -44,7 +44,7 @@ struct RootConstants uint numThreads; }; -#include "../shared/rayTracingDefs.h" +#include 
"../shadersClean/common/ShaderDefs.hlsli" [[vk::push_constant]] ConstantBuffer ShaderRootConstants : register(b0); [[vk::binding(1, 1)]] ConstantBuffer ShaderConstants : register(b1); diff --git a/src/shaders/UpdateQBVH.hlsl b/src/shaders/UpdateQBVH.hlsl index ae818c8..d0d253c 100644 --- a/src/shaders/UpdateQBVH.hlsl +++ b/src/shaders/UpdateQBVH.hlsl @@ -44,7 +44,7 @@ struct RootConstants uint numThreads; }; -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" [[vk::push_constant]] ConstantBuffer ShaderRootConstants : register(b0); [[vk::binding(1, 1)]] ConstantBuffer ShaderConstants : register(b1); diff --git a/src/shaders/CopyAS.hlsl b/src/shadersClean/build/CopyAS.hlsl similarity index 98% rename from src/shaders/CopyAS.hlsl rename to src/shadersClean/build/CopyAS.hlsl index 2ca420e..dd1354c 100644 --- a/src/shaders/CopyAS.hlsl +++ b/src/shadersClean/build/CopyAS.hlsl @@ -22,8 +22,8 @@ * SOFTWARE. * **********************************************************************************************************************/ -#include "../../gpurt/gpurtAccelStruct.h" -#include "../shared/rayTracingDefs.h" +#include "../../../gpurt/gpurtAccelStruct.h" +#include "../common/ShaderDefs.hlsli" // Note, CBV(b255) must be the last used binding in the root signature. 
#define RootSig "RootConstants(num32BitConstants=3, b0, visibility=SHADER_VISIBILITY_ALL), "\ diff --git a/src/shadersClean/common/InstanceDesc.hlsli b/src/shadersClean/common/InstanceDesc.hlsli index 09f910c..35cbe1c 100644 --- a/src/shadersClean/common/InstanceDesc.hlsli +++ b/src/shadersClean/common/InstanceDesc.hlsli @@ -25,7 +25,7 @@ #ifndef INSTANCE_DESC_HLSLI #define INSTANCE_DESC_HLSLI -#include "TempAssert.hlsli" +#include "../../shared/assert.h" //===================================================================================================================== // 64-byte aligned structure matching D3D12_RAYTRACING_INSTANCE_DESC diff --git a/src/shadersClean/common/NodePointers.hlsli b/src/shadersClean/common/NodePointers.hlsli index 46e6fa3..9e690e8 100644 --- a/src/shadersClean/common/NodePointers.hlsli +++ b/src/shadersClean/common/NodePointers.hlsli @@ -26,7 +26,7 @@ #ifndef NODE_POINTERS_HLSLI #define NODE_POINTERS_HLSLI -#include "../common/TempAssert.hlsli" +#include "../../shared/assert.h" //===================================================================================================================== // Node pointer size in bytes diff --git a/src/shadersClean/common/ShaderDefs.hlsli b/src/shadersClean/common/ShaderDefs.hlsli index 3ca709b..f552f78 100644 --- a/src/shadersClean/common/ShaderDefs.hlsli +++ b/src/shadersClean/common/ShaderDefs.hlsli @@ -37,7 +37,7 @@ #define DUMMY_FLOAT2_FUNC { return float2(0, 0); } #define DUMMY_FLOAT3_FUNC { return float3(0, 0, 0); } -#include "TempAssert.hlsli" +#include "../../shared/assert.h" // TODO: there are functions that use values from these files, but really // those functions should be in these files, and then the files that use the functions @@ -49,6 +49,8 @@ #include "gfx10/InstanceNode1_0.hlsli" #include "NodePointers.hlsli" +#include "../../shared/rayTracingDefs.h" + #define SAH_COST_TRIANGLE_INTERSECTION 1.5 #define SAH_COST_AABBB_INTERSECTION 1 @@ -473,14 +475,22 @@ enum RebraidType : 
uint //===================================================================================================================== struct TriangleData { +#ifdef __cplusplus + TriangleData(uint val) + { + memset(this, val, sizeof(TriangleData)); + } + + TriangleData() : TriangleData(0) + {} +#endif float3 v0; ///< Vertex 0 float3 v1; ///< Vertex 1 float3 v2; ///< Vertex 2 }; #ifndef LIBRARY_COMPILATION -// This does not include RayTracingDefs.h as the goal is -// to eventually have everything in this file alone + #endif #endif diff --git a/src/shadersClean/common/TempAssert.hlsli b/src/shadersClean/common/TempAssert.hlsli deleted file mode 100644 index 1407fe8..0000000 --- a/src/shadersClean/common/TempAssert.hlsli +++ /dev/null @@ -1,38 +0,0 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ - -// TODO: this is a temporary assert file to allow files with asserts to be "clean" -// while the assert file itself cannot be. We need this as we have to move files out of "shared" -// which use assert.h, but cannot then include assert.h as "clean" inclusion of shared files isn't set up yet, -// *because* there are too many files in shared, and they can't be moved out because -// they use assert.h and... (cyclical issue) - -#ifndef ASSERT_HLSLI -#define ASSERT_HLSLI -#ifndef GPURT_STATIC_ASSERT -// _Static_assert is not supported with -spirv: https://github.com/microsoft/DirectXShaderCompiler/issues/5750 -#define GPURT_STATIC_ASSERT(condition, message) -#endif -#endif diff --git a/src/shadersClean/common/gfx10/BoxNode1_0.hlsli b/src/shadersClean/common/gfx10/BoxNode1_0.hlsli index 6103e61..6623b47 100644 --- a/src/shadersClean/common/gfx10/BoxNode1_0.hlsli +++ b/src/shadersClean/common/gfx10/BoxNode1_0.hlsli @@ -25,7 +25,7 @@ #ifndef BOX_NODE_1_1_HLSLI #define BOX_NODE_1_1_HLSLI -#include "../TempAssert.hlsli" +#include "../../../shared/assert.h" //===================================================================================================================== // Hardware 32-bit box node format and offsets diff --git a/src/shadersClean/common/gfx10/InstanceNode1_0.hlsli b/src/shadersClean/common/gfx10/InstanceNode1_0.hlsli index ae0280d..e615089 100644 --- a/src/shadersClean/common/gfx10/InstanceNode1_0.hlsli +++ b/src/shadersClean/common/gfx10/InstanceNode1_0.hlsli @@ -27,7 +27,7 @@ #include "BoxNode1_0.hlsli" #include "../InstanceDesc.hlsli" -#include 
"../TempAssert.hlsli" +#include "../../../shared/assert.h" //===================================================================================================================== struct InstanceSidebandData1_1 diff --git a/src/shadersClean/common/gfx10/ProceduralNode1_0.hlsli b/src/shadersClean/common/gfx10/ProceduralNode1_0.hlsli index 4431ecd..b8e01ec 100644 --- a/src/shadersClean/common/gfx10/ProceduralNode1_0.hlsli +++ b/src/shadersClean/common/gfx10/ProceduralNode1_0.hlsli @@ -25,7 +25,7 @@ #ifndef PROCEDURAL_NODE_1_1_HLSLI #define PROCEDURAL_NODE_1_1_HLSLI -#include "../TempAssert.hlsli" +#include "../../../shared/assert.h" //===================================================================================================================== #define USER_NODE_PROCEDURAL_MIN_OFFSET 0 diff --git a/src/shadersClean/common/gfx10/TriangleNode1_0.hlsli b/src/shadersClean/common/gfx10/TriangleNode1_0.hlsli index 0d9d1eb..7e618be 100644 --- a/src/shadersClean/common/gfx10/TriangleNode1_0.hlsli +++ b/src/shadersClean/common/gfx10/TriangleNode1_0.hlsli @@ -25,7 +25,7 @@ #ifndef TRIANGLE_NODE_1_0_HLSLI #define TRIANGLE_NODE_1_0_HLSLI -#include "../TempAssert.hlsli" +#include "../../../shared/assert.h" //===================================================================================================================== // Hardware triangle node format and offsets diff --git a/src/shadersClean/traversal/TraversalDefs.hlsli b/src/shadersClean/traversal/TraversalDefs.hlsli index 8541f35..28f9999 100644 --- a/src/shadersClean/traversal/TraversalDefs.hlsli +++ b/src/shadersClean/traversal/TraversalDefs.hlsli @@ -25,7 +25,7 @@ #ifndef TRAVERSAL_DEFS_HLSLI #define TRAVERSAL_DEFS_HLSLI -#include "../common/TempAssert.hlsli" +#include "../../shared/assert.h" #define ENCODE_FLAG_ARRAY_OF_POINTERS 0x00000001 #define ENCODE_FLAG_UPDATE_IN_PLACE 0x00000002 @@ -93,6 +93,15 @@ struct RaySystemData // Ray description matching the D3D12 HLSL header struct RayDesc { +#ifdef __cplusplus + 
RayDesc(uint val) + { + memset(this, val, sizeof(RayDesc)); + } + + RayDesc() : RayDesc(0) + {} +#endif float3 Origin; float TMin; float3 Direction; diff --git a/src/shared/rayTracingDefs.h b/src/shared/rayTracingDefs.h index 6dfec65..5778dfd 100644 --- a/src/shared/rayTracingDefs.h +++ b/src/shared/rayTracingDefs.h @@ -27,10 +27,6 @@ #ifndef _RAYTRACING_DEF_H #define _RAYTRACING_DEF_H -#ifndef __cplusplus -#include "../shadersClean/common/ShaderDefs.hlsli" -#endif - #include "../../gpurt/gpurtAccelStruct.h" #include "../../gpurt/gpurtBuildSettings.h" #include "../../gpurt/gpurtDispatch.h" diff --git a/tools/CompileRTShaders.py b/tools/CompileRTShaders.py index a967fdf..95a90d3 100644 --- a/tools/CompileRTShaders.py +++ b/tools/CompileRTShaders.py @@ -136,7 +136,7 @@ def isBVH(self): ShaderConfig(path="EmitAS.hlsl", entryPoint="EmitCompactSize"), ShaderConfig(path="EmitAS.hlsl", entryPoint="EmitSerializeDesc"), ShaderConfig(path="EmitAS.hlsl", entryPoint="EmitToolVisDesc"), - ShaderConfig(path="CopyAS.hlsl", entryPoint="CopyAS"), + ShaderConfig(path="../shadersClean/build/CopyAS.hlsl", entryPoint="CopyAS"), ShaderConfig(path="CompactAS.hlsl", entryPoint="CompactAS"), ShaderConfig(path="DecodeAS.hlsl", entryPoint="DecodeAS"), ShaderConfig(path="SerializeAS.hlsl", entryPoint="SerializeAS"), @@ -313,6 +313,37 @@ def validateCompilation(cmd: [str], path: pathlib.Path) -> bool: return True +"""
+Validates the organization of files in the shared folder to enforce a cpp/h, src/header sort of structure +This helps keep the shader library untangled and easier to maintain. +#define'ing LIBRARY_COMPILATION enables including files in any order and does not include implementation dependencies.
+""" +def validateShared(args) -> bool: + cmdBase = getValidationCmdArgs(args) + # use resolve() + as_posix() to avoid path mismatches when using drive mapping + srcPath = pathlib.Path(FixInputPath(args.basepath)).parent.resolve() + + gpurtInterfacePath = (srcPath / "../gpurt").resolve() + sharedPath = srcPath / "shared" + generatedFilepath = pathlib.Path(args.g_FilePath) + implExt = "._unused_" + headerExt = ".h" + + # shared files need to be able to include the gpurt interface files due to the requirements of the interface + # we treat this as an exception for rules about what files can be included + + for path, (hasImpl, hasHeader) in getImplInterfacePairs(sharedPath, implExt, headerExt).items(): + assert (hasHeader and not hasImpl), "Shared files should be header only." + fullPath = path.with_suffix(path.suffix + (implExt if hasImpl else headerExt)) + for defines in getDefineCombos(fullPath): + compileCmd = cmdBase + defines + [fullPath.as_posix()] + if not validateIncludes(compileCmd, path, implExt, headerExt, [(sharedPath, headerExt), (gpurtInterfacePath, ".h"), (generatedFilepath, ".h")]): + return False + if not validateCompilation(compileCmd, fullPath): + return False + + return True + """ Validates the organization of shaders to enforce cpp/h a src/header sort of structure This helps keep the shader library untangled and easier to maintain. 
@@ -322,6 +353,11 @@ def validateShadersClean(args) -> bool: cmdBase = getValidationCmdArgs(args) # use resolve() + as_posix() to avoid path mismatches when using drive mapping srcPath = pathlib.Path(FixInputPath(args.basepath)).parent.resolve() + + gpurtInterfacePath = (srcPath / "../gpurt").resolve() + sharedPath = srcPath / "shared" + generatedFilepath = pathlib.Path(args.g_FilePath) +# Validation of the shadersClean folder shadersCleanPath = srcPath / "shadersClean" implExt = ".hlsl" @@ -331,11 +367,10 @@ def validateShadersClean(args) -> bool: fullPath = path.with_suffix(path.suffix + (implExt if hasImpl else headerExt)) for defines in getDefineCombos(fullPath): compileCmd = cmdBase + defines + [fullPath.as_posix()] - if not validateIncludes(compileCmd, path, implExt, headerExt, [(shadersCleanPath, headerExt)]): + if not validateIncludes(compileCmd, path, implExt, headerExt, [(shadersCleanPath, headerExt), (sharedPath, ".hlsli"), (gpurtInterfacePath, ".h"), (generatedFilepath, ".h")]): return False if not validateCompilation(compileCmd, fullPath): return False - return True def isSpirvShader(shaderConfig, args): @@ -751,6 +786,7 @@ def main() -> int: parser.add_argument('--verbose', action='store_true', help='Output verbose inforation', default=False) parser.add_argument('--defines', help='Defines for the shader compiler, separated by ; or ,.', default="") parser.add_argument('--includePaths', help='Include paths for the shader compiler, separated by ; or ,.', default="") + parser.add_argument('--g_FilePath', help='Path to the build destination where generated headers are written', default="") parser.add_argument('--compilerPath', help='Path to standalone compiler.', default='./dxc.exe') parser.add_argument('--dxcompilerLibPath', help='Path to dxcompiler.dll/libdxcompiler.so', default='./dxcompiler.dll') parser.add_argument('--spirvRemapPath', help='Path to spirv-remap executable', default='./spirv-remap.exe') @@ -769,11 +805,14 @@ def main() -> int: tBegin = 
time.perf_counter() validIncludes = validateShadersClean(args) + validIncludes &= validateShared(args) + # For vulkan, we validate SPIR-V shaders in the same run instead of running the script again. if args.vulkan and not args.spirv: print("Now doing SPIR-V validation...") args.spirv = True validIncludes &= validateShadersClean(args) + validIncludes &= validateShared(args) tDuration = time.perf_counter() - tBegin if validIncludes: diff --git a/tools/DebugPreprocessShaders.py b/tools/DebugPreprocessShaders.py index b51c1bd..4793b96 100644 --- a/tools/DebugPreprocessShaders.py +++ b/tools/DebugPreprocessShaders.py @@ -26,6 +26,7 @@ import sys import os import re +import argparse cpp_file_header = """ /* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. */ @@ -62,7 +63,7 @@ def process_file(src_path, dst_path): for line in src_file: # Find something that looks like a GPU_ASSERT macro invocation (GPU_ASSERT + '(' or space) if line.find('#define') == -1: - m = re.match('.*GPU_ASSERT\s*(\()', line) + m = re.match('.*GPU_ASSERT\\s*(\\()', line) if m is not None: open_paren = m.start(1) assert_id = add_assert(src_name, line_num, line) @@ -71,7 +72,7 @@ def process_file(src_path, dst_path): line = line.replace('GPU_ASSERT', 'GPU_ASSERT_IMPL', 1) else: # Find something that looks like a GPU_DPF macro invocation (GPU_DPF + '(' or space) - m = re.match('.*GPU_DPF\s*(\().*"(.*)"', line) + m = re.match('.*GPU_DPF\\s*(\\().*"(.*)"', line) if m is not None: open_paren = m.start(1) msg_id = add_print_msg(src_name, line_num, m.group(2)) @@ -91,13 +92,26 @@ def generate_cpp_file(output_file_path): output_str += cpp_file_footer output_file.write(output_str) -def main(): +def main(cpp_file, input_pair_list): # Process each file in the argument list # The argments are pairs of input and ouput files then the path to the output file - for i in range(1, len(sys.argv) - 1, 2): - process_file(sys.argv[i], sys.argv[i+1]) - generate_cpp_file(sys.argv[-1]) + for i in range(0, 
len(input_pair_list), 2): + process_file(input_pair_list[i], input_pair_list[i+1]) + generate_cpp_file(cpp_file) return 0 if __name__ == '__main__': - sys.exit(main()) + parser = argparse.ArgumentParser( + prog='DebugPreprocessShaders', + description='Preprocesses shaders for GPU_ASSERT/GPU_DPF lines and generates a lookup table to match their text with their ID' + ) + parser.add_argument('-i', '--input', help='File containing a list of input shader/output processed shader path pairs, semicolon delimited', required=True) + parser.add_argument('-o', '--output', help='Path to output cpp header', required=True) + args = parser.parse_args() + + input_file = open(args.input, 'r') + # Strip any newlines or whitespace from the beginning/end, and split by ';' + input_pair_list = input_file.read().strip().split(';') + + sys.exit(main(args.output, input_pair_list)) + diff --git a/tools/DebugPreprocessShadersInput.txt.in b/tools/DebugPreprocessShadersInput.txt.in new file mode 100644 index 0000000..3f15488 --- /dev/null +++ b/tools/DebugPreprocessShadersInput.txt.in @@ -0,0 +1,2 @@ +${preprocessArgs} +