diff --git a/backends/pal/gpurtPalBackend.cpp b/backends/pal/gpurtPalBackend.cpp index 08f4571..d06394c 100644 --- a/backends/pal/gpurtPalBackend.cpp +++ b/backends/pal/gpurtPalBackend.cpp @@ -152,11 +152,11 @@ void PalBackend::WriteImmediateSingle( ImmediateDataWidth width ) const { - // We want to use HwPipePreCs (ME) so that the writes do not occur before UAV barriers are done waiting. + // We want to use StagePostPrefetch (ME) so that the writes do not occur before UAV barriers are done waiting. // Both internal barriers during the build and application barriers synchronizing access to acceleration - // structure memory wait at HwPipePreCs. + // structure memory wait at StagePostPrefetch. GetCmdBuffer(cmdBuffer)->CmdWriteImmediate( - Pal::HwPipePoint::HwPipePreCs, + Pal::PipelineStageFlag::PipelineStagePostPrefetch, value, GpuRtToPalImmediateDataWidth(width), destVa); diff --git a/cmake/GpuRtGenerateShaders.cmake b/cmake/GpuRtGenerateShaders.cmake index 4654fa0..42a779b 100644 --- a/cmake/GpuRtGenerateShaders.cmake +++ b/cmake/GpuRtGenerateShaders.cmake @@ -76,6 +76,7 @@ if (GPURT_ENABLE_GPU_DEBUG) set(debugShaderDirectory "${CMAKE_CURRENT_BINARY_DIR}/debugShaders/src/shaders/") set(gpurtShaderSource ${GPURT_SHADER_SOURCE_FILES}) set(gpurtShadersSourceDir ${debugShaderDirectory}) + set(gpurtShadersPreprocessInputFile "${CMAKE_CURRENT_BINARY_DIR}/debugShaders/DebugPreprocessShadersInput.txt") list(TRANSFORM gpurtShaderSource PREPEND "${debugShaderDirectory}") set(preprocessArgs "") foreach(originalSourceFile ${GPURT_SHADER_SOURCE_FILES}) @@ -84,10 +85,13 @@ if (GPURT_ENABLE_GPU_DEBUG) list(APPEND preprocessArgs "${originalSourcePath}" "${newSourceFilePath}") endforeach() set(gpurtDebugPreprocessorScript "${gpurtToolsDir}/DebugPreprocessShaders.py") + configure_file("${gpurtToolsDir}/DebugPreprocessShadersInput.txt.in" + ${gpurtShadersPreprocessInputFile} + ) add_custom_command( OUTPUT ${gpurtShaderSource} ${gpurtDebugInfoFile} - DEPENDS ${originalShaderSource} ${gpurtDebugPreprocessorScript} - COMMAND Python3::Interpreter ${gpurtDebugPreprocessorScript} ${preprocessArgs} ${gpurtDebugInfoFile} + DEPENDS ${originalShaderSource} ${gpurtDebugPreprocessorScript} ${gpurtShadersPreprocessInputFile} + COMMAND Python3::Interpreter ${gpurtDebugPreprocessorScript} -i ${gpurtShadersPreprocessInputFile} -o ${gpurtDebugInfoFile} ) else() set(gpurtShaderSource "${originalShaderSource}") diff --git a/gpurt/gpurt.h b/gpurt/gpurt.h index 68d5ef5..5d9d8e2 100644 --- a/gpurt/gpurt.h +++ b/gpurt/gpurt.h @@ -1471,13 +1471,21 @@ class IDevice // @param pDispatchRaysConstants (in/out) Non-null pointer to a DispatchRaysConstants // @param cpsMemoryGpuAddr (in) GPU address pointing to the beginning of cps memory // @param cpsMemoryBytes (in) Cps allocated memory size in bytes - // - // @return the required global memory allocation size in bytes virtual void PatchDispatchRaysConstants( DispatchRaysConstants* pDispatchRaysConstants, const gpusize cpsMemoryGpuAddr, const gpusize cpsMemoryBytes) = 0; + // Populates the GPU addresses in the InitExecuteIndirectConstants structure + // + // @param pInitExecuteIndirectConstants (in/out) Non-null pointer to an InitExecuteIndirectConstants + // @param cpsMemoryGpuAddr (in) GPU address pointing to the beginning of cps memory + // @param cpsMemoryBytes (in) Cps allocated memory size in bytes + virtual void PatchInitExecuteIndirectConstants( + GpuRt::InitExecuteIndirectConstants* pInitExecuteIndirectConstants, + const gpusize cpsMemoryGpuAddr, + const gpusize cpsMemoryBytes)
= 0; + // // @param cpsVideoMem [in] Cps video memory // @param cpsMemoryBytes [in] Cps allocated memory size in bytes @@ -1630,6 +1638,8 @@ class IDevice // Check if a build is a good candidate for ACE offload (typically barrier-free cases) virtual bool ShouldUseGangedAceForBuild(const AccelStructBuildInputs& inputs) const = 0; + virtual uint32 CalculateBvhPrimitiveCount(const AccelStructBuildInputs& inputs) const = 0; + protected: /// Client must create objects by explicitly calling CreateDevice method diff --git a/gpurt/gpurtDispatch.h b/gpurt/gpurtDispatch.h index 8f4ce03..fee7757 100644 --- a/gpurt/gpurtDispatch.h +++ b/gpurt/gpurtDispatch.h @@ -55,6 +55,8 @@ struct DispatchRaysTopLevelData uint32 accelStructTrackerSrd[MaxBufferSrdSize]; // Structured buffer SRD pointing to the accel struct tracker }; +#define DISPATCHRAYSCONSTANTDATA_STRUCT_OFFSET_DISPATCHID 48 + // Dispatch rays constant buffer data (GPU structure). Note, using unaligned uint64_t in HLSL constant buffers requires // -no-legacy-cbuf-layout for cpp style structure alignment to work. But currently that support is incomplete in DXC // and until that is resolved we need to use uint32's explicitly. @@ -74,7 +76,8 @@ struct DispatchRaysConstantData uint32 hitGroupTableBaseAddressLo; // Hit group table base address low 32-bits uint32 hitGroupTableBaseAddressHi; // Hit group table base address high 32-bits uint32 hitGroupTableStrideInBytes; // Hit group table record byte stride - uint32 reserved0; // Reserved padding + uint32 cpsDispatchId; // Continuations DispatchId, written in the persistent mode. + // This value should not be read via constant buffer. uint32 callableTableBaseAddressLo; // Callable shader table base address low 32-bits uint32 callableTableBaseAddressHi; // Callable shader table base address high 32-bits uint32 callableTableStrideInBytes; // Callable shader table byte stride @@ -96,6 +99,8 @@ struct DispatchRaysConstantData uint32 cpsGlobalMemoryAddressLo; // Separate CPS stack memory base address low 32-bits uint32 cpsGlobalMemoryAddressHi; // Separate CPS stack memory base address high 32-bits uint32 counterMask; // Mask for filtering ray history token + uint32 cpsDispatchIdAddressLo; // Continuations cpsDispatchId address low 32-bits + uint32 cpsDispatchIdAddressHi; // Continuations cpsDispatchId address high 32-bits }; #pragma pack(pop) @@ -109,6 +114,8 @@ struct DispatchRaysConstants #if __cplusplus static_assert((sizeof(DispatchRaysConstants) % sizeof(uint32)) == 0, "DispatchRaysConstants is not dword-aligned"); +static_assert(DISPATCHRAYSCONSTANTDATA_STRUCT_OFFSET_DISPATCHID == offsetof(DispatchRaysConstantData, cpsDispatchId), + "DISPATCHRAYSCONSTANTDATA_STRUCT_OFFSET_DISPATCHID mismatches to cpsDispatchId"); constexpr uint32 DispatchRaysConstantsDw = sizeof(DispatchRaysConstants) / sizeof(uint32); #endif @@ -132,6 +139,17 @@ struct InitExecuteIndirectUserData // Constants for InitExecuteIndirect shader struct InitExecuteIndirectConstants { +#if __cplusplus + // Internal counter buffer SRDs + uint32 internalUavSrd[MaxSupportedIndirectCounters][MaxBufferSrdSize]; + + // Internal acceleration structure tracker buffer SRD. 
+ uint32 accelStructTrackerSrd[MaxBufferSrdSize]; +#else + uint4 internalUavSrd[MaxSupportedIndirectCounters][MaxBufferSrdSize / 4]; + uint4 accelStructTrackerSrd[MaxBufferSrdSize / 4]; +#endif + uint32 inputBytesPerDispatch; // Size of application indirect arguments uint32 outputBytesPerDispatch; // Size of resulting driver internal arguments uint32 bindingArgsSize; // Size of binding arguments in the app buffer preceding the dispatch @@ -160,18 +178,10 @@ struct InitExecuteIndirectConstants uint32 counterRayIdRangeBegin; // Counter ray ID range begin uint32 counterRayIdRangeEnd; // Counter ray ID range end uint32 cpsBackendStackSize; // Scratch memory used by a compiler backend, start at offset 0 - uint32 padding0; // Padding for 16-byte alignment + uint32 cpsFrontendStackSize; // Scratch memory used by IR (Intermediate Representation), for a continuation passing shader -#if __cplusplus - // Internal counter buffer SRDs - uint32 internalUavSrd[MaxSupportedIndirectCounters][MaxBufferSrdSize]; - - // Internal acceleration structure tracker buffer SRD. - uint32 accelStructTrackerSrd[MaxBufferSrdSize]; -#else - uint4 internalUavSrd[MaxSupportedIndirectCounters][MaxBufferSrdSize / 4]; - uint4 accelStructTrackerSrd[MaxBufferSrdSize / 4]; -#endif + uint32 cpsGlobalMemoryAddressLo; // Separate CPS stack memory base address low 32-bits + uint32 cpsGlobalMemoryAddressHi; // Separate CPS stack memory base address high 32-bits }; constexpr uint32 InitExecuteIndirectConstantsDw = sizeof(InitExecuteIndirectConstants) / sizeof(uint32); @@ -184,7 +194,7 @@ static_assert((MaxBufferSrdSize == 4), "Buffer SRD size changed, affected shader #endif static_assert((sizeof(InitExecuteIndirectConstants) % sizeof(uint32)) == 0, "InitExecuteIndirectConstants is not dword-aligned"); -} +} // namespace GpuRt #endif #endif diff --git a/src/gpurtBvhBuilder.cpp b/src/gpurtBvhBuilder.cpp index 47e4043..59287b6 100644 --- a/src/gpurtBvhBuilder.cpp +++ b/src/gpurtBvhBuilder.cpp @@ -1629,7 +1629,7 @@ AccelStructHeader BvhBuilder::InitAccelStructHeader() const header.geometryType = static_cast<uint32>(m_buildConfig.geometryType); header.uuidLo = Util::LowPart(m_deviceSettings.accelerationStructureUUID); header.uuidHi = Util::HighPart(m_deviceSettings.accelerationStructureUUID); - header.rtIpLevel = uint32(m_pDevice->GetRtIpLevel()); + header.rtIpLevel = static_cast<uint32>(PalToGpuRtIpLevel(m_pDevice->GetRtIpLevel())); if (m_buildConfig.topLevelBuild) { @@ -2313,8 +2313,8 @@ void BvhBuilder::GetAccelerationStructurePrebuildInfo( // the build when performing the update causing page faults. scratchDataSize = Util::Max(scratchDataSize, updateDataSize); - // Some applications crash when the driver reports 0 scratch size. Use 1 instead. - scratchDataSize = Util::Max(1u, scratchDataSize); + // Some applications crash when the driver reports 0 scratch size. Use 1 DWORD instead.
+ scratchDataSize = Util::Max(static_cast<uint32>(sizeof(uint32)), scratchDataSize); prebuildInfo.scratchDataSizeInBytes = scratchDataSize; prebuildInfo.updateScratchDataSizeInBytes = updateDataSize; diff --git a/src/gpurtDevice.cpp b/src/gpurtDevice.cpp index 6058d33..f3b2f7a 100644 --- a/src/gpurtDevice.cpp +++ b/src/gpurtDevice.cpp @@ -467,6 +467,18 @@ Pal::Result Device::InitializeCpsMemory( return result; } +//===================================================================================================================== +// Populates the GPU addresses in the Constant structure +template<typename ConstantsType> +void Device::PatchConstants(ConstantsType* pConstant, + const gpusize cpsMemoryGpuAddr, + const gpusize cpsMemoryBytes) +{ + pConstant->cpsGlobalMemoryAddressLo = Util::LowPart(cpsMemoryGpuAddr); + pConstant->cpsGlobalMemoryAddressHi = Util::HighPart(cpsMemoryGpuAddr); + +} + //===================================================================================================================== // Populates the GPU addresses in the DispatchRaysConstants structure void Device::PatchDispatchRaysConstants( @@ -474,9 +486,17 @@ void Device::PatchDispatchRaysConstants( const gpusize cpsMemoryGpuAddr, const gpusize cpsMemoryBytes) { - pDispatchRaysConstants->constData.cpsGlobalMemoryAddressLo = Util::LowPart(cpsMemoryGpuAddr); - pDispatchRaysConstants->constData.cpsGlobalMemoryAddressHi = Util::HighPart(cpsMemoryGpuAddr); + PatchConstants(&pDispatchRaysConstants->constData, cpsMemoryGpuAddr, cpsMemoryBytes); +} +//===================================================================================================================== +// Populates the GPU addresses in the InitExecuteIndirectConstants structure +void Device::PatchInitExecuteIndirectConstants( + GpuRt::InitExecuteIndirectConstants* pInitExecuteIndirectConstants, + const gpusize cpsMemoryGpuAddr, + const gpusize cpsMemoryBytes) +{ + PatchConstants(pInitExecuteIndirectConstants, cpsMemoryGpuAddr, cpsMemoryBytes); } //===================================================================================================================== @@ -2125,6 +2145,27 @@ bool Device::ShouldUseGangedAceForBuild( return shouldUseGangedAce; } +// ===================================================================================================================== +uint32 Device::CalculateBvhPrimitiveCount( + const AccelStructBuildInputs& inputs + ) const +{ + // For top-level acceleration structure, inputElementCount represents the number of instances + uint32 primitiveCount = (inputs.type == AccelStructType::TopLevel) ?
inputs.inputElemCount : 0; + + if (inputs.type == AccelStructType::BottomLevel) + { + for (uint32 i = 0; i < inputs.inputElemCount; ++i) + { + const Geometry geometry = m_clientCb.pfnConvertAccelStructBuildGeometry(inputs, i); + const uint32 geometryPrimCount = BvhBuilder::GetGeometryPrimCount(geometry); + primitiveCount += geometryPrimCount; + } + } + + return primitiveCount; +} + // ===================================================================================================================== const AccelStructBuildInputs Device::OverrideBuildInputs( const AccelStructBuildInputs& inputs diff --git a/src/gpurtInternal.h b/src/gpurtInternal.h index 7cf7f2c..f59b164 100644 --- a/src/gpurtInternal.h +++ b/src/gpurtInternal.h @@ -106,6 +106,42 @@ enum EncodeFlags : uint32 EncodeFlagFusedInstanceNode = 0x00000008, }; +// Values should remain stable for RRA binary-compatibility (PAL equivalents do not guarantee stability) +enum RtIpLevel : uint32 +{ + RtIpNone = 0x0, ///< The device does not have a RayTracing IP level + RtIp1_0 = 0x1, ///< First Implementation of HW RT + RtIp1_1 = 0x2, ///< Added computation of triangle barycentrics into HW + RtIp2_0 = 0x3, ///< Added more Hardware RayTracing features, such as BoxSort, PointerFlag, etc. + RtIpReserved = 0x5, ///< Special value, should not be used +}; + +// ===================================================================================================================== +// Convert PAL RtIpLevel values to their GpuRT equivalent +static RtIpLevel PalToGpuRtIpLevel(Pal::RayTracingIpLevel palRtIpLevel) +{ + RtIpLevel gpuRtIpLevel = RtIpLevel::RtIpNone; + + switch (palRtIpLevel) + { + case Pal::RayTracingIpLevel::RtIp1_0: + gpuRtIpLevel = RtIpLevel::RtIp1_0; + break; + case Pal::RayTracingIpLevel::RtIp1_1: + gpuRtIpLevel = RtIpLevel::RtIp1_1; + break; + case Pal::RayTracingIpLevel::RtIp2_0: + gpuRtIpLevel = RtIpLevel::RtIp2_0; + break; + case Pal::RayTracingIpLevel::None: + default: + gpuRtIpLevel = RtIpLevel::RtIpNone; + break; + } + + return gpuRtIpLevel; +} + struct RadixSortConfig { uint32 workGroupSize; @@ -336,13 +372,21 @@ class Device : public IDevice // @param pDispatchRaysConstants (in/out) Non-null pointer to a DispatchRaysConstants // @param cpsMemoryGpuAddr (in) GPU address pointing to the beginning of cps memory // @param cpsMemoryBytes (in) Cps allocated memory size in bytes - // - // @return the required global memory allocation size in bytes virtual void PatchDispatchRaysConstants( DispatchRaysConstants* pDispatchRaysConstants, const gpusize cpsMemoryGpuAddr, const gpusize cpsMemoryBytes) override; + // Populates the GPU addresses in the InitExecuteIndirectConstants structure + // + // @param pInitExecuteIndirectConstants (in/out) Non-null pointer to an InitExecuteIndirectConstants + // @param cpsMemoryGpuAddr (in) GPU address pointing to the beginning of cps memory + // @param cpsMemoryBytes (in) Cps allocated memory size in bytes + virtual void PatchInitExecuteIndirectConstants( + GpuRt::InitExecuteIndirectConstants* pInitExecuteIndirectConstants, + const gpusize cpsMemoryGpuAddr, + const gpusize cpsMemoryBytes) override; + // // @param cpsVideoMem [in] Cps video memory // @param cpsMemoryBytes [in] Cps allocated memory size in bytes @@ -683,6 +727,8 @@ class Device : public IDevice virtual bool ShouldUseGangedAceForBuild(const AccelStructBuildInputs& inputs) const override; + virtual uint32 CalculateBvhPrimitiveCount(const AccelStructBuildInputs& inputs) const override; + // Returns size in DWORDs of a typed buffer view
SRD uint32 GetTypedBufferSrdSizeDw() const { return m_typedBufferSrdSizeDw; }; @@ -722,6 +768,12 @@ class Device : public IDevice virtual ~Device() override; + template<typename ConstantsType> + void PatchConstants( + ConstantsType* pConstant, + const gpusize cpsMemoryGpuAddr, + const gpusize cpsMemoryBytes); + DeviceInitInfo m_info; Util::GenericAllocatorTracked m_allocator; diff --git a/src/gpurtTraceSource.cpp b/src/gpurtTraceSource.cpp index 36c5b3e..209a92c 100644 --- a/src/gpurtTraceSource.cpp +++ b/src/gpurtTraceSource.cpp @@ -51,7 +51,7 @@ void AccelStructTraceSource::OnTraceBegin( if (m_pDevice->AccelStructTrackerGpuAddr() != 0) { // Before starting the trace set tracking to enabled. - pCmdBuf->CmdWriteImmediate(Pal::HwPipeBottom, 1, Pal::ImmediateDataWidth::ImmediateData32Bit, + pCmdBuf->CmdWriteImmediate(Pal::PipelineStageBottomOfPipe, 1, Pal::ImmediateDataWidth::ImmediateData32Bit, m_pDevice->AccelStructTrackerGpuAddr() + offsetof(AccelStructTracker, enabled)); m_pDevice->RaytracingBarrier(pCmdBuf, BarrierFlagSyncPostCpWrite); } @@ -67,7 +67,7 @@ void AccelStructTraceSource::OnTraceEnd( if (m_pDevice->AccelStructTrackerGpuAddr() != 0) { // Disable tracking. - pCmdBuf->CmdWriteImmediate(Pal::HwPipeBottom, 0, Pal::ImmediateDataWidth::ImmediateData32Bit, + pCmdBuf->CmdWriteImmediate(Pal::PipelineStageBottomOfPipe, 0, Pal::ImmediateDataWidth::ImmediateData32Bit, m_pDevice->AccelStructTrackerGpuAddr() + offsetof(AccelStructTracker, enabled)); m_pDevice->RaytracingBarrier(pCmdBuf, BarrierFlagSyncPostCpWrite); } diff --git a/src/options.yaml b/src/options.yaml index 4ea170e..859c740 100644 --- a/src/options.yaml +++ b/src/options.yaml @@ -32,12 +32,12 @@ enum CpsCandidatePrimitiveMode: # Controls how candidate primitives are handled in the continuations (CPS) software Traversal loop. SuspendLane: # Suspend a lane upon candidate hits and wait for other lanes to end the Traversal loop. # This is the default. Other modes are experimental and might not be implemented on all RtIps. - SuspendWave: # On each Traversal iteration, check whether any lane has a candidate, and break if so. - # Only implemented for RtIp 2.0, all other cases use SuspendLane. DeferFirst: # When finding the first candidate, record it and ignore it for the time being. At the end of the # Traversal loop, process pending candidates. When finding the second candidate, immediately break # out of the loop to first process the first one. - # Only implemented for triangle primitives on RtIp 2.0, all other cases use SuspendLane. + # Implementation status: + # * RtIp 1.1: Not supported, SuspendLane is always used. + # * RtIp 2.0: DeferFirst is supported, but only for triangle primitives. # ------------------------------------------------------------------------------------------------------------------ # This is the definition of the single options struct.
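The DeferFirst mode described in the options.yaml hunk above is easiest to see as control flow. The following is an illustrative HLSL-style sketch of the scheme the option text describes, not code from this change; `pendingCandidate`, `hasPendingCandidate`, `ProcessCandidate`, and the loop locals are hypothetical names:

    // Sketch of CpsCandidatePrimitiveMode::DeferFirst (hypothetical names).
    bool hasPendingCandidate = false;
    while (IsValidNode(nextNodePtr))
    {
        // ... intersect the current node, possibly producing a candidate ...
        if (laneHasCandidate)
        {
            if (hasPendingCandidate == false)
            {
                // First candidate: record it and keep traversing for now.
                pendingCandidate    = candidate;
                hasPendingCandidate = true;
            }
            else
            {
                // Second candidate: break out so the first one is processed first.
                break;
            }
        }
    }
    if (hasPendingCandidate)
    {
        ProcessCandidate(pendingCandidate); // run AHS/IS for the deferred candidate
    }
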
diff --git a/src/shaders/BuildBVH.hlsl b/src/shaders/BuildBVH.hlsl index 202f0f9..8ec5c77 100644 --- a/src/shaders/BuildBVH.hlsl +++ b/src/shaders/BuildBVH.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #include "BuildRootSignature.hlsl" diff --git a/src/shaders/BuildBVHTDTR.hlsl b/src/shaders/BuildBVHTDTR.hlsl index 7d1d71f..2614a34 100644 --- a/src/shaders/BuildBVHTDTR.hlsl +++ b/src/shaders/BuildBVHTDTR.hlsl @@ -188,7 +188,7 @@ struct StateTDBuild #define USE_LDS 1 //===================================================================================================================== -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #define GC_DSTMETADATA #define GC_SCRATCHBUFFER diff --git a/src/shaders/BuildFastAgglomerativeLbvh.hlsl b/src/shaders/BuildFastAgglomerativeLbvh.hlsl index 526053c..56328b6 100644 --- a/src/shaders/BuildFastAgglomerativeLbvh.hlsl +++ b/src/shaders/BuildFastAgglomerativeLbvh.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #define GC_SCRATCHBUFFER #include "BuildRootSignature.hlsl" diff --git a/src/shaders/BuildPLOC.hlsl b/src/shaders/BuildPLOC.hlsl index 2c39642..079f1bc 100644 --- a/src/shaders/BuildPLOC.hlsl +++ b/src/shaders/BuildPLOC.hlsl @@ -88,7 +88,7 @@ struct BuildPlocArgs #include "Common.hlsl" //===================================================================================================================== -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #define GC_DSTMETADATA #define GC_SCRATCHBUFFER diff --git a/src/shaders/BuildParallel.hlsl b/src/shaders/BuildParallel.hlsl index eaf9090..8a3df86 100644 --- a/src/shaders/BuildParallel.hlsl +++ b/src/shaders/BuildParallel.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #define BUILD_PARALLEL 1 -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #define TASK_COUNTER_BUFFER ScratchGlobal #define TASK_COUNTER_OFFSET (ShaderConstants.offsets.taskLoopCounters + TASK_LOOP_BUILD_PARALLEL_COUNTER_OFFSET) diff --git a/src/shaders/BuildQBVH.hlsl b/src/shaders/BuildQBVH.hlsl index 512496a..60e527f 100644 --- a/src/shaders/BuildQBVH.hlsl +++ b/src/shaders/BuildQBVH.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #if NO_SHADER_ENTRYPOINT == 0 -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #define GC_DSTBUFFER #define GC_DSTMETADATA diff --git a/src/shaders/BuildRootSignature.hlsl b/src/shaders/BuildRootSignature.hlsl index 2df19b0..15c48e2 100644 --- a/src/shaders/BuildRootSignature.hlsl +++ b/src/shaders/BuildRootSignature.hlsl @@ -22,7 +22,7 @@ * SOFTWARE. 
* **********************************************************************************************************************/ -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" // DebugBuffer #if GPURT_ENABLE_GPU_DEBUG diff --git a/src/shaders/BuildSettings.hlsli b/src/shaders/BuildSettings.hlsli index 5929195..ac6e315 100644 --- a/src/shaders/BuildSettings.hlsli +++ b/src/shaders/BuildSettings.hlsli @@ -26,6 +26,8 @@ #ifndef _BUILDSETTINGS_HLSLI #define _BUILDSETTINGS_HLSLI +#include "../shadersClean/common/ShaderDefs.hlsli" + [[vk::constant_id(BUILD_SETTINGS_DATA_TOP_LEVEL_BUILD_ID)]] uint topLevelBuild = 0; [[vk::constant_id(BUILD_SETTINGS_DATA_BUILD_MODE_ID)]] uint buildMode = 0; [[vk::constant_id(BUILD_SETTINGS_DATA_TRIANGLE_COMPRESSION_MODE_ID)]] uint triangleCompressionMode = 0; diff --git a/src/shaders/CMakeLists.txt b/src/shaders/CMakeLists.txt index 54ef25e..ac75c24 100644 --- a/src/shaders/CMakeLists.txt +++ b/src/shaders/CMakeLists.txt @@ -59,7 +59,7 @@ set(gpurtHlsl CompactAS.hlsl CompactAS1_1.hlsl CompactCommon.hlsl - CopyAS.hlsl + ../shadersClean/build/CopyAS.hlsl ../shadersClean/build/CopyBufferRaw.hlsl DecodeAS.hlsl DecodeCommon.hlsl @@ -129,7 +129,6 @@ set(otherDeps ../shadersClean/common/InstanceDesc.hlsli ../shadersClean/common/NodePointers.hlsli ../shadersClean/common/ScratchNode.hlsli - ../shadersClean/common/TempAssert.hlsli ../shadersClean/traversal/TraversalDefs.hlsli ../shadersClean/common/gfx10/BoxNode1_0.hlsli ../shadersClean/common/gfx10/InstanceNode1_0.hlsli diff --git a/src/shaders/Common.hlsl b/src/shaders/Common.hlsl index 1b55ccf..92f72d9 100644 --- a/src/shaders/Common.hlsl +++ b/src/shaders/Common.hlsl @@ -34,7 +34,7 @@ #ifndef _COMMON_HLSL #define _COMMON_HLSL -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #include "../shadersClean/common/ScratchNode.hlsli" typedef AccelStructDataOffsets AccelStructOffsets; diff --git a/src/shaders/Continuations2_0.hlsl b/src/shaders/Continuations2_0.hlsl index 221143c..73293dc 100644 --- a/src/shaders/Continuations2_0.hlsl +++ b/src/shaders/Continuations2_0.hlsl @@ -509,39 +509,10 @@ static void TraversalInternal2_0( } bool laneHasCandidate = (state < TRAVERSAL_STATE_COMMITTED_NOTHING); - if (Options::getCpsCandidatePrimitiveMode() == CpsCandidatePrimitiveMode::SuspendWave) + if (laneHasCandidate) { - - // Stopping the Traversal loop for the whole wave on the first AHS/IS might be too aggressive. - // We implement this basic version here as basis for further experiments. 
- // Delaying it a bit could have potential benefits: - // * avoid overhead of wave-intrinsic in every iteration (depending on the implementation of delaying) - // * letting more lanes join the IS/AHS work - if (WaveActiveAnyTrue(laneHasCandidate)) - { - if (laneHasCandidate) - { - // Break out of traversal to run AHS/IS - } - else if (IsValidNode(nextNodePtr)) - { - // Break out of traversal so other lanes can run AHS/IS and re-join traversal - state = TRAVERSAL_STATE_SUSPEND_TRAVERSAL; - } - else - { - // The lane is done with Traversal, and wants to run CHS or Miss - } - break; - } - } - else - { - if (laneHasCandidate) - { - // Break out of traversal to run AHS/IS - break; - } + // Break out of traversal to run AHS/IS + break; } } diff --git a/src/shaders/EncodeCommon.hlsl b/src/shaders/EncodeCommon.hlsl index fb666c6..0aa00b0 100644 --- a/src/shaders/EncodeCommon.hlsl +++ b/src/shaders/EncodeCommon.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #include "BuildCommonScratch.hlsl" -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #include "TrianglePrimitive.hlsl" #include "UpdateCommon.hlsl" diff --git a/src/shaders/EncodeNodes.hlsl b/src/shaders/EncodeNodes.hlsl index 2075069..3ee98e6 100644 --- a/src/shaders/EncodeNodes.hlsl +++ b/src/shaders/EncodeNodes.hlsl @@ -22,7 +22,7 @@ * SOFTWARE. * **********************************************************************************************************************/ -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #define GC_DSTBUFFER #define GC_DSTMETADATA diff --git a/src/shaders/EncodeTopLevel.hlsl b/src/shaders/EncodeTopLevel.hlsl index 00419bc..90c1954 100644 --- a/src/shaders/EncodeTopLevel.hlsl +++ b/src/shaders/EncodeTopLevel.hlsl @@ -22,7 +22,7 @@ * SOFTWARE. * **********************************************************************************************************************/ -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #include "BuildRootSignature.hlsl" diff --git a/src/shaders/EncodeTopLevelBuild.hlsl b/src/shaders/EncodeTopLevelBuild.hlsl index 2424f4a..097c3ac 100644 --- a/src/shaders/EncodeTopLevelBuild.hlsl +++ b/src/shaders/EncodeTopLevelBuild.hlsl @@ -22,7 +22,7 @@ * SOFTWARE. 
* **********************************************************************************************************************/ -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #include "../shadersClean/common/ScratchNode.hlsli" //===================================================================================================================== diff --git a/src/shaders/Extensions.hlsl b/src/shaders/Extensions.hlsl index e78c92b..176dff2 100644 --- a/src/shaders/Extensions.hlsl +++ b/src/shaders/Extensions.hlsl @@ -29,6 +29,13 @@ #include "../shadersClean/common/Extensions.hlsli" #include "../shadersClean/common/Math.hlsli" +#define AmdExtD3DShaderIntrinsicsWaveOp_MinF 0x07 +#define AmdExtD3DShaderIntrinsicsWaveOp_MaxF 0x0a +#define AmdExtD3DShaderIntrinsicsWaveOp_Inclusive 0x01 + +#define AmdExtClusteredSubgroup 3 +#define AmdExtClusteredReduce 3 + // Dummy implementation for Vulkan build only __decl uint AmdExtD3DShaderIntrinsics_LoadDwordAtAddr( uint gpuVaLoBits, uint gpuVaHiBits, uint offset) DUMMY_UINT_FUNC @@ -57,10 +64,6 @@ __decl uint2 AmdExtD3DShaderIntrinsics_AtomicMinU64( __decl uint2 AmdExtD3DShaderIntrinsics_ShaderClock() DUMMY_UINT2_FUNC __decl uint2 AmdExtD3DShaderIntrinsics_ShaderRealtimeClock() DUMMY_UINT2_FUNC -#define AmdExtD3DShaderIntrinsicsWaveOp_MinF 0x07 -#define AmdExtD3DShaderIntrinsicsWaveOp_MaxF 0x0a -#define AmdExtD3DShaderIntrinsicsWaveOp_Inclusive 0x01 - __decl float3 AmdExtD3DShaderIntrinsics_WaveScan( uint waveOp, uint flags, float3 src) DUMMY_FLOAT3_FUNC @@ -112,56 +115,115 @@ __decl float3 AmdExtD3DShaderIntrinsics_FloatOpWithRoundMode( uint roundMode, uint operation, float3 src0, float3 src1) DUMMY_FLOAT3_FUNC //===================================================================================================================== -// Sub-group wave reductions +// Sub-group wave reductions spirv ops // Ref: https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#_instructions [[vk::ext_capability(/* GroupNonUniform */ 61)]] [[vk::ext_capability(/* GroupNonUniformArithmetic */ 63)]] [[vk::ext_capability(/* GroupNonUniformClustered */ 67)]] - [[vk::ext_instruction(350)]] float spirv_OpGroupNonUniformFAdd_clustered(uint scope, [[vk::ext_literal]] uint op, float value, uint clusterSize); +[[vk::ext_capability(/* GroupNonUniform */ 61)]] +[[vk::ext_capability(/* GroupNonUniformArithmetic */ 63)]] +[[vk::ext_capability(/* GroupNonUniformClustered */ 67)]] +[[vk::ext_instruction(355)]] +float spirv_OpGroupNonUniformFMin_clustered(uint scope, [[vk::ext_literal]] uint op, float value, uint clusterSize); + +[[vk::ext_capability(/* GroupNonUniform */ 61)]] +[[vk::ext_capability(/* GroupNonUniformArithmetic */ 63)]] +[[vk::ext_capability(/* GroupNonUniformClustered */ 67)]] +[[vk::ext_instruction(358)]] +float spirv_OpGroupNonUniformFMax_clustered(uint scope, [[vk::ext_literal]] uint op, float value, uint clusterSize); + +[[vk::ext_capability(/* GroupNonUniform */ 61)]] +[[vk::ext_capability(/* GroupNonUniformArithmetic */ 63)]] +[[vk::ext_capability(/* GroupNonUniformClustered */ 67)]] +[[vk::ext_instruction(359)]] +uint spirv_OpGroupNonUniformBitwiseAnd_clustered(uint scope, [[vk::ext_literal]] uint op, uint value, uint clusterSize); + +[[vk::ext_capability(/* GroupNonUniform */ 61)]] +[[vk::ext_capability(/* GroupNonUniformArithmetic */ 63)]] +[[vk::ext_capability(/* GroupNonUniformClustered */ 67)]] +[[vk::ext_instruction(360)]] +uint spirv_OpGroupNonUniformBitwiseOr_clustered(uint scope, [[vk::ext_literal]] uint op, uint 
value, uint clusterSize); + +//===================================================================================================================== +// GpuRt WaveClusterSum Intrinsics float AmdExtD3DShaderIntrinsics_WaveClusterSum(float x, uint dxClusterSize) { const uint clusterSize = (1u << (dxClusterSize - 1)); - return spirv_OpGroupNonUniformFAdd_clustered(/* Subgroup */ 3, /* ClusteredReduce */ 3, x, clusterSize); + return spirv_OpGroupNonUniformFAdd_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, x, clusterSize); } -[[vk::ext_instruction(355)]] -float spirv_OpGroupNonUniformFMin_clustered(uint scope, [[vk::ext_literal]] uint op, float value, uint clusterSize); - +//===================================================================================================================== +// GpuRt WaveClusterMin Intrinsics float AmdExtD3DShaderIntrinsics_WaveClusterMin(float x, uint dxClusterSize) { const uint clusterSize = (1u << (dxClusterSize - 1)); - return spirv_OpGroupNonUniformFMin_clustered(/* Subgroup */ 3, /* ClusteredReduce */ 3, x, clusterSize); + return spirv_OpGroupNonUniformFMin_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, x, clusterSize); } -[[vk::ext_instruction(358)]] -float spirv_OpGroupNonUniformFMax_clustered(uint scope, [[vk::ext_literal]] uint op, float value, uint clusterSize); +float2 AmdExtD3DShaderIntrinsics_WaveClusterMin(float2 val, uint dxClusterSize) +{ + float2 result; + const uint clusterSize = (1u << (dxClusterSize - 1)); + result.x = spirv_OpGroupNonUniformFMin_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val.x, clusterSize); + result.y = spirv_OpGroupNonUniformFMin_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val.y, clusterSize); + return result; +} -float AmdExtD3DShaderIntrinsics_WaveClusterMax(float x, uint dxClusterSize) +float3 AmdExtD3DShaderIntrinsics_WaveClusterMin(float3 val, uint dxClusterSize) { + float3 result; const uint clusterSize = (1u << (dxClusterSize - 1)); - return spirv_OpGroupNonUniformFMax_clustered(/* Subgroup */ 3, /* ClusteredReduce */ 3, x, clusterSize); + result.x = spirv_OpGroupNonUniformFMin_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val.x, clusterSize); + result.y = spirv_OpGroupNonUniformFMin_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val.y, clusterSize); + result.z = spirv_OpGroupNonUniformFMin_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val.z, clusterSize); + return result; } -[[vk::ext_instruction(359)]] -uint spirv_OpGroupNonUniformBitwiseAnd_clustered(uint scope, [[vk::ext_literal]] uint op, uint value, uint clusterSize); +//===================================================================================================================== +// GpuRt WaveClusterMax Intrinsics +float AmdExtD3DShaderIntrinsics_WaveClusterMax(float val, uint dxClusterSize) +{ + const uint clusterSize = (1u << (dxClusterSize - 1)); + return spirv_OpGroupNonUniformFMax_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val, clusterSize); +} -uint AmdExtD3DShaderIntrinsics_WaveClusterBitAnd(uint x, uint dxClusterSize) +float2 AmdExtD3DShaderIntrinsics_WaveClusterMax(float2 val, uint dxClusterSize) { + float2 result; const uint clusterSize = (1u << (dxClusterSize - 1)); - return spirv_OpGroupNonUniformBitwiseAnd_clustered(/* Subgroup */ 3, /* ClusteredReduce */ 3, x, clusterSize); + result.x = spirv_OpGroupNonUniformFMax_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val.x, clusterSize); + result.y = 
spirv_OpGroupNonUniformFMax_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val.y, clusterSize); + return result; } -[[vk::ext_instruction(360)]] -uint spirv_OpGroupNonUniformBitwiseOr_clustered(uint scope, [[vk::ext_literal]] uint op, uint value, uint clusterSize); +float3 AmdExtD3DShaderIntrinsics_WaveClusterMax(float3 val, uint dxClusterSize) +{ + float3 result; + const uint clusterSize = (1u << (dxClusterSize - 1)); + result.x = spirv_OpGroupNonUniformFMax_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val.x, clusterSize); + result.y = spirv_OpGroupNonUniformFMax_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val.y, clusterSize); + result.z = spirv_OpGroupNonUniformFMax_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, val.z, clusterSize); + return result; +} +//===================================================================================================================== +// GpuRt WaveClusterBitAnd Intrinsics +uint AmdExtD3DShaderIntrinsics_WaveClusterBitAnd(uint x, uint dxClusterSize) +{ + const uint clusterSize = (1u << (dxClusterSize - 1)); + return spirv_OpGroupNonUniformBitwiseAnd_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, x, clusterSize); +} + +//===================================================================================================================== +// GpuRt WaveClusterBitOr Intrinsics uint AmdExtD3DShaderIntrinsics_WaveClusterBitOr(uint x, uint dxClusterSize) { const uint clusterSize = (1u << (dxClusterSize - 1)); - return spirv_OpGroupNonUniformBitwiseOr_clustered(/* Subgroup */ 3, /* ClusteredReduce */ 3, x, clusterSize); + return spirv_OpGroupNonUniformBitwiseOr_clustered(AmdExtClusteredSubgroup, AmdExtClusteredReduce, x, clusterSize); } //===================================================================================================================== @@ -317,6 +379,7 @@ __decl uint64_t AmdExtConstantLoad64AtAddr(GpuVirtualAddress addr, uint offset) __decl uint AmdExtDispatchThreadIdFlat() DUMMY_UINT_FUNC; //===================================================================================================================== +__decl uint AmdExtAtomicAddAtAddr(uint64_t gpuVa, uint offset, uint value) DUMMY_UINT_FUNC; __decl uint64_t AmdExtAtomic64AddAtAddr(uint64_t gpuVa, uint offset, uint64_t value) DUMMY_UINT_FUNC __decl uint64_t AmdExtAtomic64CmpXchgAtAddr(uint64_t gpuVa, uint offset, uint64_t compare_value, uint64_t value) DUMMY_UINT_FUNC __decl uint64_t AmdExtLoad64AtAddrUncached(uint64_t gpuVa, uint offset) DUMMY_UINT_FUNC @@ -324,6 +387,12 @@ __decl uint AmdExtLoadDwordAtAddrUncached(uint64_t addr, uint offset) DUMMY_UIN __decl void AmdExtStoreDwordAtAddrUncached(uint64_t addr, uint offset, uint value) DUMMY_VOID_FUNC __decl uint3 AmdExtGroupIdCompute() DUMMY_UINT3_FUNC __decl uint3 AmdExtGroupDimCompute() DUMMY_UINT3_FUNC +__decl uint3 AmdExtThreadIdInGroupCompute() DUMMY_UINT3_FUNC +__decl uint AmdExtFlattenedThreadIdInGroupCompute() DUMMY_UINT_FUNC +__decl uint AmdExtLoadDwordAtAddr(uint64_t addr, uint offset) DUMMY_UINT_FUNC +__decl void AmdExtStoreDwordAtAddr(uint64_t addr, uint offset, uint value) DUMMY_VOID_FUNC +__decl void AmdExtDeviceMemoryAcquire() DUMMY_VOID_FUNC +__decl void AmdExtDeviceMemoryRelease() DUMMY_VOID_FUNC __decl uint AmdExtLaneCount() DUMMY_UINT_FUNC __decl void AmdExtSleep(uint value) DUMMY_VOID_FUNC
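A note on the wave-cluster wrappers above: `dxClusterSize` follows the DX intrinsic encoding, so the effective lane-cluster width is `1u << (dxClusterSize - 1)`, not `dxClusterSize` itself. A minimal usage sketch (illustrative only; the constant 3 is an arbitrary example value):

    // dxClusterSize = 3 selects clusters of (1u << (3 - 1)) = 4 lanes.
    const float laneValue  = (float)WaveGetLaneIndex();
    const float clusterMin = AmdExtD3DShaderIntrinsics_WaveClusterMin(laneValue, 3);
    // Lanes 0..3 all observe 0.0, lanes 4..7 observe 4.0, and so on.
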
diff --git a/src/shaders/GenerateMortonCodes.hlsl b/src/shaders/GenerateMortonCodes.hlsl index 6cd8bbd..79df409 100644 --- a/src/shaders/GenerateMortonCodes.hlsl +++ b/src/shaders/GenerateMortonCodes.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #include "BuildRootSignature.hlsl" #endif diff --git a/src/shaders/GpuRtLibraryCont.hlsl b/src/shaders/GpuRtLibraryCont.hlsl index de0dc26..293fae0 100644 --- a/src/shaders/GpuRtLibraryCont.hlsl +++ b/src/shaders/GpuRtLibraryCont.hlsl @@ -81,18 +81,13 @@ #define TRAVERSAL_STATE_COMMITTED_TRIANGLE_HIT 5 #define TRAVERSAL_STATE_COMMITTED_PROCEDURAL_PRIMITIVE_HIT 6 -// This state implies Traversal was stopped to run AHS/IS for other lanes. This lane wants to resume Traversal. -#define TRAVERSAL_STATE_SUSPEND_TRAVERSAL 7 - // Shader priorities for continuation scheduling. Higher values mean higher scheduling precedence. -// Reserve priority 0 as invalid value. This way, 0-initialized priorities in metadata-annotated -// function pointers (e.g. from relocations) can be detected. // Note: For 32-bit packing of function pointers, we require the scheduling priority to fit into 3 bits. -#define SCHEDULING_PRIORITY_INVALID 0 -#define SCHEDULING_PRIORITY_RGS 1 -#define SCHEDULING_PRIORITY_CHS 2 -#define SCHEDULING_PRIORITY_MISS 2 -#define SCHEDULING_PRIORITY_TRAVERSAL 3 +#define SCHEDULING_PRIORITY_PWG_DEAD 0 +#define SCHEDULING_PRIORITY_TRAVERSAL 1 +#define SCHEDULING_PRIORITY_RGS 2 +#define SCHEDULING_PRIORITY_CHS 3 +#define SCHEDULING_PRIORITY_MISS 3 // Give IS higher prio than AHS so AHS called by ReportHit // have a chance to run together with AHS called by Traversal.
#define SCHEDULING_PRIORITY_AHS 4 @@ -144,7 +139,7 @@ static uint GetPriorityForShaderType( case DXILShaderKind::AnyHit: return SCHEDULING_PRIORITY_AHS; case DXILShaderKind::Intersection: return SCHEDULING_PRIORITY_IS; case DXILShaderKind::RayGeneration: return SCHEDULING_PRIORITY_RGS; - default: return SCHEDULING_PRIORITY_INVALID; + default: GPU_ASSERT(false); return 0; } } @@ -153,60 +148,128 @@ static uint3 GetDispatchRaysDimensions(); //===================================================================================================================== -static uint64_t GetVpcWithPriority(uint64_t vpc, uint priority) -{ - if (_AmdIsLlpc()) +struct Vpc64 { + uint64_t vpc; + +#if defined(__cplusplus) + Vpc64(uint64_t value) : vpc(value) {} +#endif + + uint64_t GetU64() { return vpc; } - const uint64_t prio64 = priority; - const uint firstMetadataBit = 32; - const uint firstPriorityBitInMetadata = 16; - GPU_ASSERT((vpc & 0xFFFF000000000000) == 0); - return vpc | (prio64 << (firstMetadataBit + firstPriorityBitInMetadata)); -} + uint GetFunctionAddr() + { + return (vpc & 0xFFFFFFFF); + } + + bool IsValid() + { + return GetFunctionAddr() != 0; + } + + Vpc64 SetPriority(uint priority) + { + if (_AmdIsLlpc()) + { + return Vpc64(vpc); + } + + const uint64_t prio64 = (uint64_t)(priority); + const uint firstMetadataBit = 32; + const uint firstPriorityBitInMetadata = 16; + GPU_ASSERT((vpc & 0xFFFF000000000000) == 0); + vpc |= (prio64 << (firstMetadataBit + firstPriorityBitInMetadata)); + return Vpc64(vpc); + } + + uint GetPriority() + { + uint inMetadata = (uint)(vpc >> 32); + return (uint)(inMetadata >> 16); + } + + static Vpc64 MakeWithPriority(Vpc64 vpc64, uint priority) + { + return vpc64.SetPriority(priority); + } +}; + +struct Vpc32 { + uint32_t vpc; + +#if defined(__cplusplus) + Vpc32(uint32_t value) : vpc(value) {} +#endif + + uint32_t GetU32() + { + return vpc; + } + + uint32_t GetFunctionAddr() + { + return (uint32_t)(vpc & 0xFFFFFFC0); + } + + bool IsValid() + { + return GetFunctionAddr() != 0; + } + + void SetPriority(uint priority) + { + vpc |= priority; + } + + uint GetPriority() + { + return (uint)(vpc & 0x7); + } +}; //===================================================================================================================== // 32-bit function pointer packing/unpacking // -static uint64_t Unpack32BitVpcTo64BitVpc(uint32_t vpc32, bool unpackPriority) +static Vpc64 Vpc32ToVpc64(Vpc32 vpc32, bool unpackPriority) { if (_AmdIsLlpc()) { - return vpc32; + return Vpc64(vpc32.GetU32()); } - uint64_t vpc = (vpc32 & 0xFFFFFFC0); + Vpc64 vpc64 = Vpc64((uint64_t)(vpc32.GetFunctionAddr())); if (unpackPriority) { - // The priority is stored in bits 0..2. 
- uint32_t priority = (vpc32 & 0x7); - vpc = GetVpcWithPriority(vpc, priority); + vpc64.SetPriority(vpc32.GetPriority()); } - return vpc; + return vpc64; } -static uint32_t Pack64BitVpcTo32Bits(uint64_t vpc) +static Vpc32 Vpc64ToVpc32(Vpc64 vpc64) { + Vpc32 vpc32 = Vpc32((uint32_t)(vpc64.GetFunctionAddr())); + if (_AmdIsLlpc()) { - return (vpc & 0xFFFFFFFF); + return vpc32; } + GPU_ASSERT((vpc32.GetU32() & 0x2F) == 0); + // Incoming metadata is in the high dword - uint32_t inMetadata = (uint32_t)(vpc >> 32); - uint32_t prio = (inMetadata >> 16); + uint prio = vpc64.GetPriority(); + // We only have three bits for the priority: GPU_ASSERT(prio <= 7); - // Outgoing metadata is in the low 6 bits - uint32_t outMetadata = prio; + vpc32.SetPriority(prio); - GPU_ASSERT((vpc & 0x2F) == 0); - return SplitUint64(vpc).x | outMetadata; + return vpc32; } //===================================================================================================================== @@ -596,14 +659,14 @@ struct _AmdTraversalState return committed.State(); } - void PackReturnAddress(uint64_t returnAddr) + void PackReturnAddress(Vpc64 returnAddr) { - packedReturnAddr = Pack64BitVpcTo32Bits(returnAddr); + packedReturnAddr = Vpc64ToVpc32(returnAddr).GetU32(); } - uint64_t ReturnAddress() + Vpc64 ReturnAddress() { - return Unpack32BitVpcTo64BitVpc(packedReturnAddr, true); + return Vpc32ToVpc64(Vpc32(packedReturnAddr), true); } }; @@ -679,9 +742,7 @@ struct _AmdSystemData bool IsChsOrMiss(in uint state) { - return (state >= TRAVERSAL_STATE_COMMITTED_NOTHING) && - ((Options::getCpsCandidatePrimitiveMode() != CpsCandidatePrimitiveMode::SuspendWave) || - (state < TRAVERSAL_STATE_SUSPEND_TRAVERSAL)); + return (state >= TRAVERSAL_STATE_COMMITTED_NOTHING); } bool IsMiss(in uint state) @@ -762,19 +823,17 @@ struct _AmdTraversalResultData DECLARE_ENQUEUE(, uint64_t returnAddr, _AmdSystemData data) DECLARE_ENQUEUE(Traversal, uint64_t dummyReturnAddr, _AmdSystemData data) +DECLARE_ENQUEUE(TraversalDead, uint64_t dummyReturnAddr, _AmdDispatchSystemData data) DECLARE_ENQUEUE(RayGen, uint64_t dummyReturnAddr, _AmdDispatchSystemData data) -DECLARE_WAIT_ENQUEUE(Traversal, uint64_t dummyReturnAddr, _AmdSystemData data) -DECLARE_WAIT_ENQUEUE(Traversal, uint64_t dummyReturnAddr, _AmdDispatchSystemData data) DECLARE_ENQUEUE(AnyHit, uint64_t returnAddr, _AmdAnyHitSystemData data, float2 candidateBarycentrics) DECLARE_ENQUEUE(Intersection, uint64_t returnAddr, _AmdAnyHitSystemData data) -DECLARE_WAIT_ENQUEUE(, uint64_t returnAddr, _AmdSystemData data) DECLARE_AWAIT(AnyHit, _AmdAnyHitSystemData, uint64_t returnAddr, _AmdAnyHitSystemData data) DECLARE_AWAIT(CallShader, _AmdDispatchSystemData, uint64_t returnAddr, _AmdDispatchSystemData data) // No returnAddr argument. The return address is instead included in the passed system data. -DECLARE_WAIT_AWAIT(Traversal, _AmdDispatchSystemData, _AmdSystemData data) +DECLARE_AWAIT(Traversal, _AmdDispatchSystemData, _AmdSystemData data) DECLARE_RESTORE_SYSTEM_DATA(, _AmdDispatchSystemData data) DECLARE_RESTORE_SYSTEM_DATA(AnyHit, _AmdAnyHitSystemData data) @@ -826,36 +885,37 @@ inline _AmdSystemData _AmdSystemData::MakeDeadLaneWithoutStack() //===================================================================================================================== // Return the argument. 
-static uint64_t GetVpcFromShaderId(uint32_t shaderId, uint priority) +static Vpc64 GetVpc64FromShaderId(Vpc32 shaderId, uint priority) { - uint64_t vpc = Unpack32BitVpcTo64BitVpc(shaderId, /* unpackPriority = */ false); - return GetVpcWithPriority(vpc, priority); + Vpc64 vpc64 = Vpc32ToVpc64(shaderId, /* unpackPriority = */ false); + vpc64.SetPriority(priority); + return vpc64; } //===================================================================================================================== -static uint64_t GetVpcFromShaderIdAddr(GpuVirtualAddress addr, uint priority) +static Vpc64 GetVpc64FromShaderIdAddr(GpuVirtualAddress addr, uint priority) { #ifdef __cplusplus return 1; #else - uint32_t shaderId = ConstantLoadDwordAtAddr(addr); - return GetVpcFromShaderId(shaderId, priority); + Vpc32 shaderId = Vpc32(ConstantLoadDwordAtAddr(addr)); + return GetVpc64FromShaderId(shaderId, priority); #endif } //===================================================================================================================== -static uint64_t GetVpcFromShaderIdTable( +static Vpc64 GetVpc64FromShaderIdTable( GpuVirtualAddress tableAddress, uint index, uint stride, uint priority) { - return GetVpcFromShaderIdAddr(tableAddress + stride * index, priority); + return GetVpc64FromShaderIdAddr(tableAddress + stride * index, priority); } //===================================================================================================================== // Returns the 32-bit part of the hit group shader id containing the AHS shader id. -static uint32_t GetAnyHit32BitShaderId( +static Vpc32 GetAnyHit32BitShaderId( uint hitGroupRecordIndex) { const uint offset = DispatchRaysConstBuf.hitGroupTableStrideInBytes * hitGroupRecordIndex; @@ -864,18 +924,18 @@ static uint32_t GetAnyHit32BitShaderId( PackUint64(DispatchRaysConstBuf.hitGroupTableBaseAddressLo, DispatchRaysConstBuf.hitGroupTableBaseAddressHi); if (tableVa == 0) { - return 0; + return Vpc32(0); } - return ConstantLoadDwordAtAddr(tableVa + offset + 8); + return Vpc32(ConstantLoadDwordAtAddr(tableVa + offset + 8)); } //===================================================================================================================== // Returns the 64-bit VPC for the given AHS by loading its shader address, and setting the AHS priority. 
-static uint64_t GetAnyHitAddr( +static Vpc64 GetAnyHitAddr( uint hitGroupRecordIndex) { - uint32_t shaderId = GetAnyHit32BitShaderId(hitGroupRecordIndex); - return GetVpcFromShaderId(shaderId, SCHEDULING_PRIORITY_AHS); + Vpc32 shaderId = GetAnyHit32BitShaderId(hitGroupRecordIndex); + return GetVpc64FromShaderId(shaderId, SCHEDULING_PRIORITY_AHS); } //===================================================================================================================== @@ -891,7 +951,7 @@ static bool AnyHitIsNonNull( geometryContributionToHitGroupIndex, instanceContributionToHitGroupIndex); - return GetAnyHit32BitShaderId(hitGroupRecordIndex) != 0; + return GetAnyHit32BitShaderId(hitGroupRecordIndex).IsValid(); } //===================================================================================================================== @@ -942,14 +1002,6 @@ static float4x3 WorldToObject4x3(in uint64_t tlasBaseAddr, in uint instNodePtr) return transpose(WorldToObject3x4(tlasBaseAddr, instNodePtr)); } -//===================================================================================================================== -__decl uint3 AmdExtThreadIdInGroupCompute() DUMMY_UINT3_FUNC -__decl uint AmdExtFlattenedThreadIdInGroupCompute() DUMMY_UINT_FUNC -__decl uint AmdExtLoadDwordAtAddr(uint64_t addr, uint offset) DUMMY_UINT_FUNC -__decl void AmdExtStoreDwordAtAddr(uint64_t addr, uint offset, uint value) DUMMY_VOID_FUNC -__decl void AmdExtDeviceMemoryAcquire() DUMMY_VOID_FUNC -__decl void AmdExtDeviceMemoryRelease() DUMMY_VOID_FUNC - //===================================================================================================================== // Implementation of DispatchRaysIndex. export uint3 _cont_DispatchRaysIndex3(in _AmdDispatchSystemData data) @@ -970,7 +1022,7 @@ static uint3 GetDispatchRaysDimensions() //===================================================================================================================== // Persistent dispatch size (1D). -static uint3 GetPersistentDispatchSize() +static uint GetPersistentDispatchSize() { // Groups needed to cover the dispatch if each thread only processes 1 ray const uint3 rayDispatch = GetDispatchRaysDimensions(); @@ -1069,6 +1121,7 @@ static uint3 GetDispatchId() dispatchId.z = groupId.y; if ((dims.x > 1) && (dims.y > 1)) { + // Use 8 x (threadGroupSize / 8) tiles. /* Sample: D3D12_DISPATCH_RAYS_DESC::(w x h x d) = (18, 6, 1). Divided into 8x4 tiles(boxes). A number in a box is the group id. @@ -1334,20 +1387,27 @@ export uint64_t _cont_GetContinuationStackGlobalMemBase() } //===================================================================================================================== -static uint64_t GetTraversalVpc() +static Vpc64 GetTraversalVpc64() { // NOTE: DXCP uses a table for TraceRay, thus a load to traceRayGpuVa retrieves the actual traversal function // address. But Vulkan does not use the table so far, traceRayGpuVa is already the traversal function address. 
- return PackUint64(DispatchRaysConstBuf.traceRayGpuVaLo, - DispatchRaysConstBuf.traceRayGpuVaHi); + return Vpc64(PackUint64(DispatchRaysConstBuf.traceRayGpuVaLo, + DispatchRaysConstBuf.traceRayGpuVaHi)); } //===================================================================================================================== -static uint64_t GetRayGenVpc() +static Vpc64 GetTraversalVpc64PwgDead() { - return GetVpcFromShaderIdAddr(PackUint64(DispatchRaysConstBuf.rayGenerationTableAddressLo, - DispatchRaysConstBuf.rayGenerationTableAddressHi), - SCHEDULING_PRIORITY_RGS); + return Vpc64(PackUint64(DispatchRaysConstBuf.traceRayGpuVaLo, + DispatchRaysConstBuf.traceRayGpuVaHi)); +} + +//===================================================================================================================== +static Vpc64 GetRayGenVpc64() +{ + return GetVpc64FromShaderIdAddr(PackUint64(DispatchRaysConstBuf.rayGenerationTableAddressLo, + DispatchRaysConstBuf.rayGenerationTableAddressHi), + SCHEDULING_PRIORITY_RGS); } //===================================================================================================================== @@ -1610,7 +1670,6 @@ static uint2 RayHistoryGetIdentifierFromVPC(uint64_t vpc) //===================================================================================================================== static uint2 RayHistoryGetIdentifierFromShaderId(uint2 shaderId) { - // Zero out the dVGPR bits and the higher dWord return uint2(shaderId.x & 0xFFFFFFC0, 0); } @@ -1828,15 +1887,14 @@ export bool _cont_ReportHit(inout_param(_AmdAnyHitSystemData) data, float THit, hitGroupRecordIndex = data.base.dispatch.shaderRecIdx; } // Compute hit group address and fetch shader identifiers - const uint64_t anyHitAddr = GetAnyHitAddr(hitGroupRecordIndex); + const Vpc64 anyHitAddr = GetAnyHitAddr(hitGroupRecordIndex); - if (SplitUint64(anyHitAddr).x != 0) + if (anyHitAddr.IsValid()) { // Call AnyHit // Hit attributes are added as an additional argument by the compiler - const uint64_t resumeAddr = _AmdGetResumePointAddr(); - const uint64_t resumeAddrWithPrio = GetVpcWithPriority(resumeAddr, SCHEDULING_PRIORITY_IS); - data = _AmdAwaitAnyHit(anyHitAddr, resumeAddrWithPrio, data); + Vpc64 resumeAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetResumePointAddr()), SCHEDULING_PRIORITY_IS); + data = _AmdAwaitAnyHit(anyHitAddr.GetU64(), resumeAddr.GetU64(), data); _AmdRestoreSystemDataAnyHit(data); return data.base.ray.AnyHitDidAccept(); } @@ -1874,12 +1932,12 @@ export void _cont_CallShader(inout_param(_AmdDispatchSystemData) data, uint inde return; } - const uint64_t addr = GetVpcFromShaderIdTable(callableTableBaseAddress, - index, - DispatchRaysConstBuf.callableTableStrideInBytes, - SCHEDULING_PRIORITY_CALLABLE); + const Vpc64 addr = GetVpc64FromShaderIdTable(callableTableBaseAddress, + index, + DispatchRaysConstBuf.callableTableStrideInBytes, + SCHEDULING_PRIORITY_CALLABLE); - if (SplitUint64(addr).x == 0) + if (!addr.IsValid()) { // See TODO above on how to handle this case better. 
return; @@ -1890,10 +1948,9 @@ export void _cont_CallShader(inout_param(_AmdDispatchSystemData) data, uint inde const DXILShaderKind enclosingShaderType = _AmdGetShaderKind(); const uint resumePrio = GetPriorityForShaderType(enclosingShaderType); - const uint64_t resumeAddr = _AmdGetResumePointAddr(); - const uint64_t resumeAddrWithPrio = GetVpcWithPriority(resumeAddr, resumePrio); + const Vpc64 resumeAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetResumePointAddr()), resumePrio); - data = _AmdAwaitCallShader(addr, resumeAddrWithPrio, data); + data = _AmdAwaitCallShader(addr.GetU64(), resumeAddr.GetU64(), data); // for the resume part. data.shaderRecIdx = callerShaderRecIdx; // restores callerShaderRecIdx @@ -1903,23 +1960,23 @@ export void _cont_CallShader(inout_param(_AmdDispatchSystemData) data, uint inde //===================================================================================================================== // Returns the low part of the miss shader address and sets up the dispatch data to have the correct shader record // index. -static uint64_t SetupMissShader(inout_param(_AmdSystemData) data, out_param(uint) shaderRecIdx) +static Vpc64 SetupMissShader(inout_param(_AmdSystemData) data, out_param(uint) shaderRecIdx) { const uint64_t missTableBaseAddress = PackUint64(DispatchRaysConstBuf.missTableBaseAddressLo, DispatchRaysConstBuf.missTableBaseAddressHi); if (missTableBaseAddress == 0) { shaderRecIdx = 0; - return 0; + return Vpc64(0); } shaderRecIdx = ExtractMissShaderIndex(data.ray.traceParameters); // Calculate miss shader record address - const uint64_t shaderAddr = GetVpcFromShaderIdTable(missTableBaseAddress, - shaderRecIdx, - DispatchRaysConstBuf.missTableStrideInBytes, - SCHEDULING_PRIORITY_MISS); + const Vpc64 shaderAddr = GetVpc64FromShaderIdTable(missTableBaseAddress, + shaderRecIdx, + DispatchRaysConstBuf.missTableStrideInBytes, + SCHEDULING_PRIORITY_MISS); return shaderAddr; } @@ -1949,6 +2006,11 @@ static HitGroupInfo GetHitGroupInfo( #include "Continuations2_0.hlsl" #if CONTINUATION_ON_GPU +static uint64_t GetDispatchIdAddr() +{ + return PackUint64(DispatchRaysConstBuf.cpsDispatchIdAddressLo, DispatchRaysConstBuf.cpsDispatchIdAddressHi); +} + //===================================================================================================================== static void LaunchRayGen(bool setupStack) { @@ -1968,28 +2030,16 @@ static void LaunchRayGen(bool setupStack) // This is written in a way that is intended to be correct even if threads don't reconverge after calling into // the ray generation shader. - uint localWorkId; const uint popCount = WaveActiveCountBits(true); + uint flatDispatchId = 0; if (WaveIsFirstLane()) { - localWorkId = AmdTraceRayPersistentLdsAtomicAdd(0, popCount); + flatDispatchId = AmdExtAtomicAddAtAddr(GetDispatchIdAddr(), 0, popCount); } - localWorkId = WaveReadLaneFirst(localWorkId) + WavePrefixCountBits(true); + flatDispatchId = WaveReadLaneFirst(flatDispatchId) + WavePrefixCountBits(true); const uint3 rayDims = GetDispatchRaysDimensions(); - const uint tgCount = GetPersistentDispatchSize(); - - // Single dimension dispatch so the flattened group ID is the same as the x component of the group ID - const uint tgId = AmdExtGroupIdCompute().x; - - // Interleave waves' worth of work among CUs so that every CU does approximately the same amount of work even - // for dispatches that are smaller than the maximum occupancy of the GPU. 
This is probably also a bit better - // for memory and shader execution locality, since CUs should tend to stay roughly within the same region of - // the dispatch. Assume numthreads(32, 1, 1). - const uint lowPart = localWorkId & 31; - const uint highPart = localWorkId & ~31; - const uint flatDispatchId = highPart * tgCount + tgId * 32 + lowPart; dispatchId = GetDispatchId(rayDims.x, rayDims.y, flatDispatchId); valid = flatDispatchId < (rayDims.x * rayDims.y * rayDims.z); @@ -2016,12 +2066,12 @@ static void LaunchRayGen(bool setupStack) #if DEVELOPER systemData.parentId = -1; #endif - _AmdEnqueueRayGen(GetRayGenVpc(), _AmdGetUninitializedI64(), systemData); + _AmdEnqueueRayGen(GetRayGenVpc64().GetU64(), _AmdGetUninitializedI64(), systemData); } else if (Options::getPersistentLaunchEnabled()) { _AmdDispatchSystemData systemData = _AmdDispatchSystemData::MakeDeadLaneWithStack(); - _AmdWaitEnqueueTraversal(GetTraversalVpc(), -1, _AmdGetUninitializedI64(), systemData); + _AmdEnqueueTraversalDead(GetTraversalVpc64PwgDead().GetU64(), _AmdGetUninitializedI64(), systemData); } } @@ -2029,16 +2079,6 @@ static void LaunchRayGen(bool setupStack) // KernelEntry is entry function of the RayTracing continuation mode export void _cont_KernelEntry() { - if (Options::getPersistentLaunchEnabled()) - { - if (AmdExtFlattenedThreadIdInGroupCompute() == 0) - { - AmdTraceRayPersistentLdsWrite(0, 0); - } - - GroupMemoryBarrierWithGroupSync(); - } - LaunchRayGen(true); } @@ -2136,17 +2176,16 @@ export void _cont_TraceRay( const uint callerShaderRecIdx = dispatch.shaderRecIdx; // 0 if from RayGen. const uint parentId = RayHistoryGetParentId(dispatch); - const uint64_t traversalAddrWithPrio = GetTraversalVpc(); + const Vpc64 traversalAddr = GetTraversalVpc64(); // The type of the shader containing this TraceRay call, i.e. the shader we are inlined into. const DXILShaderKind enclosingShaderType = _AmdGetShaderKind(); const uint resumePrio = GetPriorityForShaderType(enclosingShaderType); - // NO control flow is allowed between _AmdGetResumePointAddr() and _AmdWaitAwaitTraversal(). - const uint64_t resumeAddr = _AmdGetResumePointAddr(); - const uint64_t resumeAddrWithPrio = GetVpcWithPriority(resumeAddr, resumePrio); - data.traversal.PackReturnAddress(resumeAddrWithPrio); - dispatch = _AmdWaitAwaitTraversal(traversalAddrWithPrio, -1, data); + // NO control flow is allowed between _AmdGetResumePointAddr() and _AmdAwaitTraversal(). + const Vpc64 resumeAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetResumePointAddr()), resumePrio); + data.traversal.PackReturnAddress(resumeAddr); + dispatch = _AmdAwaitTraversal(traversalAddr.GetU64(), data); // for the resume part. 
dispatch.shaderRecIdx = callerShaderRecIdx; // restores callerShaderRecIdx @@ -2164,14 +2203,14 @@ static bool GetNextHitMissPc( inout_param(_AmdSystemData) data, uint state, _AmdPrimitiveSystemState candidate, - out_param(uint64_t) nextShaderAddr) + out_param(Vpc64) nextShaderAddr) { // MS if (data.IsMiss(state)) { uint shaderRecIdx; - const uint64_t missShaderAddr = SetupMissShader(data, shaderRecIdx); - if (SplitUint64(missShaderAddr).x != 0) + const Vpc64 missShaderAddr = SetupMissShader(data, shaderRecIdx); + if (missShaderAddr.IsValid()) { // Valid MS data.dispatch.shaderRecIdx = shaderRecIdx; @@ -2194,7 +2233,7 @@ static bool GetNextHitMissPc( if (hitInfo.closestHitId.x != 0) { // Valid CHS - nextShaderAddr = GetVpcFromShaderId(hitInfo.closestHitId.x, SCHEDULING_PRIORITY_CHS); + nextShaderAddr = GetVpc64FromShaderId(Vpc32(hitInfo.closestHitId.x), SCHEDULING_PRIORITY_CHS); return true; } } @@ -2225,7 +2264,7 @@ static void TraversalInternal( } } -static void EnqueueNextShader(bool hasWorkToDo, uint64_t nextShaderAddr, uint64_t returnAddr, _AmdSystemData data) +static void EnqueueNextShader(bool hasWorkToDo, Vpc64 nextShaderAddr, Vpc64 returnAddr, _AmdSystemData data) { if (!hasWorkToDo) { @@ -2233,7 +2272,7 @@ static void EnqueueNextShader(bool hasWorkToDo, uint64_t nextShaderAddr, uint64_ { // No work to do = dead lane, jump to traversal as a synchronization point with an empty system data _AmdSystemData sysData = _AmdSystemData::MakeDeadLaneWithoutStack(); - _AmdWaitEnqueueTraversal(GetTraversalVpc(), -1, _AmdGetUninitializedI64(), sysData); + _AmdEnqueueTraversal(GetTraversalVpc64().GetU64(), _AmdGetUninitializedI64(), sysData); } else { @@ -2244,21 +2283,21 @@ static void EnqueueNextShader(bool hasWorkToDo, uint64_t nextShaderAddr, uint64_ const uint newState = data.traversal.committed.State(); RayHistoryWriteEnd(data, newState); - if (nextShaderAddr != returnAddr) + if (nextShaderAddr.GetU64() != returnAddr.GetU64()) { const DXILShaderKind shaderKind = (DXILShaderKind)(data.IsMiss(newState) ? (int)DXILShaderKind::Miss : // convert to int to fix linux build error (int)DXILShaderKind::ClosestHit); RayHistoryWriteFunctionCall(data, - RayHistoryGetIdentifierFromVPC(nextShaderAddr), + RayHistoryGetIdentifierFromVPC(nextShaderAddr.GetU64()), data.dispatch.shaderRecIdx, shaderKind); - _AmdEnqueue(nextShaderAddr, returnAddr, data); + _AmdEnqueue(nextShaderAddr.GetU64(), returnAddr.GetU64(), data); } // Return to RayGen. No need to set a priority, as it is already set in the stored return address. - _AmdEnqueueRayGen(returnAddr, _AmdGetUninitializedI64(), data.dispatch); + _AmdEnqueueRayGen(returnAddr.GetU64(), _AmdGetUninitializedI64(), data.dispatch); } //===================================================================================================================== @@ -2350,21 +2389,24 @@ export void _cont_Traversal( _AmdTraversalResultData result = (_AmdTraversalResultData)0; bool IsChsOrMiss = data.IsChsOrMiss(state); - if ((_AmdContinuationStackIsGlobal() && WaveActiveAllTrue(IsChsOrMiss)) || - (!_AmdContinuationStackIsGlobal() && IsChsOrMiss)) + // Re-enqueue Traversal until all lanes are done with BVH Traversal. + // Only then enqueue CHS/Miss to ensure other lanes that are not yet done with Traversal + // converge on these CHS/Miss invocations. + // This is necessary because Traversal has lower scheduling priority. 
+ if (WaveActiveAllTrue(IsChsOrMiss)) { EnterSchedulerSection(); - uint64_t nextShaderAddr = 0; + Vpc64 nextShaderAddr = Vpc64(0); GetNextHitMissPc(data, state, candidate, nextShaderAddr); bool hasWorkToDo = true; - if (_AmdContinuationStackIsGlobal() && (nextShaderAddr != 0)) + if (_AmdContinuationStackIsGlobal() && nextShaderAddr.IsValid()) { } - const uint64_t returnAddr = data.traversal.ReturnAddress(); - if (nextShaderAddr == 0) + const Vpc64 returnAddr = data.traversal.ReturnAddress(); + if (!nextShaderAddr.IsValid()) { nextShaderAddr = returnAddr; } @@ -2372,10 +2414,7 @@ export void _cont_Traversal( } else { - bool mayEnqueueTraversal = (_AmdContinuationStackIsGlobal() || - (Options::getCpsCandidatePrimitiveMode() == CpsCandidatePrimitiveMode::SuspendWave)); - // If we cannot re-enqueue Traversal, then we already know that we are in AHS or IS state. - if (!mayEnqueueTraversal || data.IsAhs(state) || data.IsIs(state)) + if (data.IsAhs(state) || data.IsIs(state)) { HitGroupInfo hitInfo = (HitGroupInfo)0; { @@ -2395,10 +2434,9 @@ export void _cont_Traversal( hitInfo.tableIndex, DXILShaderKind::AnyHit); - const uint64_t addr = GetVpcFromShaderId(hitInfo.anyHitId.x, SCHEDULING_PRIORITY_AHS); - const uint64_t returnAddr = _AmdGetCurrentFuncAddr(); - const uint64_t returnAddrWithPrio = GetVpcWithPriority(returnAddr, SCHEDULING_PRIORITY_TRAVERSAL); - _AmdEnqueueAnyHit(addr, returnAddrWithPrio, anyHitData, candidateBarycentrics); + const Vpc64 addr = GetVpc64FromShaderId(Vpc32(hitInfo.anyHitId.x), SCHEDULING_PRIORITY_AHS); + const Vpc64 returnAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetCurrentFuncAddr()), SCHEDULING_PRIORITY_TRAVERSAL); + _AmdEnqueueAnyHit(addr.GetU64(), returnAddr.GetU64(), anyHitData, candidateBarycentrics); } else { @@ -2410,10 +2448,9 @@ export void _cont_Traversal( hitInfo.tableIndex, DXILShaderKind::Intersection); - const uint64_t addr = GetVpcFromShaderId(hitInfo.intersectionId.x, SCHEDULING_PRIORITY_IS); - const uint64_t returnAddr = _AmdGetCurrentFuncAddr(); - const uint64_t returnAddrWithPrio = GetVpcWithPriority(returnAddr, SCHEDULING_PRIORITY_TRAVERSAL); - _AmdEnqueueIntersection(addr, returnAddrWithPrio, anyHitData); + const Vpc64 addr = GetVpc64FromShaderId(Vpc32(hitInfo.intersectionId.x), SCHEDULING_PRIORITY_IS); + const Vpc64 returnAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetCurrentFuncAddr()), SCHEDULING_PRIORITY_TRAVERSAL); + _AmdEnqueueIntersection(addr.GetU64(), returnAddr.GetU64(), anyHitData); } } else @@ -2421,9 +2458,8 @@ export void _cont_Traversal( // // Everything else needs to go back through scheduling/traversal, regardless of state // Note we don't need "Wait" here because priorities run AHS and IS first - const uint64_t traversalAddr = _AmdGetCurrentFuncAddr(); - const uint64_t traversalAddrWithPrio = GetVpcWithPriority(traversalAddr, SCHEDULING_PRIORITY_TRAVERSAL); - _AmdEnqueueTraversal(traversalAddrWithPrio, _AmdGetUninitializedI64(), data); + const Vpc64 traversalAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetCurrentFuncAddr()), SCHEDULING_PRIORITY_TRAVERSAL); + _AmdEnqueueTraversal(traversalAddr.GetU64(), _AmdGetUninitializedI64(), data); } } // This is unreachable diff --git a/src/shaders/IndirectArgBufferUtils.hlsl b/src/shaders/IndirectArgBufferUtils.hlsl index 73a627d..952c6a5 100644 --- a/src/shaders/IndirectArgBufferUtils.hlsl +++ b/src/shaders/IndirectArgBufferUtils.hlsl @@ -31,7 +31,7 @@ #ifndef _INDIRECTARGBUFFER_HLSL #define _INDIRECTARGBUFFER_HLSL -#include "../shared/rayTracingDefs.h" +#include 
"../shadersClean/common/ShaderDefs.hlsli" #include "BuildSettings.hlsli" //====================================================================================================================== diff --git a/src/shaders/MergeSort.hlsl b/src/shaders/MergeSort.hlsl index bd1921a..50d6882 100644 --- a/src/shaders/MergeSort.hlsl +++ b/src/shaders/MergeSort.hlsl @@ -26,7 +26,7 @@ #define BUILD_THREADGROUP_SIZE 512 //===================================================================================================================== -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #define GC_DSTMETADATA #define GC_SCRATCHBUFFER diff --git a/src/shaders/PairCompression.hlsl b/src/shaders/PairCompression.hlsl index 91aac60..da86963 100644 --- a/src/shaders/PairCompression.hlsl +++ b/src/shaders/PairCompression.hlsl @@ -25,7 +25,7 @@ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #define GC_DSTBUFFER #define GC_DSTMETADATA diff --git a/src/shaders/RadixSort/BitHistogram.hlsl b/src/shaders/RadixSort/BitHistogram.hlsl index f2b3fb2..5b5d4a3 100644 --- a/src/shaders/RadixSort/BitHistogram.hlsl +++ b/src/shaders/RadixSort/BitHistogram.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../../shared/rayTracingDefs.h" +#include "../../shadersClean/common/ShaderDefs.hlsli" #include "../BuildRootSignature.hlsl" diff --git a/src/shaders/RadixSort/DistributePartSumInt4.hlsl b/src/shaders/RadixSort/DistributePartSumInt4.hlsl index 1c86e7c..8d1aaf2 100644 --- a/src/shaders/RadixSort/DistributePartSumInt4.hlsl +++ b/src/shaders/RadixSort/DistributePartSumInt4.hlsl @@ -25,7 +25,7 @@ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../../shared/rayTracingDefs.h" +#include "../../shadersClean/common/ShaderDefs.hlsli" #include "../BuildRootSignature.hlsl" diff --git a/src/shaders/RadixSort/ScanCommon.hlsli b/src/shaders/RadixSort/ScanCommon.hlsli index d43217f..edd8f35 100644 --- a/src/shaders/RadixSort/ScanCommon.hlsli +++ b/src/shaders/RadixSort/ScanCommon.hlsli @@ -22,7 +22,7 @@ * SOFTWARE. 
* **********************************************************************************************************************/ -#include "../../shared/rayTracingDefs.h" +#include "../../shadersClean/common/ShaderDefs.hlsli" #define NUMBER_OF_BLOCKS_PER_GROUP 1 #define NUM_BINS 16 diff --git a/src/shaders/RadixSort/ScanExclusiveInt4.hlsl b/src/shaders/RadixSort/ScanExclusiveInt4.hlsl index 4fd23ed..6c9ff45 100644 --- a/src/shaders/RadixSort/ScanExclusiveInt4.hlsl +++ b/src/shaders/RadixSort/ScanExclusiveInt4.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../../shared/rayTracingDefs.h" +#include "../../shadersClean/common/ShaderDefs.hlsli" #include "../BuildRootSignature.hlsl" diff --git a/src/shaders/RadixSort/ScanExclusiveInt4DLB.hlsl b/src/shaders/RadixSort/ScanExclusiveInt4DLB.hlsl index 91ff455..fe50439 100644 --- a/src/shaders/RadixSort/ScanExclusiveInt4DLB.hlsl +++ b/src/shaders/RadixSort/ScanExclusiveInt4DLB.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../../shared/rayTracingDefs.h" +#include "../../shadersClean/common/ShaderDefs.hlsli" #define GC_SCRATCHBUFFER #include "../BuildRootSignature.hlsl" diff --git a/src/shaders/RadixSort/ScanExclusivePartInt4.hlsl b/src/shaders/RadixSort/ScanExclusivePartInt4.hlsl index 34e53bf..40f8620 100644 --- a/src/shaders/RadixSort/ScanExclusivePartInt4.hlsl +++ b/src/shaders/RadixSort/ScanExclusivePartInt4.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../../shared/rayTracingDefs.h" +#include "../../shadersClean/common/ShaderDefs.hlsli" #include "../BuildRootSignature.hlsl" diff --git a/src/shaders/RadixSort/ScatterKeysAndValues.hlsl b/src/shaders/RadixSort/ScatterKeysAndValues.hlsl index dede6b2..03bb570 100644 --- a/src/shaders/RadixSort/ScatterKeysAndValues.hlsl +++ b/src/shaders/RadixSort/ScatterKeysAndValues.hlsl @@ -24,7 +24,7 @@ **********************************************************************************************************************/ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../../shared/rayTracingDefs.h" +#include "../../shadersClean/common/ShaderDefs.hlsli" #include "../BuildRootSignature.hlsl" diff --git a/src/shaders/Rebraid.hlsl b/src/shaders/Rebraid.hlsl index 4aee6d9..48d6edc 100644 --- a/src/shaders/Rebraid.hlsl +++ b/src/shaders/Rebraid.hlsl @@ -30,7 +30,7 @@ #if NO_SHADER_ENTRYPOINT == 0 //===================================================================================================================== -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #define GC_DSTBUFFER #define GC_DSTMETADATA diff --git a/src/shaders/RefitBounds.hlsl b/src/shaders/RefitBounds.hlsl index d86fc26..c500419 100644 --- 
a/src/shaders/RefitBounds.hlsl +++ b/src/shaders/RefitBounds.hlsl @@ -22,7 +22,7 @@ * SOFTWARE. * **********************************************************************************************************************/ -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" #define GC_DSTBUFFER #define GC_SCRATCHBUFFER diff --git a/src/shaders/TraceRay.hlsl b/src/shaders/TraceRay.hlsl index 334e2dc..5b9f06c 100644 --- a/src/shaders/TraceRay.hlsl +++ b/src/shaders/TraceRay.hlsl @@ -265,11 +265,17 @@ static bool TraceRayCommon( { if ((rayFlags & RAY_FLAG_SKIP_CLOSEST_HIT_SHADER) == 0) { - const uint instanceContribution = (result.instanceContribution & 0x00ffffff); - const HitGroupInfo hitInfo = GetHitGroupInfo(rayContributionToHitGroupIndex, - multiplierForGeometryContributionToShaderIndex, - result.geometryIndex, - instanceContribution); + uint instanceContribution = 0; + HitGroupInfo hitInfo = (HitGroupInfo)0; + + { + instanceContribution = (result.instanceContribution & 0x00ffffff); + hitInfo = GetHitGroupInfo(rayContributionToHitGroupIndex, + multiplierForGeometryContributionToShaderIndex, + result.geometryIndex, + instanceContribution); + } + uint64_t instNodePtr64 = 0; { instNodePtr64 = CalculateInstanceNodePtr64(rtIpLevel, accelStruct, result.instNodePtr); diff --git a/src/shaders/Update.hlsl b/src/shaders/Update.hlsl index 3818053..3c091ad 100644 --- a/src/shaders/Update.hlsl +++ b/src/shaders/Update.hlsl @@ -47,7 +47,7 @@ struct RootConstants uint numThreads; }; -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" [[vk::push_constant]] ConstantBuffer ShaderRootConstants : register(b0); [[vk::binding(1, 1)]] ConstantBuffer ShaderConstants : register(b1); diff --git a/src/shaders/UpdateParallel.hlsl b/src/shaders/UpdateParallel.hlsl index 56c782a..7af9953 100644 --- a/src/shaders/UpdateParallel.hlsl +++ b/src/shaders/UpdateParallel.hlsl @@ -44,7 +44,7 @@ struct RootConstants uint numThreads; }; -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" [[vk::push_constant]] ConstantBuffer ShaderRootConstants : register(b0); [[vk::binding(1, 1)]] ConstantBuffer ShaderConstants : register(b1); diff --git a/src/shaders/UpdateQBVH.hlsl b/src/shaders/UpdateQBVH.hlsl index ae818c8..d0d253c 100644 --- a/src/shaders/UpdateQBVH.hlsl +++ b/src/shaders/UpdateQBVH.hlsl @@ -44,7 +44,7 @@ struct RootConstants uint numThreads; }; -#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ShaderDefs.hlsli" [[vk::push_constant]] ConstantBuffer ShaderRootConstants : register(b0); [[vk::binding(1, 1)]] ConstantBuffer ShaderConstants : register(b1); diff --git a/src/shaders/CopyAS.hlsl b/src/shadersClean/build/CopyAS.hlsl similarity index 98% rename from src/shaders/CopyAS.hlsl rename to src/shadersClean/build/CopyAS.hlsl index 2ca420e..dd1354c 100644 --- a/src/shaders/CopyAS.hlsl +++ b/src/shadersClean/build/CopyAS.hlsl @@ -22,8 +22,8 @@ * SOFTWARE. * **********************************************************************************************************************/ -#include "../../gpurt/gpurtAccelStruct.h" -#include "../shared/rayTracingDefs.h" +#include "../../../gpurt/gpurtAccelStruct.h" +#include "../common/ShaderDefs.hlsli" // Note, CBV(b255) must be the last used binding in the root signature. 
#define RootSig "RootConstants(num32BitConstants=3, b0, visibility=SHADER_VISIBILITY_ALL), "\ diff --git a/src/shadersClean/common/InstanceDesc.hlsli b/src/shadersClean/common/InstanceDesc.hlsli index 09f910c..35cbe1c 100644 --- a/src/shadersClean/common/InstanceDesc.hlsli +++ b/src/shadersClean/common/InstanceDesc.hlsli @@ -25,7 +25,7 @@ #ifndef INSTANCE_DESC_HLSLI #define INSTANCE_DESC_HLSLI -#include "TempAssert.hlsli" +#include "../../shared/assert.h" //===================================================================================================================== // 64-byte aligned structure matching D3D12_RAYTRACING_INSTANCE_DESC diff --git a/src/shadersClean/common/NodePointers.hlsli b/src/shadersClean/common/NodePointers.hlsli index 46e6fa3..9e690e8 100644 --- a/src/shadersClean/common/NodePointers.hlsli +++ b/src/shadersClean/common/NodePointers.hlsli @@ -26,7 +26,7 @@ #ifndef NODE_POINTERS_HLSLI #define NODE_POINTERS_HLSLI -#include "../common/TempAssert.hlsli" +#include "../../shared/assert.h" //===================================================================================================================== // Node pointer size in bytes diff --git a/src/shadersClean/common/ShaderDefs.hlsli b/src/shadersClean/common/ShaderDefs.hlsli index 3ca709b..f552f78 100644 --- a/src/shadersClean/common/ShaderDefs.hlsli +++ b/src/shadersClean/common/ShaderDefs.hlsli @@ -37,7 +37,7 @@ #define DUMMY_FLOAT2_FUNC { return float2(0, 0); } #define DUMMY_FLOAT3_FUNC { return float3(0, 0, 0); } -#include "TempAssert.hlsli" +#include "../../shared/assert.h" // TODO: there are functions that use values from these files, but really // those functions should be in these files, and then the files that use the functions @@ -49,6 +49,8 @@ #include "gfx10/InstanceNode1_0.hlsli" #include "NodePointers.hlsli" +#include "../../shared/rayTracingDefs.h" + #define SAH_COST_TRIANGLE_INTERSECTION 1.5 #define SAH_COST_AABBB_INTERSECTION 1 @@ -473,14 +475,22 @@ enum RebraidType : uint //===================================================================================================================== struct TriangleData { +#ifdef __cplusplus + TriangleData(uint val) + { + memset(this, val, sizeof(TriangleData)); + } + + TriangleData() : TriangleData(0) + {} +#endif float3 v0; ///< Vertex 0 float3 v1; ///< Vertex 1 float3 v2; ///< Vertex 2 }; #ifndef LIBRARY_COMPILATION -// This does not include RayTracingDefs.h as the goal is -// to eventually have everything in this file alone + #endif #endif diff --git a/src/shadersClean/common/TempAssert.hlsli b/src/shadersClean/common/TempAssert.hlsli deleted file mode 100644 index 1407fe8..0000000 --- a/src/shadersClean/common/TempAssert.hlsli +++ /dev/null @@ -1,38 +0,0 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ - -// TODO: this is a temporary assert file to allow files with asserts to be "clean" -// while the assert file itself cannot be. We need this as we have to move files out of "shared" -// which use assert.h, but cannot then include assert.h as "clean" inclusion of shared files isn't set up yet, -// *because* there are too many files in shared, and they can't be moved out because -// they use assert.h and... (cyclical issue) - -#ifndef ASSERT_HLSLI -#define ASSERT_HLSLI -#ifndef GPURT_STATIC_ASSERT -// _Static_assert is not supported with -spirv: https://github.com/microsoft/DirectXShaderCompiler/issues/5750 -#define GPURT_STATIC_ASSERT(condition, message) -#endif -#endif diff --git a/src/shadersClean/common/gfx10/BoxNode1_0.hlsli b/src/shadersClean/common/gfx10/BoxNode1_0.hlsli index 6103e61..6623b47 100644 --- a/src/shadersClean/common/gfx10/BoxNode1_0.hlsli +++ b/src/shadersClean/common/gfx10/BoxNode1_0.hlsli @@ -25,7 +25,7 @@ #ifndef BOX_NODE_1_1_HLSLI #define BOX_NODE_1_1_HLSLI -#include "../TempAssert.hlsli" +#include "../../../shared/assert.h" //===================================================================================================================== // Hardware 32-bit box node format and offsets diff --git a/src/shadersClean/common/gfx10/InstanceNode1_0.hlsli b/src/shadersClean/common/gfx10/InstanceNode1_0.hlsli index ae0280d..e615089 100644 --- a/src/shadersClean/common/gfx10/InstanceNode1_0.hlsli +++ b/src/shadersClean/common/gfx10/InstanceNode1_0.hlsli @@ -27,7 +27,7 @@ #include "BoxNode1_0.hlsli" #include "../InstanceDesc.hlsli" -#include "../TempAssert.hlsli" +#include "../../../shared/assert.h" //===================================================================================================================== struct InstanceSidebandData1_1 diff --git a/src/shadersClean/common/gfx10/ProceduralNode1_0.hlsli b/src/shadersClean/common/gfx10/ProceduralNode1_0.hlsli index 4431ecd..b8e01ec 100644 --- a/src/shadersClean/common/gfx10/ProceduralNode1_0.hlsli +++ b/src/shadersClean/common/gfx10/ProceduralNode1_0.hlsli @@ -25,7 +25,7 @@ #ifndef PROCEDURAL_NODE_1_1_HLSLI #define PROCEDURAL_NODE_1_1_HLSLI -#include "../TempAssert.hlsli" +#include "../../../shared/assert.h" //===================================================================================================================== #define USER_NODE_PROCEDURAL_MIN_OFFSET 0 diff --git a/src/shadersClean/common/gfx10/TriangleNode1_0.hlsli b/src/shadersClean/common/gfx10/TriangleNode1_0.hlsli index 0d9d1eb..7e618be 100644 --- a/src/shadersClean/common/gfx10/TriangleNode1_0.hlsli +++ b/src/shadersClean/common/gfx10/TriangleNode1_0.hlsli @@ -25,7 +25,7 @@ #ifndef TRIANGLE_NODE_1_0_HLSLI #define TRIANGLE_NODE_1_0_HLSLI -#include "../TempAssert.hlsli" +#include "../../../shared/assert.h" //===================================================================================================================== // Hardware 
triangle node format and offsets
diff --git a/src/shadersClean/traversal/TraversalDefs.hlsli b/src/shadersClean/traversal/TraversalDefs.hlsli
index 8541f35..28f9999 100644
--- a/src/shadersClean/traversal/TraversalDefs.hlsli
+++ b/src/shadersClean/traversal/TraversalDefs.hlsli
@@ -25,7 +25,7 @@
 #ifndef TRAVERSAL_DEFS_HLSLI
 #define TRAVERSAL_DEFS_HLSLI
 
-#include "../common/TempAssert.hlsli"
+#include "../../shared/assert.h"
 
 #define ENCODE_FLAG_ARRAY_OF_POINTERS           0x00000001
 #define ENCODE_FLAG_UPDATE_IN_PLACE             0x00000002
@@ -93,6 +93,15 @@ struct RaySystemData
 // Ray description matching the D3D12 HLSL header
 struct RayDesc
 {
+#ifdef __cplusplus
+    RayDesc(uint val)
+    {
+        memset(this, val, sizeof(RayDesc));
+    }
+
+    RayDesc() : RayDesc(0)
+    {}
+#endif
     float3 Origin;
     float  TMin;
    float3 Direction;
diff --git a/src/shared/rayTracingDefs.h b/src/shared/rayTracingDefs.h
index 6dfec65..5778dfd 100644
--- a/src/shared/rayTracingDefs.h
+++ b/src/shared/rayTracingDefs.h
@@ -27,10 +27,6 @@
 #ifndef _RAYTRACING_DEF_H
 #define _RAYTRACING_DEF_H
 
-#ifndef __cplusplus
-#include "../shadersClean/common/ShaderDefs.hlsli"
-#endif
-
 #include "../../gpurt/gpurtAccelStruct.h"
 #include "../../gpurt/gpurtBuildSettings.h"
 #include "../../gpurt/gpurtDispatch.h"
diff --git a/tools/CompileRTShaders.py b/tools/CompileRTShaders.py
index a967fdf..95a90d3 100644
--- a/tools/CompileRTShaders.py
+++ b/tools/CompileRTShaders.py
@@ -136,7 +136,7 @@ def isBVH(self):
     ShaderConfig(path="EmitAS.hlsl", entryPoint="EmitCompactSize"),
     ShaderConfig(path="EmitAS.hlsl", entryPoint="EmitSerializeDesc"),
     ShaderConfig(path="EmitAS.hlsl", entryPoint="EmitToolVisDesc"),
-    ShaderConfig(path="CopyAS.hlsl", entryPoint="CopyAS"),
+    ShaderConfig(path="../shadersClean/build/CopyAS.hlsl", entryPoint="CopyAS"),
     ShaderConfig(path="CompactAS.hlsl", entryPoint="CompactAS"),
     ShaderConfig(path="DecodeAS.hlsl", entryPoint="DecodeAS"),
     ShaderConfig(path="SerializeAS.hlsl", entryPoint="SerializeAS"),
@@ -313,6 +313,37 @@ def validateCompilation(cmd: [str], path: pathlib.Path) -> bool:
 
     return True
 
+"""
+Validates the organization of files in the shared folder, enforcing a cpp/h (source/header) style structure.
+This helps keep the shader library untangled and easier to maintain.
+#define'ing LIBRARY_COMPILATION allows including files in any order without pulling in implementation dependencies.
+"""
+def validateShared(args) -> bool:
+    cmdBase = getValidationCmdArgs(args)
+    # use resolve() + as_posix() to avoid path mismatches when using drive mapping
+    srcPath = pathlib.Path(FixInputPath(args.basepath)).parent.resolve()
+
+    gpurtInterfacePath = (srcPath / "../gpurt").resolve()
+    sharedPath = srcPath / "shared"
+    generatedFilepath = pathlib.Path(args.g_FilePath)
+    implExt = "._unused_"
+    headerExt = ".h"
+
+    # Shared files need to be able to include the gpurt interface files due to the requirements of the interface;
+    # we treat this as an exception to the rules about which files may be included.
+
+    for path, (hasImpl, hasHeader) in getImplInterfacePairs(sharedPath, implExt, headerExt).items():
+        assert (hasHeader and not hasImpl), "Shared files should be header only."
+        fullPath = path.with_suffix(path.suffix + (implExt if hasImpl else headerExt))
+        for defines in getDefineCombos(fullPath):
+            compileCmd = cmdBase + defines + [fullPath.as_posix()]
+            if not validateIncludes(compileCmd, path, implExt, headerExt, [(sharedPath, headerExt), (gpurtInterfacePath, ".h"), (generatedFilepath, ".h")]):
+                return False
+            if not validateCompilation(compileCmd, fullPath):
+                return False
+
+    return True
+
 """
 Validates the organization of shaders to enforce cpp/h a src/header sort of structure
 This helps keep the shader library untangled and easier to maintain.
@@ -322,6 +353,11 @@ def validateShadersClean(args) -> bool:
     cmdBase = getValidationCmdArgs(args)
     # use resolve() + as_posix() to avoid path mismatches when using drive mapping
     srcPath = pathlib.Path(FixInputPath(args.basepath)).parent.resolve()
+
+    gpurtInterfacePath = (srcPath / "../gpurt").resolve()
+    sharedPath = srcPath / "shared"
+    generatedFilepath = pathlib.Path(args.g_FilePath)
+    # Validation of the shadersClean folder
     shadersCleanPath = srcPath / "shadersClean"
 
     implExt = ".hlsl"
@@ -331,11 +367,10 @@ def validateShadersClean(args) -> bool:
         fullPath = path.with_suffix(path.suffix + (implExt if hasImpl else headerExt))
         for defines in getDefineCombos(fullPath):
             compileCmd = cmdBase + defines + [fullPath.as_posix()]
-            if not validateIncludes(compileCmd, path, implExt, headerExt, [(shadersCleanPath, headerExt)]):
+            if not validateIncludes(compileCmd, path, implExt, headerExt, [(shadersCleanPath, headerExt), (sharedPath, ".hlsli"), (gpurtInterfacePath, ".h"), (generatedFilepath, ".h")]):
                 return False
             if not validateCompilation(compileCmd, fullPath):
                 return False
-
     return True
 
 def isSpirvShader(shaderConfig, args):
@@ -751,6 +786,7 @@ def main() -> int:
     parser.add_argument('--verbose', action='store_true', help='Output verbose information', default=False)
     parser.add_argument('--defines', help='Defines for the shader compiler, separated by ; or ,.', default="")
    parser.add_argument('--includePaths', help='Include paths for the shader compiler, separated by ; or ,.', default="")
+    parser.add_argument('--g_FilePath', help='Path to the build destination where generated headers are written', default="")
     parser.add_argument('--compilerPath', help='Path to standalone compiler.', default='./dxc.exe')
     parser.add_argument('--dxcompilerLibPath', help='Path to dxcompiler.dll/libdxcompiler.so', default='./dxcompiler.dll')
     parser.add_argument('--spirvRemapPath', help='Path to spirv-remap executable', default='./spirv-remap.exe')
@@ -769,11 +805,14 @@ def main() -> int:
     tBegin = time.perf_counter()
 
     validIncludes = validateShadersClean(args)
+    validIncludes &= validateShared(args)
+
     # For vulkan, we validate SPIR-V shaders in the same run instead of running the script again.
     if args.vulkan and not args.spirv:
         print("Now doing SPIR-V validation...")
         args.spirv = True
         validIncludes &= validateShadersClean(args)
+        validIncludes &= validateShared(args)
 
     tDuration = time.perf_counter() - tBegin
 
     if validIncludes:
diff --git a/tools/DebugPreprocessShaders.py b/tools/DebugPreprocessShaders.py
index b51c1bd..4793b96 100644
--- a/tools/DebugPreprocessShaders.py
+++ b/tools/DebugPreprocessShaders.py
@@ -26,6 +26,7 @@
 import sys
 import os
 import re
+import argparse
 
 cpp_file_header = """
 /* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
 */
@@ -62,7 +63,7 @@ def process_file(src_path, dst_path):
     for line in src_file:
         # Find something that looks like a GPU_ASSERT macro invocation (GPU_ASSERT + '(' or space)
         if line.find('#define') == -1:
-            m = re.match('.*GPU_ASSERT\s*(\()', line)
+            m = re.match('.*GPU_ASSERT\\s*(\\()', line)
             if m is not None:
                 open_paren = m.start(1)
                 assert_id = add_assert(src_name, line_num, line)
@@ -71,7 +72,7 @@ def process_file(src_path, dst_path):
                 line = line.replace('GPU_ASSERT', 'GPU_ASSERT_IMPL', 1)
         else:
             # Find something that looks like a GPU_DPF macro invocation (GPU_DPF + '(' or space)
-            m = re.match('.*GPU_DPF\s*(\().*"(.*)"', line)
+            m = re.match('.*GPU_DPF\\s*(\\().*"(.*)"', line)
             if m is not None:
                 open_paren = m.start(1)
                 msg_id = add_print_msg(src_name, line_num, m.group(2))
@@ -91,13 +92,26 @@ def generate_cpp_file(output_file_path):
     output_str += cpp_file_footer
     output_file.write(output_str)
 
-def main():
-    # Process each file in the argument list
-    # The argments are pairs of input and ouput files then the path to the output file
-    for i in range(1, len(sys.argv) - 1, 2):
-        process_file(sys.argv[i], sys.argv[i+1])
-    generate_cpp_file(sys.argv[-1])
+def main(cpp_file, input_pair_list):
+    # Process each input/output file pair, then generate the C++ lookup table.
+    # input_pair_list holds alternating input and output paths.
+    for i in range(0, len(input_pair_list), 2):
+        process_file(input_pair_list[i], input_pair_list[i+1])
+    generate_cpp_file(cpp_file)
     return 0
 
 if __name__ == '__main__':
-    sys.exit(main())
+    parser = argparse.ArgumentParser(
+        prog='DebugPreprocessShaders',
+        description='Preprocesses shaders for GPU_ASSERT/GPU_DPF lines and generates a lookup table to match their text with their ID'
+    )
+    parser.add_argument('-i', '--input', help='File containing a list of input shader/output processed shader path pairs, semicolon delimited', required=True)
+    parser.add_argument('-o', '--output', help='Path to output cpp header', required=True)
+    args = parser.parse_args()
+
+    # Strip any newlines or whitespace from the beginning/end, and split by ';'
+    with open(args.input, 'r') as input_file:
+        input_pair_list = input_file.read().strip().split(';')
+
+    sys.exit(main(args.output, input_pair_list))
+
diff --git a/tools/DebugPreprocessShadersInput.txt.in b/tools/DebugPreprocessShadersInput.txt.in
new file mode 100644
index 0000000..3f15488
--- /dev/null
+++ b/tools/DebugPreprocessShadersInput.txt.in
@@ -0,0 +1,2 @@
+${preprocessArgs}
+
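
Note on the DebugPreprocessShaders handoff above: ${preprocessArgs} is a CMake list, which configure_file() renders into the generated input file as a single semicolon-delimited string of alternating input/output shader paths, exactly the format the script's --input option expects. A minimal Python sketch of the consuming side, under the assumption of that format; the helper name and file name below are illustrative only, not part of the patch:

    # read_pairs: hypothetical helper mirroring the parsing done by
    # DebugPreprocessShaders.py. It recovers (source, destination) path
    # pairs from a configure_file()-expanded CMake list.
    def read_pairs(input_file_path):
        # A CMake list serializes as "a;b;c", so strip() + split(';') recovers it.
        with open(input_file_path, 'r') as f:
            items = f.read().strip().split(';')
        # Alternating entries form (original shader, preprocessed shader) pairs.
        return list(zip(items[0::2], items[1::2]))

    if __name__ == '__main__':
        for src, dst in read_pairs('DebugPreprocessShadersInput.txt'):
            print(src, '->', dst)

Because a CMake list is already a semicolon-joined string, the template file needs no extra formatting logic beyond the single ${preprocessArgs} expansion.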
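
Note on the LaunchRayGen change earlier in this patch: the per-group LDS counter (AmdTraceRayPersistentLdsAtomicAdd) is replaced by a single global counter at the cpsDispatchIdAddress location. One lane per wave reserves a contiguous block of flat dispatch IDs with one atomic add, and every active lane takes base + its rank within the wave. A sketch of that allocation pattern, with the HLSL wave intrinsics modeled in plain Python (names are illustrative; 'counter' stands in for the GPU memory behind GetDispatchIdAddr()):

    # allocate_flat_ids: models WaveActiveCountBits / WaveReadLaneFirst /
    # WavePrefixCountBits over a list of per-lane 'active' flags.
    def allocate_flat_ids(counter, active):
        pop_count = sum(active)       # WaveActiveCountBits(true)
        base = counter[0]             # first active lane performs the atomic add...
        counter[0] += pop_count       # ...and broadcasts the old value to the wave
        ids, rank = [], 0
        for is_active in active:
            # base + WavePrefixCountBits(true): this lane's rank among active lanes
            ids.append(base + rank if is_active else None)
            rank += 1 if is_active else 0
        return ids

    counter = [0]
    print(allocate_flat_ids(counter, [True, True, False, True]))  # [0, 1, None, 2]
    print(allocate_flat_ids(counter, [True, False, True, True]))  # [3, None, 4, 5]

One atomic per wave rather than per thread keeps contention on the shared counter low, and consecutive lanes receive consecutive IDs, which preserves locality when the flat ID is unpacked into a 3D dispatch coordinate by GetDispatchId().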