From 95c27c4031b112daaa231b76dee07f7ff05357d0 Mon Sep 17 00:00:00 2001 From: qiaojbao Date: Mon, 30 Sep 2024 11:46:47 +0800 Subject: [PATCH] Update gpurt from commit 80269d10 Remove unused path from BuildParallel Set triangle 1 bits in triangle ID only when it is a pair compressed triangle [Continuations] Change static ID handling for continuations Fix tdr in rayquery apps when rdp attaches Deprecate CmdBarrier() path rename BuildBVHPLOC to BuildPLOC Fix cmake defines and includes for validation Add deviceSetting to disable compaction Enable GPU debugging in all build stages Move cpp-shared raytracingdefs.h into non-shared ShaderDefs.hlsli Reduce CopyAS.hlsl dependencies Move bit-related utils from Math to Bits.hlsli Replace bufferView with typed and untyped BufferView Continuations persistent launch support [Continuations] Fix legacy compilation Initialize parentId to -1 for ray history counter Skip redundant copy in merge-sort iteration Copy Indirect Args in InitExecuteIndirect [Continuations] Remove redundant repacking of constant known ray flags [Continuations] Add options to override TraceRay flags Fix barrier corner cases Softcode validation file extension and directory [Continuations] Remove stack lowering guard Add spirv pass to validation of clean shaders Do not limit number of waves per simd for the encode path Recompute the dispatchID when threadGroupSize != 32 Separate merge sort local/global dispatches --- backends/pal/gpurtPalBackend.cpp | 104 +- gpurt/gpurt.h | 11 +- gpurt/gpurtBuildSettings.h | 8 +- gpurt/gpurtDispatch.h | 6 +- gpurt/gpurtLib.h | 2 +- src/gpurtBvhBatcher.cpp | 84 +- src/gpurtBvhBuilder.cpp | 107 +- src/gpurtBvhBuilder.h | 6 +- src/gpurtBvhBuilderCommon.h | 6 +- src/gpurtDevice.cpp | 13 +- src/gpurtInternal.h | 18 +- src/gpurtInternalShaders.cpp | 5 +- src/options.yaml | 17 + src/shaders/BuildBVHTDTR.hlsl | 138 ++ src/shaders/BuildCommon.hlsl | 31 +- src/shaders/BuildCommonScratch.hlsl | 11 +- 
src/shaders/BuildFastAgglomerativeLbvh.hlsl | 47 +- .../{BuildBVHPLOC.hlsl => BuildPLOC.hlsl} | 8 +- src/shaders/BuildParallel.hlsl | 100 +- src/shaders/BuildQBVH.hlsl | 17 +- src/shaders/BuildSettings.hlsli | 7 +- src/shaders/CMakeLists.txt | 15 +- src/shaders/Common.hlsl | 69 +- src/shaders/Continuations1_1.hlsl | 5 +- src/shaders/Continuations2_0.hlsl | 7 +- src/shaders/CopyAS.hlsl | 5 +- src/shaders/Debug.hlsl | 7 +- src/shaders/EncodeCommon.hlsl | 21 +- src/shaders/EncodeHwBvhCommon.hlsl | 5 +- src/shaders/EncodeNodes.hlsl | 12 +- src/shaders/EncodePairedTriangleImpl.hlsl | 33 +- src/shaders/EncodeTopLevel.hlsl | 2 +- src/shaders/EncodeTopLevelBuild.hlsl | 6 +- src/shaders/Extensions.hlsl | 175 +-- src/shaders/GpuRtLibrary.hlsl | 3 + src/shaders/GpuRtLibraryCont.hlsl | 306 +++- src/shaders/InitExecuteIndirect.hlsl | 20 +- src/shaders/MergeSort.hlsl | 318 +++- src/shaders/PairCompression.hlsl | 12 +- src/shaders/RayQuery.hlsl | 2 +- src/shaders/TaskQueueCounter.hlsl | 7 + src/shaders/TriangleSplitting.hlsl | 32 + src/shaders/Update.hlsl | 2 +- src/shadersClean/common/Bits.hlsli | 166 ++ src/shadersClean/common/BoundingBox.hlsli | 74 + src/shadersClean/common/Extensions.hlsli | 4 - src/shadersClean/common/InstanceDesc.hlsli | 51 + src/shadersClean/common/Math.hlsl | 3 + src/shadersClean/common/Math.hlsli | 142 +- src/shadersClean/common/NodePointers.hlsli | 82 + .../common/ScratchNode.hlsli} | 13 +- src/shadersClean/common/ShaderDefs.hlsli | 451 ++++++ src/shadersClean/common/TempAssert.hlsli | 38 + .../common/gfx10/BoxNode1_0.hlsli | 137 ++ .../common/gfx10/InstanceNode1_0.hlsli | 72 + .../common/gfx10/ProceduralNode1_0.hlsli | 56 + .../common/gfx10/TriangleNode1_0.hlsli | 82 + .../traversal/TraversalDefs.hlsli | 160 ++ src/shared/rayTracingDefs.h | 1389 +---------------- tools/CompileRTShaders.py | 139 +- 60 files changed, 2679 insertions(+), 2190 deletions(-) rename src/shaders/{BuildBVHPLOC.hlsl => BuildPLOC.hlsl} (99%) create mode 100644 
src/shadersClean/common/Bits.hlsli create mode 100644 src/shadersClean/common/BoundingBox.hlsli create mode 100644 src/shadersClean/common/InstanceDesc.hlsli create mode 100644 src/shadersClean/common/NodePointers.hlsli rename src/{shared/scratchNode.h => shadersClean/common/ScratchNode.hlsli} (97%) create mode 100644 src/shadersClean/common/TempAssert.hlsli create mode 100644 src/shadersClean/common/gfx10/BoxNode1_0.hlsli create mode 100644 src/shadersClean/common/gfx10/InstanceNode1_0.hlsli create mode 100644 src/shadersClean/common/gfx10/ProceduralNode1_0.hlsli create mode 100644 src/shadersClean/common/gfx10/TriangleNode1_0.hlsli create mode 100644 src/shadersClean/traversal/TraversalDefs.hlsli diff --git a/backends/pal/gpurtPalBackend.cpp b/backends/pal/gpurtPalBackend.cpp index bbae889..08f4571 100644 --- a/backends/pal/gpurtPalBackend.cpp +++ b/backends/pal/gpurtPalBackend.cpp @@ -167,7 +167,9 @@ uint32 PalBackend::GetMaxDescriptorTableSize( ClientCmdBufferHandle cmdBuffer ) const { - const uint32 bufferSrdSizeDw = m_deviceProperties.gfxipProperties.srdSizes.bufferView / sizeof(uint32); + const uint32 bufferSrdSizeDw = Util::Max(m_deviceProperties.gfxipProperties.srdSizes.typedBufferView, + m_deviceProperties.gfxipProperties.srdSizes.untypedBufferView) / + sizeof(uint32); return GetCmdBuffer(cmdBuffer)->GetLargeEmbeddedDataLimit() / bufferSrdSizeDw; } @@ -239,81 +241,39 @@ void PalBackend::InsertBarrier( const bool syncPostCpWrite = flags & BarrierFlagSyncPostCpWrite; Pal::ICmdBuffer* pCmdBuffer = GetCmdBuffer(cmdBuffer); - if (m_deviceSettings.enableAcquireReleaseInterface) - { - Pal::AcquireReleaseInfo acqRelInfo = {}; - Pal::MemBarrier memoryBarrier = {}; - - if (syncDispatch || syncIndirectArgs) - { - memoryBarrier.srcStageMask = Pal::PipelineStageCs; - memoryBarrier.srcAccessMask = Pal::CoherShader; - } - - if (syncPostCpWrite) - { - memoryBarrier.srcStageMask |= Pal::PipelineStagePostPrefetch; - memoryBarrier.srcAccessMask |= Pal::CoherCp; - } - - if 
(syncDispatch || syncPostCpWrite) - { - memoryBarrier.dstStageMask = Pal::PipelineStageCs; - memoryBarrier.dstAccessMask = Pal::CoherShader; - } - - if (syncIndirectArgs) - { - memoryBarrier.dstStageMask |= Pal::PipelineStageFetchIndirectArgs; - memoryBarrier.dstAccessMask |= Pal::CoherIndirectArgs; - } - - acqRelInfo.memoryBarrierCount = 1; - acqRelInfo.pMemoryBarriers = &memoryBarrier; - acqRelInfo.reason = m_deviceSettings.rgpBarrierReason; - - pCmdBuffer->CmdReleaseThenAcquire(acqRelInfo); - } - else - { - Pal::BarrierInfo barrierInfo = {}; - const uint32 pipePointCount = (syncDispatch || syncIndirectArgs) ? 1 : 0; - Pal::HwPipePoint pipePoint = Pal::HwPipePostCs; + Pal::AcquireReleaseInfo acqRelInfo = {}; + Pal::MemBarrier memoryBarrier = {}; - Pal::BarrierTransition transition = {}; - - if (syncDispatch) - { - transition.srcCacheMask = Pal::CoherShader; - } - - if (syncPostCpWrite) - { - transition.srcCacheMask |= Pal::CoherCp; - } + if (syncDispatch || syncIndirectArgs) + { + memoryBarrier.srcStageMask = Pal::PipelineStageCs; + memoryBarrier.srcAccessMask = Pal::CoherShader; + } - if (syncDispatch || syncPostCpWrite) - { - barrierInfo.waitPoint = Pal::HwPipePreCs; - transition.dstCacheMask = Pal::CoherShader; - } + if (syncPostCpWrite) + { + memoryBarrier.srcStageMask |= Pal::PipelineStagePostPrefetch; + memoryBarrier.srcAccessMask |= Pal::CoherCp; + } - if (syncIndirectArgs) - { - barrierInfo.waitPoint = Pal::HwPipeTop; - transition.dstCacheMask |= Pal::CoherIndirectArgs; - } + if (syncDispatch || syncPostCpWrite) + { + memoryBarrier.dstStageMask = Pal::PipelineStageCs; + memoryBarrier.dstAccessMask = Pal::CoherShader; + } - barrierInfo.pipePointWaitCount = pipePointCount; - barrierInfo.pPipePoints = &pipePoint; - barrierInfo.transitionCount = 1; - barrierInfo.pTransitions = &transition; + if (syncIndirectArgs) + { + memoryBarrier.dstStageMask |= Pal::PipelineStageFetchIndirectArgs; + memoryBarrier.dstAccessMask |= Pal::CoherIndirectArgs; + } - 
barrierInfo.reason = m_deviceSettings.rgpBarrierReason; + acqRelInfo.memoryBarrierCount = 1; + acqRelInfo.pMemoryBarriers = &memoryBarrier; + acqRelInfo.reason = m_deviceSettings.rgpBarrierReason; - pCmdBuffer->CmdBarrier(barrierInfo); - } + pCmdBuffer->CmdReleaseThenAcquire(acqRelInfo); } // ===================================================================================================================== @@ -324,7 +284,11 @@ void PalBackend::CreateBufferViewSrds( bool isTyped ) const { - const uint32 bufferSrdSizeDw = m_deviceProperties.gfxipProperties.srdSizes.bufferView / sizeof(uint32); + const uint32 bufferSrdSizeDw = ((isTyped) ? + m_deviceProperties.gfxipProperties.srdSizes.typedBufferView : + m_deviceProperties.gfxipProperties.srdSizes.untypedBufferView) + / sizeof(uint32); + const Pal::BufferViewInfo palBufferViewInfo = ConvertBufferViewToPalBufferView(bufferViewInfo); const void* pNullBuffer = m_deviceProperties.gfxipProperties.nullSrds.pNullBufferView; diff --git a/gpurt/gpurt.h b/gpurt/gpurt.h index 412b556..68d5ef5 100644 --- a/gpurt/gpurt.h +++ b/gpurt/gpurt.h @@ -285,7 +285,10 @@ enum class InternalRayTracingCsType : uint32 BuildBVH, BuildBVHTD, BuildBVHTDTR, - BuildBVHPLOC, + BuildPLOC, +#if GPURT_CLIENT_INTERFACE_MAJOR_VERSION < 52 + BuildBVHPLOC = BuildPLOC, +#endif UpdateQBVH, UpdateParallel, RefitBounds, @@ -311,6 +314,9 @@ enum class InternalRayTracingCsType : uint32 InitExecuteIndirect, PairCompression, MergeSort, + MergeSortLocal, + MergeSortGlobalIteration, + MergeSortCopyLastLevel, UpdateTriangles, UpdateAabbs, InitAccelerationStructure, @@ -753,7 +759,9 @@ struct DeviceSettings uint32 enableParallelUpdate : 1; uint32 enableParallelBuild : 1; uint32 enablePrefixScanDLB : 1; +#if GPURT_CLIENT_INTERFACE_MAJOR_VERSION < 51 uint32 enableAcquireReleaseInterface : 1; +#endif uint32 enableBuildAccelStructDumping : 1; uint32 enableBuildAccelStructScratchDumping : 1; uint32 enableBuildAccelStructStats : 1; @@ -779,6 +787,7 @@ struct 
DeviceSettings uint32 enableRemapScratchBuffer : 1; // Enable remapping bvh2 data from ScratchBuffer to ResultBuffer uint32 checkBufferOverlapsInBatch : 1; + uint32 disableCompaction : 1; // Reports and perform copy instead of compaction }; uint64 accelerationStructureUUID; // Acceleration Structure UUID diff --git a/gpurt/gpurtBuildSettings.h b/gpurt/gpurtBuildSettings.h index cce62a4..5c73247 100644 --- a/gpurt/gpurtBuildSettings.h +++ b/gpurt/gpurtBuildSettings.h @@ -69,7 +69,7 @@ struct CompileTimeBuildSettings uint32 enableTopDownBuild; uint32 useMortonCode30; uint32 enableMergeSort; - uint32 fastBuildThreshold; + uint32 unused14; uint32 enableFusedInstanceNode; float tsPriority; uint32 numRebraidIterations; @@ -99,7 +99,7 @@ struct CompileTimeBuildSettings uint32 unused11; uint32 unused12; uint32 unused13; - uint32 rebuildAccelStruct; + uint32 disableCompaction; }; #define BUILD_SETTINGS_DATA_TOP_LEVEL_BUILD_ID 0 @@ -119,7 +119,7 @@ struct CompileTimeBuildSettings #define BUILD_SETTINGS_DATA_ENABLE_TOP_DOWN_BUILD_ID 14 #define BUILD_SETTINGS_DATA_USE_MORTON_CODE_30_ID 15 #define BUILD_SETTINGS_DATA_ENABLE_MERGE_SORT_ID 16 -#define BUILD_SETTINGS_DATA_FAST_BUILD_THRESHOLD_ID 17 +// unused14 id 17 #define BUILD_SETTINGS_DATA_ENABLE_FUSED_INSTANCE_NODE_ID 18 #define BUILD_SETTINGS_DATA_TS_PRIORITY_ID 19 #define BUILD_SETTINGS_DATA_NUM_REBRAID_ITERATIONS_ID 20 @@ -135,7 +135,7 @@ struct CompileTimeBuildSettings #define BUILD_SETTINGS_DATA_ENCODE_ARRAY_OF_POINTERS_ID 41 #define BUILD_SETTINGS_DATA_SCENE_BOUNDS_CALCULATION_TYPE_ID 42 #define BUILD_SETTINGS_DATA_REBRAID_QUALITY_HEURISTIC_ID 43 -#define BUILD_SETTINGS_DATA_REBUILD_ACCELERATION_STRUCTURE_ID 47 +#define BUILD_SETTINGS_DATA_DISABLE_COMPACTION_ID 47 #ifdef __cplusplus } // namespace GpuRt diff --git a/gpurt/gpurtDispatch.h b/gpurt/gpurtDispatch.h index b6fb1b9..8f4ce03 100644 --- a/gpurt/gpurtDispatch.h +++ b/gpurt/gpurtDispatch.h @@ -70,11 +70,11 @@ struct DispatchRaysConstantData uint32 
missTableBaseAddressLo; // Miss shader table base address low 32-bits uint32 missTableBaseAddressHi; // Miss shader table base address high 32-bits uint32 missTableStrideInBytes; // Miss shader table record byte stride - uint32 reserved0; // Reserved padding + uint32 rayDispatchMaxGroups; // Max groups dispatched if persistent launch is enabled, else 0 uint32 hitGroupTableBaseAddressLo; // Hit group table base address low 32-bits uint32 hitGroupTableBaseAddressHi; // Hit group table base address high 32-bits uint32 hitGroupTableStrideInBytes; // Hit group table record byte stride - uint32 reserved1; // Reserved padding + uint32 reserved0; // Reserved padding uint32 callableTableBaseAddressLo; // Callable shader table base address low 32-bits uint32 callableTableBaseAddressHi; // Callable shader table base address high 32-bits uint32 callableTableStrideInBytes; // Callable shader table byte stride @@ -146,6 +146,7 @@ struct InitExecuteIndirectConstants uint32 rtThreadGroupSizeX; // Internal RT threadgroup size X uint32 rtThreadGroupSizeY; // Internal RT threadgroup size Y uint32 rtThreadGroupSizeZ; // Internal RT threadgroup size Z + uint32 rayDispatchMaxGroups; // Max groups dispatched if persistent launch is enabled, else 0 uint32 counterMask; // Mask for filtering ray history token uint32 pipelineCount; // Number of pipelines to launch (1 for indirect launch, raygen count for unified) uint32 maxIterations; // Max traversal interations for profiling @@ -160,7 +161,6 @@ struct InitExecuteIndirectConstants uint32 counterRayIdRangeEnd; // Counter ray ID range end uint32 cpsBackendStackSize; // Scratch memory used by a compiler backend, start at offset 0 uint32 padding0; // Padding for 16-byte alignment - uint32 padding1; // Padding for 16-byte alignment #if __cplusplus // Internal counter buffer SRDs diff --git a/gpurt/gpurtLib.h b/gpurt/gpurtLib.h index b607c3e..0fc8001 100644 --- a/gpurt/gpurtLib.h +++ b/gpurt/gpurtLib.h @@ -42,7 +42,7 @@ namespace GpuRt // update 
their definition of GPURT_CLIENT_INTERFACE_MAJOR_VERSION to indicate that they have made the required changes // to support a new version. When the client version is updated, the old interface will be compiled out and only the // new one will remain. -#define GPURT_INTERFACE_MAJOR_VERSION 49 +#define GPURT_INTERFACE_MAJOR_VERSION 52 #if GPURT_CLIENT_INTERFACE_MAJOR_VERSION < 44 // Minor interface version. This number is incrememnted when a compatible interface change is made. Compatible changes diff --git a/src/gpurtBvhBatcher.cpp b/src/gpurtBvhBatcher.cpp index 08ee242..ba70d10 100644 --- a/src/gpurtBvhBatcher.cpp +++ b/src/gpurtBvhBatcher.cpp @@ -200,7 +200,14 @@ void BvhBatcher::BuildRaytracingAccelerationStructureBatch( } if ((updaters.IsEmpty() == false) || (builders.IsEmpty() == false)) { - Barrier(); + uint32 barrierFlags = BarrierFlagSyncDispatch; + if (updaters.IsEmpty() == false) + { + // Updates can be launched with indirect dispatch. We need to avoid fetching the indirect arguments + // from the header before they are written by a previous build/update/copy. + barrierFlags |= BarrierFlagSyncIndirectArg; + } + Barrier(barrierFlags); } RGP_POP_MARKER(); @@ -304,11 +311,72 @@ void BvhBatcher::BuildMultiDispatch(Util::Span builders) if (PhaseEnabled(BuildPhaseFlags::MergeSort)) { Barrier(); - const uint32 wavesPerSimd = builders.size() == 1 ? 16U : 2U; - BuildFunction(BuildPhaseFlags::MergeSort, builders, [wavesPerSimd](BvhBuilder& builder) + + if (builders.size() > 1) { - builder.MergeSort(wavesPerSimd); - }); + const uint32 wavesPerSimd = 2U; + BuildFunction(BuildPhaseFlags::MergeSort, builders, [wavesPerSimd](BvhBuilder& builder) + { + builder.MergeSort(wavesPerSimd); + }); + } + else + { + RGP_PUSH_MARKER("Merge Sort"); + + // Batch local sorts together. + BuildPhase("Merge Sort (Local)", builders, &BvhBuilder::MergeSortLocal); + + Barrier(); + + // Batch global sort iterations together. 
Compute max iterations amongst the builder batch + uint32 maxMergeSortTreeLevel = 0; + + bool batchNeedsLastLevelCopy = false; + + for (const auto& builder : builders) + { + const uint32 mergeSortTreeLevel = builder.GetMaxMergeSortTreeLevel(); + maxMergeSortTreeLevel = Util::Max(maxMergeSortTreeLevel, mergeSortTreeLevel); + batchNeedsLastLevelCopy |= ((mergeSortTreeLevel & 1) == 1); + } + + if (maxMergeSortTreeLevel > 0) + { + RGP_PUSH_MARKER("Merge Sort (Global Iteration)"); + for (uint32 level = 1; level <= maxMergeSortTreeLevel; level++) + { + Barrier(); + + BuildFunction(nullptr, builders, [level](BvhBuilder& builder) + { + if (level <= builder.GetMaxMergeSortTreeLevel()) + { + builder.MergeSortGlobalIteration(level); + } + }); + } + RGP_POP_MARKER(); + + if (batchNeedsLastLevelCopy) + { + Barrier(); + + RGP_PUSH_MARKER("Merge Sort (Copy Last Level)"); + BuildFunction(nullptr, builders, [](BvhBuilder& builder) + { + const uint32 mergeSortTreeLevel = builder.GetMaxMergeSortTreeLevel(); + if ((mergeSortTreeLevel & 1) == 1) + { + builder.MergeSortCopyLastLevel(); + } + }); + RGP_POP_MARKER(); + } + } + + RGP_POP_MARKER(); + } } if (PhaseEnabled(BuildPhaseFlags::RadixSort)) { @@ -327,13 +395,13 @@ void BvhBatcher::BuildMultiDispatch(Util::Span builders) Barrier(); BuildPhase(BuildPhaseFlags::BuildFastAgglomerativeLbvh, builders, &BvhBuilder::BuildFastAgglomerativeLbvh); } - if (PhaseEnabled(BuildPhaseFlags::BuildBVHPLOC)) + if (PhaseEnabled(BuildPhaseFlags::BuildPLOC)) { Barrier(); const uint32 wavesPerSimd = builders.size() == 1 ? 
8U : 1U; - BuildFunction(BuildPhaseFlags::BuildBVHPLOC, builders, [wavesPerSimd](BvhBuilder& builder) + BuildFunction(BuildPhaseFlags::BuildPLOC, builders, [wavesPerSimd](BvhBuilder& builder) { - builder.BuildBVHPLOC(wavesPerSimd); + builder.BuildPLOC(wavesPerSimd); }); } if (PhaseEnabled(BuildPhaseFlags::RefitBounds)) diff --git a/src/gpurtBvhBuilder.cpp b/src/gpurtBvhBuilder.cpp index feb9e0b..47e4043 100644 --- a/src/gpurtBvhBuilder.cpp +++ b/src/gpurtBvhBuilder.cpp @@ -1523,7 +1523,7 @@ void BvhBuilder::InitGeometryConstants() void* pVbvTable = m_pDevice->AllocateDescriptorTable(m_cmdBuffer, geometryCount, &m_geomBufferSrdTable); void* pCbvTable = m_pDevice->AllocateDescriptorTable(m_cmdBuffer, geometryCount, &m_geomConstSrdTable); - const uint32 srdSizeBytes = m_pDevice->GetBufferSrdSizeDw() * sizeof(uint32); + const uint32 srdSizeBytes = m_pDevice->GetUntypedBufferSrdSizeDw() * sizeof(uint32); for (uint32 i = 0; i < geometryCount; i++) { @@ -2201,7 +2201,6 @@ void BvhBuilder::InitBuildSettings() m_buildSettings.rebraidType = static_cast(m_buildConfig.rebraidType); m_buildSettings.enableTopDownBuild = m_buildConfig.topDownBuild; m_buildSettings.useMortonCode30 = m_deviceSettings.enableMortonCode30; - m_buildSettings.fastBuildThreshold = m_deviceSettings.fastBuildThreshold; m_buildSettings.enableFusedInstanceNode = m_deviceSettings.enableFusedInstanceNode; m_buildSettings.enableMergeSort = m_buildConfig.enableMergeSort; @@ -2248,7 +2247,9 @@ void BvhBuilder::InitBuildSettings() m_buildSettings.updateFlags = m_buildArgs.inputs.flags & (AccelStructBuildFlagPerformUpdate | AccelStructBuildFlagAllowUpdate); - m_buildSettings.rebuildAccelStruct = m_buildConfig.rebuildAccelStruct; + + // Rebuilding an updateable acceleration structure need to use the original size and not compacted one. 
+ m_buildSettings.disableCompaction = m_buildConfig.rebuildAccelStruct || m_deviceSettings.disableCompaction; m_buildSettings.isUpdateInPlace = IsUpdateInPlace(); m_buildSettings.encodeArrayOfPointers = @@ -2821,7 +2822,14 @@ void BvhBuilder::EmitAccelerationStructurePostBuildInfo( break; case AccelStructPostBuildInfoType::CompactedSize: - EmitASCompactedType(postBuildInfo); + if (m_deviceSettings.disableCompaction) + { + EmitASCurrentSize(postBuildInfo); + } + else + { + EmitASCompactedType(postBuildInfo); + } break; case AccelStructPostBuildInfoType::ToolsVisualization: @@ -2990,7 +2998,14 @@ void BvhBuilder::CopyAccelerationStructure( break; case AccelStructCopyMode::Compact: - CopyASCompactMode(copyArgs); + if (m_deviceSettings.disableCompaction) + { + CopyASCloneMode(copyArgs); + } + else + { + CopyASCompactMode(copyArgs); + } break; case AccelStructCopyMode::Serialize: @@ -3257,7 +3272,7 @@ BuildPhaseFlags BvhBuilder::EnabledPhases() const } if (m_buildConfig.buildMode == BvhBuildMode::PLOC) { - flags |= BuildPhaseFlags::BuildBVHPLOC; + flags |= BuildPhaseFlags::BuildPLOC; } if (AllowLatePairCompression()) { @@ -3345,6 +3360,78 @@ void BvhBuilder::MergeSort( RGP_POP_MARKER(); } +// ===================================================================================================================== +// Executes merge sort shader to sort the input keys and values +void BvhBuilder::MergeSortLocal() +{ + PAL_ASSERT(m_buildConfig.enableMergeSort); + + BindPipeline(InternalRayTracingCsType::MergeSortLocal); + + WriteBuildBufferBindings(); + + RGP_PUSH_MARKER("Merge Sort Local (maxNumPrimitives %u)", m_buildConfig.maxNumPrimitives); + + const uint32 tgSize = 512; + Dispatch(Util::RoundUpQuotient(m_buildConfig.maxNumPrimitives, tgSize)); + + RGP_POP_MARKER(); +} + +// ===================================================================================================================== +uint32 BvhBuilder::GetMaxMergeSortTreeLevel() const +{ + const uint32 tgSize = 512; 
+ + const uint32 groupSize = tgSize; + const uint32 numKeysPerThread = 2u; + const uint32 groupCapacity = groupSize * numKeysPerThread; + const uint32 numLocalSortedGroups = Util::RoundUpQuotient(m_buildConfig.maxNumPrimitives, groupCapacity); + const uint32 numLevelsOfMergeTree = Util::CeilLog2(numLocalSortedGroups); + + return numLevelsOfMergeTree; +} + +// ===================================================================================================================== +// Executes merge sort shader to sort the input keys and values +void BvhBuilder::MergeSortGlobalIteration( + uint32 level) +{ + PAL_ASSERT(m_buildConfig.enableMergeSort); + + BindPipeline(InternalRayTracingCsType::MergeSortGlobalIteration); + + const BuildShaderRootConstants1 constants = { + .passIndex = level, + }; + WriteBuildBufferBindings(constants); + + RGP_PUSH_MARKER("Merge Sort Global Iteration (maxNumPrimitives %u, level %u)", m_buildConfig.maxNumPrimitives, level); + + const uint32 tgSize = 512; + Dispatch(Util::RoundUpQuotient(m_buildConfig.maxNumPrimitives, tgSize)); + + RGP_POP_MARKER(); +} + +// ===================================================================================================================== +// Executes merge sort shader to sort the input keys and values +void BvhBuilder::MergeSortCopyLastLevel() +{ + PAL_ASSERT(m_buildConfig.enableMergeSort); + + BindPipeline(InternalRayTracingCsType::MergeSortCopyLastLevel); + + WriteBuildBufferBindings(); + + RGP_PUSH_MARKER("Merge Sort Copy (maxNumPrimitives %u)", m_buildConfig.maxNumPrimitives); + + const uint32 tgSize = 512; + Dispatch(Util::RoundUpQuotient(m_buildConfig.maxNumPrimitives, tgSize)); + + RGP_POP_MARKER(); +} + // ===================================================================================================================== // Returns true when the builder uses the Rebraid phase bool BvhBuilder::AllowRebraid() const @@ -3476,10 +3563,10 @@ void BvhBuilder::BuildBVHTD() // 
===================================================================================================================== // Executes the build BVH PLOC shader -void BvhBuilder::BuildBVHPLOC( +void BvhBuilder::BuildPLOC( uint32 wavesPerSimd) { - BindPipeline(InternalRayTracingCsType::BuildBVHPLOC); + BindPipeline(InternalRayTracingCsType::BuildPLOC); const uint32 tgSize = 256u; const uint32 numThreadGroups = GetNumPersistentThreadGroups(m_buildConfig.maxNumPrimitives, tgSize, wavesPerSimd); @@ -3664,9 +3751,7 @@ void BvhBuilder::EncodeHwBvh() } const uint32 nodeCount = GetNumInternalNodeCount(); - const uint32 numThreadGroups = - m_buildSettings.topLevelBuild ? Util::RoundUpQuotient(nodeCount, DefaultThreadGroupSize) : - GetNumPersistentThreadGroups(nodeCount); + const uint32 numThreadGroups = Util::RoundUpQuotient(nodeCount, DefaultThreadGroupSize); BuildShaderRootConstants0 shaderConstants = { diff --git a/src/gpurtBvhBuilder.h b/src/gpurtBvhBuilder.h index 4af762f..806c518 100644 --- a/src/gpurtBvhBuilder.h +++ b/src/gpurtBvhBuilder.h @@ -341,7 +341,7 @@ class BvhBuilder void BuildBVHTD(); - void BuildBVHPLOC(uint32 wavesPerSimd); + void BuildPLOC(uint32 wavesPerSimd); void BuildFastAgglomerativeLbvh(); @@ -374,6 +374,10 @@ class BvhBuilder uint32 numElems); void MergeSort(uint32 wavesPerSimd); + void MergeSortLocal(); + void MergeSortGlobalIteration(uint32 level); + void MergeSortCopyLastLevel(); + uint32 GetMaxMergeSortTreeLevel() const; void SortRadixInt32(); void ScanExclusiveAdd( diff --git a/src/gpurtBvhBuilderCommon.h b/src/gpurtBvhBuilderCommon.h index 5953044..435ca0d 100644 --- a/src/gpurtBvhBuilderCommon.h +++ b/src/gpurtBvhBuilderCommon.h @@ -49,7 +49,7 @@ enum class BuildPhaseFlags : uint32_t MergeSort = 1 << 4, RadixSort = 1 << 5, BuildBVH = 1 << 6, - BuildBVHPLOC = 1 << 7, + BuildPLOC = 1 << 7, RefitBounds = 1 << 8, PairCompression = 1 << 9, SeparateEmitPostBuildInfoPass = 1 << 12, @@ -78,8 +78,8 @@ static const char* BuildPhaseName(BuildPhaseFlags 
phase) return "RadixSort"; case GpuRt::BuildPhaseFlags::BuildBVH: return "BuildBVH"; - case GpuRt::BuildPhaseFlags::BuildBVHPLOC: - return "BuildBVHPLOC"; + case GpuRt::BuildPhaseFlags::BuildPLOC: + return "BuildPLOC"; case GpuRt::BuildPhaseFlags::RefitBounds: return "RefitBounds"; case GpuRt::BuildPhaseFlags::PairCompression: diff --git a/src/gpurtDevice.cpp b/src/gpurtDevice.cpp index b1368f0..6058d33 100644 --- a/src/gpurtDevice.cpp +++ b/src/gpurtDevice.cpp @@ -387,6 +387,8 @@ Device::Device( m_tlasCaptureList(this), m_isTraceActive(false), m_accelStructTraceSource(this), + m_typedBufferSrdSizeDw{}, + m_untypedBufferSrdSizeDw{}, m_rayHistoryTraceSource(this), #if GPURT_ENABLE_GPU_DEBUG m_debugMonitor(this), @@ -421,7 +423,8 @@ Pal::Result Device::Init() Pal::DeviceProperties props = {}; m_info.pPalDevice->GetProperties(&props); - m_bufferSrdSizeDw = props.gfxipProperties.srdSizes.bufferView / sizeof(uint32); + m_typedBufferSrdSizeDw = props.gfxipProperties.srdSizes.typedBufferView / sizeof(uint32); + m_untypedBufferSrdSizeDw = props.gfxipProperties.srdSizes.untypedBufferView / sizeof(uint32); if (m_info.deviceSettings.emulatedRtIpLevel == Pal::RayTracingIpLevel::None) { @@ -812,7 +815,7 @@ void* Device::AllocateDescriptorTable( uint32 count, gpusize* pGpuAddress) const { - const uint32 srdSizeBytes = m_bufferSrdSizeDw * sizeof(uint32); + const uint32 srdSizeBytes = m_typedBufferSrdSizeDw * sizeof(uint32); const uint32 srdBufferSizeBytes = srdSizeBytes * count; return AllocateTemporaryData(cmdBuffer, srdBufferSizeBytes, pGpuAddress); } @@ -828,7 +831,7 @@ uint32 Device::WriteBufferSrdTable( { gpusize tableVa; void* pTable = AllocateDescriptorTable(cmdBuffer, count, &tableVa); - const uint32 srdSizeBytes = m_bufferSrdSizeDw * sizeof(uint32); + const uint32 srdSizeBytes = (typedBuffer ? 
m_typedBufferSrdSizeDw : m_untypedBufferSrdSizeDw) * sizeof(uint32); for (uint32 i = 0; i < count; i++) { @@ -2134,6 +2137,10 @@ const AccelStructBuildInputs Device::OverrideBuildInputs( { buildInputs.flags &= ~(GpuRt::AccelStructBuildFlagAllowUpdate | GpuRt::AccelStructBuildFlagPerformUpdate); } + if (Settings().disableCompaction) + { + buildInputs.flags &= ~(GpuRt::AccelStructBuildFlagAllowCompaction); + } return buildInputs; } diff --git a/src/gpurtInternal.h b/src/gpurtInternal.h index d5f0251..7cf7f2c 100644 --- a/src/gpurtInternal.h +++ b/src/gpurtInternal.h @@ -231,14 +231,6 @@ using InternalPipelineMap = std::unordered_map; -//===================================================================================================================== -// different ways to encode the scene bounds used to generate morton codes -enum class SceneBoundsCalculation : uint32 -{ - BasedOnGeometry = 0, - BasedOnGeometryWithSize -}; - namespace Internal { // ===================================================================================================================== @@ -691,8 +683,11 @@ class Device : public IDevice virtual bool ShouldUseGangedAceForBuild(const AccelStructBuildInputs& inputs) const override; - // Returns size in DWORDs of a buffer view SRD - uint32 GetBufferSrdSizeDw() const { return m_bufferSrdSizeDw; }; + // Returns size in DWORDs of a typed buffer view SRD + uint32 GetTypedBufferSrdSizeDw() const { return m_typedBufferSrdSizeDw; }; + + // Returns size in DWORDs of a untyped buffer view SRD + uint32 GetUntypedBufferSrdSizeDw() const { return m_untypedBufferSrdSizeDw; }; Pal::RayTracingIpLevel GetRtIpLevel() const { return m_rtIpLevel; } @@ -736,7 +731,8 @@ class Device : public IDevice Util::Mutex m_traceBvhLock; bool m_isTraceActive; GpuRt::AccelStructTraceSource m_accelStructTraceSource; - uint32 m_bufferSrdSizeDw; + uint32 m_typedBufferSrdSizeDw; + uint32 m_untypedBufferSrdSizeDw; ClientCallbacks m_clientCb; Pal::RayTracingIpLevel m_rtIpLevel; 
// the actual RTIP level GPURT is using, // is based on emulatedRtIpLevel and the actual device. diff --git a/src/gpurtInternalShaders.cpp b/src/gpurtInternalShaders.cpp index 64bce5f..2f4eec5 100644 --- a/src/gpurtInternalShaders.cpp +++ b/src/gpurtInternalShaders.cpp @@ -70,7 +70,7 @@ const PipelineBuildInfo InternalPipelineBuildInfo[size_t(InternalRayTracingCsTyp PIPELINE_BUILD_BVH_INFO(BuildBVH), PIPELINE_BUILD_BVH_INFO(BuildBVHTD), PIPELINE_BUILD_BVH_INFO(BuildBVHTDTR), - PIPELINE_BUILD_BVH_INFO(BuildBVHPLOC), + PIPELINE_BUILD_BVH_INFO(BuildPLOC), PIPELINE_BUILD_INFO(UpdateQBVH), PIPELINE_BUILD_INFO(UpdateParallel), PIPELINE_BUILD_BVH_INFO(RefitBounds), @@ -96,6 +96,9 @@ const PipelineBuildInfo InternalPipelineBuildInfo[size_t(InternalRayTracingCsTyp PIPELINE_BUILD_INFO(InitExecuteIndirect), PIPELINE_BUILD_BVH_INFO(PairCompression), PIPELINE_BUILD_BVH_INFO(MergeSort), + PIPELINE_BUILD_BVH_INFO(MergeSortLocal), + PIPELINE_BUILD_BVH_INFO(MergeSortGlobalIteration), + PIPELINE_BUILD_BVH_INFO(MergeSortCopyLastLevel), PIPELINE_BUILD_INFO(UpdateTriangles), PIPELINE_BUILD_INFO(UpdateAabbs), PIPELINE_BUILD_INFO(InitAccelerationStructure), diff --git a/src/options.yaml b/src/options.yaml index 80b15d4..4ea170e 100644 --- a/src/options.yaml +++ b/src/options.yaml @@ -54,3 +54,20 @@ Options: cpsCandidatePrimitiveMode: Type: CpsCandidatePrimitiveMode Default: CpsCandidatePrimitiveMode::SuspendLane + + persistentLaunchEnabled: + Type: uint32 + Default: 0 + + rayFlagsOverrideForceEnableMask: + # The incoming TraceRay ray flags are ORed with this mask before use, allowing to force-enable specific flags. + # In case of conflicting bits with the disable mask, the enable mask wins, as it is applied after the disable mask. + # Only supported with continuations. + Type: uint32 + Default: 0 + + rayFlagsOverrideForceDisableMask: + # The incoming TraceRay ray flags are ANDed with the bitwise inverse of this mask before use, allowing to force-disable specific flags. 
+ # Only supported with continuations. + Type: uint32 + Default: 0 diff --git a/src/shaders/BuildBVHTDTR.hlsl b/src/shaders/BuildBVHTDTR.hlsl index ed21d28..7d1d71f 100644 --- a/src/shaders/BuildBVHTDTR.hlsl +++ b/src/shaders/BuildBVHTDTR.hlsl @@ -22,6 +22,8 @@ * SOFTWARE. * **********************************************************************************************************************/ +#include "../shadersClean/common/BoundingBox.hlsli" + #define USE_SAH 1 //===================================================================================================================== // 32 bit constants @@ -46,6 +48,142 @@ struct TDArgs #define INVALID_IDX 0xffffffff #define TD_EPSILON 0.99999 +//===================================================================================================================== +#define REF_SCRATCH_SIDE_LEFT 0 +#define REF_SCRATCH_SIDE_RIGHT 1 +#define REF_SCRATCH_SIDE_LEAF 2 + +struct TDRefScratch +{ + uint primitiveIndex; + uint nodeIndex; + float3 center; + BoundingBox box; + uint side; +#if USE_BVH_REBRAID + uint nodePointer; //rebraid only +#endif +#if USE_BLAS_PRIM_COUNT + uint numPrimitives; +#endif +}; + +#define TD_REF_PRIM_INDEX_OFFSET 0 +#define TD_REF_NODE_INDEX_OFFSET 4 +#define TD_REF_CENTER_OFFSET 8 +#define TD_REF_BOX_OFFSET 20 +#define TD_REF_SIDE_OFFSET (TD_REF_BOX_OFFSET + sizeof(BoundingBox)) +#define TD_REF_NODE_POINTER_OFFSET (TD_REF_SIDE_OFFSET + 4) +#if USE_BLAS_PRIM_COUNT +#define TD_REF_NUM_PRIM_OFFSET (TD_REF_NODE_POINTER_OFFSET + sizeof(uint)) +#endif + +//===================================================================================================================== +#define NUM_SPLIT_BINS 4 + +#define TD_NODE_REBRAID_STATE_OPEN 0 +#define TD_NODE_REBRAID_STATE_CLOSED 1 + +struct TDBins +{ + uint64_t firstRefIndex; + + UintBoundingBox binBoxes[3][NUM_SPLIT_BINS]; + uint binPrimCount[3][NUM_SPLIT_BINS]; + + uint bestAxis; + uint bestSplit; + uint numLeft; + uint numRight; + +#if USE_BLAS_PRIM_COUNT + 
uint binBLASPrimCount[3][NUM_SPLIT_BINS]; +#endif +}; + +#define TD_BINS_FIRST_REF_INDEX_OFFSET 0 +#define TD_BINS_BIN_BOXES_OFFSET (TD_BINS_FIRST_REF_INDEX_OFFSET + 8) +#define TD_BINS_BIN_PRIM_COUNT_OFFSET (TD_BINS_BIN_BOXES_OFFSET + sizeof(UintBoundingBox) * NUM_SPLIT_BINS * 3) +#define TD_BINS_BEST_AXIS_OFFSET (TD_BINS_BIN_PRIM_COUNT_OFFSET + sizeof(uint) * NUM_SPLIT_BINS * 3) +#define TD_BINS_BEST_SPLIT_OFFSET (TD_BINS_BEST_AXIS_OFFSET + 4) +#define TD_BINS_NUM_LEFT_OFFSET (TD_BINS_BEST_SPLIT_OFFSET + 4) +#define TD_BINS_NUM_RIGHT_OFFSET (TD_BINS_NUM_LEFT_OFFSET + 4) +#if USE_BLAS_PRIM_COUNT +#define TD_BINS_BLAS_PRIM_COUNT_OFFSET (TD_BINS_NUM_RIGHT_OFFSET + 4) +#endif + +struct TDNode +{ + UintBoundingBox centroidBox; + uint binsIndex; + uint childCount; + +#if USE_BVH_REBRAID + uint largestAxis; // rebraid only + float largestWidth; // rebraid only + uint rebraidState; // rebraid only + uint primIndex; // rebraid only +#endif +}; + +#define TD_NODE_CENTROID_BOX_OFFSET 0 +#define TD_NODE_BINS_INDEX_OFFSET (TD_NODE_CENTROID_BOX_OFFSET + sizeof(UintBoundingBox)) +#define TD_NODE_CHILD_COUNT_OFFSET (TD_NODE_BINS_INDEX_OFFSET + 4) +#define TD_NODE_LARGEST_AXIS_OFFSET (TD_NODE_CHILD_COUNT_OFFSET + 4) +#define TD_NODE_LARGEST_WIDTH_OFFSET (TD_NODE_LARGEST_AXIS_OFFSET + 4) +#define TD_NODE_REBRAID_STATE_OFFSET (TD_NODE_LARGEST_WIDTH_OFFSET + 4) +#define TD_NODE_PRIM_INDEX_OFFSET (TD_NODE_REBRAID_STATE_OFFSET + 4) + +//===================================================================================================================== + +#define TD_REBRAID_STATE_NO_OPEN 0 +#define TD_REBRAID_STATE_NEED_OPEN 1 +#define TD_REBRAID_STATE_OOM 2 + +#define TD_PHASE_INIT_STATE 0 +#define TD_PHASE_INIT_REFS_TO_LEAVES 1 +#define TD_PHASE_CHECK_NEED_ALLOC 2 +#define TD_PHASE_ALLOC_ROOT_NODE 3 +#define TD_PHASE_REBRAID_COUNT_OPENINGS 4 +#define TD_PHASE_REBRAID_CHECK_TERMINATION 5 +#define TD_PHASE_REBRAID_OPEN 6 +#define TD_PHASE_REBRAID_UPDATE_NODES 7 +#define 
TD_PHASE_BIN_REFS 8 +#define TD_PHASE_FIND_BEST_SPLIT 9 +#define TD_PHASE_SECOND_PASS 10 +#define TD_PHASE_UPDATE_NEW_NODES 11 +#define TD_PHASE_DONE 12 + +struct StateTDBuild +{ + uint numNodes; + uint numProcessedNodes; + uint numNodesAllocated; + uint numRefs; + uint numRefsAllocated; + uint numInactiveInstance; + UintBoundingBox rootCentroidBBox; + uint numLeaves; + uint binsCounter; + +#if USE_BVH_REBRAID + uint rebraidState; + uint leafAllocOffset; +#endif +}; + +#define STATE_TD_NUM_NODES_OFFSET 0 +#define STATE_TD_NUM_PROCESSED_NODES_OFFSET 4 +#define STATE_TD_NUM_NODES_ALLOCATED_OFFSET 8 +#define STATE_TD_NUM_REFS_OFFSET 12 +#define STATE_TD_NUM_REFS_ALLOCATED_OFFSET 16 +#define STATE_TD_NUM_INACTIVE_INSTANCE_OFFSET 20 +#define STATE_TD_CENTROID_BBOX_OFFSET 24 +#define STATE_TD_NUM_LEAVES_OFFSET (STATE_TD_CENTROID_BBOX_OFFSET + sizeof(UintBoundingBox)) +#define STATE_TD_BINS_COUNTER_OFFSET (STATE_TD_NUM_LEAVES_OFFSET + 4) +#define STATE_TD_REBRAID_STATE_OFFSET (STATE_TD_BINS_COUNTER_OFFSET + 4) +#define STATE_TD_LEAF_ALLOC_OFFSET_OFFSET (STATE_TD_REBRAID_STATE_OFFSET + 4) + #if NO_SHADER_ENTRYPOINT == 0 #define USE_LDS 1 diff --git a/src/shaders/BuildCommon.hlsl b/src/shaders/BuildCommon.hlsl index 4ecf131..5746130 100644 --- a/src/shaders/BuildCommon.hlsl +++ b/src/shaders/BuildCommon.hlsl @@ -325,33 +325,6 @@ float3 Uint3ToFloat3(in uint3 v) return asfloat(v); } -//===================================================================================================================== -// Divide uints and round up -uint RoundUpQuotient( - uint dividend, - uint divisor) -{ - return (dividend + divisor - 1) / divisor; -} - -//===================================================================================================================== -// Divide ints and round up -int RoundUpQuotient( - int dividend, - int divisor) -{ - return (dividend + divisor - 1) / divisor; -} - 
-//===================================================================================================================== -// Divide ints and round up -uint64_t RoundUpQuotient( - uint64_t dividend, - uint64_t divisor) -{ - return (dividend + divisor - 1) / divisor; -} - //===================================================================================================================== static uint32_t GetNumInternalNodeCount( in uint32_t primitiveCount) @@ -678,9 +651,9 @@ uint PackInstanceMaskAndNodeFlags( uint PackScratchNodeFlags( uint instanceInclusionMask, uint nodeFlags, - uint triangleId) + uint quadSwizzle) { - const uint packedFlags = (triangleId << 16) | PackInstanceMaskAndNodeFlags(instanceInclusionMask, nodeFlags); + const uint packedFlags = (quadSwizzle << 16) | PackInstanceMaskAndNodeFlags(instanceInclusionMask, nodeFlags); return packedFlags; } diff --git a/src/shaders/BuildCommonScratch.hlsl b/src/shaders/BuildCommonScratch.hlsl index cd54497..b0d3197 100644 --- a/src/shaders/BuildCommonScratch.hlsl +++ b/src/shaders/BuildCommonScratch.hlsl @@ -47,7 +47,7 @@ #ifndef _BUILDCOMMONSCRATCH_HLSL #define _BUILDCOMMONSCRATCH_HLSL -#include "../shared/scratchNode.h" +#include "../shadersClean/common/ScratchNode.hlsli" #include "BuildCommon.hlsl" #include "BuildCommonScratchGlobal.hlsl" #include "TaskCounter.hlsl" @@ -853,16 +853,17 @@ void RefitNode( } //===================================================================================================================== -static TriangleData GetScratchNodeTrianglePairVertices( +static TriangleData GetScratchNodeQuadVertices( in uint scratchNodesOffset, in uint nodeIndex, in uint triangleIndex) { - const uint nodeType = (triangleIndex == 0) ? 
NODE_TYPE_TRIANGLE_0 : NODE_TYPE_TRIANGLE_1; - const uint packedFlags = FETCH_SCRATCH_NODE_DATA(uint, scratchNodesOffset, nodeIndex, SCRATCH_NODE_FLAGS_OFFSET); - uint3 indices = CalcTriangleCompressionVertexIndices(nodeType, ExtractScratchNodeTriangleId(packedFlags)); + const uint quadSwizzle = ExtractScratchNodeQuadSwizzle(packedFlags); + const uint triSwizzle = (quadSwizzle >> (triangleIndex * 4)) & 0xFF; + + uint3 indices = ComputeQuadTriangleVertexIndex(triangleIndex, triSwizzle); TriangleData tri; diff --git a/src/shaders/BuildFastAgglomerativeLbvh.hlsl b/src/shaders/BuildFastAgglomerativeLbvh.hlsl index 5c4a810..526053c 100644 --- a/src/shaders/BuildFastAgglomerativeLbvh.hlsl +++ b/src/shaders/BuildFastAgglomerativeLbvh.hlsl @@ -104,15 +104,14 @@ uint32_t Delta30( const int leftCode = ScratchBuffer.Load(mortonCodesOffset + (left * sizeof(int))); const int rightCode = ScratchBuffer.Load(mortonCodesOffset + (right * sizeof(int))); - // logical xor can be used instead of finding the index of the highest differing bit as we can compare the numbers. - // The higher the index of the differing bit, the larger the number - return (leftCode != rightCode) ? (leftCode ^ rightCode) : (left ^ right); + // returns number of matching bits starting from MSB + return (leftCode != rightCode) ? 
clz(leftCode ^ rightCode) : (32 + clz(left ^ right)); } //===================================================================================================================== // This function indicates a distance metric between the two keys where each internal node splits the hierarchy // Optionally, we can use the squared distance to compute the distance between two centroids -uint64_t Delta64( +uint32_t Delta64( uint mortonCodesOffset, uint id) { @@ -123,9 +122,8 @@ uint64_t Delta64( const uint64_t leftCode = ScratchBuffer.Load(mortonCodesOffset + (left * sizeof(uint64_t))); const uint64_t rightCode = ScratchBuffer.Load(mortonCodesOffset + (right * sizeof(uint64_t))); - // logical xor can be used instead of finding the index of the highest differing bit as we can compare the numbers. - // The higher the index of the differing bit, the larger the number - return (leftCode != rightCode) ? (leftCode ^ rightCode) : (left ^ right); + // returns number of matching bits starting from MSB + return (leftCode != rightCode) ? 
clz64(leftCode ^ rightCode) : (64 + clz64(left ^ right)); } //===================================================================================================================== @@ -137,11 +135,11 @@ bool IsSplitRight( { if (useMortonCode30) { - return (Delta30(mortonCodesOffset, right) < Delta30(mortonCodesOffset, left - 1)); + return (Delta30(mortonCodesOffset, right) > Delta30(mortonCodesOffset, left - 1)); } else { - return (Delta64(mortonCodesOffset, right) < Delta64(mortonCodesOffset, left - 1)); + return (Delta64(mortonCodesOffset, right) > Delta64(mortonCodesOffset, left - 1)); } } @@ -173,6 +171,21 @@ void FastAgglomerativeLbvhImpl( // Total number of internal nodes is N - 1 const uint numInternalNodes = args.numActivePrims - 1; + if (numInternalNodes == 0) + { + if (primitiveIndex == 0) + { + const uint rootIndex = FetchSortedPrimIndex(args.sortedPrimIndicesOffset, 0); + { + // Store invalid index as parent of root + WriteScratchNodeData(args.baseScratchNodesOffset, rootIndex, SCRATCH_NODE_PARENT_OFFSET, 0xffffffff); + } + + WriteRootNodeIndex(args.rootNodeIndexOffset, rootIndex); + } + return; + } + // The root of the tree will be stored in the left child of the n-th internal node, where n represents the size of // the key array @@ -244,8 +257,11 @@ void FastAgglomerativeLbvhImpl( // the root node index and remove this conditional if (parentNodeIndex == numInternalNodes) { - // Store invalid index as parent of root - WriteScratchNodeData(args.baseScratchNodesOffset, currentNodeIndex, SCRATCH_NODE_PARENT_OFFSET, 0xffffffff); + { + // Store invalid index as parent of root + WriteScratchNodeData(args.baseScratchNodesOffset, currentNodeIndex, SCRATCH_NODE_PARENT_OFFSET, 0xffffffff); + } + // Store the index of the root node WriteRootNodeIndex(args.rootNodeIndexOffset, currentNodeIndex); // Do not write the parent node since it's invalid. 
@@ -286,7 +302,14 @@ void BuildFastAgglomerativeLbvh( const uint numActivePrims = ReadAccelStructHeaderField(ACCEL_STRUCT_HEADER_NUM_ACTIVE_PRIMS_OFFSET); const FastLBVHArgs args = GetFastLbvhArgs(numActivePrims); - if (globalId < numActivePrims) + if (numActivePrims == 0) + { + if (globalId == 0) + { + WriteRootNodeIndex(args.rootNodeIndexOffset, 0); + } + } + else if (globalId < numActivePrims) { FastAgglomerativeLbvhImpl(globalId, args); } diff --git a/src/shaders/BuildBVHPLOC.hlsl b/src/shaders/BuildPLOC.hlsl similarity index 99% rename from src/shaders/BuildBVHPLOC.hlsl rename to src/shaders/BuildPLOC.hlsl index effeb80..2c39642 100644 --- a/src/shaders/BuildBVHPLOC.hlsl +++ b/src/shaders/BuildPLOC.hlsl @@ -88,7 +88,7 @@ struct BuildPlocArgs #include "Common.hlsl" //===================================================================================================================== -#include "..\shared\rayTracingDefs.h" +#include "../shared/rayTracingDefs.h" #define GC_DSTMETADATA #define GC_SCRATCHBUFFER @@ -748,7 +748,7 @@ void UpdateClusterCount( } //===================================================================================================================== -void BuildBvhPlocImpl( +void BuildPlocImpl( uint globalId, uint localId, uint groupId, @@ -858,7 +858,7 @@ void BuildBvhPlocImpl( //==================================================================================================================== [RootSignature(RootSig)] [numthreads(BUILD_THREADGROUP_SIZE, 1, 1)] -void BuildBVHPLOC( +void BuildPLOC( uint globalIdIn : SV_DispatchThreadID, uint groupIdIn : SV_GroupID, uint localIdIn : SV_GroupThreadID) @@ -890,7 +890,7 @@ void BuildBVHPLOC( if (numActivePrims > 0) { - BuildBvhPlocImpl(globalId, localId, groupId, numActivePrims, plocArgs); + BuildPlocImpl(globalId, localId, groupId, numActivePrims, plocArgs); } } #endif diff --git a/src/shaders/BuildParallel.hlsl b/src/shaders/BuildParallel.hlsl index e557003..eaf9090 100644 --- 
a/src/shaders/BuildParallel.hlsl +++ b/src/shaders/BuildParallel.hlsl @@ -85,7 +85,7 @@ void WaitForEncodeTasksToFinish( #include "GenerateMortonCodes.hlsl" #include "RadixSort/ScanExclusiveInt4DLBCommon.hlsl" #include "RadixSort/RadixSortParallel.hlsl" -#include "BuildBVHPLOC.hlsl" +#include "BuildPLOC.hlsl" #include "BuildQBVH.hlsl" #include "BuildBVHTDTR.hlsl" #include "BuildBVH.hlsl" @@ -242,7 +242,7 @@ void TriangleSplitting( } //====================================================================================================================== -void BuildBvhPloc( +void BuildPloc( inout uint numTasksWait, inout uint waveId, uint globalId, @@ -270,7 +270,7 @@ void BuildBvhPloc( plocArgs.primIndicesSortedScratchOffset = ShaderConstants.offsets.primIndicesSorted; plocArgs.unsortedBvhLeafNodesOffset = ShaderConstants.offsets.bvhLeafNodeData; - BuildBvhPlocImpl(globalId, localId, groupId, numActivePrims, plocArgs); + BuildPlocImpl(globalId, localId, groupId, numActivePrims, plocArgs); } //====================================================================================================================== @@ -376,8 +376,8 @@ void MergeSort(inout uint numTasksWait, inout uint waveId, uint localId, uint gr numPrimitives, ShaderConstants.offsets.mortonCodes, ShaderConstants.offsets.mortonCodesSorted, - ShaderConstants.offsets.primIndicesSorted, ShaderConstants.offsets.primIndicesSortedSwap, + ShaderConstants.offsets.primIndicesSorted, Settings.useMortonCode30); } @@ -543,75 +543,59 @@ void BuildBvh( { bool needRefit = false; - if ((Settings.fastBuildThreshold) && (numPrimitives <= Settings.fastBuildThreshold) && (numPrimitives <= WaveGetLaneCount())) - { - BEGIN_TASK(1); - - FastBuildBVH(globalId, - numPrimitives, - ShaderConstants.offsets.bvhLeafNodeData, - ShaderConstants.offsets.bvhNodeData); - - END_TASK(1); - needRefit = true; - numActivePrims = ReadAccelStructHeaderField(ACCEL_STRUCT_HEADER_NUM_ACTIVE_PRIMS_OFFSET); - } - else - { - 
BEGIN_TASK(ShaderRootConstants.NumThreadGroups()); + BEGIN_TASK(ShaderRootConstants.NumThreadGroups()); - GenerateMortonCodes(globalId, numPrimitives); + GenerateMortonCodes(globalId, numPrimitives); - END_TASK(ShaderRootConstants.NumThreadGroups()); - WriteDebugCounter(COUNTER_MORTONGEN_OFFSET); - numActivePrims = ReadAccelStructHeaderField(ACCEL_STRUCT_HEADER_NUM_ACTIVE_PRIMS_OFFSET); + END_TASK(ShaderRootConstants.NumThreadGroups()); + WriteDebugCounter(COUNTER_MORTONGEN_OFFSET); + numActivePrims = ReadAccelStructHeaderField(ACCEL_STRUCT_HEADER_NUM_ACTIVE_PRIMS_OFFSET); - if (numActivePrims > 0) + if (numActivePrims > 0) + { + if (Settings.enableMergeSort) { - if (Settings.enableMergeSort) - { - MergeSort(numTasksWait, waveId, localId, groupId, numPrimitives); - } - else - { - RadixSort(numTasksWait, waveId, globalId, localId, groupId, numPrimitives, Settings.radixSortScanLevel, Settings.useMortonCode30); - } - WriteDebugCounter(COUNTER_MORTON_SORT_OFFSET); - // Note there is an implicit sync on the last pass of the sort + MergeSort(numTasksWait, waveId, localId, groupId, numPrimitives); + } + else + { + RadixSort(numTasksWait, waveId, globalId, localId, groupId, numPrimitives, Settings.radixSortScanLevel, Settings.useMortonCode30); + } + WriteDebugCounter(COUNTER_MORTON_SORT_OFFSET); + // Note there is an implicit sync on the last pass of the sort - // If the top down builder is off, the unsorted leaves will stay where the + // If the top down builder is off, the unsorted leaves will stay where the // Encode step put them. On top of that, if TS or Rebraid is also on, // there might be a gap between the last inner node and the first leaf // if we place the root of the tree at ShaderConstants.offsets.bvhNodeData. - // To avoid that gap, the root is moved forward by numLeafNodes - numActivePrims - // nodes from this point onwards. + // To avoid that gap, the root is moved forward by numLeafNodes - numActivePrims + // nodes from this point onwards. 
- if (Settings.buildMode == BUILD_MODE_PLOC) + if (Settings.buildMode == BUILD_MODE_PLOC) + { + BuildPloc(numTasksWait, waveId, globalId, localId, groupId, numActivePrims); + WriteDebugCounter(COUNTER_BUILDPLOC_OFFSET); + } + else + { + if (Settings.enableFastLBVH == false) { - BuildBvhPloc(numTasksWait, waveId, globalId, localId, groupId, numActivePrims); - WriteDebugCounter(COUNTER_BUILDPLOC_OFFSET); + BEGIN_TASK(ShaderRootConstants.NumThreadGroups()); + + BuildBvhLinear(globalId, numActivePrims, numPrimitives); + + END_TASK(ShaderRootConstants.NumThreadGroups()); + WriteDebugCounter(COUNTER_BUILDLBVH_OFFSET); + needRefit = true; } else { - if (Settings.enableFastLBVH == false) - { - BEGIN_TASK(ShaderRootConstants.NumThreadGroups()); - - BuildBvhLinear(globalId, numActivePrims, numPrimitives); - - END_TASK(ShaderRootConstants.NumThreadGroups()); - WriteDebugCounter(COUNTER_BUILDLBVH_OFFSET); - needRefit = true; - } - else - { - BEGIN_TASK(ShaderRootConstants.NumThreadGroups()); + BEGIN_TASK(ShaderRootConstants.NumThreadGroups()); - FastAgglomerativeLbvh(globalId, numActivePrims); + FastAgglomerativeLbvh(globalId, numActivePrims); - END_TASK(ShaderRootConstants.NumThreadGroups()); - WriteDebugCounter(COUNTER_BUILDFASTLBVH_OFFSET); - } + END_TASK(ShaderRootConstants.NumThreadGroups()); + WriteDebugCounter(COUNTER_BUILDFASTLBVH_OFFSET); } } } diff --git a/src/shaders/BuildQBVH.hlsl b/src/shaders/BuildQBVH.hlsl index 9dd9e30..512496a 100644 --- a/src/shaders/BuildQBVH.hlsl +++ b/src/shaders/BuildQBVH.hlsl @@ -306,7 +306,8 @@ uint WritePrimitiveNode( nodeOffset = offsets.leafNodes + (destIndex * primitiveNodeSize); } - const uint triangleId = ExtractScratchNodeTriangleId(scratchNode.packedFlags); + const uint quadSwizzle = ExtractScratchNodeQuadSwizzle(scratchNode.packedFlags); + const uint boxNodeFlags = ExtractScratchNodeBoxFlags(scratchNode.packedFlags); if (nodeType == NODE_TYPE_USER_NODE_PROCEDURAL) { @@ -320,8 +321,6 @@ uint WritePrimitiveNode( } else { - 
DstBuffer.Store(nodeOffset + TRIANGLE_NODE_ID_OFFSET, triangleId); - bool isPairCompressed = (Settings.triangleCompressionMode == PAIR_TRIANGLE_COMPRESSION); if (Settings.enableEarlyPairCompression) { @@ -335,6 +334,18 @@ uint WritePrimitiveNode( // Pair compressed triangles nodes are referenced by triangle 1 nodeType = isPairCompressed ? NODE_TYPE_TRIANGLE_1 : NODE_TYPE_TRIANGLE_0; + uint triangleId = WriteTriangleIdField(0, NODE_TYPE_TRIANGLE_0, (quadSwizzle >> 0) & 0xF, boxNodeFlags); + + // The compaction shader (CompactASImpl1_1) looks at triangleId to determine the node type of a leaf node. + // Hence, we must only set the triangleId fields for NODE_TYPE_TRIANGLE_1 to non-zero for a pair + // compressed triangle. + if (isPairCompressed) + { + triangleId = WriteTriangleIdField(triangleId, NODE_TYPE_TRIANGLE_1, (quadSwizzle >> 4) & 0xF, boxNodeFlags); + } + + DstBuffer.Store(nodeOffset + TRIANGLE_NODE_ID_OFFSET, triangleId); + { DstBuffer.Store(nodeOffset + TRIANGLE_NODE_GEOMETRY_INDEX_AND_FLAGS_OFFSET, geometryIndexAndFlags); DstBuffer.Store(nodeOffset + TRIANGLE_NODE_PRIMITIVE_INDEX0_OFFSET + (nodeType * 4), diff --git a/src/shaders/BuildSettings.hlsli b/src/shaders/BuildSettings.hlsli index 6bb5026..5929195 100644 --- a/src/shaders/BuildSettings.hlsli +++ b/src/shaders/BuildSettings.hlsli @@ -43,7 +43,6 @@ [[vk::constant_id(BUILD_SETTINGS_DATA_ENABLE_TOP_DOWN_BUILD_ID)]] uint enableTopDownBuild = 0; [[vk::constant_id(BUILD_SETTINGS_DATA_USE_MORTON_CODE_30_ID)]] uint useMortonCode30 = 0; [[vk::constant_id(BUILD_SETTINGS_DATA_ENABLE_MERGE_SORT_ID)]] uint enableMergeSort = 0; -[[vk::constant_id(BUILD_SETTINGS_DATA_FAST_BUILD_THRESHOLD_ID)]] uint fastBuildThreshold = 0; [[vk::constant_id(BUILD_SETTINGS_DATA_ENABLE_FUSED_INSTANCE_NODE_ID)]] uint enableFusedInstanceNode = 0; [[vk::constant_id(BUILD_SETTINGS_DATA_TS_PRIORITY_ID)]] float tsPriority = 0; [[vk::constant_id(BUILD_SETTINGS_DATA_NUM_REBRAID_ITERATIONS_ID)]] uint numRebraidIterations = 0; @@ -59,7 +58,7 
@@ [[vk::constant_id(BUILD_SETTINGS_DATA_ENCODE_ARRAY_OF_POINTERS_ID)]] uint encodeArrayOfPointers = 0; [[vk::constant_id(BUILD_SETTINGS_DATA_SCENE_BOUNDS_CALCULATION_TYPE_ID)]] uint sceneBoundsCalculationType = 0; [[vk::constant_id(BUILD_SETTINGS_DATA_REBRAID_QUALITY_HEURISTIC_ID)]] uint rebraidQualityHeuristic = 0; -[[vk::constant_id(BUILD_SETTINGS_DATA_REBUILD_ACCELERATION_STRUCTURE_ID)]] uint rebuildAccelStruct = 0; +[[vk::constant_id(BUILD_SETTINGS_DATA_DISABLE_COMPACTION_ID)]] uint disableCompaction = 0; static const CompileTimeBuildSettings Settings = { topLevelBuild, @@ -79,7 +78,7 @@ static const CompileTimeBuildSettings Settings = { enableTopDownBuild, useMortonCode30, enableMergeSort, - fastBuildThreshold, + 0, enableFusedInstanceNode, tsPriority, numRebraidIterations, @@ -109,7 +108,7 @@ static const CompileTimeBuildSettings Settings = { 0, 0, 0, - rebuildAccelStruct, + disableCompaction, }; #endif diff --git a/src/shaders/CMakeLists.txt b/src/shaders/CMakeLists.txt index 4b5b5dd..54ef25e 100644 --- a/src/shaders/CMakeLists.txt +++ b/src/shaders/CMakeLists.txt @@ -39,7 +39,7 @@ set(gpurtHlsl AccelStructTracker.hlsl BuildRootSignature.hlsl BuildBVH.hlsl - BuildBVHPLOC.hlsl + BuildPLOC.hlsl BuildBVHTDTR.hlsl BuildCommon.hlsl BuildCommonScratch.hlsl @@ -119,11 +119,22 @@ set(otherDeps ../shared/gpurtBuildConstants.h ../shared/hlslTypes.h ../shared/rayTracingDefs.h - ../shared/scratchNode.h + ../shadersClean/common/Bits.hlsli ../shadersClean/common/Math.hlsli ../shadersClean/common/Math.hlsl ../shadersClean/common/Extensions.hlsli + ../shadersClean/common/Extensions.hlsl ../shadersClean/common/ShaderDefs.hlsli + ../shadersClean/common/BoundingBox.hlsli + ../shadersClean/common/InstanceDesc.hlsli + ../shadersClean/common/NodePointers.hlsli + ../shadersClean/common/ScratchNode.hlsli + ../shadersClean/common/TempAssert.hlsli + ../shadersClean/traversal/TraversalDefs.hlsli + ../shadersClean/common/gfx10/BoxNode1_0.hlsli + 
../shadersClean/common/gfx10/InstanceNode1_0.hlsli + ../shadersClean/common/gfx10/ProceduralNode1_0.hlsli + ../shadersClean/common/gfx10/TriangleNode1_0.hlsli ) set(GPURT_SHADER_SOURCE_FILES "${gpurtHlsl}" "${otherDeps}" PARENT_SCOPE) diff --git a/src/shaders/Common.hlsl b/src/shaders/Common.hlsl index d4562fb..1b55ccf 100644 --- a/src/shaders/Common.hlsl +++ b/src/shaders/Common.hlsl @@ -35,7 +35,7 @@ #define _COMMON_HLSL #include "../shared/rayTracingDefs.h" -#include "../shared/scratchNode.h" +#include "../shadersClean/common/ScratchNode.hlsli" typedef AccelStructDataOffsets AccelStructOffsets; @@ -62,16 +62,19 @@ typedef AccelStructDataOffsets AccelStructOffsets; #define INVALID_IDX 0xffffffff // Node pointer values with special meanings -#define INVALID_NODE 0xffffffff -#define TERMINAL_NODE 0xfffffffe -#define SKIP_0_3 0xfffffffd -#define SKIP_4_7 0xfffffffb -#define SKIP_0_7 0xfffffff9 -#define END_SEARCH 0xfffffff8 -#define DEAD_LANE 0xfffffff7 +#define INVALID_NODE 0xffffffff +#define TERMINAL_NODE 0xfffffffe +#define SKIP_0_3 0xfffffffd +#define SKIP_4_7 0xfffffffb +#define SKIP_0_7 0xfffffff9 +#define END_SEARCH 0xfffffff8 +#define DEAD_LANE_WITHOUT_STACK 0xfffffff7 +#define DEAD_LANE_WITH_STACK 0xfffffff6 #include "Extensions.hlsl" #include "../shadersClean/common/Math.hlsli" +#include "../shadersClean/common/BoundingBox.hlsli" +#include "../shadersClean/common/NodePointers.hlsli" #ifdef __cplusplus static const float NaN = std::numeric_limits::quiet_NaN(); @@ -399,29 +402,57 @@ static bool CheckHandleProceduralUserNode(in uint nodePointer) } //===================================================================================================================== -static uint WriteTriangleIdField(uint triangleId, uint nodeType, uint rotation, uint geometryFlags) +static uint3 ComputeQuadTriangleVertexIndex( + uint triangleIndex, // Numeric constant (0 or 1) + uint rotation) +{ + // triangle_0 vertex mapping + // + // rotation 0: t0: v0, v1, v2 + // 
rotation 1: t0: v1, v2, v0 + // rotation 2: t0: v2, v0, v1 + // + + // triangle_1 vertex mapping + // + // rotation 0: t1: v1, v3, v2 + // rotation 1: t1: v3, v2, v1 + // rotation 2: t1: v2, v1, v3 + // + const uint packedVertexMapping = (triangleIndex == 0) ? 0x10210 : 0x31231; + const uint packedMapping = packedVertexMapping >> (rotation * 4); + + return uint3((packedMapping >> 0) & 0xF, + (packedMapping >> 4) & 0xF, + (packedMapping >> 8) & 0xF); +} + +//===================================================================================================================== +static uint WriteTriangleIdField(uint triangleId, uint nodeType, uint rotation, uint boxNodeFlags) { const uint triangleShift = nodeType * TRIANGLE_ID_BIT_STRIDE; + // Hardware triangle ID barycentric mapping indicates the triangle vertex rotation. This maps to triangle vertex + // mapping for triangle index 0 in the quad. + const uint3 index = ComputeQuadTriangleVertexIndex(0, rotation); + // Compute the barycentrics mapping table that is stored in triangle_id for RT IP 1.1 - triangleId |= ((rotation + 1) % 3) << (triangleShift + TRIANGLE_ID_I_SRC_SHIFT); - triangleId |= ((rotation + 2) % 3) << (triangleShift + TRIANGLE_ID_J_SRC_SHIFT); + triangleId |= (index.y) << (triangleShift + TRIANGLE_ID_I_SRC_SHIFT); + triangleId |= (index.z) << (triangleShift + TRIANGLE_ID_J_SRC_SHIFT); // Add in the flags stored in triangle_id for RT IP 2.0 - if (geometryFlags & D3D12_RAYTRACING_GEOMETRY_FLAG_OPAQUE) + if (boxNodeFlags & (1u << BOX_NODE_FLAGS_ONLY_OPAQUE_SHIFT)) { triangleId |= 1u << (triangleShift + TRIANGLE_ID_OPAQUE_SHIFT); } + if (boxNodeFlags & (1u << BOX_NODE_FLAGS_ONLY_PROCEDURAL_SHIFT)) + { + triangleId |= 1u << (triangleShift + TRIANGLE_ID_PROCEDURAL_SHIFT); + } return triangleId; } -//===================================================================================================================== -static uint CalcUncompressedTriangleId(uint geometryFlags) -{ - return
WriteTriangleIdField(0, NODE_TYPE_TRIANGLE_0, 0, geometryFlags); -} - //===================================================================================================================== // Extract the order of the triangle vertices from the node's triangle ID field. static uint3 CalcTriangleCompressionVertexIndices( @@ -810,7 +841,7 @@ static uint32_t GetInstanceSidebandOffset( // Node pointers with all upper bits set are sentinels: INVALID_NODE, TERMINAL_NODE, SKIP_* static bool IsValidNode(uint nodePtr) { - return nodePtr < DEAD_LANE; + return nodePtr < DEAD_LANE_WITH_STACK; } //====================================================================================================================== diff --git a/src/shaders/Continuations1_1.hlsl b/src/shaders/Continuations1_1.hlsl index ee528c4..1d17e9d 100644 --- a/src/shaders/Continuations1_1.hlsl +++ b/src/shaders/Continuations1_1.hlsl @@ -161,7 +161,6 @@ static _AmdTraversalState InitTraversalState1_1( traversal.committed.currNodePtr = INVALID_NODE; // Start traversing from root node - traversal.nextNodePtr = isValid ? 
CreateRootNodePointer1_1() : INVALID_NODE; traversal.reservedNodePtr = INVALID_NODE; traversal.lastInstanceRootNodePtr = INVALID_NODE; @@ -239,7 +238,7 @@ static void TraversalInternal1_1( float2 committedBarycentrics = data.traversal.committedBarycentrics; candidateBarycentrics = float2(0.0f, 0.0f); - uint nextNodePtr = data.traversal.nextNodePtr; + uint nextNodePtr = data.dispatch.nextNodePtr; float3 candidateRayOrigin = topLevelRayOrigin; float3 candidateRayDirection = topLevelRayDirection; state = TRAVERSAL_STATE_COMMITTED_NOTHING; @@ -546,7 +545,7 @@ static void TraversalInternal1_1( data.traversal.stackPtr = stack.Pack(); // Pack traversal results back into traversal state structure - data.traversal.nextNodePtr = nextNodePtr; + data.dispatch.nextNodePtr = nextNodePtr; data.traversal.committed = committed; data.traversal.committedBarycentrics = committedBarycentrics; #if REMAT_INSTANCE_RAY == 0 diff --git a/src/shaders/Continuations2_0.hlsl b/src/shaders/Continuations2_0.hlsl index 19c3cfd..221143c 100644 --- a/src/shaders/Continuations2_0.hlsl +++ b/src/shaders/Continuations2_0.hlsl @@ -46,7 +46,6 @@ static _AmdTraversalState InitTraversalState2_0( traversal.committed.currNodePtr = INVALID_NODE; // Start traversing from root node - traversal.nextNodePtr = isValid ? 
CreateRootNodePointer1_1() : TERMINAL_NODE; traversal.reservedNodePtr = INVALID_NODE; traversal.lastInstanceRootNodePtr = INVALID_NODE; @@ -127,7 +126,7 @@ static void TraversalInternal2_0( instanceFlagsPreserveBits <<= POINTER_FLAGS_HIDWORD_SHIFT; - uint nextNodePtr = data.traversal.nextNodePtr; + uint nextNodePtr = data.dispatch.nextNodePtr; float3 candidateRayOrigin = topLevelRayOrigin; float3 candidateRayDirection = topLevelRayDirection; state = TRAVERSAL_STATE_COMMITTED_NOTHING; @@ -361,7 +360,7 @@ static void TraversalInternal2_0( candidate.currNodePtr = nodePtr; if (Options::getCpsCandidatePrimitiveMode() == CpsCandidatePrimitiveMode::DeferFirst) { - haveCandidate = true; + haveCandidate = true; } else { @@ -552,7 +551,7 @@ static void TraversalInternal2_0( } // Pack traversal results back into traversal state structure - data.traversal.nextNodePtr = nextNodePtr; + data.dispatch.nextNodePtr = nextNodePtr; data.traversal.committed = committed; data.traversal.committedBarycentrics = committedBarycentrics; #if REMAT_INSTANCE_RAY == 0 diff --git a/src/shaders/CopyAS.hlsl b/src/shaders/CopyAS.hlsl index ded9b60..2ca420e 100644 --- a/src/shaders/CopyAS.hlsl +++ b/src/shaders/CopyAS.hlsl @@ -22,7 +22,8 @@ * SOFTWARE. * **********************************************************************************************************************/ -#include "Common.hlsl" +#include "../../gpurt/gpurtAccelStruct.h" +#include "../shared/rayTracingDefs.h" // Note, CBV(b255) must be the last used binding in the root signature. 
#define RootSig "RootConstants(num32BitConstants=3, b0, visibility=SHADER_VISIBILITY_ALL), "\ @@ -78,7 +79,7 @@ void CopyAS(in uint3 globalThreadId : SV_DispatchThreadID) if (globalID == 0) { // Offset to acceleration structure header - uint64_t gpuVa = MakeGpuVirtualAddress(ShaderConstants.AddressLo, ShaderConstants.AddressHi); + uint64_t gpuVa = PackUint64(ShaderConstants.AddressLo, ShaderConstants.AddressHi); gpuVa += metadataSizeInBytes; // Patch metadata header diff --git a/src/shaders/Debug.hlsl b/src/shaders/Debug.hlsl index 00a8724..fd45358 100644 --- a/src/shaders/Debug.hlsl +++ b/src/shaders/Debug.hlsl @@ -29,8 +29,10 @@ #include "Common.hlsl" #include "Extensions.hlsl" +#define GPURT_DEBUG_BUFFER_AVAILABLE (GPURT_ENABLE_GPU_DEBUG && GPURT_BVH_BUILD_SHADER && defined(DEBUG_BUFFER_SLOT)) + #if GPURT_ENABLE_GPU_DEBUG - #if BUILD_PARALLEL || TRIVIAL_BUILDER + #if GPURT_DEBUG_BUFFER_AVAILABLE #define GPU_ASSERT_IMPL(id, cond) DoGpuAssert(id, (cond)) #define GPU_DPF_IMPL(id, msg, ...) \ do \ @@ -44,7 +46,6 @@ { \ if (IsDebugHaltEnabled() && !(cond)) { Halt(); } \ } while (false) - #define GPU_DPF_IMPL(msg, ...) 
#endif @@ -69,7 +70,7 @@ void Halt() AmdExtD3DShaderIntrinsics_Halt(); } -#if GPURT_ENABLE_GPU_DEBUG && (BUILD_PARALLEL || TRIVIAL_BUILDER) +#if GPURT_DEBUG_BUFFER_AVAILABLE globallycoherent RWByteAddressBuffer DebugBuffer : register( DEBUG_BUFFER_SLOT ); diff --git a/src/shaders/EncodeCommon.hlsl b/src/shaders/EncodeCommon.hlsl index 3eca227..fb666c6 100644 --- a/src/shaders/EncodeCommon.hlsl +++ b/src/shaders/EncodeCommon.hlsl @@ -24,6 +24,8 @@ **********************************************************************************************************************/ #include "BuildCommonScratch.hlsl" +#include "../shared/rayTracingDefs.h" + #include "TrianglePrimitive.hlsl" #include "UpdateCommon.hlsl" @@ -62,8 +64,7 @@ void WriteScratchTriangleNode( flags |= SCRATCH_NODE_FLAGS_DISABLE_TRIANGLE_SPLIT_MASK; } - const uint triangleId = CalcUncompressedTriangleId(geometryFlags); - const uint packedFlags = PackScratchNodeFlags(instanceMask, flags, triangleId); + const uint packedFlags = PackScratchNodeFlags(instanceMask, flags, 0); data = uint4(INVALID_IDX, 0, 0, packedFlags); WriteScratchNodeDataAtOffset(offset, SCRATCH_NODE_SPLIT_BOX_INDEX_OFFSET, data); @@ -173,16 +174,15 @@ void EncodeTriangleNode( const uint nodeOffset = metadataSize + ExtractNodePointerOffset(nodePointer); const uint nodeType = GetNodeType(nodePointer); - uint3 vertexOffsets; + triangleId = SrcBuffer.Load(nodeOffset + TRIANGLE_NODE_ID_OFFSET); + uint3 vertexOffsets; if (Settings.triangleCompressionMode != NO_TRIANGLE_COMPRESSION) { - triangleId = SrcBuffer.Load(nodeOffset + TRIANGLE_NODE_ID_OFFSET); vertexOffsets = CalcTriangleCompressionVertexOffsets(nodeType, triangleId); } else { - triangleId = CalcUncompressedTriangleId(geomConstants.geometryFlags); vertexOffsets = CalcTriangleVertexOffsets(nodeType); } @@ -284,11 +284,11 @@ void EncodeTriangleNode( const bool isActiveTriangle = IsActive(tri); if (isActiveTriangle) { - if (Settings.sceneBoundsCalculationType == SceneBoundsBasedOnGeometry) + if 
(Settings.sceneBoundsCalculationType == (uint)SceneBoundsCalculation::BasedOnGeometry) { UpdateSceneBounds(ShaderConstants.offsets.sceneBounds, boundingBox); } - else if (Settings.sceneBoundsCalculationType == SceneBoundsBasedOnGeometryWithSize) + else if (Settings.sceneBoundsCalculationType == (uint)SceneBoundsCalculation::BasedOnGeometryWithSize) { // TODO: with tri splitting, need to not update "size" here UpdateSceneBoundsWithSize(ShaderConstants.offsets.sceneBounds, boundingBox); @@ -411,12 +411,11 @@ void WriteScratchProceduralNode( WriteScratchNodeDataAtOffset(offset, SCRATCH_NODE_V2_OFFSET, data); // type, flags, splitBox, numPrimitivesAndDoCollapse - uint triangleId = 0; // Instance mask is assumed 0 in bottom level acceleration structures const uint flags = CalcProceduralBoxNodeFlags(geometryFlags); - const uint packedFlags = PackScratchNodeFlags(instanceMask, flags, triangleId); + const uint packedFlags = PackScratchNodeFlags(instanceMask, flags, 0); data = uint4(INVALID_IDX, 0, 0, packedFlags); WriteScratchNodeDataAtOffset(offset, SCRATCH_NODE_SPLIT_BOX_INDEX_OFFSET, data); @@ -525,11 +524,11 @@ void EncodeAabbNode( } else { - if (Settings.sceneBoundsCalculationType == SceneBoundsBasedOnGeometry) + if (Settings.sceneBoundsCalculationType == (uint)SceneBoundsCalculation::BasedOnGeometry) { UpdateSceneBounds(ShaderConstants.offsets.sceneBounds, boundingBox); } - else if (Settings.sceneBoundsCalculationType == SceneBoundsBasedOnGeometryWithSize) + else if (Settings.sceneBoundsCalculationType == (uint)SceneBoundsCalculation::BasedOnGeometryWithSize) { UpdateSceneBoundsWithSize(ShaderConstants.offsets.sceneBounds, boundingBox); } diff --git a/src/shaders/EncodeHwBvhCommon.hlsl b/src/shaders/EncodeHwBvhCommon.hlsl index 22e6a0b..b95cd09 100644 --- a/src/shaders/EncodeHwBvhCommon.hlsl +++ b/src/shaders/EncodeHwBvhCommon.hlsl @@ -141,10 +141,9 @@ void PostHwBvhBuild( offsets, metadataSizeInBytes); - // Rebuilding an updateable acceleration structure need to use 
the original size and not compacted one. - if (Settings.rebuildAccelStruct) + if (Settings.disableCompaction) { - compactedSize = ShaderConstants.header.compactedSizeInBytes; + compactedSize = ShaderConstants.header.sizeInBytes; } WriteAccelStructHeaderField(ACCEL_STRUCT_HEADER_COMPACTED_BYTE_SIZE_OFFSET, compactedSize); diff --git a/src/shaders/EncodeNodes.hlsl b/src/shaders/EncodeNodes.hlsl index 2e516b8..2075069 100644 --- a/src/shaders/EncodeNodes.hlsl +++ b/src/shaders/EncodeNodes.hlsl @@ -22,7 +22,7 @@ * SOFTWARE. * **********************************************************************************************************************/ -#include "..\shared\rayTracingDefs.h" +#include "../shared/rayTracingDefs.h" #define GC_DSTBUFFER #define GC_DSTMETADATA @@ -212,11 +212,11 @@ void EncodeQuadNodes( const bool isActive = IsActive(tri); if (isActive) { - if (Settings.sceneBoundsCalculationType == SceneBoundsBasedOnGeometry) + if (Settings.sceneBoundsCalculationType == (uint)SceneBoundsCalculation::BasedOnGeometry) { UpdateSceneBounds(ShaderConstants.offsets.sceneBounds, boundingBox); } - else if (Settings.sceneBoundsCalculationType == SceneBoundsBasedOnGeometryWithSize) + else if (Settings.sceneBoundsCalculationType == (uint)SceneBoundsCalculation::BasedOnGeometryWithSize) { // TODO: with tri splitting, need to not update "size" here UpdateSceneBoundsWithSize(ShaderConstants.offsets.sceneBounds, boundingBox); @@ -295,18 +295,14 @@ void EncodeQuadNodes( if (hasValidQuad) { - const uint triT0Rotation = (pairInfo & 0xF); - const uint triT1Rotation = (pairInfo >> 4) & 0xF; - WriteScratchQuadNode(dstScratchNodeIdx, geomId, geomConstants.geometryFlags, tri1, primId1, - triT1Rotation, tri, primId, - triT0Rotation); + pairInfo & 0xFF); } else if (pairInfo == -1) { diff --git a/src/shaders/EncodePairedTriangleImpl.hlsl b/src/shaders/EncodePairedTriangleImpl.hlsl index 090b544..2fc83b5 100644 --- a/src/shaders/EncodePairedTriangleImpl.hlsl +++ 
b/src/shaders/EncodePairedTriangleImpl.hlsl @@ -48,9 +48,8 @@ void WriteScratchTriangleNode( const BoundingBox box = GenerateTriangleBoundingBox(tri.v0, tri.v1, tri.v2); // Set the instance inclusion mask to 0 for degenerate triangles so that they are culled out. const uint instanceMask = (box.min.x > box.max.x) ? 0 : 0xff; - const uint triangleId = WriteTriangleIdField(0, NODE_TYPE_TRIANGLE_0, 0, geometryFlags); - const uint packedFlags = PackScratchNodeFlags(instanceMask, CalcTriangleBoxNodeFlags(geometryFlags), triangleId); + const uint packedFlags = PackScratchNodeFlags(instanceMask, CalcTriangleBoxNodeFlags(geometryFlags), 0); data = uint4(0, 0, 0, packedFlags); WriteScratchNodeDataAtOffset(offset, SCRATCH_NODE_SPLIT_BOX_INDEX_OFFSET, data); @@ -63,21 +62,12 @@ void WriteScratchQuadNode( uint geometryFlags, TriangleData tri1, uint tri1PrimIdx, - uint triT1Rotation, TriangleData tri0, uint tri0PrimIdx, - uint triT0Rotation) + uint quadSwizzle) { // TODO: For Navi3, we can directly write the scratch node data to the result leaf node data section // - uint triangleId = 0; - - // triT0 - NODE_TYPE_TRIANGLE_0 (2nd to intersect) - triangleId = WriteTriangleIdField(triangleId, NODE_TYPE_TRIANGLE_0, triT0Rotation, geometryFlags); - - // triT1 - NODE_TYPE_TRIANGLE_1 (1st to intersect) - triangleId = WriteTriangleIdField(triangleId, NODE_TYPE_TRIANGLE_1, triT1Rotation, geometryFlags); - uint offset = CalcScratchNodeOffset(ShaderConstants.offsets.bvhLeafNodeData, dstScratchNodeIdx); WriteScratchNodeDataAtOffset(offset, SCRATCH_NODE_PRIMITIVE_ID_OFFSET, tri1PrimIdx); @@ -90,8 +80,8 @@ void WriteScratchQuadNode( WriteScratchNodeDataAtOffset(offset, SCRATCH_NODE_GEOMETRY_INDEX_OFFSET, packedGeomId); WriteScratchNodeDataAtOffset(offset, SCRATCH_NODE_PARENT_OFFSET, INVALID_IDX); - const uint3 t0VtxIndices = CalcTriangleCompressionVertexIndices(NODE_TYPE_TRIANGLE_0, triangleId); - const uint3 t1VtxIndices = CalcTriangleCompressionVertexIndices(NODE_TYPE_TRIANGLE_1, 
triangleId); + const uint3 t0VtxIndices = ComputeQuadTriangleVertexIndex(0, (quadSwizzle >> 0) & 0xF); + const uint3 t1VtxIndices = ComputeQuadTriangleVertexIndex(1, (quadSwizzle >> 4) & 0xF); const uint3 t1VtxOffsets = SCRATCH_NODE_V0_OFFSET + (t1VtxIndices * SCRATCH_NODE_TRIANGLE_VERTEX_STRIDE); WriteScratchNodeDataAtOffset(offset, t1VtxOffsets.x, tri1.v0); @@ -117,7 +107,8 @@ void WriteScratchQuadNode( // Set the instance inclusion mask to 0 for degenerate triangles so that they are culled out. const uint instanceMask = (box.min.x > box.max.x) ? 0 : 0xff; - const uint packedFlags = PackScratchNodeFlags(instanceMask, CalcTriangleBoxNodeFlags(geometryFlags), triangleId); + + const uint packedFlags = PackScratchNodeFlags(instanceMask, CalcTriangleBoxNodeFlags(geometryFlags), quadSwizzle); WriteScratchNodeDataAtOffset(offset, SCRATCH_NODE_FLAGS_OFFSET, packedFlags); } @@ -200,20 +191,20 @@ float ComputeEdgeBoxSurfaceArea( uint rotation) { // triangle v1, v2, v0 - float3 e0 = (vertices[1]); - float3 e1 = (vertices[0]); + float3 e0 = vertices[1]; + float3 e1 = vertices[0]; if (rotation == 0) { // triangle v0, v1, v2 - e0 = (vertices[0]); - e1 = (vertices[2]); + e0 = vertices[0]; + e1 = vertices[2]; } else if (rotation == 1) { // triangle v2, v0, v1 - e0 = (vertices[2]); - e1 = (vertices[1]); + e0 = vertices[2]; + e1 = vertices[1]; } BoundingBox edgeBox = (BoundingBox)0; diff --git a/src/shaders/EncodeTopLevel.hlsl b/src/shaders/EncodeTopLevel.hlsl index 689a4ff..00419bc 100644 --- a/src/shaders/EncodeTopLevel.hlsl +++ b/src/shaders/EncodeTopLevel.hlsl @@ -22,7 +22,7 @@ * SOFTWARE. 
* **********************************************************************************************************************/ -#include "..\shared\rayTracingDefs.h" +#include "../shared/rayTracingDefs.h" #include "BuildRootSignature.hlsl" diff --git a/src/shaders/EncodeTopLevelBuild.hlsl b/src/shaders/EncodeTopLevelBuild.hlsl index 8e1b618..2424f4a 100644 --- a/src/shaders/EncodeTopLevelBuild.hlsl +++ b/src/shaders/EncodeTopLevelBuild.hlsl @@ -22,6 +22,10 @@ * SOFTWARE. * **********************************************************************************************************************/ +#include "../shared/rayTracingDefs.h" +#include "../shadersClean/common/ScratchNode.hlsli" + +//===================================================================================================================== void WriteScratchInstanceNode( uint offset, uint instanceIndex, @@ -137,7 +141,7 @@ void EncodeInstancesBuild( if (IsRebraidEnabled() == false) { // Update scene bounding box - if (Settings.sceneBoundsCalculationType == SceneBoundsBasedOnGeometryWithSize) + if (Settings.sceneBoundsCalculationType == (uint)SceneBoundsCalculation::BasedOnGeometryWithSize) { UpdateSceneBoundsWithSize(ShaderConstants.offsets.sceneBounds, boundingBox); } diff --git a/src/shaders/Extensions.hlsl b/src/shaders/Extensions.hlsl index bfc0812..e78c92b 100644 --- a/src/shaders/Extensions.hlsl +++ b/src/shaders/Extensions.hlsl @@ -29,11 +29,7 @@ #include "../shadersClean/common/Extensions.hlsli" #include "../shadersClean/common/Math.hlsli" -#if !defined(__cplusplus) - // Dummy implementation for Vulkan build only -__decl uint AmdExtLaneCount() DUMMY_UINT_FUNC - __decl uint AmdExtD3DShaderIntrinsics_LoadDwordAtAddr( uint gpuVaLoBits, uint gpuVaHiBits, uint offset) DUMMY_UINT_FUNC @@ -168,8 +164,6 @@ uint AmdExtD3DShaderIntrinsics_WaveClusterBitOr(uint x, uint dxClusterSize) return spirv_OpGroupNonUniformBitwiseOr_clustered(/* Subgroup */ 3, /* ClusteredReduce */ 3, x, clusterSize); } -#endif - 
//===================================================================================================================== // The following extension functions are driver intrinsic functions // @@ -281,7 +275,13 @@ __decl uint AmdTraceRayGetBoxSortHeuristicMode() DUMMY_UINT_FUNC __decl uint2 AmdTraceRayMakePC(uint pcVaLow) DUMMY_UINT2_FUNC __decl uint AmdTraceRayGetKnownSetRayFlags() DUMMY_UINT_FUNC __decl uint AmdTraceRayGetKnownUnsetRayFlags() DUMMY_UINT_FUNC +#if GPURT_CLIENT_INTERFACE_MAJOR_VERSION >= 50 +__decl uint AmdTraceRayInitStaticId() DUMMY_UINT_FUNC +#else __decl void AmdTraceRayInitStaticId() DUMMY_VOID_FUNC +#endif +__decl uint AmdTraceRayPersistentLdsAtomicAdd(uint offset, uint data) DUMMY_UINT_FUNC +__decl uint AmdTraceRayPersistentLdsWrite(uint offset, uint data) DUMMY_UINT_FUNC //===================================================================================================================== // Ref: GpuRt::Device::GetStaticPipelineFlags @@ -324,11 +324,12 @@ __decl uint AmdExtLoadDwordAtAddrUncached(uint64_t addr, uint offset) DUMMY_UIN __decl void AmdExtStoreDwordAtAddrUncached(uint64_t addr, uint offset, uint value) DUMMY_VOID_FUNC __decl uint3 AmdExtGroupIdCompute() DUMMY_UINT3_FUNC __decl uint3 AmdExtGroupDimCompute() DUMMY_UINT3_FUNC -__decl void AmdExtSleep(uint value) DUMMY_VOID_FUNC +__decl uint AmdExtLaneCount() DUMMY_UINT_FUNC +__decl void AmdExtSleep(uint value) DUMMY_VOID_FUNC #if USE_TEMP_ARRAY_STACK //===================================================================================================================== -// Register based stack (shared with __cplusplus path) +// Register based stack #define SHORT_STACK_SIZE 16 //===================================================================================================================== @@ -358,203 +359,46 @@ __decl uint AmdTraceRayGetStackSize() DUMMY_UINT_FUNC #define ANYHIT_CALLTYPE_NO_DUPLICATE 1 #define ANYHIT_CALLTYPE_DUPLICATE 2 -#ifdef __cplusplus 
-//===================================================================================================================== -static uint LoadDwordAtAddr(GpuVirtualAddress addr) -{ - return *reinterpret_cast(addr); -} -#else //===================================================================================================================== static uint LoadDwordAtAddr(GpuVirtualAddress addr) { return AmdExtD3DShaderIntrinsics_LoadDwordAtAddr(LowPart(addr), HighPart(addr), 0); } -#endif //===================================================================================================================== static uint2 LoadDwordAtAddrx2(GpuVirtualAddress addr) { -#if !defined(__cplusplus) return AmdExtD3DShaderIntrinsics_LoadDwordAtAddrx2(LowPart(addr), HighPart(addr), 0); -#else - uint2 retVal; - retVal.x = LoadDwordAtAddr(addr); - retVal.y = LoadDwordAtAddr(addr + 4); - - return retVal; -#endif } //===================================================================================================================== static uint3 LoadDwordAtAddrx3(GpuVirtualAddress addr) { -#if !defined(__cplusplus) return AmdExtD3DShaderIntrinsics_LoadDwordAtAddrx3(LowPart(addr), HighPart(addr), 0); -#else - uint3 retVal; - retVal.x = LoadDwordAtAddr(addr); - retVal.y = LoadDwordAtAddr(addr + 4); - retVal.z = LoadDwordAtAddr(addr + 8); - - return retVal; -#endif } //===================================================================================================================== static uint4 LoadDwordAtAddrx4(GpuVirtualAddress addr) { -#if !defined(__cplusplus) return AmdExtD3DShaderIntrinsics_LoadDwordAtAddrx4(LowPart(addr), HighPart(addr), 0); -#else - uint4 retVal; - retVal.x = LoadDwordAtAddr(addr); - retVal.y = LoadDwordAtAddr(addr + 4); - retVal.z = LoadDwordAtAddr(addr + 8); - retVal.w = LoadDwordAtAddr(addr + 12); - - return retVal; -#endif } static uint ConstantLoadDwordAtAddr(GpuVirtualAddress addr) { -#if !defined(__cplusplus) return 
AmdExtD3DShaderIntrinsics_ConstantLoadDwordAtAddr(LowPart(addr), HighPart(addr), 0); -#else - return AmdExtConstantLoadDwordAtAddr(addr, 0); -#endif } static uint64_t ConstantLoadDwordAtAddrx2(GpuVirtualAddress addr) { -#if !defined(__cplusplus) uint2 retVal = AmdExtD3DShaderIntrinsics_ConstantLoadDwordAtAddrx2(LowPart(addr), HighPart(addr), 0); return PackUint64(retVal.x, retVal.y); -#else - return AmdExtConstantLoad64AtAddr(addr, 0); -#endif } static uint4 ConstantLoadDwordAtAddrx4(GpuVirtualAddress addr) { -#if !defined(__cplusplus) return AmdExtD3DShaderIntrinsics_ConstantLoadDwordAtAddrx4(LowPart(addr), HighPart(addr), 0); -#else - uint4 retVal; - retVal.xy = SplitUint64(AmdExtConstantLoad64AtAddr(addr, 0)); - retVal.zw = SplitUint64(AmdExtConstantLoad64AtAddr(addr + 8, 0)); - - return retVal; -#endif } -#ifdef __cplusplus -#include -static constexpr uint RoundModeTable[] = -{ - FE_TONEAREST, - FE_UPWARD, - FE_DOWNWARD, - FE_TOWARDZERO, -}; - -//===================================================================================================================== -static float FloatOpWithRoundMode(uint roundMode, uint operation, float src0, float src1) -{ - std::fesetround(RoundModeTable[roundMode]); - - float result; - - switch (operation) - { - case AmdExtD3DShaderIntrinsicsFloatOpWithRoundMode_Add: - result = src0 + src1; - break; - - case AmdExtD3DShaderIntrinsicsFloatOpWithRoundMode_Subtract: - result = src0 - src1; - break; - - case AmdExtD3DShaderIntrinsicsFloatOpWithRoundMode_Multiply: - result = src0 * src1; - break; - - default: - printf("Unknown operation for FloatOpWithRoundMode\n"); - assert(false); - break; - } - - std::fesetround(FE_TONEAREST); - - return result; -} - -//===================================================================================================================== -static float2 FloatOpWithRoundMode(uint roundMode, uint operation, float2 src0, float2 src1) -{ - std::fesetround(RoundModeTable[roundMode]); - - float2 result; - 
- switch (operation) - { - case AmdExtD3DShaderIntrinsicsFloatOpWithRoundMode_Add: - result = src0 + src1; - break; - - case AmdExtD3DShaderIntrinsicsFloatOpWithRoundMode_Subtract: - result = src0 - src1; - break; - - case AmdExtD3DShaderIntrinsicsFloatOpWithRoundMode_Multiply: - result = src0 * src1; - break; - - default: - printf("Unknown operation for FloatOpWithRoundMode\n"); - assert(false); - break; - } - - std::fesetround(FE_TONEAREST); - - return result; -} - -//===================================================================================================================== -static float3 FloatOpWithRoundMode(uint roundMode, uint operation, float3 src0, float3 src1) -{ - std::fesetround(RoundModeTable[roundMode]); - - float3 result; - - switch (operation) - { - case AmdExtD3DShaderIntrinsicsFloatOpWithRoundMode_Add: - result = src0 + src1; - break; - - case AmdExtD3DShaderIntrinsicsFloatOpWithRoundMode_Subtract: - result = src0 - src1; - break; - - case AmdExtD3DShaderIntrinsicsFloatOpWithRoundMode_Multiply: - result = src0 * src1; - break; - - default: - printf("Unknown operation for FloatOpWithRoundMode\n"); - assert(false); - break; - } - - std::fesetround(FE_TONEAREST); - - return result; -} -#else //===================================================================================================================== static float FloatOpWithRoundMode(uint roundMode, uint operation, float src0, float src1) { @@ -572,6 +416,5 @@ static float3 FloatOpWithRoundMode(uint roundMode, uint operation, float3 src0, { return AmdExtD3DShaderIntrinsics_FloatOpWithRoundMode(roundMode, operation, src0, src1); } -#endif #endif diff --git a/src/shaders/GpuRtLibrary.hlsl b/src/shaders/GpuRtLibrary.hlsl index 81bf9fa..879590c 100644 --- a/src/shaders/GpuRtLibrary.hlsl +++ b/src/shaders/GpuRtLibrary.hlsl @@ -26,6 +26,9 @@ #ifndef _GPURT_LIBRARY_HLSL #define _GPURT_LIBRARY_HLSL +#include "../shadersClean/traversal/TraversalDefs.hlsli" +#include 
"../shadersClean/common/InstanceDesc.hlsli" + // Following order matters as AccelStructTracker relies on defines from TraceRayCommon.hlsl #include "TraceRayCommon.hlsl" #include "AccelStructTracker.hlsl" diff --git a/src/shaders/GpuRtLibraryCont.hlsl b/src/shaders/GpuRtLibraryCont.hlsl index 23ed420..de0dc26 100644 --- a/src/shaders/GpuRtLibraryCont.hlsl +++ b/src/shaders/GpuRtLibraryCont.hlsl @@ -32,6 +32,7 @@ #endif #include "../shadersClean/common/Math.hlsli" +#include "../shadersClean/common/InstanceDesc.hlsli" // By default, Gpurt exports both non-continuation and continuation traversal functions. Dxcp picks one based on panel // setting. @@ -251,12 +252,19 @@ struct _AmdDispatchSystemData return dispatchId; } + static _AmdDispatchSystemData MakeDeadLaneWithStack(); + static _AmdDispatchSystemData MakeDeadLaneWithoutStack(); + uint dispatchLinearId; // Packed dispatch linear id. Combine x/y/z into 1 DWORD. uint shaderRecIdx; // Record index for local root parameters. #if DEVELOPER - uint parentId; // Record the parent Id for ray history counter, -1 for RayGen shader. + uint parentId; // Record the parent's dynamic Id for ray history counter, -1 for RayGen shader. + uint staticId; // Record the static Id of current trace ray call site. #endif + + uint nextNodePtr; // Next node pointer (moved here from _AmdTraversalState due to launch kernel VGPR limits). + // Also contains the state of the current lane (e.g. dead with or without valid stack). 
}; //===================================================================================================================== @@ -292,6 +300,10 @@ struct _AmdRaySystemState // Apply known bits common to all TraceRay calls incomingFlags = ((incomingFlags & ~AmdTraceRayGetKnownUnsetRayFlags()) | AmdTraceRayGetKnownSetRayFlags()); #endif + // Apply options overrides + incomingFlags &= ~Options::getRayFlagsOverrideForceDisableMask(); + incomingFlags |= Options::getRayFlagsOverrideForceEnableMask(); + return incomingFlags; } @@ -494,7 +506,6 @@ struct _AmdTraversalState // register space reserved for ray attributes in general float2 committedBarycentrics; - uint nextNodePtr; uint instNodePtr; // Traversal stack state. Note, on some hardware this data represents a packed stack pointer that will @@ -647,14 +658,23 @@ struct _AmdSystemData } #endif - bool IsDeadLane() + bool IsDeadLaneWithoutStack() + { + // This type of dead lane is only possible when the continuations stack is in global memory. + // Explicitly check the compile time setting to help the compiler eliminte unnecessary code at runtime. + return (dispatch.nextNodePtr == DEAD_LANE_WITHOUT_STACK) && _AmdContinuationStackIsGlobal(); + } + + bool IsDeadLaneWithStack() { - return traversal.nextNodePtr == DEAD_LANE; + // This type of dead lane is only possible when persistent launch is enabled. + // Explicitly check the compile time setting to help the compiler eliminte unnecessary code at runtime. 
+ return (dispatch.nextNodePtr == DEAD_LANE_WITH_STACK) && Options::getPersistentLaunchEnabled(); } bool IsTraversal() { - return IsValidNode(traversal.nextNodePtr); + return IsValidNode(dispatch.nextNodePtr); } bool IsChsOrMiss(in uint state) @@ -685,9 +705,14 @@ struct _AmdSystemData return IsChsOrMiss(state) && IsValidNode(traversal.committed.instNodePtr); } - static _AmdSystemData MakeDeadLane(); + static _AmdSystemData MakeDeadLaneWithStack(); + static _AmdSystemData MakeDeadLaneWithoutStack(); + // Note: _AmdDispatchSystemData must be the first member of _AmdSystemData. This allows us to save some VGPRs if + // we need to call a function that takes _AmdSystemData but doesn't actually need ray or traversal data. + // For example, the launch kernel can make a dead lane and enqueue traversal with just dispatch.nextNodePtr. _AmdDispatchSystemData dispatch; + _AmdRaySystemState ray; _AmdTraversalState traversal; #if DEVELOPER @@ -739,6 +764,7 @@ DECLARE_ENQUEUE(, uint64_t returnAddr, _AmdSystemData data) DECLARE_ENQUEUE(Traversal, uint64_t dummyReturnAddr, _AmdSystemData data) DECLARE_ENQUEUE(RayGen, uint64_t dummyReturnAddr, _AmdDispatchSystemData data) DECLARE_WAIT_ENQUEUE(Traversal, uint64_t dummyReturnAddr, _AmdSystemData data) +DECLARE_WAIT_ENQUEUE(Traversal, uint64_t dummyReturnAddr, _AmdDispatchSystemData data) DECLARE_ENQUEUE(AnyHit, uint64_t returnAddr, _AmdAnyHitSystemData data, float2 candidateBarycentrics) DECLARE_ENQUEUE(Intersection, uint64_t returnAddr, _AmdAnyHitSystemData data) @@ -762,19 +788,39 @@ DECLARE_GET_UNINITIALIZED(F32, float) DECLARE_GET_UNINITIALIZED(I32, uint32_t) DECLARE_GET_UNINITIALIZED(I64, uint64_t) DECLARE_GET_UNINITIALIZED(SystemData, _AmdSystemData) +DECLARE_GET_UNINITIALIZED(DispatchSystemData, _AmdDispatchSystemData) -#if CONTINUATIONS_LGC_STACK_LOWERING DECLARE_CONT_STACK_LOAD_LAST_USE(U32, uint32_t) DECLARE_CONT_STACK_STORE(U32, uint32_t value) DECLARE_CONT_STACK_LOAD_LAST_USE(U64, uint64_t) DECLARE_CONT_STACK_STORE(U64, 
uint64_t value) #endif -#endif -inline _AmdSystemData _AmdSystemData::MakeDeadLane() +inline _AmdDispatchSystemData _AmdDispatchSystemData::MakeDeadLaneWithStack() +{ + _AmdDispatchSystemData data = _AmdGetUninitializedDispatchSystemData(); + data.nextNodePtr = DEAD_LANE_WITH_STACK; + return data; +} + +inline _AmdDispatchSystemData _AmdDispatchSystemData::MakeDeadLaneWithoutStack() +{ + _AmdDispatchSystemData data = _AmdGetUninitializedDispatchSystemData(); + data.nextNodePtr = DEAD_LANE_WITHOUT_STACK; + return data; +} + +inline _AmdSystemData _AmdSystemData::MakeDeadLaneWithStack() +{ + _AmdSystemData data = _AmdGetUninitializedSystemData(); + data.dispatch.nextNodePtr = DEAD_LANE_WITH_STACK; + return data; +} + +inline _AmdSystemData _AmdSystemData::MakeDeadLaneWithoutStack() { _AmdSystemData data = _AmdGetUninitializedSystemData(); - data.traversal.nextNodePtr = DEAD_LANE; + data.dispatch.nextNodePtr = DEAD_LANE_WITHOUT_STACK; return data; } @@ -896,6 +942,14 @@ static float4x3 WorldToObject4x3(in uint64_t tlasBaseAddr, in uint instNodePtr) return transpose(WorldToObject3x4(tlasBaseAddr, instNodePtr)); } +//===================================================================================================================== +__decl uint3 AmdExtThreadIdInGroupCompute() DUMMY_UINT3_FUNC +__decl uint AmdExtFlattenedThreadIdInGroupCompute() DUMMY_UINT_FUNC +__decl uint AmdExtLoadDwordAtAddr(uint64_t addr, uint offset) DUMMY_UINT_FUNC +__decl void AmdExtStoreDwordAtAddr(uint64_t addr, uint offset, uint value) DUMMY_VOID_FUNC +__decl void AmdExtDeviceMemoryAcquire() DUMMY_VOID_FUNC +__decl void AmdExtDeviceMemoryRelease() DUMMY_VOID_FUNC + //===================================================================================================================== // Implementation of DispatchRaysIndex. 
export uint3 _cont_DispatchRaysIndex3(in _AmdDispatchSystemData data) @@ -907,13 +961,28 @@ export uint3 _cont_DispatchRaysIndex3(in _AmdDispatchSystemData data) // Load dispatch dimensions from constant buffer. static uint3 GetDispatchRaysDimensions() { - const uint width = DispatchRaysConstBuf.rayDispatchWidth; - const uint height = DispatchRaysConstBuf.rayDispatchHeight; - const uint depth = DispatchRaysConstBuf.rayDispatchDepth; + const uint width = DispatchRaysConstBuf.rayDispatchWidth; + const uint height = DispatchRaysConstBuf.rayDispatchHeight; + const uint depth = DispatchRaysConstBuf.rayDispatchDepth; return uint3(width, height, depth); } +//===================================================================================================================== +// Persistent dispatch size (1D). +static uint3 GetPersistentDispatchSize() +{ + // Groups needed to cover the dispatch if each thread only processes 1 ray + const uint3 rayDispatch = GetDispatchRaysDimensions(); + const uint threadsNeeded = rayDispatch.x * rayDispatch.y * rayDispatch.z; + const uint3 groupDim = AmdExtGroupDimCompute(); + const uint groupsNeeded = RoundUpQuotient(threadsNeeded, groupDim.x * groupDim.y * groupDim.z); + + // Dispatch size is the lesser of rayDispatchMaxGroups and groupsNeeded + // rayDispatchMaxGroups would mean threads handle >= 1 ray, groupsNeeded would mean threads handle <= 1 ray + return min(DispatchRaysConstBuf.rayDispatchMaxGroups, groupsNeeded); +} + //===================================================================================================================== // Implementation of DispatchRaysDimensions(). 
export uint3 _cont_DispatchRaysDimensions3(in _AmdDispatchSystemData data) @@ -987,24 +1056,19 @@ export float _cont_RayTCurrent(in _AmdSystemData data, in _AmdPrimitiveSystemSta #endif //===================================================================================================================== -__decl uint3 AmdExtThreadIdInGroupCompute() DUMMY_UINT3_FUNC -__decl uint AmdExtLoadDwordAtAddr(uint64_t addr, uint offset) DUMMY_UINT_FUNC -__decl void AmdExtStoreDwordAtAddr(uint64_t addr, uint offset, uint value) DUMMY_VOID_FUNC - -//===================================================================================================================== -// Map a thread to a ray, some threads could end up with non-existent (invalid) rays. Assuming numthreads(32, 1, 1). +// Map a thread to a ray, some threads could end up with non-existent (invalid) rays. // Note D3D12_DISPATCH_RAYS_DESC::(w x h x d) are organized to DispatchDims = (?, d, 1). static uint3 GetDispatchId() { const uint3 threadIdInGroup = AmdExtThreadIdInGroupCompute(); const uint3 groupId = AmdExtGroupIdCompute(); const uint3 dims = GetDispatchRaysDimensions(); + const uint threadGroupSize = AmdExtGroupDimCompute().x * AmdExtGroupDimCompute().y * AmdExtGroupDimCompute().z; uint3 dispatchId; dispatchId.z = groupId.y; if ((dims.x > 1) && (dims.y > 1)) { - // Use 8 x 4 tiles. /* Sample: D3D12_DISPATCH_RAYS_DESC::(w x h x d) = (18, 6, 1). Divided into 8x4 tiles(boxes). A number in a box is the group id. @@ -1020,12 +1084,12 @@ static uint3 GetDispatchId() const uint yTile = groupId.x / wTile; dispatchId.x = xTile * 8 + (threadIdInGroup.x % 8); - dispatchId.y = yTile * 4 + (threadIdInGroup.x / 8); + dispatchId.y = yTile * (threadGroupSize / 8) + (threadIdInGroup.x / 8); } else { // Do a naive 1:1 simple map. 
- const uint id = threadIdInGroup.x + 32 * groupId.x; + const uint id = threadIdInGroup.x + threadGroupSize * groupId.x; const uint gridSize = dims.x * dims.y; // width x height dispatchId.y = id / dims.x; dispatchId.x = id - (dispatchId.y * dims.x); @@ -1034,6 +1098,45 @@ static uint3 GetDispatchId() return dispatchId; } +//===================================================================================================================== +// Compute the X/Y/Z ray index based on the dispatch dimensions and a 32-bit dispatch ID +static uint3 GetDispatchId(uint width, uint height, uint dispatchId) +{ + // Progressively work from Z to Y to X, subtracting as we go along + + // Determine the Z index - divide by size of the 2D plane + const uint planeSize = width * height; + const uint z = dispatchId / planeSize; + dispatchId -= z * planeSize; + + // Split the 2D plane into 8 x 64 tiles + const uint TileWidth = 8; + const uint TileHeight = 64; + + // Determine which tile along the Y axis - divide by size of the 2D strip + const uint yTile = dispatchId / TileHeight / width; + dispatchId -= yTile * TileHeight * width; + + // Determine which tile along the X axis - divide by size of the 2D strip + // Take care in case the dispatch height is not a multiple of TileHeight + const uint xStripHeight = min(TileHeight, height - (yTile * TileHeight)); + const uint xStripSize = TileWidth * xStripHeight; + const uint xTile = dispatchId / xStripSize; + dispatchId -= xTile * xStripSize; + + // Determine Y position within the tile - divide by width of the 2D strip + // Take care in case the dispatch width is not a multiple of TileWidth + const uint xStripWidth = min(TileWidth, width - xTile * TileWidth); + const uint y = dispatchId / xStripWidth; + dispatchId -= y * xStripWidth; + + // Remainder is the X position within the tile + const uint x = dispatchId; + + // Return ray index - X/Y based on their respective tiles and position within + return uint3(xTile * TileWidth + x, yTile * 
TileHeight + y, z); +} + //===================================================================================================================== export uint _cont_InstanceIndex(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) { @@ -1144,7 +1247,7 @@ static void AcceptHit(inout_param(_AmdAnyHitSystemData) data, bool endSearch) data.base.traversal.committed = data.candidate; if (endSearch) { - data.base.traversal.nextNodePtr = END_SEARCH; // End search + data.base.dispatch.nextNodePtr = END_SEARCH; // End search } } } @@ -1186,9 +1289,9 @@ export bool _cont_IsEndSearch(in _AmdAnyHitSystemData data) { // If AnyHit shader called AcceptHitAndEndSearch, or RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH was set, nextNodePtr // is END_SEARCH. - // On the other side, the values Traversal function may set to traversal.nextNodePtr on its exit are different: + // On the other side, the values Traversal function may set to dispatch.nextNodePtr on its exit are different: // normal pointers, TERMINAL_NODE or INVALID_NODE. - return (data.base.traversal.nextNodePtr == END_SEARCH); + return (data.base.dispatch.nextNodePtr == END_SEARCH); } //===================================================================================================================== @@ -1202,9 +1305,10 @@ export uint _cont_GetContinuationStackAddr() { const uint3 threadIdInGroup = AmdExtThreadIdInGroupCompute(); const uint3 groupId = AmdExtGroupIdCompute(); + const uint threadGroupSize = AmdExtGroupDimCompute().x * AmdExtGroupDimCompute().y * AmdExtGroupDimCompute().z; - // Do a naive 1:1 simple map. Also for now, assume numthreads(32, 1, 1) - const uint id = threadIdInGroup.x + 32 * groupId.x; + // Do a naive 1:1 simple map. 
+ const uint id = threadIdInGroup.x + threadGroupSize * groupId.x; offset = id * DispatchRaysConstBuf.cpsFrontendStackSize; } @@ -1377,12 +1481,17 @@ static void RayHistorySetCandidateTCurrent(inout_param(_AmdSystemData) data, flo } //===================================================================================================================== -static void RayHistoryInitStaticId() +static void RayHistoryInitStaticId(inout_param(_AmdSystemData) data) { #if DEVELOPER if (EnableTraversalCounter()) { +#if GPURT_CLIENT_INTERFACE_MAJOR_VERSION >= 50 + data.dispatch.staticId = AmdTraceRayInitStaticId(); +#else AmdTraceRayInitStaticId(); + data.dispatch.staticId = AmdTraceRayGetStaticId(); +#endif } #endif } @@ -1436,7 +1545,7 @@ static void RayHistoryWriteBegin(inout_param(_AmdSystemData) data) data.ray.Flags(), data.ray.traceParameters, rayDesc, - AmdTraceRayGetStaticId(), + data.dispatch.staticId, data.counter.dynamicId, data.dispatch.parentId); WriteRayHistoryTokenTimeStamp(rayId, data.counter.timerBegin); @@ -1582,7 +1691,7 @@ static void RayHistoryWriteAnyHitOrProceduralStatus(inout_param(_AmdSystemData) if (EnableTraversalCounter()) { const uint rayId = GetRayId(_cont_DispatchRaysIndex3(data.dispatch)); - const uint status = (data.traversal.nextNodePtr == END_SEARCH) + const uint status = (data.dispatch.nextNodePtr == END_SEARCH) ? HIT_STATUS_ACCEPT_AND_END_SEARCH : (data.ray.AnyHitDidAccept() ? HIT_STATUS_ACCEPT : HIT_STATUS_IGNORE); @@ -1840,24 +1949,97 @@ static HitGroupInfo GetHitGroupInfo( #include "Continuations2_0.hlsl" #if CONTINUATION_ON_GPU +//===================================================================================================================== +static void LaunchRayGen(bool setupStack) +{ + uint3 dispatchId; + bool valid; + + if (Options::getPersistentLaunchEnabled() == false) + { + // Each thread will process <= 1 ray. No need for extra counter logic. 
+ dispatchId = GetDispatchId(); + valid = (dispatchId.x < DispatchRaysConstBuf.rayDispatchWidth && + dispatchId.y < DispatchRaysConstBuf.rayDispatchHeight); + } + else + { + // This is a persistent launch where each thread will process >= 1 ray. + + // This is written in a way that is intended to be correct even if threads don't reconverge after calling into + // the ray generation shader. + uint localWorkId; + const uint popCount = WaveActiveCountBits(true); + + if (WaveIsFirstLane()) + { + localWorkId = AmdTraceRayPersistentLdsAtomicAdd(0, popCount); + } + localWorkId = WaveReadLaneFirst(localWorkId) + WavePrefixCountBits(true); + + const uint3 rayDims = GetDispatchRaysDimensions(); + const uint tgCount = GetPersistentDispatchSize(); + + // Single dimension dispatch so the flattened group ID is the same as the x component of the group ID + const uint tgId = AmdExtGroupIdCompute().x; + + // Interleave waves' worth of work among CUs so that every CU does approximately the same amount of work even + // for dispatches that are smaller than the maximum occupancy of the GPU. This is probably also a bit better + // for memory and shader execution locality, since CUs should tend to stay roughly within the same region of + // the dispatch. Assume numthreads(32, 1, 1). + const uint lowPart = localWorkId & 31; + const uint highPart = localWorkId & ~31; + const uint flatDispatchId = highPart * tgCount + tgId * 32 + lowPart; + + dispatchId = GetDispatchId(rayDims.x, rayDims.y, flatDispatchId); + valid = flatDispatchId < (rayDims.x * rayDims.y * rayDims.z); + } + + // With persistent launch every lane gets a stack + if (setupStack) + { + _AmdContStackSetPtr(_cont_GetContinuationStackAddr()); + } + + if (WaveActiveAllTrue(!valid)) + { + // This wave is done. 
+ _AmdComplete(); + } + + // But only lanes that have a valid dispatch id execute RGS, the others stay dead: + if (valid) + { + _AmdDispatchSystemData systemData; + systemData.PackDispatchId(dispatchId); + systemData.shaderRecIdx = _AmdGetUninitializedI32(); +#if DEVELOPER + systemData.parentId = -1; +#endif + _AmdEnqueueRayGen(GetRayGenVpc(), _AmdGetUninitializedI64(), systemData); + } + else if (Options::getPersistentLaunchEnabled()) + { + _AmdDispatchSystemData systemData = _AmdDispatchSystemData::MakeDeadLaneWithStack(); + _AmdWaitEnqueueTraversal(GetTraversalVpc(), -1, _AmdGetUninitializedI64(), systemData); + } +} + //===================================================================================================================== // KernelEntry is entry function of the RayTracing continuation mode export void _cont_KernelEntry() { - _AmdDispatchSystemData systemData; - uint3 dispatchId = GetDispatchId(); - systemData.PackDispatchId(dispatchId); - systemData.shaderRecIdx = _AmdGetUninitializedI32(); - GPU_ASSERT(dispatchId.z < DispatchRaysConstBuf.rayDispatchDepth); - if (dispatchId.x >= DispatchRaysConstBuf.rayDispatchWidth || - dispatchId.y >= DispatchRaysConstBuf.rayDispatchHeight) + if (Options::getPersistentLaunchEnabled()) { - return; - } + if (AmdExtFlattenedThreadIdInGroupCompute() == 0) + { + AmdTraceRayPersistentLdsWrite(0, 0); + } - _AmdContStackSetPtr(_cont_GetContinuationStackAddr()); + GroupMemoryBarrierWithGroupSync(); + } - _AmdEnqueueRayGen(GetRayGenVpc(), _AmdGetUninitializedI64(), systemData); + LaunchRayGen(true); } //===================================================================================================================== @@ -1934,9 +2116,11 @@ export void _cont_TraceRay( { case RayTracingIpLevel::RtIp1_1: traversal = InitTraversalState1_1(instanceInclusionMask, rayDesc, isValid); + dispatch.nextNodePtr = isValid ? 
CreateRootNodePointer1_1() : INVALID_NODE; break; case RayTracingIpLevel::RtIp2_0: traversal = InitTraversalState2_0(instanceInclusionMask, rayDesc, isValid); + dispatch.nextNodePtr = isValid ? CreateRootNodePointer1_1() : TERMINAL_NODE; break; default: break; @@ -1947,7 +2131,7 @@ export void _cont_TraceRay( data.ray = ray; data.traversal = traversal; - RayHistoryInitStaticId(); + RayHistoryInitStaticId(data); RayHistoryWriteBegin(data); const uint callerShaderRecIdx = dispatch.shaderRecIdx; // 0 if from RayGen. @@ -2028,8 +2212,7 @@ static void TraversalInternal( { switch (_AmdGetRtip()) { -#if (GPURT_RTIP_LEVEL == 20) || (GPURT_RTIP_LEVEL == 0) - // Level 20 is used for legacy variants +#if (GPURT_RTIP_LEVEL == GPURT_RTIP_LEGACY_LEVEL) || (GPURT_RTIP_LEVEL == 0) case RayTracingIpLevel::RtIp1_1: TraversalInternal1_1(data, state, candidate, candidateBarycentrics); break; @@ -2049,7 +2232,8 @@ static void EnqueueNextShader(bool hasWorkToDo, uint64_t nextShaderAddr, uint64_ if (_AmdContinuationStackIsGlobal()) { // No work to do = dead lane, jump to traversal as a synchronization point with an empty system data - _AmdWaitEnqueueTraversal(GetTraversalVpc(), -1, _AmdGetUninitializedI64(), _AmdSystemData::MakeDeadLane()); + _AmdSystemData sysData = _AmdSystemData::MakeDeadLaneWithoutStack(); + _AmdWaitEnqueueTraversal(GetTraversalVpc(), -1, _AmdGetUninitializedI64(), sysData); } else { @@ -2118,11 +2302,6 @@ static void EnterSchedulerSection() export void _cont_Traversal( inout_param(_AmdSystemData) data) { -#if GPURT_CLIENT_INTERFACE_MAJOR_VERSION >= 41 - data.ray.PackAccelStructAndRayflags( - data.ray.AccelStruct(), - (data.ray.IncomingFlags() & ~AmdTraceRayGetKnownUnsetRayFlags()) | AmdTraceRayGetKnownSetRayFlags()); -#endif // Discard data that doesn't need to be kept alive during Traversal data.dispatch.shaderRecIdx = _AmdGetUninitializedI32(); if (!IsBvhRebraid()) @@ -2132,7 +2311,8 @@ export void _cont_Traversal( } // Write AHS/IS returned status - if 
(!data.IsDeadLane()) + bool IsDeadLane = (data.IsDeadLaneWithoutStack() || data.IsDeadLaneWithStack()); + if (!IsDeadLane) { RayHistoryWriteAnyHitOrProceduralStatus(data); } @@ -2278,6 +2458,12 @@ static IntersectionResult TraceRayInternalCPSDebug( const bool isValid = true; // already verified in the caller + _AmdDispatchSystemData dispatch = (_AmdDispatchSystemData)0; + dispatch.PackDispatchId(GetDispatchId()); +#if DEVELOPER + dispatch.parentId = -1; +#endif + // Initialise traversal system state from driver intrinsic _AmdTraversalState traversal = (_AmdTraversalState)0; switch (rtIpLevel) @@ -2286,27 +2472,25 @@ static IntersectionResult TraceRayInternalCPSDebug( traversal = InitTraversalState1_1(0, rayDesc, isValid); + dispatch.nextNodePtr = isValid ? CreateRootNodePointer1_1() : INVALID_NODE; break; case GPURT_RTIP2_0: traversal = InitTraversalState2_0(0, rayDesc, isValid); + dispatch.nextNodePtr = isValid ? CreateRootNodePointer1_1() : TERMINAL_NODE; break; default: break; } _AmdSystemData sysData = (_AmdSystemData)0; - sysData.dispatch = (_AmdDispatchSystemData)0; - sysData.dispatch.PackDispatchId(GetDispatchId()); -#if DEVELOPER - sysData.dispatch.parentId = -1; -#endif - sysData.ray = ray; - sysData.traversal = traversal; + sysData.dispatch = dispatch; + sysData.ray = ray; + sysData.traversal = traversal; // Begin outer while loop - while (sysData.traversal.nextNodePtr < TERMINAL_NODE) + while (sysData.dispatch.nextNodePtr < TERMINAL_NODE) { _AmdTraversalResultData ret = TraversalInternalDebugWrapper(sysData); uint state = ret.state; @@ -2372,7 +2556,7 @@ static IntersectionResult TraceRayInternalCPSDebug( if (status == HIT_STATUS_ACCEPT_AND_END_SEARCH) { - sysData.traversal.nextNodePtr = INVALID_NODE; + sysData.dispatch.nextNodePtr = INVALID_NODE; } } } @@ -2407,7 +2591,7 @@ static IntersectionResult TraceRayInternalCPSDebug( sysData.traversal.committed = ret.candidate; if (status == HIT_STATUS_ACCEPT_AND_END_SEARCH) { - sysData.traversal.nextNodePtr = 
INVALID_NODE; + sysData.dispatch.nextNodePtr = INVALID_NODE; } } } diff --git a/src/shaders/InitExecuteIndirect.hlsl b/src/shaders/InitExecuteIndirect.hlsl index 2980cca..d2d079b 100644 --- a/src/shaders/InitExecuteIndirect.hlsl +++ b/src/shaders/InitExecuteIndirect.hlsl @@ -123,10 +123,13 @@ void InitExecuteIndirect( { DispatchRaysDimensions dispatchRaysDescDim; + // The DispatchRays indirect argument struct follows any resource bindings + const uint dispatchRaysDescOffset = (dispatchIdx * Constants.inputBytesPerDispatch) + Constants.bindingArgsSize; + if (Constants.indirectMode == DispatchDimensions) { // vkCmdTraceRaysIndirectKHR - ray trace query dimensions - const DispatchRaysDimensions dispatchRaysDesc = InputArgBuffer.Load(0); + const DispatchRaysDimensions dispatchRaysDesc = InputArgBuffer.Load(dispatchRaysDescOffset); dispatchRaysDescDim = dispatchRaysDesc; @@ -138,7 +141,7 @@ void InitExecuteIndirect( else { // vkCmdTraceRaysIndirect2KHR- shaderTable + ray trace query dimensions - const DispatchRaysDesc dispatchRaysDesc = InputArgBuffer.Load(0); + const DispatchRaysDesc dispatchRaysDesc = InputArgBuffer.Load(dispatchRaysDescOffset); dispatchRaysDescDim.width = dispatchRaysDesc.width; dispatchRaysDescDim.height = dispatchRaysDesc.height; @@ -164,7 +167,18 @@ void InitExecuteIndirect( OutputConstants[dispatchIdx].callableTableStrideInBytes = uint(dispatchRaysDesc.callableShaderTable.stride); } - uint outputOffset = 0; + uint inputOffset = dispatchIdx * Constants.inputBytesPerDispatch; + uint outputOffset = dispatchIdx * Constants.outputBytesPerDispatch; + + // Directly copy all indirect binding args from the app buffer to our temp internal buffer + for (uint i = 0; i < Constants.bindingArgsSize; i += sizeof(uint)) + { + const uint data = InputArgBuffer.Load(inputOffset); + OutputArgBuffer.Store(outputOffset, data); + outputOffset += sizeof(uint); + inputOffset += sizeof(uint); + } + uint3 dispatchDim = uint3(0, 0, 0); switch 
(Constants.dispatchDimSwizzleMode) diff --git a/src/shaders/MergeSort.hlsl b/src/shaders/MergeSort.hlsl index 1e38736..bd1921a 100644 --- a/src/shaders/MergeSort.hlsl +++ b/src/shaders/MergeSort.hlsl @@ -134,100 +134,149 @@ uint NumElemsLessThanOrEqualTo64(uint64_t val, uint offset, uint offsetNext, uin } //===================================================================================================================== -void GlobalMerge( - inout uint numTasksWait, - inout uint waveId, - uint localId, +void GlobalMergeIteration( uint groupId, - uint numPrimitives, - uint groupCapacity, - uint activeGroups, + uint localId, + uint globalId, uint groupSize, - uint outputKeysOffset, - uint outputValuesOffset, - uint keysOffsetSwap, - uint valuesOffsetSwap, + uint groupCapacity, + uint cmpGap, + uint splitGap, + uint numPrimitives, + uint srcOffsetKey, + uint srcOffsetVal, + uint dstOffsetKey, + uint dstOffsetVal, uint useMortonCode30) { - const uint numLevelsOfMergeTree = ceil(log2(activeGroups)); - activeGroups = RoundUpQuotient(numPrimitives, groupSize); - uint cmpGap = 1; - uint splitGap = 2; + const uint groupIdNew = groupId / 2; + const uint capacity = cmpGap * groupCapacity; + bool leftSubtree = true; + uint subtreeOffset; + uint subtreeEnd; + + // Left Subtree + if (groupIdNew % splitGap < cmpGap) + { + subtreeOffset = capacity * (groupIdNew / cmpGap) + capacity; + subtreeEnd = (subtreeOffset + capacity > numPrimitives) ? 
numPrimitives : subtreeOffset + capacity; + } + // Right Subtree + else + { + subtreeEnd = (groupIdNew / cmpGap) * capacity; + subtreeOffset = subtreeEnd - capacity; + leftSubtree = false; + } - // Level 0 is the sorted partitions - for (uint level = 1; level <= numLevelsOfMergeTree; level++) + if (globalId < numPrimitives) { - BEGIN_TASK(activeGroups); - const uint groupIdNew = groupId / 2; - const uint capacity = cmpGap * groupCapacity; - bool leftSubtree = true; - uint subtreeOffset; - uint subtreeEnd; - - // Left Subtree - if (groupIdNew % splitGap < cmpGap) + if (useMortonCode30) { - subtreeOffset = capacity * (groupIdNew / cmpGap) + capacity; - subtreeEnd = (subtreeOffset + capacity > numPrimitives) ? numPrimitives : subtreeOffset + capacity; + const uint mortonCode = FetchMortonCode(srcOffsetKey, globalId); + const uint index = FetchSortedPrimIndex(srcOffsetVal, globalId); + + uint posInMergedList = localId + (groupId % splitGap) * groupSize + (groupIdNew / splitGap) * capacity * 2; + + posInMergedList += (leftSubtree) ? NumElemsLessThan(mortonCode, subtreeOffset, subtreeEnd, srcOffsetKey) : + NumElemsLessThanOrEqualTo(mortonCode, subtreeOffset, subtreeEnd, srcOffsetKey); + + WriteMortonCode(dstOffsetKey, posInMergedList, mortonCode); + WriteSortedPrimIndex(dstOffsetVal, posInMergedList, index); } - // Right Subtree else { - subtreeEnd = (groupIdNew / cmpGap) * capacity; - subtreeOffset = subtreeEnd - capacity; - leftSubtree = false; + const uint64_t mortonCode = FetchMortonCode64(srcOffsetKey, globalId); + const uint index = FetchSortedPrimIndex(srcOffsetVal, globalId); + + uint posInMergedList = localId + (groupId % splitGap) * groupSize + (groupIdNew / splitGap) * capacity * 2; + + posInMergedList += (leftSubtree) ? 
NumElemsLessThan64(mortonCode, subtreeOffset, subtreeEnd, srcOffsetKey) : + NumElemsLessThanOrEqualTo64(mortonCode, subtreeOffset, subtreeEnd, srcOffsetKey); + + WriteMortonCode64(dstOffsetKey, posInMergedList, mortonCode); + WriteSortedPrimIndex(dstOffsetVal, posInMergedList, index); } + } +} - if (globalId < numPrimitives) - { - if (useMortonCode30) - { - const uint mortonCode = FetchMortonCode(outputKeysOffset, globalId); - const uint index = FetchSortedPrimIndex(outputValuesOffset, globalId); +//===================================================================================================================== +void GlobalMerge( + inout uint numTasksWait, + inout uint waveId, + uint localId, + uint groupId, + uint numPrimitives, + uint groupCapacity, + uint numLocalSortedGroups, + uint groupSize, + uint offsetKeysOutput, + uint offsetValsOutput, + uint offsetKeysInput, + uint offsetValsInput, + uint useMortonCode30) +{ + const uint numLevelsOfMergeTree = ceil(log2(numLocalSortedGroups)); + const uint activeGroups = RoundUpQuotient(numPrimitives, groupSize); - uint posInMergedList = localId + (groupId % splitGap) * groupSize + (groupIdNew / splitGap) * capacity * 2; + // Level 0 is the local sort and always copies the sorted partitions into output buffers. The first iteration of global merge + // phase (i.e. Level 1) always copies from output buffers to swap buffers and then continues to ping-pong between these buffers + // at each iteration. + // + uint level = 1; - posInMergedList += (leftSubtree) ? NumElemsLessThan(mortonCode, subtreeOffset, subtreeEnd, outputKeysOffset) : - NumElemsLessThanOrEqualTo(mortonCode, subtreeOffset, subtreeEnd, outputKeysOffset); + // Level 0 is the sorted partitions + for (; level <= numLevelsOfMergeTree; level++) + { + // Odd levels copy from output buffers to swap buffers, while even levels copy from swap buffers to output + // buffers. + const uint srcOffsetKey = ((level & 1) == 1) ? 
offsetKeysOutput : offsetKeysInput; + const uint srcOffsetVal = ((level & 1) == 1) ? offsetValsOutput : offsetValsInput; - WriteMortonCode(keysOffsetSwap, posInMergedList, mortonCode); - WriteSortedPrimIndex(valuesOffsetSwap, posInMergedList, index); - } - else - { - const uint64_t mortonCode = FetchMortonCode64(outputKeysOffset, globalId); - const uint index = FetchSortedPrimIndex(outputValuesOffset, globalId); + const uint dstOffsetKey = ((level & 1) == 0) ? offsetKeysOutput : offsetKeysInput; + const uint dstOffsetVal = ((level & 1) == 0) ? offsetValsOutput : offsetValsInput; - uint posInMergedList = localId + (groupId % splitGap) * groupSize + (groupIdNew / splitGap) * capacity * 2; + BEGIN_TASK(activeGroups); - posInMergedList += (leftSubtree) ? NumElemsLessThan64(mortonCode, subtreeOffset, subtreeEnd, outputKeysOffset) : - NumElemsLessThanOrEqualTo64(mortonCode, subtreeOffset, subtreeEnd, outputKeysOffset); + const uint cmpGap = 1u << (level - 1); + const uint splitGap = 1u << level; + + GlobalMergeIteration(groupId, + localId, + globalId, + groupSize, + groupCapacity, + cmpGap, + splitGap, + numPrimitives, + srcOffsetKey, + srcOffsetVal, + dstOffsetKey, + dstOffsetVal, + useMortonCode30); - WriteMortonCode64(keysOffsetSwap, posInMergedList, mortonCode); - WriteSortedPrimIndex(valuesOffsetSwap, posInMergedList, index); - } - } END_TASK(activeGroups); + } - splitGap <<= 1; - cmpGap <<= 1; - + // If we ping-ponged to an odd level, we need to copy back the data from swap buffers to output buffers + if ((level & 1) == 0) + { BEGIN_TASK(activeGroups); if (globalId < numPrimitives) { if (useMortonCode30) { - const uint mortonCode = FetchMortonCode(keysOffsetSwap, globalId); - WriteMortonCode( outputKeysOffset, globalId, mortonCode); + const uint mortonCode = FetchMortonCode(offsetKeysInput, globalId); + WriteMortonCode(offsetKeysOutput, globalId, mortonCode); } else { - const uint64_t mortonCode = FetchMortonCode64(keysOffsetSwap, globalId); - WriteMortonCode64( 
outputKeysOffset, globalId, mortonCode); + const uint64_t mortonCode = FetchMortonCode64(offsetKeysInput, globalId); + WriteMortonCode64(offsetKeysOutput, globalId, mortonCode); } - const uint index = FetchSortedPrimIndex(valuesOffsetSwap, globalId); - WriteSortedPrimIndex(outputValuesOffset, globalId, index); + const uint index = FetchSortedPrimIndex(offsetValsInput, globalId); + WriteSortedPrimIndex(offsetValsOutput, globalId, index); } END_TASK(activeGroups); } @@ -464,10 +513,10 @@ void MergeSortImpl( uint localId, uint groupId, uint numPrimitives, - uint inputKeysOffset, - uint outputKeysOffset, - uint outputValuesOffset, - uint valuesOffsetSwap, + uint offsetKeysInput, + uint offsetKeysOutput, + uint offsetValsInput, + uint offsetValsOutput, uint useMortonCode30) { const uint groupCapacity = BUILD_THREADGROUP_SIZE * 2; @@ -480,9 +529,9 @@ void MergeSortImpl( numPrimitives, groupCapacity, BUILD_THREADGROUP_SIZE, - inputKeysOffset, - outputKeysOffset, - outputValuesOffset, + offsetKeysInput, + offsetKeysOutput, + offsetValsOutput, useMortonCode30); END_TASK(activeGroups); @@ -494,10 +543,10 @@ void MergeSortImpl( groupCapacity, activeGroups, BUILD_THREADGROUP_SIZE, - outputKeysOffset, - outputValuesOffset, - inputKeysOffset, - valuesOffsetSwap, + offsetKeysOutput, + offsetValsOutput, + offsetKeysInput, + offsetValsInput, useMortonCode30); // Implicit Global Sync at the end of GlobalMerge(); } @@ -526,8 +575,125 @@ void MergeSort( numPrimitives, ShaderConstants.offsets.mortonCodes, ShaderConstants.offsets.mortonCodesSorted, - ShaderConstants.offsets.primIndicesSorted, ShaderConstants.offsets.primIndicesSortedSwap, + ShaderConstants.offsets.primIndicesSorted, Settings.useMortonCode30); } + +//===================================================================================================================== +// Main Function : MergeSortLocal +//===================================================================================================================== 
+[RootSignature(RootSig)] +[numthreads(BUILD_THREADGROUP_SIZE, 1, 1)] +void MergeSortLocal( + uint globalId : SV_DispatchThreadID, + uint localId : SV_GroupThreadID, + uint groupId : SV_GroupID) +{ + const uint numPrimitives = FetchTaskCounter( + ShaderConstants.offsets.encodeTaskCounter + ENCODE_TASK_COUNTER_PRIM_REFS_OFFSET); + + const uint groupCapacity = BUILD_THREADGROUP_SIZE * 2; + FetchAndLocalSortAndWriteBack(localId, + groupId, + globalId, + numPrimitives, + groupCapacity, + BUILD_THREADGROUP_SIZE, + ShaderConstants.offsets.mortonCodes, + ShaderConstants.offsets.mortonCodesSorted, + ShaderConstants.offsets.primIndicesSorted, + Settings.useMortonCode30); +} + +//===================================================================================================================== +// Main Function : MergeSortGlobalIteration +//===================================================================================================================== +[RootSignature(RootSig)] +[numthreads(BUILD_THREADGROUP_SIZE, 1, 1)] +void MergeSortGlobalIteration( + uint globalId : SV_DispatchThreadID, + uint localId : SV_GroupThreadID, + uint groupId : SV_GroupID) +{ + const uint numPrimitives = FetchTaskCounter( + ShaderConstants.offsets.encodeTaskCounter + ENCODE_TASK_COUNTER_PRIM_REFS_OFFSET); + + const uint groupSize = BUILD_THREADGROUP_SIZE; + const uint groupCapacity = BUILD_THREADGROUP_SIZE * 2; + + const uint offsetKeysOutput = ShaderConstants.offsets.mortonCodesSorted; + const uint offsetValsOutput = ShaderConstants.offsets.primIndicesSorted; + const uint offsetKeysInput = ShaderConstants.offsets.mortonCodes; + const uint offsetValsInput = ShaderConstants.offsets.primIndicesSortedSwap; + + // Level 0 is the local sort and always copies the sorted partitions into output buffers. The first iteration of global merge + // phase (i.e. Level 1) always copies from output buffers to swap buffers and then continues to ping-pong between these buffers + // at each iteration. 
+ // + + // TODO: Fetch from root constants + const uint level = ShaderRootConstants.PassIndex(); + + // Odd levels copy from output buffers to swap buffers, while even levels copy from swap buffers to output + // buffers. + const uint srcOffsetKey = ((level & 1) == 1) ? offsetKeysOutput : offsetKeysInput; + const uint srcOffsetVal = ((level & 1) == 1) ? offsetValsOutput : offsetValsInput; + + const uint dstOffsetKey = ((level & 1) == 0) ? offsetKeysOutput : offsetKeysInput; + const uint dstOffsetVal = ((level & 1) == 0) ? offsetValsOutput : offsetValsInput; + + const uint cmpGap = 1u << (level - 1); + const uint splitGap = 1u << level; + + GlobalMergeIteration(groupId, + localId, + globalId, + groupSize, + groupCapacity, + cmpGap, + splitGap, + numPrimitives, + srcOffsetKey, + srcOffsetVal, + dstOffsetKey, + dstOffsetVal, + Settings.useMortonCode30); +} + +//===================================================================================================================== +// Main Function : MergeSortCopyLastLevel +//===================================================================================================================== +[RootSignature(RootSig)] +[numthreads(BUILD_THREADGROUP_SIZE, 1, 1)] +void MergeSortCopyLastLevel( + uint globalId : SV_DispatchThreadID, + uint localId : SV_GroupThreadID, + uint groupId : SV_GroupID) +{ + const uint numPrimitives = FetchTaskCounter( + ShaderConstants.offsets.encodeTaskCounter + ENCODE_TASK_COUNTER_PRIM_REFS_OFFSET); + + const uint offsetKeysOutput = ShaderConstants.offsets.mortonCodesSorted; + const uint offsetValsOutput = ShaderConstants.offsets.primIndicesSorted; + const uint offsetKeysInput = ShaderConstants.offsets.mortonCodes; + const uint offsetValsInput = ShaderConstants.offsets.primIndicesSortedSwap; + + if (globalId < numPrimitives) + { + if (Settings.useMortonCode30) + { + const uint mortonCode = FetchMortonCode(offsetKeysInput, globalId); + WriteMortonCode(offsetKeysOutput, globalId, mortonCode); + } + 
else + { + const uint64_t mortonCode = FetchMortonCode64(offsetKeysInput, globalId); + WriteMortonCode64(offsetKeysOutput, globalId, mortonCode); + } + + const uint index = FetchSortedPrimIndex(offsetValsInput, globalId); + WriteSortedPrimIndex(offsetValsOutput, globalId, index); + } +} #endif diff --git a/src/shaders/PairCompression.hlsl b/src/shaders/PairCompression.hlsl index 799f3b0..91aac60 100644 --- a/src/shaders/PairCompression.hlsl +++ b/src/shaders/PairCompression.hlsl @@ -142,24 +142,18 @@ void WriteCompressedNodes( const uint packedGeometryInfoData = DstBuffer.Load(geometryInfoOffset + GEOMETRY_INFO_FLAGS_AND_NUM_PRIMS_OFFSET); const uint geometryFlags = ExtractGeometryInfoFlags(packedGeometryInfoData); - uint triangleId = WriteTriangleIdField(0, - NODE_TYPE_TRIANGLE_0, - GetQuadScratchNodeVertexOffset(quad.scratchNodeIndexAndOffset[0]), - geometryFlags); + uint quadSwizzle = GetQuadScratchNodeVertexOffset(quad.scratchNodeIndexAndOffset[0]); // If this quad has another triangle, update triangle ID for the pair and update referenced scratch // triangle node if (quad.scratchNodeIndexAndOffset[1] != INVALID_IDX) { - triangleId = WriteTriangleIdField(triangleId, - NODE_TYPE_TRIANGLE_1, - GetQuadScratchNodeVertexOffset(quad.scratchNodeIndexAndOffset[1]), - geometryFlags); + quadSwizzle |= GetQuadScratchNodeVertexOffset(quad.scratchNodeIndexAndOffset[1]) << 4; const uint scratchNodeOffset = CalcScratchNodeOffset(scratchNodesScratchOffset, keptIndex); // Update triangle ID field in scratch node - const uint packedFlags = (triangleNode.packedFlags & 0x0000ffff) | (triangleId << 16); + const uint packedFlags = (triangleNode.packedFlags & 0x0000ffff) | (quadSwizzle << 16); WriteScratchNodeDataAtOffset(scratchNodeOffset, SCRATCH_NODE_FLAGS_OFFSET, packedFlags); // Repurpose the node pointer for saving the index of the other node in the pair. 
diff --git a/src/shaders/RayQuery.hlsl b/src/shaders/RayQuery.hlsl index 49e0564..ea80508 100644 --- a/src/shaders/RayQuery.hlsl +++ b/src/shaders/RayQuery.hlsl @@ -108,7 +108,7 @@ static bool RayQueryProceedCommon( if (rayQuery.committed.currNodePtr != INVALID_NODE) { - uint instNodeIndex = FetchInstanceIdx(rtIpLevel, GetRayQueryTopBvhAddress(rayQuery), rayQuery.lastInstanceNode); + uint instNodeIndex = FetchInstanceIdx(rtIpLevel, GetRayQueryTopBvhAddress(rayQuery), rayQuery.committed.instNodePtr); WriteRayHistoryTokenEnd( rayId, diff --git a/src/shaders/TaskQueueCounter.hlsl b/src/shaders/TaskQueueCounter.hlsl index 84aa2e5..3e2c5dd 100644 --- a/src/shaders/TaskQueueCounter.hlsl +++ b/src/shaders/TaskQueueCounter.hlsl @@ -22,6 +22,13 @@ * SOFTWARE. * **********************************************************************************************************************/ +#define STATE_TASK_QUEUE_PHASE_OFFSET 0 +#define STATE_TASK_QUEUE_START_PHASE_INDEX_OFFSET 4 +#define STATE_TASK_QUEUE_END_PHASE_INDEX_OFFSET 8 +#define STATE_TASK_QUEUE_TASK_COUNTER_OFFSET 12 +#define STATE_TASK_QUEUE_NUM_TASKS_DONE_OFFSET 16 + +//===================================================================================================================== void AllocTasks(const uint numTasks, const uint phase, uint taskQueueOffset) { // start = end diff --git a/src/shaders/TriangleSplitting.hlsl b/src/shaders/TriangleSplitting.hlsl index 6860380..97cecee 100644 --- a/src/shaders/TriangleSplitting.hlsl +++ b/src/shaders/TriangleSplitting.hlsl @@ -22,6 +22,38 @@ * SOFTWARE. 
* **********************************************************************************************************************/ +#define TS_PHASE_INIT 0 +#define TS_PHASE_CALC_SUM 1 +#define TS_PHASE_ALLOC_REFS 2 +#define TS_PHASE_SPLIT 3 +#define TS_PHASE_DONE 4 + +struct ScratchTSRef +{ + uint leafIndex; + uint numSplits; + + uint splitLeafBaseIndex; + + BoundingBox bbox; +}; + +struct ScratchTSState +{ + uint refListIndex; + uint numRefs; + uint numRefsAlloc; + float sum; + uint mutex; +}; + +#define STATE_TS_REF_LIST_INDEX_OFFSET 0 +#define STATE_TS_NUM_REFS_OFFSET STATE_TS_REF_LIST_INDEX_OFFSET + 4 +#define STATE_TS_NUM_REFS_ALLOC_OFFSET STATE_TS_NUM_REFS_OFFSET + 4 +#define STATE_TS_SUM_OFFSET STATE_TS_NUM_REFS_ALLOC_OFFSET + 4 +#define STATE_TS_MUTEX_OFFSET STATE_TS_SUM_OFFSET + 4 + +//===================================================================================================================== // 32 bit constants struct TriangleSplittingArgs { diff --git a/src/shaders/Update.hlsl b/src/shaders/Update.hlsl index 035ad4b..3818053 100644 --- a/src/shaders/Update.hlsl +++ b/src/shaders/Update.hlsl @@ -133,8 +133,8 @@ void Update( const uint numGroups = ShaderRootConstants.numThreads / BUILD_THREADGROUP_SIZE; - ClearUpdateFlags(globalId); BEGIN_TASK(numGroups); + ClearUpdateFlags(globalId); EncodePrimitives(globalId, GEOMETRY_TYPE_TRIANGLES); END_TASK(numGroups); diff --git a/src/shadersClean/common/Bits.hlsli b/src/shadersClean/common/Bits.hlsli new file mode 100644 index 0000000..857d03e --- /dev/null +++ b/src/shadersClean/common/Bits.hlsli @@ -0,0 +1,166 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#ifndef BITS_HLSLI +#define BITS_HLSLI + +//===================================================================================================================== +static uint LowPart(uint64_t value) +{ + return uint(value); +} + +//===================================================================================================================== +static uint HighPart(uint64_t value) +{ + return uint(value >> 32); +} + +//===================================================================================================================== +// Helper function for producing a 32 bit mask of one bit +inline uint32_t bit(uint32_t index) +{ + return 1u << index; +} + +//===================================================================================================================== +// Helper function for producing a 16 bit mask of one bit +inline uint16_t bit16(uint16_t index) +{ + return uint16_t(1u << index); +} + +//===================================================================================================================== +// Helper function for producing a 64 bit mask of one bit +inline uint64_t bit64(uint32_t index) +{ + return 1ull << index; +} + +//===================================================================================================================== +// Helper function for generating a 32-bit bit mask +inline uint32_t bits(uint32_t bitcount) +{ + return (bitcount == 32) ? 0xFFFFFFFF : ((1u << bitcount) - 1); +} + +//===================================================================================================================== +// Helper function for generating a 16-bit bit mask +inline uint16_t bits16(uint16_t bitcount) +{ + return (bitcount == 16) ? 
uint16_t(0xFFFFu) : uint16_t((1u << bitcount) - 1);
+}
+
+//=====================================================================================================================
+// Helper function for generating a 64-bit bit mask
+inline uint64_t bits64(uint64_t bitcount)
+{
+    return (bitcount == 64) ? 0xFFFFFFFFFFFFFFFFull : ((1ull << bitcount) - 1ull);
+}
+
+//=====================================================================================================================
+// Helper function for inserting data into a src bitfield and returning the output
+static uint32_t bitFieldInsert(
+    in uint32_t src,
+    in uint32_t bitOffset,
+    in uint32_t numBits,
+    in uint32_t data)
+{
+    const uint32_t mask = bits(numBits);
+    src &= ~(mask << bitOffset);
+    return (src | ((data & mask) << bitOffset));
+}
+
+//=====================================================================================================================
+// Helper function for inserting data into a uint16_t src bitfield and returning the output
+static uint16_t bitFieldInsert16(
+    in uint16_t src,
+    in uint16_t bitOffset,
+    in uint16_t numBits,
+    in uint16_t data)
+{
+    const uint16_t mask = bits16(numBits);
+    src &= ~(mask << bitOffset);
+    return (src | ((data & mask) << bitOffset));
+}
+
+//=====================================================================================================================
+// Helper function for inserting data into a uint64_t src bitfield and returning the output
+static uint64_t bitFieldInsert64(
+    in uint64_t src,
+    in uint64_t bitOffset,
+    in uint64_t numBits,
+    in uint64_t data)
+{
+    const uint64_t mask = bits64(numBits);
+    src &= ~(mask << bitOffset);
+    return (src | ((data & mask) << bitOffset));
+}
+
+//=====================================================================================================================
+// Helper function for extracting data from a src bitfield
+static uint32_t bitFieldExtract(
+    in uint32_t src,
+    in uint32_t bitOffset,
+    
in uint32_t numBits) +{ + return (src >> bitOffset) & bits(numBits); +} + +//===================================================================================================================== +// Helper function for extracting data from a src bitfield +static uint16_t bitFieldExtract16( + in uint16_t src, + in uint16_t bitOffset, + in uint16_t numBits) +{ + return (src >> bitOffset) & bits16(numBits); +} + +//===================================================================================================================== +// Helper function for extracting data from a uint64_t src bitfield +static uint64_t bitFieldExtract64( + in uint64_t src, + in uint64_t bitOffset, + in uint64_t numBits) +{ + return (src >> bitOffset) & bits64(numBits); +} + +//===================================================================================================================== +static uint32_t Pow2Align( + uint32_t value, ///< Value to align. + uint32_t alignment) ///< Desired alignment (must be a power of 2). +{ + return ((value + alignment - 1) & ~(alignment - 1)); +} + +//===================================================================================================================== +inline uint countbits64(uint64_t val) +{ + return countbits(LowPart(val)) + countbits(HighPart(val)); +} + +#endif diff --git a/src/shadersClean/common/BoundingBox.hlsli b/src/shadersClean/common/BoundingBox.hlsli new file mode 100644 index 0000000..f47ad82 --- /dev/null +++ b/src/shadersClean/common/BoundingBox.hlsli @@ -0,0 +1,74 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#ifndef BOUNDING_BOX_HLSLI +#define BOUNDING_BOX_HLSLI + +//===================================================================================================================== +struct BoundingBox // matches D3D12_RAYTRACING_AABB +{ + float3 min; + float3 max; +}; + +//===================================================================================================================== +struct BoundingBox4 +{ + float4 min; + float4 max; +}; + +//===================================================================================================================== +// Internal bounding box type for scene bounds. 
+struct UintBoundingBox +{ + uint3 min; + uint3 max; +}; + +struct UintBoundingBox4 +{ + uint4 min; + uint4 max; +}; + +struct PackedUintBoundingBox4 +{ + uint64_t min; + uint64_t max; +}; + +//===================================================================================================================== +static BoundingBox CombineAABB( + BoundingBox b0, + BoundingBox b1) +{ + BoundingBox bbox; + bbox.min = min(b0.min, b1.min); + bbox.max = max(b0.max, b1.max); + return bbox; +} + +#endif diff --git a/src/shadersClean/common/Extensions.hlsli b/src/shadersClean/common/Extensions.hlsli index a3b60ea..b139505 100644 --- a/src/shadersClean/common/Extensions.hlsli +++ b/src/shadersClean/common/Extensions.hlsli @@ -25,12 +25,8 @@ #ifndef EXTENSIONS_HLSLI #define EXTENSIONS_HLSLI -#if !defined(__cplusplus) - #define __decl [noinline] -#endif - #define AmdExtD3DShaderIntrinsicsFloatOpWithRoundMode_TiesToEven 0x0 #define AmdExtD3DShaderIntrinsicsFloatOpWithRoundMode_TowardPositive 0x1 #define AmdExtD3DShaderIntrinsicsFloatOpWithRoundMode_TowardNegative 0x2 diff --git a/src/shadersClean/common/InstanceDesc.hlsli b/src/shadersClean/common/InstanceDesc.hlsli new file mode 100644 index 0000000..09f910c --- /dev/null +++ b/src/shadersClean/common/InstanceDesc.hlsli @@ -0,0 +1,51 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +#ifndef INSTANCE_DESC_HLSLI +#define INSTANCE_DESC_HLSLI + +#include "TempAssert.hlsli" + +//===================================================================================================================== +// 64-byte aligned structure matching D3D12_RAYTRACING_INSTANCE_DESC +struct InstanceDesc +{ + float4 Transform[3]; // Inverse transform for traversal + uint InstanceID_and_Mask; // 24-bit instance ID and 8-bit mask + uint InstanceContributionToHitGroupIndex_and_Flags; // 24-bit instance contribution and 8-bit flags + uint accelStructureAddressLo; // Lower part of acceleration structure base address + uint accelStructureAddressHiAndFlags; // Upper part of acceleration structure base address and + // HW raytracing IP 2.0 flags +}; + +#define INSTANCE_DESC_SIZE 64 +#define INSTANCE_DESC_WORLD_TO_OBJECT_XFORM_OFFSET 0 +#define INSTANCE_DESC_ID_AND_MASK_OFFSET 48 +#define INSTANCE_DESC_CONTRIBUTION_AND_FLAGS_OFFSET 52 +#define INSTANCE_DESC_VA_LO_OFFSET 56 +#define INSTANCE_DESC_VA_HI_OFFSET 60 + +GPURT_STATIC_ASSERT(INSTANCE_DESC_SIZE == sizeof(InstanceDesc), "InstanceDesc structure mismatch"); + +#endif diff --git a/src/shadersClean/common/Math.hlsl b/src/shadersClean/common/Math.hlsl index f8aa8ed..ca0f384 100644 --- a/src/shadersClean/common/Math.hlsl +++ b/src/shadersClean/common/Math.hlsl @@ -23,3 +23,6 @@ * **********************************************************************************************************************/ #include "Math.hlsli" + +#include "Bits.hlsli" + diff --git a/src/shadersClean/common/Math.hlsli b/src/shadersClean/common/Math.hlsli index 981b9b5..ab35f66 100644 --- a/src/shadersClean/common/Math.hlsli +++ b/src/shadersClean/common/Math.hlsli @@ -25,145 +25,33 @@ #ifndef MATH_HLSLI #define MATH_HLSLI -#include "ShaderDefs.hlsli" - #include "Extensions.hlsli" 
//===================================================================================================================== -static uint LowPart(GpuVirtualAddress addr) -{ - return uint(addr); -} - -//===================================================================================================================== -static uint HighPart(GpuVirtualAddress addr) -{ - return uint(addr >> 32); -} - -//===================================================================================================================== -// Helper function for producing a 32 bit mask of one bit -inline uint32_t bit(uint32_t index) -{ - return 1u << index; -} - -//===================================================================================================================== -// Helper function for producing a 16 bit mask of one bit -inline uint16_t bit16(uint16_t index) -{ - return uint16_t(1u << index); -} - -//===================================================================================================================== -// Helper function for producing a 64 bit mask of one bit -inline uint64_t bit64(uint32_t index) -{ - return 1ull << index; -} - -//===================================================================================================================== -// Helper function for generating a 32-bit bit mask -inline uint32_t bits(uint32_t bitcount) -{ - return (bitcount == 32) ? 0xFFFFFFFF : ((1u << bitcount) - 1); -} - -//===================================================================================================================== -// Helper function for generating a 16-bit bit mask -inline uint16_t bits16(uint16_t bitcount) -{ - return (bitcount == 16) ? uint16_t(0xFFFFu) : uint16_t((1u << bitcount) - 1); -} - -//===================================================================================================================== -// Helper function for generating a 32-bit bit mask -inline uint64_t bits64(uint64_t bitcount) -{ - return (bitcount == 64) ? 
0xFFFFFFFFFFFFFFFFull : ((1ull << bitcount) - 1ull); -} - -//===================================================================================================================== -// Helper function for inserting data into a src bitfield and returning the output -static uint32_t bitFieldInsert( - in uint32_t src, - in uint32_t bitOffset, - in uint32_t numBits, - in uint32_t data) -{ - const uint32_t mask = bits(numBits); - src &= ~(mask << bitOffset); - return (src | ((data & mask) << bitOffset)); -} - -//===================================================================================================================== -// Helper function for inserting data into a uint16_t src bitfield and returning the output -static uint16_t bitFieldInsert16( - in uint16_t src, - in uint16_t bitOffset, - in uint16_t numBits, - in uint16_t data) -{ - const uint16_t mask = bits16(numBits); - src &= ~(mask << bitOffset); - return (src | ((data & mask) << bitOffset)); -} - -//===================================================================================================================== -// Helper function for inserting data into a uint64_t src bitfield and returning the output -static uint64_t bitFieldInsert64( - in uint64_t src, - in uint64_t bitOffset, - in uint64_t numBits, - in uint64_t data) -{ - const uint64_t mask = bits64(numBits); - src &= ~(mask << bitOffset); - return (src | ((data & mask) << bitOffset)); -} - -//===================================================================================================================== -// Helper function for extracting data from a src bitfield -static uint32_t bitFieldExtract( - in uint32_t src, - in uint32_t bitOffset, - in uint32_t numBits) -{ - return (src >> bitOffset) & bits(numBits); -} - -//===================================================================================================================== -// Helper function for extracting data from a src bitfield -static uint16_t bitFieldExtract16( - in uint16_t 
src, - in uint16_t bitOffset, - in uint16_t numBits) -{ - return (src >> bitOffset) & bits16(numBits); -} - -//===================================================================================================================== -// Helper function for extracting data from a uint64_t src bitfield -static uint64_t bitFieldExtract64( - in uint64_t src, - in uint64_t bitOffset, - in uint64_t numBits) +// Divide uints and round up +static uint RoundUpQuotient( + uint dividend, + uint divisor) { - return (src >> bitOffset) & bits64(numBits); + return (dividend + divisor - 1) / divisor; } //===================================================================================================================== -static uint32_t Pow2Align( - uint32_t value, ///< Value to align. - uint32_t alignment) ///< Desired alignment (must be a power of 2). +// Divide ints and round up +static int RoundUpQuotient( + int dividend, + int divisor) { - return ((value + alignment - 1) & ~(alignment - 1)); + return (dividend + divisor - 1) / divisor; } //===================================================================================================================== -inline uint countbits64(uint64_t val) +// Divide ints and round up +static uint64_t RoundUpQuotient( + uint64_t dividend, + uint64_t divisor) { - return countbits(LowPart(val)) + countbits(HighPart(val)); + return (dividend + divisor - 1) / divisor; } //===================================================================================================================== diff --git a/src/shadersClean/common/NodePointers.hlsli b/src/shadersClean/common/NodePointers.hlsli new file mode 100644 index 0000000..46e6fa3 --- /dev/null +++ b/src/shadersClean/common/NodePointers.hlsli @@ -0,0 +1,82 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#ifndef NODE_POINTERS_HLSLI +#define NODE_POINTERS_HLSLI + +#include "../common/TempAssert.hlsli" + +//===================================================================================================================== +// Node pointer size in bytes +#define NODE_PTR_SIZE 4 +GPURT_STATIC_ASSERT(NODE_PTR_SIZE == sizeof(uint32_t), "NODE_PTR_SIZE size mismatch"); + +//===================================================================================================================== +// Instance base pointer layout from the HW raytracing IP 2.0 spec: +// Zero [ 2: 0] +// Tree Base Address (64B index)[53: 3] +// Force Opaque [ 54] +// Force Non-Opaque [ 55] +// Disable Triangle Cull [ 56] +// Flip Facedness [ 57] +// Cull Back Facing Triangles [ 58] +// Cull Front Facing Triangles [ 59] +// Cull Opaque [ 60] +// Cull Non-Opaque [ 61] +// Skip Triangles [ 62] +// Skip Procedural [ 63] +// +// Since GPU VAs can only be 48 bits, only 42 bits of the Tree Base Address field are used: +// Used Address [44: 3] +// Unused Address [53:45] +// +#define INSTANCE_BASE_POINTER_ZERO_MASK 0x7ull +#define INSTANCE_BASE_POINTER_ADDRESS_USED_MASK 0x1FFFFFFFFFF8ull +#define INSTANCE_BASE_POINTER_ADDRESS_UNUSED_MASK 0x3FE00000000000ull +#define INSTANCE_BASE_POINTER_ADDRESS_MASK 0x3FFFFFFFFFFFF8ull +#define INSTANCE_BASE_POINTER_FLAGS_MASK 0xFFC0000000000000ull + +#define NODE_POINTER_FLAGS_SHIFT 54 +#define NODE_POINTER_FORCE_OPAQUE_SHIFT 54 +#define NODE_POINTER_FORCE_NON_OPAQUE_SHIFT 55 +#define NODE_POINTER_DISABLE_TRIANGLE_CULL_SHIFT 56 +#define NODE_POINTER_FLIP_FACEDNESS_SHIFT 57 +#define NODE_POINTER_CULL_BACK_FACING_SHIFT 58 +#define NODE_POINTER_CULL_FRONT_FACING_SHIFT 59 +#define NODE_POINTER_CULL_OPAQUE_SHIFT 60 +#define NODE_POINTER_CULL_NON_OPAQUE_SHIFT 61 +#define NODE_POINTER_SKIP_TRIANGLES_SHIFT 62 +#define NODE_POINTER_SKIP_PROCEDURAL_SHIFT 
63 + +#define RAY_FLAG_VALID_MASK 0x3ffu +#define RAY_FLAG_EXCLUDE_MASK (RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH | RAY_FLAG_SKIP_CLOSEST_HIT_SHADER) +#define RAY_FLAG_OVERRIDE_MASK (RAY_FLAG_FORCE_OPAQUE | RAY_FLAG_FORCE_NON_OPAQUE) // 0x3 +#define RAY_FLAG_PRESERVE_MASK (RAY_FLAG_VALID_MASK & (~RAY_FLAG_OVERRIDE_MASK)) // 0x3fc + +#define POINTER_FLAGS_HIDWORD_SHIFT (NODE_POINTER_FORCE_OPAQUE_SHIFT - 32) // 22 +#define POINTER_FLAGS_VALID_MASK (RAY_FLAG_VALID_MASK << POINTER_FLAGS_HIDWORD_SHIFT) // 0x3ff << 22 +#define POINTER_FLAGS_EXCLUDED_MASK ~(POINTER_FLAGS_VALID_MASK) // 0xFFC00000 + +#endif diff --git a/src/shared/scratchNode.h b/src/shadersClean/common/ScratchNode.hlsli similarity index 97% rename from src/shared/scratchNode.h rename to src/shadersClean/common/ScratchNode.hlsli index 1ff9b95..db7c67d 100644 --- a/src/shared/scratchNode.h +++ b/src/shadersClean/common/ScratchNode.hlsli @@ -1,7 +1,7 @@ /* *********************************************************************************************************************** * - * Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All Rights Reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -22,10 +22,11 @@ * SOFTWARE. 
* **********************************************************************************************************************/ + #ifndef _SCRATCHNODE_HLSL #define _SCRATCHNODE_HLSL -#include "rayTracingDefs.h" +#include "gfx10/BoxNode1_0.hlsli" //===================================================================================================================== // The structure is 64-byte aligned @@ -49,7 +50,7 @@ struct ScratchNode // scratch node index of the tri in the pair in PAIR_TRIANGLE_COMPRESSION / // BLAS metadata size for instance nodes uint sortedPrimIndex; // it's the index of the sorted primitive (leaf) or start index of the sorted primitives - uint packedFlags; // flags [0:7], instanceMask [8:15], triangleId [16:31] + uint packedFlags; // flags [0:7], instanceMask [8:15], quadSwizzle [16:23] }; #define SCRATCH_NODE_FLAGS_DISABLE_TRIANGLE_SPLIT_SHIFT 31 @@ -176,11 +177,11 @@ static uint ExtractScratchNodeInstanceMask( } //===================================================================================================================== -// Extract triangle ID from scratch node -static uint ExtractScratchNodeTriangleId( +// Extract quad swizzle from scratch node +static uint ExtractScratchNodeQuadSwizzle( in uint packedFlags) { - return (packedFlags >> 16); + return (packedFlags >> 16) & 0xFF; } //===================================================================================================================== diff --git a/src/shadersClean/common/ShaderDefs.hlsli b/src/shadersClean/common/ShaderDefs.hlsli index 5d254e6..3ca709b 100644 --- a/src/shadersClean/common/ShaderDefs.hlsli +++ b/src/shadersClean/common/ShaderDefs.hlsli @@ -25,8 +25,459 @@ #ifndef SHADERDEFS_HLSLI #define SHADERDEFS_HLSLI +// These DUMMY_*_FUNC postfix stubs must be included at the end of every driver stub (AmdTraceRay*) declaration to +// work around a DXC + Spirv issue where the compiler can't deal with calls to functions that don't have bodies. 
+#define DUMMY_BOOL_FUNC { return false; } +#define DUMMY_VOID_FUNC { } +#define DUMMY_UINT_FUNC { return 0; } +#define DUMMY_UINT2_FUNC { return uint2(0, 0); } +#define DUMMY_UINT3_FUNC { return uint3(0, 0, 0); } +#define DUMMY_UINT4_FUNC { return uint4(0, 0, 0, 0); } +#define DUMMY_FLOAT_FUNC { return 0; } +#define DUMMY_FLOAT2_FUNC { return float2(0, 0); } +#define DUMMY_FLOAT3_FUNC { return float3(0, 0, 0); } + +#include "TempAssert.hlsli" + +// TODO: there are functions that use values from these files, but really +// those functions should be in these files, and then the files that use the functions +// should include that file, instead of ShaderDefs.h +#include "gfx10/BoxNode1_0.hlsli" +#include "gfx10/TriangleNode1_0.hlsli" +#include "gfx10/ProceduralNode1_0.hlsli" +#include "gfx10/BoxNode1_0.hlsli" +#include "gfx10/InstanceNode1_0.hlsli" +#include "NodePointers.hlsli" + +#define SAH_COST_TRIANGLE_INTERSECTION 1.5 +#define SAH_COST_AABBB_INTERSECTION 1 + typedef uint64_t GpuVirtualAddress; +//===================================================================================================================== +enum PrimitiveType : uint +{ + Triangle = 0, + AABB = 1, + Instance = 2, +}; + +//===================================================================================================================== +// BVH node types shared between HW and SW nodes +#define NODE_TYPE_TRIANGLE_0 0 +#define NODE_TYPE_TRIANGLE_1 1 +#define NODE_TYPE_TRIANGLE_2 2 +#define NODE_TYPE_TRIANGLE_3 3 +#define NODE_TYPE_BOX_FLOAT16 4 +#define NODE_TYPE_BOX_FLOAT32 5 +#define NODE_TYPE_USER_NODE_INSTANCE 6 +// From the HW IP 2.0 spec: '7: User Node 1 (processed as a Procedural Node for culling)' +#define NODE_TYPE_USER_NODE_PROCEDURAL 7 + +//===================================================================================================================== +// Acceleration structure type +#define TOP_LEVEL 0 +#define BOTTOM_LEVEL 1 + 
+//===================================================================================================================== +// Triangle Compression Modes +#define NO_TRIANGLE_COMPRESSION 0 +#define RESERVED 1 +#define PAIR_TRIANGLE_COMPRESSION 2 +#define AUTO_TRIANGLE_COMPRESSION 3 + +#define LATE_PAIR_COMP_BATCH_SIZE 8 + +//===================================================================================================================== +// Amount of ULPs(Unit in Last Place) added to Box node when using hardware intersection instruction +#define BOX_EXPANSION_DEFAULT_AMOUNT 6 + +//===================================================================================================================== +// Box sorting heuristic value +// 0: closethit +// 1: LargestFirst +// 2: ClosestMidpoint +// 3: undefined / disabled +// 4: LargestFirstOrClosest (auto select with rayFlag) +// 5: BoxSortLargestFirstOrClosestMidPoint (auto select with rayFlag) +// 6: DisabledOnAcceptFirstHit (disable if bvhNode sort is on, and rayFlag is AcceptFirstHit) +// +// This need to match ILC_BOX_SORT_HEURISTIC_MODE +enum BoxSortHeuristic : uint +{ + Closest = 0x0, + Largest = 0x1, + MidPoint = 0x2, + Disabled = 0x3, + LargestFirstOrClosest = 0x4, + LargestFirstOrClosestMidPoint = 0x5, + DisabledOnAcceptFirstHit = 0x6, +}; + +//===================================================================================================================== +// Options for where FP16 box nodes are created within BLAS for QBVH +#define NO_NODES_IN_BLAS_AS_FP16 0 +#define LEAF_NODES_IN_BLAS_AS_FP16 1 +#define MIXED_NODES_IN_BLAS_AS_FP16 2 +#define ALL_INTERIOR_NODES_IN_BLAS_AS_FP16 3 + +// The highest 3 bits are zero after the right shift in PackNodePointer and may be repurposed. 
+// Mask for MSB within node pointer +#define NODE_POINTER_MASK_MSB 0x80000000u + +//===================================================================================================================== +#define BVH4_NODE_32_STRIDE_SHIFT 7 // Box 32 node +#define BVH4_NODE_16_STRIDE_SHIFT 6 // Box 16 node + +#define INVALID_IDX 0xffffffff +#define INACTIVE_PRIM 0xfffffffe + +static const uint ByteStrideScratchNode = 64; +static const uint ByteStrideU32 = 12; +static const uint IndexFormatInvalid = 0; +static const uint IndexFormatU32 = 1; +static const uint IndexFormatU16 = 2; + +const static uint TILE_WIDTH = 256; +const static uint TILE_SIZE = TILE_WIDTH * TILE_WIDTH; + +#ifndef BUILD_THREADGROUP_SIZE +#define BUILD_THREADGROUP_SIZE 64 +#endif + +//====================================================================================================================== +// matches VkAccelerationStructureBuildRangeInfoKHR +struct IndirectBuildOffset +{ + uint primitiveCount; + uint primitiveOffset; + uint firstVertex; + uint transformOffset; +}; + +//===================================================================================================================== +// Function assumes the type passed in is a valid node type +// +static uint PackNodePointer(uint type, uint address) +{ + uint nodePointer = type; // this assumes that the type is valid + // uint pointer = type & 0x7; + + // The input address is a byte offset, and node_addr is a 64-byte offset that starts at bit 3. 
+ nodePointer |= (address >> 3); // this assumes that the input address is 64-byte aligned + // pointer |= (address >> 6) << 3; + + return nodePointer; +} + +//===================================================================================================================== +static uint GetNodeType(uint nodePointer) +{ + // From the HW raytracing spec: + // node_type = node_pointer[ 2:0] + return nodePointer & 0x7; +} + +//===================================================================================================================== +static uint ClearNodeType(uint nodePointer) +{ + return nodePointer & ~0x7; +} + +//===================================================================================================================== +// NOTE: The highest 3 bits are excluded. They aren't written when building the QBVH and may have been repurposed. See +// NODE_POINTER_MASK_MSB +static uint ExtractNodePointerOffset(uint nodePointer) +{ + // From the HW raytracing spec: + // node_addr[60:0] = node_pointer[63:3] + // Also, based on the following, the node_addr is 64-byte aligned: + // fetch_addr0 = T#.base_address*256+node_addr*64 + return ClearNodeType(nodePointer) << 3; +} + +//===================================================================================================================== +// Removes temp flag (MSB) within node type set by RefitBounds when fp16 nodes mode is LEAF_NODES_IN_BLAS_AS_FP16. 
+static uint GetNodePointerExclMsbFlag(uint nodePointer)
+{
+    return nodePointer & (~NODE_POINTER_MASK_MSB);
+}
+
+//=====================================================================================================================
+// Primitive data structure that includes the unpacked data needed to process a primitive
+struct PrimitiveData
+{
+    uint primitiveIndex; // Primitive index used to indicate what primitive in geometry description
+    uint geometryIndex; // Geometry index used to indicate what geometry description
+    uint geometryFlags; // Geometry flags contains if the geometry is opaque or non opaque
+};
+
+//=====================================================================================================================
+// Extract the geometry index from the bottom 24 bits
+static uint ExtractGeometryIndex(uint geometryIndexAndFlags)
+{
+    return geometryIndexAndFlags & 0xFFFFFF;
+}
+
+//=====================================================================================================================
+// Extract the geometry flags from bits 24-25
+static uint ExtractGeometryFlags(uint geometryIndexAndFlags)
+{
+    return (geometryIndexAndFlags >> 24) & 0x3;
+}
+
+//=====================================================================================================================
+// Extract the geometry index from the bottom 24 bits and geometry flags from bits 24-25
+static uint2 UnpackGeometryIndexAndFlags(uint geometryIndexAndFlags)
+{
+    return uint2(ExtractGeometryIndex(geometryIndexAndFlags), ExtractGeometryFlags(geometryIndexAndFlags));
+}
+
+//=====================================================================================================================
+// Pack the geometry index in the bottom 24 bits and the geometry flags into bits 24-25
+static uint PackGeometryIndexAndFlags(
+    uint geometryIndex,
+    uint geometryFlags)
+{
+    return (geometryFlags << 24) | (geometryIndex & 0xFFFFFF);
+}
+
+//===================================================================================================================== +// Additional geometry information for bottom level acceleration structures primitives +struct GeometryInfo +{ + uint geometryFlagsAndNumPrimitives; + uint geometryBufferOffset; + uint primNodePtrsOffset; // Offset from the base of all prim node ptrs to this geometry's prim node ptrs +}; + +#define DXGI_FORMAT_UNKNOWN 0 +#define DXGI_FORMAT_R32G32B32_FLOAT 6 + +#define DECODE_VERTEX_STRIDE 12 +#define DECODE_PRIMITIVE_STRIDE_TRIANGLE 36 +#define DECODE_PRIMITIVE_STRIDE_AABB 24 +#define GEOMETRY_INFO_SIZE 12 +#define GEOMETRY_INFO_FLAGS_AND_NUM_PRIMS_OFFSET 0 +#define GEOMETRY_INFO_GEOM_BUFFER_OFFSET 4 +#define GEOMETRY_INFO_PRIM_NODE_PTRS_OFFSET 8 + +#define PIPELINE_FLAG_SKIP_TRIANGLES 0x100 +#define PIPELINE_FLAG_SKIP_PROCEDURAL_PRIMITIVES 0x200 + +GPURT_STATIC_ASSERT(GEOMETRY_INFO_SIZE == sizeof(GeometryInfo), "Geometry info structure mismatch"); + +//===================================================================================================================== +static uint ExtractGeometryInfoFlags(uint packedGeometryFlagsAndNumPrimitives) +{ + return (packedGeometryFlagsAndNumPrimitives >> 29); +} + +//===================================================================================================================== +static uint ExtractGeometryInfoNumPrimitives(uint packedGeometryFlagsAndNumPrimitives) +{ + // ((1 << 29) - 1) = 0x1fffffff + return (packedGeometryFlagsAndNumPrimitives & 0x1FFFFFFF); +} + +//===================================================================================================================== +static uint PackGeometryFlagsAndNumPrimitives(uint geometryFlags, uint numPrimitives) +{ + return (geometryFlags << 29) | numPrimitives; +} + +//===================================================================================================================== +static uint64_t PackUint64(uint lowBits, uint 
highBits) +{ + // Note glslang doesn't like uint64_t casts + uint64_t addr = highBits; + addr = (addr << 32) | lowBits; + return addr; +} + +//====================================================================================================================== +// Packs the channels of a uint2 into a single uint64_t. +static uint64_t PackUint64(uint2 lowHigh) +{ + // Note glslang doesn't like uint64_t casts + uint64_t addr = lowHigh.y; + addr = (addr << 32) | lowHigh.x; + return addr; +} + +//===================================================================================================================== +static uint2 SplitUint64(uint64_t x) +{ + return uint2(x, (x >> 32)); +} + +//===================================================================================================================== +// Build Stage Counters (Debug only) +// It starts with the qbvhGlobalCounters offset, i.e., +// qbvhGlobalStack...qbvhGlobalStackPtrs...bvhBuildDebugCounters + +#define COUNTER_MORTONGEN_OFFSET 0x0 +#define COUNTER_MORTON_SORT_OFFSET 0x4 +#define COUNTER_SORTLEAF_OFFSET 0x8 +#define COUNTER_BUILDPLOC_OFFSET 0xC +#define COUNTER_BUILDLBVH_OFFSET 0x10 +#define COUNTER_REFIT_OFFSET 0x14 +#define COUNTER_INITENCODEHWBVH_OFFSET 0x18 +#define COUNTER_ENCODEHWBVH_OFFSET 0x1C +#define COUNTER_EMPTYPRIM_OFFSET 0x20 +#define COUNTER_EMITCOMPACTSIZE_OFFSET 0x24 +#define COUNTER_BUILDFASTLBVH_OFFSET 0x28 + +//===================================================================================================================== +// Get leaf triangle node size in bytes +static uint GetBvhNodeSizeTriangle() +{ + return TRIANGLE_NODE_SIZE; +} + +//===================================================================================================================== +// Get leaf AABB node size in bytes +static uint GetBvhNodeSizeProcedural() +{ + return USER_NODE_PROCEDURAL_SIZE; +} + 
+//===================================================================================================================== +// Get leaf instance node size in bytes +static uint GetBvhNodeSizeInstance(uint enableFusedInstanceNode) +{ + return (enableFusedInstanceNode == 0) ? INSTANCE_NODE_SIZE : FUSED_INSTANCE_NODE_SIZE; +} + +//===================================================================================================================== +// Get internal BVH node size in bytes +static uint GetBvhNodeSizeInternal() +{ + return FLOAT32_BOX_NODE_SIZE; +} + +//===================================================================================================================== +// Get internal BVH node size in bytes +static uint GetBvhNodeSizeLeaf( + uint primitiveType, + uint enableFusedInstanceNode) +{ + uint sizeInBytes = 0; + switch (primitiveType) + { + case PrimitiveType::Triangle: + sizeInBytes = GetBvhNodeSizeTriangle(); + break; + case PrimitiveType::AABB: + sizeInBytes = GetBvhNodeSizeProcedural(); + break; + case PrimitiveType::Instance: + sizeInBytes = GetBvhNodeSizeInstance(enableFusedInstanceNode); + break; + } + + return sizeInBytes; +} + +//===================================================================================================================== +static uint CalcParentPtrOffset(uint nodePtr) +{ + // Subtract 1 from the index to account for negative offset calculations. I.e. 
index 0 is actually at -4 byte + // offset from the end of the parent pointer memory + const uint linkIndex = (nodePtr >> 3) - 1; + return linkIndex * NODE_PTR_SIZE; +} + +//===================================================================================================================== +static uint CalcBottomGeometryInfoSize(uint numGeometries) +{ + return numGeometries * GEOMETRY_INFO_SIZE; +} + +//===================================================================================================================== +struct DataOffsetAndSize +{ + uint offset; + uint size; +}; + +//===================================================================================================================== +struct StateTaskQueueCounter +{ + uint phase; + uint startPhaseIndex; + uint endPhaseIndex; + uint taskCounter; + uint numTasksDone; +}; + +#define USE_BLAS_PRIM_COUNT 0 + +//===================================================================================================================== +struct Flags +{ + uint dataValid; + uint prefixSum; +}; + +#define FLAGS_DATA_VALID_OFFSET 0 +#define FLAGS_PREFIX_SUM_OFFSET 4 + +#define DLB_KEYS_PER_THREAD 4 +#define DLB_KEYS_PER_GROUP (BUILD_THREADGROUP_SIZE * DLB_KEYS_PER_THREAD) + +#define DLB_VALID_SUM 0 +#define DLB_VALID_PREFIX_SUM 1 +#define NUM_DLB_VALID_TYPES 2 + +//===================================================================================================================== + +#define PLOC_PHASE_INIT 0 +#define PLOC_PHASE_FIND_NEAREST_NEIGHBOUR 1 +#define PLOC_PHASE_UPDATE_CLUSTER_COUNT 2 +#define PLOC_PHASE_DONE 3 +struct StatePLOC +{ + uint numClusters; + uint internalNodesIndex; + uint clusterListIndex; + uint numClustersAlloc; +}; + +#define STATE_PLOC_NUM_CLUSTERS_OFFSET 0 +#define STATE_PLOC_INTERNAL_NODES_INDEX_OFFSET 4 +#define STATE_PLOC_CLUSTER_LIST_INDEX_OFFSET 8 +#define STATE_PLOC_NUM_CLUSTERS_ALLOC_OFFSET 12 + 
+//===================================================================================================================== +struct IndexBufferInfo +{ + uint gpuVaLo; + uint gpuVaHi; + uint byteOffset; + uint format; +}; + +//===================================================================================================================== +enum RebraidType : uint +{ + Off = 0, // No Rebraid + V1 = 1, // First version of Rebraid + V2 = 2, // Second version of Rebraid +}; + +#define BUILD_MODE_LINEAR 0 +// BUILD_MODE_AC was 1, but it has been removed. +#define BUILD_MODE_PLOC 2 + +//===================================================================================================================== +struct TriangleData +{ + float3 v0; ///< Vertex 0 + float3 v1; ///< Vertex 1 + float3 v2; ///< Vertex 2 +}; + #ifndef LIBRARY_COMPILATION // This does not include RayTracingDefs.h as the goal is // to eventually have everything in this file alone diff --git a/src/shadersClean/common/TempAssert.hlsli b/src/shadersClean/common/TempAssert.hlsli new file mode 100644 index 0000000..1407fe8 --- /dev/null +++ b/src/shadersClean/common/TempAssert.hlsli @@ -0,0 +1,38 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +// TODO: this is a temporary assert file to allow files with asserts to be "clean" +// while the assert file itself cannot be. We need this as we have to move files out of "shared" +// which use assert.h, but cannot then include assert.h as "clean" inclusion of shared files isn't set up yet, +// *because* there are too many files in shared, and they can't be moved out because +// they use assert.h and... (cyclical issue) + +#ifndef ASSERT_HLSLI +#define ASSERT_HLSLI +#ifndef GPURT_STATIC_ASSERT +// _Static_assert is not supported with -spirv: https://github.com/microsoft/DirectXShaderCompiler/issues/5750 +#define GPURT_STATIC_ASSERT(condition, message) +#endif +#endif diff --git a/src/shadersClean/common/gfx10/BoxNode1_0.hlsli b/src/shadersClean/common/gfx10/BoxNode1_0.hlsli new file mode 100644 index 0000000..6103e61 --- /dev/null +++ b/src/shadersClean/common/gfx10/BoxNode1_0.hlsli @@ -0,0 +1,137 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +#ifndef BOX_NODE_1_1_HLSLI +#define BOX_NODE_1_1_HLSLI + +#include "../TempAssert.hlsli" + +//===================================================================================================================== +// Hardware 32-bit box node format and offsets +#define FLOAT32_BBOX_STRIDE 24 +#define FLOAT32_BOX_NODE_CHILD0_OFFSET 0 +#define FLOAT32_BOX_NODE_CHILD1_OFFSET 4 +#define FLOAT32_BOX_NODE_CHILD2_OFFSET 8 +#define FLOAT32_BOX_NODE_CHILD3_OFFSET 12 +#define FLOAT32_BOX_NODE_BB0_MIN_OFFSET 16 +#define FLOAT32_BOX_NODE_BB0_MAX_OFFSET 28 +#define FLOAT32_BOX_NODE_BB1_MIN_OFFSET 40 +#define FLOAT32_BOX_NODE_BB1_MAX_OFFSET 52 +#define FLOAT32_BOX_NODE_BB2_MIN_OFFSET 64 +#define FLOAT32_BOX_NODE_BB2_MAX_OFFSET 76 +#define FLOAT32_BOX_NODE_BB3_MIN_OFFSET 88 +#define FLOAT32_BOX_NODE_BB3_MAX_OFFSET 100 +#define FLOAT32_BOX_NODE_FLAGS_OFFSET 112 +#define FLOAT32_BOX_NODE_NUM_PRIM_OFFSET 116 +#define FLOAT32_BOX_NODE_UNUSED2_OFFSET 120 +#define FLOAT32_BOX_NODE_UNUSED3_OFFSET 124 +#define FLOAT32_BOX_NODE_SIZE 128 + +//===================================================================================================================== +// Float32 box node flags contains 4 1-byte fields, 1 per child node: +// Child 0 [ 7: 0] +// Child 1 [15: 8] +// Child 2 [23:16] +// Child 3 [31:24] +// +// Each child node's 1-byte field contains these flags: +// Only Opaque [ 0] +// Only Non-Opaque [ 1] +// Only Triangles [ 2] +// Only Procedural [ 3] +// Unused [7:4] +#define BOX_NODE_FLAGS_BIT_STRIDE 8 + +#define BOX_NODE_FLAGS_ONLY_OPAQUE_SHIFT 0 +#define BOX_NODE_FLAGS_ONLY_NON_OPAQUE_SHIFT 1 +#define BOX_NODE_FLAGS_ONLY_TRIANGLES_SHIFT 2 +#define BOX_NODE_FLAGS_ONLY_PROCEDURAL_SHIFT 3 + +//===================================================================================================================== +struct Float32BoxNode +{ + uint child0; /// 
Child node pointer 0 + uint child1; /// Child node pointer 1 + uint child2; /// Child node pointer 2 + uint child3; /// Child node pointer 3 + + float3 bbox0_min; /// Node bounding box 0 minimum bounds + float3 bbox0_max; /// Node bounding box 0 maximum bounds + + float3 bbox1_min; /// Node bounding box 1 minimum bounds + float3 bbox1_max; /// Node bounding box 1 maximum bounds + + float3 bbox2_min; /// Node bounding box 2 minimum bounds + float3 bbox2_max; /// Node bounding box 2 maximum bounds + + float3 bbox3_min; /// Node bounding box 3 minimum bounds + float3 bbox3_max; /// Node bounding box 3 maximum bounds + + uint flags; /// Reserved for RTIP 2.0 + uint numPrimitives; /// Padding for 64-byte alignment + uint padding2; /// Padding for 64-byte alignment + uint padding3; /// Padding for 64-byte alignment + +}; + +GPURT_STATIC_ASSERT(FLOAT32_BOX_NODE_SIZE == sizeof(Float32BoxNode), "Float32BoxNode structure mismatch"); + +//===================================================================================================================== +// Hardware 16-bit box node format and offsets +#define FLOAT16_BBOX_STRIDE 12 +#define FLOAT16_BOX_NODE_CHILD0_OFFSET 0 +#define FLOAT16_BOX_NODE_CHILD1_OFFSET 4 +#define FLOAT16_BOX_NODE_CHILD2_OFFSET 8 +#define FLOAT16_BOX_NODE_CHILD3_OFFSET 12 +#define FLOAT16_BOX_NODE_BB0_OFFSET 16 +#define FLOAT16_BOX_NODE_BB1_OFFSET 28 +#define FLOAT16_BOX_NODE_BB2_OFFSET 40 +#define FLOAT16_BOX_NODE_BB3_OFFSET 52 +#define FLOAT16_BOX_NODE_SIZE 64 + +//===================================================================================================================== +struct Float16BoxNode +{ + uint child0; /// Child node pointer 0 + uint child1; /// Child node pointer 1 + uint child2; /// Child node pointer 2 + uint child3; /// Child node pointer 3 + + uint3 bbox0; /// Node bounding box 0, packed, uses float16: minx, miny | minz, maxx | maxy, maxz + uint3 bbox1; /// Node bounding box 1, packed, uses float16: minx, miny | minz, maxx 
| maxy, maxz + uint3 bbox2; /// Node bounding box 2, packed, uses float16: minx, miny | minz, maxx | maxy, maxz + uint3 bbox3; /// Node bounding box 3, packed, uses float16: minx, miny | minz, maxx | maxy, maxz + + // NOTE: each bounding box is defined as uint3 for simplicity + // Each 32 bits pack 2x float16s. Order above is written as: a, b + // with a located in the lower 16 bits, b in the upper 16 bits + // bbox0.x stores minx, miny + // + // Alternatively, one can define each bbox as a pair of float16_t3 + // similar to FLOAT32_BOX_NODE. Indexing in hlsl would require extra work +}; + +GPURT_STATIC_ASSERT(FLOAT16_BOX_NODE_SIZE == sizeof(Float16BoxNode), "Float16BoxNode structure mismatch"); + +#endif diff --git a/src/shadersClean/common/gfx10/InstanceNode1_0.hlsli b/src/shadersClean/common/gfx10/InstanceNode1_0.hlsli new file mode 100644 index 0000000..ae0280d --- /dev/null +++ b/src/shadersClean/common/gfx10/InstanceNode1_0.hlsli @@ -0,0 +1,72 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +#ifndef INSTANCE_NODE_1_1_HLSLI +#define INSTANCE_NODE_1_1_HLSLI + +#include "BoxNode1_0.hlsli" +#include "../InstanceDesc.hlsli" +#include "../TempAssert.hlsli" + +//===================================================================================================================== +struct InstanceSidebandData1_1 +{ + uint instanceIndex; + uint blasNodePointer; // might not point to root + uint blasMetadataSize; + uint padding0; + float4 Transform[3]; // Non-inverse (original D3D12_RAYTRACING_INSTANCE_DESC.Transform) +}; + +#define RTIP1_1_INSTANCE_SIDEBAND_INSTANCE_INDEX_OFFSET 0 +#define RTIP1_1_INSTANCE_SIDEBAND_CHILD_POINTER_OFFSET 4 +#define RTIP1_1_INSTANCE_SIDEBAND_CHILD_METADATA_SIZE_OFFSET 8 +#define RTIP1_1_INSTANCE_SIDEBAND_OBJECT2WORLD_OFFSET 16 +#define RTIP1_1_INSTANCE_SIDEBAND_SIZE 64 + +GPURT_STATIC_ASSERT(RTIP1_1_INSTANCE_SIDEBAND_SIZE == sizeof(InstanceSidebandData1_1), "Instance sideband structure mismatch"); + +//===================================================================================================================== +struct FusedInstanceNode +{ + InstanceDesc desc; + InstanceSidebandData1_1 sideband; + Float32BoxNode blasRootNode; +}; + +//===================================================================================================================== +struct InstanceNode +{ + InstanceDesc desc; + InstanceSidebandData1_1 sideband; +}; + +#define INSTANCE_NODE_DESC_OFFSET 0 +#define INSTANCE_NODE_EXTRA_OFFSET 64 +#define INSTANCE_NODE_SIZE 128 +#define FUSED_INSTANCE_NODE_ROOT_OFFSET INSTANCE_NODE_SIZE +#define 
FUSED_INSTANCE_NODE_SIZE 256 +GPURT_STATIC_ASSERT(INSTANCE_NODE_SIZE == sizeof(InstanceNode), "InstanceNode structure mismatch"); + +#endif diff --git a/src/shadersClean/common/gfx10/ProceduralNode1_0.hlsli b/src/shadersClean/common/gfx10/ProceduralNode1_0.hlsli new file mode 100644 index 0000000..4431ecd --- /dev/null +++ b/src/shadersClean/common/gfx10/ProceduralNode1_0.hlsli @@ -0,0 +1,56 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ **********************************************************************************************************************/
+#ifndef PROCEDURAL_NODE_1_0_HLSLI
+#define PROCEDURAL_NODE_1_0_HLSLI
+
+#include "../TempAssert.hlsli"
+#include "TriangleNode1_0.hlsli"
+//=====================================================================================================================
+#define USER_NODE_PROCEDURAL_MIN_OFFSET 0
+#define USER_NODE_PROCEDURAL_MAX_OFFSET 12
+#define USER_NODE_PROCEDURAL_SIZE 64
+
+//=====================================================================================================================
+// Procedural node primitive data offsets
+#define USER_NODE_PROCEDURAL_PRIMITIVE_INDEX_OFFSET TRIANGLE_NODE_PRIMITIVE_INDEX1_OFFSET
+#define USER_NODE_PROCEDURAL_GEOMETRY_INDEX_AND_FLAGS_OFFSET TRIANGLE_NODE_GEOMETRY_INDEX_AND_FLAGS_OFFSET
+#define USER_NODE_PROCEDURAL_TRIANGLE_ID_OFFSET TRIANGLE_NODE_ID_OFFSET
+
+//=====================================================================================================================
+// User defined procedural node format
+struct ProceduralNode
+{
+    float3 bbox_min;
+    float3 bbox_max;
+    uint padding1[6];
+    uint geometryIndexAndFlags;
+    uint reserved;
+    uint primitiveIndex;
+    uint triangleId;
+};
+
+GPURT_STATIC_ASSERT(USER_NODE_PROCEDURAL_SIZE == sizeof(ProceduralNode), "ProceduralNode structure mismatch");
+
+#endif
diff --git a/src/shadersClean/common/gfx10/TriangleNode1_0.hlsli b/src/shadersClean/common/gfx10/TriangleNode1_0.hlsli
new file mode 100644
index 0000000..0d9d1eb
--- /dev/null
+++ b/src/shadersClean/common/gfx10/TriangleNode1_0.hlsli
@@ -0,0 +1,82 @@
+/*
+ ***********************************************************************************************************************
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************************************************************************/
+#ifndef TRIANGLE_NODE_1_0_HLSLI
+#define TRIANGLE_NODE_1_0_HLSLI
+
+#include "../TempAssert.hlsli"
+
+//=====================================================================================================================
+// Hardware triangle node format and offsets
+// Note: GPURT limits triangle compression to 2 triangles per node. As a result the remaining bytes in the triangle node
+// are used for sideband data. The geometry index is packed in bottom 24 bits and geometry flags in bits 24-25.
+#define TRIANGLE_NODE_V0_OFFSET 0 +#define TRIANGLE_NODE_V1_OFFSET 12 +#define TRIANGLE_NODE_V2_OFFSET 24 +#define TRIANGLE_NODE_V3_OFFSET 36 +#define TRIANGLE_NODE_GEOMETRY_INDEX_AND_FLAGS_OFFSET 48 +#define TRIANGLE_NODE_PRIMITIVE_INDEX0_OFFSET 52 +#define TRIANGLE_NODE_PRIMITIVE_INDEX1_OFFSET 56 +#define TRIANGLE_NODE_ID_OFFSET 60 +#define TRIANGLE_NODE_SIZE 64 + +//===================================================================================================================== +// Triangle ID contains 4 1-byte fields, 1 per triangle: +// Triangle 0 [ 7: 0] +// Triangle 1 [15: 8] +// Triangle 2 [23:16] +// Triangle 3 [31:24] +// +// Each triangle's 8-bit segment contains these fields: +// I SRC [1:0] Specifies which vertex in triangle 0 corresponds to the I barycentric value +// J SRC [3:2] Specifies which vertex in triangle 0 corresponds to the J barycentric value +// Double Sided [ 4] Specifies whether triangle 0 should be treated as double sided for culling +// Flip Winding [ 5] Specifies whether triangle 0 should have its facedness flipped +// Procedural [ 6] Specifies whether it is a procedural node +// Opaque [ 7] Specifies whether triangle 0 should be considered as opaque +#define TRIANGLE_ID_BIT_STRIDE 8 + +#define TRIANGLE_ID_I_SRC_SHIFT 0 +#define TRIANGLE_ID_J_SRC_SHIFT 2 +#define TRIANGLE_ID_DOUBLE_SIDED_SHIFT 4 +#define TRIANGLE_ID_FLIP_WINDING_SHIFT 5 +#define TRIANGLE_ID_PROCEDURAL_SHIFT 6 +#define TRIANGLE_ID_OPAQUE_SHIFT 7 + +//===================================================================================================================== +struct TriangleNode +{ + float3 v0; // Vertex 0 + float3 v1; // Vertex 1 + float3 v2; // Vertex 2 + float3 v3; // Vertex 3 + uint geometryIndexAndFlags; // Geometry index and flags for pair of triangles + uint primitiveIndex0; // Primitive index for triangle 0 + uint primitiveIndex1; // Primitive index for triangle 1 + uint triangleId; // Triangle ID +}; + +GPURT_STATIC_ASSERT(TRIANGLE_NODE_SIZE == 
sizeof(TriangleNode), "TriangleNode structure mismatch"); + +#endif diff --git a/src/shadersClean/traversal/TraversalDefs.hlsli b/src/shadersClean/traversal/TraversalDefs.hlsli new file mode 100644 index 0000000..8541f35 --- /dev/null +++ b/src/shadersClean/traversal/TraversalDefs.hlsli @@ -0,0 +1,160 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +#ifndef TRAVERSAL_DEFS_HLSLI +#define TRAVERSAL_DEFS_HLSLI + +#include "../common/TempAssert.hlsli" + +#define ENCODE_FLAG_ARRAY_OF_POINTERS 0x00000001 +#define ENCODE_FLAG_UPDATE_IN_PLACE 0x00000002 +#define ENCODE_FLAG_REBRAID_ENABLED 0x00000004 +#define ENCODE_FLAG_ENABLE_FUSED_INSTANCE_NODE 0x00000008 + +//===================================================================================================================== +struct IntersectionResult +{ + float t; // Relative to tMin + uint nodeIndex; + float2 barycentrics; + uint geometryIndex; + uint primitiveIndex; + uint instNodePtr; + uint hitkind; + uint instanceContribution; + +#if DEVELOPER + uint numIterations; + uint maxStackDepth; + uint numRayBoxTest; + uint numCandidateHits; + uint numRayTriangleTest; + uint numAnyHitInvocation; + uint instanceIntersections; +#endif +}; + +//===================================================================================================================== +// Commit status +typedef uint COMMITTED_STATUS; + +#define COMMITTED_NOTHING 0 +#define COMMITTED_TRIANGLE_HIT 1 +#define COMMITTED_PROCEDURAL_PRIMITIVE_HIT 2 + +//===================================================================================================================== +// Candidate type +typedef uint CANDIDATE_STATUS; + +#define CANDIDATE_NON_OPAQUE_TRIANGLE 0 +#define CANDIDATE_PROCEDURAL_PRIMITIVE 1 +#define CANDIDATE_NON_OPAQUE_PROCEDURAL_PRIMITIVE 2 +#define CANDIDATE_EARLY_RAY_TERMINATE 4 + +//===================================================================================================================== +// Data required for system value intrinsics +struct RaySystemData +{ + uint currNodePtr; + float rayTCurrent; + uint instNodePtr; + uint instanceContribution; + uint geometryIndex; + uint primitiveIndex; + float2 barycentrics; + uint frontFace; + float3 origin; + 
float3 direction; +}; + +//===================================================================================================================== +#if DEFINE_RAYDESC +// Ray description matching the D3D12 HLSL header +struct RayDesc +{ + float3 Origin; + float TMin; + float3 Direction; + float TMax; +}; +#endif + +//===================================================================================================================== +// Internal RayQuery structure initialised at TraceRaysInline() +struct RayQueryInternal +{ + // Internal query data holding address of current BVH and stack information. + // Additional data that may be required will be stored here. + uint bvhLo; + uint bvhHi; + uint topLevelBvhLo; + uint topLevelBvhHi; + uint stackPtr; + uint stackPtrTop; + uint stackNumEntries; + uint instNodePtr; + uint currNodePtr; + uint instanceHitContributionAndFlags; + uint prevNodePtr; + uint isGoingDown; + uint lastInstanceNode; + + RayDesc rayDesc; + float rayTMin; + uint rayFlags; + uint instanceInclusionMask; + + // Candidate system data + CANDIDATE_STATUS candidateType; + RaySystemData candidate; + + // Committed system data + COMMITTED_STATUS committedStatus; + RaySystemData committed; + + uint reserved; + + // Counter data + // @note We don't wrap these in DEVELOPER because it would result in mismatch of RayQuery struct size + // on the driver side when we're not using counters. 
+ uint numRayBoxTest; + uint numRayTriangleTest; + uint numIterations; + uint maxStackDepthAndDynamicId; + uint clocks; + uint numCandidateHits; + uint instanceIntersections; + uint rayQueryObjId; +}; + +//===================================================================================================================== +struct HitGroupInfo +{ + uint2 closestHitId; + uint2 anyHitId; + uint2 intersectionId; + uint tableIndex; +}; + +#endif diff --git a/src/shared/rayTracingDefs.h b/src/shared/rayTracingDefs.h index a2c4a8a..6dfec65 100644 --- a/src/shared/rayTracingDefs.h +++ b/src/shared/rayTracingDefs.h @@ -27,6 +27,10 @@ #ifndef _RAYTRACING_DEF_H #define _RAYTRACING_DEF_H +#ifndef __cplusplus +#include "../shadersClean/common/ShaderDefs.hlsli" +#endif + #include "../../gpurt/gpurtAccelStruct.h" #include "../../gpurt/gpurtBuildSettings.h" #include "../../gpurt/gpurtDispatch.h" @@ -51,136 +55,25 @@ static_assert(GPURT_RTIP2_0 == uint32_t(Pal::RayTracingIpLevel::RtIp2_0), "GPURT #endif //===================================================================================================================== -enum PrimitiveType : uint -{ - Triangle = 0, - AABB = 1, - Instance = 2, -}; - -#if defined(__cplusplus) -#define __decl extern -#endif - -// These DUMMY_*_FUNC postfix stubs must be included at the end of every driver stub (AmdTraceRay*) declaration to -// work around a DXC + Spirv issue where the compiler can't deal with calls to functions that don't have bodies. 
-#define DUMMY_BOOL_FUNC { return false; } -#define DUMMY_VOID_FUNC { } -#define DUMMY_UINT_FUNC { return 0; } -#define DUMMY_UINT2_FUNC { return uint2(0, 0); } -#define DUMMY_UINT3_FUNC { return uint3(0, 0, 0); } -#define DUMMY_UINT4_FUNC { return uint4(0, 0, 0, 0); } -#define DUMMY_FLOAT_FUNC { return 0; } -#define DUMMY_FLOAT2_FUNC { return float2(0, 0); } -#define DUMMY_FLOAT3_FUNC { return float3(0, 0, 0); } - -//===================================================================================================================== -// Acceleration structure type -#define TOP_LEVEL 0 -#define BOTTOM_LEVEL 1 - -//===================================================================================================================== -// BVH node types shared between HW and SW nodes -#define NODE_TYPE_TRIANGLE_0 0 -#define NODE_TYPE_TRIANGLE_1 1 -#define NODE_TYPE_TRIANGLE_2 2 -#define NODE_TYPE_TRIANGLE_3 3 -#define NODE_TYPE_BOX_FLOAT16 4 -#define NODE_TYPE_BOX_FLOAT32 5 -#define NODE_TYPE_USER_NODE_INSTANCE 6 -// From the HW IP 2.0 spec: '7: User Node 1 (processed as a Procedural Node for culling)' -#define NODE_TYPE_USER_NODE_PROCEDURAL 7 -//===================================================================================================================== -// Triangle Compression Modes -#define NO_TRIANGLE_COMPRESSION 0 -#define RESERVED 1 -#define PAIR_TRIANGLE_COMPRESSION 2 -#define AUTO_TRIANGLE_COMPRESSION 3 - -#define LATE_PAIR_COMP_BATCH_SIZE 8 - -//===================================================================================================================== -// Amount of ULPs(Unit in Last Place) added to Box node when using hardware intersection instruction -#define BOX_EXPANSION_DEFAULT_AMOUNT 6 - -//===================================================================================================================== -// Box sorting heuristic value -// 0: closethit -// 1: LargestFirst -// 2: ClosestMidpoint -// 3: undefined / disabled -// 4: 
LargestFirstOrClosest (auto select with rayFlag) -// 5: BoxSortLargestFirstOrClosestMidPoint (auto select with rayFlag) -// 6: DisabledOnAcceptFirstHit (disable if bvhNode sort is on, and rayFlag is AcceptFirstHit) -// -// This need to match ILC_BOX_SORT_HEURISTIC_MODE -enum BoxSortHeuristic : uint -{ - Closest = 0x0, - Largest = 0x1, - MidPoint = 0x2, - Disabled = 0x3, - LargestFirstOrClosest = 0x4, - LargestFirstOrClosestMidPoint = 0x5, - DisabledOnAcceptFirstHit = 0x6, -}; +#define REBRAID_PHASE_CALC_SUM 0 +#define REBRAID_PHASE_OPEN 1 +#define REBRAID_PHASE_DONE 2 -enum SceneBoundsCalculation : uint +struct RebraidState { - SceneBoundsBasedOnGeometry = 0x0, - SceneBoundsBasedOnGeometryWithSize = 0x1 + float sumValue[2]; + uint mutex; + uint numLeafIndices; + uint iterationCount; }; -//===================================================================================================================== -// Options for where FP16 box nodes are created within BLAS for QBVH -#define NO_NODES_IN_BLAS_AS_FP16 0 -#define LEAF_NODES_IN_BLAS_AS_FP16 1 -#define MIXED_NODES_IN_BLAS_AS_FP16 2 -#define ALL_INTERIOR_NODES_IN_BLAS_AS_FP16 3 - -// The highest 3 bits are zero after the right shift in PackNodePointer and may be repurposed. 
-// Mask for MSB within node pointer -#define NODE_POINTER_MASK_MSB 0x80000000u - -//===================================================================================================================== -#define BVH4_NODE_32_STRIDE_SHIFT 7 // Box 32 node -#define BVH4_NODE_16_STRIDE_SHIFT 6 // Box 16 node - -#define INVALID_IDX 0xffffffff -#define INACTIVE_PRIM 0xfffffffe - -static const uint ByteStrideScratchNode = 64; -static const uint ByteStrideU32 = 12; -static const uint IndexFormatInvalid = 0; -static const uint IndexFormatU32 = 1; -static const uint IndexFormatU16 = 2; - -const static uint TILE_WIDTH = 256; -const static uint TILE_SIZE = TILE_WIDTH * TILE_WIDTH; - -#ifndef BUILD_THREADGROUP_SIZE -#define BUILD_THREADGROUP_SIZE 64 -#endif - -//===================================================================================================================== -struct BoundingBox // matches D3D12_RAYTRACING_AABB -{ - float3 min; - float3 max; -}; +#define STATE_REBRAID_SUM_VALUE_OFFSET 0 +#define STATE_REBRAID_MUTEX_OFFSET (STATE_REBRAID_SUM_VALUE_OFFSET + 8) +#define STATE_REBRAID_NUM_LEAF_INDICES_OFFSET (STATE_REBRAID_MUTEX_OFFSET + 4) +#define STATE_REBRAID_ITERATION_COUNT_OFFSET (STATE_REBRAID_NUM_LEAF_INDICES_OFFSET + 4) -#ifndef __cplusplus -//===================================================================================================================== -static BoundingBox CombineAABB( - BoundingBox b0, - BoundingBox b1) -{ - BoundingBox bbox; - bbox.min = min(b0.min, b1.min); - bbox.max = max(b0.max, b1.max); - return bbox; -} -#endif +#define REBRAID_KEYS_PER_THREAD 4 +#define REBRAID_KEYS_PER_GROUP (BUILD_THREADGROUP_SIZE * REBRAID_KEYS_PER_THREAD) //====================================================================================================================== // matches VkAccelerationStructureBuildRangeInfoKHR @@ -193,711 +86,6 @@ struct IndirectBuildRangeInfo }; 
//===================================================================================================================== -struct BoundingBox4 -{ - float4 min; - float4 max; -}; - -//===================================================================================================================== -// Internal bounding box type for scene bounds. -struct UintBoundingBox -{ - uint3 min; - uint3 max; -}; - -struct UintBoundingBox4 -{ - uint4 min; - uint4 max; -}; - -struct PackedUintBoundingBox4 -{ - uint64_t min; - uint64_t max; -}; - -//===================================================================================================================== -// Hardware 32-bit box node format and offsets -#define FLOAT32_BBOX_STRIDE 24 -#define FLOAT32_BOX_NODE_CHILD0_OFFSET 0 -#define FLOAT32_BOX_NODE_CHILD1_OFFSET 4 -#define FLOAT32_BOX_NODE_CHILD2_OFFSET 8 -#define FLOAT32_BOX_NODE_CHILD3_OFFSET 12 -#define FLOAT32_BOX_NODE_BB0_MIN_OFFSET 16 -#define FLOAT32_BOX_NODE_BB0_MAX_OFFSET 28 -#define FLOAT32_BOX_NODE_BB1_MIN_OFFSET 40 -#define FLOAT32_BOX_NODE_BB1_MAX_OFFSET 52 -#define FLOAT32_BOX_NODE_BB2_MIN_OFFSET 64 -#define FLOAT32_BOX_NODE_BB2_MAX_OFFSET 76 -#define FLOAT32_BOX_NODE_BB3_MIN_OFFSET 88 -#define FLOAT32_BOX_NODE_BB3_MAX_OFFSET 100 -#define FLOAT32_BOX_NODE_FLAGS_OFFSET 112 -#define FLOAT32_BOX_NODE_NUM_PRIM_OFFSET 116 -#define FLOAT32_BOX_NODE_UNUSED2_OFFSET 120 -#define FLOAT32_BOX_NODE_UNUSED3_OFFSET 124 -#define FLOAT32_BOX_NODE_SIZE 128 - -//===================================================================================================================== -// Float32 box node flags contains 4 1-byte fields, 1 per child node: -// Child 0 [ 7: 0] -// Child 1 [15: 8] -// Child 2 [23:16] -// Child 3 [31:24] -// -// Each child node's 1-byte field contains these flags: -// Only Opaque [ 0] -// Only Non-Opaque [ 1] -// Only Triangles [ 2] -// Only Procedural [ 3] -// Unused [7:4] -#define BOX_NODE_FLAGS_BIT_STRIDE 8 - -#define 
BOX_NODE_FLAGS_ONLY_OPAQUE_SHIFT 0 -#define BOX_NODE_FLAGS_ONLY_NON_OPAQUE_SHIFT 1 -#define BOX_NODE_FLAGS_ONLY_TRIANGLES_SHIFT 2 -#define BOX_NODE_FLAGS_ONLY_PROCEDURAL_SHIFT 3 - -//===================================================================================================================== -struct Float32BoxNode -{ - uint child0; /// Child node pointer 0 - uint child1; /// Child node pointer 1 - uint child2; /// Child node pointer 2 - uint child3; /// Child node pointer 3 - - float3 bbox0_min; /// Node bounding box 0 minimum bounds - float3 bbox0_max; /// Node bounding box 0 maximum bounds - - float3 bbox1_min; /// Node bounding box 1 minimum bounds - float3 bbox1_max; /// Node bounding box 1 maximum bounds - - float3 bbox2_min; /// Node bounding box 2 minimum bounds - float3 bbox2_max; /// Node bounding box 2 maximum bounds - - float3 bbox3_min; /// Node bounding box 3 minimum bounds - float3 bbox3_max; /// Node bounding box 3 maximum bounds - - uint flags; /// Reserved for RTIP 2.0 - uint numPrimitives; /// Padding for 64-byte alignment - uint padding2; /// Padding for 64-byte alignment - uint padding3; /// Padding for 64-byte alignment - -#ifdef __cplusplus - // parameterised constructor for HLSL compatibility - Float32BoxNode(uint val) - { - memset(this, val, sizeof(Float32BoxNode)); - } - - // default constructor - Float32BoxNode() : Float32BoxNode(0) - { - } -#endif -}; - -#ifdef __cplusplus -static_assert(FLOAT32_BOX_NODE_SIZE == sizeof(Float32BoxNode), "Float32BoxNode structure mismatch"); -static_assert(FLOAT32_BOX_NODE_CHILD0_OFFSET == offsetof(Float32BoxNode, child0), ""); -static_assert(FLOAT32_BOX_NODE_CHILD1_OFFSET == offsetof(Float32BoxNode, child1), ""); -static_assert(FLOAT32_BOX_NODE_CHILD2_OFFSET == offsetof(Float32BoxNode, child2), ""); -static_assert(FLOAT32_BOX_NODE_CHILD3_OFFSET == offsetof(Float32BoxNode, child3), ""); -static_assert(FLOAT32_BOX_NODE_BB0_MIN_OFFSET == offsetof(Float32BoxNode, bbox0_min), ""); 
-static_assert(FLOAT32_BOX_NODE_BB0_MAX_OFFSET == offsetof(Float32BoxNode, bbox0_max), ""); -static_assert(FLOAT32_BOX_NODE_BB1_MIN_OFFSET == offsetof(Float32BoxNode, bbox1_min), ""); -static_assert(FLOAT32_BOX_NODE_BB1_MAX_OFFSET == offsetof(Float32BoxNode, bbox1_max), ""); -static_assert(FLOAT32_BOX_NODE_BB2_MIN_OFFSET == offsetof(Float32BoxNode, bbox2_min), ""); -static_assert(FLOAT32_BOX_NODE_BB2_MAX_OFFSET == offsetof(Float32BoxNode, bbox2_max), ""); -static_assert(FLOAT32_BOX_NODE_BB3_MIN_OFFSET == offsetof(Float32BoxNode, bbox3_min), ""); -static_assert(FLOAT32_BOX_NODE_BB3_MAX_OFFSET == offsetof(Float32BoxNode, bbox3_max), ""); -static_assert(FLOAT32_BOX_NODE_FLAGS_OFFSET == offsetof(Float32BoxNode, flags), ""); -static_assert(FLOAT32_BOX_NODE_NUM_PRIM_OFFSET == offsetof(Float32BoxNode, numPrimitives), ""); -static_assert(FLOAT32_BOX_NODE_UNUSED2_OFFSET == offsetof(Float32BoxNode, padding2), ""); -static_assert(FLOAT32_BOX_NODE_UNUSED3_OFFSET == offsetof(Float32BoxNode, padding3), ""); -#endif - -//===================================================================================================================== -// Hardware 16-bit box node format and offsets -#define FLOAT16_BBOX_STRIDE 12 -#define FLOAT16_BOX_NODE_CHILD0_OFFSET 0 -#define FLOAT16_BOX_NODE_CHILD1_OFFSET 4 -#define FLOAT16_BOX_NODE_CHILD2_OFFSET 8 -#define FLOAT16_BOX_NODE_CHILD3_OFFSET 12 -#define FLOAT16_BOX_NODE_BB0_OFFSET 16 -#define FLOAT16_BOX_NODE_BB1_OFFSET 28 -#define FLOAT16_BOX_NODE_BB2_OFFSET 40 -#define FLOAT16_BOX_NODE_BB3_OFFSET 52 -#define FLOAT16_BOX_NODE_SIZE 64 - -//===================================================================================================================== -struct Float16BoxNode -{ - uint child0; /// Child node pointer 0 - uint child1; /// Child node pointer 1 - uint child2; /// Child node pointer 2 - uint child3; /// Child node pointer 3 - - uint3 bbox0; /// Node bounding box 0, packed, uses float16: minx, miny | minz, maxx | maxy, maxz - 
uint3 bbox1; /// Node bounding box 1, packed, uses float16: minx, miny | minz, maxx | maxy, maxz - uint3 bbox2; /// Node bounding box 2, packed, uses float16: minx, miny | minz, maxx | maxy, maxz - uint3 bbox3; /// Node bounding box 3, packed, uses float16: minx, miny | minz, maxx | maxy, maxz - - // NOTE: each bounding box is defined as uint3 for simplicity - // Each 32 bits pack 2x float16s. Order above is written as: a, b - // with a located in the lower 16 bits, b in the upper 16 bits - // bbox0.x stores minx, miny - // - // Alternatively, one can define each bbox as a pair of float16_t3 - // similar to FLOAT32_BOX_NODE. Indexing in hlsl would require extra work -}; - -#ifdef __cplusplus -static_assert(FLOAT16_BOX_NODE_SIZE == sizeof(Float16BoxNode), "Float16BoxNode structure mismatch"); -static_assert(FLOAT16_BOX_NODE_CHILD0_OFFSET == offsetof(Float16BoxNode, child0), ""); -static_assert(FLOAT16_BOX_NODE_CHILD1_OFFSET == offsetof(Float16BoxNode, child1), ""); -static_assert(FLOAT16_BOX_NODE_CHILD2_OFFSET == offsetof(Float16BoxNode, child2), ""); -static_assert(FLOAT16_BOX_NODE_CHILD3_OFFSET == offsetof(Float16BoxNode, child3), ""); -static_assert(FLOAT16_BOX_NODE_BB0_OFFSET == offsetof(Float16BoxNode, bbox0), ""); -static_assert(FLOAT16_BOX_NODE_BB1_OFFSET == offsetof(Float16BoxNode, bbox1), ""); -static_assert(FLOAT16_BOX_NODE_BB2_OFFSET == offsetof(Float16BoxNode, bbox2), ""); -static_assert(FLOAT16_BOX_NODE_BB3_OFFSET == offsetof(Float16BoxNode, bbox3), ""); -#endif - -//===================================================================================================================== -// Hardware triangle node format and offsets -// Note: GPURT limits triangle compression to 2 triangles per node. As a result the remaining bytes in the triangle node -// are used for sideband data. The geometry index is packed in bottom 24 bits and geometry flags in bits 25-26. 
-#define TRIANGLE_NODE_V0_OFFSET 0 -#define TRIANGLE_NODE_V1_OFFSET 12 -#define TRIANGLE_NODE_V2_OFFSET 24 -#define TRIANGLE_NODE_V3_OFFSET 36 -#define TRIANGLE_NODE_GEOMETRY_INDEX_AND_FLAGS_OFFSET 48 -#define TRIANGLE_NODE_PRIMITIVE_INDEX0_OFFSET 52 -#define TRIANGLE_NODE_PRIMITIVE_INDEX1_OFFSET 56 -#define TRIANGLE_NODE_ID_OFFSET 60 -#define TRIANGLE_NODE_SIZE 64 - -//===================================================================================================================== -// Triangle ID contains 4 1-byte fields, 1 per triangle: -// Triangle 0 [ 7: 0] -// Triangle 1 [15: 8] -// Triangle 2 [23:16] -// Triangle 3 [31:24] -// -// Each triangle's 8-bit segment contains these fields: -// I SRC [1:0] Specifies which vertex in triangle 0 corresponds to the I barycentric value -// J SRC [3:2] Specifies which vertex in triangle 0 corresponds to the J barycentric value -// Double Sided [ 4] Specifies whether triangle 0 should be treated as double sided for culling -// Flip Winding [ 5] Specifies whether triangle 0 should have its facedness flipped -// Procedural [ 6] Specifies whether it is a procedural node -// Opaque [ 7] Specifies whether triangle 0 should be considered as opaque -#define TRIANGLE_ID_BIT_STRIDE 8 - -#define TRIANGLE_ID_I_SRC_SHIFT 0 -#define TRIANGLE_ID_J_SRC_SHIFT 2 -#define TRIANGLE_ID_DOUBLE_SIDED_SHIFT 4 -#define TRIANGLE_ID_FLIP_WINDING_SHIFT 5 -#define TRIANGLE_ID_PROCEDURAL_SHIFT 6 -#define TRIANGLE_ID_OPAQUE_SHIFT 7 - -//===================================================================================================================== -struct TriangleNode -{ - float3 v0; // Vertex 0 - float3 v1; // Vertex 1 - float3 v2; // Vertex 2 - float3 v3; // Vertex 3 - uint geometryIndexAndFlags; // Geometry index and flags for pair of triangles - uint primitiveIndex0; // Primitive index for triangle 0 - uint primitiveIndex1; // Primitive index for triangle 1 - uint triangleId; // Triangle ID -}; - -#ifdef __cplusplus 
-static_assert(TRIANGLE_NODE_SIZE == sizeof(TriangleNode), "TriangleNode structure mismatch"); -static_assert(TRIANGLE_NODE_V0_OFFSET == offsetof(TriangleNode, v0), ""); -static_assert(TRIANGLE_NODE_V1_OFFSET == offsetof(TriangleNode, v1), ""); -static_assert(TRIANGLE_NODE_V2_OFFSET == offsetof(TriangleNode, v2), ""); -static_assert(TRIANGLE_NODE_V3_OFFSET == offsetof(TriangleNode, v3), ""); -static_assert(TRIANGLE_NODE_ID_OFFSET == offsetof(TriangleNode, triangleId), ""); -#endif - -//===================================================================================================================== -#define USER_NODE_PROCEDURAL_MIN_OFFSET 0 -#define USER_NODE_PROCEDURAL_MAX_OFFSET 12 -#define USER_NODE_PROCEDURAL_SIZE 64 - -//===================================================================================================================== -// Procedural node primitive data offsets -#define USER_NODE_PROCEDURAL_PRIMITIVE_INDEX_OFFSET TRIANGLE_NODE_PRIMITIVE_INDEX1_OFFSET -#define USER_NODE_PROCEDURAL_GEOMETRY_INDEX_AND_FLAGS_OFFSET TRIANGLE_NODE_GEOMETRY_INDEX_AND_FLAGS_OFFSET -#define USER_NODE_PROCEDURAL_TRIANGLE_ID_OFFSET TRIANGLE_NODE_ID_OFFSET - -//===================================================================================================================== -// User defined procedural node format -struct ProceduralNode -{ - float3 bbox_min; - float3 bbox_max; - uint padding1[6]; - uint geometryIndexAndFlags; - uint reserved; - uint primitiveIndex; - uint triangleId; -}; - -#ifdef __cplusplus -static_assert(USER_NODE_PROCEDURAL_SIZE == sizeof(ProceduralNode), "ProceduralNode structure mismatch"); -static_assert(USER_NODE_PROCEDURAL_MIN_OFFSET == offsetof(ProceduralNode, bbox_min), ""); -static_assert(USER_NODE_PROCEDURAL_MAX_OFFSET == offsetof(ProceduralNode, bbox_max), ""); -static_assert(USER_NODE_PROCEDURAL_GEOMETRY_INDEX_AND_FLAGS_OFFSET == offsetof(ProceduralNode, geometryIndexAndFlags), ""); 
-static_assert(USER_NODE_PROCEDURAL_PRIMITIVE_INDEX_OFFSET == offsetof(ProceduralNode, primitiveIndex), ""); -static_assert(USER_NODE_PROCEDURAL_TRIANGLE_ID_OFFSET == offsetof(ProceduralNode, triangleId), ""); -#endif - -#ifdef __cplusplus -//===================================================================================================================== -union NodePointer32 -{ - struct - { - uint32_t type : 3; // Hardware NODE_TYPE_* - uint32_t aligned_offset_64b : 29; // 64-byte aligned offset - }; - - uint32_t u32; -}; - -//===================================================================================================================== -// Instance base pointer layout from the HW raytracing IP 2.0 spec: -// Zero [ 2: 0] -// Tree Base Address (64B index)[53: 3] -// Force Opaque [ 54] -// Force Non-Opaque [ 55] -// Disable Triangle Cull [ 56] -// Flip Facedness [ 57] -// Cull Back Facing Triangles [ 58] -// Cull Front Facing Triangles [ 59] -// Cull Opaque [ 60] -// Cull Non-Opaque [ 61] -// Skip Triangles [ 62] -// Skip Procedural [ 63] -union NodePointer64 -{ - struct - { - uint64_t type : 3; // Hardware NODE_TYPE_* - uint64_t aligned_addr_64b : 51; // 64-byte aligned address - uint64_t force_opaque : 1; - uint64_t force_non_opaque : 1; - uint64_t disable_triangle_cull : 1; - uint64_t flip_facedness : 1; - uint64_t cull_back_face_triangle : 1; - uint64_t cull_front_face_triangle : 1; - uint64_t cull_opaque : 1; - uint64_t cull_non_opaque : 1; - uint64_t skip_triangles : 1; - uint64_t skip_procedural : 1; - }; - - uint64_t u64; -}; - -//===================================================================================================================== -union HwTriangleFlags -{ - struct - { - uint8_t i : 2; - uint8_t j : 2; - uint8_t double_sided : 1; - uint8_t flip_winding : 1; - uint8_t unused : 1; - uint8_t opaque : 1; - }; - - uint8_t u8; -}; - 
-//===================================================================================================================== -union HwTriangleID -{ - struct - { - HwTriangleFlags triangle0; - HwTriangleFlags triangle1; - uint16_t unused; - }; - - uint32_t u32; -}; - -//===================================================================================================================== -union BoxNodeChildFlags -{ - struct - { - uint8_t only_opaque : 1; - uint8_t only_non_opaque : 1; - uint8_t only_triangles : 1; - uint8_t only_procedural : 1; - uint8_t unused : 4; - }; - - uint8_t u8All; -}; - -//===================================================================================================================== -union BoxNodeFlags -{ - struct - { - BoxNodeChildFlags child0; - BoxNodeChildFlags child1; - BoxNodeChildFlags child2; - BoxNodeChildFlags child3; - }; - - uint32_t u32All; -}; -#endif - -//===================================================================================================================== -// Node pointer size in bytes -#define NODE_PTR_SIZE 4 - -#ifdef __cplusplus -static_assert(NODE_PTR_SIZE == sizeof(NodePointer32), "Node pointer size mismatch"); -#endif - -//===================================================================================================================== -// Function assumes the type passed in is a valid node type -// -static uint PackNodePointer(uint type, uint address) -{ - uint nodePointer = type; // this assumes that the type is valid - // uint pointer = type & 0x7; - - // The input address is a byte offset, and node_addr is a 64-byte offset that starts at bit 3. 
- nodePointer |= (address >> 3); // this assumes that the input address is 64-byte aligned - // pointer |= (address >> 6) << 3; - - return nodePointer; -} - -//===================================================================================================================== -static uint GetNodeType(uint nodePointer) -{ - // From the HW raytracing spec: - // node_type = node_pointer[ 2:0] - return nodePointer & 0x7; -} - -//===================================================================================================================== -static uint ClearNodeType(uint nodePointer) -{ - return nodePointer & ~0x7; -} - -//===================================================================================================================== -// NOTE: The highest 3 bits are excluded. They aren't written when building the QBVH and may have been repurposed. See -// NODE_POINTER_MASK_MSB -static uint ExtractNodePointerOffset(uint nodePointer) -{ - // From the HW raytracing spec: - // node_addr[60:0] = node_pointer[63:3] - // Also, based on the following, the node_addr is 64-byte aligned: - // fetch_addr0 = T#.base_address*256+node_addr*64 - return ClearNodeType(nodePointer) << 3; -} - -//===================================================================================================================== -// Removes temp flag (MSB) within node type set by RefitBounds when fp16 nodes mode is LEAF_NODES_IN_BLAS_AS_FP16. 
-static uint GetNodePointerExclMsbFlag(uint nodePointer) -{ - return nodePointer & (~NODE_POINTER_MASK_MSB); -} - -//===================================================================================================================== -// Primitive data structure that includes the unpacked data needed to process a primitive -struct PrimitiveData -{ - uint primitiveIndex; // Primitive index used to indicate what primitive in geometry description - uint geometryIndex; // Geometry index used to indicate what geometry description - uint geometryFlags; // Geometry flags contains if the geometry is opaque or non opaque -}; - -//===================================================================================================================== -// Extract the geometry index from the bottom 24 bits -static uint ExtractGeometryIndex(uint geometryIndexAndFlags) -{ - return geometryIndexAndFlags & 0xFFFFFF; -} - -//===================================================================================================================== -// Extract the geometry flags from bits 25-26 -static uint ExtractGeometryFlags(uint geometryIndexAndFlags) -{ - return (geometryIndexAndFlags >> 24) & 0x3; -} - -//===================================================================================================================== -// Extract the geometry index from the bottom 24 bits and geometry flags from bits 25-26 -static uint2 UnpackGeometryIndexAndFlags(uint geometryIndexAndFlags) -{ - return uint2(ExtractGeometryIndex(geometryIndexAndFlags), ExtractGeometryFlags(geometryIndexAndFlags)); -} - -//===================================================================================================================== -// Pack the geometry index in the bottom 24 bits and the geometry flags into bits 25-26 -static uint PackGeometryIndexAndFlags( - uint geometryIndex, - uint geometryFlags) -{ - return (geometryFlags << 24) | (geometryIndex & 0xFFFFFF); -} - 
-//===================================================================================================================== -// Additional geometry information for bottom level acceleration structures primitives -struct GeometryInfo -{ - uint geometryFlagsAndNumPrimitives; - uint geometryBufferOffset; - uint primNodePtrsOffset; // Offset from the base of all prim node ptrs to this geometry's prim node ptrs -}; - -#define DXGI_FORMAT_UNKNOWN 0 -#define DXGI_FORMAT_R32G32B32_FLOAT 6 - -#define DECODE_VERTEX_STRIDE 12 -#define DECODE_PRIMITIVE_STRIDE_TRIANGLE 36 -#define DECODE_PRIMITIVE_STRIDE_AABB 24 -#define GEOMETRY_INFO_SIZE 12 -#define GEOMETRY_INFO_FLAGS_AND_NUM_PRIMS_OFFSET 0 -#define GEOMETRY_INFO_GEOM_BUFFER_OFFSET 4 -#define GEOMETRY_INFO_PRIM_NODE_PTRS_OFFSET 8 - -#define PIPELINE_FLAG_SKIP_TRIANGLES 0x100 -#define PIPELINE_FLAG_SKIP_PROCEDURAL_PRIMITIVES 0x200 - -#ifdef __cplusplus -static_assert(GEOMETRY_INFO_SIZE == sizeof(GeometryInfo), "Geometry info structure mismatch"); -static_assert(GEOMETRY_INFO_FLAGS_AND_NUM_PRIMS_OFFSET == offsetof(GeometryInfo, geometryFlagsAndNumPrimitives), ""); -static_assert(GEOMETRY_INFO_GEOM_BUFFER_OFFSET == offsetof(GeometryInfo, geometryBufferOffset), ""); -static_assert(GEOMETRY_INFO_PRIM_NODE_PTRS_OFFSET == offsetof(GeometryInfo, primNodePtrsOffset), ""); -#endif - -//===================================================================================================================== -static uint ExtractGeometryInfoFlags(uint packedGeometryFlagsAndNumPrimitives) -{ - return (packedGeometryFlagsAndNumPrimitives >> 29); -} - -//===================================================================================================================== -static uint ExtractGeometryInfoNumPrimitives(uint packedGeometryFlagsAndNumPrimitives) -{ - // ((1 << 29) - 1) = 0x1fffffff - return (packedGeometryFlagsAndNumPrimitives & 0x1FFFFFFF); -} - 
-//===================================================================================================================== -static uint PackGeometryFlagsAndNumPrimitives(uint geometryFlags, uint numPrimitives) -{ - return (geometryFlags << 29) | numPrimitives; -} - -//===================================================================================================================== -// 64-byte aligned BVH2 node structure -struct BVHNode -{ - float3 bbox_left_min_or_v0; /// Left Node bounding box minimum bounds or vertex 0 - uint left; /// Left child node pointer (Also, primitive ID for leaves, instance ID for instances) - - float3 bbox_left_max_or_v1; /// Left Node bounding box maximum bounds or vertex 1 - uint right; /// Right child node pointer (Also, geometry Index for leaves) - - float3 bbox_right_min_or_v2; /// Right Node bounding box min bounds or vertex 2 - uint flags; /// Bottom: geometry flags OR Top: node[0] this is used to hold num instances - - float3 bbox_right_max; /// Right node bounding box max bounds - uint unused; /// Unused -}; - -#define BVH_NODE_SIZE 64 -#define BVH_NODE_LEFT_MIN_OFFSET 0 -#define BVH_NODE_V0_OFFSET BVH_NODE_LEFT_MIN_OFFSET -#define BVH_NODE_LEFT_OFFSET 12 -#define BVH_NODE_PRIMITIVE_ID_OFFSET BVH_NODE_LEFT_OFFSET -#define BVH_NODE_LEFT_MAX_OFFSET 16 -#define BVH_NODE_V1_OFFSET BVH_NODE_LEFT_MAX_OFFSET -#define BVH_NODE_RIGHT_OFFSET 28 -#define BVH_NODE_GEOMETRY_INDEX_OFFSET BVH_NODE_RIGHT_OFFSET -#define BVH_NODE_RIGHT_MIN_OFFSET 32 -#define BVH_NODE_V2_OFFSET BVH_NODE_RIGHT_MIN_OFFSET -#define BVH_NODE_FLAGS_OFFSET 44 -#define BVH_NODE_RIGHT_MAX_OFFSET 48 - -#ifdef __cplusplus -static_assert(BVH_NODE_SIZE == sizeof(BVHNode), "BVH2Node structure mismatch"); -static_assert(BVH_NODE_LEFT_MIN_OFFSET == offsetof(BVHNode, bbox_left_min_or_v0), ""); -static_assert(BVH_NODE_LEFT_OFFSET == offsetof(BVHNode, left), ""); -static_assert(BVH_NODE_LEFT_MAX_OFFSET == offsetof(BVHNode, bbox_left_max_or_v1), ""); 
-static_assert(BVH_NODE_RIGHT_OFFSET == offsetof(BVHNode, right), ""); -static_assert(BVH_NODE_RIGHT_MIN_OFFSET == offsetof(BVHNode, bbox_right_min_or_v2), ""); -static_assert(BVH_NODE_FLAGS_OFFSET == offsetof(BVHNode, flags), ""); -static_assert(BVH_NODE_RIGHT_MAX_OFFSET == offsetof(BVHNode, bbox_right_max), ""); -#endif - -//===================================================================================================================== -struct InstanceSidebandData1_1 -{ - uint instanceIndex; - uint blasNodePointer; // might not point to root - uint blasMetadataSize; - uint padding0; - float4 Transform[3]; // Non-inverse (original D3D12_RAYTRACING_INSTANCE_DESC.Transform) -}; - -#define RTIP1_1_INSTANCE_SIDEBAND_INSTANCE_INDEX_OFFSET 0 -#define RTIP1_1_INSTANCE_SIDEBAND_CHILD_POINTER_OFFSET 4 -#define RTIP1_1_INSTANCE_SIDEBAND_CHILD_METADATA_SIZE_OFFSET 8 -#define RTIP1_1_INSTANCE_SIDEBAND_OBJECT2WORLD_OFFSET 16 -#define RTIP1_1_INSTANCE_SIDEBAND_SIZE 64 - -//===================================================================================================================== -// 64-byte aligned structure matching D3D12_RAYTRACING_INSTANCE_DESC -struct InstanceDesc -{ - float4 Transform[3]; // Inverse transform for traversal - uint InstanceID_and_Mask; // 24-bit instance ID and 8-bit mask - uint InstanceContributionToHitGroupIndex_and_Flags; // 24-bit instance contribution and 8-bit flags - uint accelStructureAddressLo; // Lower part of acceleration structure base address - uint accelStructureAddressHiAndFlags; // Upper part of acceleration structure base address and - // HW raytracing IP 2.0 flags -}; - -#define INSTANCE_DESC_SIZE 64 -#define INSTANCE_DESC_WORLD_TO_OBJECT_XFORM_OFFSET 0 -#define INSTANCE_DESC_ID_AND_MASK_OFFSET 48 -#define INSTANCE_DESC_CONTRIBUTION_AND_FLAGS_OFFSET 52 -#define INSTANCE_DESC_VA_LO_OFFSET 56 -#define INSTANCE_DESC_VA_HI_OFFSET 60 - -#ifdef __cplusplus -static_assert(INSTANCE_DESC_SIZE == sizeof(InstanceDesc), "InstanceDesc 
structure mismatch"); -static_assert(INSTANCE_DESC_ID_AND_MASK_OFFSET == offsetof(InstanceDesc, InstanceID_and_Mask), ""); -static_assert(INSTANCE_DESC_CONTRIBUTION_AND_FLAGS_OFFSET == offsetof(InstanceDesc, InstanceContributionToHitGroupIndex_and_Flags), ""); -static_assert(INSTANCE_DESC_VA_LO_OFFSET == offsetof(InstanceDesc, accelStructureAddressLo), ""); -static_assert(INSTANCE_DESC_VA_HI_OFFSET == offsetof(InstanceDesc, accelStructureAddressHiAndFlags), ""); -#endif - -#ifdef __cplusplus -static_assert(RTIP1_1_INSTANCE_SIDEBAND_SIZE == sizeof(InstanceSidebandData1_1), "Instance sideband structure mismatch"); -static_assert(RTIP1_1_INSTANCE_SIDEBAND_INSTANCE_INDEX_OFFSET == offsetof(InstanceSidebandData1_1, instanceIndex), ""); -static_assert(RTIP1_1_INSTANCE_SIDEBAND_CHILD_POINTER_OFFSET == offsetof(InstanceSidebandData1_1, blasNodePointer), ""); -static_assert(RTIP1_1_INSTANCE_SIDEBAND_OBJECT2WORLD_OFFSET == offsetof(InstanceSidebandData1_1, Transform[0]), ""); -#endif - -//===================================================================================================================== -struct FusedInstanceNode -{ - InstanceDesc desc; - InstanceSidebandData1_1 sideband; - Float32BoxNode blasRootNode; -}; - -//===================================================================================================================== -struct InstanceNode -{ - InstanceDesc desc; - InstanceSidebandData1_1 sideband; -}; - -#define INSTANCE_NODE_DESC_OFFSET 0 -#define INSTANCE_NODE_EXTRA_OFFSET 64 -#define INSTANCE_NODE_SIZE 128 -#define FUSED_INSTANCE_NODE_ROOT_OFFSET INSTANCE_NODE_SIZE -#define FUSED_INSTANCE_NODE_SIZE 256 - -#ifdef __cplusplus -static_assert(INSTANCE_NODE_SIZE == sizeof(InstanceNode), "InstanceNode structure mismatch"); -static_assert(INSTANCE_NODE_DESC_OFFSET == offsetof(InstanceNode, desc), "InstanceNode structure mismatch"); -static_assert(INSTANCE_NODE_EXTRA_OFFSET == offsetof(InstanceNode, sideband), "InstanceNode structure mismatch"); -#endif 
- -//===================================================================================================================== -static uint64_t PackUint64(uint lowBits, uint highBits) -{ - // Note glslang doesn't like uint64_t casts - uint64_t addr = highBits; - addr = (addr << 32) | lowBits; - return addr; -} - -//====================================================================================================================== -// Packs the channels of a uint2 into a single uint64_t. -static uint64_t PackUint64(uint2 lowHigh) -{ - // Note glslang doesn't like uint64_t casts - uint64_t addr = lowHigh.y; - addr = (addr << 32) | lowHigh.x; - return addr; -} - -//===================================================================================================================== -static uint2 SplitUint64(uint64_t x) -{ - return uint2(x, (x >> 32)); -} - -//===================================================================================================================== -// Instance base pointer layout from the HW raytracing IP 2.0 spec: -// Zero [ 2: 0] -// Tree Base Address (64B index)[53: 3] -// Force Opaque [ 54] -// Force Non-Opaque [ 55] -// Disable Triangle Cull [ 56] -// Flip Facedness [ 57] -// Cull Back Facing Triangles [ 58] -// Cull Front Facing Triangles [ 59] -// Cull Opaque [ 60] -// Cull Non-Opaque [ 61] -// Skip Triangles [ 62] -// Skip Procedural [ 63] -// -// Since GPU VAs can only be 48 bits, only 42 bits of the Tree Base Address field are used: -// Used Address [44: 3] -// Unused Address [53:45] -// -#define INSTANCE_BASE_POINTER_ZERO_MASK 0x7ull -#define INSTANCE_BASE_POINTER_ADDRESS_USED_MASK 0x1FFFFFFFFFF8ull -#define INSTANCE_BASE_POINTER_ADDRESS_UNUSED_MASK 0x3FE00000000000ull -#define INSTANCE_BASE_POINTER_ADDRESS_MASK 0x3FFFFFFFFFFFF8ull -#define INSTANCE_BASE_POINTER_FLAGS_MASK 0xFFC0000000000000ull - -#define NODE_POINTER_FLAGS_SHIFT 54 -#define NODE_POINTER_FORCE_OPAQUE_SHIFT 54 -#define NODE_POINTER_FORCE_NON_OPAQUE_SHIFT 55 -#define 
NODE_POINTER_DISABLE_TRIANGLE_CULL_SHIFT 56 -#define NODE_POINTER_FLIP_FACEDNESS_SHIFT 57 -#define NODE_POINTER_CULL_BACK_FACING_SHIFT 58 -#define NODE_POINTER_CULL_FRONT_FACING_SHIFT 59 -#define NODE_POINTER_CULL_OPAQUE_SHIFT 60 -#define NODE_POINTER_CULL_NON_OPAQUE_SHIFT 61 -#define NODE_POINTER_SKIP_TRIANGLES_SHIFT 62 -#define NODE_POINTER_SKIP_PROCEDURAL_SHIFT 63 - -#define RAY_FLAG_VALID_MASK 0x3ffu -#define RAY_FLAG_EXCLUDE_MASK (RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH | RAY_FLAG_SKIP_CLOSEST_HIT_SHADER) -#define RAY_FLAG_OVERRIDE_MASK (RAY_FLAG_FORCE_OPAQUE | RAY_FLAG_FORCE_NON_OPAQUE) // 0x3 -#define RAY_FLAG_PRESERVE_MASK (RAY_FLAG_VALID_MASK & (~RAY_FLAG_OVERRIDE_MASK)) // 0x3fc - -#define POINTER_FLAGS_HIDWORD_SHIFT (NODE_POINTER_FORCE_OPAQUE_SHIFT - 32) // 22 -#define POINTER_FLAGS_VALID_MASK (RAY_FLAG_VALID_MASK << POINTER_FLAGS_HIDWORD_SHIFT) // 0x3ff << 22 -#define POINTER_FLAGS_EXCLUDED_MASK ~(POINTER_FLAGS_VALID_MASK) // 0xFFC00000 //===================================================================================================================== struct StackPtrs @@ -919,112 +107,6 @@ static_assert(STACK_PTRS_DST_PTR_OFFSET == offsetof(StackPtrs, stackPtrNo static_assert(STACK_PTRS_NUM_LEAFS_DONE_OFFSET == offsetof(StackPtrs, numLeafsDone), ""); #endif -//===================================================================================================================== -// Build Stage Counters (Debug only) -// It starts with the qbvhGlobalCounters offset, i.e., -// qbvhGlobalStack...qbvhGlobalStackPtrs...bvhBuildDebugCounters - -#define COUNTER_MORTONGEN_OFFSET 0x0 -#define COUNTER_MORTON_SORT_OFFSET 0x4 -#define COUNTER_SORTLEAF_OFFSET 0x8 -#define COUNTER_BUILDPLOC_OFFSET 0xC -#define COUNTER_BUILDLBVH_OFFSET 0x10 -#define COUNTER_REFIT_OFFSET 0x14 -#define COUNTER_INITENCODEHWBVH_OFFSET 0x18 -#define COUNTER_ENCODEHWBVH_OFFSET 0x1C -#define COUNTER_EMPTYPRIM_OFFSET 0x20 -#define COUNTER_EMITCOMPACTSIZE_OFFSET 0x24 -#define 
COUNTER_BUILDFASTLBVH_OFFSET 0x28 - -//===================================================================================================================== -// Get leaf triangle node size in bytes -static uint GetBvhNodeSizeTriangle() -{ - return TRIANGLE_NODE_SIZE; -} - -//===================================================================================================================== -// Get leaf AABB node size in bytes -static uint GetBvhNodeSizeProcedural() -{ - return USER_NODE_PROCEDURAL_SIZE; -} - -//===================================================================================================================== -// Get leaf instance node size in bytes -static uint GetBvhNodeSizeInstance(uint enableFusedInstanceNode) -{ - return (enableFusedInstanceNode == 0) ? INSTANCE_NODE_SIZE : FUSED_INSTANCE_NODE_SIZE; -} - -//===================================================================================================================== -// Get internal BVH node size in bytes -static uint GetBvhNodeSizeInternal() -{ - return FLOAT32_BOX_NODE_SIZE; -} - -//===================================================================================================================== -// Get internal BVH node size in bytes -static uint GetBvhNodeSizeLeaf( - uint primitiveType, - uint enableFusedInstanceNode) -{ - uint sizeInBytes = 0; - switch (primitiveType) - { - case PrimitiveType::Triangle: - sizeInBytes = GetBvhNodeSizeTriangle(); - break; - case PrimitiveType::AABB: - sizeInBytes = GetBvhNodeSizeProcedural(); - break; - case PrimitiveType::Instance: - sizeInBytes = GetBvhNodeSizeInstance(enableFusedInstanceNode); - break; - } - - return sizeInBytes; -} - -//===================================================================================================================== -static uint CalcParentPtrOffset(uint nodePtr) -{ - // Subtract 1 from the index to account for negative offset calculations. I.e. 
index 0 is actually at -4 byte - // offset from the end of the parent pointer memory - const uint linkIndex = (nodePtr >> 3) - 1; - return linkIndex * NODE_PTR_SIZE; -} - -//===================================================================================================================== -static uint CalcBottomGeometryInfoSize(uint numGeometries) -{ - return numGeometries * GEOMETRY_INFO_SIZE; -} - -//===================================================================================================================== -struct DataOffsetAndSize -{ - uint offset; - uint size; -}; - -//===================================================================================================================== -struct StateTaskQueueCounter -{ - uint phase; - uint startPhaseIndex; - uint endPhaseIndex; - uint taskCounter; - uint numTasksDone; -}; - -#define STATE_TASK_QUEUE_PHASE_OFFSET 0 -#define STATE_TASK_QUEUE_START_PHASE_INDEX_OFFSET 4 -#define STATE_TASK_QUEUE_END_PHASE_INDEX_OFFSET 8 -#define STATE_TASK_QUEUE_TASK_COUNTER_OFFSET 12 -#define STATE_TASK_QUEUE_NUM_TASKS_DONE_OFFSET 16 - //===================================================================================================================== // Counters used in encode phase @@ -1105,439 +187,20 @@ static_assert(TASK_LOOP_QBVH_TASKS_DONE_OFFSET == offsetof(TaskLoopCou #endif //===================================================================================================================== -#define REF_SCRATCH_SIDE_LEFT 0 -#define REF_SCRATCH_SIDE_RIGHT 1 -#define REF_SCRATCH_SIDE_LEAF 2 - -#define USE_BLAS_PRIM_COUNT 0 -struct TDRefScratch -{ - uint primitiveIndex; - uint nodeIndex; - float3 center; - BoundingBox box; - uint side; -#if USE_BVH_REBRAID - uint nodePointer; //rebraid only -#endif -#if USE_BLAS_PRIM_COUNT - uint numPrimitives; -#endif -}; - -#define TD_REF_PRIM_INDEX_OFFSET 0 -#define TD_REF_NODE_INDEX_OFFSET 4 -#define TD_REF_CENTER_OFFSET 8 -#define TD_REF_BOX_OFFSET 20 -#define 
TD_REF_SIDE_OFFSET (TD_REF_BOX_OFFSET + sizeof(BoundingBox)) -#define TD_REF_NODE_POINTER_OFFSET (TD_REF_SIDE_OFFSET + 4) -#if USE_BLAS_PRIM_COUNT -#define TD_REF_NUM_PRIM_OFFSET (TD_REF_NODE_POINTER_OFFSET + sizeof(uint)) -#endif - -//===================================================================================================================== -#define NUM_SPLIT_BINS 4 - -#define TD_NODE_REBRAID_STATE_OPEN 0 -#define TD_NODE_REBRAID_STATE_CLOSED 1 - -struct TDBins -{ - uint64_t firstRefIndex; - - UintBoundingBox binBoxes[3][NUM_SPLIT_BINS]; - uint binPrimCount[3][NUM_SPLIT_BINS]; - - uint bestAxis; - uint bestSplit; - uint numLeft; - uint numRight; - -#if USE_BLAS_PRIM_COUNT - uint binBLASPrimCount[3][NUM_SPLIT_BINS]; -#endif -}; - -#define TD_BINS_FIRST_REF_INDEX_OFFSET 0 -#define TD_BINS_BIN_BOXES_OFFSET (TD_BINS_FIRST_REF_INDEX_OFFSET + 8) -#define TD_BINS_BIN_PRIM_COUNT_OFFSET (TD_BINS_BIN_BOXES_OFFSET + sizeof(UintBoundingBox) * NUM_SPLIT_BINS * 3) -#define TD_BINS_BEST_AXIS_OFFSET (TD_BINS_BIN_PRIM_COUNT_OFFSET + sizeof(uint) * NUM_SPLIT_BINS * 3) -#define TD_BINS_BEST_SPLIT_OFFSET (TD_BINS_BEST_AXIS_OFFSET + 4) -#define TD_BINS_NUM_LEFT_OFFSET (TD_BINS_BEST_SPLIT_OFFSET + 4) -#define TD_BINS_NUM_RIGHT_OFFSET (TD_BINS_NUM_LEFT_OFFSET + 4) -#if USE_BLAS_PRIM_COUNT -#define TD_BINS_BLAS_PRIM_COUNT_OFFSET (TD_BINS_NUM_RIGHT_OFFSET + 4) -#endif - -struct TDNode -{ - UintBoundingBox centroidBox; - uint binsIndex; - uint childCount; - -#if USE_BVH_REBRAID - uint largestAxis; // rebraid only - float largestWidth; // rebraid only - uint rebraidState; // rebraid only - uint primIndex; // rebraid only -#endif -}; - -#define TD_NODE_CENTROID_BOX_OFFSET 0 -#define TD_NODE_BINS_INDEX_OFFSET (TD_NODE_CENTROID_BOX_OFFSET + sizeof(UintBoundingBox)) -#define TD_NODE_CHILD_COUNT_OFFSET (TD_NODE_BINS_INDEX_OFFSET + 4) -#define TD_NODE_LARGEST_AXIS_OFFSET (TD_NODE_CHILD_COUNT_OFFSET + 4) -#define TD_NODE_LARGEST_WIDTH_OFFSET (TD_NODE_LARGEST_AXIS_OFFSET + 4) -#define 
TD_NODE_REBRAID_STATE_OFFSET (TD_NODE_LARGEST_WIDTH_OFFSET + 4) -#define TD_NODE_PRIM_INDEX_OFFSET (TD_NODE_REBRAID_STATE_OFFSET + 4) - -//===================================================================================================================== - -#define TD_REBRAID_STATE_NO_OPEN 0 -#define TD_REBRAID_STATE_NEED_OPEN 1 -#define TD_REBRAID_STATE_OOM 2 - -#define TD_PHASE_INIT_STATE 0 -#define TD_PHASE_INIT_REFS_TO_LEAVES 1 -#define TD_PHASE_CHECK_NEED_ALLOC 2 -#define TD_PHASE_ALLOC_ROOT_NODE 3 -#define TD_PHASE_REBRAID_COUNT_OPENINGS 4 -#define TD_PHASE_REBRAID_CHECK_TERMINATION 5 -#define TD_PHASE_REBRAID_OPEN 6 -#define TD_PHASE_REBRAID_UPDATE_NODES 7 -#define TD_PHASE_BIN_REFS 8 -#define TD_PHASE_FIND_BEST_SPLIT 9 -#define TD_PHASE_SECOND_PASS 10 -#define TD_PHASE_UPDATE_NEW_NODES 11 -#define TD_PHASE_DONE 12 - -struct StateTDBuild -{ - uint numNodes; - uint numProcessedNodes; - uint numNodesAllocated; - uint numRefs; - uint numRefsAllocated; - uint numInactiveInstance; - UintBoundingBox rootCentroidBBox; - uint numLeaves; - uint binsCounter; - -#if USE_BVH_REBRAID - uint rebraidState; - uint leafAllocOffset; -#endif -}; - -#define STATE_TD_NUM_NODES_OFFSET 0 -#define STATE_TD_NUM_PROCESSED_NODES_OFFSET 4 -#define STATE_TD_NUM_NODES_ALLOCATED_OFFSET 8 -#define STATE_TD_NUM_REFS_OFFSET 12 -#define STATE_TD_NUM_REFS_ALLOCATED_OFFSET 16 -#define STATE_TD_NUM_INACTIVE_INSTANCE_OFFSET 20 -#define STATE_TD_CENTROID_BBOX_OFFSET 24 -#define STATE_TD_NUM_LEAVES_OFFSET (STATE_TD_CENTROID_BBOX_OFFSET + sizeof(UintBoundingBox)) -#define STATE_TD_BINS_COUNTER_OFFSET (STATE_TD_NUM_LEAVES_OFFSET + 4) -#define STATE_TD_REBRAID_STATE_OFFSET (STATE_TD_BINS_COUNTER_OFFSET + 4) -#define STATE_TD_LEAF_ALLOC_OFFSET_OFFSET (STATE_TD_REBRAID_STATE_OFFSET + 4) - -//===================================================================================================================== -struct Flags -{ - uint dataValid; - uint prefixSum; -}; - -#define FLAGS_DATA_VALID_OFFSET 0 
-#define FLAGS_PREFIX_SUM_OFFSET 4 - -#define DLB_KEYS_PER_THREAD 4 -#define DLB_KEYS_PER_GROUP (BUILD_THREADGROUP_SIZE * DLB_KEYS_PER_THREAD) - -#define DLB_VALID_SUM 0 -#define DLB_VALID_PREFIX_SUM 1 -#define NUM_DLB_VALID_TYPES 2 - -//===================================================================================================================== - -#define PLOC_PHASE_INIT 0 -#define PLOC_PHASE_FIND_NEAREST_NEIGHBOUR 1 -#define PLOC_PHASE_UPDATE_CLUSTER_COUNT 2 -#define PLOC_PHASE_DONE 3 - -struct StatePLOC -{ - uint numClusters; - uint internalNodesIndex; - uint clusterListIndex; - uint numClustersAlloc; -}; - -#define STATE_PLOC_NUM_CLUSTERS_OFFSET 0 -#define STATE_PLOC_INTERNAL_NODES_INDEX_OFFSET 4 -#define STATE_PLOC_CLUSTER_LIST_INDEX_OFFSET 8 -#define STATE_PLOC_NUM_CLUSTERS_ALLOC_OFFSET 12 - -//===================================================================================================================== -#define REBRAID_PHASE_CALC_SUM 0 -#define REBRAID_PHASE_OPEN 1 -#define REBRAID_PHASE_DONE 2 - -struct RebraidState -{ - float sumValue[2]; - uint mutex; - uint numLeafIndices; - uint iterationCount; -}; - -#define STATE_REBRAID_SUM_VALUE_OFFSET 0 -#define STATE_REBRAID_MUTEX_OFFSET (STATE_REBRAID_SUM_VALUE_OFFSET + 8) -#define STATE_REBRAID_NUM_LEAF_INDICES_OFFSET (STATE_REBRAID_MUTEX_OFFSET + 4) -#define STATE_REBRAID_ITERATION_COUNT_OFFSET (STATE_REBRAID_NUM_LEAF_INDICES_OFFSET + 4) - -#define REBRAID_KEYS_PER_THREAD 4 -#define REBRAID_KEYS_PER_GROUP (BUILD_THREADGROUP_SIZE * REBRAID_KEYS_PER_THREAD) - -//===================================================================================================================== -#define TS_PHASE_INIT 0 -#define TS_PHASE_CALC_SUM 1 -#define TS_PHASE_ALLOC_REFS 2 -#define TS_PHASE_SPLIT 3 -#define TS_PHASE_DONE 4 - -struct ScratchTSRef -{ - uint leafIndex; - uint numSplits; - - uint splitLeafBaseIndex; - - BoundingBox bbox; -}; - -struct ScratchTSState -{ - uint refListIndex; - uint numRefs; - uint 
numRefsAlloc; - float sum; - uint mutex; -}; - -#define STATE_TS_REF_LIST_INDEX_OFFSET 0 -#define STATE_TS_NUM_REFS_OFFSET STATE_TS_REF_LIST_INDEX_OFFSET + 4 -#define STATE_TS_NUM_REFS_ALLOC_OFFSET STATE_TS_NUM_REFS_OFFSET + 4 -#define STATE_TS_SUM_OFFSET STATE_TS_NUM_REFS_ALLOC_OFFSET + 4 -#define STATE_TS_MUTEX_OFFSET STATE_TS_SUM_OFFSET + 4 - -//===================================================================================================================== -struct IndexBufferInfo -{ - uint gpuVaLo; - uint gpuVaHi; - uint byteOffset; - uint format; -}; - -#define INDEX_BUFFER_INFO_GPU_VA_LO_OFFSET 0 -#define INDEX_BUFFER_INFO_GPU_VA_HI_OFFSET 4 -#define INDEX_BUFFER_INFO_BYTE_OFFSET_OFFSET 8 -#define INDEX_BUFFER_INFO_FORMAT_OFFSET 12 - -//===================================================================================================================== -enum RebraidType : uint -{ - Off = 0, // No Rebraid - V1 = 1, // First version of Rebraid - V2 = 2, // Second version of Rebraid -}; - -#define BUILD_MODE_LINEAR 0 -// BUILD_MODE_AC was 1, but it has been removed. 
-#define BUILD_MODE_PLOC 2 - -#define SAH_COST_TRIANGLE_INTERSECTION 1.5 -#define SAH_COST_AABBB_INTERSECTION 1 - -#define ENCODE_FLAG_ARRAY_OF_POINTERS 0x00000001 -#define ENCODE_FLAG_UPDATE_IN_PLACE 0x00000002 -#define ENCODE_FLAG_REBRAID_ENABLED 0x00000004 -#define ENCODE_FLAG_ENABLE_FUSED_INSTANCE_NODE 0x00000008 - -//===================================================================================================================== -struct IntersectionResult -{ -#if defined(__cplusplus) - IntersectionResult(int val) - { - memset(this, val, sizeof(IntersectionResult)); - } -#endif - float t; // Relative to tMin - uint nodeIndex; - float2 barycentrics; - uint geometryIndex; - uint primitiveIndex; - uint instNodePtr; - uint hitkind; - uint instanceContribution; - -#if DEVELOPER - uint numIterations; - uint maxStackDepth; - uint numRayBoxTest; - uint numCandidateHits; - uint numRayTriangleTest; - uint numAnyHitInvocation; - uint instanceIntersections; -#endif -}; - -//===================================================================================================================== -// Commit status -typedef uint COMMITTED_STATUS; - -#define COMMITTED_NOTHING 0 -#define COMMITTED_TRIANGLE_HIT 1 -#define COMMITTED_PROCEDURAL_PRIMITIVE_HIT 2 - -//===================================================================================================================== -// Candidate type -typedef uint CANDIDATE_STATUS; - -#define CANDIDATE_NON_OPAQUE_TRIANGLE 0 -#define CANDIDATE_PROCEDURAL_PRIMITIVE 1 -#define CANDIDATE_NON_OPAQUE_PROCEDURAL_PRIMITIVE 2 -#define CANDIDATE_EARLY_RAY_TERMINATE 4 - -#define INIT_LDS_STATE 0xFFFFFFFF - -//===================================================================================================================== -// Data required for system value intrinsics -struct RaySystemData -{ - uint currNodePtr; - float rayTCurrent; - uint instNodePtr; - uint instanceContribution; - uint geometryIndex; - uint primitiveIndex; - float2 
barycentrics; - uint frontFace; - float3 origin; - float3 direction; -}; +struct LutData {}; //===================================================================================================================== -#if DEFINE_RAYDESC || __cplusplus -// Ray description matching the D3D12 HLSL header -struct RayDesc -{ - float3 Origin; - float TMin; - float3 Direction; - float TMax; -#if __cplusplus - RayDesc() - : - Origin(float3(0, 0, 0)), - TMin(0.f), - Direction(float3(0, 0, 0)), - TMax(0.f) - {} +// different ways to encode the scene bounds used to generate morton codes - RayDesc(uint val) - { - memset(this, val, sizeof(RayDesc)); - } -#endif -}; +#ifdef __cplusplus +enum SceneBoundsCalculation : uint +#else +enum class SceneBoundsCalculation : uint32 #endif - -//===================================================================================================================== -// Internal RayQuery structure initialised at TraceRaysInline() -struct RayQueryInternal { -#if __cplusplus - RayQueryInternal(int val) { - memset(this, val, sizeof(RayQueryInternal)); - } -#endif - - // Internal query data holding address of current BVH and stack information. - // Additional data that may be required will be stored here. - uint bvhLo; - uint bvhHi; - uint topLevelBvhLo; - uint topLevelBvhHi; - uint stackPtr; - uint stackPtrTop; - uint stackNumEntries; - uint instNodePtr; - uint currNodePtr; - uint instanceHitContributionAndFlags; - uint prevNodePtr; - uint isGoingDown; - uint lastInstanceNode; - - RayDesc rayDesc; - float rayTMin; - uint rayFlags; - uint instanceInclusionMask; - - // Candidate system data - CANDIDATE_STATUS candidateType; - RaySystemData candidate; - - // Committed system data - COMMITTED_STATUS committedStatus; - RaySystemData committed; - - uint reserved; - - // Counter data - // @note We don't wrap these in DEVELOPER because it would result in mismatch of RayQuery struct size - // on the driver side when we're not using counters. 
- uint numRayBoxTest; - uint numRayTriangleTest; - uint numIterations; - uint maxStackDepthAndDynamicId; - uint clocks; - uint numCandidateHits; - uint instanceIntersections; - uint rayQueryObjId; + BasedOnGeometry = 0x0, + BasedOnGeometryWithSize = 0x1 }; -//===================================================================================================================== -struct HitGroupInfo -{ - uint2 closestHitId; - uint2 anyHitId; - uint2 intersectionId; - uint tableIndex; -}; - -//===================================================================================================================== -struct TriangleData -{ -#if __cplusplus - TriangleData(int val) - { - memset(this, val, sizeof(TriangleData)); - } - - TriangleData() : TriangleData(0) - {} -#endif - - float3 v0; ///< Vertex 0 - float3 v1; ///< Vertex 1 - float3 v2; ///< Vertex 2 -}; - -//===================================================================================================================== -struct LutData {}; - #endif diff --git a/tools/CompileRTShaders.py b/tools/CompileRTShaders.py index 67cd973..a967fdf 100644 --- a/tools/CompileRTShaders.py +++ b/tools/CompileRTShaders.py @@ -39,7 +39,6 @@ import shutil import glob import pathlib -from typing import List DWORDS_PER_LINE = 8 @@ -92,15 +91,18 @@ def getName(self): def isBVH(self): return not self.isLibrary() +# Explicitly pass the legacy RtIp level as a separate define so HLSL code can determine whether its GPURT_RTIP_LEVEL is the legacy one. 
+commonTraceDefines = f"GPURT_RTIP_LEGACY_LEVEL={maxLegacyRtIpLevel}" + traceShaderConfigs = [ - ShaderConfig(path="GpuRtLibrary.hlsl", outputName="GpuRtLibrarySw", defines="GPURT_RTIP_LEVEL=0"), - ShaderConfig(path="GpuRtLibrary.hlsl", outputName="GpuRtLibrarySwDev", defines="DEVELOPER=1,GPURT_RTIP_LEVEL=0"), + ShaderConfig(path="GpuRtLibrary.hlsl", outputName="GpuRtLibrarySw", defines=f"GPURT_RTIP_LEVEL=0,{commonTraceDefines}"), + ShaderConfig(path="GpuRtLibrary.hlsl", outputName="GpuRtLibrarySwDev", defines=f"DEVELOPER=1,GPURT_RTIP_LEVEL=0,{commonTraceDefines}"), # Below 2 lines will be removed after GPURT_MINIMUM_INTERFACE_MAJOR_VERSION is bumped to 48 - ShaderConfig(path="GpuRtLibrary.hlsl", outputName="GpuRtLibrary", defines="USE_HW_INTRINSIC=1,GPURT_RTIP_LEVEL=0"), - ShaderConfig(path="GpuRtLibrary.hlsl", outputName="GpuRtLibraryDev", defines="USE_HW_INTRINSIC=1,DEVELOPER=1,GPURT_RTIP_LEVEL=0"), + ShaderConfig(path="GpuRtLibrary.hlsl", outputName="GpuRtLibrary", defines=f"USE_HW_INTRINSIC=1,GPURT_RTIP_LEVEL=0,{commonTraceDefines}"), + ShaderConfig(path="GpuRtLibrary.hlsl", outputName="GpuRtLibraryDev", defines=f"USE_HW_INTRINSIC=1,DEVELOPER=1,GPURT_RTIP_LEVEL=0,{commonTraceDefines}"), - ShaderConfig(path="GpuRtLibrary.hlsl", outputName="GpuRtLibraryLegacy", defines=f"USE_HW_INTRINSIC=1,GPURT_RTIP_LEVEL={maxLegacyRtIpLevel}"), - ShaderConfig(path="GpuRtLibrary.hlsl", outputName="GpuRtLibraryDevLegacy", defines=f"USE_HW_INTRINSIC=1,DEVELOPER=1,GPURT_RTIP_LEVEL={maxLegacyRtIpLevel}"), + ShaderConfig(path="GpuRtLibrary.hlsl", outputName="GpuRtLibraryLegacy", defines=f"USE_HW_INTRINSIC=1,GPURT_RTIP_LEVEL={maxLegacyRtIpLevel},{commonTraceDefines}"), + ShaderConfig(path="GpuRtLibrary.hlsl", outputName="GpuRtLibraryDevLegacy", defines=f"USE_HW_INTRINSIC=1,DEVELOPER=1,GPURT_RTIP_LEVEL={maxLegacyRtIpLevel},{commonTraceDefines}"), ] bvhShaderConfigs = [ @@ -117,7 +119,7 @@ def isBVH(self): ShaderConfig(path="GenerateMortonCodes.hlsl", entryPoint="GenerateMortonCodes"), 
ShaderConfig(path="Rebraid.hlsl", entryPoint="Rebraid"), ShaderConfig(path="BuildBVH.hlsl", entryPoint="BuildBVH", defines="USE_BUILD_LBVH=1"), - ShaderConfig(path="BuildBVHPLOC.hlsl", entryPoint="BuildBVHPLOC"), + ShaderConfig(path="BuildPLOC.hlsl", entryPoint="BuildPLOC"), ShaderConfig(path="UpdateQBVH.hlsl", entryPoint="UpdateQBVH"), ShaderConfig(path="RefitBounds.hlsl", entryPoint="RefitBounds"), ShaderConfig(path="ClearBuffer.hlsl", entryPoint="ClearBuffer"), @@ -142,6 +144,9 @@ def isBVH(self): ShaderConfig(path="InitExecuteIndirect.hlsl", entryPoint="InitExecuteIndirect", outputName="InitExecuteIndirect"), ShaderConfig(path="PairCompression.hlsl", entryPoint="PairCompression"), ShaderConfig(path="MergeSort.hlsl", entryPoint="MergeSort"), + ShaderConfig(path="MergeSort.hlsl", entryPoint="MergeSortLocal"), + ShaderConfig(path="MergeSort.hlsl", entryPoint="MergeSortGlobalIteration"), + ShaderConfig(path="MergeSort.hlsl", entryPoint="MergeSortCopyLastLevel"), ShaderConfig(path="InitAccelerationStructure.hlsl", entryPoint="InitAccelerationStructure"), ShaderConfig(path="InitAccelerationStructure.hlsl", entryPoint="InitAccelerationStructure", defines="IS_UPDATE=1", outputName="InitUpdateAccelerationStructure"), ShaderConfig(path="BuildFastAgglomerativeLbvh.hlsl", entryPoint="BuildFastAgglomerativeLbvh"), @@ -179,8 +184,6 @@ def getBaseDxcCommandArgs(isBvh:bool, isLibrary:bool, isSpirv:bool): return dxcOptions -validationSpecialCaseDefines = {x.path:x.defines for x in list(filter(lambda a : a.defines is not None, bvhShaderConfigs))} - """ Combines args into an array of strings that can be used as compilation command by InvokeSubprocess. 
Output command lacks: filename, -M flag for listing includes and entrypoint-specific defines like USE_HW_INTRINSIC @@ -192,8 +195,7 @@ def getValidationCmdArgs(args) -> [str]: compilerPath = FixExePath(compilerPath) validateCommand = [compilerPath] - - validateCommand += getBaseDxcCommandArgs(True, True, False) + validateCommand += getBaseDxcCommandArgs(True, True, args.spirv) validateCommand += ["-Wno-misplaced-attributes"] # -Wmisplaced-attributes is triggered by [RootSignature()] # used by entrypoint code and compiled as library validateCommand += ['-Fo', 'temp.bin'] @@ -201,7 +203,7 @@ def getValidationCmdArgs(args) -> [str]: validateCommand += ['-DLIBRARY_COMPILATION'] #use defines from cmake - for d in args.defines.split(';'): + for d in args.defines.split(' '): d = d.strip() if d != '': validateCommand += ['-D' + d] @@ -212,33 +214,37 @@ def getValidationCmdArgs(args) -> [str]: validateCommand += ['-DUSE_HW_INTRINSIC=1'] #use include pathes from cmake - for p in args.includePaths.split(';'): + for p in args.includePaths.split(' '): p = p.strip() if p != '': validateCommand += ['-I', p] return validateCommand +def removeSuffix(path: pathlib.Path, suffix: str) -> pathlib.Path: + return pathlib.Path(path.as_posix()[:-len(suffix)]) + """ -Finds all hlsl-hlsli pairs of files under basePath (recursively). -Outputs dict of filenames (without extension) to pair of bools meaning (has_hlsl_implementation, has_hlsli_header) +Finds all implementation-interface pairs of files under basePath (recursively). 
+Outputs dict of filenames (without extension) to pair of bools meaning (has_implementation, has_interface) """ -def getHlslHlsliPairs(basePath: str) -> {str: (bool, bool)}: - # pairs -> {hlsl_hlsli_pair_path_without_extension: (has_hlsl, has_hlsli)} +def getImplInterfacePairs(directory: pathlib.Path, implementationSuffix: str, interfaceSuffix: str) -> {pathlib.Path, (bool, bool)}: + # pairs -> {pair_path_without_extension: (has_implementation, has_interface)} pairs = {} - # insert hlsl part of pairs - for hlslfile in glob.glob(basePath+"/**/*.hlsl", recursive=True): - withoutExtension = pathlib.Path(hlslfile).with_suffix("") - pairs[withoutExtension] = (True, False) + # insert implementation part of pairs + for implPath in directory.rglob("*" + implementationSuffix): + pairs[removeSuffix(implPath.resolve(), implementationSuffix)] = (True, False) - #insert hlsli part of pairs - for hlslifile in glob.glob(basePath+"/**/*.hlsli", recursive=True): - withoutExtension = pathlib.Path(hlslifile).with_suffix("") - hasHlslFile = pairs.get(withoutExtension, (False, False))[0] - pairs[withoutExtension] = (hasHlslFile, True) + # insert interface part of pairs + for interfacePath in directory.rglob("*" + interfaceSuffix): + withoutSuffix = removeSuffix(interfacePath.resolve(), interfaceSuffix) + hasImplFile = pairs.get(withoutSuffix, (False, False))[0] + pairs[withoutSuffix] = (hasImplFile, True) return pairs +validationSpecialCaseDefines = {x.path:x.defines for x in list(filter(lambda a : a.defines is not None, bvhShaderConfigs))} + """ Some files/functions can be included conditionally behind ifdefs. This function combines defines, so that we can test compilation with different combinations of defines. @@ -261,38 +267,34 @@ def getDefineCombos(path: pathlib.Path) -> [[str]]: """ shaderClean's hlsl-hlsli pair is considered clean when: -1. it does not include anything else than .hlsli files; -2. it does not include anything from outside of shaderClean directory. +1. 
let [(dir, suffix)] = allowedDirSuffix, it includes only -suffix files from dir/ directory, and +2. it does not include any other files except its own .hlsl file. """ -def validateIncludes(cmd: List[str], path: pathlib.Path, shadersCleanStr: str) -> bool: +def validateIncludes(cmd: [str], path: pathlib.Path, implSuffix: str, interfaceSuffix: str, + allowedDirSuffix: [(pathlib.Path, str)]) -> bool: + allowedDirSuffix = [(dirPath.as_posix(), suffix) for (dirPath, suffix) in allowedDirSuffix] listIncludesCmd = cmd + ["-M"] threadOutput = [] retVal = InvokeSubprocess(listIncludesCmd, None, threadOutput, linuxLibraryPath=listIncludesCmd[0], expectNoOutput=False) - assert retVal == 0, "Could not list includes of {0} with cmd {1} because:\n {2}".format(path, listIncludesCmd, threadOutput) + assert retVal == 0, "Could not list includes of {0} with cmd {1} because:\n {2}".format(path, listIncludesCmd, "\n".join(threadOutput)) - includedPaths = set() + includedFilesStr = set() for line in threadOutput[0].split("\n")[1:]: - includedPaths |= {pathlib.Path(line.strip(" \n\r\t\\/"))} - includedPaths -= {path.with_suffix(".hlsl")} - includedPaths -= {path.with_suffix(".hlsli")} - - # On windows, make sure that shadersCleanPath is also interpreted in the same way as hlsiStr via as_posix() otherwise - # use of a drive mapping may cause errors. 
- shadersCleanPath = pathlib.Path(shadersCleanStr) - shadersCleanStrPosix = str(shadersCleanPath.resolve().as_posix()) - - for hlsli in includedPaths: - hlsliStr = str(hlsli.resolve().as_posix()) - if hlsli.suffix != ".hlsli": - print("GPURT clean shader validation failed:") - print("\tIncluding non-hlsli files is not allowed.") - print("\t{0} includes {1}".format(path, hlsliStr)) - return False - - if shadersCleanStrPosix not in hlsliStr: + # use resolve() + as_posix() to avoid path mismatches when using drive mapping + includedFilesStr |= {pathlib.Path(line.strip(" \n\r\t\\/")).resolve().as_posix()} + includedFilesStr -= {path.as_posix() + implSuffix} + includedFilesStr -= {path.as_posix() + interfaceSuffix} + + for includedFileStr in includedFilesStr: + isAllowed = False + for (dirStr, suffix) in allowedDirSuffix: + if (includedFileStr.endswith(suffix)) and (dirStr in includedFileStr): + isAllowed = True + break + + if not isAllowed: print("GPURT clean shader validation failed:") - print("\tIncluding non-clean files is not allowed.") - print("\t{0} includes {1}".format(path, hlsliStr)) + print("\t{0} includes {1} which is not allowed.".format(path, includedFileStr)) return False return True @@ -301,12 +303,12 @@ def validateIncludes(cmd: List[str], path: pathlib.Path, shadersCleanStr: str) - hlsl-hlsli pairs must compile on its own. It tests whether pairs contain or include everything needed. If they do it allows including them anywhere in any order, except for some macros. 
""" -def validateCompilation(cmd: List[str], path: pathlib.Path, shadersCleanStr: str) -> bool: +def validateCompilation(cmd: [str], path: pathlib.Path) -> bool: threadOutput = [] retVal = InvokeSubprocess(cmd, None, threadOutput, linuxLibraryPath=cmd[0], expectNoOutput=False) if retVal != 0: print("GPURT clean shader validation failed:") - print("\tCould not compile {0} as library with cmd {1} because:\n {2}".format(path, cmd, threadOutput)) + print("\tCould not compile {0} as library with cmd {1} because:\n {2}".format(path, cmd, threadOutput[-1])) return False return True @@ -318,17 +320,20 @@ def validateCompilation(cmd: List[str], path: pathlib.Path, shadersCleanStr: str """ def validateShadersClean(args) -> bool: cmdBase = getValidationCmdArgs(args) - shadersCleanPath = pathlib.Path(FixInputPath(args.basepath)).parent.as_posix() + "/shadersClean" - shadersCleanStr = str(shadersCleanPath) + # use resolve() + as_posix() to avoid path mismatches when using drive mapping + srcPath = pathlib.Path(FixInputPath(args.basepath)).parent.resolve() + shadersCleanPath = srcPath / "shadersClean" - for path, (hasImpl, hasHeader) in getHlslHlsliPairs(shadersCleanPath).items(): + implExt = ".hlsl" + headerExt = ".hlsli" + for path, (hasImpl, hasHeader) in getImplInterfacePairs(shadersCleanPath, implExt, headerExt).items(): assert (hasImpl or hasHeader), "There should not be files without impl nor header." 
- fullPath = path.with_suffix(".hlsl" if hasImpl else ".hlsli") + fullPath = path.with_suffix(path.suffix + (implExt if hasImpl else headerExt)) for defines in getDefineCombos(fullPath): - compileCmd = cmdBase + defines + [str(fullPath.as_posix())] - if not validateIncludes(compileCmd, fullPath, shadersCleanStr): + compileCmd = cmdBase + defines + [fullPath.as_posix()] + if not validateIncludes(compileCmd, path, implExt, headerExt, [(shadersCleanPath, headerExt)]): return False - if not validateCompilation(compileCmd, fullPath, shadersCleanStr): + if not validateCompilation(compileCmd, fullPath): return False return True @@ -760,12 +765,20 @@ def main() -> int: args = parser.parse_args() if args.validateShadersClean: - print("Validating shadersClean directory") + print("Validating shadersClean directory.") tBegin = time.perf_counter() + validIncludes = validateShadersClean(args) + # For vulkan, we validate SPIR-V shaders in the same run instead of running the script again. + if args.vulkan and not args.spirv: + print("Now doing SPIR-V validation...") + args.spirv = True + validIncludes &= validateShadersClean(args) + tDuration = time.perf_counter() - tBegin if validIncludes: - print("Validated shadersClean directory in ", round(tDuration, 4)) + tDuration = round(time.perf_counter() - tBegin, 4) + print(f"Validated shadersClean directory in {tDuration}s.") else: print("Some files are not clean. See errors above.") return 0 if validIncludes else -1