diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5be2de9..ea92db7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -115,6 +115,10 @@ if (DEFINED GPURT_CLIENT_INTERFACE_MAJOR_VERSION)
     gpurt_add_compile_definitions(GPURT_CLIENT_INTERFACE_MAJOR_VERSION=${GPURT_CLIENT_INTERFACE_MAJOR_VERSION})
 endif()
 
+if (DEFINED PAL_CLIENT_INTERFACE_MAJOR_VERSION)
+    gpurt_add_compile_definitions(PAL_CLIENT_INTERFACE_MAJOR_VERSION=${PAL_CLIENT_INTERFACE_MAJOR_VERSION})
+endif()
+
 ### Add Source Directories
 target_include_directories(gpurt PUBLIC .)
 target_include_directories(gpurt_internal PUBLIC .)
diff --git a/cmake/GpuRtGenerateShaders.cmake b/cmake/GpuRtGenerateShaders.cmake
index faf053c..4654fa0 100644
--- a/cmake/GpuRtGenerateShaders.cmake
+++ b/cmake/GpuRtGenerateShaders.cmake
@@ -99,6 +99,8 @@ list(APPEND gpurtSharedDependencies
     ${gpurtCompileScript}
 )
 
+set(RT_SHADER_VALIDATION_COMMAND "")
+
 # Create custom command that outputs the generated BVH shaders
 # The generated shaders depend on all the above mentioned files
 if(GPURT_CLIENT_API STREQUAL "VULKAN")
@@ -130,16 +132,7 @@ if(GPURT_CLIENT_API STREQUAL "VULKAN")
         ${gpurtStripWhitelist}
         ${gpurtDxcCompiler}
         ${gpurtSpirvRemap}
-
-        COMMAND Python3::Interpreter "${gpurtCompileScript}"
-            --outputDir "${gpurtOutputDir}"
-            --validateShadersClean
-            ${COMPILER_ARGUMENT}
-            --defines "\"${gpurtDefines}\""
-            --includePaths "\"${gpurtIncludeDirectories}\""
-            "${gpurtDxilBvhShader}"
-            "${gpurtShadersSourceDir}"
-            "${gpurtSscStrict}"
+        COMMAND ${RT_SHADER_VALIDATION_COMMAND}
 
         COMMAND Python3::Interpreter "${gpurtCompileScript}"
             --vulkan
diff --git a/cmake/GpurtOptionsCodegen.cmake b/cmake/GpurtOptionsCodegen.cmake
index 1910f65..9d6d87e 100644
--- a/cmake/GpurtOptionsCodegen.cmake
+++ b/cmake/GpurtOptionsCodegen.cmake
@@ -53,4 +53,5 @@ add_custom_target(generate_gpurtOptions_h
 )
 
 target_include_directories(gpurt PUBLIC ${OUTDIR})
+target_sources(gpurt INTERFACE ${GPURTOPTIONS_OUTPUT})
 
diff --git a/gpurt/gpurtAccelStruct.h b/gpurt/gpurtAccelStruct.h
index 6468547..3b35b30 100644
--- a/gpurt/gpurtAccelStruct.h
+++ b/gpurt/gpurtAccelStruct.h
@@ -94,6 +94,8 @@ struct AccelStructMetadataHeader
     // numTasksDone can be reset in one 64 bit CP write.
     uint32 numTasksDone;   // Number of tasks done
     uint32 reserved0[16];  // Reserved
+    uint32 reserved1[3];   // Reserved
+    uint32 reserved2[3];   // Reserved
 };
 
 #define ACCEL_STRUCT_METADATA_VA_LO_OFFSET          0
@@ -102,7 +104,9 @@ struct AccelStructMetadataHeader
 #define ACCEL_STRUCT_METADATA_TASK_COUNTER_OFFSET   12
 #define ACCEL_STRUCT_METADATA_NUM_TASKS_DONE_OFFSET 16
 #define ACCEL_STRUCT_METADATA_RESERVED_0            20
-#define ACCEL_STRUCT_METADATA_HEADER_SIZE           84
+#define ACCEL_STRUCT_METADATA_RESERVED_1            84
+#define ACCEL_STRUCT_METADATA_RESERVED_2            96
+#define ACCEL_STRUCT_METADATA_HEADER_SIZE           108
 GPURT_STATIC_ASSERT(ACCEL_STRUCT_METADATA_HEADER_SIZE == sizeof(AccelStructMetadataHeader),
     "Acceleration structure header mismatch");
 
 #ifdef __cplusplus
@@ -110,6 +114,7 @@ GPURT_STATIC_ASSERT(ACCEL_STRUCT_METADATA_VA_LO_OFFSET == offsetof(AccelStructMe
 GPURT_STATIC_ASSERT(ACCEL_STRUCT_METADATA_VA_HI_OFFSET == offsetof(AccelStructMetadataHeader, addressHi), "");
 GPURT_STATIC_ASSERT(ACCEL_STRUCT_METADATA_SIZE_OFFSET == offsetof(AccelStructMetadataHeader, sizeInBytes), "");
 GPURT_STATIC_ASSERT(ACCEL_STRUCT_METADATA_TASK_COUNTER_OFFSET == offsetof(AccelStructMetadataHeader, taskCounter), "");
+GPURT_STATIC_ASSERT(ACCEL_STRUCT_METADATA_NUM_TASKS_DONE_OFFSET == offsetof(AccelStructMetadataHeader, numTasksDone), "");
 #endif
 
 #ifdef __cplusplus
diff --git a/gpurt/gpurtBuildSettings.h b/gpurt/gpurtBuildSettings.h
index 8482537..cce62a4 100644
--- a/gpurt/gpurtBuildSettings.h
+++ b/gpurt/gpurtBuildSettings.h
@@ -99,6 +99,7 @@ struct CompileTimeBuildSettings
     uint32 unused11;
     uint32 unused12;
     uint32 unused13;
+    uint32 rebuildAccelStruct;
 };
 
 #define BUILD_SETTINGS_DATA_TOP_LEVEL_BUILD_ID 0
@@ -134,6 +135,7 @@ struct CompileTimeBuildSettings
 #define BUILD_SETTINGS_DATA_ENCODE_ARRAY_OF_POINTERS_ID 41
 #define BUILD_SETTINGS_DATA_SCENE_BOUNDS_CALCULATION_TYPE_ID 42
 #define BUILD_SETTINGS_DATA_REBRAID_QUALITY_HEURISTIC_ID 43
+#define BUILD_SETTINGS_DATA_REBUILD_ACCELERATION_STRUCTURE_ID 47
 
 #ifdef __cplusplus
 } // namespace GpuRt
diff --git a/src/gpurtBvhBatcher.cpp b/src/gpurtBvhBatcher.cpp
index 674eb65..08ee242 100644
--- a/src/gpurtBvhBatcher.cpp
+++ b/src/gpurtBvhBatcher.cpp
@@ -24,12 +24,14 @@
  **********************************************************************************************************************/
 
 #include "palCmdBuffer.h"
+#include "palHashMapImpl.h"
 #include "palMetroHash.h"
 #include "palVectorImpl.h"
 
 #include "gpurt/gpurt.h"
 #include "gpurt/gpurtLib.h"
 #include "gpurt/gpurtAccelStruct.h"
+#include "gpurt/gpurtInlineFuncs.h"
 #include "gpurtInternal.h"
 #include "gpurtInternalShaderBindings.h"
 #include "gpurtBvhBatcher.h"
diff --git a/src/gpurtBvhBuilder.cpp b/src/gpurtBvhBuilder.cpp
index 060ee74..feb9e0b 100644
--- a/src/gpurtBvhBuilder.cpp
+++ b/src/gpurtBvhBuilder.cpp
@@ -100,18 +100,9 @@ static VertexFormatInfo VertexFormatInfoTable[] =
     { Pal::ChNumFormat::X8Y8_Unorm, Pal::ChNumFormat::X8_Unorm, 2 },
 };
 
-// =====================================================================================================================
-// Helper structure for encapsulating triangle index buffer information
-struct IndexBufferInfo
-{
-    uint32 format;
-    uint64 byteOffset;
-    uint64 gpuVa;
-};
-
 // =====================================================================================================================
 // Helper function to convert triangle geometry information into index buffer info
-IndexBufferInfo GetIndexBufferInfo(
+IndexBufferInfo BvhBuilder::GetIndexBufferInfo(
     const GeometryTriangles& geometry)
 {
     IndexBufferInfo indexBuffer = {};
@@ -149,7 +140,7 @@ static uint32 DispatchSize(
 
 // =====================================================================================================================
 // Helper function that calculates the block count for input number of triangles
-static uint32 TrianglePairBlockCount(
+uint32 BvhBuilder::TrianglePairBlockCount(
     uint32 numTriangles)
 {
     constexpr uint32 TrianglePairBlockSize = 64;
@@ -454,19 +445,27 @@ BvhBuilder::BvhBuilder(
     const AccelStructBuildInfo& buildInfo)  // Build args
     :
     m_pDevice(pDevice),
-    m_deviceSettings(deviceSettings),
     m_clientCb(clientCb),
+    m_deviceSettings(deviceSettings),
+    m_buildConfig({}),
+    m_resultOffsets({}),
     m_buildArgs(buildInfo),
     m_deviceProps(deviceProps),
+    m_metadataSizeInBytes(0),
     m_cmdBuffer(cmdBuffer),
+    m_scratchOffsets({}),
     m_backend(backend),
     m_buildSettings({}),
+    m_shaderConstantsGpuVa(0ull),
+    m_geomConstSrdTable(0ull),
+    m_geomBufferSrdTable(0ull),
     m_radixSortConfig(GetRadixSortConfig(deviceSettings)),
     m_emitCompactDstGpuVa(0ull),
-    m_buildSettingsHash(0)
+    m_buildSettingsHash(0u),
+    m_resultBufferInfo({}),
+    m_scratchBufferInfo({}),
+    m_dumpInfo({})
 {
-    // Determine if the flags have to be overriden based on the build inputs.
-    m_buildArgs.inputs = m_pDevice->OverrideBuildInputs(m_buildArgs.inputs);
     InitializeBuildConfigs();
 
     {
@@ -491,16 +490,26 @@ BvhBuilder::BvhBuilder(
     const DeviceSettings& deviceSettings)  // Device settings
     :
     m_pDevice(pDevice),
-    m_deviceSettings(deviceSettings),
     m_clientCb(clientCb),
+    m_deviceSettings(deviceSettings),
+    m_buildConfig({}),
+    m_resultOffsets({}),
     m_buildArgs(AccelStructBuildInfo{}),
     m_deviceProps(deviceProps),
+    m_metadataSizeInBytes(0),
     m_cmdBuffer(cmdBuffer),
+    m_scratchOffsets({}),
     m_backend(backend),
     m_buildSettings({}),
+    m_shaderConstantsGpuVa(0ull),
+    m_geomConstSrdTable(0ull),
+    m_geomBufferSrdTable(0ull),
     m_radixSortConfig(GetRadixSortConfig(deviceSettings)),
     m_emitCompactDstGpuVa(0ull),
-    m_buildSettingsHash(0)
+    m_buildSettingsHash(0u),
+    m_resultBufferInfo({}),
+    m_scratchBufferInfo({}),
+    m_dumpInfo({})
 {
     InitCopySettings();
 }
@@ -549,10 +558,16 @@ BvhBuildMode BvhBuilder::OverrideBuildMode(
 // Remapped scratch buffer base address
 bool BvhBuilder::AllowRemappingScratchBuffer() const
 {
+    bool encodeQuadPrimitives = m_buildConfig.enableEarlyPairCompression;
+
+    bool usePrimIndicesArray = false;
+
     return (m_deviceSettings.enableRemapScratchBuffer == true) &&
            (IsUpdate() == false) &&
-           (m_deviceSettings.enableBuildAccelStructScratchDumping == false);
+           (m_deviceSettings.enableBuildAccelStructScratchDumping == false) &&
+           (encodeQuadPrimitives == false) &&
+           (usePrimIndicesArray == false);
 }
 
 // =====================================================================================================================
@@ -587,11 +602,41 @@ uint32 BvhBuilder::CalculateScratchBufferSize(
     return size;
 }
 
+// =====================================================================================================================
+// Calculates the result buffer's metadata size
+uint32 BvhBuilder::CalculateMetadataSize(
+    const uint32 internalNodeSize,
+    const uint32 leafNodeSize,
+    uint32* const pRunningOffset)
+{
+    uint metadataSizeInBytes;
+    {
+        metadataSizeInBytes = CalcMetadataSizeInBytes(internalNodeSize, leafNodeSize);
+        // Align metadata size to cache line
+        metadataSizeInBytes = Util::Pow2Align(metadataSizeInBytes, 128);
+
+        *pRunningOffset += metadataSizeInBytes;
+    }
+
+    return metadataSizeInBytes;
+}
+
 // =====================================================================================================================
 // Calculates the result buffer offsets and returns the total result memory size
 BvhBuilder::ResultBufferInfo BvhBuilder::CalculateResultBufferInfo(
     AccelStructDataOffsets* pOffsets,
-    uint32* pMetadataSizeInBytes)
+    uint32* pMetadataSizeInBytes,
+    uint32 remapScratchBufferSize)
+{
+    return CalculateResultBufferInfoDefault(pOffsets, pMetadataSizeInBytes, remapScratchBufferSize);
+}
+
+// =====================================================================================================================
+// Calculates the result buffer offsets and returns the total result memory size
+BvhBuilder::ResultBufferInfo BvhBuilder::CalculateResultBufferInfoDefault(
+    AccelStructDataOffsets* pOffsets,
+    uint32* pMetadataSizeInBytes,
+    uint32 remapScratchBufferSize)
 {
     ResultBufferInfo info = {};
@@ -623,37 +668,40 @@ BvhBuilder::ResultBufferInfo BvhBuilder::CalculateResultBufferInfo(
     uint32 internalNodeSize = 0;
     uint32 leafNodeSize = 0;
+    uint32 nodeSize = 0;
 
     if (m_buildConfig.maxNumPrimitives > 0)
     {
         internalNodeSize = CalculateInternalNodesSize();
         leafNodeSize = CalculateLeafNodesSize();
+        nodeSize = internalNodeSize + leafNodeSize;
 
         offsets.internalNodes = ReserveBytes(internalNodeSize, &runningOffset);
         offsets.leafNodes = ReserveBytes(leafNodeSize, &runningOffset);
 
+        if (AllowRemappingScratchBuffer() && (remapScratchBufferSize > nodeSize))
+        {
+            ReserveBytes(remapScratchBufferSize - nodeSize, &runningOffset);
+
+            nodeSize = remapScratchBufferSize;
+        }
+
         if (m_buildConfig.topLevelBuild == false)
         {
             const uint32 geometryInfoSize = CalculateGeometryInfoSize(m_buildArgs.inputs.inputElemCount);
 
             offsets.geometryInfo = ReserveBytes(geometryInfoSize, &runningOffset);
         }
 
-        offsets.primNodePtrs = ReserveBytes(m_buildConfig.maxNumPrimitives * sizeof(uint32), &runningOffset);
+        {
+            offsets.primNodePtrs = ReserveBytes(m_buildConfig.maxNumPrimitives * sizeof(uint32), &runningOffset);
+        }
     }
 
     uint32 totalSizeInBytes = runningOffset;
 
     // Metadata section is at the beginning of the acceleration structure buffer
-    uint32 metadataSizeInBytes;
-    {
-        metadataSizeInBytes = CalcMetadataSizeInBytes(internalNodeSize, leafNodeSize);
-        // Align metadata size to cache line
-        metadataSizeInBytes = Util::Pow2Align(metadataSizeInBytes, 128);
-
-        totalSizeInBytes += metadataSizeInBytes;
-    }
-
+    const uint32 metadataSizeInBytes = CalculateMetadataSize(internalNodeSize, leafNodeSize, &totalSizeInBytes);
     if (pOffsets != nullptr)
     {
         memcpy(pOffsets, &offsets, sizeof(offsets));
@@ -664,8 +712,8 @@ BvhBuilder::ResultBufferInfo BvhBuilder::CalculateResultBufferInfo(
         *pMetadataSizeInBytes = metadataSizeInBytes;
     }
 
-    info.baseOffset = metadataSizeInBytes + sizeof(AccelStructHeader);
-    info.nodeSize = internalNodeSize + leafNodeSize;
+    info.baseOffset = sizeof(AccelStructHeader);
+    info.nodeSize = nodeSize;
     info.dataSize = totalSizeInBytes;
     return info;
 }
@@ -1082,7 +1130,6 @@ BvhBuilder::ScratchBufferInfo BvhBuilder::CalculateScratchBufferInfoDefault(
                 neighbourIndices = ReserveBytes(aabbCount * sizeof(uint32), &runningOffset);
                 // TODO: calculate number of blocks based on KEYS_PER_THREAD
                 atomicFlagsPloc = ReserveBytes(aabbCount * RayTracingPLOCFlags, &runningOffset);
-                clusterOffsets = ReserveBytes(aabbCount * sizeof(uint32), &runningOffset);
             }
         }
         bvh2PhaseMaxSize = Util::Max(bvh2PhaseMaxSize, runningOffset);
@@ -1204,6 +1251,24 @@ GeometryType BvhBuilder::GetGeometryType(
     return type;
 }
 
+// =====================================================================================================================
+bool BvhBuilder::ForceRebuild(
+    const Internal::Device* pDevice,
+    const AccelStructBuildInputs inputs)
+{
+    const DeviceSettings settings = pDevice->Settings();
+    const bool rebuildTopLevel =
+        Util::TestAnyFlagSet(settings.forceRebuildForUpdates, ForceRebuildForUpdatesMode::TopLevel) &&
+        (inputs.type == GpuRt::AccelStructType::TopLevel);
+    const bool rebuildBottomLevel =
+        Util::TestAnyFlagSet(settings.forceRebuildForUpdates, ForceRebuildForUpdatesMode::BottomLevel) &&
+        (inputs.type == GpuRt::AccelStructType::BottomLevel);
+
+    bool rebuildAS = rebuildBottomLevel || rebuildTopLevel;
+
+    return rebuildAS;
+}
+
 // =====================================================================================================================
 // Initialize buildConfig
 void BvhBuilder::InitBuildConfig(
@@ -1211,6 +1276,13 @@ void BvhBuilder::InitBuildConfig(
 {
     m_buildConfig = {};
 
+    if (ForceRebuild(m_pDevice, m_buildArgs.inputs))
+    {
+        // Determine if the flags have to be overridden based on the build inputs.
+        m_buildArgs.inputs = m_pDevice->OverrideBuildInputs(m_buildArgs.inputs);
+        m_buildConfig.rebuildAccelStruct = true;
+    }
+
     // For top-level acceleration structure, inputElementCount represents the number of instances
     uint32 primitiveCount = (buildArgs.inputs.type == AccelStructType::BottomLevel) ? 0 : buildArgs.inputs.inputElemCount;
@@ -1272,8 +1344,7 @@ void BvhBuilder::InitBuildConfig(
     m_buildConfig.topDownBuild = m_buildConfig.allowTopDownBuild &&
                                  (buildArgs.inputs.inputElemCount <= m_deviceSettings.maxTopDownBuildInstances);
 
-    if ((Util::TestAnyFlagSet(m_buildArgs.inputs.flags, AccelStructBuildFlagAllowUpdate) == false) &&
-        m_buildConfig.topLevelBuild)
+    if ((UpdateAllowed() == false) && m_buildConfig.topLevelBuild)
     {
         if (m_buildConfig.rebraidType == RebraidType::V1)
         {
@@ -1300,8 +1371,7 @@ void BvhBuilder::InitBuildConfig(
     m_buildConfig.triangleSplitting = (m_deviceSettings.enableParallelBuild) &&
         m_deviceSettings.enableTriangleSplitting &&
         (buildArgs.inputs.type == AccelStructType::BottomLevel) &&
-        (Util::TestAnyFlagSet(buildArgs.inputs.flags, AccelStructBuildFlagAllowUpdate) == false) &&
-        Util::TestAnyFlagSet(buildArgs.inputs.flags, AccelStructBuildFlagPreferFastTrace);
+        (UpdateAllowed() == false) && Util::TestAnyFlagSet(buildArgs.inputs.flags, AccelStructBuildFlagPreferFastTrace);
 
     m_buildConfig.buildMode = OverrideBuildMode(buildArgs);
 
@@ -1384,7 +1454,8 @@ void BvhBuilder::InitBuildConfig(
         (IsUpdate() && (m_deviceSettings.enableMergedEncodeUpdate == 0)) ||
         ((IsUpdate() == false) && (m_deviceSettings.enableMergedEncodeBuild == 0)) ||
-        (buildArgs.inputs.type == AccelStructType::TopLevel) ||
+        ((buildArgs.inputs.type == AccelStructType::TopLevel)
+        ) ||
         ((IsUpdate() == false) && (m_buildConfig.geometryType == GeometryType::Aabbs))
 #if GPURT_CLIENT_INTERFACE_MAJOR_VERSION < 46
         || (buildArgs.inputs.inputElemCount > maxDescriptorTableSize)
@@ -1552,6 +1623,7 @@ AccelStructHeader BvhBuilder::InitAccelStructHeader() const
     header.accelStructVersion = GPURT_ACCEL_STRUCT_VERSION;
     header.metadataSizeInBytes = m_metadataSizeInBytes;
     header.sizeInBytes = accelStructSize;
+    header.compactedSizeInBytes = accelStructSize;
     header.numPrimitives = m_buildConfig.maxNumPrimitives; // Is this correct?
     header.numDescs = m_buildArgs.inputs.inputElemCount;
     header.geometryType = static_cast<uint32>(m_buildConfig.geometryType);
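Reviewer note: the new ForceRebuild helper reduces to a per-type flag test against one settings bitmask. A minimal standalone sketch of that logic, assuming illustrative flag values (the real enum lives in GPURT's settings headers):

```cpp
#include <cstdint>

// Hypothetical stand-ins for the GPURT types; the bit values are assumptions.
enum ForceRebuildForUpdatesMode : uint32_t { TopLevel = 0x1, BottomLevel = 0x2 };
enum class AccelStructType { TopLevel, BottomLevel };

static bool TestAnyFlagSet(uint32_t flags, uint32_t mask) { return (flags & mask) != 0; }

// Mirrors the helper: rebuild when the mode bit matching the AS type is set.
static bool ForceRebuild(uint32_t forceRebuildForUpdates, AccelStructType type)
{
    const bool top    = TestAnyFlagSet(forceRebuildForUpdates, TopLevel) &&
                        (type == AccelStructType::TopLevel);
    const bool bottom = TestAnyFlagSet(forceRebuildForUpdates, BottomLevel) &&
                        (type == AccelStructType::BottomLevel);
    return top || bottom;
}
```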
@@ -1823,9 +1895,9 @@ void BvhBuilder::InitAccelerationStructure()
         }
     }
 
-    // Merged encode/build writes the header using the shader. However, we don't launch the build shader in the case
-    // of an empty BVH.
-    if (m_buildConfig.needEncodeDispatch || (m_buildConfig.maxNumPrimitives == 0))
+    // Merged encode/build and BuildParallel write the header using the shader.
+    // However, we don't launch the build shader in the case of an empty BVH.
+    if ((m_buildConfig.maxNumPrimitives == 0) || (m_deviceSettings.enableParallelBuild == false))
     {
         WriteImmediateData(HeaderBufferBaseVa(), InitAccelStructMetadataHeader());
         WriteImmediateData(ResultBufferBaseVa(), InitAccelStructHeader());
@@ -2176,6 +2248,7 @@ void BvhBuilder::InitBuildSettings()
     m_buildSettings.updateFlags = m_buildArgs.inputs.flags &
                                   (AccelStructBuildFlagPerformUpdate | AccelStructBuildFlagAllowUpdate);
+    m_buildSettings.rebuildAccelStruct = m_buildConfig.rebuildAccelStruct;
     m_buildSettings.isUpdateInPlace = IsUpdateInPlace();
 
     m_buildSettings.encodeArrayOfPointers =
@@ -2215,19 +2288,26 @@ void BvhBuilder::GetAccelerationStructurePrebuildInfo(
 
     AccelStructPrebuildInfo prebuildInfo = {};
 
-    // Calculate the amount of space needed to store the result.
-    const ResultBufferInfo resultBufferInfo = CalculateResultBufferInfo(nullptr, nullptr);
-    const uint32 resultDataSize = resultBufferInfo.dataSize;
-
     // Calculate the amount of scratch space needed during the construction process.
     const ScratchBufferInfo scratchBufferInfo = CalculateScratchBufferInfo(nullptr);
+
+    // Calculate the amount of space needed to store the result.
+    const ResultBufferInfo resultBufferInfo = CalculateResultBufferInfo(nullptr, nullptr, scratchBufferInfo.bvh2PhaseSize);
+
     uint32 scratchDataSize = CalculateScratchBufferSize(resultBufferInfo, scratchBufferInfo);
+    const uint32 resultDataSize = resultBufferInfo.dataSize;
 
     uint32 updateDataSize = 0;
-    if (UpdateAllowed())
+    if (m_buildConfig.rebuildAccelStruct)
+    {
+        // When we force a rebuild, updates of the acceleration structure should use the build scratch data size
+        updateDataSize = Util::Max(1u, scratchDataSize);
+    }
+    else if (UpdateAllowed())
     {
         updateDataSize = Util::Max(1u, CalculateUpdateScratchBufferInfo(nullptr));
     }
+
     // Scratch size for builds may be smaller than updates, some apps will still try to use the scratch size from
     // the build when performing the update causing page faults.
     scratchDataSize = Util::Max(scratchDataSize, updateDataSize);
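Reviewer note: the Util::Max above is the crux of the prebuild contract — the reported build scratch size must never be smaller than the update scratch size, because some applications bind the build-time scratch allocation when updating. A reduced sketch of the sizing rule, with simplified types that are not the actual GPURT signature:

```cpp
#include <algorithm>
#include <cstdint>
#include <utility>

// Returns {buildScratch, updateScratch}, honoring the forced-rebuild path.
static std::pair<uint32_t, uint32_t> SizeScratch(uint32_t buildScratch,
                                                 uint32_t updateScratch,
                                                 bool rebuildAccelStruct,
                                                 bool updateAllowed)
{
    uint32_t updateSize = 0;
    if (rebuildAccelStruct)
    {
        updateSize = std::max<uint32_t>(1u, buildScratch); // rebuilds update with build scratch
    }
    else if (updateAllowed)
    {
        updateSize = std::max<uint32_t>(1u, updateScratch);
    }

    // Guard against apps reusing the build scratch size for updates.
    return { std::max(buildScratch, updateSize), updateSize };
}
```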
@@ -2380,8 +2460,26 @@ void BvhBuilder::PreBuildDumpEvents()
 {
     PAL_ASSERT(HasBuildDumpEvents());
 
-    const ResultBufferInfo resultBufferInfo = CalculateResultBufferInfo(nullptr, nullptr);
-    const uint32 resultDataSize = resultBufferInfo.dataSize;
+    uint32 scratchDataSize;
+    uint32 resultDataSize;
+
+    if (IsUpdate() == false)
+    {
+        const ScratchBufferInfo scratchBufferInfo = CalculateScratchBufferInfo(nullptr);
+
+        const ResultBufferInfo resultBufferInfo = CalculateResultBufferInfo(nullptr, nullptr, scratchBufferInfo.bvh2PhaseSize);
+
+        scratchDataSize = CalculateScratchBufferSize(resultBufferInfo, scratchBufferInfo);
+
+        resultDataSize = resultBufferInfo.dataSize;
+    }
+    else
+    {
+        scratchDataSize = CalculateUpdateScratchBufferInfo(nullptr);
+
+        const ResultBufferInfo resultBufferInfo = CalculateResultBufferInfo(nullptr, nullptr, 0);
+        resultDataSize = resultBufferInfo.dataSize;
+    }
 
     // Initialise acceleration structure information for dump purposes
     m_dumpInfo = {};
@@ -2397,19 +2495,10 @@ void BvhBuilder::PreBuildDumpEvents()
     m_dumpInfo.gpuVa = HeaderBufferBaseVa();
     m_dumpInfo.sizeInBytes = resultDataSize;
     m_dumpInfo.scratchGpuVa = ScratchBufferBaseVa();
+    m_dumpInfo.scratchSizeInBytes = scratchDataSize;
     m_dumpInfo.pTimeStampVidMem = nullptr;
     m_dumpInfo.timeStampVidMemoffset = 0;
 
-    if (IsUpdate() == false)
-    {
-        const ScratchBufferInfo scratchBufferInfo = CalculateScratchBufferInfo(nullptr);
-        m_dumpInfo.scratchSizeInBytes = CalculateScratchBufferSize(resultBufferInfo, scratchBufferInfo);
-    }
-    else
-    {
-        m_dumpInfo.scratchSizeInBytes = CalculateUpdateScratchBufferInfo(nullptr);
-    }
-
     if (m_deviceSettings.enableBuildAccelStructStats)
     {
 #if GPURT_CLIENT_INTERFACE_MAJOR_VERSION >= 39
@@ -2509,18 +2598,20 @@ void BvhBuilder::InitializeBuildConfigs()
     InitBuildConfig(m_buildArgs);
     InitBuildSettings();
 
-    m_resultBufferInfo = CalculateResultBufferInfo(&m_resultOffsets, &m_metadataSizeInBytes);
-
-    m_scratchBufferInfo = {};
     if (IsUpdate() == false)
     {
         // Compute the offsets into the scratch buffer for all of our scratch resources.
         m_scratchBufferInfo = CalculateScratchBufferInfo(&m_scratchOffsets);
+
+        m_resultBufferInfo = CalculateResultBufferInfo(&m_resultOffsets, &m_metadataSizeInBytes, m_scratchBufferInfo.bvh2PhaseSize);
     }
     else
     {
         // Compute the offsets into the scratch buffer for all of our scratch resources.
+        m_scratchBufferInfo = {};
         CalculateUpdateScratchBufferInfo(&m_scratchOffsets);
+
+        m_resultBufferInfo = CalculateResultBufferInfo(&m_resultOffsets, &m_metadataSizeInBytes, 0);
     }
 
     // Add tlas to m_tlas
@@ -3487,46 +3578,46 @@ void BvhBuilder::UpdateParallel()
 // Perform geometry encoding and update in one dispatch
 void BvhBuilder::EncodeUpdate()
 {
-    BindPipeline((m_buildConfig.geometryType == GeometryType::Triangles) ?
-        InternalRayTracingCsType::UpdateTriangles : InternalRayTracingCsType::UpdateAabbs);
+    uint32 numThreadGroups = 0;
 
-    const uint32 threadGroupSize = DefaultThreadGroupSize;
-    const uint32 wavesPerSimd = 8;
-    uint32 numThreadGroups = 0;
-    {
-        const uint32 numWorkItems = Util::Max(1u, m_buildConfig.numPrimitives);
-        numThreadGroups = GetNumPersistentThreadGroups(numWorkItems, threadGroupSize, wavesPerSimd);
-    }
-    const uint32 numThreads = numThreadGroups * threadGroupSize;
+    {
+        BindPipeline((m_buildConfig.geometryType == GeometryType::Triangles) ?
+            InternalRayTracingCsType::UpdateTriangles : InternalRayTracingCsType::UpdateAabbs);
 
-    uint32 entryOffset = 0;
+        const uint32 threadGroupSize = DefaultThreadGroupSize;
+        const uint32 wavesPerSimd = 8;
+        const uint32 numWorkItems = Util::Max(1u, m_buildConfig.numPrimitives);
+        const uint32 numThreadGroups = GetNumPersistentThreadGroups(numWorkItems, threadGroupSize, wavesPerSimd);
+        const uint32 numThreads = numThreadGroups * threadGroupSize;
 
-    const Update::Constants shaderConstants =
-    {
-        .numThreads = numThreads,
-    };
+        uint32 entryOffset = 0;
 
-    // Set shader constants
-    entryOffset = WriteUserDataEntries(&shaderConstants, Update::NumEntries, entryOffset);
+        const Update::Constants shaderConstants =
+        {
+            .numThreads = numThreads,
+        };
 
-    entryOffset = WriteBuildShaderConstantBuffer(entryOffset);
+        // Set shader constants
+        entryOffset = WriteUserDataEntries(&shaderConstants, Update::NumEntries, entryOffset);
 
-    // Set result/scratch/source buffers
-    entryOffset = WriteUpdateBuffers(entryOffset);
+        entryOffset = WriteBuildShaderConstantBuffer(entryOffset);
 
-    const uint32 cbvSrdTableGpuVaLo = Util::LowPart(m_geomConstSrdTable);
-    entryOffset = WriteUserDataEntries(&cbvSrdTableGpuVaLo, 1, entryOffset);
+        // Set result/scratch/source buffers
+        entryOffset = WriteUpdateBuffers(entryOffset);
 
-    const uint32 vbvSrdTableGpuVaLo = Util::LowPart(m_geomBufferSrdTable);
-    entryOffset = WriteUserDataEntries(&vbvSrdTableGpuVaLo, 1, entryOffset);
+        const uint32 cbvSrdTableGpuVaLo = Util::LowPart(m_geomConstSrdTable);
+        entryOffset = WriteUserDataEntries(&cbvSrdTableGpuVaLo, 1, entryOffset);
 
-    // NullBuffer binding
-    entryOffset = WriteBufferVa(0, entryOffset);
+        const uint32 vbvSrdTableGpuVaLo = Util::LowPart(m_geomBufferSrdTable);
+        entryOffset = WriteUserDataEntries(&vbvSrdTableGpuVaLo, 1, entryOffset);
 
-    RGP_PUSH_MARKER("Update (NumPrimitives=%u)", m_buildConfig.maxNumPrimitives);
-    Dispatch(numThreadGroups);
+        // NullBuffer binding
+        entryOffset = WriteBufferVa(0, entryOffset);
 
-    RGP_POP_MARKER();
+        RGP_PUSH_MARKER("Update (NumPrimitives=%u)", m_buildConfig.maxNumPrimitives);
+        Dispatch(numThreadGroups);
+        RGP_POP_MARKER();
+    }
 }
 
 // =====================================================================================================================
diff --git a/src/gpurtBvhBuilder.h b/src/gpurtBvhBuilder.h
index 876a7ec..4af762f 100644
--- a/src/gpurtBvhBuilder.h
+++ b/src/gpurtBvhBuilder.h
@@ -36,6 +36,15 @@ namespace EncodeNodes
     struct Constants;
 }
 
+// =====================================================================================================================
+// Helper structure for encapsulating triangle index buffer information
+struct IndexBufferInfo
+{
+    uint32 format;
+    uint64 byteOffset;
+    uint64 gpuVa;
+};
+
 // =====================================================================================================================
 // Helper class used by GPURT to perform various BVH operations like building, copying, etc.
 class BvhBuilder
 {
@@ -66,6 +75,11 @@ class BvhBuilder
     static uint32 CalculateGeometryInfoSize(
         uint32 numGeometryDescs);
 
+    // Helper function for when to perform a rebuild
+    static bool ForceRebuild(
+        const Internal::Device* pDevice,
+        const AccelStructBuildInputs inputs);
+
     // Builds or updates an acceleration structure and stores it in a result buffer
     void BuildRaytracingAccelerationStructure();
 
@@ -106,14 +120,12 @@ class BvhBuilder
 
     ResultBufferInfo CalculateResultBufferInfo(
         AccelStructDataOffsets* pOffsets,
-        uint32* pMetadataSizeInBytes);
+        uint32* pMetadataSizeInBytes,
+        uint remapScratchBufferSize);
 
     ScratchBufferInfo CalculateScratchBufferInfo(
         RayTracingScratchDataOffsets* pOffsets);
 
-    ScratchBufferInfo CalculateScratchBufferInfoDefault(
-        RayTracingScratchDataOffsets* pOffsets);
-
     uint32 CalculateUpdateScratchBufferInfo(
         RayTracingScratchDataOffsets* pOffsets);
 
@@ -169,6 +181,12 @@ class BvhBuilder
     static uint32 GetGeometryPrimCount(
         const Geometry& geometry);
 
+    static IndexBufferInfo GetIndexBufferInfo(
+        const GeometryTriangles& geometry);
+
+    static uint32 TrianglePairBlockCount(
+        uint32 numTriangles);
+
 private:
 
     // Configs that change within build calls, private to the bvh builder.
@@ -205,6 +223,7 @@ class BvhBuilder
         bool enableFastLBVH;
         bool enableMergeSort;
         bool enableInstanceRebraid;
+        bool rebuildAccelStruct;
     };
 
     BvhBuilder(
@@ -213,6 +232,19 @@ class BvhBuilder
         ClientCallbacks clientCb,
        const DeviceSettings& deviceSettings);
 
+    uint32 CalculateMetadataSize(
+        const uint32 internalNodeSize,
+        const uint32 leafNodeSize,
+        uint32* const pRunningOffset);
+
+    ResultBufferInfo CalculateResultBufferInfoDefault(
+        AccelStructDataOffsets* pOffsets,
+        uint32* pMetadataSizeInBytes,
+        uint remapScratchBufferSize);
+
+    ScratchBufferInfo CalculateScratchBufferInfoDefault(
+        RayTracingScratchDataOffsets* pOffsets);
+
     uint32 CalculateInternalNodesSize()const;
     uint32 CalculateLeafNodesSize() const;
     uint32 CalculateNodesSize() const;
diff --git a/src/gpurtDevice.cpp b/src/gpurtDevice.cpp
index ce6f940..b1368f0 100644
--- a/src/gpurtDevice.cpp
+++ b/src/gpurtDevice.cpp
@@ -2128,16 +2128,7 @@ const AccelStructBuildInputs Device::OverrideBuildInputs(
     ) const
 {
     AccelStructBuildInputs buildInputs = inputs;
-
-    const bool rebuildTopLevel =
-        (
-        Util::TestAnyFlagSet(Settings().forceRebuildForUpdates, ForceRebuildForUpdatesMode::TopLevel)) &&
-        (buildInputs.type == GpuRt::AccelStructType::TopLevel);
-    const bool rebuildBottomLevel =
-        Util::TestAnyFlagSet(Settings().forceRebuildForUpdates, ForceRebuildForUpdatesMode::BottomLevel) &&
-        (buildInputs.type == GpuRt::AccelStructType::BottomLevel);
-
-    bool rebuildAS = rebuildBottomLevel || rebuildTopLevel;
+    const bool rebuildAS = BvhBuilder::ForceRebuild(this, inputs);
 
     if (rebuildAS)
     {
diff --git a/src/gpurtInternalShaderBindings.h b/src/gpurtInternalShaderBindings.h
index 4b26c92..1785233 100644
--- a/src/gpurtInternalShaderBindings.h
+++ b/src/gpurtInternalShaderBindings.h
@@ -82,6 +82,16 @@ namespace CopyAS
     constexpr uint32 NumEntries = (sizeof(Constants) / sizeof(uint32));
 }
 
+namespace BuildTrivialBvh
+{
+    struct Constants
+    {
+        uint32 maxGeometryCount;
+    };
+
+    constexpr uint32 NumEntries = (sizeof(Constants) / sizeof(uint32));
+}
+
 namespace CompactAS
 {
     struct Constants
diff --git a/src/shaders/BuildBVHPLOC.hlsl b/src/shaders/BuildBVHPLOC.hlsl
index abb971e..effeb80 100644
--- a/src/shaders/BuildBVHPLOC.hlsl
+++ b/src/shaders/BuildBVHPLOC.hlsl
@@ -56,7 +56,6 @@ struct BuildPlocArgs
     uint currentStateScratchOffset;
     uint taskQueueCounterScratchOffset;
     uint atomicFlagsScratchOffset;
-    uint offsetsScratchOffset;
     uint dynamicBlockIndexScratchOffset;
     uint numBatchesScratchOffset;
     uint baseBatchIndicesScratchOffset;
@@ -878,7 +877,6 @@ void BuildBVHPLOC(
     plocArgs.currentStateScratchOffset = ShaderConstants.offsets.currentState;
     plocArgs.taskQueueCounterScratchOffset = ShaderConstants.offsets.plocTaskQueueCounter;
     plocArgs.atomicFlagsScratchOffset = ShaderConstants.offsets.atomicFlagsPloc;
-    plocArgs.offsetsScratchOffset = ShaderConstants.offsets.clusterOffsets;
     plocArgs.dynamicBlockIndexScratchOffset = ShaderConstants.offsets.dynamicBlockIndex;
     plocArgs.numBatchesScratchOffset = ShaderConstants.offsets.numBatches;
     plocArgs.baseBatchIndicesScratchOffset = ShaderConstants.offsets.batchIndices;
diff --git a/src/shaders/BuildBVHTDTR.hlsl b/src/shaders/BuildBVHTDTR.hlsl
index e9d568d..ed21d28 100644
--- a/src/shaders/BuildBVHTDTR.hlsl
+++ b/src/shaders/BuildBVHTDTR.hlsl
@@ -984,23 +984,13 @@ void BuildBVHTDImpl(
     uint numRefsAllocated = ScratchBuffer.Load(numRefsAllocatedOffset);
 
     if (globalId == 0)
     {
-        UintBoundingBox sceneBounds;
-
-        uint4 data;
-        data = ScratchBuffer.Load4(args.SceneBoundsOffset); // todo: recalc based on ACTIVE nodes
-        sceneBounds.min = data.xyz;
-        data.xy = ScratchBuffer.Load2(args.SceneBoundsOffset + 0x10);
-        sceneBounds.max = data.wxy;
-
-        BoundingBox bbox;
-        bbox.min = Uint3ToFloat3(sceneBounds.min);
-        bbox.max = Uint3ToFloat3(sceneBounds.max);
+        BoundingBox bbox = FetchSceneBounds(args.SceneBoundsOffset); // todo: recalc based on ACTIVE nodes
 
         BoundingBox bboxCentroid;
         UintBoundingBox boxCentroidUint;
-        data = ScratchBuffer.Load4(args.CurrentStateScratchOffset + STATE_TD_CENTROID_BBOX_OFFSET);
+        uint4 data = ScratchBuffer.Load4(args.CurrentStateScratchOffset + STATE_TD_CENTROID_BBOX_OFFSET);
         boxCentroidUint.min = data.xyz;
         data.xy = ScratchBuffer.Load2(args.CurrentStateScratchOffset + STATE_TD_CENTROID_BBOX_OFFSET + 0x10);
         boxCentroidUint.max = data.wxy;
diff --git a/src/shaders/BuildCommonScratch.hlsl b/src/shaders/BuildCommonScratch.hlsl
index 579920d..cd54497 100644
--- a/src/shaders/BuildCommonScratch.hlsl
+++ b/src/shaders/BuildCommonScratch.hlsl
@@ -591,6 +591,28 @@ float2 FetchSceneSize(uint sceneBoundsOffset)
     return minMax;
 }
 
+//=====================================================================================================================
+void InitSceneBounds(uint sceneBoundsOffset)
+{
+    // Initialize scene bounds
+    const uint maxVal = FloatToUint(FLT_MAX);
+    const uint minVal = FloatToUint(-FLT_MAX);
+
+    ScratchBuffer.Store3(sceneBoundsOffset, maxVal.xxx);
+    sceneBoundsOffset += sizeof(uint3);
+    ScratchBuffer.Store3(sceneBoundsOffset, minVal.xxx);
+    sceneBoundsOffset += sizeof(uint3);
+    ScratchBuffer.Store2(sceneBoundsOffset, uint2(maxVal, minVal));
+    sceneBoundsOffset += sizeof(uint2);
+
+    if (Settings.rebraidType == RebraidType::V2)
+    {
+        ScratchBuffer.Store3(sceneBoundsOffset, maxVal.xxx);
+        sceneBoundsOffset += sizeof(uint3);
+        ScratchBuffer.Store3(sceneBoundsOffset, minVal.xxx);
+    }
+}
+
 //======================================================================================================================
 uint GetBvhNodesOffset(
     uint numActivePrims,
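Reviewer note on the new InitSceneBounds helper: the bounds are stored as uints so later passes can shrink/grow them with plain integer atomics, which only works if FloatToUint is order-preserving. The usual encoding (an assumption here; GPURT's exact mapping may differ in detail) flips negative values:

```cpp
#include <cstdint>
#include <cstring>

// Map a float to a uint whose unsigned ordering matches the float ordering,
// so InterlockedMin/Max on the uint behaves like min/max on the float.
static uint32_t FloatToSortableUint(float f)
{
    uint32_t u;
    std::memcpy(&u, &f, sizeof(u));
    // Negative floats sort in reverse as raw bits; invert them, and set the
    // sign bit on non-negatives so they land above all negatives.
    return (u & 0x80000000u) ? ~u : (u | 0x80000000u);
}
```

With such an encoding, seeding the min slots with +FLT_MAX and the max slots with -FLT_MAX (as the shader does) guarantees the first atomic update wins.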
diff --git a/src/shaders/BuildParallel.hlsl b/src/shaders/BuildParallel.hlsl
index 73084ba..e557003 100644
--- a/src/shaders/BuildParallel.hlsl
+++ b/src/shaders/BuildParallel.hlsl
@@ -260,7 +260,6 @@ void BuildBvhPloc(
     plocArgs.currentStateScratchOffset = ShaderConstants.offsets.currentState;
     plocArgs.taskQueueCounterScratchOffset = ShaderConstants.offsets.plocTaskQueueCounter;
     plocArgs.atomicFlagsScratchOffset = ShaderConstants.offsets.atomicFlagsPloc;
-    plocArgs.offsetsScratchOffset = ShaderConstants.offsets.clusterOffsets;
     plocArgs.dynamicBlockIndexScratchOffset = ShaderConstants.offsets.dynamicBlockIndex;
     plocArgs.numBatchesScratchOffset = ShaderConstants.offsets.numBatches;
     plocArgs.baseBatchIndicesScratchOffset = ShaderConstants.offsets.batchIndices;
@@ -401,44 +400,31 @@ void InitAccelerationStructure()
 
     DstBuffer.Store(0, ShaderConstants.header);
 
-    // Initialise encode counters
-    WriteTaskCounterData(
-        ShaderConstants.offsets.encodeTaskCounter, ENCODE_TASK_COUNTER_NUM_PRIMITIVES_OFFSET, 0);
-
-    // Early triangle pairing and triangle splitting dynamically increment primitive reference counter. Initialise
-    // counters to 0 when these features are enabled
-
-    const bool dynamicallyIncrementsPrimRefCount =
-        Settings.enableEarlyPairCompression || Settings.doTriangleSplitting || Settings.isIndirectBuild;
-    const uint primRefInitCount =
-        (dynamicallyIncrementsPrimRefCount) ? 0 : ShaderConstants.numPrimitives;
+    if (Settings.doEncode)
+    {
+        // Initialise encode counters
+        WriteTaskCounterData(
+            ShaderConstants.offsets.encodeTaskCounter, ENCODE_TASK_COUNTER_NUM_PRIMITIVES_OFFSET, 0);
 
-    WriteTaskCounterData(
-        ShaderConstants.offsets.encodeTaskCounter, ENCODE_TASK_COUNTER_PRIM_REFS_OFFSET, primRefInitCount);
+        // Early triangle pairing and triangle splitting dynamically increment primitive reference counter. Initialise
+        // counters to 0 when these features are enabled
 
-    // Initialize valid scratch buffer counters to 0
-    InitScratchCounter(ShaderConstants.offsets.plocTaskQueueCounter);
-    InitScratchCounter(ShaderConstants.offsets.tdTaskQueueCounter);
-    InitScratchCounter(CurrentSplitTaskQueueCounter());
-    ClearNumBatches(ShaderConstants.offsets.numBatches);
+        const bool dynamicallyIncrementsPrimRefCount =
+            Settings.enableEarlyPairCompression || Settings.doTriangleSplitting || Settings.isIndirectBuild;
+        const uint primRefInitCount =
+            (dynamicallyIncrementsPrimRefCount) ? 0 : ShaderConstants.numPrimitives;
 
-    // Initialize scene bounds
-    const uint maxVal = FloatToUint(FLT_MAX);
-    const uint minVal = FloatToUint(-FLT_MAX);
+        WriteTaskCounterData(
+            ShaderConstants.offsets.encodeTaskCounter, ENCODE_TASK_COUNTER_PRIM_REFS_OFFSET, primRefInitCount);
 
-    uint offset = ShaderConstants.offsets.sceneBounds;
-    ScratchBuffer.Store3(offset, maxVal.xxx);
-    offset += sizeof(uint3);
-    ScratchBuffer.Store3(offset, minVal.xxx);
-    offset += sizeof(uint3);
-    ScratchBuffer.Store2(offset, uint2(maxVal, minVal));
-    offset += sizeof(uint2);
+        // Initialize valid scratch buffer counters to 0
+        InitScratchCounter(ShaderConstants.offsets.plocTaskQueueCounter);
+        InitScratchCounter(ShaderConstants.offsets.tdTaskQueueCounter);
+        InitScratchCounter(CurrentSplitTaskQueueCounter());
+        ClearNumBatches(ShaderConstants.offsets.numBatches);
 
-    if (Settings.rebraidType == RebraidType::V2)
-    {
-        ScratchBuffer.Store3(offset, maxVal.xxx);
-        offset += sizeof(uint3);
-        ScratchBuffer.Store3(offset, minVal.xxx);
+        // Initialize scene bounds
+        InitSceneBounds(ShaderConstants.offsets.sceneBounds);
     }
 }
 
@@ -503,17 +489,17 @@ void BuildBvh(
 
     INIT_TASK;
 
-    if (Settings.doEncode)
-    {
-        BEGIN_TASK(1);
+    BEGIN_TASK(1);
 
-        if (globalId == 0)
-        {
-            InitAccelerationStructure();
-        }
+    if (globalId == 0)
+    {
+        InitAccelerationStructure();
+    }
 
-        END_TASK(1);
+    END_TASK(1);
 
+    if (Settings.doEncode)
+    {
         BEGIN_TASK(ShaderRootConstants.NumThreadGroups());
 
         EncodePrimitives(globalId, localId);
diff --git a/src/shaders/BuildQBVH.hlsl b/src/shaders/BuildQBVH.hlsl
index 2f03134..9dd9e30 100644
--- a/src/shaders/BuildQBVH.hlsl
+++ b/src/shaders/BuildQBVH.hlsl
@@ -604,11 +604,9 @@ static void PullUpChildren(
         }
         else
         {
-            {
-                // Note, box node flags are combined together by using an AND operation. Thus, we need to initialise
-                // invalid child flags as 0xff
-                boxNodeFlags = SetBoxNodeFlagsField(boxNodeFlags, 0xff, i);
-            }
+            // Note, box node flags are combined together by using an AND operation. Thus, we need to initialise
+            // invalid child flags as 0xff
+            boxNodeFlags = SetBoxNodeFlagsField(boxNodeFlags, 0xff, i);
         }
     }
 
diff --git a/src/shaders/BuildSettings.hlsli b/src/shaders/BuildSettings.hlsli
index 4d72cb5..6bb5026 100644
--- a/src/shaders/BuildSettings.hlsli
+++ b/src/shaders/BuildSettings.hlsli
@@ -59,6 +59,7 @@
 [[vk::constant_id(BUILD_SETTINGS_DATA_ENCODE_ARRAY_OF_POINTERS_ID)]] uint encodeArrayOfPointers = 0;
 [[vk::constant_id(BUILD_SETTINGS_DATA_SCENE_BOUNDS_CALCULATION_TYPE_ID)]] uint sceneBoundsCalculationType = 0;
 [[vk::constant_id(BUILD_SETTINGS_DATA_REBRAID_QUALITY_HEURISTIC_ID)]] uint rebraidQualityHeuristic = 0;
+[[vk::constant_id(BUILD_SETTINGS_DATA_REBUILD_ACCELERATION_STRUCTURE_ID)]] uint rebuildAccelStruct = 0;
 
 static const CompileTimeBuildSettings Settings = {
     topLevelBuild,
@@ -108,6 +109,7 @@ static const CompileTimeBuildSettings Settings = {
     0,
     0,
     0,
+    rebuildAccelStruct,
 };
 
 #endif
diff --git a/src/shaders/Common.hlsl b/src/shaders/Common.hlsl
index b8c594b..d4562fb 100644
--- a/src/shaders/Common.hlsl
+++ b/src/shaders/Common.hlsl
@@ -270,12 +270,10 @@ static uint64_t PackInstanceBasePointer(GpuVirtualAddress instanceVa, uint insta
     instanceBasePointer |= (instanceFlags & D3D12_RAYTRACING_INSTANCE_FLAG_FORCE_NON_OPAQUE) ?
                            (1ull << NODE_POINTER_FORCE_NON_OPAQUE_SHIFT) : 0;
 
-    {
-        // Set 'Skip Procedural' for triangles and 'Skip Triangles' for procedural geometry
-        instanceBasePointer |= (geometryType == GEOMETRY_TYPE_TRIANGLES)
-            ? (1ull << NODE_POINTER_SKIP_PROCEDURAL_SHIFT)
-            : (1ull << NODE_POINTER_SKIP_TRIANGLES_SHIFT);
-    }
+    // Set 'Skip Procedural' for triangles and 'Skip Triangles' for procedural geometry
+    instanceBasePointer |= (geometryType == GEOMETRY_TYPE_TRIANGLES)
+                           ? (1ull << NODE_POINTER_SKIP_PROCEDURAL_SHIFT)
+                           : (1ull << NODE_POINTER_SKIP_TRIANGLES_SHIFT);
 
     instanceBasePointer |= (geometryType == GEOMETRY_TYPE_AABBS) ?
                            (1ull << NODE_POINTER_DISABLE_TRIANGLE_CULL_SHIFT) : 0;
diff --git a/src/shaders/CompactCommon.hlsl b/src/shaders/CompactCommon.hlsl
index 8d41575..b504f2a 100644
--- a/src/shaders/CompactCommon.hlsl
+++ b/src/shaders/CompactCommon.hlsl
@@ -35,7 +35,7 @@ uint CalcCompactedSize(
     // Acceleration structure data starts with the header (not including the metadata)
     uint runningOffset = sizeof(AccelStructHeader);
 
-    AccelStructOffsets offsets;
+    AccelStructOffsets offsets = (AccelStructOffsets)0;
     offsets.internalNodes = runningOffset;
 
     uint internalNodeSize = 0;
@@ -63,8 +63,12 @@ uint CalcCompactedSize(
         offsets.geometryInfo = runningOffset;
         runningOffset += srcHeader.numDescs * sizeof(GeometryInfo);
 
-        offsets.primNodePtrs = runningOffset;
-        runningOffset += srcHeader.numPrimitives * sizeof(uint);
+        {
+            offsets.primNodePtrs = runningOffset;
+            runningOffset += srcHeader.numPrimitives * sizeof(uint);
+
+        }
+
     }
     else
     {
@@ -82,8 +86,11 @@ uint CalcCompactedSize(
             offsets.geometryInfo = 0;
         }
 
-        offsets.primNodePtrs = runningOffset;
-        runningOffset += srcHeader.numPrimitives * sizeof(uint);
+        {
+            offsets.primNodePtrs = runningOffset;
+            runningOffset += srcHeader.numPrimitives * sizeof(uint);
+
+        }
     }
 
     {
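Reviewer note: CalcCompactedSize above is a straight bump allocator over the destination buffer — each section records the current running offset and advances it. The same ReserveBytes pattern appears on the C++ side of this change; a distilled sketch of that idiom:

```cpp
#include <cstdint>

// Record the start of a section and advance the running offset past it.
static uint32_t ReserveBytes(uint32_t sizeInBytes, uint32_t* pRunningOffset)
{
    const uint32_t sectionOffset = *pRunningOffset;
    *pRunningOffset += sizeInBytes;
    return sectionOffset;
}

// Usage mirrors the shader: header, then internal nodes, then leaf nodes, ...
// uint32_t runningOffset = sizeof(AccelStructHeader);
// offsets.internalNodes = ReserveBytes(internalNodeSize, &runningOffset);
// offsets.leafNodes     = ReserveBytes(leafNodeSize, &runningOffset);
```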
diff --git a/src/shaders/EncodeCommon.hlsl b/src/shaders/EncodeCommon.hlsl
index 462fc0f..3eca227 100644
--- a/src/shaders/EncodeCommon.hlsl
+++ b/src/shaders/EncodeCommon.hlsl
@@ -343,7 +343,12 @@ void EncodeTriangleNode(
 
 //=====================================================================================================================
 // Fetch API bounding box from source buffer which is a typed R32G32 buffer.
-BoundingBox FetchBoundingBoxData(RWBuffer<float3> buffer, uint index, uint offsetInElements, uint boxStrideInElements)
+template<typename Float3Buffer>
+BoundingBox FetchBoundingBoxData(
+    Float3Buffer buffer,
+    uint index,
+    uint offsetInElements,
+    uint boxStrideInElements)
 {
     const uint baseElementIndex = index * boxStrideInElements + offsetInElements;
 
diff --git a/src/shaders/EncodeHwBvhCommon.hlsl b/src/shaders/EncodeHwBvhCommon.hlsl
index 3552a3c..22e6a0b 100644
--- a/src/shaders/EncodeHwBvhCommon.hlsl
+++ b/src/shaders/EncodeHwBvhCommon.hlsl
@@ -141,12 +141,18 @@ void PostHwBvhBuild(
                    offsets,
                    metadataSizeInBytes);
 
+    // Rebuilding an updateable acceleration structure needs to use the original size, not the compacted one.
+    if (Settings.rebuildAccelStruct)
+    {
+        compactedSize = ShaderConstants.header.compactedSizeInBytes;
+    }
     WriteAccelStructHeaderField(ACCEL_STRUCT_HEADER_COMPACTED_BYTE_SIZE_OFFSET, compactedSize);
 
     if (Settings.emitCompactSize != 0)
     {
         EmitBuffer.Store2(0, uint2(compactedSize, 0));
     }
+    }
 }
diff --git a/src/shaders/EncodePairedTriangleImpl.hlsl b/src/shaders/EncodePairedTriangleImpl.hlsl
index 7cb366d..090b544 100644
--- a/src/shaders/EncodePairedTriangleImpl.hlsl
+++ b/src/shaders/EncodePairedTriangleImpl.hlsl
@@ -46,11 +46,10 @@ void WriteScratchTriangleNode(
     WriteScratchNodeDataAtOffset(offset, SCRATCH_NODE_V2_OFFSET, data);
 
     const BoundingBox box = GenerateTriangleBoundingBox(tri.v0, tri.v1, tri.v2);
-
-    const uint triangleId = WriteTriangleIdField(0, NODE_TYPE_TRIANGLE_0, 0, geometryFlags);
-
     // Set the instance inclusion mask to 0 for degenerate triangles so that they are culled out.
     const uint instanceMask = (box.min.x > box.max.x) ? 0 : 0xff;
+    const uint triangleId = WriteTriangleIdField(0, NODE_TYPE_TRIANGLE_0, 0, geometryFlags);
+
     const uint packedFlags = PackScratchNodeFlags(instanceMask, CalcTriangleBoxNodeFlags(geometryFlags), triangleId);
 
     data = uint4(0, 0, 0, packedFlags);
@@ -118,7 +117,6 @@ void WriteScratchQuadNode(
 
     // Set the instance inclusion mask to 0 for degenerate triangles so that they are culled out.
     const uint instanceMask = (box.min.x > box.max.x) ? 0 : 0xff;
-
     const uint packedFlags = PackScratchNodeFlags(instanceMask, CalcTriangleBoxNodeFlags(geometryFlags), triangleId);
 
     WriteScratchNodeDataAtOffset(offset, SCRATCH_NODE_FLAGS_OFFSET, packedFlags);
 }
@@ -196,11 +194,40 @@ float ComputePairAreaRatio(
     return ratio;
 }
 
+//======================================================================================================================
+float ComputeEdgeBoxSurfaceArea(
+    float3x3 vertices,
+    uint rotation)
+{
+    // triangle v1, v2, v0
+    float3 e0 = (vertices[1]);
+    float3 e1 = (vertices[0]);
+
+    if (rotation == 0)
+    {
+        // triangle v0, v1, v2
+        e0 = (vertices[0]);
+        e1 = (vertices[2]);
+    }
+    else if (rotation == 1)
+    {
+        // triangle v2, v0, v1
+        e0 = (vertices[2]);
+        e1 = (vertices[1]);
+    }
+
+    BoundingBox edgeBox = (BoundingBox)0;
+    edgeBox.min = min(e0, e1);
+    edgeBox.max = max(e0, e1);
+
+    return ComputeBoxSurfaceArea(edgeBox);
+}
+
 //======================================================================================================================
 template<typename T>
 int PairTrianglesOptimal(
     T tri,
-    BoundingBox bbox,
+    float3x3 vertices,
     bool isActive)
 {
     bool valid = isActive;
@@ -208,6 +235,8 @@ int PairTrianglesOptimal(
     // Initialise to unpaired triangle
     int pairInfo = -1;
 
+    const BoundingBox bbox = GenerateTriangleBoundingBox(vertices[0], vertices[1], vertices[2]);
+
     while (valid)
     {
         const bool isBroadcastLane = WaveIsFirstLane();
@@ -230,7 +259,13 @@ int PairTrianglesOptimal(
             WaveReadLaneFirst(bbox.max),
         };
 
-        const float ratio = (packedOffset == -1) ? FLT_MAX : ComputePairAreaRatio(broadcastTriBounds, bbox);
+        const uint tri1Rotation = (packedOffset >> 4) & 0xF;
+        const float edgeBoxSa = ComputeEdgeBoxSurfaceArea(vertices, tri1Rotation);
+
+        // Skip unpaired triangles and pairs with perpendicular shared edges (i.e. edge box area = 0)
+        const float ratio =
+            ((packedOffset == -1) || (edgeBoxSa == 0.0f)) ?
+                FLT_MAX : ComputePairAreaRatio(broadcastTriBounds, bbox);
 
         const float waveMinRatio = WaveActiveMin(ratio);
@@ -325,13 +360,17 @@ int PairTriangles(
 
     const bool isActiveTriangle = IsActive(tri);
 
+    float3x3 faceVertices;
+    faceVertices[0] = tri.v0;
+    faceVertices[1] = tri.v1;
+    faceVertices[2] = tri.v2;
+
     // Indexed triangles can always be paired as their connectivity cannot change on updates.
     if (isIndexed)
     {
         if (Settings.enablePairCostCheck)
         {
-            const BoundingBox bbox = GenerateTriangleBoundingBox(tri.v0, tri.v1, tri.v2);
-            pairInfo = PairTrianglesOptimal(faceIndices, bbox, isActiveTriangle);
+            pairInfo = PairTrianglesOptimal(faceIndices, faceVertices, isActiveTriangle);
         }
         else
         {
@@ -341,15 +380,9 @@ int PairTriangles(
     // Only pair non-indexed triangles for non-updateable as the triangle positions can change on updates
     else if (IsUpdateAllowed() == false)
     {
-        float3x3 faceVertices;
-        faceVertices[0] = tri.v0;
-        faceVertices[1] = tri.v1;
-        faceVertices[2] = tri.v2;
-
         if (Settings.enablePairCostCheck)
         {
-            const BoundingBox bbox = GenerateTriangleBoundingBox(tri.v0, tri.v1, tri.v2);
-            pairInfo = PairTrianglesOptimal(faceVertices, bbox, isActiveTriangle);
+            pairInfo = PairTrianglesOptimal(faceVertices, faceVertices, isActiveTriangle);
         }
         else
         {
diff --git a/src/shaders/EncodeTopLevel.hlsl b/src/shaders/EncodeTopLevel.hlsl
index bd03de0..689a4ff 100644
--- a/src/shaders/EncodeTopLevel.hlsl
+++ b/src/shaders/EncodeTopLevel.hlsl
@@ -136,6 +136,7 @@ void EncodeInstances(
             EncodeInstancesUpdate(index,
                                   desc,
                                   tlasMetadataSize,
+                                  offsets,
                                   primNodePointerOffset,
                                   baseAddrAccelStructHeader,
                                   numActivePrims,
diff --git a/src/shaders/EncodeTopLevelUpdate.hlsl b/src/shaders/EncodeTopLevelUpdate.hlsl
index 48277fe..7a93c31 100644
--- a/src/shaders/EncodeTopLevelUpdate.hlsl
+++ b/src/shaders/EncodeTopLevelUpdate.hlsl
@@ -28,9 +28,9 @@ void WriteInstanceDescriptor(
     in InstanceDesc instanceDesc,
     in uint geometryType,
-    in uint boxNodeFlags,
     in uint instanceIndex,
     in uint instNodePtr,
+    in AccelStructOffsets offsets,
     in uint blasRootNodePointer,
     in uint blasMetadataSize,
     in uint tlasMetadataSize)
@@ -51,6 +51,7 @@ void EncodeInstancesUpdate(
     uint index,
     InstanceDesc desc,
     uint tlasMetadataSize,
+    AccelStructOffsets offsets,
     uint primNodePointerOffset,
     uint64_t baseAddrAccelStructHeader,
     uint numActivePrims,
@@ -159,9 +160,9 @@ void EncodeInstancesUpdate(
             WriteInstanceDescriptor(desc,
                                     geometryType,
-                                    boxNodeFlags,
                                     index,
                                     nodePointer,
+                                    offsets,
                                     CreateRootNodePointer(),
                                     blasMetadataSize,
                                     tlasMetadataSize);
diff --git a/src/shaders/Extensions.hlsl b/src/shaders/Extensions.hlsl
index baf4630..bfc0812 100644
--- a/src/shaders/Extensions.hlsl
+++ b/src/shaders/Extensions.hlsl
@@ -117,6 +117,7 @@ __decl float3 AmdExtD3DShaderIntrinsics_FloatOpWithRoundMode(
 
 //=====================================================================================================================
 // Sub-group wave reductions
+// Ref: https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#_instructions
 
 [[vk::ext_capability(/* GroupNonUniform */ 61)]]
 [[vk::ext_capability(/* GroupNonUniformArithmetic */ 63)]]
@@ -149,6 +150,24 @@ float AmdExtD3DShaderIntrinsics_WaveClusterMax(float x, uint dxClusterSize)
     return spirv_OpGroupNonUniformFMax_clustered(/* Subgroup */ 3, /* ClusteredReduce */ 3, x, clusterSize);
 }
 
+[[vk::ext_instruction(359)]]
+uint spirv_OpGroupNonUniformBitwiseAnd_clustered(uint scope, [[vk::ext_literal]] uint op, uint value, uint clusterSize);
+
+uint AmdExtD3DShaderIntrinsics_WaveClusterBitAnd(uint x, uint dxClusterSize)
+{
+    const uint clusterSize = (1u << (dxClusterSize - 1));
+    return spirv_OpGroupNonUniformBitwiseAnd_clustered(/* Subgroup */ 3, /* ClusteredReduce */ 3, x, clusterSize);
+}
+
+[[vk::ext_instruction(360)]]
+uint spirv_OpGroupNonUniformBitwiseOr_clustered(uint scope, [[vk::ext_literal]] uint op, uint value, uint clusterSize);
+
+uint AmdExtD3DShaderIntrinsics_WaveClusterBitOr(uint x, uint dxClusterSize)
+{
+    const uint clusterSize = (1u << (dxClusterSize - 1));
+    return spirv_OpGroupNonUniformBitwiseOr_clustered(/* Subgroup */ 3, /* ClusteredReduce */ 3, x, clusterSize);
+}
+
 #endif
 
 //=====================================================================================================================
diff --git a/src/shaders/GenerateMortonCodes.hlsl b/src/shaders/GenerateMortonCodes.hlsl
index bc43899..6cd8bbd 100644
--- a/src/shaders/GenerateMortonCodes.hlsl
+++ b/src/shaders/GenerateMortonCodes.hlsl
@@ -107,7 +107,7 @@ void GenerateMortonCodesImpl(
         // Clear refit propagation flags for each leaf node in BVH2.
         const uint initValue = (Settings.enableFastLBVH ? 0xffffffffu : 0);
         const uint flagOffset = ShaderConstants.offsets.propagationFlags + (primitiveIndex * sizeof(uint));
-        ScratchGlobal.Store(flagOffset, initValue);
+        ScratchBuffer.Store(flagOffset, initValue);
     }
 
 #if NO_SHADER_ENTRYPOINT == 0
diff --git a/src/shaders/GpuRtLibrary.hlsl b/src/shaders/GpuRtLibrary.hlsl
index 3d829e4..81bf9fa 100644
--- a/src/shaders/GpuRtLibrary.hlsl
+++ b/src/shaders/GpuRtLibrary.hlsl
@@ -29,7 +29,6 @@
 // Following order matters as AccelStructTracker relies on defines from TraceRayCommon.hlsl
 #include "TraceRayCommon.hlsl"
 #include "AccelStructTracker.hlsl"
-#include "llpc/GpurtIntrinsics.h"
 
 #if GPURT_BUILD_CONTINUATION && LLPC_CLIENT_INTERFACE_MAJOR_VERSION
 // Include the continuations library
@@ -1002,6 +1001,65 @@ export uint _RayQuery_InstanceIndex(in RayQueryInternal rayQuery, bool committed
     }
 }
 
+//=====================================================================================================================
+// Fetch triangle position
+export TriangleData _RayQuery_FetchTrianglePosition(
+    inout_param(RayQueryInternal) rayQuery, // RayQuery object
+    in bool committed)                      // Committed or candidate hit
+{
+    TriangleData tdata;
+    RayTracingIpLevel rtip = _AmdGetRtip();
+    switch (rtip)
+    {
+    default:
+    {
+        tdata = FetchTrianglePositionFromRayQuery(rayQuery, committed);
+        break;
+    }
+    }
+    return tdata;
+}
+
+//=====================================================================================================================
+// RayQuery::Proceed() entry point
+export bool _RayQuery_Proceed(
+    inout_param(RayQueryInternal) rayQuery,
+    in uint constRayFlags,
+    in uint3 dispatchThreadId)
+{
+    uint rtIpLevel = ConvertRtIpLevel(_AmdGetRtip());
+    return RayQueryProceedCommon(
+        rayQuery,
+        constRayFlags,
+        dispatchThreadId,
+        rtIpLevel
+    );
+}
+
+//=====================================================================================================================
+// TraceRayInline() entry point
+export void _RayQuery_TraceRayInline(
+    inout_param(RayQueryInternal) rayQuery,
+    in uint accelStructLo,
+    in uint accelStructHi,
+    in uint constRayFlags,
+    in uint rayFlags,
+    in uint instanceMask,
+    in RayDesc rayDesc,
+    in uint3 dispatchThreadId)
+{
+    uint rtIpLevel = ConvertRtIpLevel(_AmdGetRtip());
+    TraceRayInlineCommon(rayQuery,
+                         accelStructLo,
+                         accelStructHi,
+                         constRayFlags,
+                         rayFlags,
+                         instanceMask,
+                         rayDesc,
+                         dispatchThreadId,
+                         rtIpLevel);
+}
+
 export void _RayQuery_SetObjId(in RayQueryInternal rayQuery, int objId)
 {
     rayQuery.rayQueryObjId = objId;
 }
diff --git a/src/shaders/GpuRtLibraryCont.hlsl b/src/shaders/GpuRtLibraryCont.hlsl
index 5973394..23ed420 100644
--- a/src/shaders/GpuRtLibraryCont.hlsl
+++ b/src/shaders/GpuRtLibraryCont.hlsl
@@ -131,26 +131,6 @@ static bool RtIpIsAtLeast(RayTracingIpLevel level)
     return ((uint32_t)_AmdGetRtip()) >= ((uint32_t)level);
 }
 
-//=====================================================================================================================
-static uint ConvertRtIpLevel(RayTracingIpLevel rtIpLevel)
-{
-    uint level = 0;
-
-    switch (rtIpLevel)
-    {
-    case RayTracingIpLevel::RtIp1_1:
-        level = GPURT_RTIP1_1;
-        break;
-    case RayTracingIpLevel::RtIp2_0:
-        level = GPURT_RTIP2_0;
-        break;
-    default:
-        break;
-    }
-
-    return level;
-}
-
 //=====================================================================================================================
 static uint GetPriorityForShaderType(
     DXILShaderKind shaderKind)
@@ -170,19 +150,62 @@
 // Forward declaration for _AmdDispatchSystemData.PackDispatchId() and _AmdDispatchSystemData.DispatchId()
 static uint3 GetDispatchRaysDimensions();
 
+//=====================================================================================================================
+
 static uint64_t GetVpcWithPriority(uint64_t vpc, uint priority)
 {
-    return vpc;
+    if (_AmdIsLlpc())
+    {
+        return vpc;
+    }
+
+    const uint64_t prio64 = priority;
+    const uint firstMetadataBit = 32;
+    const uint firstPriorityBitInMetadata = 16;
+    GPU_ASSERT((vpc & 0xFFFF000000000000) == 0);
+    return vpc | (prio64 << (firstMetadataBit + firstPriorityBitInMetadata));
 }
 
-static uint64_t Unpack32BitVpcTo64BitVpc(uint32_t vpc32, bool /*unpackPriority*/)
+//=====================================================================================================================
+// 32-bit function pointer packing/unpacking
+//
+static uint64_t Unpack32BitVpcTo64BitVpc(uint32_t vpc32, bool unpackPriority)
 {
-    return vpc32;
+    if (_AmdIsLlpc())
+    {
+        return vpc32;
+    }
+
+    uint64_t vpc = (vpc32 & 0xFFFFFFC0);
+
+    if (unpackPriority)
+    {
+        // The priority is stored in bits 0..2.
+        uint32_t priority = (vpc32 & 0x7);
+        vpc = GetVpcWithPriority(vpc, priority);
+    }
+
+    return vpc;
 }
 
 static uint32_t Pack64BitVpcTo32Bits(uint64_t vpc)
 {
-    return (vpc & 0xFFFFFFFF);
+    if (_AmdIsLlpc())
+    {
+        return (vpc & 0xFFFFFFFF);
+    }
+
+    // Incoming metadata is in the high dword
+    uint32_t inMetadata = (uint32_t)(vpc >> 32);
+    uint32_t prio = (inMetadata >> 16);
+    // We only have three bits for the priority:
+    GPU_ASSERT(prio <= 7);
+
+    // Outgoing metadata is in the low 6 bits
+    uint32_t outMetadata = prio;
+
+    GPU_ASSERT((vpc & 0x2F) == 0);
+    return SplitUint64(vpc).x | outMetadata;
 }
 
 //=====================================================================================================================
@@ -2030,7 +2053,6 @@ static void EnqueueNextShader(bool hasWorkToDo, uint64_t nextShaderAddr, uint64_
     }
     else
     {
-        // This case should only occur in sorting mode.
        GPU_ASSERT(false);
     }
 }
@@ -2038,7 +2060,6 @@
     const uint newState = data.traversal.committed.State();
     RayHistoryWriteEnd(data, newState);
 
-    // Finished sorting, previously dead lanes may now have CHS|MS to execute and vice-versa
     if (nextShaderAddr != returnAddr)
     {
         const DXILShaderKind shaderKind = (DXILShaderKind)(data.IsMiss(newState) ?
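Reviewer note: a quick round trip through the new VPC packing helpers, using the bit positions stated in the code (priority in bits 0..2 of the 32-bit form; metadata starting at bit 16 of the high dword in the 64-bit form). This is an illustrative check, not GPURT code:

```cpp
#include <cassert>
#include <cstdint>

int main()
{
    const uint32_t vpc32 = 0x00001040u | 0x5u;  // 64-byte-aligned address, priority 5

    // Unpack32BitVpcTo64BitVpc: address keeps bits 6+, priority moves to bits 48..50.
    const uint64_t addr  = vpc32 & 0xFFFFFFC0u;
    const uint64_t vpc64 = addr | (uint64_t(vpc32 & 0x7u) << (32 + 16));

    // Pack64BitVpcTo32Bits: pull the priority back out of the high-dword metadata.
    const uint32_t prio     = uint32_t(vpc64 >> 32) >> 16;
    const uint32_t repacked = uint32_t(vpc64 & 0xFFFFFFFFu) | prio;

    assert(repacked == vpc32);  // lossless round trip
    return 0;
}
```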
@@ -2116,7 +2137,6 @@ export void _cont_Traversal(
         RayHistoryWriteAnyHitOrProceduralStatus(data);
     }
 
-    // Handle reordering of rays/threads before processing since dead lanes may become alive after sorting.
     // Execute traversal for active lanes.
     uint state = TRAVERSAL_STATE_COMMITTED_NOTHING;
     _AmdPrimitiveSystemState candidate = (_AmdPrimitiveSystemState)0;
@@ -2150,14 +2170,6 @@ export void _cont_Traversal(
     _AmdTraversalResultData result = (_AmdTraversalResultData)0;
 
     bool IsChsOrMiss = data.IsChsOrMiss(state);
-    // For sorting-enabled global mem mode, we only enqueue CHS/Miss once all
-    // lanes have arrived in this state.
-    // In non-sorting mode, we immediately enqueue CHS/Miss. This is mostly
-    // to replicate the old ProcessContinuation() behavior for now.
-    // We might want to consider also waiting for all lanes here in the non-global
-    // mem mode for consistency, and potentially also to have a common place
-    // in between Traversal and CHS/Miss where extra work can be done just once
-    // for all lanes, e.g. preparing system data for CHS/Miss.
     if ((_AmdContinuationStackIsGlobal() && WaveActiveAllTrue(IsChsOrMiss)) ||
         (!_AmdContinuationStackIsGlobal() && IsChsOrMiss))
     {
@@ -2167,11 +2179,6 @@ export void _cont_Traversal(
         GetNextHitMissPc(data, state, candidate, nextShaderAddr);
 
         bool hasWorkToDo = true;
-        // Avoid sorting on return addresses to RayGen (the case nextShaderValid == false), as it may create
-        // unexpected behavior and might increase execution divergence. For example, we might have multiple resume
-        // points due to divergent control flow in the TraceRay call, but those resume points are all copies of the same
-        // code. If we sort and re-read only from one bin, we might prevent future TraceRay calls from reconverging
-        // on traversal.
         if (_AmdContinuationStackIsGlobal() && (nextShaderAddr != 0))
         {
         }
@@ -2231,9 +2238,6 @@ export void _cont_Traversal(
     }
     else
     {
-        // The last remaining case is that we need to re-enqueue Traversal, because we are waiting for
-        // other lanes to finish BVH traversal before sorting, or to resume suspended lanes that wait for
-        // other lanes to run IS/AHS in early-is-ahs mode.
         //
         // Everything else needs to go back through scheduling/traversal, regardless of state
         // Note we don't need "Wait" here because priorities run AHS and IS first
diff --git a/src/shaders/LaneGroup.hlsl b/src/shaders/LaneGroup.hlsl
index 5e69227..3274c59 100644
--- a/src/shaders/LaneGroup.hlsl
+++ b/src/shaders/LaneGroup.hlsl
@@ -124,6 +124,22 @@ struct LaneGroup
         return AmdExtD3DShaderIntrinsics_WaveClusterMin(val, clusterSize);
     }
 
+    template<typename T>
+    T BitOr(T val)
+    {
+        const uint clusterSize = log2(groupSize) + 1;
+
+        return AmdExtD3DShaderIntrinsics_WaveClusterBitOr(val, clusterSize);
+    }
+
+    template<typename T>
+    T BitAnd(T val)
+    {
+        const uint clusterSize = log2(groupSize) + 1;
+
+        return AmdExtD3DShaderIntrinsics_WaveClusterBitAnd(val, clusterSize);
+    }
+
     template<typename T>
     T Broadcast(T val, uint targetLane)
     {
diff --git a/src/shaders/TaskCounter.hlsl b/src/shaders/TaskCounter.hlsl
index 034a897..30d5531 100644
--- a/src/shaders/TaskCounter.hlsl
+++ b/src/shaders/TaskCounter.hlsl
@@ -26,16 +26,32 @@
 #include "BuildSettings.hlsli"
 #endif
 
+//======================================================================================================================
+// Set a scratch buffer counter to 0 if it has a valid offset
+void InitScratchCounter(uint offset)
+{
+    if (offset != INVALID_IDX)
+    {
+        ScratchGlobal.Store(offset, 0);
+    }
+}
+
+//======================================================================================================================
+// Increase a scratch buffer counter and return its original value
+uint IncrementScratchCounter(uint offset, uint value)
+{
+    uint originalVal = 0;
+    ScratchGlobal.InterlockedAdd(offset, value, originalVal);
+    return originalVal;
+}
+
 //=====================================================================================================================
 // Increment task counter to mark a task / primitive as done
 uint IncrementTaskCounter(uint offset, uint value)
 {
     DeviceMemoryBarrier();
 
-    uint originalVal = 0;
-    ScratchGlobal.InterlockedAdd(offset, value, originalVal);
-
-    return originalVal;
+    return IncrementScratchCounter(offset, value);
 }
 
 //=====================================================================================================================
diff --git a/src/shaders/TaskQueueCounter.hlsl b/src/shaders/TaskQueueCounter.hlsl
index fd3303d..84aa2e5 100644
--- a/src/shaders/TaskQueueCounter.hlsl
+++ b/src/shaders/TaskQueueCounter.hlsl
@@ -86,13 +86,3 @@ bool EndTask(const uint localId, uint taskQueueOffset)
 
     return returnValue;
 }
-
-//======================================================================================================================
-// Set a scratch buffer counter to 0 if it has a valid index
-void InitScratchCounter(uint offset)
-{
-    if (offset != INVALID_IDX)
-    {
-        ScratchGlobal.Store(offset, 0);
-    }
-}
diff --git a/src/shaders/TraceRayCommon.hlsl b/src/shaders/TraceRayCommon.hlsl
index 3736e40..c22f9eb 100644
--- a/src/shaders/TraceRayCommon.hlsl
+++ b/src/shaders/TraceRayCommon.hlsl
@@ -30,6 +30,8 @@
 #endif
 
 #include "../../gpurt/gpurtDispatch.h"
+#include "llpc/GpurtIntrinsics.h"
+
 // Driver reserved space ID and resource bindings
 #define SPACEID space93
 
@@ -90,6 +92,26 @@ static uint CalculateHitGroupRecordAddress(
     );
 }
 
+//=====================================================================================================================
+static uint ConvertRtIpLevel(RayTracingIpLevel rtIpLevel)
+{
+    uint level = 0;
+
+    switch (rtIpLevel)
+    {
+    case RayTracingIpLevel::RtIp1_1:
+        level = GPURT_RTIP1_1;
+        break;
diff --git a/src/shaders/TraceRayCommon.hlsl b/src/shaders/TraceRayCommon.hlsl
index 3736e40..c22f9eb 100644
--- a/src/shaders/TraceRayCommon.hlsl
+++ b/src/shaders/TraceRayCommon.hlsl
@@ -30,6 +30,8 @@
 #endif
 
 #include "../../gpurt/gpurtDispatch.h"
+#include "llpc/GpurtIntrinsics.h"
+
 // Driver reserved space ID and resource bindings
 #define SPACEID space93
 
@@ -90,6 +92,26 @@ static uint CalculateHitGroupRecordAddress(
         );
 }
 
+//=====================================================================================================================
+static uint ConvertRtIpLevel(RayTracingIpLevel rtIpLevel)
+{
+    uint level = 0;
+
+    switch (rtIpLevel)
+    {
+    case RayTracingIpLevel::RtIp1_1:
+        level = GPURT_RTIP1_1;
+        break;
+    case RayTracingIpLevel::RtIp2_0:
+        level = GPURT_RTIP2_0;
+        break;
+    default:
+        break;
+    }
+
+    return level;
+}
+
 //=====================================================================================================================
 static HitGroupInfo FetchHitGroupInfo(
     uint hitGroupRecordIndex)
diff --git a/src/shaders/TrianglePrimitive.hlsl b/src/shaders/TrianglePrimitive.hlsl
index 25e02a1..e2975dc 100644
--- a/src/shaders/TrianglePrimitive.hlsl
+++ b/src/shaders/TrianglePrimitive.hlsl
@@ -113,10 +113,11 @@ uint3 FetchFaceIndices(
 // Vertex buffers only require an address and stride alignment of the format component size not the entire element size.
 // If the input data is not naturally aligned, we cannot use a single typed fetch for the 2-3 components. In this case,
 // we need to fetch each component separately.
+template<typename Float3Buffer>
 float3 FetchVertexPerComponent(
-    RWBuffer<float3> buffer,
-    uint             firstComponentIndex,
-    uint             numComponents)
+    Float3Buffer buffer,
+    uint         firstComponentIndex,
+    uint         numComponents)
 {
     float3 vertex;
     vertex.x = buffer[firstComponentIndex+0].x;
@@ -134,8 +135,9 @@ float3 FetchVertexPerComponent(
 }
 
 //=====================================================================================================================
+template<typename Float3Buffer>
 TriangleData FetchTriangleData(
-    RWBuffer<float3> buffer,
+    Float3Buffer buffer,
     uint vertexOffsetInComponents,
     uint3 index,
     uint strideInComponents,
@@ -181,8 +183,9 @@ uint CalcTriangleBoxNodeFlags(
 }
 
 //======================================================================================================================
+template<typename Float3Buffer>
 TriangleData FetchTransformedTriangleData(
-    in RWBuffer<float3> geometryBuffer,
+    in Float3Buffer     geometryBuffer,
    in uint3            faceIndices,
    in uint             geometryStride,
    in uint             vertexOffsetInComponents,
@@ -226,10 +229,11 @@ bool IsActive(TriangleData tri)
 
 //=====================================================================================================================
 // Helper function to fetch triangle data. Returns false if the vertex indices are out of bounds.
+template<typename Float3Buffer>
 bool FetchTrianglePrimitive(
     in BuildShaderGeometryConstants geomConstants,
     in NumPrimAndInputOffset        inputOffsets,
-    in RWBuffer<float3>             geometryBuffer,
+    in Float3Buffer                 geometryBuffer,
     in uint                         geomId,
     in uint                         primId,
     inout_param(TriangleData)       tri,
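[Editor note] Templating the triangle-fetch path over the buffer type lets one implementation serve both read-only and read-write vertex bindings. A sketch of the intended instantiation, with hypothetical resource names (`RoVertexBuffer`, `RwVertexBuffer`); HLSL deduces `Float3Buffer` from the argument type:

    Buffer<float3>   RoVertexBuffer; // hypothetical SRV binding
    RWBuffer<float3> RwVertexBuffer; // hypothetical UAV binding

    // Same per-component fetch body, two template instantiations.
    float3 v0 = FetchVertexPerComponent(RoVertexBuffer, firstComponentIndex, 3);
    float3 v1 = FetchVertexPerComponent(RwVertexBuffer, firstComponentIndex, 3);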
diff --git a/src/shaders/Update.hlsl b/src/shaders/Update.hlsl
index 0e06e40..035ad4b 100644
--- a/src/shaders/Update.hlsl
+++ b/src/shaders/Update.hlsl
@@ -133,18 +133,15 @@ void Update(
 
     const uint numGroups = ShaderRootConstants.numThreads / BUILD_THREADGROUP_SIZE;
 
-    {
-        ClearUpdateFlags(globalId);
-        BEGIN_TASK(numGroups);
-        EncodePrimitives(globalId, GEOMETRY_TYPE_TRIANGLES);
-        END_TASK(numGroups);
-
-        const uint numWorkItems = ScratchBuffer.Load(UPDATE_SCRATCH_STACK_NUM_ENTRIES_OFFSET);
-        UpdateQBVHImpl(globalId,
-                       numWorkItems,
-                       ShaderRootConstants.numThreads);
-    }
-
+    ClearUpdateFlags(globalId);
+    BEGIN_TASK(numGroups);
+    EncodePrimitives(globalId, GEOMETRY_TYPE_TRIANGLES);
+    END_TASK(numGroups);
+
+    const uint numWorkItems = ScratchBuffer.Load(UPDATE_SCRATCH_STACK_NUM_ENTRIES_OFFSET);
+    UpdateQBVHImpl(globalId,
+                   numWorkItems,
+                   ShaderRootConstants.numThreads);
 }
 
 //======================================================================================================================
diff --git a/src/shadersClean/common/Math.hlsli b/src/shadersClean/common/Math.hlsli
index 5c8356b..981b9b5 100644
--- a/src/shadersClean/common/Math.hlsli
+++ b/src/shadersClean/common/Math.hlsli
@@ -48,6 +48,13 @@ inline uint32_t bit(uint32_t index)
     return 1u << index;
 }
 
+//=====================================================================================================================
+// Helper function for producing a 16 bit mask of one bit
+inline uint16_t bit16(uint16_t index)
+{
+    return uint16_t(1u << index);
+}
+
 //=====================================================================================================================
 // Helper function for producing a 64 bit mask of one bit
 inline uint64_t bit64(uint32_t index)
diff --git a/tools/CompileRTShaders.py b/tools/CompileRTShaders.py
index 9ce4d59..67cd973 100644
--- a/tools/CompileRTShaders.py
+++ b/tools/CompileRTShaders.py
@@ -193,7 +193,7 @@ def getValidationCmdArgs(args) -> [str]:
 
     validateCommand = [compilerPath]
 
-    validateCommand += getBaseDxcCommandArgs(True, True, True)
+    validateCommand += getBaseDxcCommandArgs(True, True, False)
     validateCommand += ["-Wno-misplaced-attributes"] # -Wmisplaced-attributes is triggered by [RootSignature()]
                                                      # used by entrypoint code and compiled as library
     validateCommand += ['-Fo', 'temp.bin']
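[Editor note] The new `bit16` helper mirrors the existing `bit`/`bit64` helpers so callers get a 16-bit typed mask without manual casts. A small usage sketch (the flag index 3 is arbitrary, chosen only for illustration):

    uint16_t flags = 0;
    flags |= bit16(3);                           // sets bit 3 -> 0x0008
    const bool isSet = (flags & bit16(3)) != 0;  // mask math stays in uint16_t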