From aa5ba689e8c33b6d42d4261687988e6e0b13998c Mon Sep 17 00:00:00 2001 From: qiaojbao Date: Wed, 4 Dec 2024 14:27:34 +0800 Subject: [PATCH] Update gpurt from commit 34a59e34 [Continuations] Fix debug traversal path Convert HwPipePoint to PipelineStageFlag Update Trace Source interface [Continuations] Use packed 32-bit shader IDs [Continuations] BoxHeuristicMode changes in one _cont_TraceRay call Consistently use EnqueueDead for dead lanes [Continuations] Add RayHistory data for debug traversal path [Continuations] Conditionally pass 32-bit addresses to the middle-end. [Continuations] Fix debug traversal path compilation Reduce GL2 flushes in AS-related barriers Update Pal::CmdDispatch to PAL ver 909 Fix decodeAS with multiple procedural geometry descs Fix Batch Builder when compiling to SPIRV Use GetRayId() to handle rayquery in graphics shader Make Intrinsics more robust [llvmraytracing] Pass dummy return address to AwaitTraversal [Continuations] Remove !PASS_32_BIT_CR cases Simplify dead lanes workflow, cleanup Traversal Fix for Hitman3 hang on Navi3 [Continuations] Do not set priority in LLPC path Changed minimum scratch buffer size to 8 bytes [Continuations] Simplify Vpc32::isValid() --- CMakeLists.txt | 5 + backends/pal/gpurtPalBackend.cpp | 39 +- backends/pal/gpurtPalBackend.h | 1 - gpurt/gpurt.h | 2 +- gpurt/gpurtBackend.h | 13 +- gpurt/gpurtBuildSettings.h | 4 +- gpurt/gpurtInlineFuncs.h | 17 - src/gpurtBvhBatcher.cpp | 19 +- src/gpurtBvhBuilder.cpp | 97 +- src/gpurtBvhBuilder.h | 4 +- src/gpurtTraceSource.h | 8 + src/shaders/BuildCommonScratch.hlsl | 37 +- src/shaders/BuildPLOC.hlsl | 2 +- src/shaders/BuildParallel.hlsl | 2 +- src/shaders/BuildQBVH.hlsl | 20 +- src/shaders/BuildSettings.hlsli | 4 +- src/shaders/CompactAS1_1.hlsl | 59 +- src/shaders/Continuations1_1.hlsl | 10 +- src/shaders/Continuations2_0.hlsl | 24 +- src/shaders/DecodeAS.hlsl | 2 +- src/shaders/GpuRtLibrary.hlsl | 147 +- src/shaders/GpuRtLibraryCont.hlsl | 1391 +++++++++-------- src/shaders/RayQuery.hlsl | 4 - src/shaders/RayQuery1_1.hlsl | 8 - src/shaders/RayQuery2_0.hlsl | 8 - src/shaders/TraceRay.hlsl | 34 +- src/shaders/TrianglePrimitive.hlsl | 2 +- .../traversal/TraversalDefs.hlsli | 9 + src/shared/rayTracingDefs.h | 12 +- tools/CompileRTShaders.py | 2 +- tools/DebugPreprocessShaders.py | 2 +- 31 files changed, 1096 insertions(+), 892 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ea92db7..7332192 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,6 +99,11 @@ option(GPURT_BUILD_CONTINUATION "GpuRt uses continuation traversal" ON) if (GPURT_BUILD_CONTINUATION) gpurt_add_compile_definitions(GPURT_BUILD_CONTINUATION=1) endif() + +cmake_dependent_option(GPURT_DEBUG_CONTINUATION_TRAVERSAL "Debug continuation traversal on legacy indirect path" OFF "GPURT_BUILD_CONTINUATION" OFF) +if (GPURT_DEBUG_CONTINUATION_TRAVERSAL) + gpurt_add_compile_definitions(GPURT_DEBUG_CONTINUATION_TRAVERSAL=1) +endif() #endif # Disable run time type information diff --git a/backends/pal/gpurtPalBackend.cpp b/backends/pal/gpurtPalBackend.cpp index d06394c..e4868dd 100644 --- a/backends/pal/gpurtPalBackend.cpp +++ b/backends/pal/gpurtPalBackend.cpp @@ -28,25 +28,6 @@ namespace GpuRt { -// ===================================================================================================================== -// GPURT to PAL enum conversions without undefined behavior. 
-static Pal::HwPipePoint GpuRtToPalHwPipePoint(
-    HwPipePoint gpurtHwPipePoint)
-{
-#define HWPIPEPOINTCASE(x) case static_cast<uint32>(Pal::HwPipePoint::x): return Pal::HwPipePoint::x
-    switch (static_cast<uint32>(gpurtHwPipePoint))
-    {
-        HWPIPEPOINTCASE(HwPipeTop);
-        HWPIPEPOINTCASE(HwPipePreCs);
-        HWPIPEPOINTCASE(HwPipeBottom);
-    default:
-        PAL_ASSERT_ALWAYS_MSG("Unhandled HwPipePoint value in conversion: %u\n",
-            static_cast<uint32>(gpurtHwPipePoint));
-        return Pal::HwPipePoint::HwPipeTop;
-    }
-#undef HWPIPEPOINTCASE
-}
-
// =====================================================================================================================
static Pal::ImmediateDataWidth GpuRtToPalImmediateDataWidth(
    ImmediateDataWidth gpurtImmediateDataWidth)
@@ -132,7 +113,11 @@ void PalBackend::Dispatch(
    uint32 z
    ) const
{
+#if PAL_INTERFACE_MAJOR_VERSION >= 909
+    GetCmdBuffer(cmdBuffer)->CmdDispatch({ x, y, z }, {});
+#else
    GetCmdBuffer(cmdBuffer)->CmdDispatch({ x, y, z });
+#endif
}

// =====================================================================================================================
@@ -238,6 +223,7 @@ void PalBackend::InsertBarrier(
{
    const bool syncDispatch = flags & BarrierFlagSyncDispatch;
    const bool syncIndirectArgs = flags & BarrierFlagSyncIndirectArg;
+    const bool syncPreCpWrite = flags & BarrierFlagSyncPreCpWrite;
    const bool syncPostCpWrite = flags & BarrierFlagSyncPostCpWrite;

    Pal::ICmdBuffer* pCmdBuffer = GetCmdBuffer(cmdBuffer);
@@ -247,8 +233,16 @@ void PalBackend::InsertBarrier(

    if (syncDispatch || syncIndirectArgs)
    {
-        memoryBarrier.srcStageMask = Pal::PipelineStageCs;
-        memoryBarrier.srcAccessMask = Pal::CoherShader;
+        memoryBarrier.srcStageMask |= Pal::PipelineStageCs;
+        memoryBarrier.srcAccessMask |= Pal::CoherShader;
+    }
+
+    if (syncPreCpWrite)
+    {
+        memoryBarrier.srcStageMask |= Pal::PipelineStagePostPrefetch;
+        memoryBarrier.srcAccessMask |= Pal::CoherShader;
+        memoryBarrier.dstStageMask |= Pal::PipelineStagePostPrefetch;
+        memoryBarrier.dstAccessMask |= Pal::CoherCp;
    }

    if (syncPostCpWrite)
@@ -359,12 +353,11 @@ void PalBackend::UpdateMemory(
// =====================================================================================================================
void PalBackend::WriteTimestamp(
    ClientCmdBufferHandle cmdBuffer,
-    HwPipePoint hwPipePoint,
    const Pal::IGpuMemory& timeStampVidMem,
    uint64 offset
    ) const
{
-    GetCmdBuffer(cmdBuffer)->CmdWriteTimestamp(GpuRtToPalHwPipePoint(hwPipePoint), timeStampVidMem, offset);
+    GetCmdBuffer(cmdBuffer)->CmdWriteTimestamp(Pal::PipelineStageBottomOfPipe, timeStampVidMem, offset);
}

// =====================================================================================================================
diff --git a/backends/pal/gpurtPalBackend.h b/backends/pal/gpurtPalBackend.h
index 7da603e..a5fc976 100644
--- a/backends/pal/gpurtPalBackend.h
+++ b/backends/pal/gpurtPalBackend.h
@@ -121,7 +121,6 @@ class PalBackend : public IBackend

    virtual void WriteTimestamp(
        ClientCmdBufferHandle cmdBuffer,
-        HwPipePoint hwPipePoint,
        const Pal::IGpuMemory& timeStampVidMem,
        uint64 offset
        ) const override;

diff --git a/gpurt/gpurt.h b/gpurt/gpurt.h
index 5d9d8e2..ca70f8f 100644
--- a/gpurt/gpurt.h
+++ b/gpurt/gpurt.h
@@ -737,7 +737,7 @@ struct DeviceSettings
    uint32 numRebraidIterations;
    uint32 rebraidQualityHeuristic;

-    uint32 plocRadius; // PLOC Radius
+    uint32 plocRadius; // PLOC nearest neighbor search radius
    uint32 maxTopDownBuildInstances; // Max instances allowed for top down build
    uint32 parallelBuildWavesPerSimd; // Waves per SIMD to launch for parallel build
diff --git a/gpurt/gpurtBackend.h b/gpurt/gpurtBackend.h
index 00152dc..7463254 100644
--- a/gpurt/gpurtBackend.h
+++ b/gpurt/gpurtBackend.h
@@ -75,21 +75,13 @@ struct BufferViewInfo
    BufferViewSwizzle swizzle;
};

-// =====================================================================================================================
-// Copy of Pal::HwPipePoint with values we use.
-enum class HwPipePoint : uint32
-{
-    HwPipeTop = 0x0,
-    HwPipePreCs = 0x1,
-    HwPipeBottom = 0x7,
-};
-
// =====================================================================================================================
enum BarrierFlags : uint32
{
    BarrierFlagSyncDispatch = 0x1, // Stall the following dispatch until all previous dispatch done
    BarrierFlagSyncIndirectArg = 0x2, // Prepare previous shader output for indirect argument use
-    BarrierFlagSyncPostCpWrite = 0x4, // Prepare data set by CP for shader use
+    BarrierFlagSyncPreCpWrite = 0x4, // Prepare for CP write
+    BarrierFlagSyncPostCpWrite = 0x8, // Prepare data set by CP for shader use
};

// =====================================================================================================================
@@ -185,7 +177,6 @@ class IBackend
    // Will eventually be replaced with a callback or other abstraction to avoid referencing video memory.
    virtual void WriteTimestamp(
        ClientCmdBufferHandle cmdBuffer,
-        HwPipePoint hwPipePoint,
        const Pal::IGpuMemory& timeStampVidMem,
        uint64 offset) const = 0;

diff --git a/gpurt/gpurtBuildSettings.h b/gpurt/gpurtBuildSettings.h
index 5c73247..413f719 100644
--- a/gpurt/gpurtBuildSettings.h
+++ b/gpurt/gpurtBuildSettings.h
@@ -62,7 +62,7 @@ struct CompileTimeBuildSettings
    uint32 radixSortScanLevel;
    uint32 emitCompactSize;
    uint32 enableBVHBuildDebugCounters;
-    uint32 plocRadius;
+    uint32 nnSearchRadius;
    uint32 enablePairCostCheck;
    uint32 enableVariableBitsMortonCode;
    uint32 rebraidType;
@@ -112,7 +112,7 @@ struct CompileTimeBuildSettings
#define BUILD_SETTINGS_DATA_RADIX_SORT_SCAN_LEVEL_ID 7
#define BUILD_SETTINGS_DATA_EMIT_COMPACT_SIZE_ID 8
#define BUILD_SETTINGS_DATA_ENABLE_BVH_BUILD_DEBUG_COUNTERS_ID 9
-#define BUILD_SETTINGS_DATA_PLOC_RADIUS_ID 10
+#define BUILD_SETTINGS_DATA_NN_SEARCH_RADIUS_ID 10
#define BUILD_SETTINGS_DATA_ENABLE_PAIR_COST_CHECK_ID 11
#define BUILD_SETTINGS_DATA_ENABLE_VARIABLE_BITS_MC_ID 12
#define BUILD_SETTINGS_DATA_REBRAID_TYPE_ID 13
diff --git a/gpurt/gpurtInlineFuncs.h b/gpurt/gpurtInlineFuncs.h
index ff377df..72a02f7 100644
--- a/gpurt/gpurtInlineFuncs.h
+++ b/gpurt/gpurtInlineFuncs.h
@@ -156,23 +156,6 @@ inline BufferViewFormat GetSingleComponentFormatForFormat(BufferViewFormat forma
    }
}

-//=====================================================================================================================
-// Converts the value of a Pal::HwPipePoint into a GpuRt::HwPipePoint without undefined behavior.
-inline HwPipePoint PalToGpuRtHwPipePoint(uint32 palHwPipePoint)
-{
-#define HWPIPEPOINTCASE(x) case static_cast<uint32>(HwPipePoint::x): return HwPipePoint::x
-    switch (palHwPipePoint)
-    {
-        HWPIPEPOINTCASE(HwPipeTop);
-        HWPIPEPOINTCASE(HwPipePreCs);
-        HWPIPEPOINTCASE(HwPipeBottom);
-    default:
-        PAL_ASSERT_ALWAYS_MSG("Unhandled HwPipePoint value in conversion: %u\n", palHwPipePoint);
-        return HwPipePoint::HwPipeTop;
-    }
-#undef HWPIPEPOINTCASE
-}
-
//=====================================================================================================================
// Return the number of components for a buffer view format when it's used as a vertex format.
inline uint8 GetNumComponentsForVertexFormat(VertexFormat format) diff --git a/src/gpurtBvhBatcher.cpp b/src/gpurtBvhBatcher.cpp index ba70d10..ae15781 100644 --- a/src/gpurtBvhBatcher.cpp +++ b/src/gpurtBvhBatcher.cpp @@ -104,7 +104,7 @@ void BvhBatcher::BuildAccelerationStructureBatch( // but otherwise do not participate in the rest of the build. if (isUpdate) { - builder.EmitPostBuildInfo(); + builder.EmitPostBuildInfoDispatch(); } else { @@ -146,7 +146,11 @@ void BvhBatcher::BuildAccelerationStructureBatch( { RGP_PUSH_MARKER("Process Empty BVH builds"); DispatchInitAccelerationStructure(emptyBuilders); - BuildPhase(emptyBuilders, &BvhBuilder::EmitPostBuildInfo); + if (PhaseEnabled(BuildPhaseFlags::SeparateEmitPostBuildInfoPass)) + { + Barrier(); + BuildPhase(emptyBuilders, &BvhBuilder::EmitPostBuildInfoDispatch); + } RGP_POP_MARKER(); } @@ -264,17 +268,10 @@ void BvhBatcher::BuildRaytracingAccelerationStructureBatch( { RGP_PUSH_MARKER("EmitPostBuildInfo"); Barrier(); - BuildPhase("Updates", updaters, &BvhBuilder::EmitPostBuildInfo); - BuildPhase("Builds", builders, &BvhBuilder::EmitPostBuildInfo); - + BuildPhase(BuildPhaseFlags::SeparateEmitPostBuildInfoPass, updaters, &BvhBuilder::EmitPostBuildInfoDispatch); + BuildPhase(BuildPhaseFlags::SeparateEmitPostBuildInfoPass, builders, &BvhBuilder::EmitPostBuildInfoDispatch); RGP_POP_MARKER(); } - else - { - // Execute EmitPostBuildInfo without any RGP markers - BuildPhase(updaters, &BvhBuilder::EmitPostBuildInfo); - BuildPhase(builders, &BvhBuilder::EmitPostBuildInfo); - } if (PhaseEnabled(BuildPhaseFlags::BuildDumpEvents)) { diff --git a/src/gpurtBvhBuilder.cpp b/src/gpurtBvhBuilder.cpp index 59287b6..898b172 100644 --- a/src/gpurtBvhBuilder.cpp +++ b/src/gpurtBvhBuilder.cpp @@ -1462,6 +1462,31 @@ void BvhBuilder::InitBuildConfig( #endif ; + // The builder supports one compacted size emit during the build itself. Additional postbuild info requires + // extra dispatches or CP writes. + uint32 emitCompactCount = 0; + for (uint32 i = 0; i < m_buildArgs.postBuildInfoDescCount; ++i) + { + AccelStructPostBuildInfo args = m_clientCb.pfnConvertAccelStructPostBuildInfo(m_buildArgs, i); + if (args.desc.infoType == AccelStructPostBuildInfoType::CompactedSize) + { + // Cache emit destination GPU VA for inlined emit from build shaders + m_emitCompactDstGpuVa = args.desc.postBuildBufferAddr.gpu; + emitCompactCount++; + } + else + { + m_buildConfig.nonInlinePostBuildEmits = true; + } + } + + // If maxNumPrimitives == 0, we never execute a BVH build, so we always need a separate emit pass. 
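+    // Likewise, more than one compacted-size emit cannot be inlined (only a single destination GPU VA is
+    // cached above), so fall back to the separate emit dispatch in that case as well.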
+    if ((emitCompactCount > 1) || (m_buildConfig.maxNumPrimitives == 0))
+    {
+        m_emitCompactDstGpuVa = 0;
+        m_buildConfig.nonInlinePostBuildEmits = true;
+        m_buildConfig.enableEmitCompactSizeDispatch = true;
+    }
}

// =====================================================================================================================
@@ -2194,7 +2219,10 @@ void BvhBuilder::InitBuildSettings()
        static_cast<uint32>(m_buildConfig.fp16BoxNodesInBlasMode);
    m_buildSettings.fp16BoxModeMixedSaThreshold = m_deviceSettings.fp16BoxModeMixedSaThresh;
    m_buildSettings.enableBVHBuildDebugCounters = m_deviceSettings.enableBVHBuildDebugCounters;
-    m_buildSettings.plocRadius = m_deviceSettings.plocRadius;
+    if (buildMode == BvhBuildMode::PLOC)
+    {
+        m_buildSettings.nnSearchRadius = m_deviceSettings.plocRadius;
+    }

    m_buildSettings.enablePairCostCheck = m_deviceSettings.enablePairCompressionCostCheck;
    m_buildSettings.enableVariableBitsMortonCode = m_deviceSettings.enableVariableBitsMortonCodes;
@@ -2222,24 +2250,7 @@ void BvhBuilder::InitBuildSettings()

    m_buildSettings.rtIpLevel = static_cast<uint32>(m_pDevice->GetRtIpLevel());

-    uint32 emitBufferCount = 0;
-    for (uint32 i = 0; i < m_buildArgs.postBuildInfoDescCount; ++i)
-    {
-        AccelStructPostBuildInfo args = m_clientCb.pfnConvertAccelStructPostBuildInfo(m_buildArgs, i);
-        if (args.desc.infoType == AccelStructPostBuildInfoType::CompactedSize)
-        {
-            // Cache emit destination GPU VA for inlined emit from build shaders
-            m_emitCompactDstGpuVa = args.desc.postBuildBufferAddr.gpu;
-            emitBufferCount++;
-        }
-    }
-
-    if (emitBufferCount == 1)
-    {
-        // We only support one compacted emit size from the build shaders. If we have more than one emit
-        // destination buffers, we use the compute shader path
-        m_buildSettings.emitCompactSize = 1;
-    }
+    m_buildSettings.emitCompactSize = (m_emitCompactDstGpuVa != 0);

    m_buildSettings.doEncode = (m_buildConfig.needEncodeDispatch == false);

@@ -2313,8 +2324,10 @@ void BvhBuilder::GetAccelerationStructurePrebuildInfo(
    // the build when performing the update causing page faults.
    scratchDataSize = Util::Max(scratchDataSize, updateDataSize);

-    // Some applications crash when the driver reports 0 scratch size. Use 1 DWORD instead.
-    scratchDataSize = Util::Max(static_cast<gpusize>(sizeof(uint32)), scratchDataSize);
+    // Some applications crash when the driver reports 0 scratch size.
+    // Additionally, the D3D12 debug layer does not like a scratch buffer
+    // that's only 4 bytes, so we pass back 8 bytes instead.
+    scratchDataSize = Util::Max(static_cast<gpusize>(sizeof(uint64)), scratchDataSize);

    prebuildInfo.scratchDataSizeInBytes = scratchDataSize;
    prebuildInfo.updateScratchDataSizeInBytes = updateDataSize;
@@ -2432,7 +2445,7 @@ void BvhBuilder::BuildRaytracingAccelerationStructure()

    if (m_buildArgs.postBuildInfoDescCount > 0)
    {
-        if (NeedsPostBuildEmitPass())
+        if (m_buildConfig.enableEmitCompactSizeDispatch)
        {
            // Make sure build is complete before emitting
            Barrier();
@@ -2513,7 +2526,6 @@ void BvhBuilder::PreBuildDumpEvents()
    if (result == Pal::Result::Success)
    {
        m_backend.WriteTimestamp(m_cmdBuffer,
-                                 HwPipePoint::HwPipeBottom,
                                 *m_dumpInfo.pTimeStampVidMem,
                                 m_dumpInfo.timeStampVidMemoffset);
    }
@@ -2530,7 +2542,6 @@ void BvhBuilder::PostBuildDumpEvents()
    if (m_dumpInfo.pTimeStampVidMem != nullptr)
    {
        m_backend.WriteTimestamp(m_cmdBuffer,
-                                 HwPipePoint::HwPipeBottom,
                                 *m_dumpInfo.pTimeStampVidMem,
                                 m_dumpInfo.timeStampVidMemoffset + sizeof(uint64));
    }
@@ -2739,23 +2750,17 @@ void BvhBuilder::EncodePrimitives()
// Handles writing any requested postbuild information.
void BvhBuilder::EmitPostBuildInfo() { - if (m_buildArgs.postBuildInfoDescCount == 0) - { - return; - } - const uint32 resultDataSize = m_resultBufferInfo.dataSize; const bool isBottomLevel = (m_buildArgs.inputs.type == AccelStructType::BottomLevel); - const bool useSeparateEmitPass = NeedsPostBuildEmitPass(); + for (uint32 i = 0; i < m_buildArgs.postBuildInfoDescCount; i++) { const AccelStructPostBuildInfo args = m_clientCb.pfnConvertAccelStructPostBuildInfo(m_buildArgs, i); switch (args.desc.infoType) { case AccelStructPostBuildInfoType::CompactedSize: - // If maxNumPrimitives == 0, we never execute a BVH build, so we always need a separateEmitPass - if (useSeparateEmitPass || (m_buildConfig.maxNumPrimitives == 0)) + if (m_buildConfig.enableEmitCompactSizeDispatch) { EmitAccelerationStructurePostBuildInfo(args); } @@ -2808,6 +2813,22 @@ void BvhBuilder::EmitPostBuildInfo() } } +// ===================================================================================================================== +// Handles writing any requested postbuild information via dispatch (not CP writes). +void BvhBuilder::EmitPostBuildInfoDispatch() +{ + for (uint32 i = 0; i < m_buildArgs.postBuildInfoDescCount; i++) + { + const AccelStructPostBuildInfo args = m_clientCb.pfnConvertAccelStructPostBuildInfo(m_buildArgs, i); + + if ((args.desc.infoType != AccelStructPostBuildInfoType::CompactedSize) || + m_buildConfig.enableEmitCompactSizeDispatch) + { + EmitAccelerationStructurePostBuildInfo(args); + } + } +} + // ===================================================================================================================== // Emits post-build properties for a set of acceleration structures. // This enables applications to know the output resource requirements for performing acceleration structure @@ -3137,6 +3158,7 @@ void BvhBuilder::CopyASDeserializeMode( }; // Reset the task counter in destination buffer. 
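+    // The task counter reset below is written by the command processor, so any in-flight shader work that may
+    // touch the destination must complete first (BarrierFlagSyncPreCpWrite), and the CP write itself must be
+    // made visible to shaders afterwards (BarrierFlagSyncPostCpWrite).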
+ Barrier(BarrierFlagSyncPreCpWrite); ResetTaskCounter(copyArgs.dstAccelStructAddr.gpu); Barrier(BarrierFlagSyncPostCpWrite); @@ -3195,7 +3217,7 @@ BuildPhaseFlags BvhBuilder::EnabledPhases() const { BuildPhaseFlags flags{}; - if (NeedsPostBuildEmitPass()) + if (m_buildConfig.nonInlinePostBuildEmits) { flags |= BuildPhaseFlags::SeparateEmitPostBuildInfoPass; } @@ -3451,15 +3473,6 @@ bool BvhBuilder::AllowLatePairCompression() const return enableLatePairCompression; } -// ===================================================================================================================== -// Returns true when the builder will require a separate dispatch for emitting build info -bool BvhBuilder::NeedsPostBuildEmitPass() const -{ - const bool usesSeparateEmitPass = (m_buildArgs.postBuildInfoDescCount == 0) && - (m_emitCompactDstGpuVa != 0) && (m_buildSettings.emitCompactSize == 0); - return usesSeparateEmitPass; -} - // ===================================================================================================================== // Returns true when the builder has dumping events bool BvhBuilder::HasBuildDumpEvents() const diff --git a/src/gpurtBvhBuilder.h b/src/gpurtBvhBuilder.h index 806c518..c025041 100644 --- a/src/gpurtBvhBuilder.h +++ b/src/gpurtBvhBuilder.h @@ -224,6 +224,8 @@ class BvhBuilder bool enableMergeSort; bool enableInstanceRebraid; bool rebuildAccelStruct; + bool enableEmitCompactSizeDispatch; + bool nonInlinePostBuildEmits; }; BvhBuilder( @@ -321,6 +323,7 @@ class BvhBuilder void UpdateAccelerationStructure(); void EmitPostBuildInfo(); + void EmitPostBuildInfoDispatch(); void EncodeUpdate(); @@ -413,7 +416,6 @@ class BvhBuilder // Optional phase checks bool AllowRebraid() const; bool AllowLatePairCompression() const; - bool NeedsPostBuildEmitPass() const; bool HasBuildDumpEvents() const; // Helper functions diff --git a/src/gpurtTraceSource.h b/src/gpurtTraceSource.h index 3262892..1c609f9 100644 --- a/src/gpurtTraceSource.h +++ b/src/gpurtTraceSource.h @@ -71,7 +71,11 @@ class AccelStructTraceSource : public GpuUtil::ITraceSource } // Using this notification to do any preparation work that might be required before the trace begins. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 908 + virtual void OnTraceAccepted(uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf) override +#else virtual void OnTraceAccepted() override +#endif { } @@ -134,7 +138,11 @@ class RayHistoryTraceSource : public GpuUtil::ITraceSource } // Using this notification to do any preparation work that might be required before the trace begins. 
+#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 908 + virtual void OnTraceAccepted(uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf) override +#else virtual void OnTraceAccepted() override +#endif { } diff --git a/src/shaders/BuildCommonScratch.hlsl b/src/shaders/BuildCommonScratch.hlsl index b0d3197..648a11e 100644 --- a/src/shaders/BuildCommonScratch.hlsl +++ b/src/shaders/BuildCommonScratch.hlsl @@ -682,6 +682,32 @@ bool IsLeafOrIsCollapsed( return result; } +//===================================================================================================================== +uint GetMinimumNumOfTriangles() +{ + uint minNumOfTris = 2; + { + { + minNumOfTris = 0; + } + } + + return minNumOfTris; +} + +//===================================================================================================================== +float GetTriangleIntersectionCost(uint numTris) +{ + float Ct; + { + { + Ct = SAH_COST_TRIANGLE_INTERSECTION * numTris; + } + } + + return Ct; +} + //===================================================================================================================== void MergeScratchNodes( uint scratchNodesOffset, @@ -724,18 +750,17 @@ void MergeScratchNodes( const uint numRight = FetchScratchNodeNumPrimitives(rightNode, IsLeafNode(rightNodeIndex, numActivePrims)); const uint numTris = numLeft + numRight; - const float Ct = - SAH_COST_TRIANGLE_INTERSECTION; - const float Ci = SAH_COST_AABBB_INTERSECTION; const float leftCost = IsLeafNode(leftNodeIndex, numActivePrims) ? - (Ct * ComputeBoxSurfaceArea(leftBounds)) : FetchScratchNodeCost(scratchNodesOffset, leftNodeIndex); + (GetTriangleIntersectionCost(numLeft) * ComputeBoxSurfaceArea(leftBounds)) : + FetchScratchNodeCost(scratchNodesOffset, leftNodeIndex); const float rightCost = IsLeafNode(rightNodeIndex, numActivePrims) ? 
- (Ct * ComputeBoxSurfaceArea(rightBounds)) : FetchScratchNodeCost(scratchNodesOffset, rightNodeIndex); + (GetTriangleIntersectionCost(numRight) * ComputeBoxSurfaceArea(rightBounds)) : + FetchScratchNodeCost(scratchNodesOffset, rightNodeIndex); const bool leftCollapse = (leftNode.numPrimitivesAndDoCollapse & 0x1) || IsLeafNode(leftNodeIndex, numActivePrims); @@ -745,7 +770,7 @@ void MergeScratchNodes( float bestCost = leftCost + rightCost + Ci * mergedBoxSurfaceArea; - const float collapseCost = Ct * numTris; + const float collapseCost = GetTriangleIntersectionCost(numTris); const float splitCost = Ci + leftCost / mergedBoxSurfaceArea + rightCost / mergedBoxSurfaceArea; diff --git a/src/shaders/BuildPLOC.hlsl b/src/shaders/BuildPLOC.hlsl index 079f1bc..4c2f7b1 100644 --- a/src/shaders/BuildPLOC.hlsl +++ b/src/shaders/BuildPLOC.hlsl @@ -882,7 +882,7 @@ void BuildPLOC( plocArgs.baseBatchIndicesScratchOffset = ShaderConstants.offsets.batchIndices; plocArgs.fp16BoxNodesInBlasMode = Settings.fp16BoxNodesMode; plocArgs.fp16BoxModeMixedSaThresh = Settings.fp16BoxModeMixedSaThreshold; - plocArgs.plocRadius = Settings.plocRadius; + plocArgs.plocRadius = Settings.nnSearchRadius; plocArgs.splitBoxesByteOffset = ShaderConstants.offsets.triangleSplitBoxes; plocArgs.primIndicesSortedScratchOffset = ShaderConstants.offsets.primIndicesSorted; plocArgs.unsortedBvhLeafNodesOffset = ShaderConstants.offsets.bvhLeafNodeData; diff --git a/src/shaders/BuildParallel.hlsl b/src/shaders/BuildParallel.hlsl index 8a3df86..15e197d 100644 --- a/src/shaders/BuildParallel.hlsl +++ b/src/shaders/BuildParallel.hlsl @@ -265,7 +265,7 @@ void BuildPloc( plocArgs.baseBatchIndicesScratchOffset = ShaderConstants.offsets.batchIndices; plocArgs.fp16BoxNodesInBlasMode = Settings.fp16BoxNodesMode; plocArgs.fp16BoxModeMixedSaThresh = Settings.fp16BoxModeMixedSaThreshold; - plocArgs.plocRadius = Settings.plocRadius; + plocArgs.plocRadius = Settings.nnSearchRadius; plocArgs.splitBoxesByteOffset = ShaderConstants.offsets.triangleSplitBoxes; plocArgs.primIndicesSortedScratchOffset = ShaderConstants.offsets.primIndicesSorted; plocArgs.unsortedBvhLeafNodesOffset = ShaderConstants.offsets.bvhLeafNodeData; diff --git a/src/shaders/BuildQBVH.hlsl b/src/shaders/BuildQBVH.hlsl index 60e527f..1fc9b0f 100644 --- a/src/shaders/BuildQBVH.hlsl +++ b/src/shaders/BuildQBVH.hlsl @@ -279,25 +279,19 @@ uint WritePrimitiveNode( const uint geometryIndexAndFlags = PackGeometryIndexAndFlags(geometryIndex, geometryFlags); const uint geometryPrimNodePtrsOffset = offsets.primNodePtrs + geometryInfo.primNodePtrsOffset; - const uint flattenedPrimIndex = - (geometryInfo.primNodePtrsOffset / sizeof(uint)) + scratchNode.left_or_primIndex_or_instIndex; - uint numLeafsDone; ScratchGlobal.InterlockedAdd(ShaderConstants.offsets.qbvhGlobalStackPtrs + STACK_PTRS_NUM_LEAFS_DONE_OFFSET, 1, numLeafsDone); { - uint destIndex; - if (IsTrianglePrimitiveBuild() && - ((Settings.triangleCompressionMode != NO_TRIANGLE_COMPRESSION) || Settings.doTriangleSplitting)) - { - destIndex = numLeafsDone; - } - else - { - destIndex = flattenedPrimIndex; - } + // Use 'numLeafsDone' as the destination index. This will pack all leaf nodes together + // without any holes (invalid nodes) in between. + // Note: Packing the triangle nodes this way causes the primNodePtrs to access the + // Triangle nodes in random order which results in perf drops of some Rayperf scenes + // when built/updated using 'asb'. 
Since 'asb' is a synthetic app, ignoring this perf drop + // for now, but need to revisit this change if any actual game/benchmark shows the perf. drop. + uint destIndex = numLeafsDone; const uint primitiveNodeSize = (nodeType == NODE_TYPE_USER_NODE_PROCEDURAL) ? USER_NODE_PROCEDURAL_SIZE : diff --git a/src/shaders/BuildSettings.hlsli b/src/shaders/BuildSettings.hlsli index ac6e315..2e5ff10 100644 --- a/src/shaders/BuildSettings.hlsli +++ b/src/shaders/BuildSettings.hlsli @@ -38,7 +38,7 @@ [[vk::constant_id(BUILD_SETTINGS_DATA_RADIX_SORT_SCAN_LEVEL_ID)]] uint radixSortScanLevel = 0; [[vk::constant_id(BUILD_SETTINGS_DATA_EMIT_COMPACT_SIZE_ID)]] uint emitCompactSize = 0; [[vk::constant_id(BUILD_SETTINGS_DATA_ENABLE_BVH_BUILD_DEBUG_COUNTERS_ID)]] uint enableBVHBuildDebugCounters = 0; -[[vk::constant_id(BUILD_SETTINGS_DATA_PLOC_RADIUS_ID)]] uint plocRadius = 0; +[[vk::constant_id(BUILD_SETTINGS_DATA_NN_SEARCH_RADIUS_ID)]] uint nnSearchRadius = 0; [[vk::constant_id(BUILD_SETTINGS_DATA_ENABLE_PAIR_COST_CHECK_ID)]] uint enablePairCostCheck = 0; [[vk::constant_id(BUILD_SETTINGS_DATA_ENABLE_VARIABLE_BITS_MC_ID)]] uint enableVariableBitsMortonCode = 0; [[vk::constant_id(BUILD_SETTINGS_DATA_REBRAID_TYPE_ID)]] uint rebraidType = 0; @@ -73,7 +73,7 @@ static const CompileTimeBuildSettings Settings = { radixSortScanLevel, emitCompactSize, enableBVHBuildDebugCounters, - plocRadius, + nnSearchRadius, enablePairCostCheck, enableVariableBitsMortonCode, rebraidType, diff --git a/src/shaders/CompactAS1_1.hlsl b/src/shaders/CompactAS1_1.hlsl index d9a165e..31af7df 100644 --- a/src/shaders/CompactAS1_1.hlsl +++ b/src/shaders/CompactAS1_1.hlsl @@ -391,40 +391,51 @@ void CompactASImpl1_1( // Copy leaf nodes if (type == TOP_LEVEL) { - for (uint nodeIndex = globalId; nodeIndex < srcHeader.numLeafNodes; nodeIndex += ShaderConstants.numThreads) + // Need to loop over all the prims, not just numLeafNodes. + for (uint nodeIndex = globalId; nodeIndex < srcHeader.numPrimitives; nodeIndex += ShaderConstants.numThreads) { - const uint nodeOffset - = nodeIndex * GetBvhNodeSizeLeaf(PrimitiveType::Instance, Settings.enableFusedInstanceNode); - const uint srcNodeDataOffset = srcOffsetDataLeafNodes + nodeOffset; - const uint dstNodeDataOffset = dstOffsetDataLeafNodes + nodeOffset; + // Since there could be invalid instance nodes, we need to skip over them. Invalid instance nodes + // will have corresponding prim node pointers as -1. So check for this and skip the node if invalid. + // Note: We don't need to skip invalid nodes for BLASs because their leaf nodes will be packed one + // after another, ie: no holes -> no invalid nodes. + const uint primNodePtrOffset = srcOffsetDataPrimNodePtrs + (nodeIndex * NODE_PTR_SIZE); - // Copy instance node - // Note, fused instance nodes are twice the size of normal instance nodes. We need to copy it correspondingly. 
-            if (Settings.enableFusedInstanceNode)
-            {
-                const FusedInstanceNode node = SrcBuffer.Load<FusedInstanceNode>(srcNodeDataOffset);
-                DstMetadata.Store<FusedInstanceNode>(dstNodeDataOffset, node);
-            }
-            else
+            if (SrcBuffer.Load(primNodePtrOffset) != INVALID_IDX)
            {
-                const InstanceNode node = SrcBuffer.Load<InstanceNode>(srcNodeDataOffset);
-                DstMetadata.Store<InstanceNode>(dstNodeDataOffset, node);
-            }
+                const uint nodeOffset
+                    = nodeIndex * GetBvhNodeSizeLeaf(PrimitiveType::Instance, Settings.enableFusedInstanceNode);
+                const uint srcNodeDataOffset = srcOffsetDataLeafNodes + nodeOffset;
+                const uint dstNodeDataOffset = dstOffsetDataLeafNodes + nodeOffset;
+
+                // Copy instance node
+                // Note, fused instance nodes are twice the size of normal instance nodes. We need to copy them correspondingly.
+                if (Settings.enableFusedInstanceNode)
+                {
+                    const FusedInstanceNode node = SrcBuffer.Load<FusedInstanceNode>(srcNodeDataOffset);
+                    DstMetadata.Store<FusedInstanceNode>(dstNodeDataOffset, node);
+                }
+                else
+                {
+                    const InstanceNode node = SrcBuffer.Load<InstanceNode>(srcNodeDataOffset);
+                    DstMetadata.Store<InstanceNode>(dstNodeDataOffset, node);
+                }

-            // Top level acceleration structures do not have geometry info.
+                // Top level acceleration structures do not have geometry info.

-            const uint srcNodePointer = PackNodePointer(NODE_TYPE_USER_NODE_INSTANCE, srcOffsets.leafNodes + nodeOffset);
-            const uint dstNodePointer = PackNodePointer(NODE_TYPE_USER_NODE_INSTANCE, dstOffsets.leafNodes + nodeOffset);
+                const uint srcNodePointer = PackNodePointer(NODE_TYPE_USER_NODE_INSTANCE, srcOffsets.leafNodes + nodeOffset);
+                const uint dstNodePointer = PackNodePointer(NODE_TYPE_USER_NODE_INSTANCE, dstOffsets.leafNodes + nodeOffset);

-            // Update the parent pointer and fix up the child pointer in the parent node
-            UpdateParentPointerAndChildPointer(srcMetadataSizeInBytes,
-                                               srcNodePointer,
-                                               dstMetadataSizeInBytes,
-                                               dstNodePointer);
+                // Update the parent pointer and fix up the child pointer in the parent node
+                UpdateParentPointerAndChildPointer(srcMetadataSizeInBytes,
+                                                   srcNodePointer,
+                                                   dstMetadataSizeInBytes,
+                                                   dstNodePointer);
+            }
        }
    }
    else if (srcHeader.geometryType == GEOMETRY_TYPE_TRIANGLES)
    {
+        // Unlike TOP_LEVEL, this assumes that all leaf nodes are packed contiguously without any holes in between.
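+        // (BuildQBVH now writes leaf nodes at the packed 'numLeafsDone' index, which guarantees this for BLAS.)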
for (uint nodeIndex = globalId; nodeIndex < srcHeader.numLeafNodes; nodeIndex += ShaderConstants.numThreads) { const uint nodeOffset = (nodeIndex * sizeof(TriangleNode)); diff --git a/src/shaders/Continuations1_1.hlsl b/src/shaders/Continuations1_1.hlsl index 1d17e9d..09fb6b5 100644 --- a/src/shaders/Continuations1_1.hlsl +++ b/src/shaders/Continuations1_1.hlsl @@ -158,7 +158,7 @@ static _AmdTraversalState InitTraversalState1_1( uint schedulerState = TRAVERSAL_STATE_COMMITTED_NOTHING; traversal.committed.PackState(schedulerState); - traversal.committed.currNodePtr = INVALID_NODE; + traversal.committed.SetCurrNodePtr(INVALID_NODE); // Start traversing from root node traversal.reservedNodePtr = INVALID_NODE; @@ -173,7 +173,7 @@ static _AmdTraversalState InitTraversalState1_1( traversal.stackPtr = stack.Pack(); traversal.PackStackPtrTop(INVALID_NODE); -#if GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP +#if GPURT_DEBUG_CONTINUATION_TRAVERSAL traversal.committed.PackAnyHitCallType(0); #endif @@ -354,7 +354,7 @@ static void TraversalInternal1_1( candidate.PackInstanceContribution(instanceContributionToHitGroupIndex, hitKind); candidate.PackGeometryIndex(primitiveData.geometryIndex); candidate.PackIsOpaque(isOpaque); - candidate.currNodePtr = nodePtr; + candidate.SetCurrNodePtr(nodePtr); bool hasAnyHit = false; if ((rayForceOpaque == false) && (isOpaque == false)) @@ -416,9 +416,9 @@ static void TraversalInternal1_1( candidate.PackGeometryIndex(primitiveData.geometryIndex); candidate.PackIsOpaque(isOpaque); candidate.PackInstanceContribution(instanceContributionToHitGroupIndex); - candidate.currNodePtr = nodePtr; + candidate.SetCurrNodePtr(nodePtr); -#if GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP +#if GPURT_DEBUG_CONTINUATION_TRAVERSAL uint anyHitCallType = rayForceOpaque ? ANYHIT_CALLTYPE_SKIP : ANYHIT_CALLTYPE_DUPLICATE; const bool noDuplicateAnyHit = (geometryFlags & D3D12_RAYTRACING_GEOMETRY_FLAG_NO_DUPLICATE_ANYHIT_INVOCATION); anyHitCallType = noDuplicateAnyHit ? 
ANYHIT_CALLTYPE_NO_DUPLICATE : anyHitCallType; diff --git a/src/shaders/Continuations2_0.hlsl b/src/shaders/Continuations2_0.hlsl index 73293dc..283fe20 100644 --- a/src/shaders/Continuations2_0.hlsl +++ b/src/shaders/Continuations2_0.hlsl @@ -43,7 +43,7 @@ static _AmdTraversalState InitTraversalState2_0( uint schedulerState = TRAVERSAL_STATE_COMMITTED_NOTHING; traversal.committed.PackState(schedulerState); - traversal.committed.currNodePtr = INVALID_NODE; + traversal.committed.SetCurrNodePtr(INVALID_NODE); // Start traversing from root node traversal.reservedNodePtr = INVALID_NODE; @@ -58,7 +58,7 @@ static _AmdTraversalState InitTraversalState2_0( traversal.PackStackPtrTop(0); -#if GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP +#if GPURT_DEBUG_CONTINUATION_TRAVERSAL traversal.committed.PackAnyHitCallType(0); #endif @@ -72,14 +72,8 @@ static void TraversalInternal2_0( inout_param(_AmdPrimitiveSystemState) candidate, inout_param(float2) candidateBarycentrics) { - uint rayFlags = data.ray.Flags(); - - uint boxHeuristicMode = AmdTraceRayGetBoxSortHeuristicMode(); - if ((boxHeuristicMode == BoxSortHeuristic::LargestFirstOrClosest) || - (boxHeuristicMode == BoxSortHeuristic::LargestFirstOrClosestMidPoint)) - { - boxHeuristicMode = GetBoxSortingHeuristicFromRayFlags(rayFlags, boxHeuristicMode); - } + const uint rayFlags = data.ray.Flags(); + const uint boxHeuristicMode = GetBoxHeuristicMode(); // Root bvh address for reuse const GpuVirtualAddress topBvhAddress = data.ray.AccelStruct(); @@ -322,7 +316,8 @@ static void TraversalInternal2_0( committed.PackInstanceContribution(instanceContributionToHitGroupIndex, hitKind); committed.PackGeometryIndex(primitiveData.geometryIndex, TRAVERSAL_STATE_COMMITTED_TRIANGLE_HIT, false); - committed.currNodePtr = nodePtr; + committed.SetCurrNodePtr(nodePtr); + state = TRAVERSAL_STATE_COMMITTED_TRIANGLE_HIT; // Exit traversal early if ray flags indicate end search after first hit @@ -357,7 +352,8 @@ static void TraversalInternal2_0( candidate.PackGeometryIndex(primitiveData.geometryIndex, // This #ifdef is required until the legacy GPURT_RTIP_LEVEL == 0 lib has been removed: TRAVERSAL_STATE_COMMITTED_TRIANGLE_HIT, isOpaque); - candidate.currNodePtr = nodePtr; + candidate.SetCurrNodePtr(nodePtr); + if (Options::getCpsCandidatePrimitiveMode() == CpsCandidatePrimitiveMode::DeferFirst) { haveCandidate = true; @@ -412,10 +408,10 @@ static void TraversalInternal2_0( candidate.PackGeometryIndex(primitiveData.geometryIndex); candidate.PackIsOpaque(isOpaque); candidate.PackInstanceContribution(instanceContributionToHitGroupIndex); - candidate.currNodePtr = nodePtr; + candidate.SetCurrNodePtr(nodePtr); candidate.instNodePtr = data.traversal.instNodePtr; -#if GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP +#if GPURT_DEBUG_CONTINUATION_TRAVERSAL // Determine anyHit shader call type uint anyHitCallType = rayForceOpaque ? 
ANYHIT_CALLTYPE_SKIP : ANYHIT_CALLTYPE_DUPLICATE; diff --git a/src/shaders/DecodeAS.hlsl b/src/shaders/DecodeAS.hlsl index fb8ca1a..64cb0fd 100644 --- a/src/shaders/DecodeAS.hlsl +++ b/src/shaders/DecodeAS.hlsl @@ -170,7 +170,7 @@ void DecodeAS(in uint3 globalThreadId : SV_DispatchThreadID) } else // GEOMETRY_TYPE_AABBS { - DstBuffer.Store(dstGeometryDescOffset + GEOMETRY_DESC_AABB_COUNT_OFFSET, geometryNumPrimitives); + DstBuffer.Store(dstGeometryDescOffset + GEOMETRY_DESC_AABB_COUNT_OFFSET, geometryNumPrimitives); DstBuffer.Store4(dstGeometryDescOffset + GEOMETRY_DESC_AABBS_OFFSET, uint4(addressLo, addressHi, DECODE_PRIMITIVE_STRIDE_AABB, 0)); } diff --git a/src/shaders/GpuRtLibrary.hlsl b/src/shaders/GpuRtLibrary.hlsl index 879590c..de4900c 100644 --- a/src/shaders/GpuRtLibrary.hlsl +++ b/src/shaders/GpuRtLibrary.hlsl @@ -33,6 +33,39 @@ #include "TraceRayCommon.hlsl" #include "AccelStructTracker.hlsl" +#ifdef __cplusplus +extern uint g_rtIpLevel; // defined in cputraversal +void _AmdSetRtip(uint rtIpLevel); // defined in cputraversal +#endif + +// Only the default path (Continuation) provides _AmdGetRtip(). +static RayTracingIpLevel GetRtIpLevel() +{ +#ifdef __cplusplus + switch (g_rtIpLevel) + { + case GPURT_RTIP1_1: + return RayTracingIpLevel::RtIp1_1; + case GPURT_RTIP2_0: + return RayTracingIpLevel::RtIp2_0; + default: + // Should never be called + GPU_ASSERT(false); + return RayTracingIpLevel::_None; + } +#else // __cplusplus +#if GPURT_DEBUG_CONTINUATION_TRAVERSAL + if (GPURT_RTIP_LEVEL == (uint)RayTracingIpLevel::_None) + { + return RayTracingIpLevel::_None; + } + return RayTracingIpLevel::RtIp2_0; //default to ip 2.0 +#else // GPURT_DEBUG_CONTINUATION_TRAVERSAL + return _AmdGetRtip(); // Continuation path +#endif +#endif +} + #if GPURT_BUILD_CONTINUATION && LLPC_CLIENT_INTERFACE_MAJOR_VERSION // Include the continuations library #include "GpuRtLibraryCont.hlsl" @@ -294,8 +327,13 @@ export void TraceRayInline2_0( export uint GetInstanceID( in uint64_t instanceNodePtr) // 64-bit instance node address { - const uint instanceIdAndMask = LoadDwordAtAddr(instanceNodePtr + INSTANCE_DESC_ID_AND_MASK_OFFSET); - return (instanceIdAndMask & 0x00ffffff); + uint instanceId = 0; + if (instanceNodePtr != 0) + { + const uint instanceIdAndMask = LoadDwordAtAddr(instanceNodePtr + INSTANCE_DESC_ID_AND_MASK_OFFSET); + instanceId = (instanceIdAndMask & 0x00ffffff); + } + return instanceId; } //===================================================================================================================== @@ -303,7 +341,13 @@ export uint GetInstanceID( export uint GetInstanceIndex( in uint64_t instanceNodePtr) // 64-bit instance node address { - return LoadDwordAtAddr(instanceNodePtr + sizeof(InstanceDesc) + RTIP1_1_INSTANCE_SIDEBAND_INSTANCE_INDEX_OFFSET); + uint instanceIndex = 0; + if (instanceNodePtr != 0) + { + instanceIndex = LoadDwordAtAddr(instanceNodePtr + sizeof(InstanceDesc) + + RTIP1_1_INSTANCE_SIDEBAND_INSTANCE_INDEX_OFFSET); + } + return instanceIndex; } //===================================================================================================================== @@ -313,11 +357,16 @@ export float GetObjectToWorldTransform( in uint32_t row, // row index in uint32_t col) // column index { - const uint32_t elementOffset = (row * sizeof(float4)) + (col * sizeof(float)); - return asfloat(LoadDwordAtAddr(instanceNodePtr + - sizeof(InstanceDesc) + - RTIP1_1_INSTANCE_SIDEBAND_OBJECT2WORLD_OFFSET + - elementOffset)); + float transform = 0; + if (instanceNodePtr != 0) + { + const 
uint32_t elementOffset = (row * sizeof(float4)) + (col * sizeof(float)); + transform = asfloat(LoadDwordAtAddr(instanceNodePtr + + sizeof(InstanceDesc) + + RTIP1_1_INSTANCE_SIDEBAND_OBJECT2WORLD_OFFSET + + elementOffset)); + } + return transform; } //===================================================================================================================== @@ -327,8 +376,14 @@ export float GetWorldToObjectTransform( in uint32_t row, // row index in uint32_t col) // column index { - const uint32_t elementOffset = (row * sizeof(float4)) + (col * sizeof(float)); - return asfloat(LoadDwordAtAddr(instanceNodePtr + INSTANCE_DESC_WORLD_TO_OBJECT_XFORM_OFFSET + elementOffset)); + float transform = 0; + if (instanceNodePtr != 0) + { + const uint32_t elementOffset = (row * sizeof(float4)) + (col * sizeof(float)); + transform = asfloat(LoadDwordAtAddr(instanceNodePtr + INSTANCE_DESC_WORLD_TO_OBJECT_XFORM_OFFSET + + elementOffset)); + } + return transform; } //===================================================================================================================== @@ -336,17 +391,21 @@ export float GetWorldToObjectTransform( static float3x4 GetObjectToWorld3x4( in uint64_t instanceNodePtr) { - float3x4 transform; - switch (_AmdGetRtip()) - { - default: + float3x4 transform = (float3x4)0; + + if (instanceNodePtr != 0) { - const uint offset = RTIP1_1_INSTANCE_SIDEBAND_OBJECT2WORLD_OFFSET; - transform[0] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + sizeof(InstanceDesc) + offset + 0)); - transform[1] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + sizeof(InstanceDesc) + offset + 16)); - transform[2] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + sizeof(InstanceDesc) + offset + 32)); - break; - } + switch (GetRtIpLevel()) + { + default: + { + const uint offset = RTIP1_1_INSTANCE_SIDEBAND_OBJECT2WORLD_OFFSET; + transform[0] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + sizeof(InstanceDesc) + offset + 0)); + transform[1] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + sizeof(InstanceDesc) + offset + 16)); + transform[2] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + sizeof(InstanceDesc) + offset + 32)); + break; + } + } } return transform; @@ -357,20 +416,23 @@ static float3x4 GetObjectToWorld3x4( static float3x4 GetWorldToObject3x4( in uint64_t instanceNodePtr) { - float3x4 transform; + float3x4 transform = (float3x4)0; - switch (_AmdGetRtip()) + if (instanceNodePtr != 0) { - default: - { - const uint offset = INSTANCE_DESC_WORLD_TO_OBJECT_XFORM_OFFSET; - - transform[0] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + offset + 0)); - transform[1] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + offset + 16)); - transform[2] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + offset + 32)); - - break; - } + switch (GetRtIpLevel()) + { + default: + { + const uint offset = INSTANCE_DESC_WORLD_TO_OBJECT_XFORM_OFFSET; + + transform[0] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + offset + 0)); + transform[1] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + offset + 16)); + transform[2] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + offset + 32)); + + break; + } + } } return transform; @@ -398,7 +460,12 @@ export uint64_t GetRayQuery64BitInstanceNodePtr( in uint64_t tlasBaseAddr, // 64-bit TLAS base address in uint32_t instanceNodePtr) // Instance node pointer { - return CalculateNodeAddr64(tlasBaseAddr, instanceNodePtr); + uint64_t nodeAddr = 0; + if (instanceNodePtr != 0) + { + nodeAddr = 
CalculateNodeAddr64(tlasBaseAddr, instanceNodePtr); + } + return nodeAddr; } //===================================================================================================================== @@ -429,7 +496,7 @@ static uint GetGeneralInstanceID( in uint64_t instNodeAddr) // 64-bit instance node address { uint id = 0; - switch (_AmdGetRtip()) + switch (GetRtIpLevel()) { default: { @@ -447,7 +514,7 @@ static uint GetGeneralInstanceIndex( in uint64_t instNodeAddr) // 64-bit instance node address { uint index = 0; - RayTracingIpLevel rtip = _AmdGetRtip(); + RayTracingIpLevel rtip = GetRtIpLevel(); switch (rtip) { default: @@ -467,7 +534,7 @@ static uint64_t GetRayQueryInstanceNodePtr( in uint32_t instanceNodePtr) // Instance node pointer { uint64_t instNodePtr = 0; - RayTracingIpLevel rtip = _AmdGetRtip(); + RayTracingIpLevel rtip = GetRtIpLevel(); switch (rtip) { default: @@ -490,7 +557,7 @@ export RayQueryInternal _RayQuery_Allocate() export void _RayQuery_Abort( inout_param(RayQueryInternal) rayQuery) { - uint rtIp = (uint)_AmdGetRtip(); + uint rtIp = (uint)GetRtIpLevel(); if (rtIp >= (uint)RayTracingIpLevel::RtIp2_0) { rayQuery.currNodePtr = TERMINAL_NODE; @@ -1011,7 +1078,7 @@ export TriangleData _RayQuery_FetchTrianglePosition( in bool committed) // Node pointer { TriangleData tdata; - RayTracingIpLevel rtip = _AmdGetRtip(); + RayTracingIpLevel rtip = GetRtIpLevel(); switch (rtip) { default: @@ -1030,7 +1097,7 @@ export bool _RayQuery_Proceed( in uint constRayFlags, in uint3 dispatchThreadId) { - uint rtIpLevel = ConvertRtIpLevel(_AmdGetRtip()); + uint rtIpLevel = ConvertRtIpLevel(GetRtIpLevel()); return RayQueryProceedCommon( rayQuery, constRayFlags, @@ -1051,7 +1118,7 @@ export void _RayQuery_TraceRayInline( in RayDesc rayDesc, in uint3 dispatchThreadId) { - uint rtIpLevel = ConvertRtIpLevel(_AmdGetRtip()); + uint rtIpLevel = ConvertRtIpLevel(GetRtIpLevel()); TraceRayInlineCommon(rayQuery, accelStructLo, accelStructHi, diff --git a/src/shaders/GpuRtLibraryCont.hlsl b/src/shaders/GpuRtLibraryCont.hlsl index 293fae0..f706b3d 100644 --- a/src/shaders/GpuRtLibraryCont.hlsl +++ b/src/shaders/GpuRtLibraryCont.hlsl @@ -25,8 +25,6 @@ // Include intrinsics and defines from the compiler #include "llpc/GpurtIntrinsics.h" -#ifndef __cplusplus -#endif #if DEVELOPER #include "../../gpurt/gpurtCounter.h" #endif @@ -34,20 +32,21 @@ #include "../shadersClean/common/Math.hlsli" #include "../shadersClean/common/InstanceDesc.hlsli" -// By default, Gpurt exports both non-continuation and continuation traversal functions. Dxcp picks one based on panel -// setting. -// GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP = GPURT_RTIP1_1/GPURT_RTIP2_0 -// is only used for a debug purpose. -// It supports DxcpRt (non-continuation) to use Continuation traversal. In this config, the pure continuation model does -// not work. -#ifndef GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP -#define GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP 0 -#endif +// Do not use ~0 as an invalid stack pointer, to leave it free to use as a sentinel value +#define CPS_STACK_PTR_STACKLESS_DEAD_LANE (~uint32_t(1)) +// CPS Stack pointers are dword-aligned, so we can use up to 2 bits. 
Use the second bit +// to flag a dead lane, so in particular CPS_STACK_PTR_STACKLESS_DEAD_LANE identifies a dead lane +#define CPS_STACK_PTR_DEAD_LANE_FLAG (2) +#define CPS_STACK_PTR_INVALID (CPS_STACK_PTR_STACKLESS_DEAD_LANE & ~CPS_STACK_PTR_DEAD_LANE_FLAG) -#if ((GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP == 0) && (!defined(__cplusplus))) -#define CONTINUATION_ON_GPU 1 -#else -#define CONTINUATION_ON_GPU 0 +#define DEAD_SHADER_ADDR (~uint32_t(0)) + +static bool RtIpIsAtLeast(RayTracingIpLevel level) +{ + return ((uint32_t)GetRtIpLevel()) >= ((uint32_t)level); +} + +#ifndef __cplusplus #endif #define REMAT_INSTANCE_RAY 1 @@ -96,37 +95,6 @@ #define SCHEDULING_PRIORITY_CALLABLE 6 // Maximum supported value (3 bits): 7 -#if CONTINUATION_ON_GPU == 0 -#ifdef __cplusplus -extern uint g_rtIpLevel; // defined in cputraversal -void _AmdSetRtip(uint rtIpLevel); // defined in cputraversal -#endif -static RayTracingIpLevel _AmdGetRtip() -{ - RayTracingIpLevel rtIpLevel = RayTracingIpLevel::_None; -#ifdef __cplusplus - switch (g_rtIpLevel) -#else - switch (GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP) -#endif - { - case GPURT_RTIP1_1: - rtIpLevel = RayTracingIpLevel::RtIp1_1; - break; - case GPURT_RTIP2_0: - rtIpLevel = RayTracingIpLevel::RtIp2_0; - break; - } - - return rtIpLevel; -} -#endif - -static bool RtIpIsAtLeast(RayTracingIpLevel level) -{ - return ((uint32_t)_AmdGetRtip()) >= ((uint32_t)level); -} - //===================================================================================================================== static uint GetPriorityForShaderType( DXILShaderKind shaderKind) @@ -146,6 +114,63 @@ static uint GetPriorityForShaderType( // Forward declaration for _AmdDispatchSystemData.PackDispatchId() and _AmdDispatchSystemData.DispatchId() static uint3 GetDispatchRaysDimensions(); +//===================================================================================================================== +// Apply the known set/unset bits +static uint ApplyKnownFlags( + uint incomingFlags) +{ + uint flags = incomingFlags; + +#if GPURT_CLIENT_INTERFACE_MAJOR_VERSION >= 41 + // Apply known bits common to all TraceRay calls + flags = ((flags & ~AmdTraceRayGetKnownUnsetRayFlags()) | AmdTraceRayGetKnownSetRayFlags()); +#endif + + // Apply options overrides + flags &= ~Options::getRayFlagsOverrideForceDisableMask(); + flags |= Options::getRayFlagsOverrideForceEnableMask(); + + return flags; +} + +//===================================================================================================================== +// Apply compile time pipeline config flags only, it does not apply known common flags from TraceRay call sites +static uint ApplyCompileTimePipelineConfigFlags( + uint incomingFlags) +{ + uint flags = incomingFlags; + + flags |= (AmdTraceRayGetStaticFlags() & (PIPELINE_FLAG_SKIP_PROCEDURAL_PRIMITIVES | PIPELINE_FLAG_SKIP_TRIANGLES)); +#if DEVELOPER + flags |= DispatchRaysConstBuf.profileRayFlags; +#endif + + return flags; +} + +//===================================================================================================================== +// Apply all static known flags, include both compile time pipeline config flags and known set/unset bits +static uint ApplyAllStaticallyKnownFlags( + uint incomingFlags) // The flags from TraceRay call sites, + // 0 means get Pipeline flags for all shaders in this pipeline +{ + return ApplyCompileTimePipelineConfigFlags(ApplyKnownFlags(incomingFlags)); +} + 
+//===================================================================================================================== +// Get the box sort heuristic mode according to the pipeline flags +static uint GetBoxHeuristicMode() +{ + uint boxHeuristicMode = AmdTraceRayGetBoxSortHeuristicMode(); + if ((boxHeuristicMode == BoxSortHeuristic::LargestFirstOrClosest) || + (boxHeuristicMode == BoxSortHeuristic::LargestFirstOrClosestMidPoint)) + { + boxHeuristicMode = GetBoxSortingHeuristicFromRayFlags(ApplyAllStaticallyKnownFlags(0), boxHeuristicMode); + } + + return boxHeuristicMode; +} + //===================================================================================================================== struct Vpc64 { @@ -181,6 +206,7 @@ struct Vpc64 { const uint firstMetadataBit = 32; const uint firstPriorityBitInMetadata = 16; GPU_ASSERT((vpc & 0xFFFF000000000000) == 0); + vpc &= 0x0000FFFFFFFFFFFF; vpc |= (prio64 << (firstMetadataBit + firstPriorityBitInMetadata)); return Vpc64(vpc); } @@ -216,24 +242,37 @@ struct Vpc32 { bool IsValid() { - return GetFunctionAddr() != 0; + return vpc != 0; } - void SetPriority(uint priority) + Vpc32 SetPriority(uint priority) { + if (_AmdIsLlpc()) + { + return Vpc32(vpc); + } + + vpc &= ~0x7; vpc |= priority; + + return Vpc32(vpc); } uint GetPriority() { return (uint)(vpc & 0x7); } + + static Vpc32 MakeWithPriority(Vpc32 vpc32, uint priority) + { + return vpc32.SetPriority(priority); + } }; //===================================================================================================================== // 32-bit function pointer packing/unpacking // -static Vpc64 Vpc32ToVpc64(Vpc32 vpc32, bool unpackPriority) +static Vpc64 Vpc32ToVpc64(Vpc32 vpc32) { if (_AmdIsLlpc()) { @@ -242,10 +281,7 @@ static Vpc64 Vpc32ToVpc64(Vpc32 vpc32, bool unpackPriority) Vpc64 vpc64 = Vpc64((uint64_t)(vpc32.GetFunctionAddr())); - if (unpackPriority) - { - vpc64.SetPriority(vpc32.GetPriority()); - } + vpc64.SetPriority(vpc32.GetPriority()); return vpc64; } @@ -315,8 +351,10 @@ struct _AmdDispatchSystemData return dispatchId; } - static _AmdDispatchSystemData MakeDeadLaneWithStack(); - static _AmdDispatchSystemData MakeDeadLaneWithoutStack(); + void SetDead(bool withStack) + { + nextNodePtr = withStack ? DEAD_LANE_WITH_STACK : DEAD_LANE_WITHOUT_STACK; + } uint dispatchLinearId; // Packed dispatch linear id. Combine x/y/z into 1 DWORD. 
@@ -358,27 +396,12 @@ struct _AmdRaySystemState // Incoming flags are the flags passed by TraceRay call uint IncomingFlags() { - uint incomingFlags = uint(bitFieldExtract64(packedAccelStruct, 48, 12)); -#if GPURT_CLIENT_INTERFACE_MAJOR_VERSION >= 41 - // Apply known bits common to all TraceRay calls - incomingFlags = ((incomingFlags & ~AmdTraceRayGetKnownUnsetRayFlags()) | AmdTraceRayGetKnownSetRayFlags()); -#endif - // Apply options overrides - incomingFlags &= ~Options::getRayFlagsOverrideForceDisableMask(); - incomingFlags |= Options::getRayFlagsOverrideForceEnableMask(); - - return incomingFlags; + return uint(bitFieldExtract64(packedAccelStruct, 48, 12)); } uint Flags() { - uint rayFlags = IncomingFlags(); - // Apply compile time pipeline config flags into the ray flags - rayFlags |= (AmdTraceRayGetStaticFlags() & (PIPELINE_FLAG_SKIP_PROCEDURAL_PRIMITIVES | PIPELINE_FLAG_SKIP_TRIANGLES)); -#if DEVELOPER - rayFlags |= DispatchRaysConstBuf.profileRayFlags; -#endif - return rayFlags; + return ApplyAllStaticallyKnownFlags(IncomingFlags()); } void SetAnyHitDidAccept(bool value) @@ -421,7 +444,7 @@ struct _AmdPrimitiveSystemState packedGeometryIndex(0), packedInstanceContribution(0) , currNodePtr(INVALID_IDX) -#if GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP +#if GPURT_DEBUG_CONTINUATION_TRAVERSAL , packedType(0) #endif { @@ -441,6 +464,10 @@ struct _AmdPrimitiveSystemState // hitKind [31 : 24] uint currNodePtr; + void SetCurrNodePtr(uint p) + { + currNodePtr = p; + } uint GeometryIndex() { @@ -519,7 +546,7 @@ struct _AmdPrimitiveSystemState packedInstanceContribution = bitFieldInsert(packedInstanceContribution, 24, 8, hitKind); } -#if GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP +#if GPURT_DEBUG_CONTINUATION_TRAVERSAL // The following member data are only used in DEBUG uint packedType; // IsProcedural: [31] - 1 bit // AnyhitCallType: [1 : 0] - 2 bits @@ -598,9 +625,7 @@ struct _AmdTraversalState // field becomes re-used for something else in non-rebraid mode. uint reservedNodePtr; // RTIPv2.0 (lastNodePtr) -#if GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP == 0 uint32_t packedReturnAddr; // The address of the function to return to, packed into 32 bits. 
-#endif uint InstanceContribution() { @@ -629,16 +654,16 @@ struct _AmdTraversalState void PackStackPtrTop(uint ptr) { - GPU_ASSERT((_AmdGetRtip() == RayTracingIpLevel::RtIp1_1) || - (_AmdGetRtip() == RayTracingIpLevel::RtIp2_0)); + GPU_ASSERT((GetRtIpLevel() == RayTracingIpLevel::RtIp1_1) || + (GetRtIpLevel() == RayTracingIpLevel::RtIp2_0)); packedStackTopOrParentPointer = ptr; } uint StackPtrTop() { - GPU_ASSERT((_AmdGetRtip() == RayTracingIpLevel::RtIp1_1) || - (_AmdGetRtip() == RayTracingIpLevel::RtIp2_0)); + GPU_ASSERT((GetRtIpLevel() == RayTracingIpLevel::RtIp1_1) || + (GetRtIpLevel() == RayTracingIpLevel::RtIp2_0)); return packedStackTopOrParentPointer; } @@ -659,14 +684,14 @@ struct _AmdTraversalState return committed.State(); } - void PackReturnAddress(Vpc64 returnAddr) + void SetReturnAddress(Vpc32 returnAddr) { - packedReturnAddr = Vpc64ToVpc32(returnAddr).GetU32(); + packedReturnAddr = returnAddr.GetU32(); } - Vpc64 ReturnAddress() + Vpc32 ReturnAddress() { - return Vpc32ToVpc64(Vpc32(packedReturnAddr), true); + return Vpc32(packedReturnAddr); } }; @@ -712,6 +737,26 @@ struct _AmdRayHistoryCounter }; #endif +namespace Traits +{ + +static bool HasStacklessDeadLanes() +{ + return false; +} + +static bool HasStackfulDeadLanes() +{ + return Options::getPersistentLaunchEnabled(); +} + +static bool HasDeadLanes() +{ + return HasStackfulDeadLanes() || HasStacklessDeadLanes(); +} + +} // namespace Traits + //===================================================================================================================== struct _AmdSystemData { @@ -723,52 +768,56 @@ struct _AmdSystemData bool IsDeadLaneWithoutStack() { - // This type of dead lane is only possible when the continuations stack is in global memory. - // Explicitly check the compile time setting to help the compiler eliminte unnecessary code at runtime. - return (dispatch.nextNodePtr == DEAD_LANE_WITHOUT_STACK) && _AmdContinuationStackIsGlobal(); + return Traits::HasStacklessDeadLanes() && dispatch.nextNodePtr == DEAD_LANE_WITHOUT_STACK; } bool IsDeadLaneWithStack() { - // This type of dead lane is only possible when persistent launch is enabled. - // Explicitly check the compile time setting to help the compiler eliminte unnecessary code at runtime. - return (dispatch.nextNodePtr == DEAD_LANE_WITH_STACK) && Options::getPersistentLaunchEnabled(); + return Traits::HasStackfulDeadLanes() && dispatch.nextNodePtr == DEAD_LANE_WITH_STACK; + } + + bool IsDeadLane() + { + return IsDeadLaneWithoutStack() || IsDeadLaneWithStack(); } bool IsTraversal() { + GPU_ASSERT(!IsDeadLane()); return IsValidNode(dispatch.nextNodePtr); } bool IsChsOrMiss(in uint state) { + GPU_ASSERT(!IsDeadLane()); return (state >= TRAVERSAL_STATE_COMMITTED_NOTHING); } bool IsMiss(in uint state) { + GPU_ASSERT(!IsDeadLane()); return IsChsOrMiss(state) && !IsValidNode(traversal.committed.instNodePtr); } bool IsAhs(in uint state) { + GPU_ASSERT(!IsDeadLane()); return (state == TRAVERSAL_STATE_CANDIDATE_NON_OPAQUE_TRIANGLE); } bool IsIs(in uint state) { + GPU_ASSERT(!IsDeadLane()); return ((state == TRAVERSAL_STATE_CANDIDATE_PROCEDURAL_PRIMITIVE) || (state == TRAVERSAL_STATE_CANDIDATE_NON_OPAQUE_PROCEDURAL_PRIMITIVE)); } bool IsChs(in uint state) { + GPU_ASSERT(!IsDeadLane()); return IsChsOrMiss(state) && IsValidNode(traversal.committed.instNodePtr); } - static _AmdSystemData MakeDeadLaneWithStack(); - static _AmdSystemData MakeDeadLaneWithoutStack(); - // Note: _AmdDispatchSystemData must be the first member of _AmdSystemData. 
This allows us to save some VGPRs if // we need to call a function that takes _AmdSystemData but doesn't actually need ray or traversal data. // For example, the launch kernel can make a dead lane and enqueue traversal with just dispatch.nextNodePtr. @@ -816,24 +865,30 @@ struct _AmdTraversalResultData // 2) otherwise the first hitted non-opaque primitive. }; -#if GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP == 0 +#if ((GPURT_DEBUG_CONTINUATION_TRAVERSAL == 0) && (!defined(__cplusplus))) // Define specialized intrinsics. // We use macros because HLSL does not have varargs or generics. // The macros and intrinsics are defined by llpc. -DECLARE_ENQUEUE(, uint64_t returnAddr, _AmdSystemData data) +DECLARE_ENQUEUE(, uint32_t returnAddr, _AmdSystemData data) -DECLARE_ENQUEUE(Traversal, uint64_t dummyReturnAddr, _AmdSystemData data) -DECLARE_ENQUEUE(TraversalDead, uint64_t dummyReturnAddr, _AmdDispatchSystemData data) -DECLARE_ENQUEUE(RayGen, uint64_t dummyReturnAddr, _AmdDispatchSystemData data) +DECLARE_ENQUEUE(Traversal, uint32_t dummyReturnAddr, _AmdSystemData data) +DECLARE_ENQUEUE(TraversalDead, uint32_t dummyReturnAddr, _AmdDispatchSystemData data) +DECLARE_ENQUEUE(RayGen, uint32_t dummyReturnAddr, _AmdDispatchSystemData data) -DECLARE_ENQUEUE(AnyHit, uint64_t returnAddr, _AmdAnyHitSystemData data, float2 candidateBarycentrics) -DECLARE_ENQUEUE(Intersection, uint64_t returnAddr, _AmdAnyHitSystemData data) +DECLARE_ENQUEUE(AnyHit, uint32_t returnAddr, _AmdAnyHitSystemData data, float2 candidateBarycentrics) +DECLARE_ENQUEUE(Intersection, uint32_t returnAddr, _AmdAnyHitSystemData data) -DECLARE_AWAIT(AnyHit, _AmdAnyHitSystemData, uint64_t returnAddr, _AmdAnyHitSystemData data) -DECLARE_AWAIT(CallShader, _AmdDispatchSystemData, uint64_t returnAddr, _AmdDispatchSystemData data) +DECLARE_AWAIT(AnyHit, _AmdAnyHitSystemData, uint32_t returnAddr, _AmdAnyHitSystemData data) +DECLARE_AWAIT(CallShader, _AmdDispatchSystemData, uint32_t returnAddr, _AmdDispatchSystemData data) +#ifndef PASS_DUMMY_RET_ADDR // No returnAddr argument. The return address is instead included in the passed system data. DECLARE_AWAIT(Traversal, _AmdDispatchSystemData, _AmdSystemData data) +#else // PASS_DUMMY_RET_ADDR +// Pass a dummy return address for consistency reasons. +// The actual return address is included in the passed system data. 
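+// Illustratively, the declaration below is assumed to expand to roughly the following prototype (the actual
+// macro is provided by llpc, so the exact shape is a sketch; compare the call site in _cont_TraceRay):
+//   _AmdDispatchSystemData _AmdAwaitTraversal(VpcIntTy addr, VpcIntTy dummyReturnAddr, _AmdSystemData data);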
+DECLARE_AWAIT(Traversal, _AmdDispatchSystemData, VpcIntTy dummyReturnAddr, _AmdSystemData data) +#endif DECLARE_RESTORE_SYSTEM_DATA(, _AmdDispatchSystemData data) DECLARE_RESTORE_SYSTEM_DATA(AnyHit, _AmdAnyHitSystemData data) @@ -853,64 +908,37 @@ DECLARE_CONT_STACK_LOAD_LAST_USE(U32, uint32_t) DECLARE_CONT_STACK_STORE(U32, uint32_t value) DECLARE_CONT_STACK_LOAD_LAST_USE(U64, uint64_t) DECLARE_CONT_STACK_STORE(U64, uint64_t value) -#endif - -inline _AmdDispatchSystemData _AmdDispatchSystemData::MakeDeadLaneWithStack() -{ - _AmdDispatchSystemData data = _AmdGetUninitializedDispatchSystemData(); - data.nextNodePtr = DEAD_LANE_WITH_STACK; - return data; -} - -inline _AmdDispatchSystemData _AmdDispatchSystemData::MakeDeadLaneWithoutStack() -{ - _AmdDispatchSystemData data = _AmdGetUninitializedDispatchSystemData(); - data.nextNodePtr = DEAD_LANE_WITHOUT_STACK; - return data; -} - -inline _AmdSystemData _AmdSystemData::MakeDeadLaneWithStack() -{ - _AmdSystemData data = _AmdGetUninitializedSystemData(); - data.dispatch.nextNodePtr = DEAD_LANE_WITH_STACK; - return data; -} - -inline _AmdSystemData _AmdSystemData::MakeDeadLaneWithoutStack() +#else // ((GPURT_DEBUG_CONTINUATION_TRAVERSAL == 0) && (!defined(__cplusplus))) +//===================================================================================================================== +inline _AmdDispatchSystemData _AmdGetUninitializedDispatchSystemData() { - _AmdSystemData data = _AmdGetUninitializedSystemData(); - data.dispatch.nextNodePtr = DEAD_LANE_WITHOUT_STACK; - return data; + return (_AmdDispatchSystemData)0; } //===================================================================================================================== -// Return the argument. -static Vpc64 GetVpc64FromShaderId(Vpc32 shaderId, uint priority) +inline _AmdSystemData _AmdGetUninitializedSystemData() { - Vpc64 vpc64 = Vpc32ToVpc64(shaderId, /* unpackPriority = */ false); - vpc64.SetPriority(priority); - return vpc64; + return (_AmdSystemData)0; } +#endif //===================================================================================================================== -static Vpc64 GetVpc64FromShaderIdAddr(GpuVirtualAddress addr, uint priority) +static Vpc32 GetVpcFromShaderIdAddr(GpuVirtualAddress addr) { #ifdef __cplusplus return 1; #else - Vpc32 shaderId = Vpc32(ConstantLoadDwordAtAddr(addr)); - return GetVpc64FromShaderId(shaderId, priority); + return Vpc32(ConstantLoadDwordAtAddr(addr)); #endif } //===================================================================================================================== -static Vpc64 GetVpc64FromShaderIdTable( +static Vpc32 GetVpcFromShaderIdTable( GpuVirtualAddress tableAddress, uint index, - uint stride, - uint priority) + uint stride) { - return GetVpc64FromShaderIdAddr(tableAddress + stride * index, priority); + return GetVpcFromShaderIdAddr(tableAddress + stride * index); } //===================================================================================================================== @@ -929,15 +957,6 @@ static Vpc32 GetAnyHit32BitShaderId( return Vpc32(ConstantLoadDwordAtAddr(tableVa + offset + 8)); } -//===================================================================================================================== -// Returns the 64-bit VPC for the given AHS by loading its shader address, and setting the AHS priority. 
-static Vpc64 GetAnyHitAddr( - uint hitGroupRecordIndex) -{ - Vpc32 shaderId = GetAnyHit32BitShaderId(hitGroupRecordIndex); - return GetVpc64FromShaderId(shaderId, SCHEDULING_PRIORITY_AHS); -} - //===================================================================================================================== // Returns whether the corresponding AHS is non-null. static bool AnyHitIsNonNull( @@ -1002,13 +1021,6 @@ static float4x3 WorldToObject4x3(in uint64_t tlasBaseAddr, in uint instNodePtr) return transpose(WorldToObject3x4(tlasBaseAddr, instNodePtr)); } -//===================================================================================================================== -// Implementation of DispatchRaysIndex. -export uint3 _cont_DispatchRaysIndex3(in _AmdDispatchSystemData data) -{ - return data.DispatchId(); -} - //===================================================================================================================== // Load dispatch dimensions from constant buffer. static uint3 GetDispatchRaysDimensions() @@ -1035,78 +1047,6 @@ static uint GetPersistentDispatchSize() return min(DispatchRaysConstBuf.rayDispatchMaxGroups, groupsNeeded); } -//===================================================================================================================== -// Implementation of DispatchRaysDimensions(). -export uint3 _cont_DispatchRaysDimensions3(in _AmdDispatchSystemData data) -{ - return GetDispatchRaysDimensions(); -} - -#if CONTINUATION_ON_GPU -//===================================================================================================================== -// Return the hit state for AnyHit and Intersection -export _AmdPrimitiveSystemState _cont_GetCandidateState(in _AmdAnyHitSystemData data) -{ - return data.candidate; -} - -//===================================================================================================================== -// Return the hit state for ClosestHit -export _AmdPrimitiveSystemState _cont_GetCommittedState(in _AmdSystemData data) -{ - return data.traversal.committed; -} - -//===================================================================================================================== -export float3 _cont_WorldRayOrigin3(in _AmdSystemData state) -{ - return state.ray.origin; -} - -//===================================================================================================================== -export float3 _cont_WorldRayDirection3(in _AmdSystemData state) -{ - return state.ray.direction; -} - -//===================================================================================================================== -export float _cont_RayTMin(in _AmdSystemData state) -{ - return state.ray.tMin; -} - -//===================================================================================================================== -export uint _cont_RayFlags(in _AmdSystemData state) -{ - return state.ray.IncomingFlags(); -} - -//===================================================================================================================== -export uint _cont_InstanceInclusionMask(in _AmdSystemData data) -{ - return ExtractInstanceInclusionMask(data.ray.traceParameters); -} - -//===================================================================================================================== -export float _cont_RayTCurrent(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) -{ - if (_AmdGetShaderKind() == DXILShaderKind::Intersection) - { - // The intersection shader is an exception. 
While the system data is usually about the candidate hit, the - // current t must be from the committed hit. - primitive = _cont_GetCommittedState(data); - } - - float tCurrentHw = 0.f; - { - tCurrentHw = primitive.rayTCurrent; - } - - // AMD Gpu shifts the origin, so rayTCurrent is between 0 and (tMaxApp - tMinApp). Add tMinApp back for App's use. - return tCurrentHw + data.ray.tMin; -} -#endif - //===================================================================================================================== // Map a thread to a ray, some threads could end up with non-existent (invalid) rays. // Note D3D12_DISPATCH_RAYS_DESC::(w x h x d) are organized to DispatchDims = (?, d, 1). @@ -1190,77 +1130,156 @@ static uint3 GetDispatchId(uint width, uint height, uint dispatchId) return uint3(xTile * TileWidth + x, yTile * TileHeight + y, z); } +#ifdef __cplusplus //===================================================================================================================== -export uint _cont_InstanceIndex(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +// Helper function for cpp only +static float3 mul(in float3 v, in float4x3 m) { - - return ConstantLoadDwordAtAddr( - GetInstanceNodeAddr(data.ray.AccelStruct(), primitive.instNodePtr) + - INSTANCE_NODE_EXTRA_OFFSET + RTIP1_1_INSTANCE_SIDEBAND_INSTANCE_INDEX_OFFSET); + float3 r; + r.x = dot(m[0], v); + r.y = dot(m[1], v); + r.z = dot(m[2], v); + return r; } +#endif +#if ((GPURT_DEBUG_CONTINUATION_TRAVERSAL == 0) && (!defined(__cplusplus))) //===================================================================================================================== -export uint _cont_InstanceID(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +// Implementation of DispatchRaysIndex. +export uint3 _cont_DispatchRaysIndex3(in _AmdDispatchSystemData data) { - - return ConstantLoadDwordAtAddr( - GetInstanceNodeAddr(data.ray.AccelStruct(), primitive.instNodePtr) + INSTANCE_DESC_ID_AND_MASK_OFFSET) & 0x00ffffff; + return data.DispatchId(); } //===================================================================================================================== -export uint _cont_GeometryIndex(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +// Implementation of DispatchRaysDimensions(). 
+export uint3 _cont_DispatchRaysDimensions3(in _AmdDispatchSystemData data) { - return primitive.GeometryIndex(); + return GetDispatchRaysDimensions(); } //===================================================================================================================== -export uint _cont_PrimitiveIndex(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +// Return the hit state for AnyHit and Intersection +export _AmdPrimitiveSystemState _cont_GetCandidateState(in _AmdAnyHitSystemData data) { - return primitive.primitiveIndex; + return data.candidate; } //===================================================================================================================== -export float4x3 _cont_ObjectToWorld4x3(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +// Return the hit state for ClosestHit +export _AmdPrimitiveSystemState _cont_GetCommittedState(in _AmdSystemData data) { - return ObjectToWorld4x3(data.ray.AccelStruct(), primitive.instNodePtr); + return data.traversal.committed; } //===================================================================================================================== -export float4x3 _cont_WorldToObject4x3(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +export float3 _cont_WorldRayOrigin3(in _AmdSystemData state) { - return WorldToObject4x3(data.ray.AccelStruct(), primitive.instNodePtr); + return state.ray.origin; } //===================================================================================================================== -export TriangleData _cont_TriangleVertexPositions(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +export float3 _cont_WorldRayDirection3(in _AmdSystemData state) { - const GpuVirtualAddress instanceAddr = GetInstanceNodeAddr(data.ray.AccelStruct(), primitive.instNodePtr); - { - return FetchTriangleFromNode(GetInstanceAddr(FetchInstanceDescAddr(instanceAddr)), primitive.currNodePtr); - } + return state.ray.direction; } -#ifdef __cplusplus //===================================================================================================================== -// Helper function for cpp only -static float3 mul(in float3 v, in float4x3 m) +export float _cont_RayTMin(in _AmdSystemData state) { - float3 r; - r.x = dot(m[0], v); - r.y = dot(m[1], v); - r.z = dot(m[2], v); - return r; + return state.ray.tMin; } -#endif //===================================================================================================================== -export float3 _cont_ObjectRayOrigin3(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +export uint _cont_RayFlags(in _AmdSystemData state) { - return mul(float4(data.ray.origin, 1.0), WorldToObject4x3(data.ray.AccelStruct(), primitive.instNodePtr)); + // Get the flags passed by TraceRay call and apply the known set/unset bits. 
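+    // ApplyKnownFlags is assumed to follow the pattern of the pre-continuation path shown above, i.e. roughly:
+    //   flags = (flags & ~AmdTraceRayGetKnownUnsetRayFlags()) | AmdTraceRayGetKnownSetRayFlags();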
+ return ApplyKnownFlags(state.ray.IncomingFlags()); } //===================================================================================================================== -export float3 _cont_ObjectRayDirection3(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +export uint _cont_InstanceInclusionMask(in _AmdSystemData data) +{ + return ExtractInstanceInclusionMask(data.ray.traceParameters); +} + +//===================================================================================================================== +export float _cont_RayTCurrent(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +{ + if (_AmdGetShaderKind() == DXILShaderKind::Intersection) + { + // The intersection shader is an exception. While the system data is usually about the candidate hit, the + // current t must be from the committed hit. + primitive = _cont_GetCommittedState(data); + } + + float tCurrentHw = 0.f; + { + tCurrentHw = primitive.rayTCurrent; + } + + // AMD Gpu shifts the origin, so rayTCurrent is between 0 and (tMaxApp - tMinApp). Add tMinApp back for App's use. + return tCurrentHw + data.ray.tMin; +} + +//===================================================================================================================== +export uint _cont_InstanceIndex(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +{ + + return ConstantLoadDwordAtAddr( + GetInstanceNodeAddr(data.ray.AccelStruct(), primitive.instNodePtr) + + INSTANCE_NODE_EXTRA_OFFSET + RTIP1_1_INSTANCE_SIDEBAND_INSTANCE_INDEX_OFFSET); +} + +//===================================================================================================================== +export uint _cont_InstanceID(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +{ + + return ConstantLoadDwordAtAddr( + GetInstanceNodeAddr(data.ray.AccelStruct(), primitive.instNodePtr) + INSTANCE_DESC_ID_AND_MASK_OFFSET) & 0x00ffffff; +} + +//===================================================================================================================== +export uint _cont_GeometryIndex(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +{ + return primitive.GeometryIndex(); +} + +//===================================================================================================================== +export uint _cont_PrimitiveIndex(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +{ + return primitive.primitiveIndex; +} + +//===================================================================================================================== +export float4x3 _cont_ObjectToWorld4x3(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +{ + return ObjectToWorld4x3(data.ray.AccelStruct(), primitive.instNodePtr); +} + +//===================================================================================================================== +export float4x3 _cont_WorldToObject4x3(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +{ + return WorldToObject4x3(data.ray.AccelStruct(), primitive.instNodePtr); +} + +//===================================================================================================================== +export TriangleData _cont_TriangleVertexPositions(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +{ + const GpuVirtualAddress instanceAddr = GetInstanceNodeAddr(data.ray.AccelStruct(), primitive.instNodePtr); + { + return FetchTriangleFromNode(GetInstanceAddr(FetchInstanceDescAddr(instanceAddr)), primitive.currNodePtr); + } +} + 
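+// The _cont_* accessors above back the corresponding DXR intrinsics once the compiler lowers them. A rough
+// correspondence (an illustrative assumption; the exact lowering is done by the llpc middle-end):
+//   HLSL in a closest-hit shader            conceptually becomes
+//   uint     idx = InstanceIndex();     ->  _cont_InstanceIndex(data, committed)
+//   uint     id  = InstanceID();        ->  _cont_InstanceID(data, committed)
+//   float4x3 m   = ObjectToWorld4x3();  ->  _cont_ObjectToWorld4x3(data, committed)
+//   where committed = _cont_GetCommittedState(data).
+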
+//===================================================================================================================== +export float3 _cont_ObjectRayOrigin3(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +{ + return mul(float4(data.ray.origin, 1.0), WorldToObject4x3(data.ray.AccelStruct(), primitive.instNodePtr)); +} + +//===================================================================================================================== +export float3 _cont_ObjectRayDirection3(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) { return mul(float4(data.ray.direction, 0.0), WorldToObject4x3(data.ray.AccelStruct(), primitive.instNodePtr)); } @@ -1353,7 +1372,6 @@ export uint _cont_GetContinuationStackAddr() { uint offset = 0; -#if GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP == 0 if (_AmdContinuationStackIsGlobal()) { const uint3 threadIdInGroup = AmdExtThreadIdInGroupCompute(); @@ -1366,7 +1384,6 @@ export uint _cont_GetContinuationStackAddr() offset = id * DispatchRaysConstBuf.cpsFrontendStackSize; } else -#endif { offset = #if GPURT_CLIENT_INTERFACE_MAJOR_VERSION < 36 @@ -1387,27 +1404,26 @@ export uint64_t _cont_GetContinuationStackGlobalMemBase() } //===================================================================================================================== -static Vpc64 GetTraversalVpc64() +static Vpc32 GetTraversalVpc32() { // NOTE: DXCP uses a table for TraceRay, thus a load to traceRayGpuVa retrieves the actual traversal function // address. But Vulkan does not use the table so far, traceRayGpuVa is already the traversal function address. - return Vpc64(PackUint64(DispatchRaysConstBuf.traceRayGpuVaLo, - DispatchRaysConstBuf.traceRayGpuVaHi)); + return Vpc64ToVpc32(Vpc64(PackUint64(DispatchRaysConstBuf.traceRayGpuVaLo, + DispatchRaysConstBuf.traceRayGpuVaHi))); } //===================================================================================================================== -static Vpc64 GetTraversalVpc64PwgDead() +static Vpc32 GetTraversalVpc32PwgDead() { - return Vpc64(PackUint64(DispatchRaysConstBuf.traceRayGpuVaLo, - DispatchRaysConstBuf.traceRayGpuVaHi)); + return Vpc64ToVpc32(Vpc64(PackUint64(DispatchRaysConstBuf.traceRayGpuVaLo, + DispatchRaysConstBuf.traceRayGpuVaHi))); } //===================================================================================================================== -static Vpc64 GetRayGenVpc64() +static Vpc32 GetRayGenVpc32() { - return GetVpc64FromShaderIdAddr(PackUint64(DispatchRaysConstBuf.rayGenerationTableAddressLo, - DispatchRaysConstBuf.rayGenerationTableAddressHi), - SCHEDULING_PRIORITY_RGS); + return GetVpcFromShaderIdAddr(PackUint64(DispatchRaysConstBuf.rayGenerationTableAddressLo, + DispatchRaysConstBuf.rayGenerationTableAddressHi)); } //===================================================================================================================== @@ -1460,6 +1476,162 @@ export uint _cont_GetSbtStride() } } +//===================================================================================================================== +// ReportHit implementation that is called from the intersection shader. +// May call the AnyHit shader. 
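+// Flow, summarizing the body below: THit is rebased against ray.tMin, the candidate is rejected if it lies
+// outside [0, committed t], and otherwise committed; for a non-opaque hit with a non-null AHS, the AnyHit
+// shader is awaited first and its accept/ignore decision is returned.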
+export bool _cont_ReportHit(inout_param(_AmdAnyHitSystemData) data, float THit, uint HitKind) +{ + // TODO Reuse shader record index computed in Traversal + // TODO Check for closest hit and duplicate anyHit calling + + THit -= data.base.ray.tMin; + float tCurrentCommitted = 0.f; + { + tCurrentCommitted = data.base.traversal.committed.rayTCurrent; + } + + if ((THit < 0.f) || (THit > tCurrentCommitted)) + { + // Discard the hit candidate and hint the compiler to not keep the + // values alive, which will remove redundant moves. + data.candidate.rayTCurrent = _AmdGetUninitializedF32(); + // Don't discard the hit kind as it is bit packed and cannot be discarded partially. + return false; + } + + data.candidate.rayTCurrent = THit; + data.candidate.PackHitKind(HitKind); + + uint isOpaque = true; + { + PrimitiveData primitiveData; + InstanceDesc desc; + + { + // Get primitive nodes to process based on candidate or committed hit + const uint tlasNodePtr = data.candidate.instNodePtr; + + const GpuVirtualAddress tlasAddr = data.base.ray.AccelStruct() + ExtractNodePointerOffset(tlasNodePtr); + desc = FetchInstanceDescAddr(tlasAddr); + isOpaque = data.candidate.IsOpaque(); + } + } + + if (!isOpaque) + { + uint hitGroupRecordIndex = 0; + { + hitGroupRecordIndex = data.base.dispatch.shaderRecIdx; + } + // Compute hit group address and fetch shader identifiers + const Vpc32 anyHitAddr = GetAnyHit32BitShaderId(hitGroupRecordIndex); + + if (anyHitAddr.IsValid()) + { + // Call AnyHit + // Hit attributes are added as an additional argument by the compiler + Vpc32 resumeAddr = Vpc32::MakeWithPriority(Vpc32(_AmdGetResumePointAddr()), SCHEDULING_PRIORITY_IS); + data = _AmdAwaitAnyHit(anyHitAddr.GetU32(), resumeAddr.GetU32(), data); + _AmdRestoreSystemDataAnyHit(data); + return data.base.ray.AnyHitDidAccept(); + } + else + { + _cont_AcceptHit(data); + _AmdAcceptHitAttributes(data); // changes data.base.traversal.committedBarycentrics plus up-to-6 DW data in payload + return true; + } + } + else + { + _cont_AcceptHit(data); + _AmdAcceptHitAttributes(data); + return true; + } +} + +//===================================================================================================================== +// CallShader implementation +export void _cont_CallShader(inout_param(_AmdDispatchSystemData) data, uint index) +{ + const uint64_t callableTableBaseAddress = + PackUint64(DispatchRaysConstBuf.callableTableBaseAddressLo, DispatchRaysConstBuf.callableTableBaseAddressHi); + + if (callableTableBaseAddress == 0) + { + // TODO: It might be better to AwaitSelf here, adding an artificial suspend point. + // For the common case of non-null callable shaders, this would reduce + // the size of compiled shaders, as the post-CallShader() part is unreachable, + // also simplifying manual testing with suspend points. + // For null callable shaders, it has the advantage of allowing + // to reconverge on the resume function if implemented in a way that yields only + // a single resume function. + return; + } + + const Vpc32 addr = GetVpcFromShaderIdTable(callableTableBaseAddress, + index, + DispatchRaysConstBuf.callableTableStrideInBytes); + + if (!addr.IsValid()) + { + // See TODO above on how to handle this case better. 
+        return;
+    }
+
+    const uint callerShaderRecIdx = data.shaderRecIdx;
+    data.shaderRecIdx = index; // the record index used by the callable shader
+
+    const DXILShaderKind enclosingShaderType = _AmdGetShaderKind();
+    const uint resumePrio = GetPriorityForShaderType(enclosingShaderType);
+    const Vpc32 resumeAddr = Vpc32::MakeWithPriority(Vpc32(_AmdGetResumePointAddr()), resumePrio);
+
+    data = _AmdAwaitCallShader(addr.GetU32(), resumeAddr.GetU32(), data);
+
+    // for the resume part.
+    data.shaderRecIdx = callerShaderRecIdx; // restores callerShaderRecIdx
+    _AmdRestoreSystemData(data); // llvm inserts amd.dx.setLocalRootIndex(data.shaderRecIdx)
+}
+
+//=====================================================================================================================
+// Returns the packed 32-bit VPC of the miss shader (the low dword of its shader ID) and sets up the dispatch data
+// to have the correct shader record index. Returns an invalid Vpc32(0) when there is no miss shader table.
+static Vpc32 SetupMissShader(inout_param(_AmdSystemData) data, out_param(uint) shaderRecIdx)
+{
+    const uint64_t missTableBaseAddress =
+        PackUint64(DispatchRaysConstBuf.missTableBaseAddressLo, DispatchRaysConstBuf.missTableBaseAddressHi);
+    if (missTableBaseAddress == 0)
+    {
+        shaderRecIdx = 0;
+        return Vpc32(0);
+    }
+
+    shaderRecIdx = ExtractMissShaderIndex(data.ray.traceParameters);
+
+    // Calculate miss shader record address
+    return GetVpcFromShaderIdTable(missTableBaseAddress,
+                                   shaderRecIdx,
+                                   DispatchRaysConstBuf.missTableStrideInBytes);
+}
+
+//=====================================================================================================================
+static HitGroupInfo GetHitGroupInfo(
+    in _AmdSystemData data,
+    in uint state,
+    in _AmdPrimitiveSystemState candidate)
+{
+    uint geometryIndex = (state < TRAVERSAL_STATE_COMMITTED_NOTHING) ?
+                         candidate.GeometryIndex() : data.traversal.committed.GeometryIndex();
+    uint instanceContribution = (state < TRAVERSAL_STATE_COMMITTED_NOTHING) ?
+ candidate.InstanceContribution() : data.traversal.committed.InstanceContribution(); + + return GetHitGroupInfo(ExtractRayContributionToHitIndex(data.ray.traceParameters), + ExtractMultiplierForGeometryContributionToHitIndex(data.ray.traceParameters), + geometryIndex, + instanceContribution); +} +#endif + //===================================================================================================================== // Ray History helper functions //===================================================================================================================== @@ -1523,7 +1695,7 @@ static void RayHistoryWriteTopLevel(inout_param(_AmdSystemData) data) #if DEVELOPER if (EnableTraversalCounter() && data.counter.WriteTokenTopLevel()) { - WriteRayHistoryTokenTopLevel(GetRayId(_cont_DispatchRaysIndex3(data.dispatch)), data.ray.AccelStruct()); + WriteRayHistoryTokenTopLevel(GetRayId(data.dispatch.DispatchId()), data.ray.AccelStruct()); data.counter.SetWriteTokenTopLevel(false); } #endif @@ -1588,7 +1760,7 @@ static void RayHistoryWriteBegin(inout_param(_AmdSystemData) data) #if DEVELOPER if (EnableTraversalCounter()) { - const uint rayId = GetRayId(_cont_DispatchRaysIndex3(data.dispatch)); + const uint rayId = GetRayId(data.dispatch.DispatchId()); RayDesc rayDesc = (RayDesc)0; rayDesc.Origin = data.ray.origin; rayDesc.Direction = data.ray.direction; @@ -1600,7 +1772,7 @@ static void RayHistoryWriteBegin(inout_param(_AmdSystemData) data) data.counter.SetCallerShaderType(_AmdGetShaderKind()); WriteRayHistoryTokenBegin(rayId, - _cont_DispatchRaysIndex3(data.dispatch), + data.dispatch.DispatchId(), data.ray.AccelStruct(), data.ray.Flags(), data.ray.traceParameters, @@ -1619,7 +1791,7 @@ static void RayHistoryWriteEnd(inout_param(_AmdSystemData) data, uint state) #if DEVELOPER WriteDispatchCounters(data.counter.numIterations); - const uint rayId = GetRayId(_cont_DispatchRaysIndex3(data.dispatch)); + const uint rayId = GetRayId(data.dispatch.DispatchId()); const uint64_t timerEnd = AmdTraceRaySampleGpuTimer(); WriteRayHistoryTokenTimeStamp(rayId, timerEnd); @@ -1635,7 +1807,7 @@ static void RayHistoryWriteEnd(inout_param(_AmdSystemData) data, uint state) if (data.IsChs(state)) { // For CHS, get candidate and barycentrics from traversal. 
- const uint instNodeIndex = FetchInstanceIdx(ConvertRtIpLevel(_AmdGetRtip()), + const uint instNodeIndex = FetchInstanceIdx(ConvertRtIpLevel(GetRtIpLevel()), data.ray.AccelStruct(), data.traversal.committed.instNodePtr); WriteRayHistoryTokenEnd(rayId, @@ -1661,16 +1833,10 @@ static void RayHistoryWriteEnd(inout_param(_AmdSystemData) data, uint state) } //===================================================================================================================== -static uint2 RayHistoryGetIdentifierFromVPC(uint64_t vpc) +static uint2 RayHistoryGetIdentifierFromVPC(Vpc32 vpc) { // Zero out the metadata bits - return uint2(SplitUint64(vpc).x & 0xFFFFFFC0, 0); -} - -//===================================================================================================================== -static uint2 RayHistoryGetIdentifierFromShaderId(uint2 shaderId) -{ - return uint2(shaderId.x & 0xFFFFFFC0, 0); + return uint2(vpc.GetFunctionAddr(), 0); } //===================================================================================================================== @@ -1679,7 +1845,7 @@ static void RayHistoryWriteTriangleHitResult(_AmdSystemData data, bool accept) #if DEVELOPER if (EnableTraversalCounter()) { - WriteRayHistoryTokenTriangleHitResult(GetRayId(_cont_DispatchRaysIndex3(data.dispatch)), + WriteRayHistoryTokenTriangleHitResult(GetRayId(data.dispatch.DispatchId()), uint(accept), data.counter.candidateTCurrent); } @@ -1695,7 +1861,7 @@ static void RayHistoryWriteFunctionCall(inout_param(_AmdSystemData) data, #if DEVELOPER if (EnableTraversalCounter()) { - const uint rayId = GetRayId(_cont_DispatchRaysIndex3(data.dispatch)); + const uint rayId = GetRayId(data.dispatch.DispatchId()); switch(shaderKind) { @@ -1749,7 +1915,7 @@ static void RayHistoryWriteAnyHitOrProceduralStatus(inout_param(_AmdSystemData) #if DEVELOPER if (EnableTraversalCounter()) { - const uint rayId = GetRayId(_cont_DispatchRaysIndex3(data.dispatch)); + const uint rayId = GetRayId(data.dispatch.DispatchId()); const uint status = (data.dispatch.nextNodePtr == END_SEARCH) ? HIT_STATUS_ACCEPT_AND_END_SEARCH : (data.ray.AnyHitDidAccept() ? 
HIT_STATUS_ACCEPT : HIT_STATUS_IGNORE); @@ -1779,225 +1945,64 @@ static void RayHistoryWriteAnyHitOrProceduralStatus(inout_param(_AmdSystemData) { data.counter.numCandidateHits++; } - break; - - default: - break; - } - data.counter.SetCallerShaderType(DXILShaderKind::Invalid); - } -#endif -} - -//===================================================================================================================== -static void RayHistoryHandleIteration(inout_param(_AmdSystemData) data, uint nextNodePtr) -{ -#if DEVELOPER - if (EnableTraversalCounter()) - { - WriteRayHistoryTokenNodePtr(GetRayId(_cont_DispatchRaysIndex3(data.dispatch)), nextNodePtr); - UpdateWaveTraversalStatistics(ConvertRtIpLevel(_AmdGetRtip()), nextNodePtr); - - data.counter.numIterations++; - } -#endif -} - -//===================================================================================================================== -static void RayHistoryWriteBottomLevel(_AmdSystemData data, GpuVirtualAddress bvhAddress) -{ -#if DEVELOPER - if (EnableTraversalCounter()) - { - WriteRayHistoryTokenBottomLevel(GetRayId(_cont_DispatchRaysIndex3(data.dispatch)), bvhAddress); - } -#endif -} - -//===================================================================================================================== -static void TraversalCounterWriteCounter(_AmdSystemData data) -{ -#if DEVELOPER - if (EnableTraversalCounter()) - { - TraversalCounter counter = (TraversalCounter)0; - counter.data[TCID_NUM_RAY_BOX_TEST] = data.counter.numRayBoxTest; - counter.data[TCID_NUM_RAY_TRIANGLE_TEST] = data.counter.numRayTriangleTest; - counter.data[TCID_NUM_ITERATION] = data.counter.numIterations; - counter.data[TCID_MAX_TRAVERSAL_DEPTH] = data.counter.maxStackDepth; - counter.data[TCID_NUM_ANYHIT_INVOCATION] = data.counter.numAnyHitInvocation; - counter.data[TCID_SHADER_ID] = data.counter.shaderIdLow; - counter.data[TCID_SHADER_RECORD_INDEX] = data.counter.shaderRecIdx; - counter.data[TCID_TIMING_DATA] = data.counter.timer; - counter.data[TCID_WAVE_ID] = AmdTraceRayGetHwWaveId(); - counter.data[TCID_NUM_CANDIDATE_HITS] = data.counter.numCandidateHits; - counter.data[TCID_INSTANCE_INTERSECTIONS] = data.counter.instanceIntersections; - - WriteTraversalCounter(GetRayId(_cont_DispatchRaysIndex3(data.dispatch)), counter); - } -#endif -} - -#if CONTINUATION_ON_GPU -//===================================================================================================================== -// ReportHit implementation that is called from the intersection shader. -// May call the AnyHit shader. -export bool _cont_ReportHit(inout_param(_AmdAnyHitSystemData) data, float THit, uint HitKind) -{ - // TODO Reuse shader record index computed in Traversal - // TODO Check for closest hit and duplicate anyHit calling - - THit -= data.base.ray.tMin; - float tCurrentCommitted = 0.f; - { - tCurrentCommitted = data.base.traversal.committed.rayTCurrent; - } - - if ((THit < 0.f) || (THit > tCurrentCommitted)) - { - // Discard the hit candidate and hint the compiler to not keep the - // values alive, which will remove redundant moves. - data.candidate.rayTCurrent = _AmdGetUninitializedF32(); - // Don't discard the hit kind as it is bit packed and cannot be discarded partially. 
- return false; - } - - data.candidate.rayTCurrent = THit; - data.candidate.PackHitKind(HitKind); - - uint isOpaque = true; - { - PrimitiveData primitiveData; - InstanceDesc desc; - - { - // Get primitive nodes to process based on candidate or committed hit - const uint tlasNodePtr = data.candidate.instNodePtr; - - const GpuVirtualAddress tlasAddr = data.base.ray.AccelStruct() + ExtractNodePointerOffset(tlasNodePtr); - desc = FetchInstanceDescAddr(tlasAddr); - isOpaque = data.candidate.IsOpaque(); - } - } - - if (!isOpaque) - { - uint hitGroupRecordIndex = 0; - { - hitGroupRecordIndex = data.base.dispatch.shaderRecIdx; - } - // Compute hit group address and fetch shader identifiers - const Vpc64 anyHitAddr = GetAnyHitAddr(hitGroupRecordIndex); - - if (anyHitAddr.IsValid()) - { - // Call AnyHit - // Hit attributes are added as an additional argument by the compiler - Vpc64 resumeAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetResumePointAddr()), SCHEDULING_PRIORITY_IS); - data = _AmdAwaitAnyHit(anyHitAddr.GetU64(), resumeAddr.GetU64(), data); - _AmdRestoreSystemDataAnyHit(data); - return data.base.ray.AnyHitDidAccept(); - } - else - { - _cont_AcceptHit(data); - _AmdAcceptHitAttributes(data); // changes data.base.traversal.committedBarycentrics plus up-to-6 DW data in payload - return true; - } - } - else - { - _cont_AcceptHit(data); - _AmdAcceptHitAttributes(data); - return true; + break; + + default: + break; + } + data.counter.SetCallerShaderType(DXILShaderKind::Invalid); } +#endif } //===================================================================================================================== -// CallShader implementation -export void _cont_CallShader(inout_param(_AmdDispatchSystemData) data, uint index) +static void RayHistoryHandleIteration(inout_param(_AmdSystemData) data, uint nextNodePtr) { - const uint64_t callableTableBaseAddress = - PackUint64(DispatchRaysConstBuf.callableTableBaseAddressLo, DispatchRaysConstBuf.callableTableBaseAddressHi); - - if (callableTableBaseAddress == 0) +#if DEVELOPER + if (EnableTraversalCounter()) { - // TODO: It might be better to AwaitSelf here, adding an artificial suspend point. - // For the common case of non-null callable shaders, this would reduce - // the size of compiled shaders, as the post-CallShader() part is unreachable, - // also simplifying manual testing with suspend points. - // For null callable shaders, it has the advantage of allowing - // to reconverge on the resume function if implemented in a way that yields only - // a single resume function. - return; - } - - const Vpc64 addr = GetVpc64FromShaderIdTable(callableTableBaseAddress, - index, - DispatchRaysConstBuf.callableTableStrideInBytes, - SCHEDULING_PRIORITY_CALLABLE); + WriteRayHistoryTokenNodePtr(GetRayId(data.dispatch.DispatchId()), nextNodePtr); + UpdateWaveTraversalStatistics(ConvertRtIpLevel(GetRtIpLevel()), nextNodePtr); - if (!addr.IsValid()) - { - // See TODO above on how to handle this case better. - return; + data.counter.numIterations++; } - - const uint callerShaderRecIdx = data.shaderRecIdx; - data.shaderRecIdx = index; // the record index used by the callable shader - - const DXILShaderKind enclosingShaderType = _AmdGetShaderKind(); - const uint resumePrio = GetPriorityForShaderType(enclosingShaderType); - const Vpc64 resumeAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetResumePointAddr()), resumePrio); - - data = _AmdAwaitCallShader(addr.GetU64(), resumeAddr.GetU64(), data); - - // for the resume part. 
- data.shaderRecIdx = callerShaderRecIdx; // restores callerShaderRecIdx - _AmdRestoreSystemData(data); // llvm inserts amd.dx.setLocalRootIndex(data.shaderRecIdx) +#endif } //===================================================================================================================== -// Returns the low part of the miss shader address and sets up the dispatch data to have the correct shader record -// index. -static Vpc64 SetupMissShader(inout_param(_AmdSystemData) data, out_param(uint) shaderRecIdx) +static void RayHistoryWriteBottomLevel(_AmdSystemData data, GpuVirtualAddress bvhAddress) { - const uint64_t missTableBaseAddress = - PackUint64(DispatchRaysConstBuf.missTableBaseAddressLo, DispatchRaysConstBuf.missTableBaseAddressHi); - if (missTableBaseAddress == 0) +#if DEVELOPER + if (EnableTraversalCounter()) { - shaderRecIdx = 0; - return Vpc64(0); + WriteRayHistoryTokenBottomLevel(GetRayId(data.dispatch.DispatchId()), bvhAddress); } - - shaderRecIdx = ExtractMissShaderIndex(data.ray.traceParameters); - - // Calculate miss shader record address - const Vpc64 shaderAddr = GetVpc64FromShaderIdTable(missTableBaseAddress, - shaderRecIdx, - DispatchRaysConstBuf.missTableStrideInBytes, - SCHEDULING_PRIORITY_MISS); - - return shaderAddr; +#endif } //===================================================================================================================== -static HitGroupInfo GetHitGroupInfo( - in _AmdSystemData data, - in uint state, - in _AmdPrimitiveSystemState candidate) +static void TraversalCounterWriteCounter(_AmdSystemData data) { - uint geometryIndex = (state < TRAVERSAL_STATE_COMMITTED_NOTHING) ? - candidate.GeometryIndex() : data.traversal.committed.GeometryIndex(); - uint instanceContribution = (state < TRAVERSAL_STATE_COMMITTED_NOTHING) ? - candidate.InstanceContribution() : data.traversal.committed.InstanceContribution(); +#if DEVELOPER + if (EnableTraversalCounter()) + { + TraversalCounter counter = (TraversalCounter)0; + counter.data[TCID_NUM_RAY_BOX_TEST] = data.counter.numRayBoxTest; + counter.data[TCID_NUM_RAY_TRIANGLE_TEST] = data.counter.numRayTriangleTest; + counter.data[TCID_NUM_ITERATION] = data.counter.numIterations; + counter.data[TCID_MAX_TRAVERSAL_DEPTH] = data.counter.maxStackDepth; + counter.data[TCID_NUM_ANYHIT_INVOCATION] = data.counter.numAnyHitInvocation; + counter.data[TCID_SHADER_ID] = data.counter.shaderIdLow; + counter.data[TCID_SHADER_RECORD_INDEX] = data.counter.shaderRecIdx; + counter.data[TCID_TIMING_DATA] = data.counter.timer; + counter.data[TCID_WAVE_ID] = AmdTraceRayGetHwWaveId(); + counter.data[TCID_NUM_CANDIDATE_HITS] = data.counter.numCandidateHits; + counter.data[TCID_INSTANCE_INTERSECTIONS] = data.counter.instanceIntersections; - return GetHitGroupInfo(ExtractRayContributionToHitIndex(data.ray.traceParameters), - ExtractMultiplierForGeometryContributionToHitIndex(data.ray.traceParameters), - geometryIndex, - instanceContribution); -} + WriteTraversalCounter(GetRayId(data.dispatch.DispatchId()), counter); + } #endif +} //===================================================================================================================== // Order matters, the following HLSL reference the functions and structs defined above. 
TODO: refactor these into a @@ -2005,7 +2010,30 @@ static HitGroupInfo GetHitGroupInfo( #include "Continuations1_1.hlsl" #include "Continuations2_0.hlsl" -#if CONTINUATION_ON_GPU +//===================================================================================================================== +// Calls traversal for the current rtip. +static void TraversalInternal( + inout_param(_AmdSystemData) data, + inout_param(uint) state, + inout_param(_AmdPrimitiveSystemState) candidate, + inout_param(float2) candidateBarycentrics) +{ + switch (GetRtIpLevel()) + { +#if (GPURT_RTIP_LEVEL == GPURT_RTIP_LEGACY_LEVEL) || (GPURT_RTIP_LEVEL == 0) + case RayTracingIpLevel::RtIp1_1: + TraversalInternal1_1(data, state, candidate, candidateBarycentrics); + break; + case RayTracingIpLevel::RtIp2_0: + TraversalInternal2_0(data, state, candidate, candidateBarycentrics); + break; +#endif + default: + break; + } +} + +#if ((GPURT_DEBUG_CONTINUATION_TRAVERSAL == 0) && (!defined(__cplusplus))) static uint64_t GetDispatchIdAddr() { return PackUint64(DispatchRaysConstBuf.cpsDispatchIdAddressLo, DispatchRaysConstBuf.cpsDispatchIdAddressHi); @@ -2066,12 +2094,13 @@ static void LaunchRayGen(bool setupStack) #if DEVELOPER systemData.parentId = -1; #endif - _AmdEnqueueRayGen(GetRayGenVpc64().GetU64(), _AmdGetUninitializedI64(), systemData); + _AmdEnqueueRayGen(GetRayGenVpc32().GetU32(), _AmdGetUninitializedI32(), systemData); } else if (Options::getPersistentLaunchEnabled()) { - _AmdDispatchSystemData systemData = _AmdDispatchSystemData::MakeDeadLaneWithStack(); - _AmdEnqueueTraversalDead(GetTraversalVpc64PwgDead().GetU64(), _AmdGetUninitializedI64(), systemData); + _AmdDispatchSystemData systemData = _AmdGetUninitializedDispatchSystemData(); + systemData.SetDead(true); + _AmdEnqueueTraversalDead(GetTraversalVpc32PwgDead().GetU32(), _AmdGetUninitializedI32(), systemData); } } @@ -2152,7 +2181,7 @@ export void _cont_TraceRay( } // Initialise traversal system state _AmdTraversalState traversal = (_AmdTraversalState)0; - switch (_AmdGetRtip()) + switch (GetRtIpLevel()) { case RayTracingIpLevel::RtIp1_1: traversal = InitTraversalState1_1(instanceInclusionMask, rayDesc, isValid); @@ -2176,16 +2205,19 @@ export void _cont_TraceRay( const uint callerShaderRecIdx = dispatch.shaderRecIdx; // 0 if from RayGen. const uint parentId = RayHistoryGetParentId(dispatch); - const Vpc64 traversalAddr = GetTraversalVpc64(); // The type of the shader containing this TraceRay call, i.e. the shader we are inlined into. const DXILShaderKind enclosingShaderType = _AmdGetShaderKind(); const uint resumePrio = GetPriorityForShaderType(enclosingShaderType); // NO control flow is allowed between _AmdGetResumePointAddr() and _AmdAwaitTraversal(). - const Vpc64 resumeAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetResumePointAddr()), resumePrio); - data.traversal.PackReturnAddress(resumeAddr); - dispatch = _AmdAwaitTraversal(traversalAddr.GetU64(), data); + const Vpc32 resumeAddr = Vpc32::MakeWithPriority(Vpc32(_AmdGetResumePointAddr()), resumePrio); + data.traversal.SetReturnAddress(resumeAddr); +#ifndef PASS_DUMMY_RET_ADDR + dispatch = _AmdAwaitTraversal(GetTraversalVpc32().GetU32(), data); +#else // PASS_DUMMY_RET_ADDR + dispatch = _AmdAwaitTraversal(GetTraversalVpc32().GetU32(), _AmdGetUninitializedI32(), data); +#endif // for the resume part. 
dispatch.shaderRecIdx = callerShaderRecIdx; // restores callerShaderRecIdx @@ -2196,26 +2228,23 @@ export void _cont_TraceRay( } //===================================================================================================================== -// Get the address of the function that should be called next, either a closest hit or a miss shader. If no hit or miss -// shader should be called, this method returns false (and in that case it should return to -// data.traversal.ReturnAddress()), otherwise it returns true. -static bool GetNextHitMissPc( +// Get the address of the function that should be called next, either a closest hit or a miss shader. +// If no hit or miss shader should be called, this method returns DEAD_SHADER_ADDR. +static Vpc32 GetNextHitMissPc( inout_param(_AmdSystemData) data, uint state, - _AmdPrimitiveSystemState candidate, - out_param(Vpc64) nextShaderAddr) + _AmdPrimitiveSystemState candidate) { // MS if (data.IsMiss(state)) { uint shaderRecIdx; - const Vpc64 missShaderAddr = SetupMissShader(data, shaderRecIdx); + const Vpc32 missShaderAddr = SetupMissShader(data, shaderRecIdx); if (missShaderAddr.IsValid()) { // Valid MS data.dispatch.shaderRecIdx = shaderRecIdx; - nextShaderAddr = missShaderAddr; - return true; + return missShaderAddr; } } @@ -2230,98 +2259,79 @@ static bool GetNextHitMissPc( if ((data.ray.Flags() & RAY_FLAG_SKIP_CLOSEST_HIT_SHADER) == 0) { - if (hitInfo.closestHitId.x != 0) + Vpc32 closestHitId = Vpc32(hitInfo.closestHitId.x); + if (closestHitId.IsValid()) { - // Valid CHS - nextShaderAddr = GetVpc64FromShaderId(Vpc32(hitInfo.closestHitId.x), SCHEDULING_PRIORITY_CHS); - return true; + return closestHitId; } } } - return false; + return Vpc32(DEAD_SHADER_ADDR); } //===================================================================================================================== -// Calls traversal for the current rtip. -static void TraversalInternal( - inout_param(_AmdSystemData) data, - inout_param(uint) state, - inout_param(_AmdPrimitiveSystemState) candidate, - inout_param(float2) candidateBarycentrics) +// Helper to handle enqueueing CHS, MS. 
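+// Expected usage (see _cont_Traversal below): nextShaderAddr comes from GetNextHitMissPc() and equals
+// DEAD_SHADER_ADDR when neither shader needs to run, in which case control returns to the RayGen resume point
+// stored in data.traversal.ReturnAddress().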
+static void EnqueueHitMiss(_AmdSystemData data, Vpc32 nextShaderAddr) { - switch (_AmdGetRtip()) - { -#if (GPURT_RTIP_LEVEL == GPURT_RTIP_LEGACY_LEVEL) || (GPURT_RTIP_LEVEL == 0) - case RayTracingIpLevel::RtIp1_1: - TraversalInternal1_1(data, state, candidate, candidateBarycentrics); - break; - case RayTracingIpLevel::RtIp2_0: - TraversalInternal2_0(data, state, candidate, candidateBarycentrics); - break; -#endif - default: - break; - } -} + GPU_ASSERT(nextShaderAddr.GetU32() != DEAD_SHADER_ADDR && !data.IsDeadLane()); + const uint state = data.traversal.committed.State(); + RayHistoryWriteEnd(data, state); -static void EnqueueNextShader(bool hasWorkToDo, Vpc64 nextShaderAddr, Vpc64 returnAddr, _AmdSystemData data) -{ - if (!hasWorkToDo) + const Vpc32 returnAddr = data.traversal.ReturnAddress(); + + if (nextShaderAddr.GetU32() == DEAD_SHADER_ADDR) { - if (_AmdContinuationStackIsGlobal()) - { - // No work to do = dead lane, jump to traversal as a synchronization point with an empty system data - _AmdSystemData sysData = _AmdSystemData::MakeDeadLaneWithoutStack(); - _AmdEnqueueTraversal(GetTraversalVpc64().GetU64(), _AmdGetUninitializedI64(), sysData); - } - else - { - GPU_ASSERT(false); - } + // We do not have an address to jump to, retrieve the return address and return to RGS + _AmdEnqueueRayGen(returnAddr.GetU32(), _AmdGetUninitializedI32(), data.dispatch); } - const uint newState = data.traversal.committed.State(); - RayHistoryWriteEnd(data, newState); + // Enqueue the selected shader + const DXILShaderKind shaderKind = (DXILShaderKind)(data.IsMiss(state) + ? (int)DXILShaderKind::Miss // convert to int to fix linux build error + : (int)DXILShaderKind::ClosestHit + ); - if (nextShaderAddr.GetU64() != returnAddr.GetU64()) - { - const DXILShaderKind shaderKind = (DXILShaderKind)(data.IsMiss(newState) ? - (int)DXILShaderKind::Miss : // convert to int to fix linux build error - (int)DXILShaderKind::ClosestHit); - RayHistoryWriteFunctionCall(data, - RayHistoryGetIdentifierFromVPC(nextShaderAddr.GetU64()), - data.dispatch.shaderRecIdx, - shaderKind); - - _AmdEnqueue(nextShaderAddr.GetU64(), returnAddr.GetU64(), data); - } + RayHistoryWriteFunctionCall(data, + RayHistoryGetIdentifierFromVPC(nextShaderAddr), + data.dispatch.shaderRecIdx, + shaderKind); - // Return to RayGen. No need to set a priority, as it is already set in the stored return address. - _AmdEnqueueRayGen(returnAddr.GetU64(), _AmdGetUninitializedI64(), data.dispatch); + _AmdEnqueue(nextShaderAddr.GetU32(), returnAddr.GetU32(), data); } //===================================================================================================================== -// Convenience helper calling Traversal on the debug/emulation path that returns _AmdTraversalResultData. -static _AmdTraversalResultData TraversalInternalDebugWrapper( - inout_param(_AmdSystemData) data) + +export void _cont_ExitRayGen(in _AmdDispatchSystemData data) { - uint state = TRAVERSAL_STATE_COMMITTED_NOTHING; - _AmdPrimitiveSystemState candidate = (_AmdPrimitiveSystemState)0; - float2 candidateBarycentrics = float2(0.0f, 0.0f); + if (Options::getPersistentLaunchEnabled() + ) { + // Lanes that exit raygen own a stack. 
Return them to traversal for scheduling + _AmdDispatchSystemData sysData = _AmdGetUninitializedDispatchSystemData(); + sysData.SetDead(true); + _AmdEnqueueTraversalDead(GetTraversalVpc32PwgDead().GetU32(), _AmdGetUninitializedI32(), sysData); + } + // In all other cases, exit the wave + _AmdComplete(); +} - TraversalInternal(data, state, candidate, candidateBarycentrics); +//===================================================================================================================== - _AmdTraversalResultData result = (_AmdTraversalResultData)0; - result.state = state; - result.candidate = candidate; - result.candidateBarycentrics = candidateBarycentrics; +//===================================================================================================================== - return result; -} +namespace ThreadTrace +{ -//===================================================================================================================== -// Wrapper to ensure the following shader section is marked as "Scheduler" in TTV (if thread traces are enabled). -static void EnterSchedulerSection() +enum struct Section +{ + Scheduler = 8, + Traversal = 6 +}; + +//================================================================================================================= +// Wrapper to ensure the subsequent shader section is correctly identified in TTV. +// If thread traces are disabled, this does nothing. Otherwise, it issues a return token and a new shader data token +// of the type specified by `section`. +static void EnterSection(uint section) { if (Options::getThreadTraceEnabled()) { @@ -2331,8 +2341,44 @@ static void EnterSchedulerSection() // Emit a function call token to start the scheduler function. AmdExtD3DShaderIntrinsics_ShaderMarker(0x11 | - (/* scheduler */ 8 << 8) | - (/* exec */ WaveActiveCountBits(true) << 13)); + (/* section */ section << 8) | + (/* exec */ WaveActiveCountBits(true) << 13)); + } +} + +} // namespace ThreadTrace + +//===================================================================================================================== +// Scheduler for dead lanes. +// Some lanes may return from this function. All lanes that return are guaranteed to be dead and are supposed to enqueue +// traversal for subsequent processing. If the full wave is dead and persistent launch is on, new work will be started. +// If persistent work is off, and all lanes are dead (potentially less than a full wave), and no work could be obtained, +// then the lanes are terminated. 
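+// Outcome summary, derived from the body below:
+//   persistent launch on,  fully occupied dead wave -> LaunchRayGen(false) picks up new work, then
+//                                                      TraversalDead is re-enqueued
+//   persistent launch on,  partially filled wave    -> TraversalDead is re-enqueued
+//   persistent launch off                           -> _AmdComplete() terminates the lanes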
+static void ScheduleDeadWave(_AmdSystemData data, Vpc32 traversalAddr) +{ + GPU_ASSERT(WaveActiveAllTrue(data.IsDeadLane())); + + if (Options::getPersistentLaunchEnabled()) + { + if (data.IsDeadLaneWithStack()) + { + if (WaveActiveCountBits(true) == AmdExtLaneCount()) + { + // If the whole wave is dead, get ready to start a new dispatch + LaunchRayGen(false); + } + // Passthrough these stackful dead lanes + _AmdEnqueueTraversalDead(traversalAddr.GetU32(), _AmdGetUninitializedI32(), data.dispatch); + } + } + + if (Options::getPersistentLaunchEnabled()) + { + _AmdEnqueueTraversalDead(traversalAddr.GetU32(), _AmdGetUninitializedI32(), data.dispatch); + } + else + { + _AmdComplete(); } } @@ -2341,6 +2387,13 @@ static void EnterSchedulerSection() export void _cont_Traversal( inout_param(_AmdSystemData) data) { + bool IsDead = data.IsDeadLane(); + const bool IsTraversal = !IsDead && data.IsTraversal(); + + // TRAVERSAL: BVH ------------------------------------------------------------------------------------------------- + uint state = TRAVERSAL_STATE_COMMITTED_NOTHING; + _AmdPrimitiveSystemState candidate = (_AmdPrimitiveSystemState)0; + float2 candidateBarycentrics = float2(0.0f, 0.0f); // Discard data that doesn't need to be kept alive during Traversal data.dispatch.shaderRecIdx = _AmdGetUninitializedI32(); if (!IsBvhRebraid()) @@ -2349,23 +2402,18 @@ export void _cont_Traversal( data.traversal.lastInstanceRootNodePtr = _AmdGetUninitializedI32(); } - // Write AHS/IS returned status - bool IsDeadLane = (data.IsDeadLaneWithoutStack() || data.IsDeadLaneWithStack()); - if (!IsDeadLane) + if (!IsDead) { + // Write AHS/IS returned status RayHistoryWriteAnyHitOrProceduralStatus(data); } // Execute traversal for active lanes. - uint state = TRAVERSAL_STATE_COMMITTED_NOTHING; - _AmdPrimitiveSystemState candidate = (_AmdPrimitiveSystemState)0; - float2 candidateBarycentrics = float2(0.0f, 0.0f); - - if (data.IsTraversal()) + if (IsTraversal) { TraversalInternal(data, state, candidate, candidateBarycentrics); } - else + else if (!IsDead) { // This branch is hit when the traversal for a lane is done: // a) AHS/IS enqueued _cont_Traversal(), for the very last time. @@ -2380,41 +2428,32 @@ export void _cont_Traversal( // For CHS, get candidate and barycentrics from traversal. if (data.IsChs(state)) { - candidate = data.traversal.committed; - candidateBarycentrics = data.traversal.committedBarycentrics; + candidate = data.traversal.committed; + candidateBarycentrics = data.traversal.committedBarycentrics; } } - // Result used on the CPU path. This is an unused dummy return value on the GPU path. - _AmdTraversalResultData result = (_AmdTraversalResultData)0; + // ALIASES AND CACHED VARIABLES ----------------------------------------------------------------------------------- - bool IsChsOrMiss = data.IsChsOrMiss(state); - // Re-enqueue Traversal until all lanes are done with BVH Traversal. - // Only then enqueue CHS/Miss to ensure other lanes that are not yet done with Traversal - // converge on these CHS/Miss invocations. - // This is necessary because Traversal has lower scheduling priority. - if (WaveActiveAllTrue(IsChsOrMiss)) - { - EnterSchedulerSection(); + // Cache Traversal's own address + const Vpc32 traversalAddr = Vpc32::MakeWithPriority(Vpc32(_AmdGetCurrentFuncAddr()), SCHEDULING_PRIORITY_TRAVERSAL); - Vpc64 nextShaderAddr = Vpc64(0); - GetNextHitMissPc(data, state, candidate, nextShaderAddr); + // Some aliases for variable state. 
Help the compiler figure out these are mutually exclusive in all modes. + bool IsChsOrMiss = false; + bool IsAhsOrIs = false; + if (!IsDead) + { + IsChsOrMiss = data.IsChsOrMiss(state); + IsAhsOrIs = (data.IsAhs(state) || data.IsIs(state)); + } + bool AllDead = Traits::HasDeadLanes() && WaveActiveAllTrue(IsDead); + bool AnyIsAhsOrIs = WaveActiveAnyTrue(IsAhsOrIs); - bool hasWorkToDo = true; - if (_AmdContinuationStackIsGlobal() && nextShaderAddr.IsValid()) - { - } + // TRAVERSAL: AHS AND IS ------------------------------------------------------------------------------------------ - const Vpc64 returnAddr = data.traversal.ReturnAddress(); - if (!nextShaderAddr.IsValid()) - { - nextShaderAddr = returnAddr; - } - EnqueueNextShader(hasWorkToDo, nextShaderAddr, returnAddr, data); - } - else + if (AnyIsAhsOrIs) { - if (data.IsAhs(state) || data.IsIs(state)) + if (IsAhsOrIs) { HitGroupInfo hitInfo = (HitGroupInfo)0; { @@ -2429,45 +2468,78 @@ export void _cont_Traversal( // AHS and IS re-enqueue SchedulerInternal when finished. if (data.IsAhs(state)) { + const Vpc32 anyHitAddr = Vpc32(hitInfo.anyHitId.x); RayHistoryWriteFunctionCall(anyHitData.base, - RayHistoryGetIdentifierFromShaderId(hitInfo.anyHitId), + RayHistoryGetIdentifierFromVPC(anyHitAddr), hitInfo.tableIndex, DXILShaderKind::AnyHit); - const Vpc64 addr = GetVpc64FromShaderId(Vpc32(hitInfo.anyHitId.x), SCHEDULING_PRIORITY_AHS); - const Vpc64 returnAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetCurrentFuncAddr()), SCHEDULING_PRIORITY_TRAVERSAL); - _AmdEnqueueAnyHit(addr.GetU64(), returnAddr.GetU64(), anyHitData, candidateBarycentrics); + const Vpc32 returnAddr = Vpc32::MakeWithPriority(Vpc32(_AmdGetCurrentFuncAddr()), SCHEDULING_PRIORITY_TRAVERSAL); + _AmdEnqueueAnyHit(anyHitAddr.GetU32(), returnAddr.GetU32(), anyHitData, candidateBarycentrics); } else { // Intersection shader GPU_ASSERT(data.IsIs(state)); + const Vpc32 isAddr = Vpc32(hitInfo.intersectionId.x); RayHistoryWriteFunctionCall(anyHitData.base, - RayHistoryGetIdentifierFromShaderId(hitInfo.intersectionId), + RayHistoryGetIdentifierFromVPC(isAddr), hitInfo.tableIndex, DXILShaderKind::Intersection); - const Vpc64 addr = GetVpc64FromShaderId(Vpc32(hitInfo.intersectionId.x), SCHEDULING_PRIORITY_IS); - const Vpc64 returnAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetCurrentFuncAddr()), SCHEDULING_PRIORITY_TRAVERSAL); - _AmdEnqueueIntersection(addr.GetU64(), returnAddr.GetU64(), anyHitData); + const Vpc32 returnAddr = Vpc32::MakeWithPriority(Vpc32(_AmdGetCurrentFuncAddr()), SCHEDULING_PRIORITY_TRAVERSAL); + _AmdEnqueueIntersection(isAddr.GetU32(), returnAddr.GetU32(), anyHitData); } } - else + _AmdEnqueueTraversal(traversalAddr.GetU32(), _AmdGetUninitializedI32(), data); + } + + // FULL WAVE OF DEAD LANES ---------------------------------------------------------------------------------------- + else if (AllDead) + { + ScheduleDeadWave(data, traversalAddr); + // this is unreachable, ScheduleDeadWave guarantees to end with an enqueue + } + + // CHS, MISS AND POSSIBLY DEAD LANES ------------------------------------------------------------------------------ + else + { + GPU_ASSERT(IsChsOrMiss || IsDead); + ThreadTrace::EnterSection(ThreadTrace::Section::Scheduler); + + Vpc32 nextShaderAddr = Vpc32(IsDead ? 
+        Vpc32 nextShaderAddr = Vpc32(IsDead ? DEAD_SHADER_ADDR : GetNextHitMissPc(data, state, candidate).GetU32());
+
+        if (!IsDead)
         {
-            //
-            // Everything else needs to go back through scheduling/traversal, regardless of state
-            // Note we don't need "Wait" here because priorities run AHS and IS first
-            const Vpc64 traversalAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetCurrentFuncAddr()), SCHEDULING_PRIORITY_TRAVERSAL);
-            _AmdEnqueueTraversal(traversalAddr.GetU64(), _AmdGetUninitializedI64(), data);
+            EnqueueHitMiss(data, nextShaderAddr);
         }
+        _AmdEnqueueTraversalDead(traversalAddr.GetU32(), _AmdGetUninitializedI32(), data.dispatch);
     }
-    // This is unreachable
 }
-#endif
+
+#elif GPURT_DEBUG_CONTINUATION_TRAVERSAL // ((GPURT_DEBUG_CONTINUATION_TRAVERSAL == 0) && (!defined(__cplusplus)))
+
+//=====================================================================================================================
+// For debug. Convenience wrapper that runs TraversalInternal on the debug/emulation path and packs the outputs
+// into an _AmdTraversalResultData.
+static _AmdTraversalResultData TraversalInternalDebugWrapper(
+    inout_param(_AmdSystemData) data)
+{
+    uint state = TRAVERSAL_STATE_COMMITTED_NOTHING;
+    _AmdPrimitiveSystemState candidate = (_AmdPrimitiveSystemState)0;
+    float2 candidateBarycentrics = float2(0.0f, 0.0f);
+
+    TraversalInternal(data, state, candidate, candidateBarycentrics);
+
+    _AmdTraversalResultData result = (_AmdTraversalResultData)0;
+    result.state = state;
+    result.candidate = candidate;
+    result.candidateBarycentrics = candidateBarycentrics;
+
+    return result;
+}
 
 //=====================================================================================================================
-#if GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP
 // For debug. Support DxcpRt (non-continuation) to use Continuation traversal.
 static IntersectionResult TraceRayInternalCPSDebug(
     in GpuVirtualAddress topLevelBvh,   // Top-level acceleration structure to use
@@ -2476,26 +2548,24 @@ static IntersectionResult TraceRayInternalCPSDebug(
     in RayDesc rayDesc,                 // Ray to be traced
     in uint    rayId,                   // Ray ID for profiling
     in uint    rtIpLevel                // HW version to determine TraceRay implementation
+#if DEVELOPER
+    , in uint  dynamicId                // dynamic ID
+#endif
 )
 {
-#if GPURT_CLIENT_INTERFACE_MAJOR_VERSION >= 41
-    rayFlags = (rayFlags & ~AmdTraceRayGetKnownUnsetRayFlags()) | AmdTraceRayGetKnownSetRayFlags();
-#endif
-
     // Initialise ray system state from TraceRay parameters
     _AmdRaySystemState ray = (_AmdRaySystemState)0;
-    ray.accelStruct     = topLevelBvh;
+    ray.PackAccelStructAndRayflags(topLevelBvh, rayFlags);
     ray.direction       = rayDesc.Direction;
     ray.origin          = rayDesc.Origin;
    ray.tMin            = rayDesc.TMin;
     ray.tMax            = rayDesc.TMax;
-    ray.flags           = rayFlags;
     ray.traceParameters = traceRayParameters;
 
     const bool isValid = true; // already verified in the caller
 
     _AmdDispatchSystemData dispatch = (_AmdDispatchSystemData)0;
-    dispatch.PackDispatchId(GetDispatchId());
+    dispatch.PackDispatchId(AmdTraceRayDispatchRaysIndex());
 #if DEVELOPER
     dispatch.parentId = -1;
 #endif
@@ -2525,6 +2595,10 @@ static IntersectionResult TraceRayInternalCPSDebug(
     sysData.ray       = ray;
     sysData.traversal = traversal;
 
+#if DEVELOPER
+    sysData.counter.dynamicId = dynamicId;
+#endif
+
     // Begin outer while loop
     while (sysData.dispatch.nextNodePtr < TERMINAL_NODE)
     {
@@ -2564,10 +2638,20 @@ static IntersectionResult TraceRayInternalCPSDebug(
         const uint64_t instNodePtr64 = CalculateInstanceNodePtr64(rtIpLevel, topLevelBvh, tlasNodePtr);
         if (state == TRAVERSAL_STATE_CANDIDATE_NON_OPAQUE_TRIANGLE)
         {
+            uint status = HIT_STATUS_ACCEPT;
+
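+            // Record the any-hit invocation and its returned status in RayHistory on the debug path as well,
+            // matching what the continuations path reports.
+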
+            // This test reduces sp3 instructions when rayFlags is a const containing RAY_FLAG_FORCE_OPAQUE. Note
             // in this case, this branch is not executed w/wo this test, but simpler sp3 boosts performance.
-            if ((rayFlags & RAY_FLAG_FORCE_OPAQUE) == 0)
+            if ((ray.Flags() & RAY_FLAG_FORCE_OPAQUE) == 0)
             {
+                if (PackUint64(hitInfo.anyHitId) != 0)
+                {
+                    RayHistoryWriteFunctionCall(sysData,
+                                                hitInfo.anyHitId,
+                                                hitInfo.tableIndex,
+                                                DXILShaderKind::AnyHit);
+                }
+
                 uint hitKind = ret.candidate.HitKind();
                 // Set intersection attributes
                 AmdTraceRaySetHitAttributes(ret.candidate.rayTCurrent,
@@ -2582,9 +2666,13 @@ static IntersectionResult TraceRayInternalCPSDebug(
                 BuiltInTriangleIntersectionAttributes attr = { ret.candidateBarycentrics };
                 AmdTraceRayCallTriangleAnyHitShader(hitInfo.anyHitId, hitInfo.tableIndex, attr);
 
-                uint status = HIT_STATUS_ACCEPT;
                 AmdTraceRayGetHitAttributes(ret.candidate.rayTCurrent, hitKind, status);
 
+                if (PackUint64(hitInfo.anyHitId) != 0)
+                {
+                    RayHistoryWriteAnyHitOrProceduralStatus(sysData);
+                }
+
                 if (status != HIT_STATUS_IGNORE)
                 {
                     sysData.traversal.committed = ret.candidate;
@@ -2596,12 +2684,19 @@ static IntersectionResult TraceRayInternalCPSDebug(
                 }
             }
+
+            RayHistoryWriteTriangleHitResult(sysData, status > HIT_STATUS_IGNORE);
         }
         else
         {
             // Intersection requires the currently committed hit as RayTCurrent()
             ret.candidate.rayTCurrent = sysData.traversal.committed.rayTCurrent;
 
+            RayHistoryWriteFunctionCall(sysData,
+                                        hitInfo.intersectionId,
+                                        hitInfo.tableIndex,
+                                        DXILShaderKind::Intersection);
+
             // Set intersection attributes
             AmdTraceRaySetHitAttributes(sysData.traversal.committed.rayTCurrent,
                                         0,
@@ -2621,6 +2716,7 @@ static IntersectionResult TraceRayInternalCPSDebug(
             AmdTraceRayGetHitAttributes(ret.candidate.rayTCurrent, hitKind, status);
 
+            RayHistoryWriteAnyHitOrProceduralStatus(sysData);
 
             if (status != HIT_STATUS_IGNORE)
             {
@@ -2675,6 +2771,7 @@ static IntersectionResult TraceRayInternalCPSDebug(
         {
             handleTriangleNode = CheckHandleTriangleNode(sysData.traversal.committed.currNodePtr);
         }
+
         if (handleTriangleNode)
         {
             AmdTraceRaySetTriangleIntersectionAttributes(result.barycentrics);
@@ -2686,6 +2783,16 @@ static IntersectionResult TraceRayInternalCPSDebug(
         AmdTraceRaySetHitTokenData(INVALID_NODE, INVALID_NODE);
     }
 
+#if DEVELOPER
+    result.numRayBoxTest         = sysData.counter.numRayBoxTest;
+    result.numRayTriangleTest    = sysData.counter.numRayTriangleTest;
+    result.numIterations         = sysData.counter.numIterations;
+    result.maxStackDepth         = sysData.counter.maxStackDepth;
+    result.numAnyHitInvocation   = sysData.counter.numAnyHitInvocation;
+    result.numCandidateHits      = sysData.counter.numCandidateHits;
+    result.instanceIntersections = sysData.counter.instanceIntersections;
+#endif
+
     return result;
 }
 #endif
diff --git a/src/shaders/RayQuery.hlsl b/src/shaders/RayQuery.hlsl
index ea80508..7107295 100644
--- a/src/shaders/RayQuery.hlsl
+++ b/src/shaders/RayQuery.hlsl
@@ -98,11 +98,7 @@ static bool RayQueryProceedCommon(
 {
     if (continueTraversal == false)
     {
-#if GPURT_CLIENT_INTERFACE_MAJOR_VERSION >= 42
-        const uint rayId = AmdExtDispatchThreadIdFlat();
-#else
         const uint rayId = GetRayId(dispatchThreadId);
-#endif
         WriteDispatchCounters(rayQuery.numIterations);
         WriteTraversalCounter(rayQuery, rayId);
diff --git a/src/shaders/RayQuery1_1.hlsl b/src/shaders/RayQuery1_1.hlsl
index c82f9c2..d372f85 100644
--- a/src/shaders/RayQuery1_1.hlsl
+++ b/src/shaders/RayQuery1_1.hlsl
@@ -204,11 +204,7 @@ static void TraceRayInlineImpl1_1(
 #if DEVELOPER
     if (EnableTraversalCounter())
     {
-#if GPURT_CLIENT_INTERFACE_MAJOR_VERSION >= 42
-        const uint rayId = AmdExtDispatchThreadIdFlat();
-#else
         const uint rayId = GetRayId(dispatchThreadId);
-#endif
         SetRayQueryDynamicId(rayQuery, AllocateRayHistoryDynamicId());
         WriteRayHistoryTokenBegin(rayId,
                                   dispatchThreadId,
@@ -235,11 +231,7 @@ static bool RayQueryProceedImpl1_1(
     uint rayId = 0;
     if (EnableTraversalCounter())
     {
-#if GPURT_CLIENT_INTERFACE_MAJOR_VERSION >= 42
-        rayId = AmdExtDispatchThreadIdFlat();
-#else
         rayId = GetRayId(dispatchThreadId);
-#endif
     }
 #endif
diff --git a/src/shaders/RayQuery2_0.hlsl b/src/shaders/RayQuery2_0.hlsl
index 0a9bed9..9e163d3 100644
--- a/src/shaders/RayQuery2_0.hlsl
+++ b/src/shaders/RayQuery2_0.hlsl
@@ -118,11 +118,7 @@ static void TraceRayInlineImpl2_0(
 #if DEVELOPER
     if (EnableTraversalCounter())
     {
-#if GPURT_CLIENT_INTERFACE_MAJOR_VERSION >= 42
-        const uint rayId = AmdExtDispatchThreadIdFlat();
-#else
         const uint rayId = GetRayId(dispatchThreadId);
-#endif
         SetRayQueryDynamicId(rayQuery, AllocateRayHistoryDynamicId());
         WriteRayHistoryTokenBegin(rayId,
                                   dispatchThreadId,
@@ -149,11 +145,7 @@ static bool RayQueryProceedImpl2_0(
     uint rayId = 0;
     if (EnableTraversalCounter())
     {
-#if GPURT_CLIENT_INTERFACE_MAJOR_VERSION >= 42
-        rayId = AmdExtDispatchThreadIdFlat();
-#else
         rayId = GetRayId(dispatchThreadId);
-#endif
     }
 #endif
diff --git a/src/shaders/TraceRay.hlsl b/src/shaders/TraceRay.hlsl
index 5b9f06c..b4d012b 100644
--- a/src/shaders/TraceRay.hlsl
+++ b/src/shaders/TraceRay.hlsl
@@ -34,8 +34,11 @@ static IntersectionResult TraceRayInternal(
     in RayDesc rayDesc,    // Ray to be traced
     in uint rayId,         // Ray ID for profiling
     in uint rtIpLevel      // HW version to determine TraceRay implementation
+#if DEVELOPER
+    , in uint dynamicId    // dynamic ID
+#endif
 )
-#if GPURT_BUILD_CONTINUATION && GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP
+#if GPURT_DEBUG_CONTINUATION_TRAVERSAL
 {
     return TraceRayInternalCPSDebug(topLevelBvh,
                                     rayFlags,
@@ -44,9 +47,12 @@ static IntersectionResult TraceRayInternal(
                                     rayId,
                                     rtIpLevel
+#if DEVELOPER
+                                    , dynamicId
+#endif
                                     );
 }
-#else // GPURT_BUILD_CONTINUATION && GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP
+#else // GPURT_DEBUG_CONTINUATION_TRAVERSAL
 // Default path
 {
 #ifdef __cplusplus
@@ -116,6 +122,9 @@ static bool TraceRayCommon(
     uint rtIpLevel
 )
 {
+#if GPURT_DEBUG_CONTINUATION_TRAVERSAL
+    // Save the incoming ray flags before the known set/unset bits are folded in below;
+    // the debug continuation traversal path consumes the original flags.
+    uint oriRayFlags = rayFlags;
+#endif
 #if GPURT_CLIENT_INTERFACE_MAJOR_VERSION >= 41
     rayFlags = (rayFlags & ~AmdTraceRayGetKnownUnsetRayFlags()) | AmdTraceRayGetKnownSetRayFlags();
 #endif
@@ -188,11 +197,18 @@ static bool TraceRayCommon(
     {
         result = TraceRayInternal(
             accelStruct,
+#if GPURT_DEBUG_CONTINUATION_TRAVERSAL
+            oriRayFlags,
+#else
             rayFlags,
+#endif
             packedTraceParams,
             ray,
             rayId,
             rtIpLevel
+#if DEVELOPER
+            , dynamicId
+#endif
             );
     }
     else
@@ -250,13 +266,13 @@ static bool TraceRayCommon(
     }
     AmdTraceRaySetParentId(dynamicId);
 
-    counter.data[TCID_NUM_RAY_BOX_TEST] = result.numRayBoxTest;
-    counter.data[TCID_NUM_RAY_TRIANGLE_TEST] = result.numRayTriangleTest;
-    counter.data[TCID_NUM_ITERATION] = result.numIterations;
-    counter.data[TCID_MAX_TRAVERSAL_DEPTH] = result.maxStackDepth;
-    counter.data[TCID_NUM_ANYHIT_INVOCATION] = result.numAnyHitInvocation;
-    counter.data[TCID_WAVE_ID] = AmdTraceRayGetHwWaveId();
-    counter.data[TCID_NUM_CANDIDATE_HITS] = result.numCandidateHits;
+    counter.data[TCID_NUM_RAY_BOX_TEST]        = result.numRayBoxTest;
+    counter.data[TCID_NUM_RAY_TRIANGLE_TEST]   = result.numRayTriangleTest;
+    counter.data[TCID_NUM_ITERATION]           = result.numIterations;
+    counter.data[TCID_MAX_TRAVERSAL_DEPTH]     = result.maxStackDepth;
+    counter.data[TCID_NUM_ANYHIT_INVOCATION]   = result.numAnyHitInvocation;
+    counter.data[TCID_WAVE_ID]                 = AmdTraceRayGetHwWaveId();
+    counter.data[TCID_NUM_CANDIDATE_HITS]      = result.numCandidateHits;
     counter.data[TCID_INSTANCE_INTERSECTIONS] = result.instanceIntersections;
     }
 #endif
diff --git a/src/shaders/TrianglePrimitive.hlsl b/src/shaders/TrianglePrimitive.hlsl
index e2975dc..b32357e 100644
--- a/src/shaders/TrianglePrimitive.hlsl
+++ b/src/shaders/TrianglePrimitive.hlsl
@@ -224,7 +224,7 @@ TriangleData FetchTransformedTriangleData(
 //======================================================================================================================
 bool IsActive(TriangleData tri)
 {
-    return ((isnan(tri.v0.x) == false) && (isnan(tri.v1.x) == false) && (isnan(tri.v2.x) == false));
+    return (any(isnan(tri.v0)) == false) && (any(isnan(tri.v1)) == false) && (any(isnan(tri.v2)) == false);
 }
 
 //=====================================================================================================================
diff --git a/src/shadersClean/traversal/TraversalDefs.hlsli b/src/shadersClean/traversal/TraversalDefs.hlsli
index 28f9999..ea8ea10 100644
--- a/src/shadersClean/traversal/TraversalDefs.hlsli
+++ b/src/shadersClean/traversal/TraversalDefs.hlsli
@@ -160,6 +160,15 @@ struct RayQueryInternal
 //=====================================================================================================================
 struct HitGroupInfo
 {
+#ifdef __cplusplus
+    // Host-side byte-fill constructors; the default mirrors HLSL's (HitGroupInfo)0 zero-initialization
+    HitGroupInfo(uint val)
+    {
+        memset(this, val, sizeof(HitGroupInfo));
+    }
+
+    HitGroupInfo() : HitGroupInfo(0)
+    {}
+#endif
     uint2 closestHitId;
     uint2 anyHitId;
     uint2 intersectionId;
diff --git a/src/shared/rayTracingDefs.h b/src/shared/rayTracingDefs.h
index 5778dfd..636df92 100644
--- a/src/shared/rayTracingDefs.h
+++ b/src/shared/rayTracingDefs.h
@@ -115,8 +115,14 @@ struct EncodeTaskCountersCommon
 };
 
 //=====================================================================================================================
-struct EncodeTaskCountersBuild : EncodeTaskCountersCommon
+// There is a DXC bug that doesn't properly compile HLSL->SPIR-V code that uses structure inheritance.
+// Once it is fixed, EncodeTaskCountersBuild and EncodeTaskCountersUpdate can inherit from EncodeTaskCountersCommon.
+// https://github.com/microsoft/DirectXShaderCompiler/issues/6986
+struct EncodeTaskCountersBuild
 {
+    uint numPrimitives;
+    uint primRefs;
+
     // The following indirect arguments are only used in the multi-dispatch path. Note, currently only HPLOC dispatch
     // uses these, but it will be extended to other passes when early pair compression is enabled.
     uint groupCountX;
@@ -135,8 +141,10 @@ static_assert(ENCODE_TASK_COUNTER_PRIM_REFS_OFFSET == offsetof(EncodeTaskCounter
 //=====================================================================================================================
 // Update scratch memory fields
-struct EncodeTaskCountersUpdate : EncodeTaskCountersCommon
+struct EncodeTaskCountersUpdate
 {
+    // Duplicated from EncodeTaskCountersCommon until the DXC issue above is fixed
+    uint numPrimitives;
+    uint primRefs;
     uint refitTaskCounter;
     uint taskCount;
     uint tasksDone;
diff --git a/tools/CompileRTShaders.py b/tools/CompileRTShaders.py
index 95a90d3..5c691cd 100644
--- a/tools/CompileRTShaders.py
+++ b/tools/CompileRTShaders.py
@@ -43,7 +43,7 @@
 DWORDS_PER_LINE = 8
 
 FILE_STANDARD_HEADER = """
-/* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. */
+/* Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. 
*/ /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // diff --git a/tools/DebugPreprocessShaders.py b/tools/DebugPreprocessShaders.py index 4793b96..cc4f16a 100644 --- a/tools/DebugPreprocessShaders.py +++ b/tools/DebugPreprocessShaders.py @@ -29,7 +29,7 @@ import argparse cpp_file_header = """ -/* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. */ +/* Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. */ namespace GpuRt {