diff --git a/CMakeLists.txt b/CMakeLists.txt
index ea92db7..7332192 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -99,6 +99,11 @@ option(GPURT_BUILD_CONTINUATION "GpuRt uses continuation traversal" ON)
 if (GPURT_BUILD_CONTINUATION)
     gpurt_add_compile_definitions(GPURT_BUILD_CONTINUATION=1)
 endif()
+
+cmake_dependent_option(GPURT_DEBUG_CONTINUATION_TRAVERSAL "Debug continuation traversal on legacy indirect path" OFF "GPURT_BUILD_CONTINUATION" OFF)
+if (GPURT_DEBUG_CONTINUATION_TRAVERSAL)
+    gpurt_add_compile_definitions(GPURT_DEBUG_CONTINUATION_TRAVERSAL=1)
+endif()
 #endif

 # Disable run time type information
diff --git a/backends/pal/gpurtPalBackend.cpp b/backends/pal/gpurtPalBackend.cpp
index d06394c..e4868dd 100644
--- a/backends/pal/gpurtPalBackend.cpp
+++ b/backends/pal/gpurtPalBackend.cpp
@@ -28,25 +28,6 @@
 namespace GpuRt
 {

-// =====================================================================================================================
-// GPURT to PAL enum conversions without undefined behavior.
-static Pal::HwPipePoint GpuRtToPalHwPipePoint(
-    HwPipePoint gpurtHwPipePoint)
-{
-#define HWPIPEPOINTCASE(x) case static_cast<uint32>(Pal::HwPipePoint::x): return Pal::HwPipePoint::x
-    switch (static_cast<uint32>(gpurtHwPipePoint))
-    {
-        HWPIPEPOINTCASE(HwPipeTop);
-        HWPIPEPOINTCASE(HwPipePreCs);
-        HWPIPEPOINTCASE(HwPipeBottom);
-    default:
-        PAL_ASSERT_ALWAYS_MSG("Unhandled HwPipePoint value in conversion: %u\n",
-                              static_cast<uint32>(gpurtHwPipePoint));
-        return Pal::HwPipePoint::HwPipeTop;
-    }
-#undef HWPIPEPOINTCASE
-}
-
 // =====================================================================================================================
 static Pal::ImmediateDataWidth GpuRtToPalImmediateDataWidth(
     ImmediateDataWidth gpurtImmediateDataWidth)
@@ -132,7 +113,11 @@ void PalBackend::Dispatch(
     uint32 z
     ) const
 {
+#if PAL_INTERFACE_MAJOR_VERSION >= 909
+    GetCmdBuffer(cmdBuffer)->CmdDispatch({ x, y, z }, {});
+#else
     GetCmdBuffer(cmdBuffer)->CmdDispatch({ x, y, z });
+#endif
 }

 // =====================================================================================================================
@@ -238,6 +223,7 @@ void PalBackend::InsertBarrier(
 {
     const bool syncDispatch     = flags & BarrierFlagSyncDispatch;
     const bool syncIndirectArgs = flags & BarrierFlagSyncIndirectArg;
+    const bool syncPreCpWrite   = flags & BarrierFlagSyncPreCpWrite;
     const bool syncPostCpWrite  = flags & BarrierFlagSyncPostCpWrite;

     Pal::ICmdBuffer* pCmdBuffer = GetCmdBuffer(cmdBuffer);
@@ -247,8 +233,16 @@ void PalBackend::InsertBarrier(
     if (syncDispatch || syncIndirectArgs)
     {
-        memoryBarrier.srcStageMask  = Pal::PipelineStageCs;
-        memoryBarrier.srcAccessMask = Pal::CoherShader;
+        memoryBarrier.srcStageMask  |= Pal::PipelineStageCs;
+        memoryBarrier.srcAccessMask |= Pal::CoherShader;
+    }
+
+    if (syncPreCpWrite)
+    {
+        memoryBarrier.srcStageMask  |= Pal::PipelineStagePostPrefetch;
+        memoryBarrier.srcAccessMask |= Pal::CoherShader;
+        memoryBarrier.dstStageMask  |= Pal::PipelineStagePostPrefetch;
+        memoryBarrier.dstAccessMask |= Pal::CoherCp;
     }

     if (syncPostCpWrite)
@@ -359,12 +353,11 @@ void PalBackend::UpdateMemory(
 // =====================================================================================================================
 void PalBackend::WriteTimestamp(
     ClientCmdBufferHandle  cmdBuffer,
-    HwPipePoint            hwPipePoint,
     const Pal::IGpuMemory& timeStampVidMem,
     uint64                 offset
     ) const
 {
-    GetCmdBuffer(cmdBuffer)->CmdWriteTimestamp(GpuRtToPalHwPipePoint(hwPipePoint), timeStampVidMem, offset);
+    GetCmdBuffer(cmdBuffer)->CmdWriteTimestamp(Pal::PipelineStageBottomOfPipe, timeStampVidMem, offset);
 }

 // =====================================================================================================================
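
Note: the new pre/post CP-write flags are meant to bracket a command-processor write between shader work, as the CopyASDeserializeMode change later in this patch does. A minimal usage sketch, assuming an `InsertBarrier(cmdBuffer, flags)` entry point like the backend's; `WriteCounterSafely` and `ResetTaskCounter` here are hypothetical stand-ins:

```cpp
// Sketch only: bracket a CP-side write so it neither races prior shader work
// nor is consumed by later dispatches before it lands.
void WriteCounterSafely(const IBackend& backend, ClientCmdBufferHandle cmdBuf, uint64 counterVa)
{
    backend.InsertBarrier(cmdBuf, BarrierFlagSyncPreCpWrite);  // drain prior shader writes before the CP write
    ResetTaskCounter(counterVa);                               // command-processor write
    backend.InsertBarrier(cmdBuf, BarrierFlagSyncPostCpWrite); // make the CP write visible to later dispatches
}
```
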
diff --git a/backends/pal/gpurtPalBackend.h b/backends/pal/gpurtPalBackend.h
index 7da603e..a5fc976 100644
--- a/backends/pal/gpurtPalBackend.h
+++ b/backends/pal/gpurtPalBackend.h
@@ -121,7 +121,6 @@ class PalBackend : public IBackend
     virtual void WriteTimestamp(
         ClientCmdBufferHandle  cmdBuffer,
-        HwPipePoint            hwPipePoint,
         const Pal::IGpuMemory& timeStampVidMem,
         uint64                 offset
         ) const override;
diff --git a/gpurt/gpurt.h b/gpurt/gpurt.h
index 5d9d8e2..ca70f8f 100644
--- a/gpurt/gpurt.h
+++ b/gpurt/gpurt.h
@@ -737,7 +737,7 @@ struct DeviceSettings
     uint32 numRebraidIterations;
     uint32 rebraidQualityHeuristic;
-    uint32 plocRadius;                  // PLOC Radius
+    uint32 plocRadius;                  // PLOC nearest neighbor search radius
     uint32 maxTopDownBuildInstances;    // Max instances allowed for top down build
     uint32 parallelBuildWavesPerSimd;   // Waves per SIMD to launch for parallel build
diff --git a/gpurt/gpurtBackend.h b/gpurt/gpurtBackend.h
index 00152dc..7463254 100644
--- a/gpurt/gpurtBackend.h
+++ b/gpurt/gpurtBackend.h
@@ -75,21 +75,13 @@ struct BufferViewInfo
     BufferViewSwizzle swizzle;
 };

-// =====================================================================================================================
-// Copy of Pal::HwPipePoint with values we use.
-enum class HwPipePoint : uint32
-{
-    HwPipeTop    = 0x0,
-    HwPipePreCs  = 0x1,
-    HwPipeBottom = 0x7,
-};
-
 // =====================================================================================================================
 enum BarrierFlags : uint32
 {
     BarrierFlagSyncDispatch    = 0x1, // Stall the following dispatch until all previous dispatches are done
     BarrierFlagSyncIndirectArg = 0x2, // Prepare previous shader output for indirect argument use
-    BarrierFlagSyncPostCpWrite = 0x4, // Prepare data set by CP for shader use
+    BarrierFlagSyncPreCpWrite  = 0x4, // Prepare for CP write
+    BarrierFlagSyncPostCpWrite = 0x8, // Prepare data set by CP for shader use
 };

 // =====================================================================================================================
@@ -185,7 +177,6 @@ class IBackend
     // Will eventually be replaced with a callback or other abstraction to avoid referencing video memory.
     virtual void WriteTimestamp(
         ClientCmdBufferHandle  cmdBuffer,
-        HwPipePoint            hwPipePoint,
         const Pal::IGpuMemory& timeStampVidMem,
         uint64                 offset) const = 0;
diff --git a/gpurt/gpurtBuildSettings.h b/gpurt/gpurtBuildSettings.h
index 5c73247..413f719 100644
--- a/gpurt/gpurtBuildSettings.h
+++ b/gpurt/gpurtBuildSettings.h
@@ -62,7 +62,7 @@ struct CompileTimeBuildSettings
     uint32 radixSortScanLevel;
     uint32 emitCompactSize;
     uint32 enableBVHBuildDebugCounters;
-    uint32 plocRadius;
+    uint32 nnSearchRadius;
     uint32 enablePairCostCheck;
     uint32 enableVariableBitsMortonCode;
     uint32 rebraidType;
@@ -112,7 +112,7 @@ struct CompileTimeBuildSettings
 #define BUILD_SETTINGS_DATA_RADIX_SORT_SCAN_LEVEL_ID 7
 #define BUILD_SETTINGS_DATA_EMIT_COMPACT_SIZE_ID 8
 #define BUILD_SETTINGS_DATA_ENABLE_BVH_BUILD_DEBUG_COUNTERS_ID 9
-#define BUILD_SETTINGS_DATA_PLOC_RADIUS_ID 10
+#define BUILD_SETTINGS_DATA_NN_SEARCH_RADIUS_ID 10
#define BUILD_SETTINGS_DATA_ENABLE_PAIR_COST_CHECK_ID 11
 #define BUILD_SETTINGS_DATA_ENABLE_VARIABLE_BITS_MC_ID 12
 #define BUILD_SETTINGS_DATA_REBRAID_TYPE_ID 13
diff --git a/gpurt/gpurtInlineFuncs.h b/gpurt/gpurtInlineFuncs.h
index ff377df..72a02f7 100644
--- a/gpurt/gpurtInlineFuncs.h
+++ b/gpurt/gpurtInlineFuncs.h
@@ -156,23 +156,6 @@ inline BufferViewFormat GetSingleComponentFormatForFormat(BufferViewFormat forma
     }
 }

-//=====================================================================================================================
-// Converts the value of a Pal::HwPipePoint into a GpuRt::HwPipePoint without undefined behavior.
-inline HwPipePoint PalToGpuRtHwPipePoint(uint32 palHwPipePoint)
-{
-#define HWPIPEPOINTCASE(x) case static_cast<uint32>(HwPipePoint::x): return HwPipePoint::x
-    switch (palHwPipePoint)
-    {
-        HWPIPEPOINTCASE(HwPipeTop);
-        HWPIPEPOINTCASE(HwPipePreCs);
-        HWPIPEPOINTCASE(HwPipeBottom);
-    default:
-        PAL_ASSERT_ALWAYS_MSG("Unhandled HwPipePoint value in conversion: %u\n", palHwPipePoint);
-        return HwPipePoint::HwPipeTop;
-    }
-#undef HWPIPEPOINTCASE
-}
-
 //=====================================================================================================================
 // Return the number of components for a buffer view format when it's used as a vertex format.
 inline uint8 GetNumComponentsForVertexFormat(VertexFormat format)
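
For context, the HWPIPEPOINTCASE pattern deleted above (in both files) is a value-checked enum conversion: only inputs matching a listed enumerator are ever converted, so an out-of-range value takes the checked fallback instead of being cast into an enum it was never declared for. A generic sketch of the idiom, with hypothetical names:

```cpp
#include <cstdint>

// Generic form of the removed idiom. Casting an arbitrary integer into an enum
// with a fixed underlying type is legal but can yield a value matching no
// enumerator; matching explicitly avoids ever producing such a value.
enum class Dst : uint32_t { A = 0, B = 1, C = 7 };

Dst ToDst(uint32_t raw)
{
    switch (raw)
    {
    case static_cast<uint32_t>(Dst::A): return Dst::A;
    case static_cast<uint32_t>(Dst::B): return Dst::B;
    case static_cast<uint32_t>(Dst::C): return Dst::C;
    default:                            return Dst::A; // checked fallback
    }
}
```

With the HwPipePoint parameter gone from WriteTimestamp, both conversion helpers lose their only caller, which is why the patch can delete them outright.
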
diff --git a/src/gpurtBvhBatcher.cpp b/src/gpurtBvhBatcher.cpp
index ba70d10..ae15781 100644
--- a/src/gpurtBvhBatcher.cpp
+++ b/src/gpurtBvhBatcher.cpp
@@ -104,7 +104,7 @@ void BvhBatcher::BuildAccelerationStructureBatch(
     // but otherwise do not participate in the rest of the build.
     if (isUpdate)
     {
-        builder.EmitPostBuildInfo();
+        builder.EmitPostBuildInfoDispatch();
     }
     else
     {
@@ -146,7 +146,11 @@ void BvhBatcher::BuildAccelerationStructureBatch(
     {
         RGP_PUSH_MARKER("Process Empty BVH builds");
         DispatchInitAccelerationStructure(emptyBuilders);
-        BuildPhase(emptyBuilders, &BvhBuilder::EmitPostBuildInfo);
+        if (PhaseEnabled(BuildPhaseFlags::SeparateEmitPostBuildInfoPass))
+        {
+            Barrier();
+            BuildPhase(emptyBuilders, &BvhBuilder::EmitPostBuildInfoDispatch);
+        }
         RGP_POP_MARKER();
     }

@@ -264,17 +268,10 @@ void BvhBatcher::BuildRaytracingAccelerationStructureBatch(
     {
         RGP_PUSH_MARKER("EmitPostBuildInfo");
         Barrier();
-        BuildPhase("Updates", updaters, &BvhBuilder::EmitPostBuildInfo);
-        BuildPhase("Builds", builders, &BvhBuilder::EmitPostBuildInfo);
-
+        BuildPhase(BuildPhaseFlags::SeparateEmitPostBuildInfoPass, updaters, &BvhBuilder::EmitPostBuildInfoDispatch);
+        BuildPhase(BuildPhaseFlags::SeparateEmitPostBuildInfoPass, builders, &BvhBuilder::EmitPostBuildInfoDispatch);
         RGP_POP_MARKER();
     }
-    else
-    {
-        // Execute EmitPostBuildInfo without any RGP markers
-        BuildPhase(updaters, &BvhBuilder::EmitPostBuildInfo);
-        BuildPhase(builders, &BvhBuilder::EmitPostBuildInfo);
-    }

     if (PhaseEnabled(BuildPhaseFlags::BuildDumpEvents))
     {
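
BuildPhase itself is not part of this diff; from the call sites above it presumably iterates a builder list and invokes one phase member function on each. A hypothetical sketch of the calling convention implied, using a pointer-to-member:

```cpp
// Hypothetical sketch only: the shape of BuildPhase implied by call sites like
// BuildPhase(emptyBuilders, &BvhBuilder::EmitPostBuildInfoDispatch).
template <typename BuilderSpan>
void BvhBatcher::BuildPhase(
    BuilderSpan builders,
    void (BvhBuilder::*phase)())
{
    for (BvhBuilder& builder : builders)
    {
        (builder.*phase)(); // e.g. invokes EmitPostBuildInfoDispatch on each builder
    }
}
```
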
diff --git a/src/gpurtBvhBuilder.cpp b/src/gpurtBvhBuilder.cpp
index 59287b6..898b172 100644
--- a/src/gpurtBvhBuilder.cpp
+++ b/src/gpurtBvhBuilder.cpp
@@ -1462,6 +1462,31 @@ void BvhBuilder::InitBuildConfig(
 #endif
     ;

+    // The builder supports one compacted size emit during the build itself. Additional postbuild info requires
+    // extra dispatches or CP writes.
+    uint32 emitCompactCount = 0;
+    for (uint32 i = 0; i < m_buildArgs.postBuildInfoDescCount; ++i)
+    {
+        AccelStructPostBuildInfo args = m_clientCb.pfnConvertAccelStructPostBuildInfo(m_buildArgs, i);
+        if (args.desc.infoType == AccelStructPostBuildInfoType::CompactedSize)
+        {
+            // Cache emit destination GPU VA for inlined emit from build shaders
+            m_emitCompactDstGpuVa = args.desc.postBuildBufferAddr.gpu;
+            emitCompactCount++;
+        }
+        else
+        {
+            m_buildConfig.nonInlinePostBuildEmits = true;
+        }
+    }
+
+    // If maxNumPrimitives == 0, we never execute a BVH build, so we always need a separate emit pass.
+    if ((emitCompactCount > 1) || (m_buildConfig.maxNumPrimitives == 0))
+    {
+        m_emitCompactDstGpuVa = 0;
+        m_buildConfig.nonInlinePostBuildEmits = true;
+        m_buildConfig.enableEmitCompactSizeDispatch = true;
+    }
 }

 // =====================================================================================================================
@@ -2194,7 +2219,10 @@ void BvhBuilder::InitBuildSettings()
         static_cast<uint32>(m_buildConfig.fp16BoxNodesInBlasMode);
     m_buildSettings.fp16BoxModeMixedSaThreshold = m_deviceSettings.fp16BoxModeMixedSaThresh;
     m_buildSettings.enableBVHBuildDebugCounters = m_deviceSettings.enableBVHBuildDebugCounters;
-    m_buildSettings.plocRadius                  = m_deviceSettings.plocRadius;
+    if (buildMode == BvhBuildMode::PLOC)
+    {
+        m_buildSettings.nnSearchRadius = m_deviceSettings.plocRadius;
+    }
     m_buildSettings.enablePairCostCheck          = m_deviceSettings.enablePairCompressionCostCheck;
     m_buildSettings.enableVariableBitsMortonCode = m_deviceSettings.enableVariableBitsMortonCodes;

@@ -2222,24 +2250,7 @@ void BvhBuilder::InitBuildSettings()
     m_buildSettings.rtIpLevel = static_cast<uint32>(m_pDevice->GetRtIpLevel());

-    uint32 emitBufferCount = 0;
-    for (uint32 i = 0; i < m_buildArgs.postBuildInfoDescCount; ++i)
-    {
-        AccelStructPostBuildInfo args = m_clientCb.pfnConvertAccelStructPostBuildInfo(m_buildArgs, i);
-        if (args.desc.infoType == AccelStructPostBuildInfoType::CompactedSize)
-        {
-            // Cache emit destination GPU VA for inlined emit from build shaders
-            m_emitCompactDstGpuVa = args.desc.postBuildBufferAddr.gpu;
-            emitBufferCount++;
-        }
-    }
-
-    if (emitBufferCount == 1)
-    {
-        // We only support one compacted emit size from the build shaders. If we have more than one emit
-        // destination buffers, we use the compute shader path
-        m_buildSettings.emitCompactSize = 1;
-    }
+    m_buildSettings.emitCompactSize = (m_emitCompactDstGpuVa != 0);

     m_buildSettings.doEncode = (m_buildConfig.needEncodeDispatch == false);

@@ -2313,8 +2324,10 @@ void BvhBuilder::GetAccelerationStructurePrebuildInfo(
     // the build when performing the update causing page faults.
     scratchDataSize = Util::Max(scratchDataSize, updateDataSize);

-    // Some applications crash when the driver reports 0 scratch size. Use 1 DWORD instead.
-    scratchDataSize = Util::Max(static_cast<uint32>(sizeof(uint32)), scratchDataSize);
+    // Some applications crash when the driver reports 0 scratch size.
+    // Additionally, the D3D12 debug layer does not like a scratch buffer
+    // that's only 4 bytes, so we pass back 8 bytes instead.
+    scratchDataSize = Util::Max(static_cast<uint32>(sizeof(uint64)), scratchDataSize);

     prebuildInfo.scratchDataSizeInBytes       = scratchDataSize;
     prebuildInfo.updateScratchDataSizeInBytes = updateDataSize;
@@ -2432,7 +2445,7 @@ void BvhBuilder::BuildRaytracingAccelerationStructure()

     if (m_buildArgs.postBuildInfoDescCount > 0)
     {
-        if (NeedsPostBuildEmitPass())
+        if (m_buildConfig.enableEmitCompactSizeDispatch)
         {
             // Make sure build is complete before emitting
             Barrier();
@@ -2513,7 +2526,6 @@ void BvhBuilder::PreBuildDumpEvents()
     if (result == Pal::Result::Success)
     {
         m_backend.WriteTimestamp(m_cmdBuffer,
-                                 HwPipePoint::HwPipeBottom,
                                  *m_dumpInfo.pTimeStampVidMem,
                                  m_dumpInfo.timeStampVidMemoffset);
     }
@@ -2530,7 +2542,6 @@ void BvhBuilder::PostBuildDumpEvents()
     if (m_dumpInfo.pTimeStampVidMem != nullptr)
     {
         m_backend.WriteTimestamp(m_cmdBuffer,
-                                 HwPipePoint::HwPipeBottom,
                                  *m_dumpInfo.pTimeStampVidMem,
                                  m_dumpInfo.timeStampVidMemoffset + sizeof(uint64));
     }
@@ -2739,23 +2750,17 @@ void BvhBuilder::EncodePrimitives()
 // Handles writing any requested postbuild information.
 void BvhBuilder::EmitPostBuildInfo()
 {
-    if (m_buildArgs.postBuildInfoDescCount == 0)
-    {
-        return;
-    }
-
     const uint32 resultDataSize = m_resultBufferInfo.dataSize;

     const bool isBottomLevel = (m_buildArgs.inputs.type == AccelStructType::BottomLevel);
-    const bool useSeparateEmitPass = NeedsPostBuildEmitPass();
+
     for (uint32 i = 0; i < m_buildArgs.postBuildInfoDescCount; i++)
     {
         const AccelStructPostBuildInfo args = m_clientCb.pfnConvertAccelStructPostBuildInfo(m_buildArgs, i);

         switch (args.desc.infoType)
         {
         case AccelStructPostBuildInfoType::CompactedSize:
-            // If maxNumPrimitives == 0, we never execute a BVH build, so we always need a separateEmitPass
-            if (useSeparateEmitPass || (m_buildConfig.maxNumPrimitives == 0))
+            if (m_buildConfig.enableEmitCompactSizeDispatch)
             {
                 EmitAccelerationStructurePostBuildInfo(args);
             }
@@ -2808,6 +2813,22 @@ void BvhBuilder::EmitPostBuildInfo()
     }
 }

+// =====================================================================================================================
+// Handles writing any requested postbuild information via dispatch (not CP writes).
+void BvhBuilder::EmitPostBuildInfoDispatch()
+{
+    for (uint32 i = 0; i < m_buildArgs.postBuildInfoDescCount; i++)
+    {
+        const AccelStructPostBuildInfo args = m_clientCb.pfnConvertAccelStructPostBuildInfo(m_buildArgs, i);
+
+        if ((args.desc.infoType != AccelStructPostBuildInfoType::CompactedSize) ||
+            m_buildConfig.enableEmitCompactSizeDispatch)
+        {
+            EmitAccelerationStructurePostBuildInfo(args);
+        }
+    }
+}
+
 // =====================================================================================================================
 // Emits post-build properties for a set of acceleration structures.
 // This enables applications to know the output resource requirements for performing acceleration structure
@@ -3137,6 +3158,7 @@ void BvhBuilder::CopyASDeserializeMode(
     };

     // Reset the task counter in destination buffer.
+    Barrier(BarrierFlagSyncPreCpWrite);
     ResetTaskCounter(copyArgs.dstAccelStructAddr.gpu);
     Barrier(BarrierFlagSyncPostCpWrite);

@@ -3195,7 +3217,7 @@ BuildPhaseFlags BvhBuilder::EnabledPhases() const
 {
     BuildPhaseFlags flags{};

-    if (NeedsPostBuildEmitPass())
+    if (m_buildConfig.nonInlinePostBuildEmits)
     {
         flags |= BuildPhaseFlags::SeparateEmitPostBuildInfoPass;
     }
@@ -3451,15 +3473,6 @@ bool BvhBuilder::AllowLatePairCompression() const
     return enableLatePairCompression;
 }

-// =====================================================================================================================
-// Returns true when the builder will require a separate dispatch for emitting build info
-bool BvhBuilder::NeedsPostBuildEmitPass() const
-{
-    const bool usesSeparateEmitPass = (m_buildArgs.postBuildInfoDescCount == 0) &&
-                                      (m_emitCompactDstGpuVa != 0) && (m_buildSettings.emitCompactSize == 0);
-    return usesSeparateEmitPass;
-}
-
 // =====================================================================================================================
 // Returns true when the builder has dumping events
 bool BvhBuilder::HasBuildDumpEvents() const
diff --git a/src/gpurtBvhBuilder.h b/src/gpurtBvhBuilder.h
index 806c518..c025041 100644
--- a/src/gpurtBvhBuilder.h
+++ b/src/gpurtBvhBuilder.h
@@ -224,6 +224,8 @@ class BvhBuilder
         bool enableMergeSort;
         bool enableInstanceRebraid;
         bool rebuildAccelStruct;
+        bool enableEmitCompactSizeDispatch;
+        bool nonInlinePostBuildEmits;
     };

     BvhBuilder(
@@ -321,6 +323,7 @@ class BvhBuilder
     void UpdateAccelerationStructure();

     void EmitPostBuildInfo();
+    void EmitPostBuildInfoDispatch();

     void EncodeUpdate();

@@ -413,7 +416,6 @@ class BvhBuilder
     // Optional phase checks
     bool AllowRebraid() const;
     bool AllowLatePairCompression() const;
-    bool NeedsPostBuildEmitPass() const;
     bool HasBuildDumpEvents() const;

     // Helper functions
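
The net effect of the new InitBuildConfig logic, condensed into one hypothetical helper for readability (the conditions mirror the patch; the struct and function names are illustrative only):

```cpp
#include <cstdint>

// Hypothetical condensation of the InitBuildConfig rules above; the field
// comments name the real flags each one corresponds to.
struct EmitConfig
{
    bool inlineCompactEmit;  // emitCompactSize: emitted inline by the build shaders
    bool compactViaDispatch; // enableEmitCompactSizeDispatch
    bool separateEmitPass;   // nonInlinePostBuildEmits
};

EmitConfig SelectEmitConfig(uint32_t compactQueries, uint32_t otherQueries, uint32_t maxNumPrimitives)
{
    EmitConfig cfg = {};
    // More than one compacted-size destination, or no build dispatch at all
    // (maxNumPrimitives == 0), forces the compacted size onto a dispatch.
    const bool needDispatch = (compactQueries > 1) || (maxNumPrimitives == 0);
    cfg.inlineCompactEmit  = (compactQueries > 0) && !needDispatch;
    cfg.compactViaDispatch = needDispatch;
    cfg.separateEmitPass   = (otherQueries > 0) || needDispatch;
    return cfg;
}
```

In the common case (exactly one CompactedSize query on a non-empty build) no separate emit pass is scheduled at all, which is what lets the batcher drop the unconditional EmitPostBuildInfo calls.
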
diff --git a/src/gpurtTraceSource.h b/src/gpurtTraceSource.h
index 3262892..1c609f9 100644
--- a/src/gpurtTraceSource.h
+++ b/src/gpurtTraceSource.h
@@ -71,7 +71,11 @@ class AccelStructTraceSource : public GpuUtil::ITraceSource
     }

     // Using this notification to do any preparation work that might be required before the trace begins.
+#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 908
+    virtual void OnTraceAccepted(uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf) override
+#else
     virtual void OnTraceAccepted() override
+#endif
     {
     }

@@ -134,7 +138,11 @@ class RayHistoryTraceSource : public GpuUtil::ITraceSource
     }

     // Using this notification to do any preparation work that might be required before the trace begins.
+#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 908
+    virtual void OnTraceAccepted(uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf) override
+#else
     virtual void OnTraceAccepted() override
+#endif
     {
     }
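
This is the same versioned-interface pattern as the CmdDispatch guard earlier in the patch: both sides of a PAL interface change are kept compilable, selected by the interface version macro. A sketch of how a caller might stay version-agnostic; the wrapper name is hypothetical:

```cpp
// Hypothetical wrapper: callers pass the richer argument set unconditionally,
// and the version guard decides which override signature actually exists.
void NotifyTraceAccepted(GpuUtil::ITraceSource* pSource, uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf)
{
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 908
    pSource->OnTraceAccepted(gpuIndex, pCmdBuf); // new signature
#else
    pSource->OnTraceAccepted();                  // legacy signature
#endif
}
```
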
diff --git a/src/shaders/BuildCommonScratch.hlsl b/src/shaders/BuildCommonScratch.hlsl
index b0d3197..648a11e 100644
--- a/src/shaders/BuildCommonScratch.hlsl
+++ b/src/shaders/BuildCommonScratch.hlsl
@@ -682,6 +682,32 @@ bool IsLeafOrIsCollapsed(
     return result;
 }

+//=====================================================================================================================
+uint GetMinimumNumOfTriangles()
+{
+    uint minNumOfTris = 2;
+    {
+        {
+            minNumOfTris = 0;
+        }
+    }
+
+    return minNumOfTris;
+}
+
+//=====================================================================================================================
+float GetTriangleIntersectionCost(uint numTris)
+{
+    float Ct;
+    {
+        {
+            Ct = SAH_COST_TRIANGLE_INTERSECTION * numTris;
+        }
+    }
+
+    return Ct;
+}
+
 //=====================================================================================================================
 void MergeScratchNodes(
     uint scratchNodesOffset,
@@ -724,18 +750,17 @@ void MergeScratchNodes(
     const uint numRight = FetchScratchNodeNumPrimitives(rightNode, IsLeafNode(rightNodeIndex, numActivePrims));

     const uint numTris = numLeft + numRight;

-    const float Ct =
-        SAH_COST_TRIANGLE_INTERSECTION;
-
     const float Ci = SAH_COST_AABBB_INTERSECTION;

     const float leftCost = IsLeafNode(leftNodeIndex, numActivePrims) ?
-        (Ct * ComputeBoxSurfaceArea(leftBounds)) : FetchScratchNodeCost(scratchNodesOffset, leftNodeIndex);
+        (GetTriangleIntersectionCost(numLeft) * ComputeBoxSurfaceArea(leftBounds)) :
+        FetchScratchNodeCost(scratchNodesOffset, leftNodeIndex);

     const float rightCost = IsLeafNode(rightNodeIndex, numActivePrims) ?
-        (Ct * ComputeBoxSurfaceArea(rightBounds)) : FetchScratchNodeCost(scratchNodesOffset, rightNodeIndex);
+        (GetTriangleIntersectionCost(numRight) * ComputeBoxSurfaceArea(rightBounds)) :
+        FetchScratchNodeCost(scratchNodesOffset, rightNodeIndex);

     const bool leftCollapse = (leftNode.numPrimitivesAndDoCollapse & 0x1) ||
                               IsLeafNode(leftNodeIndex, numActivePrims);
@@ -745,7 +770,7 @@ void MergeScratchNodes(
     float bestCost = leftCost + rightCost + Ci * mergedBoxSurfaceArea;

-    const float collapseCost = Ct * numTris;
+    const float collapseCost = GetTriangleIntersectionCost(numTris);
     const float splitCost = Ci + leftCost / mergedBoxSurfaceArea + rightCost / mergedBoxSurfaceArea;
diff --git a/src/shaders/BuildPLOC.hlsl b/src/shaders/BuildPLOC.hlsl
index 079f1bc..4c2f7b1 100644
--- a/src/shaders/BuildPLOC.hlsl
+++ b/src/shaders/BuildPLOC.hlsl
@@ -882,7 +882,7 @@ void BuildPLOC(
     plocArgs.baseBatchIndicesScratchOffset  = ShaderConstants.offsets.batchIndices;
     plocArgs.fp16BoxNodesInBlasMode         = Settings.fp16BoxNodesMode;
     plocArgs.fp16BoxModeMixedSaThresh       = Settings.fp16BoxModeMixedSaThreshold;
-    plocArgs.plocRadius                     = Settings.plocRadius;
+    plocArgs.plocRadius                     = Settings.nnSearchRadius;
     plocArgs.splitBoxesByteOffset           = ShaderConstants.offsets.triangleSplitBoxes;
     plocArgs.primIndicesSortedScratchOffset = ShaderConstants.offsets.primIndicesSorted;
     plocArgs.unsortedBvhLeafNodesOffset     = ShaderConstants.offsets.bvhLeafNodeData;
diff --git a/src/shaders/BuildParallel.hlsl b/src/shaders/BuildParallel.hlsl
index 8a3df86..15e197d 100644
--- a/src/shaders/BuildParallel.hlsl
+++ b/src/shaders/BuildParallel.hlsl
@@ -265,7 +265,7 @@ void BuildPloc(
     plocArgs.baseBatchIndicesScratchOffset  = ShaderConstants.offsets.batchIndices;
     plocArgs.fp16BoxNodesInBlasMode         = Settings.fp16BoxNodesMode;
     plocArgs.fp16BoxModeMixedSaThresh       = Settings.fp16BoxModeMixedSaThreshold;
-    plocArgs.plocRadius                     = Settings.plocRadius;
+    plocArgs.plocRadius                     = Settings.nnSearchRadius;
     plocArgs.splitBoxesByteOffset           = ShaderConstants.offsets.triangleSplitBoxes;
     plocArgs.primIndicesSortedScratchOffset = ShaderConstants.offsets.primIndicesSorted;
     plocArgs.unsortedBvhLeafNodesOffset     = ShaderConstants.offsets.bvhLeafNodeData;
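
The cost model in MergeScratchNodes compares collapsing a subtree into a single leaf against keeping the split, using the formulas visible above: `collapseCost = Ct * numTris` versus `splitCost = Ci + leftCost/SA + rightCost/SA`. A worked numeric example with hypothetical constants (the real SAH_COST_* values are defined elsewhere in GPURT):

```cpp
#include <cstdio>

// Hypothetical constants for illustration only.
constexpr float kCi = 1.0f; // cost of one box intersection (SAH_COST_AABBB_INTERSECTION)
constexpr float kCt = 1.5f; // cost per triangle intersection (SAH_COST_TRIANGLE_INTERSECTION)

int main()
{
    const float    mergedSa  = 4.0f; // surface area of the merged bounds
    const float    leftCost  = 3.0f; // child subtree costs, already SA-weighted
    const float    rightCost = 2.5f;
    const unsigned numTris   = 3;

    const float collapseCost = kCt * numTris;                                    // 4.5: one leaf, 3 triangle tests
    const float splitCost    = kCi + leftCost / mergedSa + rightCost / mergedSa; // 1.0 + 0.75 + 0.625 = 2.375

    // splitCost < collapseCost here, so the builder keeps the interior node.
    std::printf("collapse=%.3f split=%.3f\n", collapseCost, splitCost);
    return 0;
}
```

Making the leaf cost scale with the triangle count (GetTriangleIntersectionCost) rather than a flat Ct is what this hunk changes; the collapse decision now accounts for how many primitives a leaf would actually contain.
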
diff --git a/src/shaders/BuildQBVH.hlsl b/src/shaders/BuildQBVH.hlsl
index 60e527f..1fc9b0f 100644
--- a/src/shaders/BuildQBVH.hlsl
+++ b/src/shaders/BuildQBVH.hlsl
@@ -279,25 +279,19 @@ uint WritePrimitiveNode(
     const uint geometryIndexAndFlags = PackGeometryIndexAndFlags(geometryIndex, geometryFlags);

     const uint geometryPrimNodePtrsOffset = offsets.primNodePtrs + geometryInfo.primNodePtrsOffset;

-    const uint flattenedPrimIndex =
-        (geometryInfo.primNodePtrsOffset / sizeof(uint)) + scratchNode.left_or_primIndex_or_instIndex;
-
     uint numLeafsDone;
     ScratchGlobal.InterlockedAdd(ShaderConstants.offsets.qbvhGlobalStackPtrs + STACK_PTRS_NUM_LEAFS_DONE_OFFSET,
                                  1,
                                  numLeafsDone);

     {
-        uint destIndex;
-        if (IsTrianglePrimitiveBuild() &&
-            ((Settings.triangleCompressionMode != NO_TRIANGLE_COMPRESSION) || Settings.doTriangleSplitting))
-        {
-            destIndex = numLeafsDone;
-        }
-        else
-        {
-            destIndex = flattenedPrimIndex;
-        }
+        // Use 'numLeafsDone' as the destination index. This packs all leaf nodes together
+        // without any holes (invalid nodes) in between.
+        // Note: Packing the triangle nodes this way causes the primNodePtrs to access the
+        // triangle nodes in random order, which results in perf drops in some Rayperf scenes
+        // when built/updated using 'asb'. Since 'asb' is a synthetic app, we ignore this perf
+        // drop for now, but this change needs to be revisited if an actual game/benchmark
+        // shows the perf drop.
+        uint destIndex = numLeafsDone;

         const uint primitiveNodeSize = (nodeType == NODE_TYPE_USER_NODE_PROCEDURAL) ?
                                        USER_NODE_PROCEDURAL_SIZE :
diff --git a/src/shaders/BuildSettings.hlsli b/src/shaders/BuildSettings.hlsli
index ac6e315..2e5ff10 100644
--- a/src/shaders/BuildSettings.hlsli
+++ b/src/shaders/BuildSettings.hlsli
@@ -38,7 +38,7 @@
 [[vk::constant_id(BUILD_SETTINGS_DATA_RADIX_SORT_SCAN_LEVEL_ID)]]           uint radixSortScanLevel           = 0;
 [[vk::constant_id(BUILD_SETTINGS_DATA_EMIT_COMPACT_SIZE_ID)]]               uint emitCompactSize              = 0;
 [[vk::constant_id(BUILD_SETTINGS_DATA_ENABLE_BVH_BUILD_DEBUG_COUNTERS_ID)]] uint enableBVHBuildDebugCounters  = 0;
-[[vk::constant_id(BUILD_SETTINGS_DATA_PLOC_RADIUS_ID)]]                     uint plocRadius                   = 0;
+[[vk::constant_id(BUILD_SETTINGS_DATA_NN_SEARCH_RADIUS_ID)]]                uint nnSearchRadius               = 0;
 [[vk::constant_id(BUILD_SETTINGS_DATA_ENABLE_PAIR_COST_CHECK_ID)]]          uint enablePairCostCheck          = 0;
 [[vk::constant_id(BUILD_SETTINGS_DATA_ENABLE_VARIABLE_BITS_MC_ID)]]         uint enableVariableBitsMortonCode = 0;
 [[vk::constant_id(BUILD_SETTINGS_DATA_REBRAID_TYPE_ID)]]                    uint rebraidType                  = 0;
@@ -73,7 +73,7 @@ static const CompileTimeBuildSettings Settings = {
     radixSortScanLevel,
     emitCompactSize,
     enableBVHBuildDebugCounters,
-    plocRadius,
+    nnSearchRadius,
     enablePairCostCheck,
     enableVariableBitsMortonCode,
     rebraidType,
diff --git a/src/shaders/CompactAS1_1.hlsl b/src/shaders/CompactAS1_1.hlsl
index d9a165e..31af7df 100644
--- a/src/shaders/CompactAS1_1.hlsl
+++ b/src/shaders/CompactAS1_1.hlsl
@@ -391,40 +391,51 @@ void CompactASImpl1_1(
     // Copy leaf nodes
     if (type == TOP_LEVEL)
     {
-        for (uint nodeIndex = globalId; nodeIndex < srcHeader.numLeafNodes; nodeIndex += ShaderConstants.numThreads)
+        // Need to loop over all the prims, not just numLeafNodes.
+        for (uint nodeIndex = globalId; nodeIndex < srcHeader.numPrimitives; nodeIndex += ShaderConstants.numThreads)
         {
-            const uint nodeOffset
-                = nodeIndex * GetBvhNodeSizeLeaf(PrimitiveType::Instance, Settings.enableFusedInstanceNode);
-            const uint srcNodeDataOffset = srcOffsetDataLeafNodes + nodeOffset;
-            const uint dstNodeDataOffset = dstOffsetDataLeafNodes + nodeOffset;
+            // Since there could be invalid instance nodes, we need to skip over them. Invalid instance nodes
+            // will have corresponding prim node pointers as -1. So check for this and skip the node if invalid.
+            // Note: We don't need to skip invalid nodes for BLASs because their leaf nodes will be packed one
+            // after another, ie: no holes -> no invalid nodes.
+            const uint primNodePtrOffset = srcOffsetDataPrimNodePtrs + (nodeIndex * NODE_PTR_SIZE);

-            // Copy instance node
-            // Note, fused instance nodes are twice the size of normal instance nodes. We need to copy it correspondingly.
-            if (Settings.enableFusedInstanceNode)
-            {
-                const FusedInstanceNode node = SrcBuffer.Load<FusedInstanceNode>(srcNodeDataOffset);
-                DstMetadata.Store<FusedInstanceNode>(dstNodeDataOffset, node);
-            }
-            else
+            if (SrcBuffer.Load(primNodePtrOffset) != INVALID_IDX)
             {
-                const InstanceNode node = SrcBuffer.Load<InstanceNode>(srcNodeDataOffset);
-                DstMetadata.Store<InstanceNode>(dstNodeDataOffset, node);
-            }
+                const uint nodeOffset
+                    = nodeIndex * GetBvhNodeSizeLeaf(PrimitiveType::Instance, Settings.enableFusedInstanceNode);
+                const uint srcNodeDataOffset = srcOffsetDataLeafNodes + nodeOffset;
+                const uint dstNodeDataOffset = dstOffsetDataLeafNodes + nodeOffset;
+
+                // Copy instance node
+                // Note, fused instance nodes are twice the size of normal instance nodes. We need to copy it correspondingly.
+                if (Settings.enableFusedInstanceNode)
+                {
+                    const FusedInstanceNode node = SrcBuffer.Load<FusedInstanceNode>(srcNodeDataOffset);
+                    DstMetadata.Store<FusedInstanceNode>(dstNodeDataOffset, node);
+                }
+                else
+                {
+                    const InstanceNode node = SrcBuffer.Load<InstanceNode>(srcNodeDataOffset);
+                    DstMetadata.Store<InstanceNode>(dstNodeDataOffset, node);
+                }

-            // Top level acceleration structures do not have geometry info.
+                // Top level acceleration structures do not have geometry info.

-            const uint srcNodePointer = PackNodePointer(NODE_TYPE_USER_NODE_INSTANCE, srcOffsets.leafNodes + nodeOffset);
-            const uint dstNodePointer = PackNodePointer(NODE_TYPE_USER_NODE_INSTANCE, dstOffsets.leafNodes + nodeOffset);
+                const uint srcNodePointer = PackNodePointer(NODE_TYPE_USER_NODE_INSTANCE, srcOffsets.leafNodes + nodeOffset);
+                const uint dstNodePointer = PackNodePointer(NODE_TYPE_USER_NODE_INSTANCE, dstOffsets.leafNodes + nodeOffset);

-            // Update the parent pointer and fix up the child pointer in the parent node
-            UpdateParentPointerAndChildPointer(srcMetadataSizeInBytes,
-                                               srcNodePointer,
-                                               dstMetadataSizeInBytes,
-                                               dstNodePointer);
+                // Update the parent pointer and fix up the child pointer in the parent node
+                UpdateParentPointerAndChildPointer(srcMetadataSizeInBytes,
+                                                   srcNodePointer,
+                                                   dstMetadataSizeInBytes,
+                                                   dstNodePointer);
+            }
         }
     }
     else if (srcHeader.geometryType == GEOMETRY_TYPE_TRIANGLES)
     {
+        // Unlike TOP_LEVEL, this assumes that all leaf nodes are packed contiguously without any holes in between.
         for (uint nodeIndex = globalId; nodeIndex < srcHeader.numLeafNodes; nodeIndex += ShaderConstants.numThreads)
         {
             const uint nodeOffset = (nodeIndex * sizeof(TriangleNode));
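
The asymmetry handled here: TLAS leaf slots are addressed by instance index, so inactive instances leave holes that the prim node pointers flag with -1 (INVALID_IDX), while BLAS leaves are packed contiguously by the numLeafsDone path in BuildQBVH above. A small host-side sketch of the hole test the compaction loop performs per slot (names hypothetical):

```cpp
#include <cstdint>
#include <vector>

constexpr uint32_t INVALID_IDX = 0xFFFFFFFFu;

// Walk TLAS leaf slots the way the compaction loop above does: one slot per
// primitive (instance), skipping holes flagged in the prim node pointer array.
uint32_t CountLiveInstanceSlots(const std::vector<uint32_t>& primNodePtrs)
{
    uint32_t live = 0;
    for (uint32_t ptr : primNodePtrs)
    {
        if (ptr != INVALID_IDX) // hole: inactive/culled instance, nothing to copy
        {
            live++;
        }
    }
    return live;
}
```
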
diff --git a/src/shaders/Continuations1_1.hlsl b/src/shaders/Continuations1_1.hlsl
index 1d17e9d..09fb6b5 100644
--- a/src/shaders/Continuations1_1.hlsl
+++ b/src/shaders/Continuations1_1.hlsl
@@ -158,7 +158,7 @@ static _AmdTraversalState InitTraversalState1_1(
     uint schedulerState = TRAVERSAL_STATE_COMMITTED_NOTHING;
     traversal.committed.PackState(schedulerState);
-    traversal.committed.currNodePtr = INVALID_NODE;
+    traversal.committed.SetCurrNodePtr(INVALID_NODE);

     // Start traversing from root node
     traversal.reservedNodePtr = INVALID_NODE;
@@ -173,7 +173,7 @@ static _AmdTraversalState InitTraversalState1_1(
     traversal.stackPtr = stack.Pack();
     traversal.PackStackPtrTop(INVALID_NODE);

-#if GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP
+#if GPURT_DEBUG_CONTINUATION_TRAVERSAL
     traversal.committed.PackAnyHitCallType(0);
 #endif

@@ -354,7 +354,7 @@ static void TraversalInternal1_1(
                     candidate.PackInstanceContribution(instanceContributionToHitGroupIndex, hitKind);
                     candidate.PackGeometryIndex(primitiveData.geometryIndex);
                     candidate.PackIsOpaque(isOpaque);
-                    candidate.currNodePtr = nodePtr;
+                    candidate.SetCurrNodePtr(nodePtr);

                     bool hasAnyHit = false;
                     if ((rayForceOpaque == false) && (isOpaque == false))
@@ -416,9 +416,9 @@ static void TraversalInternal1_1(
                 candidate.PackGeometryIndex(primitiveData.geometryIndex);
                 candidate.PackIsOpaque(isOpaque);
                 candidate.PackInstanceContribution(instanceContributionToHitGroupIndex);
-                candidate.currNodePtr = nodePtr;
+                candidate.SetCurrNodePtr(nodePtr);

-#if GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP
+#if GPURT_DEBUG_CONTINUATION_TRAVERSAL
                 uint anyHitCallType = rayForceOpaque ? ANYHIT_CALLTYPE_SKIP : ANYHIT_CALLTYPE_DUPLICATE;
                 const bool noDuplicateAnyHit = (geometryFlags & D3D12_RAYTRACING_GEOMETRY_FLAG_NO_DUPLICATE_ANYHIT_INVOCATION);
                 anyHitCallType = noDuplicateAnyHit ? ANYHIT_CALLTYPE_NO_DUPLICATE : anyHitCallType;
diff --git a/src/shaders/Continuations2_0.hlsl b/src/shaders/Continuations2_0.hlsl
index 73293dc..283fe20 100644
--- a/src/shaders/Continuations2_0.hlsl
+++ b/src/shaders/Continuations2_0.hlsl
@@ -43,7 +43,7 @@ static _AmdTraversalState InitTraversalState2_0(
     uint schedulerState = TRAVERSAL_STATE_COMMITTED_NOTHING;
     traversal.committed.PackState(schedulerState);
-    traversal.committed.currNodePtr = INVALID_NODE;
+    traversal.committed.SetCurrNodePtr(INVALID_NODE);

     // Start traversing from root node
     traversal.reservedNodePtr = INVALID_NODE;
@@ -58,7 +58,7 @@ static _AmdTraversalState InitTraversalState2_0(
     traversal.PackStackPtrTop(0);

-#if GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP
+#if GPURT_DEBUG_CONTINUATION_TRAVERSAL
     traversal.committed.PackAnyHitCallType(0);
 #endif

@@ -72,14 +72,8 @@ static void TraversalInternal2_0(
     inout_param(_AmdPrimitiveSystemState) candidate,
     inout_param(float2)                   candidateBarycentrics)
 {
-    uint rayFlags = data.ray.Flags();
-
-    uint boxHeuristicMode = AmdTraceRayGetBoxSortHeuristicMode();
-    if ((boxHeuristicMode == BoxSortHeuristic::LargestFirstOrClosest) ||
-        (boxHeuristicMode == BoxSortHeuristic::LargestFirstOrClosestMidPoint))
-    {
-        boxHeuristicMode = GetBoxSortingHeuristicFromRayFlags(rayFlags, boxHeuristicMode);
-    }
+    const uint rayFlags = data.ray.Flags();
+    const uint boxHeuristicMode = GetBoxHeuristicMode();

     // Root bvh address for reuse
     const GpuVirtualAddress topBvhAddress = data.ray.AccelStruct();
@@ -322,7 +316,8 @@ static void TraversalInternal2_0(
                     committed.PackInstanceContribution(instanceContributionToHitGroupIndex, hitKind);
                     committed.PackGeometryIndex(primitiveData.geometryIndex,
                                                 TRAVERSAL_STATE_COMMITTED_TRIANGLE_HIT, false);
-                    committed.currNodePtr = nodePtr;
+                    committed.SetCurrNodePtr(nodePtr);
+
                     state = TRAVERSAL_STATE_COMMITTED_TRIANGLE_HIT;

                     // Exit traversal early if ray flags indicate end search after first hit
@@ -357,7 +352,8 @@ static void TraversalInternal2_0(
                         candidate.PackGeometryIndex(primitiveData.geometryIndex,
                                                     // This #ifdef is required until the legacy GPURT_RTIP_LEVEL == 0 lib has been removed:
                                                     TRAVERSAL_STATE_COMMITTED_TRIANGLE_HIT, isOpaque);
-                        candidate.currNodePtr = nodePtr;
+                        candidate.SetCurrNodePtr(nodePtr);
+
                         if (Options::getCpsCandidatePrimitiveMode() == CpsCandidatePrimitiveMode::DeferFirst)
                         {
                             haveCandidate = true;
@@ -412,10 +408,10 @@ static void TraversalInternal2_0(
                 candidate.PackGeometryIndex(primitiveData.geometryIndex);
                 candidate.PackIsOpaque(isOpaque);
                 candidate.PackInstanceContribution(instanceContributionToHitGroupIndex);
-                candidate.currNodePtr = nodePtr;
+                candidate.SetCurrNodePtr(nodePtr);
                 candidate.instNodePtr = data.traversal.instNodePtr;

-#if GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP
+#if GPURT_DEBUG_CONTINUATION_TRAVERSAL
                 // Determine anyHit shader call type
                 uint anyHitCallType = rayForceOpaque ? ANYHIT_CALLTYPE_SKIP : ANYHIT_CALLTYPE_DUPLICATE;
diff --git a/src/shaders/DecodeAS.hlsl b/src/shaders/DecodeAS.hlsl
index fb8ca1a..64cb0fd 100644
--- a/src/shaders/DecodeAS.hlsl
+++ b/src/shaders/DecodeAS.hlsl
@@ -170,7 +170,7 @@ void DecodeAS(in uint3 globalThreadId : SV_DispatchThreadID)
     }
     else // GEOMETRY_TYPE_AABBS
     {
-        DstBuffer.Store(dstGeometryDescOffset + GEOMETRY_DESC_AABB_COUNT_OFFSET,  geometryNumPrimitives);
+        DstBuffer.Store(dstGeometryDescOffset + GEOMETRY_DESC_AABB_COUNT_OFFSET, geometryNumPrimitives);
         DstBuffer.Store4(dstGeometryDescOffset + GEOMETRY_DESC_AABBS_OFFSET,
                          uint4(addressLo, addressHi, DECODE_PRIMITIVE_STRIDE_AABB, 0));
     }
diff --git a/src/shaders/GpuRtLibrary.hlsl b/src/shaders/GpuRtLibrary.hlsl
index 879590c..de4900c 100644
--- a/src/shaders/GpuRtLibrary.hlsl
+++ b/src/shaders/GpuRtLibrary.hlsl
@@ -33,6 +33,39 @@
 #include "TraceRayCommon.hlsl"
 #include "AccelStructTracker.hlsl"

+#ifdef __cplusplus
+extern uint g_rtIpLevel; // defined in cputraversal
+void _AmdSetRtip(uint rtIpLevel); // defined in cputraversal
+#endif
+
+// Only the default path (Continuation) provides _AmdGetRtip().
+static RayTracingIpLevel GetRtIpLevel()
+{
+#ifdef __cplusplus
+    switch (g_rtIpLevel)
+    {
+    case GPURT_RTIP1_1:
+        return RayTracingIpLevel::RtIp1_1;
+    case GPURT_RTIP2_0:
+        return RayTracingIpLevel::RtIp2_0;
+    default:
+        // Should never be called
+        GPU_ASSERT(false);
+        return RayTracingIpLevel::_None;
+    }
+#else // __cplusplus
+#if GPURT_DEBUG_CONTINUATION_TRAVERSAL
+    if (GPURT_RTIP_LEVEL == (uint)RayTracingIpLevel::_None)
+    {
+        return RayTracingIpLevel::_None;
+    }
+    return RayTracingIpLevel::RtIp2_0; // Default to RTIP 2.0
+#else // GPURT_DEBUG_CONTINUATION_TRAVERSAL
+    return _AmdGetRtip(); // Continuation path
+#endif
+#endif
+}
+
 #if GPURT_BUILD_CONTINUATION && LLPC_CLIENT_INTERFACE_MAJOR_VERSION
 // Include the continuations library
 #include "GpuRtLibraryCont.hlsl"
@@ -294,8 +327,13 @@ export void TraceRayInline2_0(
 export uint GetInstanceID(
     in uint64_t instanceNodePtr) // 64-bit instance node address
 {
-    const uint instanceIdAndMask = LoadDwordAtAddr(instanceNodePtr + INSTANCE_DESC_ID_AND_MASK_OFFSET);
-    return (instanceIdAndMask & 0x00ffffff);
+    uint instanceId = 0;
+    if (instanceNodePtr != 0)
+    {
+        const uint instanceIdAndMask = LoadDwordAtAddr(instanceNodePtr + INSTANCE_DESC_ID_AND_MASK_OFFSET);
+        instanceId = (instanceIdAndMask & 0x00ffffff);
+    }
+    return instanceId;
 }

 //=====================================================================================================================
@@ -303,7 +341,13 @@ export uint GetInstanceIndex(
     in uint64_t instanceNodePtr) // 64-bit instance node address
 {
-    return LoadDwordAtAddr(instanceNodePtr + sizeof(InstanceDesc) + RTIP1_1_INSTANCE_SIDEBAND_INSTANCE_INDEX_OFFSET);
+    uint instanceIndex = 0;
+    if (instanceNodePtr != 0)
+    {
+        instanceIndex = LoadDwordAtAddr(instanceNodePtr + sizeof(InstanceDesc) +
+                                        RTIP1_1_INSTANCE_SIDEBAND_INSTANCE_INDEX_OFFSET);
+    }
+    return instanceIndex;
 }

 //=====================================================================================================================
@@ -313,11 +357,16 @@ export float GetObjectToWorldTransform(
     in uint32_t row, // row index
     in uint32_t col) // column index
 {
-    const uint32_t elementOffset = (row * sizeof(float4)) + (col * sizeof(float));
-    return asfloat(LoadDwordAtAddr(instanceNodePtr +
-                                   sizeof(InstanceDesc) +
-                                   RTIP1_1_INSTANCE_SIDEBAND_OBJECT2WORLD_OFFSET +
-                                   elementOffset));
+    float transform = 0;
+    if (instanceNodePtr != 0)
+    {
+        const uint32_t elementOffset = (row * sizeof(float4)) + (col * sizeof(float));
+        transform = asfloat(LoadDwordAtAddr(instanceNodePtr +
+                                            sizeof(InstanceDesc) +
+                                            RTIP1_1_INSTANCE_SIDEBAND_OBJECT2WORLD_OFFSET +
+                                            elementOffset));
+    }
+    return transform;
 }

 //=====================================================================================================================
@@ -327,8 +376,14 @@ export float GetWorldToObjectTransform(
     in uint32_t row, // row index
     in uint32_t col) // column index
 {
-    const uint32_t elementOffset = (row * sizeof(float4)) + (col * sizeof(float));
-    return asfloat(LoadDwordAtAddr(instanceNodePtr + INSTANCE_DESC_WORLD_TO_OBJECT_XFORM_OFFSET + elementOffset));
+    float transform = 0;
+    if (instanceNodePtr != 0)
+    {
+        const uint32_t elementOffset = (row * sizeof(float4)) + (col * sizeof(float));
+        transform = asfloat(LoadDwordAtAddr(instanceNodePtr + INSTANCE_DESC_WORLD_TO_OBJECT_XFORM_OFFSET +
+                                            elementOffset));
+    }
+    return transform;
 }

 //=====================================================================================================================
@@ -336,17 +391,21 @@ export float GetWorldToObjectTransform(
 static float3x4 GetObjectToWorld3x4(
     in uint64_t instanceNodePtr)
 {
-    float3x4 transform;
-    switch (_AmdGetRtip())
-    {
-    default:
+    float3x4 transform = (float3x4)0;
+
+    if (instanceNodePtr != 0)
     {
-        const uint offset = RTIP1_1_INSTANCE_SIDEBAND_OBJECT2WORLD_OFFSET;
-        transform[0] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + sizeof(InstanceDesc) + offset + 0));
-        transform[1] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + sizeof(InstanceDesc) + offset + 16));
-        transform[2] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + sizeof(InstanceDesc) + offset + 32));
-        break;
-    }
+        switch (GetRtIpLevel())
+        {
+        default:
+        {
+            const uint offset = RTIP1_1_INSTANCE_SIDEBAND_OBJECT2WORLD_OFFSET;
+            transform[0] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + sizeof(InstanceDesc) + offset + 0));
+            transform[1] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + sizeof(InstanceDesc) + offset + 16));
+            transform[2] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + sizeof(InstanceDesc) + offset + 32));
+            break;
+        }
+        }
     }

     return transform;
@@ -357,20 +416,23 @@ static float3x4 GetObjectToWorld3x4(
 static float3x4 GetWorldToObject3x4(
     in uint64_t instanceNodePtr)
 {
-    float3x4 transform;
+    float3x4 transform = (float3x4)0;

-    switch (_AmdGetRtip())
+    if (instanceNodePtr != 0)
     {
-    default:
-    {
-        const uint offset = INSTANCE_DESC_WORLD_TO_OBJECT_XFORM_OFFSET;
-
-        transform[0] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + offset + 0));
-        transform[1] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + offset + 16));
-        transform[2] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + offset + 32));
-
-        break;
-    }
+        switch (GetRtIpLevel())
+        {
+        default:
+        {
+            const uint offset = INSTANCE_DESC_WORLD_TO_OBJECT_XFORM_OFFSET;
+
+            transform[0] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + offset + 0));
+            transform[1] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + offset + 16));
+            transform[2] = asfloat(ConstantLoadDwordAtAddrx4(instanceNodePtr + offset + 32));
+
+            break;
+        }
+        }
     }

     return transform;
@@ -398,7 +460,12 @@ export uint64_t GetRayQuery64BitInstanceNodePtr(
     in uint64_t tlasBaseAddr,    // 64-bit TLAS base address
     in uint32_t instanceNodePtr) // Instance node pointer
 {
-    return CalculateNodeAddr64(tlasBaseAddr, instanceNodePtr);
+    uint64_t nodeAddr = 0;
+    if (instanceNodePtr != 0)
+    {
+        nodeAddr = CalculateNodeAddr64(tlasBaseAddr, instanceNodePtr);
+    }
+    return nodeAddr;
 }

 //=====================================================================================================================
@@ -429,7 +496,7 @@ static uint GetGeneralInstanceID(
     in uint64_t instNodeAddr) // 64-bit instance node address
 {
     uint id = 0;
-    switch (_AmdGetRtip())
+    switch (GetRtIpLevel())
     {
     default:
     {
@@ -447,7 +514,7 @@ static uint GetGeneralInstanceIndex(
     in uint64_t instNodeAddr) // 64-bit instance node address
 {
     uint index = 0;
-    RayTracingIpLevel rtip = _AmdGetRtip();
+    RayTracingIpLevel rtip = GetRtIpLevel();
     switch (rtip)
     {
     default:
@@ -467,7 +534,7 @@ static uint64_t GetRayQueryInstanceNodePtr(
     in uint32_t instanceNodePtr) // Instance node pointer
 {
     uint64_t instNodePtr = 0;
-    RayTracingIpLevel rtip = _AmdGetRtip();
+    RayTracingIpLevel rtip = GetRtIpLevel();
     switch (rtip)
     {
     default:
@@ -490,7 +557,7 @@ export RayQueryInternal _RayQuery_Allocate()
 export void _RayQuery_Abort(
     inout_param(RayQueryInternal) rayQuery)
 {
-    uint rtIp = (uint)_AmdGetRtip();
+    uint rtIp = (uint)GetRtIpLevel();
     if (rtIp >= (uint)RayTracingIpLevel::RtIp2_0)
     {
         rayQuery.currNodePtr = TERMINAL_NODE;
@@ -1011,7 +1078,7 @@ export TriangleData _RayQuery_FetchTrianglePosition(
     in bool committed) // Node pointer
 {
     TriangleData tdata;
-    RayTracingIpLevel rtip = _AmdGetRtip();
+    RayTracingIpLevel rtip = GetRtIpLevel();
     switch (rtip)
     {
     default:
@@ -1030,7 +1097,7 @@ export bool _RayQuery_Proceed(
     in uint  constRayFlags,
     in uint3 dispatchThreadId)
 {
-    uint rtIpLevel = ConvertRtIpLevel(_AmdGetRtip());
+    uint rtIpLevel = ConvertRtIpLevel(GetRtIpLevel());
     return RayQueryProceedCommon(
         rayQuery,
         constRayFlags,
@@ -1051,7 +1118,7 @@ export void _RayQuery_TraceRayInline(
     in RayDesc rayDesc,
     in uint3   dispatchThreadId)
 {
-    uint rtIpLevel = ConvertRtIpLevel(_AmdGetRtip());
+    uint rtIpLevel = ConvertRtIpLevel(GetRtIpLevel());
     TraceRayInlineCommon(rayQuery,
                          accelStructLo,
                          accelStructHi,
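
On the `& 0x00ffffff` in GetInstanceID above: the D3D12 instance descriptor packs a 24-bit InstanceID and an 8-bit InstanceMask into one dword, so the low 24 bits are the ID and the high 8 bits the mask. A self-contained sketch of the packing:

```cpp
#include <cassert>
#include <cstdint>

constexpr uint32_t kInstanceIdBits = 24;

// Unpack the dword at INSTANCE_DESC_ID_AND_MASK_OFFSET per the D3D12
// instance descriptor layout: InstanceID : 24, InstanceMask : 8.
uint32_t UnpackInstanceId(uint32_t idAndMask)   { return idAndMask & 0x00FFFFFFu; }
uint32_t UnpackInstanceMask(uint32_t idAndMask) { return idAndMask >> kInstanceIdBits; }

int main()
{
    const uint32_t packed = (0xFFu << kInstanceIdBits) | 0x123456u;
    assert(UnpackInstanceId(packed) == 0x123456u);
    assert(UnpackInstanceMask(packed) == 0xFFu);
    return 0;
}
```

The new `instanceNodePtr != 0` guards in these exports make a null instance node yield benign defaults (zero ID/index, zero transforms) instead of reading from address zero.
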
diff --git a/src/shaders/GpuRtLibraryCont.hlsl b/src/shaders/GpuRtLibraryCont.hlsl
index 293fae0..f706b3d 100644
--- a/src/shaders/GpuRtLibraryCont.hlsl
+++ b/src/shaders/GpuRtLibraryCont.hlsl
@@ -25,8 +25,6 @@
 // Include intrinsics and defines from the compiler
 #include "llpc/GpurtIntrinsics.h"

-#ifndef __cplusplus
-#endif
 #if DEVELOPER
 #include "../../gpurt/gpurtCounter.h"
 #endif

@@ -34,20 +32,21 @@
 #include "../shadersClean/common/Math.hlsli"
 #include "../shadersClean/common/InstanceDesc.hlsli"

-// By default, Gpurt exports both non-continuation and continuation traversal functions. Dxcp picks one based on panel
-// setting.
-// GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP = GPURT_RTIP1_1/GPURT_RTIP2_0
-// is only used for a debug purpose.
-// It supports DxcpRt (non-continuation) to use Continuation traversal. In this config, the pure continuation model does
-// not work.
-#ifndef GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP
-#define GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP 0
-#endif
+// Do not use ~0 as an invalid stack pointer, to leave it free to use as a sentinel value
+#define CPS_STACK_PTR_STACKLESS_DEAD_LANE (~uint32_t(1))
+// CPS stack pointers are dword-aligned, so we can use up to 2 bits. Use the second bit
+// to flag a dead lane, so in particular CPS_STACK_PTR_STACKLESS_DEAD_LANE identifies a dead lane
+#define CPS_STACK_PTR_DEAD_LANE_FLAG (2)
+#define CPS_STACK_PTR_INVALID (CPS_STACK_PTR_STACKLESS_DEAD_LANE & ~CPS_STACK_PTR_DEAD_LANE_FLAG)

-#if ((GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP == 0) && (!defined(__cplusplus)))
-#define CONTINUATION_ON_GPU 1
-#else
-#define CONTINUATION_ON_GPU 0
+#define DEAD_SHADER_ADDR (~uint32_t(0))
+
+static bool RtIpIsAtLeast(RayTracingIpLevel level)
+{
+    return ((uint32_t)GetRtIpLevel()) >= ((uint32_t)level);
+}
+
+#ifndef __cplusplus
 #endif

 #define REMAT_INSTANCE_RAY 1
@@ -96,37 +95,6 @@
 #define SCHEDULING_PRIORITY_CALLABLE 6
 // Maximum supported value (3 bits): 7

-#if CONTINUATION_ON_GPU == 0
-#ifdef __cplusplus
-extern uint g_rtIpLevel; // defined in cputraversal
-void _AmdSetRtip(uint rtIpLevel); // defined in cputraversal
-#endif
-static RayTracingIpLevel _AmdGetRtip()
-{
-    RayTracingIpLevel rtIpLevel = RayTracingIpLevel::_None;
-#ifdef __cplusplus
-    switch (g_rtIpLevel)
-#else
-    switch (GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP)
-#endif
-    {
-    case GPURT_RTIP1_1:
-        rtIpLevel = RayTracingIpLevel::RtIp1_1;
-        break;
-    case GPURT_RTIP2_0:
-        rtIpLevel = RayTracingIpLevel::RtIp2_0;
-        break;
-    }
-
-    return rtIpLevel;
-}
-#endif
-
-static bool RtIpIsAtLeast(RayTracingIpLevel level)
-{
-    return ((uint32_t)_AmdGetRtip()) >= ((uint32_t)level);
-}
-
 //=====================================================================================================================
 static uint GetPriorityForShaderType(
     DXILShaderKind shaderKind)
@@ -146,6 +114,63 @@ static uint GetPriorityForShaderType(
 // Forward declaration for _AmdDispatchSystemData.PackDispatchId() and _AmdDispatchSystemData.DispatchId()
 static uint3 GetDispatchRaysDimensions();

+//=====================================================================================================================
+// Apply the known set/unset bits
+static uint ApplyKnownFlags(
+    uint incomingFlags)
+{
+    uint flags = incomingFlags;
+
+#if GPURT_CLIENT_INTERFACE_MAJOR_VERSION >= 41
+    // Apply known bits common to all TraceRay calls
+    flags = ((flags & ~AmdTraceRayGetKnownUnsetRayFlags()) | AmdTraceRayGetKnownSetRayFlags());
+#endif
+
+    // Apply options overrides
+    flags &= ~Options::getRayFlagsOverrideForceDisableMask();
+    flags |= Options::getRayFlagsOverrideForceEnableMask();
+
+    return flags;
+}
+
+//=====================================================================================================================
+// Apply compile time pipeline config flags only, it does not apply known common flags from TraceRay call sites
+static uint ApplyCompileTimePipelineConfigFlags(
+    uint incomingFlags)
+{
+    uint flags = incomingFlags;
+
+    flags |= (AmdTraceRayGetStaticFlags() & (PIPELINE_FLAG_SKIP_PROCEDURAL_PRIMITIVES | PIPELINE_FLAG_SKIP_TRIANGLES));
+#if DEVELOPER
+    flags |= DispatchRaysConstBuf.profileRayFlags;
+#endif
+
+    return flags;
+}
+
+//=====================================================================================================================
+// Apply all static known flags, include both compile time pipeline config flags and known set/unset bits
+static uint ApplyAllStaticallyKnownFlags(
+    uint incomingFlags) // The flags from TraceRay call sites,
+                        // 0 means get Pipeline flags for all shaders in this pipeline
+{
+    return ApplyCompileTimePipelineConfigFlags(ApplyKnownFlags(incomingFlags));
+}
+
+//=====================================================================================================================
+// Get the box sort heuristic mode according to the pipeline flags
+static uint GetBoxHeuristicMode()
+{
+    uint boxHeuristicMode = AmdTraceRayGetBoxSortHeuristicMode();
+    if ((boxHeuristicMode == BoxSortHeuristic::LargestFirstOrClosest) ||
+        (boxHeuristicMode == BoxSortHeuristic::LargestFirstOrClosestMidPoint))
+    {
+        boxHeuristicMode = GetBoxSortingHeuristicFromRayFlags(ApplyAllStaticallyKnownFlags(0), boxHeuristicMode);
+    }
+
+    return boxHeuristicMode;
+}
+
 //=====================================================================================================================
 struct Vpc64 {
@@ -181,6 +206,7 @@ struct Vpc64 {
         const uint firstMetadataBit = 32;
         const uint firstPriorityBitInMetadata = 16;
         GPU_ASSERT((vpc & 0xFFFF000000000000) == 0);
+        vpc &= 0x0000FFFFFFFFFFFF;
         vpc |= (prio64 << (firstMetadataBit + firstPriorityBitInMetadata));
         return Vpc64(vpc);
     }
@@ -216,24 +242,37 @@ struct Vpc32 {
     bool IsValid()
     {
-        return GetFunctionAddr() != 0;
+        return vpc != 0;
     }

-    void SetPriority(uint priority)
+    Vpc32 SetPriority(uint priority)
     {
+        if (_AmdIsLlpc())
+        {
+            return Vpc32(vpc);
+        }
+
+        vpc &= ~0x7;
         vpc |= priority;
+
+        return Vpc32(vpc);
     }

     uint GetPriority()
     {
         return (uint)(vpc & 0x7);
     }
+
+    static Vpc32 MakeWithPriority(Vpc32 vpc32, uint priority)
+    {
+        return vpc32.SetPriority(priority);
+    }
 };

 //=====================================================================================================================
 // 32-bit function pointer packing/unpacking
 //
-static Vpc64 Vpc32ToVpc64(Vpc32 vpc32, bool unpackPriority)
+static Vpc64 Vpc32ToVpc64(Vpc32 vpc32)
 {
     if (_AmdIsLlpc())
     {
@@ -242,10 +281,7 @@ static Vpc64 Vpc32ToVpc64(Vpc32 vpc32, bool unpackPriority)

     Vpc64 vpc64 = Vpc64((uint64_t)(vpc32.GetFunctionAddr()));

-    if (unpackPriority)
-    {
-        vpc64.SetPriority(vpc32.GetPriority());
-    }
+    vpc64.SetPriority(vpc32.GetPriority());

     return vpc64;
 }
@@ -315,8 +351,10 @@ struct _AmdDispatchSystemData
         return dispatchId;
     }

-    static _AmdDispatchSystemData MakeDeadLaneWithStack();
-    static _AmdDispatchSystemData MakeDeadLaneWithoutStack();
+    void SetDead(bool withStack)
+    {
+        nextNodePtr = withStack ? DEAD_LANE_WITH_STACK : DEAD_LANE_WITHOUT_STACK;
+    }

     uint dispatchLinearId; // Packed dispatch linear id. Combine x/y/z into 1 DWORD.
@@ -358,27 +396,12 @@ struct _AmdRaySystemState
     // Incoming flags are the flags passed by TraceRay call
     uint IncomingFlags()
     {
-        uint incomingFlags = uint(bitFieldExtract64(packedAccelStruct, 48, 12));
-#if GPURT_CLIENT_INTERFACE_MAJOR_VERSION >= 41
-        // Apply known bits common to all TraceRay calls
-        incomingFlags = ((incomingFlags & ~AmdTraceRayGetKnownUnsetRayFlags()) | AmdTraceRayGetKnownSetRayFlags());
-#endif
-        // Apply options overrides
-        incomingFlags &= ~Options::getRayFlagsOverrideForceDisableMask();
-        incomingFlags |= Options::getRayFlagsOverrideForceEnableMask();
-
-        return incomingFlags;
+        return uint(bitFieldExtract64(packedAccelStruct, 48, 12));
     }

     uint Flags()
     {
-        uint rayFlags = IncomingFlags();
-        // Apply compile time pipeline config flags into the ray flags
-        rayFlags |= (AmdTraceRayGetStaticFlags() & (PIPELINE_FLAG_SKIP_PROCEDURAL_PRIMITIVES | PIPELINE_FLAG_SKIP_TRIANGLES));
-#if DEVELOPER
-        rayFlags |= DispatchRaysConstBuf.profileRayFlags;
-#endif
-        return rayFlags;
+        return ApplyAllStaticallyKnownFlags(IncomingFlags());
     }

     void SetAnyHitDidAccept(bool value)
@@ -421,7 +444,7 @@ struct _AmdPrimitiveSystemState
         packedGeometryIndex(0),
         packedInstanceContribution(0)
         , currNodePtr(INVALID_IDX)
-#if GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP
+#if GPURT_DEBUG_CONTINUATION_TRAVERSAL
         , packedType(0)
 #endif
     {
@@ -441,6 +464,10 @@ struct _AmdPrimitiveSystemState
                                        // hitKind [31 : 24]
     uint currNodePtr;
+    void SetCurrNodePtr(uint p)
+    {
+        currNodePtr = p;
+    }

     uint GeometryIndex()
     {
@@ -519,7 +546,7 @@ struct _AmdPrimitiveSystemState
         packedInstanceContribution = bitFieldInsert(packedInstanceContribution, 24, 8, hitKind);
     }

-#if GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP
+#if GPURT_DEBUG_CONTINUATION_TRAVERSAL
     // The following member data are only used in DEBUG
     uint packedType; // IsProcedural:   [31]    - 1 bit
                      // AnyhitCallType: [1 : 0] - 2 bits
@@ -598,9 +625,7 @@ struct _AmdTraversalState
     // field becomes re-used for something else in non-rebraid mode.
     uint reservedNodePtr; // RTIPv2.0 (lastNodePtr)

-#if GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP == 0
     uint32_t packedReturnAddr; // The address of the function to return to, packed into 32 bits.
-#endif

     uint InstanceContribution()
     {
@@ -629,16 +654,16 @@ struct _AmdTraversalState

     void PackStackPtrTop(uint ptr)
     {
-        GPU_ASSERT((_AmdGetRtip() == RayTracingIpLevel::RtIp1_1) ||
-                   (_AmdGetRtip() == RayTracingIpLevel::RtIp2_0));
+        GPU_ASSERT((GetRtIpLevel() == RayTracingIpLevel::RtIp1_1) ||
+                   (GetRtIpLevel() == RayTracingIpLevel::RtIp2_0));

         packedStackTopOrParentPointer = ptr;
     }

     uint StackPtrTop()
     {
-        GPU_ASSERT((_AmdGetRtip() == RayTracingIpLevel::RtIp1_1) ||
-                   (_AmdGetRtip() == RayTracingIpLevel::RtIp2_0));
+        GPU_ASSERT((GetRtIpLevel() == RayTracingIpLevel::RtIp1_1) ||
+                   (GetRtIpLevel() == RayTracingIpLevel::RtIp2_0));

         return packedStackTopOrParentPointer;
     }
@@ -659,14 +684,14 @@ struct _AmdTraversalState
         return committed.State();
     }

-    void PackReturnAddress(Vpc64 returnAddr)
+    void SetReturnAddress(Vpc32 returnAddr)
     {
-        packedReturnAddr = Vpc64ToVpc32(returnAddr).GetU32();
+        packedReturnAddr = returnAddr.GetU32();
     }

-    Vpc64 ReturnAddress()
+    Vpc32 ReturnAddress()
     {
-        return Vpc32ToVpc64(Vpc32(packedReturnAddr), true);
+        return Vpc32(packedReturnAddr);
     }
 };

@@ -712,6 +737,26 @@ struct _AmdRayHistoryCounter
 };
 #endif

+namespace Traits
+{
+
+static bool HasStacklessDeadLanes()
+{
+    return false;
+}
+
+static bool HasStackfulDeadLanes()
+{
+    return Options::getPersistentLaunchEnabled();
+}
+
+static bool HasDeadLanes()
+{
+    return HasStackfulDeadLanes() || HasStacklessDeadLanes();
+}
+
+} // namespace Traits
+
 //=====================================================================================================================
 struct _AmdSystemData
 {
@@ -723,52 +768,56 @@ struct _AmdSystemData

     bool IsDeadLaneWithoutStack()
     {
-        // This type of dead lane is only possible when the continuations stack is in global memory.
-        // Explicitly check the compile time setting to help the compiler eliminate unnecessary code at runtime.
-        return (dispatch.nextNodePtr == DEAD_LANE_WITHOUT_STACK) && _AmdContinuationStackIsGlobal();
+        return Traits::HasStacklessDeadLanes() && dispatch.nextNodePtr == DEAD_LANE_WITHOUT_STACK;
     }

     bool IsDeadLaneWithStack()
     {
-        // This type of dead lane is only possible when persistent launch is enabled.
-        // Explicitly check the compile time setting to help the compiler eliminate unnecessary code at runtime.
-        return (dispatch.nextNodePtr == DEAD_LANE_WITH_STACK) && Options::getPersistentLaunchEnabled();
+        return Traits::HasStackfulDeadLanes() && dispatch.nextNodePtr == DEAD_LANE_WITH_STACK;
     }
+
+    bool IsDeadLane()
+    {
+        return IsDeadLaneWithoutStack() || IsDeadLaneWithStack();
+    }

     bool IsTraversal()
     {
+        GPU_ASSERT(!IsDeadLane());
         return IsValidNode(dispatch.nextNodePtr);
     }

     bool IsChsOrMiss(in uint state)
     {
+        GPU_ASSERT(!IsDeadLane());
         return (state >= TRAVERSAL_STATE_COMMITTED_NOTHING);
     }

     bool IsMiss(in uint state)
     {
+        GPU_ASSERT(!IsDeadLane());
         return IsChsOrMiss(state) && !IsValidNode(traversal.committed.instNodePtr);
     }

     bool IsAhs(in uint state)
     {
+        GPU_ASSERT(!IsDeadLane());
         return (state == TRAVERSAL_STATE_CANDIDATE_NON_OPAQUE_TRIANGLE);
     }

     bool IsIs(in uint state)
     {
+        GPU_ASSERT(!IsDeadLane());
         return ((state == TRAVERSAL_STATE_CANDIDATE_PROCEDURAL_PRIMITIVE) ||
                 (state == TRAVERSAL_STATE_CANDIDATE_NON_OPAQUE_PROCEDURAL_PRIMITIVE));
     }

     bool IsChs(in uint state)
     {
+        GPU_ASSERT(!IsDeadLane());
         return IsChsOrMiss(state) && IsValidNode(traversal.committed.instNodePtr);
     }

-    static _AmdSystemData MakeDeadLaneWithStack();
-    static _AmdSystemData MakeDeadLaneWithoutStack();
-
     // Note: _AmdDispatchSystemData must be the first member of _AmdSystemData. This allows us to save some VGPRs if
     // we need to call a function that takes _AmdSystemData but doesn't actually need ray or traversal data.
     // For example, the launch kernel can make a dead lane and enqueue traversal with just dispatch.nextNodePtr.
@@ -816,24 +865,30 @@ struct _AmdTraversalResultData
           // 2) otherwise the first hit non-opaque primitive.
 };

-#if GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP == 0
+#if ((GPURT_DEBUG_CONTINUATION_TRAVERSAL == 0) && (!defined(__cplusplus)))
 // Define specialized intrinsics.
 // We use macros because HLSL does not have varargs or generics.
 // The macros and intrinsics are defined by llpc.
-DECLARE_ENQUEUE(, uint64_t returnAddr, _AmdSystemData data)
+DECLARE_ENQUEUE(, uint32_t returnAddr, _AmdSystemData data)

-DECLARE_ENQUEUE(Traversal, uint64_t dummyReturnAddr, _AmdSystemData data)
-DECLARE_ENQUEUE(TraversalDead, uint64_t dummyReturnAddr, _AmdDispatchSystemData data)
-DECLARE_ENQUEUE(RayGen, uint64_t dummyReturnAddr, _AmdDispatchSystemData data)
+DECLARE_ENQUEUE(Traversal, uint32_t dummyReturnAddr, _AmdSystemData data)
+DECLARE_ENQUEUE(TraversalDead, uint32_t dummyReturnAddr, _AmdDispatchSystemData data)
+DECLARE_ENQUEUE(RayGen, uint32_t dummyReturnAddr, _AmdDispatchSystemData data)

-DECLARE_ENQUEUE(AnyHit, uint64_t returnAddr, _AmdAnyHitSystemData data, float2 candidateBarycentrics)
-DECLARE_ENQUEUE(Intersection, uint64_t returnAddr, _AmdAnyHitSystemData data)
+DECLARE_ENQUEUE(AnyHit, uint32_t returnAddr, _AmdAnyHitSystemData data, float2 candidateBarycentrics)
+DECLARE_ENQUEUE(Intersection, uint32_t returnAddr, _AmdAnyHitSystemData data)

-DECLARE_AWAIT(AnyHit, _AmdAnyHitSystemData, uint64_t returnAddr, _AmdAnyHitSystemData data)
-DECLARE_AWAIT(CallShader, _AmdDispatchSystemData, uint64_t returnAddr, _AmdDispatchSystemData data)
+DECLARE_AWAIT(AnyHit, _AmdAnyHitSystemData, uint32_t returnAddr, _AmdAnyHitSystemData data)
+DECLARE_AWAIT(CallShader, _AmdDispatchSystemData, uint32_t returnAddr, _AmdDispatchSystemData data)

+#ifndef PASS_DUMMY_RET_ADDR
 // No returnAddr argument. The return address is instead included in the passed system data.
 DECLARE_AWAIT(Traversal, _AmdDispatchSystemData, _AmdSystemData data)
+#else // PASS_DUMMY_RET_ADDR
+// Pass a dummy return address for consistency reasons.
+// The actual return address is included in the passed system data.
+DECLARE_AWAIT(Traversal, _AmdDispatchSystemData, VpcIntTy dummyReturnAddr, _AmdSystemData data) +#endif DECLARE_RESTORE_SYSTEM_DATA(, _AmdDispatchSystemData data) DECLARE_RESTORE_SYSTEM_DATA(AnyHit, _AmdAnyHitSystemData data) @@ -853,64 +908,37 @@ DECLARE_CONT_STACK_LOAD_LAST_USE(U32, uint32_t) DECLARE_CONT_STACK_STORE(U32, uint32_t value) DECLARE_CONT_STACK_LOAD_LAST_USE(U64, uint64_t) DECLARE_CONT_STACK_STORE(U64, uint64_t value) -#endif - -inline _AmdDispatchSystemData _AmdDispatchSystemData::MakeDeadLaneWithStack() -{ - _AmdDispatchSystemData data = _AmdGetUninitializedDispatchSystemData(); - data.nextNodePtr = DEAD_LANE_WITH_STACK; - return data; -} - -inline _AmdDispatchSystemData _AmdDispatchSystemData::MakeDeadLaneWithoutStack() -{ - _AmdDispatchSystemData data = _AmdGetUninitializedDispatchSystemData(); - data.nextNodePtr = DEAD_LANE_WITHOUT_STACK; - return data; -} - -inline _AmdSystemData _AmdSystemData::MakeDeadLaneWithStack() -{ - _AmdSystemData data = _AmdGetUninitializedSystemData(); - data.dispatch.nextNodePtr = DEAD_LANE_WITH_STACK; - return data; -} - -inline _AmdSystemData _AmdSystemData::MakeDeadLaneWithoutStack() +#else // ((GPURT_DEBUG_CONTINUATION_TRAVERSAL == 0) && (!defined(__cplusplus))) +//===================================================================================================================== +inline _AmdDispatchSystemData _AmdGetUninitializedDispatchSystemData() { - _AmdSystemData data = _AmdGetUninitializedSystemData(); - data.dispatch.nextNodePtr = DEAD_LANE_WITHOUT_STACK; - return data; + return (_AmdDispatchSystemData)0; } //===================================================================================================================== -// Return the argument. -static Vpc64 GetVpc64FromShaderId(Vpc32 shaderId, uint priority) +inline _AmdSystemData _AmdGetUninitializedSystemData() { - Vpc64 vpc64 = Vpc32ToVpc64(shaderId, /* unpackPriority = */ false); - vpc64.SetPriority(priority); - return vpc64; + return (_AmdSystemData)0; } +#endif //===================================================================================================================== -static Vpc64 GetVpc64FromShaderIdAddr(GpuVirtualAddress addr, uint priority) +static Vpc32 GetVpcFromShaderIdAddr(GpuVirtualAddress addr) { #ifdef __cplusplus return 1; #else - Vpc32 shaderId = Vpc32(ConstantLoadDwordAtAddr(addr)); - return GetVpc64FromShaderId(shaderId, priority); + return Vpc32(ConstantLoadDwordAtAddr(addr)); #endif } //===================================================================================================================== -static Vpc64 GetVpc64FromShaderIdTable( +static Vpc32 GetVpcFromShaderIdTable( GpuVirtualAddress tableAddress, uint index, - uint stride, - uint priority) + uint stride) { - return GetVpc64FromShaderIdAddr(tableAddress + stride * index, priority); + return GetVpcFromShaderIdAddr(tableAddress + stride * index); } //===================================================================================================================== @@ -929,15 +957,6 @@ static Vpc32 GetAnyHit32BitShaderId( return Vpc32(ConstantLoadDwordAtAddr(tableVa + offset + 8)); } -//===================================================================================================================== -// Returns the 64-bit VPC for the given AHS by loading its shader address, and setting the AHS priority. 
-static Vpc64 GetAnyHitAddr( - uint hitGroupRecordIndex) -{ - Vpc32 shaderId = GetAnyHit32BitShaderId(hitGroupRecordIndex); - return GetVpc64FromShaderId(shaderId, SCHEDULING_PRIORITY_AHS); -} - //===================================================================================================================== // Returns whether the corresponding AHS is non-null. static bool AnyHitIsNonNull( @@ -1002,13 +1021,6 @@ static float4x3 WorldToObject4x3(in uint64_t tlasBaseAddr, in uint instNodePtr) return transpose(WorldToObject3x4(tlasBaseAddr, instNodePtr)); } -//===================================================================================================================== -// Implementation of DispatchRaysIndex. -export uint3 _cont_DispatchRaysIndex3(in _AmdDispatchSystemData data) -{ - return data.DispatchId(); -} - //===================================================================================================================== // Load dispatch dimensions from constant buffer. static uint3 GetDispatchRaysDimensions() @@ -1035,78 +1047,6 @@ static uint GetPersistentDispatchSize() return min(DispatchRaysConstBuf.rayDispatchMaxGroups, groupsNeeded); } -//===================================================================================================================== -// Implementation of DispatchRaysDimensions(). -export uint3 _cont_DispatchRaysDimensions3(in _AmdDispatchSystemData data) -{ - return GetDispatchRaysDimensions(); -} - -#if CONTINUATION_ON_GPU -//===================================================================================================================== -// Return the hit state for AnyHit and Intersection -export _AmdPrimitiveSystemState _cont_GetCandidateState(in _AmdAnyHitSystemData data) -{ - return data.candidate; -} - -//===================================================================================================================== -// Return the hit state for ClosestHit -export _AmdPrimitiveSystemState _cont_GetCommittedState(in _AmdSystemData data) -{ - return data.traversal.committed; -} - -//===================================================================================================================== -export float3 _cont_WorldRayOrigin3(in _AmdSystemData state) -{ - return state.ray.origin; -} - -//===================================================================================================================== -export float3 _cont_WorldRayDirection3(in _AmdSystemData state) -{ - return state.ray.direction; -} - -//===================================================================================================================== -export float _cont_RayTMin(in _AmdSystemData state) -{ - return state.ray.tMin; -} - -//===================================================================================================================== -export uint _cont_RayFlags(in _AmdSystemData state) -{ - return state.ray.IncomingFlags(); -} - -//===================================================================================================================== -export uint _cont_InstanceInclusionMask(in _AmdSystemData data) -{ - return ExtractInstanceInclusionMask(data.ray.traceParameters); -} - -//===================================================================================================================== -export float _cont_RayTCurrent(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) -{ - if (_AmdGetShaderKind() == DXILShaderKind::Intersection) - { - // The intersection shader is an exception. 
While the system data is usually about the candidate hit, the - // current t must be from the committed hit. - primitive = _cont_GetCommittedState(data); - } - - float tCurrentHw = 0.f; - { - tCurrentHw = primitive.rayTCurrent; - } - - // AMD Gpu shifts the origin, so rayTCurrent is between 0 and (tMaxApp - tMinApp). Add tMinApp back for App's use. - return tCurrentHw + data.ray.tMin; -} -#endif - //===================================================================================================================== // Map a thread to a ray, some threads could end up with non-existent (invalid) rays. // Note D3D12_DISPATCH_RAYS_DESC::(w x h x d) are organized to DispatchDims = (?, d, 1). @@ -1190,77 +1130,156 @@ static uint3 GetDispatchId(uint width, uint height, uint dispatchId) return uint3(xTile * TileWidth + x, yTile * TileHeight + y, z); } +#ifdef __cplusplus //===================================================================================================================== -export uint _cont_InstanceIndex(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +// Helper function for cpp only +static float3 mul(in float3 v, in float4x3 m) { - - return ConstantLoadDwordAtAddr( - GetInstanceNodeAddr(data.ray.AccelStruct(), primitive.instNodePtr) + - INSTANCE_NODE_EXTRA_OFFSET + RTIP1_1_INSTANCE_SIDEBAND_INSTANCE_INDEX_OFFSET); + float3 r; + r.x = dot(m[0], v); + r.y = dot(m[1], v); + r.z = dot(m[2], v); + return r; } +#endif +#if ((GPURT_DEBUG_CONTINUATION_TRAVERSAL == 0) && (!defined(__cplusplus))) //===================================================================================================================== -export uint _cont_InstanceID(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +// Implementation of DispatchRaysIndex. +export uint3 _cont_DispatchRaysIndex3(in _AmdDispatchSystemData data) { - - return ConstantLoadDwordAtAddr( - GetInstanceNodeAddr(data.ray.AccelStruct(), primitive.instNodePtr) + INSTANCE_DESC_ID_AND_MASK_OFFSET) & 0x00ffffff; + return data.DispatchId(); } //===================================================================================================================== -export uint _cont_GeometryIndex(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +// Implementation of DispatchRaysDimensions(). 
+export uint3 _cont_DispatchRaysDimensions3(in _AmdDispatchSystemData data) { - return primitive.GeometryIndex(); + return GetDispatchRaysDimensions(); } //===================================================================================================================== -export uint _cont_PrimitiveIndex(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +// Return the hit state for AnyHit and Intersection +export _AmdPrimitiveSystemState _cont_GetCandidateState(in _AmdAnyHitSystemData data) { - return primitive.primitiveIndex; + return data.candidate; } //===================================================================================================================== -export float4x3 _cont_ObjectToWorld4x3(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +// Return the hit state for ClosestHit +export _AmdPrimitiveSystemState _cont_GetCommittedState(in _AmdSystemData data) { - return ObjectToWorld4x3(data.ray.AccelStruct(), primitive.instNodePtr); + return data.traversal.committed; } //===================================================================================================================== -export float4x3 _cont_WorldToObject4x3(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +export float3 _cont_WorldRayOrigin3(in _AmdSystemData state) { - return WorldToObject4x3(data.ray.AccelStruct(), primitive.instNodePtr); + return state.ray.origin; } //===================================================================================================================== -export TriangleData _cont_TriangleVertexPositions(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +export float3 _cont_WorldRayDirection3(in _AmdSystemData state) { - const GpuVirtualAddress instanceAddr = GetInstanceNodeAddr(data.ray.AccelStruct(), primitive.instNodePtr); - { - return FetchTriangleFromNode(GetInstanceAddr(FetchInstanceDescAddr(instanceAddr)), primitive.currNodePtr); - } + return state.ray.direction; } -#ifdef __cplusplus //===================================================================================================================== -// Helper function for cpp only -static float3 mul(in float3 v, in float4x3 m) +export float _cont_RayTMin(in _AmdSystemData state) { - float3 r; - r.x = dot(m[0], v); - r.y = dot(m[1], v); - r.z = dot(m[2], v); - return r; + return state.ray.tMin; } -#endif //===================================================================================================================== -export float3 _cont_ObjectRayOrigin3(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +export uint _cont_RayFlags(in _AmdSystemData state) { - return mul(float4(data.ray.origin, 1.0), WorldToObject4x3(data.ray.AccelStruct(), primitive.instNodePtr)); + // Get the flags passed by TraceRay call and apply the known set/unset bits. 
+ return ApplyKnownFlags(state.ray.IncomingFlags()); } //===================================================================================================================== -export float3 _cont_ObjectRayDirection3(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +export uint _cont_InstanceInclusionMask(in _AmdSystemData data) +{ + return ExtractInstanceInclusionMask(data.ray.traceParameters); +} + +//===================================================================================================================== +export float _cont_RayTCurrent(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +{ + if (_AmdGetShaderKind() == DXILShaderKind::Intersection) + { + // The intersection shader is an exception. While the system data is usually about the candidate hit, the + // current t must be from the committed hit. + primitive = _cont_GetCommittedState(data); + } + + float tCurrentHw = 0.f; + { + tCurrentHw = primitive.rayTCurrent; + } + + // AMD Gpu shifts the origin, so rayTCurrent is between 0 and (tMaxApp - tMinApp). Add tMinApp back for App's use. + return tCurrentHw + data.ray.tMin; +} + +//===================================================================================================================== +export uint _cont_InstanceIndex(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +{ + + return ConstantLoadDwordAtAddr( + GetInstanceNodeAddr(data.ray.AccelStruct(), primitive.instNodePtr) + + INSTANCE_NODE_EXTRA_OFFSET + RTIP1_1_INSTANCE_SIDEBAND_INSTANCE_INDEX_OFFSET); +} + +//===================================================================================================================== +export uint _cont_InstanceID(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +{ + + return ConstantLoadDwordAtAddr( + GetInstanceNodeAddr(data.ray.AccelStruct(), primitive.instNodePtr) + INSTANCE_DESC_ID_AND_MASK_OFFSET) & 0x00ffffff; +} + +//===================================================================================================================== +export uint _cont_GeometryIndex(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +{ + return primitive.GeometryIndex(); +} + +//===================================================================================================================== +export uint _cont_PrimitiveIndex(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +{ + return primitive.primitiveIndex; +} + +//===================================================================================================================== +export float4x3 _cont_ObjectToWorld4x3(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +{ + return ObjectToWorld4x3(data.ray.AccelStruct(), primitive.instNodePtr); +} + +//===================================================================================================================== +export float4x3 _cont_WorldToObject4x3(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +{ + return WorldToObject4x3(data.ray.AccelStruct(), primitive.instNodePtr); +} + +//===================================================================================================================== +export TriangleData _cont_TriangleVertexPositions(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +{ + const GpuVirtualAddress instanceAddr = GetInstanceNodeAddr(data.ray.AccelStruct(), primitive.instNodePtr); + { + return FetchTriangleFromNode(GetInstanceAddr(FetchInstanceDescAddr(instanceAddr)), primitive.currNodePtr); + } +} + 
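The _cont_ObjectRayOrigin3 and _cont_ObjectRayDirection3 exports that follow transform the world-space ray into object space via mul with WorldToObject4x3, using w = 1 for the origin (so the translation row applies) and w = 0 for the direction (so it drops out). A self-contained C++ sketch of that affine math, using stand-in vector types and HLSL's row-vector mul convention:

#include <cstdio>

struct Float3 { float x, y, z; };

// Row-major 4x3: rows 0..2 hold the linear part, row 3 the translation.
struct Float4x3 { Float3 row[4]; };

static Float3 TransformPoint(const Float4x3& m, const Float3& p)
{
    // w = 1: the translation row participates, as in float4(origin, 1.0).
    return Float3{
        m.row[0].x * p.x + m.row[1].x * p.y + m.row[2].x * p.z + m.row[3].x,
        m.row[0].y * p.x + m.row[1].y * p.y + m.row[2].y * p.z + m.row[3].y,
        m.row[0].z * p.x + m.row[1].z * p.y + m.row[2].z * p.z + m.row[3].z };
}

static Float3 TransformVector(const Float4x3& m, const Float3& v)
{
    // w = 0: directions ignore translation, as in float4(direction, 0.0).
    return Float3{
        m.row[0].x * v.x + m.row[1].x * v.y + m.row[2].x * v.z,
        m.row[0].y * v.x + m.row[1].y * v.y + m.row[2].y * v.z,
        m.row[0].z * v.x + m.row[1].z * v.y + m.row[2].z * v.z };
}

int main()
{
    // World-to-object for an instance translated +5 on X in world space.
    const Float4x3 worldToObject = {{ {1,0,0}, {0,1,0}, {0,0,1}, {-5,0,0} }};
    const Float3 o = TransformPoint(worldToObject, {7, 0, 0});
    const Float3 d = TransformVector(worldToObject, {0, 0, 1});
    std::printf("o=(%g %g %g) d=(%g %g %g)\n", o.x, o.y, o.z, d.x, d.y, d.z);
    return 0;
}

With the sample matrix, the origin (7, 0, 0) maps to (2, 0, 0) while the direction is left untouched, which is exactly the w = 1 versus w = 0 distinction the two exports encode.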
+//===================================================================================================================== +export float3 _cont_ObjectRayOrigin3(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) +{ + return mul(float4(data.ray.origin, 1.0), WorldToObject4x3(data.ray.AccelStruct(), primitive.instNodePtr)); +} + +//===================================================================================================================== +export float3 _cont_ObjectRayDirection3(in _AmdSystemData data, in _AmdPrimitiveSystemState primitive) { return mul(float4(data.ray.direction, 0.0), WorldToObject4x3(data.ray.AccelStruct(), primitive.instNodePtr)); } @@ -1353,7 +1372,6 @@ export uint _cont_GetContinuationStackAddr() { uint offset = 0; -#if GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP == 0 if (_AmdContinuationStackIsGlobal()) { const uint3 threadIdInGroup = AmdExtThreadIdInGroupCompute(); @@ -1366,7 +1384,6 @@ export uint _cont_GetContinuationStackAddr() offset = id * DispatchRaysConstBuf.cpsFrontendStackSize; } else -#endif { offset = #if GPURT_CLIENT_INTERFACE_MAJOR_VERSION < 36 @@ -1387,27 +1404,26 @@ export uint64_t _cont_GetContinuationStackGlobalMemBase() } //===================================================================================================================== -static Vpc64 GetTraversalVpc64() +static Vpc32 GetTraversalVpc32() { // NOTE: DXCP uses a table for TraceRay, thus a load to traceRayGpuVa retrieves the actual traversal function // address. But Vulkan does not use the table so far, traceRayGpuVa is already the traversal function address. - return Vpc64(PackUint64(DispatchRaysConstBuf.traceRayGpuVaLo, - DispatchRaysConstBuf.traceRayGpuVaHi)); + return Vpc64ToVpc32(Vpc64(PackUint64(DispatchRaysConstBuf.traceRayGpuVaLo, + DispatchRaysConstBuf.traceRayGpuVaHi))); } //===================================================================================================================== -static Vpc64 GetTraversalVpc64PwgDead() +static Vpc32 GetTraversalVpc32PwgDead() { - return Vpc64(PackUint64(DispatchRaysConstBuf.traceRayGpuVaLo, - DispatchRaysConstBuf.traceRayGpuVaHi)); + return Vpc64ToVpc32(Vpc64(PackUint64(DispatchRaysConstBuf.traceRayGpuVaLo, + DispatchRaysConstBuf.traceRayGpuVaHi))); } //===================================================================================================================== -static Vpc64 GetRayGenVpc64() +static Vpc32 GetRayGenVpc32() { - return GetVpc64FromShaderIdAddr(PackUint64(DispatchRaysConstBuf.rayGenerationTableAddressLo, - DispatchRaysConstBuf.rayGenerationTableAddressHi), - SCHEDULING_PRIORITY_RGS); + return GetVpcFromShaderIdAddr(PackUint64(DispatchRaysConstBuf.rayGenerationTableAddressLo, + DispatchRaysConstBuf.rayGenerationTableAddressHi)); } //===================================================================================================================== @@ -1460,6 +1476,162 @@ export uint _cont_GetSbtStride() } } +//===================================================================================================================== +// ReportHit implementation that is called from the intersection shader. +// May call the AnyHit shader. 
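The _cont_ReportHit implementation below first shifts the app-provided THit by ray.tMin into the hardware range (the inverse of _cont_RayTCurrent above, which adds tMin back) and rejects candidates that fall outside [0, committed t]. A small C++ sketch of that window test, with illustrative values:

#include <cstdio>

// Shift an app-space THit into the hardware range and test the window.
static bool HitIsWithinWindow(float tHitApp, float tMinApp, float tCommittedHw)
{
    const float tHitHw = tHitApp - tMinApp; // mirrors 'THit -= data.base.ray.tMin'
    return (tHitHw >= 0.0f) && (tHitHw <= tCommittedHw);
}

int main()
{
    const float tMin      = 0.5f;
    const float committed = 9.5f; // hardware-range committed t (app-space 10.0)

    std::printf("%d\n", HitIsWithinWindow(3.0f,  tMin, committed)); // 1: inside window
    std::printf("%d\n", HitIsWithinWindow(0.25f, tMin, committed)); // 0: behind tMin
    std::printf("%d\n", HitIsWithinWindow(12.0f, tMin, committed)); // 0: beyond committed hit
    return 0;
}

The boundary cases mirror the HLSL: a THit exactly at tMin or exactly at the committed t is accepted, because the rejection test uses strict comparisons.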
+export bool _cont_ReportHit(inout_param(_AmdAnyHitSystemData) data, float THit, uint HitKind) +{ + // TODO Reuse shader record index computed in Traversal + // TODO Check for closest hit and duplicate anyHit calling + + THit -= data.base.ray.tMin; + float tCurrentCommitted = 0.f; + { + tCurrentCommitted = data.base.traversal.committed.rayTCurrent; + } + + if ((THit < 0.f) || (THit > tCurrentCommitted)) + { + // Discard the hit candidate and hint the compiler to not keep the + // values alive, which will remove redundant moves. + data.candidate.rayTCurrent = _AmdGetUninitializedF32(); + // Don't discard the hit kind as it is bit packed and cannot be discarded partially. + return false; + } + + data.candidate.rayTCurrent = THit; + data.candidate.PackHitKind(HitKind); + + uint isOpaque = true; + { + PrimitiveData primitiveData; + InstanceDesc desc; + + { + // Get primitive nodes to process based on candidate or committed hit + const uint tlasNodePtr = data.candidate.instNodePtr; + + const GpuVirtualAddress tlasAddr = data.base.ray.AccelStruct() + ExtractNodePointerOffset(tlasNodePtr); + desc = FetchInstanceDescAddr(tlasAddr); + isOpaque = data.candidate.IsOpaque(); + } + } + + if (!isOpaque) + { + uint hitGroupRecordIndex = 0; + { + hitGroupRecordIndex = data.base.dispatch.shaderRecIdx; + } + // Compute hit group address and fetch shader identifiers + const Vpc32 anyHitAddr = GetAnyHit32BitShaderId(hitGroupRecordIndex); + + if (anyHitAddr.IsValid()) + { + // Call AnyHit + // Hit attributes are added as an additional argument by the compiler + Vpc32 resumeAddr = Vpc32::MakeWithPriority(Vpc32(_AmdGetResumePointAddr()), SCHEDULING_PRIORITY_IS); + data = _AmdAwaitAnyHit(anyHitAddr.GetU32(), resumeAddr.GetU32(), data); + _AmdRestoreSystemDataAnyHit(data); + return data.base.ray.AnyHitDidAccept(); + } + else + { + _cont_AcceptHit(data); + _AmdAcceptHitAttributes(data); // changes data.base.traversal.committedBarycentrics plus up-to-6 DW data in payload + return true; + } + } + else + { + _cont_AcceptHit(data); + _AmdAcceptHitAttributes(data); + return true; + } +} + +//===================================================================================================================== +// CallShader implementation +export void _cont_CallShader(inout_param(_AmdDispatchSystemData) data, uint index) +{ + const uint64_t callableTableBaseAddress = + PackUint64(DispatchRaysConstBuf.callableTableBaseAddressLo, DispatchRaysConstBuf.callableTableBaseAddressHi); + + if (callableTableBaseAddress == 0) + { + // TODO: It might be better to AwaitSelf here, adding an artificial suspend point. + // For the common case of non-null callable shaders, this would reduce + // the size of compiled shaders, as the post-CallShader() part is unreachable, + // also simplifying manual testing with suspend points. + // For null callable shaders, it has the advantage of allowing + // to reconverge on the resume function if implemented in a way that yields only + // a single resume function. + return; + } + + const Vpc32 addr = GetVpcFromShaderIdTable(callableTableBaseAddress, + index, + DispatchRaysConstBuf.callableTableStrideInBytes); + + if (!addr.IsValid()) + { + // See TODO above on how to handle this case better. 
+ return; + } + + const uint callerShaderRecIdx = data.shaderRecIdx; + data.shaderRecIdx = index; // the record index used by the callable shader + + const DXILShaderKind enclosingShaderType = _AmdGetShaderKind(); + const uint resumePrio = GetPriorityForShaderType(enclosingShaderType); + const Vpc32 resumeAddr = Vpc32::MakeWithPriority(Vpc32(_AmdGetResumePointAddr()), resumePrio); + + data = _AmdAwaitCallShader(addr.GetU32(), resumeAddr.GetU32(), data); + + // for the resume part. + data.shaderRecIdx = callerShaderRecIdx; // restores callerShaderRecIdx + _AmdRestoreSystemData(data); // llvm inserts amd.dx.setLocalRootIndex(data.shaderRecIdx) +} + +//===================================================================================================================== +// Returns the low part of the miss shader address and sets up the dispatch data to have the correct shader record +// index. +static Vpc32 SetupMissShader(inout_param(_AmdSystemData) data, out_param(uint) shaderRecIdx) +{ + const uint64_t missTableBaseAddress = + PackUint64(DispatchRaysConstBuf.missTableBaseAddressLo, DispatchRaysConstBuf.missTableBaseAddressHi); + if (missTableBaseAddress == 0) + { + shaderRecIdx = 0; + return Vpc32(0); + } + + shaderRecIdx = ExtractMissShaderIndex(data.ray.traceParameters); + + // Calculate miss shader record address + return GetVpcFromShaderIdTable(missTableBaseAddress, + shaderRecIdx, + DispatchRaysConstBuf.missTableStrideInBytes); +} + +//===================================================================================================================== +static HitGroupInfo GetHitGroupInfo( + in _AmdSystemData data, + in uint state, + in _AmdPrimitiveSystemState candidate) +{ + uint geometryIndex = (state < TRAVERSAL_STATE_COMMITTED_NOTHING) ? + candidate.GeometryIndex() : data.traversal.committed.GeometryIndex(); + uint instanceContribution = (state < TRAVERSAL_STATE_COMMITTED_NOTHING) ? 
+ candidate.InstanceContribution() : data.traversal.committed.InstanceContribution(); + + return GetHitGroupInfo(ExtractRayContributionToHitIndex(data.ray.traceParameters), + ExtractMultiplierForGeometryContributionToHitIndex(data.ray.traceParameters), + geometryIndex, + instanceContribution); +} +#endif + //===================================================================================================================== // Ray History helper functions //===================================================================================================================== @@ -1523,7 +1695,7 @@ static void RayHistoryWriteTopLevel(inout_param(_AmdSystemData) data) #if DEVELOPER if (EnableTraversalCounter() && data.counter.WriteTokenTopLevel()) { - WriteRayHistoryTokenTopLevel(GetRayId(_cont_DispatchRaysIndex3(data.dispatch)), data.ray.AccelStruct()); + WriteRayHistoryTokenTopLevel(GetRayId(data.dispatch.DispatchId()), data.ray.AccelStruct()); data.counter.SetWriteTokenTopLevel(false); } #endif @@ -1588,7 +1760,7 @@ static void RayHistoryWriteBegin(inout_param(_AmdSystemData) data) #if DEVELOPER if (EnableTraversalCounter()) { - const uint rayId = GetRayId(_cont_DispatchRaysIndex3(data.dispatch)); + const uint rayId = GetRayId(data.dispatch.DispatchId()); RayDesc rayDesc = (RayDesc)0; rayDesc.Origin = data.ray.origin; rayDesc.Direction = data.ray.direction; @@ -1600,7 +1772,7 @@ static void RayHistoryWriteBegin(inout_param(_AmdSystemData) data) data.counter.SetCallerShaderType(_AmdGetShaderKind()); WriteRayHistoryTokenBegin(rayId, - _cont_DispatchRaysIndex3(data.dispatch), + data.dispatch.DispatchId(), data.ray.AccelStruct(), data.ray.Flags(), data.ray.traceParameters, @@ -1619,7 +1791,7 @@ static void RayHistoryWriteEnd(inout_param(_AmdSystemData) data, uint state) #if DEVELOPER WriteDispatchCounters(data.counter.numIterations); - const uint rayId = GetRayId(_cont_DispatchRaysIndex3(data.dispatch)); + const uint rayId = GetRayId(data.dispatch.DispatchId()); const uint64_t timerEnd = AmdTraceRaySampleGpuTimer(); WriteRayHistoryTokenTimeStamp(rayId, timerEnd); @@ -1635,7 +1807,7 @@ static void RayHistoryWriteEnd(inout_param(_AmdSystemData) data, uint state) if (data.IsChs(state)) { // For CHS, get candidate and barycentrics from traversal. 
- const uint instNodeIndex = FetchInstanceIdx(ConvertRtIpLevel(_AmdGetRtip()), + const uint instNodeIndex = FetchInstanceIdx(ConvertRtIpLevel(GetRtIpLevel()), data.ray.AccelStruct(), data.traversal.committed.instNodePtr); WriteRayHistoryTokenEnd(rayId, @@ -1661,16 +1833,10 @@ static void RayHistoryWriteEnd(inout_param(_AmdSystemData) data, uint state) } //===================================================================================================================== -static uint2 RayHistoryGetIdentifierFromVPC(uint64_t vpc) +static uint2 RayHistoryGetIdentifierFromVPC(Vpc32 vpc) { // Zero out the metadata bits - return uint2(SplitUint64(vpc).x & 0xFFFFFFC0, 0); -} - -//===================================================================================================================== -static uint2 RayHistoryGetIdentifierFromShaderId(uint2 shaderId) -{ - return uint2(shaderId.x & 0xFFFFFFC0, 0); + return uint2(vpc.GetFunctionAddr(), 0); } //===================================================================================================================== @@ -1679,7 +1845,7 @@ static void RayHistoryWriteTriangleHitResult(_AmdSystemData data, bool accept) #if DEVELOPER if (EnableTraversalCounter()) { - WriteRayHistoryTokenTriangleHitResult(GetRayId(_cont_DispatchRaysIndex3(data.dispatch)), + WriteRayHistoryTokenTriangleHitResult(GetRayId(data.dispatch.DispatchId()), uint(accept), data.counter.candidateTCurrent); } @@ -1695,7 +1861,7 @@ static void RayHistoryWriteFunctionCall(inout_param(_AmdSystemData) data, #if DEVELOPER if (EnableTraversalCounter()) { - const uint rayId = GetRayId(_cont_DispatchRaysIndex3(data.dispatch)); + const uint rayId = GetRayId(data.dispatch.DispatchId()); switch(shaderKind) { @@ -1749,7 +1915,7 @@ static void RayHistoryWriteAnyHitOrProceduralStatus(inout_param(_AmdSystemData) #if DEVELOPER if (EnableTraversalCounter()) { - const uint rayId = GetRayId(_cont_DispatchRaysIndex3(data.dispatch)); + const uint rayId = GetRayId(data.dispatch.DispatchId()); const uint status = (data.dispatch.nextNodePtr == END_SEARCH) ? HIT_STATUS_ACCEPT_AND_END_SEARCH : (data.ray.AnyHitDidAccept() ? 
HIT_STATUS_ACCEPT : HIT_STATUS_IGNORE); @@ -1779,225 +1945,64 @@ static void RayHistoryWriteAnyHitOrProceduralStatus(inout_param(_AmdSystemData) { data.counter.numCandidateHits++; } - break; - - default: - break; - } - data.counter.SetCallerShaderType(DXILShaderKind::Invalid); - } -#endif -} - -//===================================================================================================================== -static void RayHistoryHandleIteration(inout_param(_AmdSystemData) data, uint nextNodePtr) -{ -#if DEVELOPER - if (EnableTraversalCounter()) - { - WriteRayHistoryTokenNodePtr(GetRayId(_cont_DispatchRaysIndex3(data.dispatch)), nextNodePtr); - UpdateWaveTraversalStatistics(ConvertRtIpLevel(_AmdGetRtip()), nextNodePtr); - - data.counter.numIterations++; - } -#endif -} - -//===================================================================================================================== -static void RayHistoryWriteBottomLevel(_AmdSystemData data, GpuVirtualAddress bvhAddress) -{ -#if DEVELOPER - if (EnableTraversalCounter()) - { - WriteRayHistoryTokenBottomLevel(GetRayId(_cont_DispatchRaysIndex3(data.dispatch)), bvhAddress); - } -#endif -} - -//===================================================================================================================== -static void TraversalCounterWriteCounter(_AmdSystemData data) -{ -#if DEVELOPER - if (EnableTraversalCounter()) - { - TraversalCounter counter = (TraversalCounter)0; - counter.data[TCID_NUM_RAY_BOX_TEST] = data.counter.numRayBoxTest; - counter.data[TCID_NUM_RAY_TRIANGLE_TEST] = data.counter.numRayTriangleTest; - counter.data[TCID_NUM_ITERATION] = data.counter.numIterations; - counter.data[TCID_MAX_TRAVERSAL_DEPTH] = data.counter.maxStackDepth; - counter.data[TCID_NUM_ANYHIT_INVOCATION] = data.counter.numAnyHitInvocation; - counter.data[TCID_SHADER_ID] = data.counter.shaderIdLow; - counter.data[TCID_SHADER_RECORD_INDEX] = data.counter.shaderRecIdx; - counter.data[TCID_TIMING_DATA] = data.counter.timer; - counter.data[TCID_WAVE_ID] = AmdTraceRayGetHwWaveId(); - counter.data[TCID_NUM_CANDIDATE_HITS] = data.counter.numCandidateHits; - counter.data[TCID_INSTANCE_INTERSECTIONS] = data.counter.instanceIntersections; - - WriteTraversalCounter(GetRayId(_cont_DispatchRaysIndex3(data.dispatch)), counter); - } -#endif -} - -#if CONTINUATION_ON_GPU -//===================================================================================================================== -// ReportHit implementation that is called from the intersection shader. -// May call the AnyHit shader. -export bool _cont_ReportHit(inout_param(_AmdAnyHitSystemData) data, float THit, uint HitKind) -{ - // TODO Reuse shader record index computed in Traversal - // TODO Check for closest hit and duplicate anyHit calling - - THit -= data.base.ray.tMin; - float tCurrentCommitted = 0.f; - { - tCurrentCommitted = data.base.traversal.committed.rayTCurrent; - } - - if ((THit < 0.f) || (THit > tCurrentCommitted)) - { - // Discard the hit candidate and hint the compiler to not keep the - // values alive, which will remove redundant moves. - data.candidate.rayTCurrent = _AmdGetUninitializedF32(); - // Don't discard the hit kind as it is bit packed and cannot be discarded partially. 
- return false; - } - - data.candidate.rayTCurrent = THit; - data.candidate.PackHitKind(HitKind); - - uint isOpaque = true; - { - PrimitiveData primitiveData; - InstanceDesc desc; - - { - // Get primitive nodes to process based on candidate or committed hit - const uint tlasNodePtr = data.candidate.instNodePtr; - - const GpuVirtualAddress tlasAddr = data.base.ray.AccelStruct() + ExtractNodePointerOffset(tlasNodePtr); - desc = FetchInstanceDescAddr(tlasAddr); - isOpaque = data.candidate.IsOpaque(); - } - } - - if (!isOpaque) - { - uint hitGroupRecordIndex = 0; - { - hitGroupRecordIndex = data.base.dispatch.shaderRecIdx; - } - // Compute hit group address and fetch shader identifiers - const Vpc64 anyHitAddr = GetAnyHitAddr(hitGroupRecordIndex); - - if (anyHitAddr.IsValid()) - { - // Call AnyHit - // Hit attributes are added as an additional argument by the compiler - Vpc64 resumeAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetResumePointAddr()), SCHEDULING_PRIORITY_IS); - data = _AmdAwaitAnyHit(anyHitAddr.GetU64(), resumeAddr.GetU64(), data); - _AmdRestoreSystemDataAnyHit(data); - return data.base.ray.AnyHitDidAccept(); - } - else - { - _cont_AcceptHit(data); - _AmdAcceptHitAttributes(data); // changes data.base.traversal.committedBarycentrics plus up-to-6 DW data in payload - return true; - } - } - else - { - _cont_AcceptHit(data); - _AmdAcceptHitAttributes(data); - return true; + break; + + default: + break; + } + data.counter.SetCallerShaderType(DXILShaderKind::Invalid); } +#endif } //===================================================================================================================== -// CallShader implementation -export void _cont_CallShader(inout_param(_AmdDispatchSystemData) data, uint index) +static void RayHistoryHandleIteration(inout_param(_AmdSystemData) data, uint nextNodePtr) { - const uint64_t callableTableBaseAddress = - PackUint64(DispatchRaysConstBuf.callableTableBaseAddressLo, DispatchRaysConstBuf.callableTableBaseAddressHi); - - if (callableTableBaseAddress == 0) +#if DEVELOPER + if (EnableTraversalCounter()) { - // TODO: It might be better to AwaitSelf here, adding an artificial suspend point. - // For the common case of non-null callable shaders, this would reduce - // the size of compiled shaders, as the post-CallShader() part is unreachable, - // also simplifying manual testing with suspend points. - // For null callable shaders, it has the advantage of allowing - // to reconverge on the resume function if implemented in a way that yields only - // a single resume function. - return; - } - - const Vpc64 addr = GetVpc64FromShaderIdTable(callableTableBaseAddress, - index, - DispatchRaysConstBuf.callableTableStrideInBytes, - SCHEDULING_PRIORITY_CALLABLE); + WriteRayHistoryTokenNodePtr(GetRayId(data.dispatch.DispatchId()), nextNodePtr); + UpdateWaveTraversalStatistics(ConvertRtIpLevel(GetRtIpLevel()), nextNodePtr); - if (!addr.IsValid()) - { - // See TODO above on how to handle this case better. - return; + data.counter.numIterations++; } - - const uint callerShaderRecIdx = data.shaderRecIdx; - data.shaderRecIdx = index; // the record index used by the callable shader - - const DXILShaderKind enclosingShaderType = _AmdGetShaderKind(); - const uint resumePrio = GetPriorityForShaderType(enclosingShaderType); - const Vpc64 resumeAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetResumePointAddr()), resumePrio); - - data = _AmdAwaitCallShader(addr.GetU64(), resumeAddr.GetU64(), data); - - // for the resume part. 
- data.shaderRecIdx = callerShaderRecIdx; // restores callerShaderRecIdx - _AmdRestoreSystemData(data); // llvm inserts amd.dx.setLocalRootIndex(data.shaderRecIdx) +#endif } //===================================================================================================================== -// Returns the low part of the miss shader address and sets up the dispatch data to have the correct shader record -// index. -static Vpc64 SetupMissShader(inout_param(_AmdSystemData) data, out_param(uint) shaderRecIdx) +static void RayHistoryWriteBottomLevel(_AmdSystemData data, GpuVirtualAddress bvhAddress) { - const uint64_t missTableBaseAddress = - PackUint64(DispatchRaysConstBuf.missTableBaseAddressLo, DispatchRaysConstBuf.missTableBaseAddressHi); - if (missTableBaseAddress == 0) +#if DEVELOPER + if (EnableTraversalCounter()) { - shaderRecIdx = 0; - return Vpc64(0); + WriteRayHistoryTokenBottomLevel(GetRayId(data.dispatch.DispatchId()), bvhAddress); } - - shaderRecIdx = ExtractMissShaderIndex(data.ray.traceParameters); - - // Calculate miss shader record address - const Vpc64 shaderAddr = GetVpc64FromShaderIdTable(missTableBaseAddress, - shaderRecIdx, - DispatchRaysConstBuf.missTableStrideInBytes, - SCHEDULING_PRIORITY_MISS); - - return shaderAddr; +#endif } //===================================================================================================================== -static HitGroupInfo GetHitGroupInfo( - in _AmdSystemData data, - in uint state, - in _AmdPrimitiveSystemState candidate) +static void TraversalCounterWriteCounter(_AmdSystemData data) { - uint geometryIndex = (state < TRAVERSAL_STATE_COMMITTED_NOTHING) ? - candidate.GeometryIndex() : data.traversal.committed.GeometryIndex(); - uint instanceContribution = (state < TRAVERSAL_STATE_COMMITTED_NOTHING) ? - candidate.InstanceContribution() : data.traversal.committed.InstanceContribution(); +#if DEVELOPER + if (EnableTraversalCounter()) + { + TraversalCounter counter = (TraversalCounter)0; + counter.data[TCID_NUM_RAY_BOX_TEST] = data.counter.numRayBoxTest; + counter.data[TCID_NUM_RAY_TRIANGLE_TEST] = data.counter.numRayTriangleTest; + counter.data[TCID_NUM_ITERATION] = data.counter.numIterations; + counter.data[TCID_MAX_TRAVERSAL_DEPTH] = data.counter.maxStackDepth; + counter.data[TCID_NUM_ANYHIT_INVOCATION] = data.counter.numAnyHitInvocation; + counter.data[TCID_SHADER_ID] = data.counter.shaderIdLow; + counter.data[TCID_SHADER_RECORD_INDEX] = data.counter.shaderRecIdx; + counter.data[TCID_TIMING_DATA] = data.counter.timer; + counter.data[TCID_WAVE_ID] = AmdTraceRayGetHwWaveId(); + counter.data[TCID_NUM_CANDIDATE_HITS] = data.counter.numCandidateHits; + counter.data[TCID_INSTANCE_INTERSECTIONS] = data.counter.instanceIntersections; - return GetHitGroupInfo(ExtractRayContributionToHitIndex(data.ray.traceParameters), - ExtractMultiplierForGeometryContributionToHitIndex(data.ray.traceParameters), - geometryIndex, - instanceContribution); -} + WriteTraversalCounter(GetRayId(data.dispatch.DispatchId()), counter); + } #endif +} //===================================================================================================================== // Order matters, the following HLSL reference the functions and structs defined above. 
TODO: refactor these into a @@ -2005,7 +2010,30 @@ static HitGroupInfo GetHitGroupInfo( #include "Continuations1_1.hlsl" #include "Continuations2_0.hlsl" -#if CONTINUATION_ON_GPU +//===================================================================================================================== +// Calls traversal for the current rtip. +static void TraversalInternal( + inout_param(_AmdSystemData) data, + inout_param(uint) state, + inout_param(_AmdPrimitiveSystemState) candidate, + inout_param(float2) candidateBarycentrics) +{ + switch (GetRtIpLevel()) + { +#if (GPURT_RTIP_LEVEL == GPURT_RTIP_LEGACY_LEVEL) || (GPURT_RTIP_LEVEL == 0) + case RayTracingIpLevel::RtIp1_1: + TraversalInternal1_1(data, state, candidate, candidateBarycentrics); + break; + case RayTracingIpLevel::RtIp2_0: + TraversalInternal2_0(data, state, candidate, candidateBarycentrics); + break; +#endif + default: + break; + } +} + +#if ((GPURT_DEBUG_CONTINUATION_TRAVERSAL == 0) && (!defined(__cplusplus))) static uint64_t GetDispatchIdAddr() { return PackUint64(DispatchRaysConstBuf.cpsDispatchIdAddressLo, DispatchRaysConstBuf.cpsDispatchIdAddressHi); @@ -2066,12 +2094,13 @@ static void LaunchRayGen(bool setupStack) #if DEVELOPER systemData.parentId = -1; #endif - _AmdEnqueueRayGen(GetRayGenVpc64().GetU64(), _AmdGetUninitializedI64(), systemData); + _AmdEnqueueRayGen(GetRayGenVpc32().GetU32(), _AmdGetUninitializedI32(), systemData); } else if (Options::getPersistentLaunchEnabled()) { - _AmdDispatchSystemData systemData = _AmdDispatchSystemData::MakeDeadLaneWithStack(); - _AmdEnqueueTraversalDead(GetTraversalVpc64PwgDead().GetU64(), _AmdGetUninitializedI64(), systemData); + _AmdDispatchSystemData systemData = _AmdGetUninitializedDispatchSystemData(); + systemData.SetDead(true); + _AmdEnqueueTraversalDead(GetTraversalVpc32PwgDead().GetU32(), _AmdGetUninitializedI32(), systemData); } } @@ -2152,7 +2181,7 @@ export void _cont_TraceRay( } // Initialise traversal system state _AmdTraversalState traversal = (_AmdTraversalState)0; - switch (_AmdGetRtip()) + switch (GetRtIpLevel()) { case RayTracingIpLevel::RtIp1_1: traversal = InitTraversalState1_1(instanceInclusionMask, rayDesc, isValid); @@ -2176,16 +2205,19 @@ export void _cont_TraceRay( const uint callerShaderRecIdx = dispatch.shaderRecIdx; // 0 if from RayGen. const uint parentId = RayHistoryGetParentId(dispatch); - const Vpc64 traversalAddr = GetTraversalVpc64(); // The type of the shader containing this TraceRay call, i.e. the shader we are inlined into. const DXILShaderKind enclosingShaderType = _AmdGetShaderKind(); const uint resumePrio = GetPriorityForShaderType(enclosingShaderType); // NO control flow is allowed between _AmdGetResumePointAddr() and _AmdAwaitTraversal(). - const Vpc64 resumeAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetResumePointAddr()), resumePrio); - data.traversal.PackReturnAddress(resumeAddr); - dispatch = _AmdAwaitTraversal(traversalAddr.GetU64(), data); + const Vpc32 resumeAddr = Vpc32::MakeWithPriority(Vpc32(_AmdGetResumePointAddr()), resumePrio); + data.traversal.SetReturnAddress(resumeAddr); +#ifndef PASS_DUMMY_RET_ADDR + dispatch = _AmdAwaitTraversal(GetTraversalVpc32().GetU32(), data); +#else // PASS_DUMMY_RET_ADDR + dispatch = _AmdAwaitTraversal(GetTraversalVpc32().GetU32(), _AmdGetUninitializedI32(), data); +#endif // for the resume part. 
dispatch.shaderRecIdx = callerShaderRecIdx; // restores callerShaderRecIdx @@ -2196,26 +2228,23 @@ export void _cont_TraceRay( } //===================================================================================================================== -// Get the address of the function that should be called next, either a closest hit or a miss shader. If no hit or miss -// shader should be called, this method returns false (and in that case it should return to -// data.traversal.ReturnAddress()), otherwise it returns true. -static bool GetNextHitMissPc( +// Get the address of the function that should be called next, either a closest hit or a miss shader. +// If no hit or miss shader should be called, this method returns DEAD_SHADER_ADDR. +static Vpc32 GetNextHitMissPc( inout_param(_AmdSystemData) data, uint state, - _AmdPrimitiveSystemState candidate, - out_param(Vpc64) nextShaderAddr) + _AmdPrimitiveSystemState candidate) { // MS if (data.IsMiss(state)) { uint shaderRecIdx; - const Vpc64 missShaderAddr = SetupMissShader(data, shaderRecIdx); + const Vpc32 missShaderAddr = SetupMissShader(data, shaderRecIdx); if (missShaderAddr.IsValid()) { // Valid MS data.dispatch.shaderRecIdx = shaderRecIdx; - nextShaderAddr = missShaderAddr; - return true; + return missShaderAddr; } } @@ -2230,98 +2259,79 @@ static bool GetNextHitMissPc( if ((data.ray.Flags() & RAY_FLAG_SKIP_CLOSEST_HIT_SHADER) == 0) { - if (hitInfo.closestHitId.x != 0) + Vpc32 closestHitId = Vpc32(hitInfo.closestHitId.x); + if (closestHitId.IsValid()) { - // Valid CHS - nextShaderAddr = GetVpc64FromShaderId(Vpc32(hitInfo.closestHitId.x), SCHEDULING_PRIORITY_CHS); - return true; + return closestHitId; } } } - return false; + return Vpc32(DEAD_SHADER_ADDR); } //===================================================================================================================== -// Calls traversal for the current rtip. -static void TraversalInternal( - inout_param(_AmdSystemData) data, - inout_param(uint) state, - inout_param(_AmdPrimitiveSystemState) candidate, - inout_param(float2) candidateBarycentrics) +// Helper to handle enqueueing CHS, MS. 
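EnqueueHitMiss, whose HLSL follows, makes a single decision: if no closest-hit or miss shader should run (GetNextHitMissPc returned DEAD_SHADER_ADDR), it jumps back to the return address _cont_TraceRay stashed via SetReturnAddress; otherwise it enqueues the selected shader with that return address. A host-side C++ sketch of that decision, where kDeadShaderAddr is a stand-in value (the real DEAD_SHADER_ADDR constant is defined outside this hunk) and the enqueue stubs are hypothetical:

#include <cstdint>
#include <cstdio>

// Stand-in sentinel; the actual DEAD_SHADER_ADDR value is not shown here.
constexpr std::uint32_t kDeadShaderAddr = 0xFFFFFFFFu;

// Hypothetical stubs for the noreturn enqueue intrinsics.
static void EnqueueRayGen(std::uint32_t addr)
{
    std::printf("resume caller @0x%08X\n", addr);
}
static void Enqueue(std::uint32_t addr, std::uint32_t ret)
{
    std::printf("run CHS/MS @0x%08X, return to 0x%08X\n", addr, ret);
}

// Decision mirrored from EnqueueHitMiss: no CHS/MS to run means jumping
// straight back to the return address stored by _cont_TraceRay.
static void EnqueueHitMissModel(std::uint32_t nextShaderAddr, std::uint32_t returnAddr)
{
    if (nextShaderAddr == kDeadShaderAddr)
    {
        EnqueueRayGen(returnAddr); // noreturn tail call in the real code
        return;
    }
    Enqueue(nextShaderAddr, returnAddr);
}

int main()
{
    EnqueueHitMissModel(kDeadShaderAddr, 0x00001040u); // no shader: resume caller
    EnqueueHitMissModel(0x00002080u,     0x00001040u); // run the selected CHS/MS
    return 0;
}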
+static void EnqueueHitMiss(_AmdSystemData data, Vpc32 nextShaderAddr) { - switch (_AmdGetRtip()) - { -#if (GPURT_RTIP_LEVEL == GPURT_RTIP_LEGACY_LEVEL) || (GPURT_RTIP_LEVEL == 0) - case RayTracingIpLevel::RtIp1_1: - TraversalInternal1_1(data, state, candidate, candidateBarycentrics); - break; - case RayTracingIpLevel::RtIp2_0: - TraversalInternal2_0(data, state, candidate, candidateBarycentrics); - break; -#endif - default: - break; - } -} + GPU_ASSERT(nextShaderAddr.GetU32() != DEAD_SHADER_ADDR && !data.IsDeadLane()); + const uint state = data.traversal.committed.State(); + RayHistoryWriteEnd(data, state); -static void EnqueueNextShader(bool hasWorkToDo, Vpc64 nextShaderAddr, Vpc64 returnAddr, _AmdSystemData data) -{ - if (!hasWorkToDo) + const Vpc32 returnAddr = data.traversal.ReturnAddress(); + + if (nextShaderAddr.GetU32() == DEAD_SHADER_ADDR) { - if (_AmdContinuationStackIsGlobal()) - { - // No work to do = dead lane, jump to traversal as a synchronization point with an empty system data - _AmdSystemData sysData = _AmdSystemData::MakeDeadLaneWithoutStack(); - _AmdEnqueueTraversal(GetTraversalVpc64().GetU64(), _AmdGetUninitializedI64(), sysData); - } - else - { - GPU_ASSERT(false); - } + // We do not have an address to jump to, retrieve the return address and return to RGS + _AmdEnqueueRayGen(returnAddr.GetU32(), _AmdGetUninitializedI32(), data.dispatch); } - const uint newState = data.traversal.committed.State(); - RayHistoryWriteEnd(data, newState); + // Enqueue the selected shader + const DXILShaderKind shaderKind = (DXILShaderKind)(data.IsMiss(state) + ? (int)DXILShaderKind::Miss // convert to int to fix linux build error + : (int)DXILShaderKind::ClosestHit + ); - if (nextShaderAddr.GetU64() != returnAddr.GetU64()) - { - const DXILShaderKind shaderKind = (DXILShaderKind)(data.IsMiss(newState) ? - (int)DXILShaderKind::Miss : // convert to int to fix linux build error - (int)DXILShaderKind::ClosestHit); - RayHistoryWriteFunctionCall(data, - RayHistoryGetIdentifierFromVPC(nextShaderAddr.GetU64()), - data.dispatch.shaderRecIdx, - shaderKind); - - _AmdEnqueue(nextShaderAddr.GetU64(), returnAddr.GetU64(), data); - } + RayHistoryWriteFunctionCall(data, + RayHistoryGetIdentifierFromVPC(nextShaderAddr), + data.dispatch.shaderRecIdx, + shaderKind); - // Return to RayGen. No need to set a priority, as it is already set in the stored return address. - _AmdEnqueueRayGen(returnAddr.GetU64(), _AmdGetUninitializedI64(), data.dispatch); + _AmdEnqueue(nextShaderAddr.GetU32(), returnAddr.GetU32(), data); } //===================================================================================================================== -// Convenience helper calling Traversal on the debug/emulation path that returns _AmdTraversalResultData. -static _AmdTraversalResultData TraversalInternalDebugWrapper( - inout_param(_AmdSystemData) data) + +export void _cont_ExitRayGen(in _AmdDispatchSystemData data) { - uint state = TRAVERSAL_STATE_COMMITTED_NOTHING; - _AmdPrimitiveSystemState candidate = (_AmdPrimitiveSystemState)0; - float2 candidateBarycentrics = float2(0.0f, 0.0f); + if (Options::getPersistentLaunchEnabled() + ) { + // Lanes that exit raygen own a stack. 
Return them to traversal for scheduling + _AmdDispatchSystemData sysData = _AmdGetUninitializedDispatchSystemData(); + sysData.SetDead(true); + _AmdEnqueueTraversalDead(GetTraversalVpc32PwgDead().GetU32(), _AmdGetUninitializedI32(), sysData); + } + // In all other cases, exit the wave + _AmdComplete(); +} - TraversalInternal(data, state, candidate, candidateBarycentrics); +//===================================================================================================================== - _AmdTraversalResultData result = (_AmdTraversalResultData)0; - result.state = state; - result.candidate = candidate; - result.candidateBarycentrics = candidateBarycentrics; +//===================================================================================================================== - return result; -} +namespace ThreadTrace +{ -//===================================================================================================================== -// Wrapper to ensure the following shader section is marked as "Scheduler" in TTV (if thread traces are enabled). -static void EnterSchedulerSection() +enum struct Section +{ + Scheduler = 8, + Traversal = 6 +}; + +//================================================================================================================= +// Wrapper to ensure the subsequent shader section is correctly identified in TTV. +// If thread traces are disabled, this does nothing. Otherwise, it issues a return token and a new shader data token +// of the type specified by `section`. +static void EnterSection(uint section) { if (Options::getThreadTraceEnabled()) { @@ -2331,8 +2341,44 @@ static void EnterSchedulerSection() // Emit a function call token to start the scheduler function. AmdExtD3DShaderIntrinsics_ShaderMarker(0x11 | - (/* scheduler */ 8 << 8) | - (/* exec */ WaveActiveCountBits(true) << 13)); + (/* section */ section << 8) | + (/* exec */ WaveActiveCountBits(true) << 13)); + } +} + +} // namespace ThreadTrace + +//===================================================================================================================== +// Scheduler for dead lanes. +// Some lanes may return from this function. All lanes that return are guaranteed to be dead and are supposed to enqueue +// traversal for subsequent processing. If the full wave is dead and persistent launch is on, new work will be started. +// If persistent work is off, and all lanes are dead (potentially less than a full wave), and no work could be obtained, +// then the lanes are terminated. 
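Before the HLSL below, a host-side C++ reduction of that contract, with the _Amd* enqueue intrinsics replaced by hypothetical stubs and the wave-wide queries collapsed to plain booleans:

#include <cstdio>

// Hypothetical host-side stubs for the _Amd* intrinsics.
static void LaunchRayGen()         { std::puts("start a fresh dispatch"); }
static void EnqueueTraversalDead() { std::puts("re-enqueue dead lane to Traversal"); }
static void Complete()             { std::puts("terminate the wave"); }

// Effective control flow of ScheduleDeadWave, with the wave-wide queries
// (WaveActiveCountBits, AmdExtLaneCount) reduced to a single boolean.
static void ScheduleDeadWaveModel(bool persistentLaunch, bool wholeWaveDead)
{
    if (persistentLaunch)
    {
        if (wholeWaveDead)
        {
            LaunchRayGen();     // whole wave dead: try to pick up new work
        }
        EnqueueTraversalDead(); // stackful dead lanes stay in the scheduling loop
    }
    else
    {
        Complete();             // all dead and no persistent work: end the lanes
    }
}

int main()
{
    ScheduleDeadWaveModel(/*persistentLaunch=*/true,  /*wholeWaveDead=*/true);
    ScheduleDeadWaveModel(/*persistentLaunch=*/false, /*wholeWaveDead=*/false);
    return 0;
}

Note the real function also distinguishes per-lane stack state (IsDeadLaneWithStack) and re-checks Options::getPersistentLaunchEnabled(); the sketch keeps only the effective branches.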
+static void ScheduleDeadWave(_AmdSystemData data, Vpc32 traversalAddr) +{ + GPU_ASSERT(WaveActiveAllTrue(data.IsDeadLane())); + + if (Options::getPersistentLaunchEnabled()) + { + if (data.IsDeadLaneWithStack()) + { + if (WaveActiveCountBits(true) == AmdExtLaneCount()) + { + // If the whole wave is dead, get ready to start a new dispatch + LaunchRayGen(false); + } + // Passthrough these stackful dead lanes + _AmdEnqueueTraversalDead(traversalAddr.GetU32(), _AmdGetUninitializedI32(), data.dispatch); + } + } + + if (Options::getPersistentLaunchEnabled()) + { + _AmdEnqueueTraversalDead(traversalAddr.GetU32(), _AmdGetUninitializedI32(), data.dispatch); + } + else + { + _AmdComplete(); } } @@ -2341,6 +2387,13 @@ static void EnterSchedulerSection() export void _cont_Traversal( inout_param(_AmdSystemData) data) { + bool IsDead = data.IsDeadLane(); + const bool IsTraversal = !IsDead && data.IsTraversal(); + + // TRAVERSAL: BVH ------------------------------------------------------------------------------------------------- + uint state = TRAVERSAL_STATE_COMMITTED_NOTHING; + _AmdPrimitiveSystemState candidate = (_AmdPrimitiveSystemState)0; + float2 candidateBarycentrics = float2(0.0f, 0.0f); // Discard data that doesn't need to be kept alive during Traversal data.dispatch.shaderRecIdx = _AmdGetUninitializedI32(); if (!IsBvhRebraid()) @@ -2349,23 +2402,18 @@ export void _cont_Traversal( data.traversal.lastInstanceRootNodePtr = _AmdGetUninitializedI32(); } - // Write AHS/IS returned status - bool IsDeadLane = (data.IsDeadLaneWithoutStack() || data.IsDeadLaneWithStack()); - if (!IsDeadLane) + if (!IsDead) { + // Write AHS/IS returned status RayHistoryWriteAnyHitOrProceduralStatus(data); } // Execute traversal for active lanes. - uint state = TRAVERSAL_STATE_COMMITTED_NOTHING; - _AmdPrimitiveSystemState candidate = (_AmdPrimitiveSystemState)0; - float2 candidateBarycentrics = float2(0.0f, 0.0f); - - if (data.IsTraversal()) + if (IsTraversal) { TraversalInternal(data, state, candidate, candidateBarycentrics); } - else + else if (!IsDead) { // This branch is hit when the traversal for a lane is done: // a) AHS/IS enqueued _cont_Traversal(), for the very last time. @@ -2380,41 +2428,32 @@ export void _cont_Traversal( // For CHS, get candidate and barycentrics from traversal. if (data.IsChs(state)) { - candidate = data.traversal.committed; - candidateBarycentrics = data.traversal.committedBarycentrics; + candidate = data.traversal.committed; + candidateBarycentrics = data.traversal.committedBarycentrics; } } - // Result used on the CPU path. This is an unused dummy return value on the GPU path. - _AmdTraversalResultData result = (_AmdTraversalResultData)0; + // ALIASES AND CACHED VARIABLES ----------------------------------------------------------------------------------- - bool IsChsOrMiss = data.IsChsOrMiss(state); - // Re-enqueue Traversal until all lanes are done with BVH Traversal. - // Only then enqueue CHS/Miss to ensure other lanes that are not yet done with Traversal - // converge on these CHS/Miss invocations. - // This is necessary because Traversal has lower scheduling priority. - if (WaveActiveAllTrue(IsChsOrMiss)) - { - EnterSchedulerSection(); + // Cache Traversal's own address + const Vpc32 traversalAddr = Vpc32::MakeWithPriority(Vpc32(_AmdGetCurrentFuncAddr()), SCHEDULING_PRIORITY_TRAVERSAL); - Vpc64 nextShaderAddr = Vpc64(0); - GetNextHitMissPc(data, state, candidate, nextShaderAddr); + // Some aliases for variable state. 
Help the compiler figure out these are mutually exclusive in all modes. + bool IsChsOrMiss = false; + bool IsAhsOrIs = false; + if (!IsDead) + { + IsChsOrMiss = data.IsChsOrMiss(state); + IsAhsOrIs = (data.IsAhs(state) || data.IsIs(state)); + } + bool AllDead = Traits::HasDeadLanes() && WaveActiveAllTrue(IsDead); + bool AnyIsAhsOrIs = WaveActiveAnyTrue(IsAhsOrIs); - bool hasWorkToDo = true; - if (_AmdContinuationStackIsGlobal() && nextShaderAddr.IsValid()) - { - } + // TRAVERSAL: AHS AND IS ------------------------------------------------------------------------------------------ - const Vpc64 returnAddr = data.traversal.ReturnAddress(); - if (!nextShaderAddr.IsValid()) - { - nextShaderAddr = returnAddr; - } - EnqueueNextShader(hasWorkToDo, nextShaderAddr, returnAddr, data); - } - else + if (AnyIsAhsOrIs) { - if (data.IsAhs(state) || data.IsIs(state)) + if (IsAhsOrIs) { HitGroupInfo hitInfo = (HitGroupInfo)0; { @@ -2429,45 +2468,78 @@ export void _cont_Traversal( // AHS and IS re-enqueue SchedulerInternal when finished. if (data.IsAhs(state)) { + const Vpc32 anyHitAddr = Vpc32(hitInfo.anyHitId.x); RayHistoryWriteFunctionCall(anyHitData.base, - RayHistoryGetIdentifierFromShaderId(hitInfo.anyHitId), + RayHistoryGetIdentifierFromVPC(anyHitAddr), hitInfo.tableIndex, DXILShaderKind::AnyHit); - const Vpc64 addr = GetVpc64FromShaderId(Vpc32(hitInfo.anyHitId.x), SCHEDULING_PRIORITY_AHS); - const Vpc64 returnAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetCurrentFuncAddr()), SCHEDULING_PRIORITY_TRAVERSAL); - _AmdEnqueueAnyHit(addr.GetU64(), returnAddr.GetU64(), anyHitData, candidateBarycentrics); + const Vpc32 returnAddr = Vpc32::MakeWithPriority(Vpc32(_AmdGetCurrentFuncAddr()), SCHEDULING_PRIORITY_TRAVERSAL); + _AmdEnqueueAnyHit(anyHitAddr.GetU32(), returnAddr.GetU32(), anyHitData, candidateBarycentrics); } else { // Intersection shader GPU_ASSERT(data.IsIs(state)); + const Vpc32 isAddr = Vpc32(hitInfo.intersectionId.x); RayHistoryWriteFunctionCall(anyHitData.base, - RayHistoryGetIdentifierFromShaderId(hitInfo.intersectionId), + RayHistoryGetIdentifierFromVPC(isAddr), hitInfo.tableIndex, DXILShaderKind::Intersection); - const Vpc64 addr = GetVpc64FromShaderId(Vpc32(hitInfo.intersectionId.x), SCHEDULING_PRIORITY_IS); - const Vpc64 returnAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetCurrentFuncAddr()), SCHEDULING_PRIORITY_TRAVERSAL); - _AmdEnqueueIntersection(addr.GetU64(), returnAddr.GetU64(), anyHitData); + const Vpc32 returnAddr = Vpc32::MakeWithPriority(Vpc32(_AmdGetCurrentFuncAddr()), SCHEDULING_PRIORITY_TRAVERSAL); + _AmdEnqueueIntersection(isAddr.GetU32(), returnAddr.GetU32(), anyHitData); } } - else + _AmdEnqueueTraversal(traversalAddr.GetU32(), _AmdGetUninitializedI32(), data); + } + + // FULL WAVE OF DEAD LANES ---------------------------------------------------------------------------------------- + else if (AllDead) + { + ScheduleDeadWave(data, traversalAddr); + // this is unreachable, ScheduleDeadWave guarantees to end with an enqueue + } + + // CHS, MISS AND POSSIBLY DEAD LANES ------------------------------------------------------------------------------ + else + { + GPU_ASSERT(IsChsOrMiss || IsDead); + ThreadTrace::EnterSection(ThreadTrace::Section::Scheduler); + + Vpc32 nextShaderAddr = Vpc32(IsDead ? 
DEAD_SHADER_ADDR : GetNextHitMissPc(data, state, candidate).GetU32()); + + if (!IsDead) { - // - // Everything else needs to go back through scheduling/traversal, regardless of state - // Note we don't need "Wait" here because priorities run AHS and IS first - const Vpc64 traversalAddr = Vpc64::MakeWithPriority(Vpc64(_AmdGetCurrentFuncAddr()), SCHEDULING_PRIORITY_TRAVERSAL); - _AmdEnqueueTraversal(traversalAddr.GetU64(), _AmdGetUninitializedI64(), data); + EnqueueHitMiss(data, nextShaderAddr); } + _AmdEnqueueTraversalDead(traversalAddr.GetU32(), _AmdGetUninitializedI32(), data.dispatch); } - // This is unreachable } -#endif + +#elif GPURT_DEBUG_CONTINUATION_TRAVERSAL // ((GPURT_DEBUG_CONTINUATION_TRAVERSAL == 0) && (!defined(__cplusplus))) + +//===================================================================================================================== +// For debug. Convenience helper calling Traversal on the debug/emulation path that returns _AmdTraversalResultData. +static _AmdTraversalResultData TraversalInternalDebugWrapper( + inout_param(_AmdSystemData) data) +{ + uint state = TRAVERSAL_STATE_COMMITTED_NOTHING; + _AmdPrimitiveSystemState candidate = (_AmdPrimitiveSystemState)0; + float2 candidateBarycentrics = float2(0.0f, 0.0f); + + TraversalInternal(data, state, candidate, candidateBarycentrics); + + _AmdTraversalResultData result = (_AmdTraversalResultData)0; + result.state = state; + result.candidate = candidate; + result.candidateBarycentrics = candidateBarycentrics; + + return result; +} //===================================================================================================================== -#if GPURT_DEBUG_CONTINUATION_TRAVERSAL_RTIP // For debug. Support DxcpRt (non-continuation) to use Continuation traversal. static IntersectionResult TraceRayInternalCPSDebug( in GpuVirtualAddress topLevelBvh, // Top-level acceleration structure to use @@ -2476,26 +2548,24 @@ static IntersectionResult TraceRayInternalCPSDebug( in RayDesc rayDesc, // Ray to be traced in uint rayId, // Ray ID for profiling in uint rtIpLevel // HW version to determine TraceRay implementation +#if DEVELOPER + , in uint dynamicId // dynamic ID +#endif ) { -#if GPURT_CLIENT_INTERFACE_MAJOR_VERSION >= 41 - rayFlags = (rayFlags & ~AmdTraceRayGetKnownUnsetRayFlags()) | AmdTraceRayGetKnownSetRayFlags(); -#endif - // Initialise ray system state from TraceRay parameters _AmdRaySystemState ray = (_AmdRaySystemState)0; - ray.accelStruct = topLevelBvh; + ray.PackAccelStructAndRayflags(topLevelBvh, rayFlags); ray.direction = rayDesc.Direction; ray.origin = rayDesc.Origin; ray.tMin = rayDesc.TMin; ray.tMax = rayDesc.TMax; - ray.flags = rayFlags; ray.traceParameters = traceRayParameters; const bool isValid = true; // already verified in the caller _AmdDispatchSystemData dispatch = (_AmdDispatchSystemData)0; - dispatch.PackDispatchId(GetDispatchId()); + dispatch.PackDispatchId(AmdTraceRayDispatchRaysIndex()); #if DEVELOPER dispatch.parentId = -1; #endif @@ -2525,6 +2595,10 @@ static IntersectionResult TraceRayInternalCPSDebug( sysData.ray = ray; sysData.traversal = traversal; +#if DEVELOPER + sysData.counter.dynamicId = dynamicId; +#endif + // Begin outer while loop while (sysData.dispatch.nextNodePtr < TERMINAL_NODE) { @@ -2564,10 +2638,20 @@ static IntersectionResult TraceRayInternalCPSDebug( const uint64_t instNodePtr64 = CalculateInstanceNodePtr64(rtIpLevel, topLevelBvh, tlasNodePtr); if (state == TRAVERSAL_STATE_CANDIDATE_NON_OPAQUE_TRIANGLE) { + uint status = HIT_STATUS_ACCEPT; + // This test 
diff --git a/src/shaders/TrianglePrimitive.hlsl b/src/shaders/TrianglePrimitive.hlsl
index e2975dc..b32357e 100644
--- a/src/shaders/TrianglePrimitive.hlsl
+++ b/src/shaders/TrianglePrimitive.hlsl
@@ -224,7 +224,7 @@ TriangleData FetchTransformedTriangleData(
 //======================================================================================================================
 bool IsActive(TriangleData tri)
 {
-    return ((isnan(tri.v0.x) == false) && (isnan(tri.v1.x) == false) && (isnan(tri.v2.x) == false));
+    return (any(isnan(tri.v0)) == false) && (any(isnan(tri.v1)) == false) && (any(isnan(tri.v2)) == false);
 }

 //=====================================================================================================================
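// [Editorial note, not part of the patch] isnan() in HLSL is component-wise, so on a float3 it yields a bool3, and
// any() reduces that to a single bool. The change above therefore flags a triangle as inactive when any component of
// any vertex is NaN, instead of testing only the .x components:
//
//     bool vertexHasNaN = any(isnan(tri.v0)); // true if v0.x, v0.y, or v0.z is NaN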
diff --git a/src/shadersClean/traversal/TraversalDefs.hlsli b/src/shadersClean/traversal/TraversalDefs.hlsli
index 28f9999..ea8ea10 100644
--- a/src/shadersClean/traversal/TraversalDefs.hlsli
+++ b/src/shadersClean/traversal/TraversalDefs.hlsli
@@ -160,6 +160,15 @@ struct RayQueryInternal
 //=====================================================================================================================
 struct HitGroupInfo
 {
+#ifdef __cplusplus
+    HitGroupInfo(uint val)
+    {
+        memset(this, val, sizeof(HitGroupInfo));
+    }
+
+    HitGroupInfo() : HitGroupInfo(0)
+    {}
+#endif
     uint2 closestHitId;
     uint2 anyHitId;
     uint2 intersectionId;
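// [Editorial note, not part of the patch] The __cplusplus-only constructors make shared HLSL/C++ code such as
// "HitGroupInfo hitInfo = (HitGroupInfo)0;" valid on the C++ emulation path as well: HLSL reads it as a
// zero-initializing cast, while C++ resolves the cast to the converting constructor HitGroupInfo(0), which memsets
// the whole struct:
//
//     HitGroupInfo hitInfo = (HitGroupInfo)0; // C++: memset(this, 0, sizeof(HitGroupInfo))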
diff --git a/src/shared/rayTracingDefs.h b/src/shared/rayTracingDefs.h
index 5778dfd..636df92 100644
--- a/src/shared/rayTracingDefs.h
+++ b/src/shared/rayTracingDefs.h
@@ -115,8 +115,14 @@ struct EncodeTaskCountersCommon
 };

 //=====================================================================================================================
-struct EncodeTaskCountersBuild : EncodeTaskCountersCommon
+// There is a DXC bug that doesn't properly compile structure inheritance on the HLSL->SPIR-V path.
+// Once it is fixed, EncodeTaskCountersBuild and EncodeTaskCountersUpdate can inherit from EncodeTaskCountersCommon.
+// https://github.com/microsoft/DirectXShaderCompiler/issues/6986
+struct EncodeTaskCountersBuild
 {
+    uint numPrimitives;
+    uint primRefs;
+
     // The following indirect arguments are only used in the multi-dispatch path. Note, currently only HPLOC dispatch
     // uses these, but it will be extended to other passes when early pair compression is enabled.
     uint groupCountX;
@@ -135,8 +141,10 @@ static_assert(ENCODE_TASK_COUNTER_PRIM_REFS_OFFSET == offsetof(EncodeTaskCounter

 //=====================================================================================================================
 // Update scratch memory fields
-struct EncodeTaskCountersUpdate : EncodeTaskCountersCommon
+struct EncodeTaskCountersUpdate
 {
+    uint numPrimitives;
+    uint primRefs;
     uint refitTaskCounter;
     uint taskCount;
     uint tasksDone;
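// [Editorial note, not part of the patch] While the common counter fields are duplicated into both structs as a
// workaround, a C++-side guard could keep the flattened layouts from drifting apart until the DXC fix lands.
// A minimal sketch (hypothetical, placed next to the existing static_asserts):
//
//     static_assert(offsetof(EncodeTaskCountersBuild, primRefs) ==
//                   offsetof(EncodeTaskCountersUpdate, primRefs),
//                   "EncodeTaskCounters layouts must stay in sync");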
diff --git a/tools/CompileRTShaders.py b/tools/CompileRTShaders.py
index 95a90d3..5c691cd 100644
--- a/tools/CompileRTShaders.py
+++ b/tools/CompileRTShaders.py
@@ -43,7 +43,7 @@ DWORDS_PER_LINE = 8

 FILE_STANDARD_HEADER = """
-/* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. */
+/* Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. */
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 //
diff --git a/tools/DebugPreprocessShaders.py b/tools/DebugPreprocessShaders.py
index 4793b96..cc4f16a 100644
--- a/tools/DebugPreprocessShaders.py
+++ b/tools/DebugPreprocessShaders.py
@@ -29,7 +29,7 @@ import argparse

 cpp_file_header = """
-/* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. */
+/* Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. */

 namespace GpuRt {