From 814f15d9b20574b76d46f0f76adfc3cda1a32c26 Mon Sep 17 00:00:00 2001
From: wuyakuma <wuyakuma@outlook.com>
Date: Thu, 20 Feb 2025 14:55:52 +0800
Subject: [PATCH] Fix BVH4_CPU traversal: the old code somehow doesn't get the
 right lane data...

---
 tiny_bvh.h | 153 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 152 insertions(+), 1 deletion(-)

diff --git a/tiny_bvh.h b/tiny_bvh.h
index ce2f923..9f0e6d4 100644
--- a/tiny_bvh.h
+++ b/tiny_bvh.h
@@ -6130,7 +6130,8 @@ int32_t BVH4_CPU::Intersect( Ray& ray ) const
 		{
 			// blend in lane indices
 			float32x4_t tm = vreinterpretq_f32_u32( vorrq_u32( vandq_u32( vreinterpretq_u32_f32( vbslq_f32( hit, tmin, inf4 ) ), idxMask ), idx4 ) );
-
+            
+#if false
 			// sort
 			float tmp, d0 = tm[0], d1 = tm[1], d2 = tm[2], d3 = tm[3];
 			if (d0 < d2) tmp = d0, d0 = d2, d2 = tmp;
@@ -6155,11 +6156,49 @@ int32_t BVH4_CPU::Intersect( Ray& ray ) const
 				for (uint32_t j = 0; j < count; j++, cost += C_INT) // TODO: aim for 4 prims per leaf
 					IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) );
 			}
+#else
+            ALIGNED( 64 ) float d[4];
+            vst1q_f32(d, tm);
+            // sort
+            float tmp;
+            if (d[0] < d[2]) tmp = d[0], d[0] = d[2], d[2] = tmp;
+            if (d[1] < d[3]) tmp = d[1], d[1] = d[3], d[3] = tmp;
+            if (d[0] < d[1]) tmp = d[0], d[0] = d[1], d[1] = tmp;
+            if (d[2] < d[3]) tmp = d[2], d[2] = d[3], d[3] = tmp;
+            if (d[1] < d[2]) tmp = d[1], d[1] = d[2], d[2] = tmp;
+            
+            const uint32_t lanes[4] =
+            {
+                (uint32_t)-1,
+                *(uint32_t*)&d[1] & 3,
+                *(uint32_t*)&d[2] & 3,
+                *(uint32_t*)&d[3] & 3,
+            };
+            
+            nodeIdx = 0;
+            for (int32_t i = 1; i < 4; i++)
+            {
+                uint32_t lane = lanes[i];
+                if (node.triCount[lane] + node.childFirst[lane] == 0) continue; // TODO - never happens?
+                if (node.triCount[lane] == 0)
+                {
+                    const uint32_t childIdx = node.childFirst[lane];
+                    if (nodeIdx) stack[stackPtr++] = nodeIdx;
+                    nodeIdx = childIdx;
+                    continue;
+                }
+                const uint32_t first = node.childFirst[lane], count = node.triCount[lane];
+                for (uint32_t j = 0; j < count; j++, cost += C_INT) // TODO: aim for 4 prims per leaf
+                    IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) );
+            }
+
+#endif
 		}
 		else /* hits == 4, 2%: rare */
 		{
 			// blend in lane indices
 			float32x4_t tm = vreinterpretq_f32_u32( vorrq_u32( vandq_u32( vreinterpretq_u32_f32( vbslq_f32( hit, tmin, inf4 ) ), idxMask ), idx4 ) );
+#if false
 			// sort
 			float tmp, d0 = tm[0], d1 = tm[1], d2 = tm[2], d3 = tm[3];
 			if (d0 < d2) tmp = d0, d0 = d2, d2 = tmp;
@@ -6185,6 +6224,43 @@ int32_t BVH4_CPU::Intersect( Ray& ray ) const
 				for (uint32_t j = 0; j < count; j++, cost += C_INT) // TODO: aim for 4 prims per leaf
 					IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) );
 			}
+#else
+            ALIGNED( 64 ) float d[4];
+            vst1q_f32(d, tm);
+            // sort
+            float tmp;
+            if (d[0] < d[2]) tmp = d[0], d[0] = d[2], d[2] = tmp;
+            if (d[1] < d[3]) tmp = d[1], d[1] = d[3], d[3] = tmp;
+            if (d[0] < d[1]) tmp = d[0], d[0] = d[1], d[1] = tmp;
+            if (d[2] < d[3]) tmp = d[2], d[2] = d[3], d[3] = tmp;
+            if (d[1] < d[2]) tmp = d[1], d[1] = d[2], d[2] = tmp;
+            
+            const uint32_t lanes[4] =
+            {
+                *(uint32_t*)&d[0] & 3,
+                *(uint32_t*)&d[1] & 3,
+                *(uint32_t*)&d[2] & 3,
+                *(uint32_t*)&d[3] & 3,
+            };
+            
+            nodeIdx = 0;
+            for (int32_t i = 0; i < 4; i++)
+            {
+                uint32_t lane = lanes[i];
+                
+                if (node.triCount[lane] + node.childFirst[lane] == 0) continue; // TODO - never happens?
+                if (node.triCount[lane] == 0)
+                {
+                    const uint32_t childIdx = node.childFirst[lane];
+                    if (nodeIdx) stack[stackPtr++] = nodeIdx;
+                    nodeIdx = childIdx;
+                    continue;
+                }
+                const uint32_t first = node.childFirst[lane], count = node.triCount[lane];
+                for (uint32_t j = 0; j < count; j++, cost += C_INT) // TODO: aim for 4 prims per leaf
+                    IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) );
+            }
+#endif
 		}
 		// get next task
 		if (nodeIdx) continue;
@@ -6291,6 +6367,7 @@ bool BVH4_CPU::IsOccluded( const Ray& ray ) const
 		{
 			// blend in lane indices
 			float32x4_t tm = vreinterpretq_f32_u32( vorrq_u32( vandq_u32( vreinterpretq_u32_f32( vbslq_f32( hit, tmin, inf4 ) ), idxMask ), idx4 ) );
+#if false
 			// sort
 			float tmp, d0 = tm[0], d1 = tm[1], d2 = tm[2], d3 = tm[3];
 			if (d0 < d2) tmp = d0, d0 = d2, d2 = tmp;
@@ -6315,11 +6392,48 @@ bool BVH4_CPU::IsOccluded( const Ray& ray ) const
 				for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf
 					if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true;
 			}
+#else
+            ALIGNED( 64 ) float d[4];
+            vst1q_f32(d, tm);
+            // sort
+            float tmp;
+            if (d[0] < d[2]) tmp = d[0], d[0] = d[2], d[2] = tmp;
+            if (d[1] < d[3]) tmp = d[1], d[1] = d[3], d[3] = tmp;
+            if (d[0] < d[1]) tmp = d[0], d[0] = d[1], d[1] = tmp;
+            if (d[2] < d[3]) tmp = d[2], d[2] = d[3], d[3] = tmp;
+            if (d[1] < d[2]) tmp = d[1], d[1] = d[2], d[2] = tmp;
+            
+            const uint32_t lanes[4] =
+            {
+                (uint32_t)-1,
+                *(uint32_t*)&d[1] & 3,
+                *(uint32_t*)&d[2] & 3,
+                *(uint32_t*)&d[3] & 3,
+            };
+            
+            nodeIdx = 0;
+            for (int32_t i = 1; i < 4; i++)
+            {
+                uint32_t lane = lanes[i];
+                if (node.triCount[lane] + node.childFirst[lane] == 0) continue; // TODO - never happens?
+                if (node.triCount[lane] == 0)
+                {
+                    const uint32_t childIdx = node.childFirst[lane];
+                    if (nodeIdx) stack[stackPtr++] = nodeIdx;
+                    nodeIdx = childIdx;
+                    continue;
+                }
+                const uint32_t first = node.childFirst[lane], count = node.triCount[lane];
+                for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf
+                    if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true;
+            }
+#endif
 		}
 		else /* hits == 4, 2%: rare */
 		{
 			// blend in lane indices
 			float32x4_t tm = vreinterpretq_f32_u32( vorrq_u32( vandq_u32( vreinterpretq_u32_f32( vbslq_f32( hit, tmin, inf4 ) ), idxMask ), idx4 ) );
+#if false
 			// sort
 			float tmp, d0 = tm[0], d1 = tm[1], d2 = tm[2], d3 = tm[3];
 			if (d0 < d2) tmp = d0, d0 = d2, d2 = tmp;
@@ -6345,6 +6459,43 @@ bool BVH4_CPU::IsOccluded( const Ray& ray ) const
 				for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf
 					if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true;
 			}
+#else
+            ALIGNED( 64 ) float d[4];
+            vst1q_f32(d, tm);
+            // sort
+            float tmp;
+            if (d[0] < d[2]) tmp = d[0], d[0] = d[2], d[2] = tmp;
+            if (d[1] < d[3]) tmp = d[1], d[1] = d[3], d[3] = tmp;
+            if (d[0] < d[1]) tmp = d[0], d[0] = d[1], d[1] = tmp;
+            if (d[2] < d[3]) tmp = d[2], d[2] = d[3], d[3] = tmp;
+            if (d[1] < d[2]) tmp = d[1], d[1] = d[2], d[2] = tmp;
+            
+            const uint32_t lanes[4] =
+            {
+                *(uint32_t*)&d[0] & 3,
+                *(uint32_t*)&d[1] & 3,
+                *(uint32_t*)&d[2] & 3,
+                *(uint32_t*)&d[3] & 3,
+            };
+            
+            nodeIdx = 0;
+            for (int32_t i = 0; i < 4; i++)
+            {
+                uint32_t lane = lanes[i];
+                
+                if (node.triCount[lane] + node.childFirst[lane] == 0) continue; // TODO - never happens?
+                if (node.triCount[lane] == 0)
+                {
+                    const uint32_t childIdx = node.childFirst[lane];
+                    if (nodeIdx) stack[stackPtr++] = nodeIdx;
+                    nodeIdx = childIdx;
+                    continue;
+                }
+                const uint32_t first = node.childFirst[lane], count = node.triCount[lane];
+                for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf
+                    if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true;
+            }
+#endif
 		}
 		// get next task
 		if (nodeIdx) continue;