diff --git a/tiny_bvh.h b/tiny_bvh.h index 9f0e6d4..119ef78 100644 --- a/tiny_bvh.h +++ b/tiny_bvh.h @@ -631,7 +631,7 @@ class BVHBase LAYOUT_BVH4_GPU, LAYOUT_MBVH8, LAYOUT_CWBVH, - LAYOUT_BVH4_AVX2 + LAYOUT_BVH8_AVX2 }; struct ALIGNED( 32 ) Fragment { @@ -960,7 +960,7 @@ template class MBVH : public BVHBase public: struct MBVHNode { - // 4-wide (aka 'shallow') BVH layout. + // M-wide (aka 'shallow') BVH layout. bvhvec3 aabbMin; uint32_t firstTri; bvhvec3 aabbMax; uint32_t triCount; uint32_t child[M]; @@ -986,7 +986,7 @@ template class MBVH : public BVHBase void SplitBVHLeaf( const uint32_t nodeIdx, const uint32_t maxPrims ); // BVH data MBVHNode* mbvhNode = 0; // BVH node for M-wide BVH. - BVH bvh; // BVH4 is created from BVH and uses its data. + BVH bvh; // MBVH is created from BVH and uses its data. bool ownBVH = true; // False when ConvertFrom receives an external bvh. }; @@ -1122,6 +1122,50 @@ class ALIGNED( 64 ) BLASInstance void InvertTransform(); }; +class BVH8_WiVe : public BVHBase +{ +public: + struct BVHNode + { + // 4-way BVH node, optimized for CPU rendering. + // Based on: "Accelerated Single Ray Tracing for Wide Vector Units", Fuetterling1 et al., 2017, + // and the implementation by Mathijs Molenaar, https://github.com/mathijs727/pandora + SIMDVEC8 xmin8, xmax8; + SIMDVEC8 ymin8, ymax8; + SIMDVEC8 zmin8, zmax8; + SIMDIVEC8 child8; // bits: 31..29 = flags, 28..0: node index. + SIMDIVEC8 permOffs8; + // flag bits: 000 is an empty node, 010 is an interior node. 1xx is leaf; xx = tricount. + }; + struct RayWiVe + { + __m256 ox8, oy8, oz8, rdx8, rdy8, rdz8, t8; + __m256i signShift8; + }; + BVH8_WiVe( BVHContext ctx = {} ) { layout = LAYOUT_BVH8_AVX2; context = ctx; } + ~BVH8_WiVe(); + void Build( const bvhvec4* vertices, const uint32_t primCount ); + void Build( const bvhvec4slice& vertices ); + void Build( const bvhvec4* vertices, const uint32_t* indices, const uint32_t prims ); + void Build( const bvhvec4slice& vertices, const uint32_t* indices, uint32_t prims ); + void BuildHQ( const bvhvec4* vertices, const uint32_t primCount ); + void BuildHQ( const bvhvec4slice& vertices ); + void BuildHQ( const bvhvec4* vertices, const uint32_t* indices, const uint32_t prims ); + void BuildHQ( const bvhvec4slice& vertices, const uint32_t* indices, uint32_t prims ); + void Optimize( const uint32_t iterations, bool extreme ); + float SAHCost( const uint32_t nodeIdx ) const; + void ConvertFrom( const MBVH<8>& original, bool compact = true ); + SIMDIVEC8 CalculatePermOffsets( const uint32_t nodeIdx ) const; + int32_t Intersect( Ray& ray ) const; + bool IsOccluded( const Ray& ray ) const; + // BVH8 data + bvhvec4slice verts = {}; // pointer to input primitive array: 3x16 bytes per tri. + uint32_t* primIdx = 0; // primitive index array - pointer copied from original. + BVHNode* bvh8Node = 0; // 256-byte 8-wide BVH node for efficient CPU rendering. + MBVH<8> bvh8; // BVH8_WiVe is created from BVH8 and uses its data. + bool ownBVH8 = true; // false when ConvertFrom receives an external bvh8. +}; + #ifdef DOUBLE_PRECISION_SUPPORT // BLASInstanceEx: Double-precision version of BLASInstance. @@ -1142,36 +1186,6 @@ class BLASInstanceEx #endif -// Experimental & 'under construction' structs - -class BVH4_WiVe : public BVHBase -{ -public: - struct BVHNode - { - // 4-way BVH node, optimized for CPU rendering. - // Based on: "Accelerated Single Ray Tracing for Wide Vector Units", - // Fuetterling1 et al., 2017. 
- union { SIMDVEC8 xmin8, xmax8; }; - union { SIMDVEC8 ymin8, ymax8; }; - union { SIMDVEC8 zmin8, zmax8; }; - union { SIMDVEC8 sn8; float sn[8]; }; // n0..3, s0..3; total size = 128 bytes. - // n_i: inner node flag + child node cluster offset - }; - BVH4_WiVe( BVHContext ctx = {} ) { layout = LAYOUT_BVH4_AVX2; context = ctx; } - ~BVH4_WiVe(); - void Build( const bvhvec4* vertices, const uint32_t primCount ); - void Build( const bvhvec4slice& vertices ); - void ConvertFrom( MBVH<4>& original, bool compact = true ); - int32_t Intersect( Ray& ray ) const; - uint32_t IntersectInnerNode( const BVHNode& n, const Ray& ray, SIMDIVEC8& outChildren, SIMDVEC8& outDistances ) const; - bool IsOccluded( const Ray& ray ) const; - // BVH4 data - bvhvec4slice verts = {}; // pointer to input primitive array: 3x16 bytes per tri. - uint32_t* primIdx = 0; // primitive index array - pointer copied from original. - BVHNode* bvh4Node = 0; // 128-byte 4-wide BVH node for efficient CPU rendering. -}; - } // namespace tinybvh #endif // TINY_BVH_H_ @@ -4544,7 +4558,7 @@ void BVH::BuildAVX() binbox[i0] = r0, i0 = ILANE( bc4, 0 ); binbox[AVXBINS + i1] = r1, i1 = ILANE( bc4, 1 ); binbox[2 * AVXBINS + i2] = r2, i2 = ILANE( bc4, 2 ); - } + } // final business for final fragment const __m256 b0 = binbox[i0], b1 = binbox[AVXBINS + i1], b2 = binbox[2 * AVXBINS + i2]; count[0][i0]++, count[1][i1]++, count[2][i2]++; @@ -4597,10 +4611,10 @@ void BVH::BuildAVX() *(__m256*)& bvhNode[n] = _mm256_xor_ps( bestRBox, signFlip8 ); bvhNode[n].leftFirst = j, bvhNode[n].triCount = rightCount; task[taskCount++] = n, nodeIdx = n - 1; - } + } // fetch subdivision task from stack if (taskCount == 0) break; else nodeIdx = task[--taskCount]; -} + } // all done. aabbMin = bvhNode[0].aabbMin, aabbMax = bvhNode[0].aabbMax; refittable = true; // not using spatial splits: can refit this BVH @@ -5100,7 +5114,7 @@ int32_t BVH8_CWBVH::Intersect( Ray& ray ) const const float v = T[4] * wr.x + T[5] * wr.y + T[6] * wr.z + T[7]; const bool hit = u >= 0 && v >= 0 && u + v < 1; if (hit) triangleuv = bvhvec2( u, v ), tmax = ta, hitAddr = *(uint32_t*)&T[15]; - } + } #else int32_t triAddr = tgroup.x + triangleIndex * 3; const bvhvec3 edge2 = bvhvec3( blasTris[triAddr + 0] ); @@ -5130,7 +5144,7 @@ int32_t BVH8_CWBVH::Intersect( Ray& ray ) const } #endif tgroup.y -= 1 << triangleIndex; -} + } if (ngroup.y <= 0x00FFFFFF) { if (stackPtr > 0) { STACK_POP( /* nodeGroup */ ); } @@ -5478,11 +5492,194 @@ bool BVH4_CPU::IsOccluded( const Ray& ray ) const #pragma GCC pop_options #endif -#if 0 // under construction +BVH8_WiVe::~BVH8_WiVe() +{ + if (!ownBVH8) bvh8 = MBVH<8>(); // clear out pointers we don't own. + AlignedFree( bvh8Node ); +} + +void BVH8_WiVe::Build( const bvhvec4* vertices, const uint32_t primCount ) +{ + Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) ); +} + +void BVH8_WiVe::Build( const bvhvec4slice& vertices ) +{ + bvh8.context = context; // properly propagate context to fix issue #66. + bvh8.Build( vertices ); + ConvertFrom( bvh8, true ); +} + +void BVH8_WiVe::Build( const bvhvec4* vertices, const uint32_t* indices, const uint32_t prims ) +{ + // build the BVH with a continuous array of bvhvec4 vertices, indexed by 'indices'. + Build( bvhvec4slice{ vertices, prims * 3, sizeof( bvhvec4 ) }, indices, prims ); +} + +void BVH8_WiVe::Build( const bvhvec4slice& vertices, const uint32_t* indices, uint32_t prims ) +{ + // build the BVH from vertices stored in a slice, indexed by 'indices'. 
+ bvh8.context = context; + bvh8.Build( vertices, indices, prims ); + ConvertFrom( bvh8, true ); +} + +void BVH8_WiVe::BuildHQ( const bvhvec4* vertices, const uint32_t primCount ) +{ + BuildHQ( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) ); +} + +void BVH8_WiVe::BuildHQ( const bvhvec4slice& vertices ) +{ + bvh8.context = context; + bvh8.BuildHQ( vertices ); + ConvertFrom( bvh8, true ); +} + +void BVH8_WiVe::BuildHQ( const bvhvec4* vertices, const uint32_t* indices, const uint32_t prims ) +{ + Build( bvhvec4slice{ vertices, prims * 3, sizeof( bvhvec4 ) }, indices, prims ); +} + +void BVH8_WiVe::BuildHQ( const bvhvec4slice& vertices, const uint32_t* indices, uint32_t prims ) +{ + bvh8.context = context; + bvh8.BuildHQ( vertices, indices, prims ); + ConvertFrom( bvh8, true ); +} + +void BVH8_WiVe::Optimize( const uint32_t iterations, bool extreme ) +{ + bvh8.Optimize( iterations, extreme ); + ConvertFrom( bvh8, true ); +} + +float BVH8_WiVe::SAHCost( const uint32_t nodeIdx ) const +{ + return bvh8.SAHCost( nodeIdx ); +} + +#define SORT(a,b) { if (dist[a] > dist[b]) { float h = dist[a]; dist[a] = dist[b], dist[b] = h; } } +SIMDIVEC8 BVH8_WiVe::CalculatePermOffsets( const uint32_t nodeIdx ) const +{ + const MBVH<8>::MBVHNode& n = bvh8.mbvhNode[nodeIdx]; + union { uint32_t permOffs[8]; __m256i permOffs8; }; + permOffs8 = _mm256_set1_epi32( 0 ); + static const bvhvec3 D[8] = { + bvhvec3( -1, -1, -1 ), bvhvec3( 1, -1, -1 ), bvhvec3( -1, 1, -1 ), bvhvec3( 1, 1, -1 ), + bvhvec3( -1, -1, 1 ), bvhvec3( 1, -1, 1 ), bvhvec3( -1, 1, 1 ), bvhvec3( 1, 1, 1 ) + }; + for (uint32_t q = 0; q < 8; q++) + { + union { float dist[8]; uint32_t idist[8]; }; + for (int i = 0; i < 8; i++) if (n.child[i] == 0) dist[i] = 1e30f; else + { + const MBVH<8>::MBVHNode& c = bvh8.mbvhNode[n.child[i]]; + const bvhvec3 p( q & 1 ? c.aabbMin.x : c.aabbMax.x, q & 2 ? c.aabbMin.y : c.aabbMax.y, q & 4 ? c.aabbMin.z : c.aabbMax.z ); + dist[i] = tinybvh_dot( D[q], p ), idist[i] = (idist[i] & 0xfffffff8) + i; + } + // apply sorting network - https://bertdobbelaere.github.io/sorting_networks.html#N8L19D6 + SORT( 0, 2 ); SORT( 1, 3 ); SORT( 4, 6 ); SORT( 5, 7 ); SORT( 0, 4 ); + SORT( 1, 5 ); SORT( 2, 6 ); SORT( 3, 7 ); SORT( 0, 1 ); SORT( 2, 3 ); + SORT( 4, 5 ); SORT( 6, 7 ); SORT( 2, 4 ); SORT( 3, 5 ); SORT( 1, 4 ); + SORT( 3, 6 ); SORT( 1, 2 ); SORT( 3, 4 ); SORT( 5, 6 ); + for (int i = 0; i < 8; i++) permOffs[i] += (idist[i] & 7) << (q * 3); + } + return permOffs8; +} -uint32_t signShiftAmount( const bool positiveX, bool positiveY, bool positiveZ ) +void BVH8_WiVe::ConvertFrom( const MBVH<8>& original, bool compact ) { - return ((positiveX ? 0b001 : 0u) | (positiveY ? 0b010 : 0u) | (positiveZ ? 0b100 : 0u)) * 3; + // get a copy of the original bvh4 + if (&original != &bvh8) ownBVH8 = false; // bvh isn't ours; don't delete in destructor. + bvh8 = original; + uint32_t spaceNeeded = compact ? 
bvh8.usedNodes : bvh8.allocatedNodes; + if (allocatedNodes < spaceNeeded) + { + AlignedFree( bvh8Node ); + bvh8Node = (BVHNode*)AlignedAlloc( spaceNeeded * sizeof( BVHNode ) ); + allocatedNodes = spaceNeeded; + } + memset( bvh8Node, 0, spaceNeeded * sizeof( BVHNode ) ); + CopyBasePropertiesFrom( bvh8 ); + // start conversion + uint32_t newAlt4Ptr = 0, nodeIdx = 0, stack[128], stackPtr = 0; + while (1) + { + const MBVH<8>::MBVHNode& orig = bvh8.mbvhNode[nodeIdx]; + BVHNode& newNode = bvh8Node[newAlt4Ptr++]; + int32_t cidx = 0; + for (int32_t i = 0; i < 8; i++) if (orig.child[i]) + { + const MBVH<8>::MBVHNode& child = bvh8.mbvhNode[orig.child[i]]; + ((float*)&newNode.xmin8)[cidx] = child.aabbMin.x; + ((float*)&newNode.ymin8)[cidx] = child.aabbMin.y; + ((float*)&newNode.zmin8)[cidx] = child.aabbMin.z; + ((float*)&newNode.xmax8)[cidx] = child.aabbMax.x; + ((float*)&newNode.ymax8)[cidx] = child.aabbMax.y; + ((float*)&newNode.zmax8)[cidx] = child.aabbMax.z; + newNode.permOffs8 = CalculatePermOffsets( nodeIdx ); + if (child.isLeaf()) + { + ((uint32_t*)&newNode.child8)[cidx] = child.firstTri + (child.triCount << 29) + (1 << 31); + } + else + { + uint32_t* slot = (uint32_t*)&newNode.child8 + cidx; + stack[stackPtr++] = (uint32_t)(slot - (uint32_t*)bvh8Node); + stack[stackPtr++] = orig.child[i]; + } + cidx++; + } + for (; cidx < 8; cidx++) + { + ((float*)&newNode.xmin8)[cidx] = 1e30f, ((float*)&newNode.xmax8)[cidx] = 1.00001e30f; // why? + ((float*)&newNode.ymin8)[cidx] = 1e30f, ((float*)&newNode.ymax8)[cidx] = 1.00001e30f; + ((float*)&newNode.zmin8)[cidx] = 1e30f, ((float*)&newNode.zmax8)[cidx] = 1.00001e30f; + } + // pop next task + if (!stackPtr) break; + nodeIdx = stack[--stackPtr]; + uint32_t offset = stack[--stackPtr]; + ((uint32_t*)bvh8Node)[offset] = newAlt4Ptr; + } +#if 0 + // Convert index list: store primitives 'by value'. + // This also allows us to compact and reorder them for best performance. 
+ stackPtr = 0, nodeIdx = 0; + uint32_t triPtr = 0; + while (1) + { + BVHNode& node = bvh4Node[nodeIdx]; + for (int32_t i = 0; i < 4; i++) if (node.triCount[i] + node.childFirst[i] > 0) + { + if (!node.triCount[i]) stack[stackPtr++] = node.childFirst[i]; else + { + uint32_t first = node.childFirst[i]; + uint32_t count = node.triCount[i]; + node.childFirst[i] = triPtr; + // assign vertex data + for (uint32_t j = 0; j < count; j++) + { + const uint32_t fi = bvh4.bvh.primIdx[first + j]; + uint32_t ti0, ti1, ti2; + if (bvh4.bvh.vertIdx) + ti0 = bvh4.bvh.vertIdx[fi * 3], + ti1 = bvh4.bvh.vertIdx[fi * 3 + 1], + ti2 = bvh4.bvh.vertIdx[fi * 3 + 2]; + else + ti0 = fi * 3, ti1 = fi * 3 + 1, ti2 = fi * 3 + 2; + PrecomputeTriangle( bvh4.bvh.verts, ti0, ti1, ti2, (float*)&bvh4Tris[triPtr] ); + bvh4Tris[triPtr + 3] = bvhvec4( 0, 0, 0, *(float*)&fi ); + triPtr += 4; + } + } + } + if (!stackPtr) break; + nodeIdx = stack[--stackPtr]; + } +#endif + usedNodes = newAlt4Ptr; } constexpr uint64_t idxLUT[256] = { @@ -5509,78 +5706,73 @@ constexpr uint64_t idxLUT[256] = { 7722435347201,1976943448883456,7722435347202,1976943448883712,1976943448883713,506097522914230528 }; -uint32_t BVH4_WiVe::IntersectInnerNode( const BVHNode& n, const Ray& ray, __m256i& outChildren, __m256& outDistances ) const -{ - const __m256 tx1 = _mm256_mul_ps( _mm256_sub_ps( n.xmin8, _mm256_set1_ps( ray.O.x ) ), _mm256_set1_ps( ray.rD.x ) ); - const __m256 tx2 = _mm256_mul_ps( _mm256_sub_ps( n.xmax8, _mm256_set1_ps( ray.O.x ) ), _mm256_set1_ps( ray.rD.x ) ); - const __m256 ty1 = _mm256_mul_ps( _mm256_sub_ps( n.ymin8, _mm256_set1_ps( ray.O.y ) ), _mm256_set1_ps( ray.rD.y ) ); - const __m256 ty2 = _mm256_mul_ps( _mm256_sub_ps( n.ymax8, _mm256_set1_ps( ray.O.y ) ), _mm256_set1_ps( ray.rD.y ) ); - const __m256 tz1 = _mm256_mul_ps( _mm256_sub_ps( n.zmin8, _mm256_set1_ps( ray.O.z ) ), _mm256_set1_ps( ray.rD.z ) ); - const __m256 tz2 = _mm256_mul_ps( _mm256_sub_ps( n.zmax8, _mm256_set1_ps( ray.O.z ) ), _mm256_set1_ps( ray.rD.z ) ); - const __m256 txMin = _mm256_min_ps( tx1, tx2 ), tyMin = _mm256_min_ps( ty1, ty2 ), tzMin = _mm256_min_ps( tz1, tz2 ); - const __m256 txMax = _mm256_max_ps( tx1, tx2 ), tyMax = _mm256_max_ps( ty1, ty2 ), tzMax = _mm256_max_ps( tz1, tz2 ); - __m256 tmin = _mm256_max_ps( _mm256_setzero_ps(), _mm256_max_ps( txMin, _mm256_max_ps( tyMin, tzMin ) ) ); - __m256 tmax = _mm256_min_ps( _mm256_set1_ps( ray.hit.t ), _mm256_min_ps( txMax, _mm256_min_ps( tyMax, tzMax ) ) ); - const __m256i indexMask = _mm256_set1_epi32( 0b111 ); - const __m256i index = _mm256_and_si256( _mm256_srlv_epi32( n.permutationOffsets, ray.raySignShiftAmount ), indexMask ); - tmin = _mm256_permutevar8x32_ps( tmin, index ); - tmax = _mm256_permutevar8x32_ps( tmax, index ); - uint32_t mask = _mm256_movemask_ps( _mm256_cmp_ps( tmin, tmax, _CMP_LE_OQ ) ); - __m256i cpi = _mm256_cvtepu8_epi32( _mm_cvtsi64_si128( idxLUT[mask] ) ); - outChildren = _mm256_permutevar8x32_ps( _mm256_permutevar8x32_ps( n.children, index ), cpi ); - outDistances = _mm256_permutevar8x32_ps( tmin, cpi ); - return __popc( mask ); -} - -int32_t BVH4_WiVe::Intersect( Ray& ray ) const -{ - __m256 ox8 = _mm256_set1_ps( ray.O.x ), rdx8 = _mm256_set1_ps( ray.rD.x ); - __m256 oy8 = _mm256_set1_ps( ray.O.y ), rdy8 = _mm256_set1_ps( ray.rD.y ); - __m256 oz8 = _mm256_set1_ps( ray.O.z ), rdz8 = _mm256_set1_ps( ray.rD.z ); - __m256 t8 = _mm256_set1_ps( ray.hit.t ); - __m256i signShift8 = _mm256_set1_epi32( (ray.D.x > 0 ? 3 : 0) + (ray.D.y > 0 ? 6 : 0) + (ray.D.z > 0 ? 
12 : 0) ); - struct StackEntry { uint32_t node; float dist; }; - ALIGNED( 64 ) StackEntry stack[64]; - // std::fill( std::begin( stackDistances ), std::end( stackDistances ), std::numeric_limits::max() ); +int32_t BVH8_WiVe::Intersect( Ray& ray ) const +{ + RayWiVe rw; + rw.ox8 = _mm256_set1_ps( ray.O.x ), rw.rdx8 = _mm256_set1_ps( ray.rD.x ); + rw.oy8 = _mm256_set1_ps( ray.O.y ), rw.rdy8 = _mm256_set1_ps( ray.rD.y ); + rw.oz8 = _mm256_set1_ps( ray.O.z ), rw.rdz8 = _mm256_set1_ps( ray.rD.z ); + rw.t8 = _mm256_set1_ps( ray.hit.t ); + rw.signShift8 = _mm256_set1_epi32( (ray.D.x > 0 ? 3 : 0) + (ray.D.y > 0 ? 6 : 0) + (ray.D.z > 0 ? 12 : 0) ); + ALIGNED( 64 ) uint32_t nodeStack[64]; + ALIGNED( 64 ) float distStack[64]; + for (int i = 0; i < 64; i++) distStack[i] = 1e30f; // TODO: is this needed? + nodeStack[0] = 0, distStack[0] = 0; uint32_t stackPtr = 1; - stack[0].node = 0, stack[0].dist = 0; while (stackPtr > 0) { stackPtr--; - uint32_t nodeIdx = stack[--stackPtr].node; // mask with ((1u << 29) - 1) ? - const BVHNode& node = bvh4Node[nodeIdx]; - if (!node.isLeaf()) + uint32_t nodeIdx = nodeStack[--stackPtr]; + const BVHNode& n = bvh8Node[nodeIdx & 0x1fffffff /* bits 0..28 */]; + if (!(nodeIdx >> 31)) // top bit: leaf flag + { + const __m256 tx1 = _mm256_mul_ps( _mm256_sub_ps( n.xmin8, rw.ox8 ), rw.rdx8 ); + const __m256 tx2 = _mm256_mul_ps( _mm256_sub_ps( n.xmax8, rw.ox8 ), rw.rdx8 ); + const __m256 ty1 = _mm256_mul_ps( _mm256_sub_ps( n.ymin8, rw.oy8 ), rw.rdy8 ); + const __m256 ty2 = _mm256_mul_ps( _mm256_sub_ps( n.ymax8, rw.oy8 ), rw.rdy8 ); + const __m256 tz1 = _mm256_mul_ps( _mm256_sub_ps( n.zmin8, rw.oz8 ), rw.rdz8 ); + const __m256 tz2 = _mm256_mul_ps( _mm256_sub_ps( n.zmax8, rw.oz8 ), rw.rdz8 ); + const __m256 txMin = _mm256_min_ps( tx1, tx2 ), tyMin = _mm256_min_ps( ty1, ty2 ), tzMin = _mm256_min_ps( tz1, tz2 ); + const __m256 txMax = _mm256_max_ps( tx1, tx2 ), tyMax = _mm256_max_ps( ty1, ty2 ), tzMax = _mm256_max_ps( tz1, tz2 ); + __m256 tmin = _mm256_max_ps( _mm256_setzero_ps(), _mm256_max_ps( txMin, _mm256_max_ps( tyMin, tzMin ) ) ); + __m256 tmax = _mm256_min_ps( rw.t8, _mm256_min_ps( txMax, _mm256_min_ps( tyMax, tzMax ) ) ); + const __m256i indexMask = _mm256_set1_epi32( 0b111 ); + const __m256i index = _mm256_and_si256( _mm256_srlv_epi32( n.permOffs8, rw.signShift8 ), indexMask ); + tmin = _mm256_permutevar8x32_ps( tmin, index ), tmax = _mm256_permutevar8x32_ps( tmax, index ); + const uint32_t mask = _mm256_movemask_ps( _mm256_cmp_ps( tmin, tmax, _CMP_LE_OQ ) ); + const __m256i cpi = _mm256_cvtepu8_epi32( _mm_cvtsi64_si128( idxLUT[mask] ) ); + const __m256i child8 = _mm256_permutevar8x32_epi32( _mm256_permutevar8x32_epi32( n.child8, index ), cpi ); + const __m256 dist8 = _mm256_permutevar8x32_ps( tmin, cpi ); + const uint32_t childCount = __popc( mask ); + if (childCount == 0) continue; + _mm256_storeu_si256( (__m256i*)(nodeStack + stackPtr), child8 ); + _mm256_storeu_ps( (float*)(distStack + stackPtr), dist8 ); + stackPtr += childCount; + } + else { - __m256i child8; - __m256 dist8; - uint32_t numChildren = IntersectInnerNode( node, simdRay, childrenSIMD, distancesSIMD ); - if (numChildren > 0) + uint32_t first = nodeIdx & 0x1fffffff; + uint32_t count = ((nodeIdx >> 29) & 3) + 1; + bool hitSomething = false; + for (uint32_t i = 0; i < count; i++) { - _mm256_storeu_si256( child8, .. ); // childrenSIMD.store( std::span( stackCompressedNodeHandles.data() + stackPtr, 8 ) ); - _mm256_storeu_ps( dist8, .. 
); // distancesSIMD.store( std::span( stackDistances.data() + stackPtr, 8 ) ); - stackPtr += numChildren; + // for now, let's intersect } - } - else // leaf - { - if (intersectLeaf( &m_leafIndexAllocator.get( handle ), leafNodePrimitiveCount( compressedNodeHandle ), ray, si )) + if (hitSomething) { - t8 = _mm256_set1_ps( ray.hit.t ); // compress stack uint32_t outStackPtr = 0; - for (size_t i = 0; i < stackPtr; i += 8) + for (uint32_t i = 0; i < stackPtr; i += 8) { - __m256i node8 = _mm256_load_si256( .. ); // node8.loadAligned( std::span( stackCompressedNodeHandles.data() + i, 8 ) ); - __m256 dist8 = _mm256_load_ps( .. ); // dist8.loadAligned( std::span( stackDistances.data() + i, 8 ) ); - __m256 mask8 = _mm256_cmp_ps( _CMP_LT_OQ, dist8, t8 ); - __m256i cpi = mask8.computeCompressPermutation(); // ? - dist8 = dist8.permute( cpi ); - node8 = node8.permute( cpi ); - _mm256_storeu_ps( dist8, .. ); // dist8.store( std::span( stackDistances.data() + outStackPtr, 8 ) ); - _mm256_storeu_si256( node8, .. ); // node8.store( std::span( stackCompressedNodeHandles.data() + outStackPtr, 8 ) ); - uint32_t numItems = tinybvh_min( 8, stackPtr - i ); - uint32_t validMask = (1 << numItems) - 1; - outStackPtr += mask8.count( validMask ); // ? + __m256i node8 = _mm256_load_si256( (__m256i*)(nodeStack + i) ); + __m256 dist8 = _mm256_load_ps( (float*)(distStack + i) ); + const uint32_t mask = _mm256_movemask_ps( _mm256_cmp_ps( dist8, rw.t8, _CMP_LT_OQ ) ); + const __m256i cpi = _mm256_cvtepu8_epi32( _mm_cvtsi64_si128( idxLUT[mask] ) ); + dist8 = _mm256_permutevar8x32_ps( dist8, cpi ), node8 = _mm256_permutevar8x32_epi32( node8, cpi ); + _mm256_storeu_ps( (float*)(distStack + outStackPtr), dist8 ); + _mm256_storeu_si256( (__m256i*)(nodeStack + outStackPtr), node8 ); + const uint32_t numItems = tinybvh_min( 8u, stackPtr - i ), validMask = (1 << numItems) - 1; + outStackPtr += __popc( mask & validMask ); } stackPtr = outStackPtr; } @@ -5588,8 +5780,6 @@ int32_t BVH4_WiVe::Intersect( Ray& ray ) const } } -#endif - #endif // BVH_USEAVX // ============================================================================ @@ -6130,21 +6320,21 @@ int32_t BVH4_CPU::Intersect( Ray& ray ) const { // blend in lane indices float32x4_t tm = vreinterpretq_f32_u32( vorrq_u32( vandq_u32( vreinterpretq_u32_f32( vbslq_f32( hit, tmin, inf4 ) ), idxMask ), idx4 ) ); - -#if false + ALIGNED( 64 ) float d[4]; + vst1q_f32( d, tm ); // sort - float tmp, d0 = tm[0], d1 = tm[1], d2 = tm[2], d3 = tm[3]; - if (d0 < d2) tmp = d0, d0 = d2, d2 = tmp; - if (d1 < d3) tmp = d1, d1 = d3, d3 = tmp; - if (d0 < d1) tmp = d0, d0 = d1, d1 = tmp; - if (d2 < d3) tmp = d2, d2 = d3, d3 = tmp; - if (d1 < d2) tmp = d1, d1 = d2, d2 = tmp; - // process hits - float d[4] = { d0, d1, d2, d3 }; + float tmp; + if (d[0] < d[2]) tmp = d[0], d[0] = d[2], d[2] = tmp; + if (d[1] < d[3]) tmp = d[1], d[1] = d[3], d[3] = tmp; + if (d[0] < d[1]) tmp = d[0], d[0] = d[1], d[1] = tmp; + if (d[2] < d[3]) tmp = d[2], d[2] = d[3], d[3] = tmp; + if (d[1] < d[2]) tmp = d[1], d[1] = d[2], d[2] = tmp; + const uint32_t lanes[4] = { (uint32_t)-1, *(uint32_t*)&d[1] & 3, *(uint32_t*)&d[2] & 3, *(uint32_t*)&d[3] & 3 }; nodeIdx = 0; for (int32_t i = 1; i < 4; i++) { - uint32_t lane = *(uint32_t*)&d[i] & 3; + uint32_t lane = lanes[i]; + if (node.triCount[lane] + node.childFirst[lane] == 0) continue; // TODO - never happens? 
if (node.triCount[lane] == 0) { const uint32_t childIdx = node.childFirst[lane]; @@ -6156,62 +6346,25 @@ int32_t BVH4_CPU::Intersect( Ray& ray ) const for (uint32_t j = 0; j < count; j++, cost += C_INT) // TODO: aim for 4 prims per leaf IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) ); } -#else - ALIGNED( 64 ) float d[4]; - vst1q_f32(d, tm); - // sort - float tmp; - if (d[0] < d[2]) tmp = d[0], d[0] = d[2], d[2] = tmp; - if (d[1] < d[3]) tmp = d[1], d[1] = d[3], d[3] = tmp; - if (d[0] < d[1]) tmp = d[0], d[0] = d[1], d[1] = tmp; - if (d[2] < d[3]) tmp = d[2], d[2] = d[3], d[3] = tmp; - if (d[1] < d[2]) tmp = d[1], d[1] = d[2], d[2] = tmp; - - const uint32_t lanes[4] = - { - (uint32_t)-1, - *(uint32_t*)&d[1] & 3, - *(uint32_t*)&d[2] & 3, - *(uint32_t*)&d[3] & 3, - }; - - nodeIdx = 0; - for (int32_t i = 1; i < 4; i++) - { - uint32_t lane = lanes[i]; - if (node.triCount[lane] + node.childFirst[lane] == 0) continue; // TODO - never happens? - if (node.triCount[lane] == 0) - { - const uint32_t childIdx = node.childFirst[lane]; - if (nodeIdx) stack[stackPtr++] = nodeIdx; - nodeIdx = childIdx; - continue; - } - const uint32_t first = node.childFirst[lane], count = node.triCount[lane]; - for (uint32_t j = 0; j < count; j++, cost += C_INT) // TODO: aim for 4 prims per leaf - IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) ); - } - -#endif } else /* hits == 4, 2%: rare */ { // blend in lane indices float32x4_t tm = vreinterpretq_f32_u32( vorrq_u32( vandq_u32( vreinterpretq_u32_f32( vbslq_f32( hit, tmin, inf4 ) ), idxMask ), idx4 ) ); -#if false + ALIGNED( 64 ) float d[4]; + vst1q_f32( d, tm ); // sort - float tmp, d0 = tm[0], d1 = tm[1], d2 = tm[2], d3 = tm[3]; - if (d0 < d2) tmp = d0, d0 = d2, d2 = tmp; - if (d1 < d3) tmp = d1, d1 = d3, d3 = tmp; - if (d0 < d1) tmp = d0, d0 = d1, d1 = tmp; - if (d2 < d3) tmp = d2, d2 = d3, d3 = tmp; - if (d1 < d2) tmp = d1, d1 = d2, d2 = tmp; - // process hits - float d[4] = { d0, d1, d2, d3 }; + float tmp; + if (d[0] < d[2]) tmp = d[0], d[0] = d[2], d[2] = tmp; + if (d[1] < d[3]) tmp = d[1], d[1] = d[3], d[3] = tmp; + if (d[0] < d[1]) tmp = d[0], d[0] = d[1], d[1] = tmp; + if (d[2] < d[3]) tmp = d[2], d[2] = d[3], d[3] = tmp; + if (d[1] < d[2]) tmp = d[1], d[1] = d[2], d[2] = tmp; + const uint32_t lanes[4] = { *(uint32_t*)&d[0] & 3, *(uint32_t*)&d[1] & 3, *(uint32_t*)&d[2] & 3, *(uint32_t*)&d[3] & 3 }; nodeIdx = 0; for (int32_t i = 0; i < 4; i++) { - uint32_t lane = *(uint32_t*)&d[i] & 3; + uint32_t lane = lanes[i]; if (node.triCount[lane] + node.childFirst[lane] == 0) continue; // TODO - never happens? if (node.triCount[lane] == 0) { @@ -6224,43 +6377,6 @@ int32_t BVH4_CPU::Intersect( Ray& ray ) const for (uint32_t j = 0; j < count; j++, cost += C_INT) // TODO: aim for 4 prims per leaf IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) ); } -#else - ALIGNED( 64 ) float d[4]; - vst1q_f32(d, tm); - // sort - float tmp; - if (d[0] < d[2]) tmp = d[0], d[0] = d[2], d[2] = tmp; - if (d[1] < d[3]) tmp = d[1], d[1] = d[3], d[3] = tmp; - if (d[0] < d[1]) tmp = d[0], d[0] = d[1], d[1] = tmp; - if (d[2] < d[3]) tmp = d[2], d[2] = d[3], d[3] = tmp; - if (d[1] < d[2]) tmp = d[1], d[1] = d[2], d[2] = tmp; - - const uint32_t lanes[4] = - { - *(uint32_t*)&d[0] & 3, - *(uint32_t*)&d[1] & 3, - *(uint32_t*)&d[2] & 3, - *(uint32_t*)&d[3] & 3, - }; - - nodeIdx = 0; - for (int32_t i = 0; i < 4; i++) - { - uint32_t lane = lanes[i]; - - if (node.triCount[lane] + node.childFirst[lane] == 0) continue; // TODO - never happens? 
- if (node.triCount[lane] == 0) - { - const uint32_t childIdx = node.childFirst[lane]; - if (nodeIdx) stack[stackPtr++] = nodeIdx; - nodeIdx = childIdx; - continue; - } - const uint32_t first = node.childFirst[lane], count = node.triCount[lane]; - for (uint32_t j = 0; j < count; j++, cost += C_INT) // TODO: aim for 4 prims per leaf - IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) ); - } -#endif } // get next task if (nodeIdx) continue; @@ -6367,20 +6483,21 @@ bool BVH4_CPU::IsOccluded( const Ray& ray ) const { // blend in lane indices float32x4_t tm = vreinterpretq_f32_u32( vorrq_u32( vandq_u32( vreinterpretq_u32_f32( vbslq_f32( hit, tmin, inf4 ) ), idxMask ), idx4 ) ); -#if false + ALIGNED( 64 ) float d[4]; + vst1q_f32( d, tm ); // sort - float tmp, d0 = tm[0], d1 = tm[1], d2 = tm[2], d3 = tm[3]; - if (d0 < d2) tmp = d0, d0 = d2, d2 = tmp; - if (d1 < d3) tmp = d1, d1 = d3, d3 = tmp; - if (d0 < d1) tmp = d0, d0 = d1, d1 = tmp; - if (d2 < d3) tmp = d2, d2 = d3, d3 = tmp; - if (d1 < d2) tmp = d1, d1 = d2, d2 = tmp; - // process hits - float d[4] = { d0, d1, d2, d3 }; + float tmp; + if (d[0] < d[2]) tmp = d[0], d[0] = d[2], d[2] = tmp; + if (d[1] < d[3]) tmp = d[1], d[1] = d[3], d[3] = tmp; + if (d[0] < d[1]) tmp = d[0], d[0] = d[1], d[1] = tmp; + if (d[2] < d[3]) tmp = d[2], d[2] = d[3], d[3] = tmp; + if (d[1] < d[2]) tmp = d[1], d[1] = d[2], d[2] = tmp; + const uint32_t lanes[4] = { (uint32_t)-1, *(uint32_t*)&d[1] & 3, *(uint32_t*)&d[2] & 3, *(uint32_t*)&d[3] & 3 }; nodeIdx = 0; for (int32_t i = 1; i < 4; i++) { - uint32_t lane = *(uint32_t*)&d[i] & 3; + uint32_t lane = lanes[i]; + if (node.triCount[lane] + node.childFirst[lane] == 0) continue; // TODO - never happens? if (node.triCount[lane] == 0) { const uint32_t childIdx = node.childFirst[lane]; @@ -6392,61 +6509,26 @@ bool BVH4_CPU::IsOccluded( const Ray& ray ) const for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true; } -#else - ALIGNED( 64 ) float d[4]; - vst1q_f32(d, tm); - // sort - float tmp; - if (d[0] < d[2]) tmp = d[0], d[0] = d[2], d[2] = tmp; - if (d[1] < d[3]) tmp = d[1], d[1] = d[3], d[3] = tmp; - if (d[0] < d[1]) tmp = d[0], d[0] = d[1], d[1] = tmp; - if (d[2] < d[3]) tmp = d[2], d[2] = d[3], d[3] = tmp; - if (d[1] < d[2]) tmp = d[1], d[1] = d[2], d[2] = tmp; - - const uint32_t lanes[4] = - { - (uint32_t)-1, - *(uint32_t*)&d[1] & 3, - *(uint32_t*)&d[2] & 3, - *(uint32_t*)&d[3] & 3, - }; - - nodeIdx = 0; - for (int32_t i = 1; i < 4; i++) - { - uint32_t lane = lanes[i]; - if (node.triCount[lane] + node.childFirst[lane] == 0) continue; // TODO - never happens? 
- if (node.triCount[lane] == 0) - { - const uint32_t childIdx = node.childFirst[lane]; - if (nodeIdx) stack[stackPtr++] = nodeIdx; - nodeIdx = childIdx; - continue; - } - const uint32_t first = node.childFirst[lane], count = node.triCount[lane]; - for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf - if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true; - } -#endif } else /* hits == 4, 2%: rare */ { // blend in lane indices float32x4_t tm = vreinterpretq_f32_u32( vorrq_u32( vandq_u32( vreinterpretq_u32_f32( vbslq_f32( hit, tmin, inf4 ) ), idxMask ), idx4 ) ); -#if false + ALIGNED( 64 ) float d[4]; + vst1q_f32( d, tm ); // sort - float tmp, d0 = tm[0], d1 = tm[1], d2 = tm[2], d3 = tm[3]; - if (d0 < d2) tmp = d0, d0 = d2, d2 = tmp; - if (d1 < d3) tmp = d1, d1 = d3, d3 = tmp; - if (d0 < d1) tmp = d0, d0 = d1, d1 = tmp; - if (d2 < d3) tmp = d2, d2 = d3, d3 = tmp; - if (d1 < d2) tmp = d1, d1 = d2, d2 = tmp; - // process hits - float d[4] = { d0, d1, d2, d3 }; + float tmp; + if (d[0] < d[2]) tmp = d[0], d[0] = d[2], d[2] = tmp; + if (d[1] < d[3]) tmp = d[1], d[1] = d[3], d[3] = tmp; + if (d[0] < d[1]) tmp = d[0], d[0] = d[1], d[1] = tmp; + if (d[2] < d[3]) tmp = d[2], d[2] = d[3], d[3] = tmp; + if (d[1] < d[2]) tmp = d[1], d[1] = d[2], d[2] = tmp; + const uint32_t lanes[4] = { *(uint32_t*)&d[0] & 3, *(uint32_t*)&d[1] & 3, *(uint32_t*)&d[2] & 3, *(uint32_t*)&d[3] & 3 }; nodeIdx = 0; for (int32_t i = 0; i < 4; i++) { - uint32_t lane = *(uint32_t*)&d[i] & 3; + uint32_t lane = lanes[i]; + if (node.triCount[lane] + node.childFirst[lane] == 0) continue; // TODO - never happens? if (node.triCount[lane] == 0) { @@ -6459,43 +6541,6 @@ bool BVH4_CPU::IsOccluded( const Ray& ray ) const for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true; } -#else - ALIGNED( 64 ) float d[4]; - vst1q_f32(d, tm); - // sort - float tmp; - if (d[0] < d[2]) tmp = d[0], d[0] = d[2], d[2] = tmp; - if (d[1] < d[3]) tmp = d[1], d[1] = d[3], d[3] = tmp; - if (d[0] < d[1]) tmp = d[0], d[0] = d[1], d[1] = tmp; - if (d[2] < d[3]) tmp = d[2], d[2] = d[3], d[3] = tmp; - if (d[1] < d[2]) tmp = d[1], d[1] = d[2], d[2] = tmp; - - const uint32_t lanes[4] = - { - *(uint32_t*)&d[0] & 3, - *(uint32_t*)&d[1] & 3, - *(uint32_t*)&d[2] & 3, - *(uint32_t*)&d[3] & 3, - }; - - nodeIdx = 0; - for (int32_t i = 0; i < 4; i++) - { - uint32_t lane = lanes[i]; - - if (node.triCount[lane] + node.childFirst[lane] == 0) continue; // TODO - never happens? - if (node.triCount[lane] == 0) - { - const uint32_t childIdx = node.childFirst[lane]; - if (nodeIdx) stack[stackPtr++] = nodeIdx; - nodeIdx = childIdx; - continue; - } - const uint32_t first = node.childFirst[lane], count = node.triCount[lane]; - for (uint32_t j = 0; j < count; j++) // TODO: aim for 4 prims per leaf - if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true; - } -#endif } // get next task if (nodeIdx) continue;
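
The traversal order of the new BVH8_WiVe layout hinges on the per-node permOffs8 field: for each of the eight ray-direction octants, CalculatePermOffsets stores one 3-bit slot per lane, holding the lane index of the i-th nearest child along that octant's diagonal. At trace time the ray's octant selects the matching 3-bit group through signShift8 (the 3/6/12 terms, i.e. 3 * octant), and the two permutevar8x32 shuffles then reorder and compact the surviving children before they go on the stack. A minimal scalar sketch of that decode, assuming the bit layout described in the node comments (the function name and parameters below are illustrative, not part of the patch):

    #include <cstdint>

    // Returns the node-local child lane holding the 'slot'-th nearest child for a ray
    // whose direction signs are (posX, posY, posZ). 'permOffs' is the 8-lane uint32_t
    // view of a node's permOffs8, as filled by BVH8_WiVe::CalculatePermOffsets.
    static uint32_t NthNearestChildLane( const uint32_t permOffs[8], uint32_t slot,
        bool posX, bool posY, bool posZ )
    {
        const uint32_t octant = (posX ? 1u : 0u) + (posY ? 2u : 0u) + (posZ ? 4u : 0u);
        const uint32_t shift = octant * 3;      // same value signShift8 broadcasts per ray
        return (permOffs[slot] >> shift) & 7;   // 3 bits per octant; 8 octants use 24 bits
    }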
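
Once the leaf intersection inside BVH8_WiVe::Intersect is completed, usage should mirror the other tinybvh layouts; a hedged sketch that only relies on the API declared in this patch and the library's existing Ray type:

    #define TINYBVH_IMPLEMENTATION // in exactly one translation unit, as usual for tiny_bvh
    #include "tiny_bvh.h"
    using namespace tinybvh;

    void TraceExample( const bvhvec4* tris, uint32_t triCount )
    {
        BVH8_WiVe bvh;
        bvh.Build( tris, triCount );                         // three bvhvec4 vertices per triangle
        Ray ray( bvhvec3( 0, 0, -5 ), bvhvec3( 0, 0, 1 ) );  // origin, direction
        bvh.Intersect( ray );                                // nearest hit is recorded in ray.hit
        if (ray.hit.t < BVH_FAR) { /* use ray.hit.prim, ray.hit.u, ray.hit.v */ }
    }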