Add naive ARM Neon port for BuildNEON() & Intersect_AltSoA() #19

Merged: 1 commit, Nov 14, 2024

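For context, a minimal usage sketch of the code path this PR adds (illustration only, not part of the diff). It relies on the API visible in the hunks below: BuildNEON(), Convert() to the ALT_SOA layout, and the layout-dispatching Intersect(). The Ray constructor, the BVH::WALD_32BYTE / BVH::ALT_SOA enum scoping, and the triangle buffer (tris, triCount) are assumptions made for illustration.

// usage sketch (assumed API; not part of this PR)
#define TINYBVH_IMPLEMENTATION
#include "tiny_bvh.h"

int TraceOne( const tinybvh::bvhvec4* tris, unsigned triCount )
{
	tinybvh::BVH bvh;
#if defined(BVH_USENEON)
	bvh.BuildNEON( tris, triCount ); // new NEON builder added by this PR
#elif defined(BVH_USEAVX)
	bvh.BuildAVX( tris, triCount );  // existing AVX builder
#else
	bvh.Build( tris, triCount );     // scalar fallback
#endif
	// ALT_SOA traversal needs the SoA node layout, produced by Convert()
	bvh.Convert( tinybvh::BVH::WALD_32BYTE, tinybvh::BVH::ALT_SOA );
	tinybvh::Ray ray( tinybvh::bvhvec3( 0, 0, -5 ), tinybvh::bvhvec3( 0, 0, 1 ) );
	return bvh.Intersect( ray, tinybvh::BVH::ALT_SOA ); // dispatches to Intersect_AltSoA
}
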
308 changes: 306 additions & 2 deletions tiny_bvh.h
@@ -75,6 +75,9 @@ THE SOFTWARE.
// include fast AVX BVH builder
#if defined(__x86_64__) || defined(_M_X64) || defined(__wasm_simd128__) || defined(__wasm_relaxed_simd__)
#define BVH_USEAVX
#elif defined(__aarch64__) || defined(_M_ARM64)
#define BVH_USENEON
#include "arm_neon.h"
#endif

// optimizer setting
@@ -281,6 +284,18 @@ static bvhvec3 normalize( const bvhvec3& a )
typedef __m128 SIMDVEC4;
#define SIMD_SETVEC(a,b,c,d) _mm_set_ps( a, b, c, d )
#define SIMD_SETRVEC(a,b,c,d) _mm_set_ps( d, c, b, a )
#elif defined(BVH_USENEON)
typedef float32x4_t SIMDVEC4;
inline float32x4_t SIMD_SETVEC(float w, float z, float y, float x)
{
ALIGNED(64) float data[4] = {x, y, z, w};
return vld1q_f32(data);
}
inline float32x4_t SIMD_SETRVEC(float x, float y, float z, float w)
{
ALIGNED(64) float data[4] = {x, y, z, w};
return vld1q_f32(data);
}
#else
typedef bvhvec4 SIMDVEC4;
#define SIMD_SETVEC(a,b,c,d) bvhvec4( d, c, b, a )
@@ -464,7 +479,12 @@ class BVH
void Compact( const BVHLayout layout /* must be WALD_32BYTE or VERBOSE */ );
void Build( const bvhvec4* vertices, const unsigned primCount );
void BuildHQ( const bvhvec4* vertices, const unsigned primCount );
#ifdef BVH_USEAVX
void BuildAVX( const bvhvec4* vertices, const unsigned primCount );
#endif
#ifdef BVH_USENEON
void BuildNEON( const bvhvec4* vertices, const unsigned primCount );
#endif
void Convert( BVHLayout from, BVHLayout to, const bool deleteOriginal = false );
void SplitLeafs(); // operates on VERBOSE layout
void MergeLeafs(); // operates on VERBOSE layout
@@ -480,7 +500,7 @@
int Intersect_BasicBVH8( Ray& ray ) const; // only for testing, not efficient.
int Intersect_Alt4BVH( Ray& ray ) const; // only for testing, not efficient.
int Intersect_CWBVH( Ray& ray ) const; // only for testing, not efficient.
int Intersect_AltSoA( Ray& ray ) const; // requires BVH_USEAVX
int Intersect_AltSoA( Ray& ray ) const; // requires BVH_USEAVX or BVH_USENEON
void IntersectTri( Ray& ray, const unsigned triIdx ) const;
static float IntersectAABB( const Ray& ray, const bvhvec3& aabbMin, const bvhvec3& aabbMax );
static float SA( const bvhvec3& aabbMin, const bvhvec3& aabbMax )
@@ -1036,7 +1056,7 @@ void BVH::Convert( BVHLayout from, BVHLayout to, const bool deleteOriginal )
{
const BVHNode& left = bvhNode[node.leftFirst];
const BVHNode& right = bvhNode[node.leftFirst + 1];
// This BVH layout requires BVH_USEAVX for traversal, but at least we
// This BVH layout requires BVH_USEAVX/BVH_USENEON for traversal, but at least we
// can convert to it without SSE/AVX/NEON support.
alt2Node[idx].xxxx = SIMD_SETRVEC( left.aabbMin.x, left.aabbMax.x, right.aabbMin.x, right.aabbMax.x );
alt2Node[idx].yyyy = SIMD_SETRVEC( left.aabbMin.y, left.aabbMax.y, right.aabbMin.y, right.aabbMax.y );
@@ -1886,6 +1906,11 @@ int BVH::Intersect( Ray& ray, BVHLayout layout ) const
return Intersect_CWBVH( ray );
break;
#endif
#ifdef BVH_USENEON
case ALT_SOA:
return Intersect_AltSoA( ray );
break;
#endif
default:
assert( false );
};
@@ -3012,6 +3037,285 @@ int BVH::Intersect_CWBVH( Ray& ray ) const

#endif // BVH_USEAVX

#ifdef BVH_USENEON

#define ILANE(a,b) vgetq_lane_s32(a, b)

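// componentwise max of two 8-lane values; bin minima are stored negated, so this unions two AABBs in two max ops.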
inline float32x4x2_t vmaxq_f32x2(float32x4x2_t a, float32x4x2_t b)
{
float32x4x2_t ret;
ret.val[0] = vmaxq_f32(a.val[0], b.val[0]);
ret.val[1] = vmaxq_f32(a.val[1], b.val[1]);
return ret;
}
inline float halfArea( const float32x4_t a /* a contains extent of aabb */ )
{
ALIGNED(64) float v[4];
vst1q_f32(v, a);
return v[0] * v[1] + v[1] * v[2] + v[2] * v[3];
}
inline float halfArea( const float32x4x2_t& a /* a contains aabb itself, with min.xyz negated */ )
{
ALIGNED(64) float c[8];
vst1q_f32(c, a.val[0]);
vst1q_f32(c + 4, a.val[1]);

float ex = c[4] + c[0], ey = c[5] + c[1], ez = c[6] + c[2];
return ex * ey + ey * ez + ez * ex;
}
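// PROCESS_PLANE: evaluate the SAH cost of splitting at bin boundary 'pos' on axis 'a' and keep the cheapest candidate.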
#define PROCESS_PLANE( a, pos, ANLR, lN, rN, lb, rb ) if (lN * rN != 0) { \
ANLR = halfArea( lb ) * (float)lN + halfArea( rb ) * (float)rN; if (ANLR < splitCost) \
splitCost = ANLR, bestAxis = a, bestPos = pos, bestLBox = lb, bestRBox = rb; }

void BVH::BuildNEON( const bvhvec4* vertices, const unsigned primCount )
{
int test = BVHBINS;
if (test != 8) assert( false ); // the NEON builder requires BVHBINS == 8.
// aligned data
ALIGNED( 64 ) float32x4x2_t binbox[3 * BVHBINS]; // 768 bytes
ALIGNED( 64 ) float32x4x2_t binboxOrig[3 * BVHBINS]; // 768 bytes
ALIGNED( 64 ) unsigned count[3][BVHBINS]{}; // 96 bytes
ALIGNED( 64 ) float32x4x2_t bestLBox, bestRBox; // 64 bytes
// some constants
static const float32x4_t max4 = vdupq_n_f32( -1e30f ), half4 = vdupq_n_f32( 0.5f );
static const float32x4_t two4 = vdupq_n_f32( 2.0f ), min1 = vdupq_n_f32( -1 );
static const float32x4x2_t max8 = {max4, max4};
static const float32x4_t signFlip4 = SIMD_SETRVEC( -0.0f, -0.0f, -0.0f, 0.0f );
static const float32x4x2_t signFlip8 = {signFlip4, vdupq_n_f32(0)}; // TODO: Check me
static const float32x4_t mask3 = vceqq_f32( SIMD_SETRVEC( 0, 0, 0, 1 ), vdupq_n_f32(0) );
static const float32x4_t binmul3 = vdupq_n_f32( BVHBINS * 0.49999f );
for (unsigned i = 0; i < 3 * BVHBINS; i++) binboxOrig[i] = max8; // binbox initialization template
// reset node pool
const unsigned spaceNeeded = primCount * 2;
if (allocatedBVHNodes < spaceNeeded)
{
ALIGNED_FREE( bvhNode );
ALIGNED_FREE( triIdx );
ALIGNED_FREE( fragment );
verts = (bvhvec4*)vertices;
triIdx = (unsigned*)ALIGNED_MALLOC( primCount * sizeof( unsigned ) );
bvhNode = (BVHNode*)ALIGNED_MALLOC( spaceNeeded * sizeof( BVHNode ) );
allocatedBVHNodes = spaceNeeded;
memset( &bvhNode[1], 0, 32 ); // avoid crash in refit.
fragment = (Fragment*)ALIGNED_MALLOC( primCount * sizeof( Fragment ) );
}
else assert( rebuildable == true );
triCount = idxCount = primCount;
unsigned newNodePtr = 2;
struct FragSSE { float32x4_t bmin4, bmax4; };
FragSSE* frag4 = (FragSSE*)fragment;
float32x4x2_t* frag8 = (float32x4x2_t*)fragment;
const float32x4_t* verts4 = (float32x4_t*)verts;
// assign all triangles to the root node
BVHNode& root = bvhNode[0];
root.leftFirst = 0, root.triCount = triCount;
// initialize fragments and update root bounds
float32x4_t rootMin = max4, rootMax = max4;
for (unsigned i = 0; i < triCount; i++)
{
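// note: fragment minima are stored sign-flipped (negated), so a single vmaxq_f32 later grows min and max bounds at once.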
const float32x4_t v1 = veorq_s32( signFlip4, vminq_f32( vminq_f32( verts4[i * 3], verts4[i * 3 + 1] ), verts4[i * 3 + 2] ) );
const float32x4_t v2 = vmaxq_f32( vmaxq_f32( verts4[i * 3], verts4[i * 3 + 1] ), verts4[i * 3 + 2] );
frag4[i].bmin4 = v1, frag4[i].bmax4 = v2, rootMin = vmaxq_f32( rootMin, v1 ), rootMax = vmaxq_f32( rootMax, v2 ), triIdx[i] = i;
}
rootMin = veorq_s32( rootMin, signFlip4 );
root.aabbMin = *(bvhvec3*)&rootMin, root.aabbMax = *(bvhvec3*)&rootMax;
// subdivide recursively
ALIGNED( 64 ) unsigned task[128], taskCount = 0, nodeIdx = 0;
const bvhvec3 minDim = (root.aabbMax - root.aabbMin) * 1e-7f;
while (1)
{
while (1)
{
BVHNode& node = bvhNode[nodeIdx];
float32x4_t* node4 = (float32x4_t*) & bvhNode[nodeIdx];
// find optimal object split
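// d4 = node extent with the w lane forced to -1; nmin4 = 2 * node minimum; rpd4 maps fragment centroids to bin indices (zeroed on degenerate axes).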
const float32x4_t d4 = vbslq_f32(vshrq_n_s32(mask3, 31), vsubq_f32( node4[1], node4[0] ), min1 );
const float32x4_t nmin4 = vmulq_f32( vandq_s32( node4[0], mask3 ), two4 );
const float32x4_t rpd4 = vandq_s32( vdivq_f32( binmul3, d4 ), vmvnq_u32(vceqq_f32( d4, vdupq_n_f32(0) ) ) );
// implementation of Section 4.1 of "Parallel Spatial Splits in Bounding Volume Hierarchies":
// main loop operates on two fragments to minimize dependencies and maximize ILP.
unsigned fi = triIdx[node.leftFirst];
memset( count, 0, sizeof( count ) );
float32x4x2_t r0, r1, r2, f = frag8[fi];
int32x4_t bi4 = vcvtq_s32_f32(vrnd32xq_f32( vsubq_f32( vmulq_f32( vsubq_f32( vsubq_f32( frag4[fi].bmax4, frag4[fi].bmin4 ), nmin4 ), rpd4 ), half4 ) ) );
memcpy( binbox, binboxOrig, sizeof( binbox ) );
unsigned i0 = ILANE( bi4, 0 ), i1 = ILANE( bi4, 1 ), i2 = ILANE( bi4, 2 ), * ti = triIdx + node.leftFirst + 1;
for (unsigned i = 0; i < node.triCount - 1; i++)
{
unsigned fid = *ti++;
const float32x4x2_t b0 = binbox[i0];
const float32x4x2_t b1 = binbox[BVHBINS + i1];
const float32x4x2_t b2 = binbox[2 * BVHBINS + i2];
const float32x4_t fmin = frag4[fid].bmin4, fmax = frag4[fid].bmax4;
r0 = vmaxq_f32x2(b0, f);
r1 = vmaxq_f32x2(b1, f);
r2 = vmaxq_f32x2(b2, f);
const int32x4_t b4 = vcvtq_s32_f32( vrnd32xq_f32( vsubq_f32( vmulq_f32( vsubq_f32( vsubq_f32( fmax, fmin ), nmin4 ), rpd4 ), half4 ) ) );

f = frag8[fid], count[0][i0]++, count[1][i1]++, count[2][i2]++;
binbox[i0] = r0, i0 = ILANE( b4, 0 );
binbox[BVHBINS + i1] = r1, i1 = ILANE( b4, 1 );
binbox[2 * BVHBINS + i2] = r2, i2 = ILANE( b4, 2 );
}
// final business for final fragment
const float32x4x2_t b0 = binbox[i0], b1 = binbox[BVHBINS + i1], b2 = binbox[2 * BVHBINS + i2];
count[0][i0]++, count[1][i1]++, count[2][i2]++;
r0 = vmaxq_f32x2(b0, f);
r1 = vmaxq_f32x2(b1, f);
r2 = vmaxq_f32x2(b2, f);
binbox[i0] = r0, binbox[BVHBINS + i1] = r1, binbox[2 * BVHBINS + i2] = r2;
// calculate per-split totals
float splitCost = 1e30f;
unsigned bestAxis = 0, bestPos = 0, n = newNodePtr, j = node.leftFirst + node.triCount, src = node.leftFirst;
const float32x4x2_t* bb = binbox;
for (int a = 0; a < 3; a++, bb += BVHBINS) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a])
{
// hardcoded bin processing for BVHBINS == 8
assert( BVHBINS == 8 );
const unsigned lN0 = count[a][0], rN0 = count[a][7];
const float32x4x2_t lb0 = bb[0], rb0 = bb[7];
const unsigned lN1 = lN0 + count[a][1], rN1 = rN0 + count[a][6], lN2 = lN1 + count[a][2];
const unsigned rN2 = rN1 + count[a][5], lN3 = lN2 + count[a][3], rN3 = rN2 + count[a][4];
const float32x4x2_t lb1 = vmaxq_f32x2( lb0, bb[1] ), rb1 = vmaxq_f32x2( rb0, bb[6] );
const float32x4x2_t lb2 = vmaxq_f32x2( lb1, bb[2] ), rb2 = vmaxq_f32x2( rb1, bb[5] );
const float32x4x2_t lb3 = vmaxq_f32x2( lb2, bb[3] ), rb3 = vmaxq_f32x2( rb2, bb[4] );
const unsigned lN4 = lN3 + count[a][4], rN4 = rN3 + count[a][3], lN5 = lN4 + count[a][5];
const unsigned rN5 = rN4 + count[a][2], lN6 = lN5 + count[a][6], rN6 = rN5 + count[a][1];
const float32x4x2_t lb4 = vmaxq_f32x2( lb3, bb[4] ), rb4 = vmaxq_f32x2( rb3, bb[3] );
const float32x4x2_t lb5 = vmaxq_f32x2( lb4, bb[5] ), rb5 = vmaxq_f32x2( rb4, bb[2] );
const float32x4x2_t lb6 = vmaxq_f32x2( lb5, bb[6] ), rb6 = vmaxq_f32x2( rb5, bb[1] );
float ANLR3 = 1e30f; PROCESS_PLANE( a, 3, ANLR3, lN3, rN3, lb3, rb3 ); // most likely split
float ANLR2 = 1e30f; PROCESS_PLANE( a, 2, ANLR2, lN2, rN4, lb2, rb4 );
float ANLR4 = 1e30f; PROCESS_PLANE( a, 4, ANLR4, lN4, rN2, lb4, rb2 );
float ANLR5 = 1e30f; PROCESS_PLANE( a, 5, ANLR5, lN5, rN1, lb5, rb1 );
float ANLR1 = 1e30f; PROCESS_PLANE( a, 1, ANLR1, lN1, rN5, lb1, rb5 );
float ANLR0 = 1e30f; PROCESS_PLANE( a, 0, ANLR0, lN0, rN6, lb0, rb6 );
float ANLR6 = 1e30f; PROCESS_PLANE( a, 6, ANLR6, lN6, rN0, lb6, rb0 ); // least likely split
}
if (splitCost >= node.CalculateNodeCost()) break; // not splitting is better.
// in-place partition
const float rpd = (*(bvhvec3*)&rpd4)[bestAxis], nmin = (*(bvhvec3*)&nmin4)[bestAxis];
unsigned t, fr = triIdx[src];
for (unsigned i = 0; i < node.triCount; i++)
{
const unsigned bi = (unsigned)((fragment[fr].bmax[bestAxis] - fragment[fr].bmin[bestAxis] - nmin) * rpd);
if (bi <= bestPos) fr = triIdx[++src]; else t = fr, fr = triIdx[src] = triIdx[--j], triIdx[j] = t;
}
// create child nodes and recurse
const unsigned leftCount = src - node.leftFirst, rightCount = node.triCount - leftCount;
if (leftCount == 0 || rightCount == 0) break; // should not happen.
(*(float32x4x2_t*)& bvhNode[n]).val[0] = veorq_s32( bestLBox.val[0], signFlip8.val[0] );
(*(float32x4x2_t*)& bvhNode[n]).val[1] = veorq_s32( bestLBox.val[1], signFlip8.val[1] );
bvhNode[n].leftFirst = node.leftFirst, bvhNode[n].triCount = leftCount;
node.leftFirst = n++, node.triCount = 0, newNodePtr += 2;
(*(float32x4x2_t*)& bvhNode[n]).val[0] = veorq_s32( bestRBox.val[0], signFlip8.val[0] );
(*(float32x4x2_t*)& bvhNode[n]).val[1] = veorq_s32( bestRBox.val[1], signFlip8.val[1] );
bvhNode[n].leftFirst = j, bvhNode[n].triCount = rightCount;
task[taskCount++] = n, nodeIdx = n - 1;
}
// fetch subdivision task from stack
if (taskCount == 0) break; else nodeIdx = task[--taskCount];
}
// all done.
refittable = true; // not using spatial splits: can refit this BVH
frag_min_flipped = true; // NEON was used for binning; fragment.min flipped
may_have_holes = false; // the NEON builder produces a continuous list of nodes
usedBVHNodes = newNodePtr;
}

// Traverse the second alternative BVH layout (ALT_SOA).
int BVH::Intersect_AltSoA( Ray& ray ) const
{
assert( alt2Node != 0 );
BVHNodeAlt2* node = &alt2Node[0], * stack[64];
unsigned stackPtr = 0, steps = 0;
const float32x4_t Ox4 = vdupq_n_f32( ray.O.x ), rDx4 = vdupq_n_f32( ray.rD.x );
const float32x4_t Oy4 = vdupq_n_f32( ray.O.y ), rDy4 = vdupq_n_f32( ray.rD.y );
const float32x4_t Oz4 = vdupq_n_f32( ray.O.z ), rDz4 = vdupq_n_f32( ray.rD.z );
// const float32x4_t inf4 = vdupq_n_f32( 1e30f );
while (1)
{
steps++;
if (node->isLeaf())
{
for (unsigned i = 0; i < node->triCount; i++)
{
const unsigned tidx = triIdx[node->firstTri + i], vertIdx = tidx * 3;
const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx];
const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx];
const bvhvec3 h = cross( ray.D, edge2 );
const float a = dot( edge1, h );
if (fabs( a ) < 0.0000001f) continue; // ray parallel to triangle
const float f = 1 / a;
const bvhvec3 s = ray.O - bvhvec3( verts[vertIdx] );
const float u = f * dot( s, h );
if (u < 0 || u > 1) continue;
const bvhvec3 q = cross( s, edge1 );
const float v = f * dot( ray.D, q );
if (v < 0 || u + v > 1) continue;
const float t = f * dot( edge2, q );
if (t < 0 || t > ray.hit.t) continue;
ray.hit.t = t, ray.hit.u = u, ray.hit.v = v, ray.hit.prim = tidx;
}
if (stackPtr == 0) break; else node = stack[--stackPtr];
continue;
}
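// slab test against both children at once: xxxx/yyyy/zzzz each hold { left.min, left.max, right.min, right.max } for one axis (see Convert).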
float32x4_t x4 = vmulq_f32( vsubq_f32( node->xxxx, Ox4 ), rDx4 );
float32x4_t y4 = vmulq_f32( vsubq_f32( node->yyyy, Oy4 ), rDy4 );
float32x4_t z4 = vmulq_f32( vsubq_f32( node->zzzz, Oz4 ), rDz4 );
// transpose
float32x4_t t0 = vzip1q_f32( x4, y4 ), t2 = vzip1q_f32( z4, z4 );
float32x4_t t1 = vzip2q_f32( x4, y4 ), t3 = vzip2q_f32( z4, z4 );
float32x4_t xyzw1a = vcombine_f32( vget_low_f32(t0), vget_low_f32(t2) );
float32x4_t xyzw2a = vcombine_f32( vget_high_f32(t0), vget_high_f32(t2) );
float32x4_t xyzw1b = vcombine_f32( vget_low_f32(t1), vget_low_f32(t3) );
float32x4_t xyzw2b = vcombine_f32( vget_high_f32(t1), vget_high_f32(t3) );
// process
float32x4_t tmina4 = vminq_f32( xyzw1a, xyzw2a ), tmaxa4 = vmaxq_f32( xyzw1a, xyzw2a );
float32x4_t tminb4 = vminq_f32( xyzw1b, xyzw2b ), tmaxb4 = vmaxq_f32( xyzw1b, xyzw2b );
// transpose back
t0 = vzip1q_f32( tmina4, tmaxa4 ), t2 = vzip1q_f32( tminb4, tmaxb4 );
t1 = vzip2q_f32( tmina4, tmaxa4 ), t3 = vzip2q_f32( tminb4, tmaxb4 );
x4 = vcombine_f32( vget_low_f32(t0), vget_low_f32(t2) );
y4 = vcombine_f32( vget_high_f32(t0), vget_high_f32(t2) );
z4 = vcombine_f32( vget_low_f32(t1), vget_low_f32(t3) );
unsigned lidx = node->left, ridx = node->right;
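// per-child entry/exit distances: lanes 0 and 1 belong to the left child, lanes 2 and 3 to the right.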
const float32x4_t min4 = vmaxq_f32( vmaxq_f32( vmaxq_f32( x4, y4 ), z4 ), vdupq_n_f32(0) );
const float32x4_t max4 = vminq_f32( vminq_f32( vminq_f32( x4, y4 ), z4 ), vdupq_n_f32( ray.hit.t ) );
#if 0
// TODO: why is this slower on gen14?
const float tmina_0 = vgetq_lane_f32( min4, 0 ), tmaxa_1 = vgetq_lane_f32( max4, 1 );
const float tminb_2 = vgetq_lane_f32( min4, 2 ), tmaxb_3 = vgetq_lane_f32( max4, 3 );
t0 = __builtin_shufflevector( max4, max4, 3, 1, 3, 1);
t1 = __builtin_shufflevector( min4, min4, 2, 0, 2, 0 );
t0 = vbslq_f32( vcgeq_f32( t0, t1 ), t1, inf4 );
float dist1 = vgetq_lane_f32( t0, 1 ), dist2 = vgetq_lane_f32( t0, 0 );
#else
const float tmina_0 = vgetq_lane_f32( min4, 0 ), tmaxa_1 = vgetq_lane_f32( max4, 1 );
const float tminb_2 = vgetq_lane_f32( min4, 2 ), tmaxb_3 = vgetq_lane_f32( max4, 3 );
float dist1 = tmaxa_1 >= tmina_0 ? tmina_0 : 1e30f;
float dist2 = tmaxb_3 >= tminb_2 ? tminb_2 : 1e30f;
#endif
if (dist1 > dist2)
{
float t = dist1; dist1 = dist2; dist2 = t;
unsigned i = lidx; lidx = ridx; ridx = i;
}
if (dist1 == 1e30f)
{
if (stackPtr == 0) break; else node = stack[--stackPtr];
}
else
{
node = alt2Node + lidx;
if (dist2 != 1e30f) stack[stackPtr++] = alt2Node + ridx;
}
}
return steps;
}

#endif // BVH_USENEON

} // namespace tinybvh

#endif // TINYBVH_IMPLEMENTATION
2 changes: 2 additions & 0 deletions tiny_bvh_fenster.cpp
@@ -86,6 +86,8 @@ void Init()
// build a BVH over the scene
#if defined(BVH_USEAVX)
bvh.BuildAVX( triangles, verts / 3 );
#elif defined(BVH_USENEON)
bvh.BuildNEON( triangles, verts / 3 );
#else
// bvh.Build( triangles, verts / 3 );
#endif
11 changes: 11 additions & 0 deletions tiny_bvh_speedtest.cpp
@@ -176,6 +176,17 @@ int main()
printf( "%7.2fms for %7i triangles ", buildTimeAVX * 1000.0f, verts / 3 );
printf( "- %6i nodes, SAH=%.2f\n", bvh.usedBVHNodes, bvh.SAHCost() );

#endif
#ifdef BVH_USENEON

// measure single-core bvh construction time - NEON builder
printf( "- fast NEON builder: " );
t.reset();
for (int pass = 0; pass < 3; pass++) bvh.BuildNEON( triangles, verts / 3 );
float buildTimeNEON = t.elapsed() / 3.0f;
printf( "%7.2fms for %7i triangles ", buildTimeNEON * 1000.0f, verts / 3 );
printf( "- %6i nodes, SAH=%.2f\n", bvh.usedBVHNodes, bvh.SAHCost() );

#endif
#endif
