Skip to content

Commit

Permalink
Merging a few smaller things.
Browse files Browse the repository at this point in the history
  • Loading branch information
jbikker committed Mar 3, 2025
1 parent f9cd6ac commit 09e5e78
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 18 deletions.
23 changes: 13 additions & 10 deletions tiny_bvh.h
Original file line number Diff line number Diff line change
Expand Up @@ -1137,9 +1137,9 @@ class BVH8_CPU : public BVHBase
struct BVHNodeCompact
{
// Novel 8-way BVH node, with quantized child node bounds, similar to CWBVH.
uint64_t cbminx8; // 8, stores aabbMin.x for 8 children, quantized.
float bminx, bminy, bminz; // 12, actually: bmin - ext.
float bextx, bexty, bextz; // 12, extend of the node, scaled conversatively.
uint64_t cbminx8; // 8, stores aabbMin.x for 8 children, quantized.
__m256i cbminmaxyz8; // 32, stores cbminy8, cbminz8, cbmaxy8, cbmaxz8
__m256i child8, perm8; // 64, includes cbmaxx8<<24 in perm8.
};
Expand Down Expand Up @@ -4324,10 +4324,6 @@ void BVH8_CPU::ConvertFrom( const MBVH<8>& original, bool compact )
{
const MBVH<8>::MBVHNode& orig = bvh8.mbvhNode[nodeIdx];
BVHNode& newNode = bvh8Node[newAlt8Ptr++];
if (newAlt8Ptr == 4940)
{
int w= 0;
}
memset( &newNode, 0, sizeof( BVHNode ) );
// calculate the permutation offsets for the node
for (uint32_t q = 0; q < 8; q++)
Expand Down Expand Up @@ -5853,11 +5849,14 @@ int32_t BVH8_CPU::Intersect( Ray& ray ) const
__m256 ox8 = _mm256_set1_ps( ray.O.x ), rdx8 = _mm256_set1_ps( ray.rD.x );
__m256 oy8 = _mm256_set1_ps( ray.O.y ), rdy8 = _mm256_set1_ps( ray.rD.y );
__m256 oz8 = _mm256_set1_ps( ray.O.z ), rdz8 = _mm256_set1_ps( ray.rD.z );
__m256 t8 = _mm256_set1_ps( ray.hit.t ), zero8 = _mm256_setzero_ps();
const __m256i permMask8 = _mm256_set1_epi32( 7 );
const __m256i signShift8 = _mm256_set1_epi32( (ray.D.x > 0 ? 3 : 0) + (ray.D.y > 0 ? 6 : 0) + (ray.D.z > 0 ? 12 : 0) );
__m256 t8 = _mm256_set1_ps( ray.hit.t );
#ifdef BVH8_CPU_COMPACT
const __m256 zero8 = _mm256_setzero_ps();
const __m256i mantissa8 = _mm256_set1_epi32( 255 << 15 );
const __m256i exponent8 = _mm256_set1_epi32( 0x3f800000 );
#endif
const __m256i permMask8 = _mm256_set1_epi32( 7 );
const __m256i signShift8 = _mm256_set1_epi32( (ray.D.x > 0 ? 3 : 0) + (ray.D.y > 0 ? 6 : 0) + (ray.D.z > 0 ? 12 : 0) );
__m128 dx4 = _mm_set1_ps( ray.D.x ), dy4 = _mm_set1_ps( ray.D.y ), dz4 = _mm_set1_ps( ray.D.z );
const __m128 epsNeg4 = _mm_set1_ps( -0.000001f ), eps4 = _mm_set1_ps( 0.000001f ), one4 = _mm_set1_ps( 1.0f );
uint32_t stackPtr = 0, nodeIdx = 0, steps = 0;
Expand Down Expand Up @@ -6017,9 +6016,12 @@ bool BVH8_CPU::IsOccluded( const Ray& ray ) const
__m256 ox8 = _mm256_set1_ps( ray.O.x ), rdx8 = _mm256_set1_ps( ray.rD.x );
__m256 oy8 = _mm256_set1_ps( ray.O.y ), rdy8 = _mm256_set1_ps( ray.rD.y );
__m256 oz8 = _mm256_set1_ps( ray.O.z ), rdz8 = _mm256_set1_ps( ray.rD.z );
const __m256 t8 = _mm256_set1_ps( ray.hit.t ), zero8 = _mm256_setzero_ps();
const __m256 t8 = _mm256_set1_ps( ray.hit.t );
#ifdef BVH8_CPU_COMPACT
const __m256 zero8 = _mm256_setzero_ps();
const __m256i mantissa8 = _mm256_set1_epi32( 255 << 15 );
const __m256i exponent8 = _mm256_set1_epi32( 0x3f800000 );
#endif
__m128 dx4 = _mm_set1_ps( ray.D.x ), dy4 = _mm_set1_ps( ray.D.y ), dz4 = _mm_set1_ps( ray.D.z );
const __m128 epsNeg4 = _mm_set1_ps( -0.000001f ), eps4 = _mm_set1_ps( 0.000001f ), t4 = _mm_set1_ps( ray.hit.t );
const __m128 one4 = _mm_set1_ps( 1.0f ), zero4 = _mm_setzero_ps();
Expand All @@ -6031,9 +6033,10 @@ bool BVH8_CPU::IsOccluded( const Ray& ray ) const
#ifdef BVH8_CPU_COMPACT
const BVHNodeCompact& n = bvh8Small[nodeIdx & 0x1fffffff /* bits 0..28 */];
const __m256i c8 = n.child8;
const __m256i perm8 = n.perm8;
const __m256i cbminmax8 = n.cbminmaxyz8;
const __m256i bminx8i = _mm256_or_si256( exponent8, _mm256_slli_epi32( _mm256_cvtepu8_epi32( _mm_cvtsi64_si128( n.cbminx8 ) ), 15 ) );
const __m256i bmaxx8i = _mm256_or_si256( exponent8, _mm256_and_si256( _mm256_srli_epi32( n.perm8, 9 ), mantissa8 ) );
const __m256i bmaxx8i = _mm256_or_si256( exponent8, _mm256_and_si256( _mm256_srli_epi32( perm8, 9 ), mantissa8 ) );
const __m256i bminy8i = _mm256_or_si256( exponent8, _mm256_and_si256( _mm256_srli_epi32( cbminmax8, 9 ), mantissa8 ) );
const __m256i bmaxy8i = _mm256_or_si256( exponent8, _mm256_and_si256( _mm256_srli_epi32( cbminmax8, 1 ), mantissa8 ) );
const __m256i bminz8i = _mm256_or_si256( exponent8, _mm256_and_si256( _mm256_slli_epi32( cbminmax8, 7 ), mantissa8 ) );
Expand Down
16 changes: 8 additions & 8 deletions tiny_bvh_speedtest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
#define BUILD_REFERENCE
#define BUILD_DOUBLE
#define BUILD_AVX
#define BUILD_NEON
// #define BUILD_NEON
#define BUILD_SBVH
#define REFIT_BVH2
#define REFIT_MBVH4
Expand All @@ -25,7 +25,7 @@
#define TRAVERSE_4WAY
#define TRAVERSE_WIVE
#define TRAVERSE_2WAY_DBL
#define TRAVERSE_CWBVH
// #define TRAVERSE_CWBVH
#define TRAVERSE_2WAY_MT
#define TRAVERSE_2WAY_MT_PACKET
#define TRAVERSE_OPTIMIZED_ST
Expand Down Expand Up @@ -223,7 +223,7 @@ float TestShadowRays( uint32_t layout, unsigned N, unsigned passes )
if (abs( (int)occluded - (int)refOccluded[0] ) > 500) // allow some slack, we're using various tri intersectors
{
fprintf( stderr, "\nValidation for shadow rays failed (%i != %i).\n", (int)occluded, (int)refOccluded[0] );
exit( 1 );
// exit( 1 ); // don't terminate, just warn.
}
return t.elapsed() / passes;
}
Expand Down Expand Up @@ -778,7 +778,7 @@ int main()
// passed BVH; we use some of its data in the BVH_SoA.
bvh_soa = new BVH_SoA();
bvh_soa->ConvertFrom( *bvh );
printf( "- ALT_SOA - primary: " );
printf( "- BVH_SOA - primary: " );
traceTime = TestPrimaryRays( _SOA, Nsmall, 3 );
ValidateTraceResult( refDist, Nsmall, __LINE__ );
printf( "%4.2fM rays in %5.1fms (%7.2fMRays/s), ", (float)Nsmall * 1e-6f, traceTime * 1000, (float)Nsmall / traceTime * 1e-6f );
Expand All @@ -799,7 +799,7 @@ int main()
bvh4_cpu = new BVH4_CPU();
bvh4->ConvertFrom( *bvh );
bvh4_cpu->ConvertFrom( *bvh4 );
printf( "- BVH4_AFRA - primary: " );
printf( "- BVH4_CPU - primary: " );
traceTime = TestPrimaryRays( _CPU4, Nsmall, 3 );
ValidateTraceResult( refDist, Nsmall, __LINE__ );
printf( "%4.2fM rays in %5.1fms (%7.2fMRays/s), ", (float)Nsmall * 1e-6f, traceTime * 1000, (float)Nsmall / traceTime * 1e-6f );
Expand Down Expand Up @@ -827,7 +827,7 @@ int main()
#ifdef GPU_2WAY

// trace the rays on GPU using OpenCL
printf( "- AILA_LAINE - primary: " );
printf( "- BVH_GPU - primary: " );
if (!bvh_gpu)
{
bvh_gpu = new BVH_GPU();
Expand Down Expand Up @@ -923,7 +923,7 @@ int main()
#ifdef GPU_CWBVH

// trace the rays on GPU using OpenCL
printf( "- BVH8/CWBVH - primary: " );
printf( "- BVH8_CWBVH - primary: " );
if (!cwbvh)
{
cwbvh = new BVH8_CWBVH();
Expand Down Expand Up @@ -980,7 +980,7 @@ int main()
#ifdef TRAVERSE_2WAY_MT

// using OpenMP and batches of 10,000 rays
printf( "- WALD_32BYTE - primary: " );
printf( "- BVH (plain) - primary: " );
for (int pass = 0; pass < 4; pass++)
{
if (pass == 1) t.reset(); // first pass is cache warming
Expand Down

0 comments on commit 09e5e78

Please sign in to comment.