From 9cc8cbb147ec199c6bf89a2720eb3ca50661f65a Mon Sep 17 00:00:00 2001 From: Jacco Bikker Date: Fri, 28 Feb 2025 09:28:58 +0100 Subject: [PATCH 1/7] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 10a04eb..96bc2de 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,6 @@ +# dev +This is the **development branch** for tinybvh. + # tinybvh Single-header BVH construction and traversal library written as "Sane C++" (or "C with classes"). Some C++11 is used, e.g. for threading. The library has no dependencies. From ad62042cba5746d1e40290276dc92cb3c4a624c8 Mon Sep 17 00:00:00 2001 From: Jacco Bikker Date: Fri, 28 Feb 2025 09:41:30 +0100 Subject: [PATCH 2/7] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 96bc2de..5570cb4 100644 --- a/README.md +++ b/README.md @@ -145,6 +145,9 @@ Plans, ordered by priority: * Speed improvements: * Faster optimizer for AVX-capable CPUs * Improve speed of SBVH builder +* Features & outstanding issues: + * 'Watertight' triangle intersection option + * Load/save/Optimize/Refit for BVH8_CPU * Demo of tinybvh on GPU using other apis: * Ray tracing in pure OpenGL * Ray tracing in pure DirectX From 9b60ad361398f608396773d0f452b4f236240a35 Mon Sep 17 00:00:00 2001 From: jbikker Date: Fri, 28 Feb 2025 11:11:34 +0100 Subject: [PATCH 3/7] Fixed names of C_INT and C_TRAV. --- tiny_bvh.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tiny_bvh.h b/tiny_bvh.h index d52b6f8..b6125ba 100644 --- a/tiny_bvh.h +++ b/tiny_bvh.h @@ -118,10 +118,10 @@ THE SOFTWARE. // These are defaults, which initialize the public members c_int and c_trav in // BVHBase (and thus each BVH instance). #ifndef C_INT -#define C_INT_ 1 +#define C_INT 1 #endif #ifndef C_TRAV -#define C_TRAV_ 1 +#define C_TRAV 1 #endif // SBVH: "Unsplitting" @@ -663,8 +663,8 @@ class BVHBase uint32_t usedNodes = 0; // number of nodes used for the BVH. uint32_t triCount = 0; // number of primitives in the BVH. uint32_t idxCount = 0; // number of primitive indices; can exceed triCount for SBVH. - float c_trav = C_TRAV_; // cost of a traversal step, used to steer SAH construction. - float c_int = C_INT_; // cost of a primitive intersection, used to steer SAH construction. + float c_trav = C_TRAV; // cost of a traversal step, used to steer SAH construction. + float c_int = C_INT; // cost of a primitive intersection, used to steer SAH construction. bvhvec3 aabbMin, aabbMax; // bounds of the root node of the BVH. // Custom memory allocation void* AlignedAlloc( size_t size ); From 4c997cfa7afada8da28537fe12d1235229336068 Mon Sep 17 00:00:00 2001 From: wuyakuma Date: Sat, 1 Mar 2025 22:15:27 +0800 Subject: [PATCH 4/7] Enable OpenCL on Apple --- CMakeLists.txt | 6 ++++++ tiny_bvh_speedtest.cpp | 16 +++++++++++----- tiny_ocl.h | 25 ++++++++++++++++++++++++- traverse_bvh2.cl | 24 ++++++++++++++++++++++-- 4 files changed, 63 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8f8dc4d..a16e552 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,6 +8,10 @@ project(tiny_bvh LANGUAGES CXX) if (APPLE) find_library(COCOA_LIBRARY Cocoa) + find_library(OPENCL_LIBRARY OpenCL) + if (OPENCL_LIBRARY) + set(OPENCL_FRAMEWORK "-framework OpenCL") + endif() elseif (UNIX AND NOT EMSCRIPTEN) find_package(X11) elseif (EMSCRIPTEN) @@ -118,6 +122,8 @@ if (NOT MSVC) set(tiny_bvh_speedtest_link_flags ${tiny_bvh_speedtest_link_flags} -sPROXY_TO_PTHREAD=1) endif() endif() + elseif (APPLE) + target_link_libraries(tiny_bvh_speedtest ${COCOA_LIBRARY} ${OPENCL_FRAMEWORK}) endif() target_compile_options(tiny_bvh_speedtest PRIVATE ${tiny_bvh_speedtest_cxx_flags}) target_link_options(tiny_bvh_speedtest PRIVATE ${tiny_bvh_speedtest_link_flags}) diff --git a/tiny_bvh_speedtest.cpp b/tiny_bvh_speedtest.cpp index e73b932..3cdc2fc 100644 --- a/tiny_bvh_speedtest.cpp +++ b/tiny_bvh_speedtest.cpp @@ -48,6 +48,8 @@ using namespace tinybvh; #endif #ifdef _WIN32 #include // for __cpuidex +#elif defined(__APPLE__) && defined(__MACH__) +// Keep ENABLE_OPENCL for APPLE #elif defined ENABLE_OPENCL #undef ENABLE_OPENCL #endif @@ -733,7 +735,8 @@ int main() #endif #ifdef TRAVERSE_CWBVH - +#ifdef BVH_USEAVX + // CWBVH - Not efficient on CPU. if (!cwbvh) { @@ -744,7 +747,8 @@ int main() traceTime = TestPrimaryRays( _CWBVH, Nsmall, 3 ); ValidateTraceResult( refDist, Nsmall, __LINE__ ); printf( "%4.2fM rays in %5.1fms (%7.2fMRays/s)\n", (float)Nsmall * 1e-6f, traceTime * 1000, (float)Nsmall / traceTime * 1e-6f ); - + +#endif #endif #if defined TRAVERSE_OPTIMIZED_ST || defined TRAVERSE_4WAY_OPTIMIZED @@ -766,7 +770,8 @@ int main() #endif #ifdef TRAVERSE_OPTIMIZED_ST - +#ifdef BVH_USEAVX + // ALT_SOA delete bvh_soa; // Building a BVH_SoA over an optimized BVH: Careful, do not delete the @@ -779,7 +784,8 @@ int main() printf( "%4.2fM rays in %5.1fms (%7.2fMRays/s), ", (float)Nsmall * 1e-6f, traceTime * 1000, (float)Nsmall / traceTime * 1e-6f ); traceTime = TestShadowRays( _SOA, Nsmall, 3 ); printf( "shadow: %5.1fms (%7.2fMRays/s)\n", traceTime * 1000, (float)Nsmall / traceTime * 1e-6f ); - + +#endif #endif #ifdef TRAVERSE_4WAY_OPTIMIZED @@ -1082,4 +1088,4 @@ int main() printf( "all done." ); return 0; -} \ No newline at end of file +} diff --git a/tiny_ocl.h b/tiny_ocl.h index 461b70a..37456b0 100644 --- a/tiny_ocl.h +++ b/tiny_ocl.h @@ -359,7 +359,7 @@ class Kernel inline static cl_context context; // simplifies some things, but limits us to one device inline static cl_command_queue queue, queue2; inline static char* log = 0; - inline static bool isNVidia = false, isAMD = false, isIntel = false, isOther = false; + inline static bool isNVidia = false, isAMD = false, isIntel = false, isApple = false, isOther = false; inline static bool isAmpere = false, isTuring = false, isPascal = false; inline static bool isAda = false, isBlackwell = false, isRubin = false, isHopper = false; inline static int vendorLines = 0; @@ -753,6 +753,7 @@ Kernel::Kernel( const char* file, const char* entryPoint ) if (isNVidia) csText = "#define ISNVIDIA\n" + csText, vendorLines++; if (isAMD) csText = "#define ISAMD\n" + csText, vendorLines++; if (isIntel) csText = "#define ISINTEL\n" + csText, vendorLines++; + if (isApple) csText = "#define ISAPPLE\n" + csText, vendorLines++; if (isOther) csText = "#define ISOTHER\n" + csText, vendorLines++; if (isAmpere) csText = "#define ISAMPERE\n" + csText, vendorLines++; if (isTuring) csText = "#define ISTURING\n" + csText, vendorLines++; @@ -962,7 +963,11 @@ bool Kernel::InitCL() string deviceList( extensions ); free( extensions ); string mustHave[] = { +#if defined(__APPLE__) && defined(__MACH__) + "cl_APPLE_gl_sharing", +#else "cl_khr_gl_sharing", +#endif "cl_khr_global_int32_base_atomics" }; bool hasAll = true; @@ -1067,6 +1072,10 @@ bool Kernel::InitCL() { isIntel = true; } + else if (strstr( d, "apple" )) + { + isApple = true; + } else { isOther = true; @@ -1092,17 +1101,31 @@ bool Kernel::InitCL() { printf( "Intel.\n" ); } + else if (isApple) + { + printf( "Apple.\n" ); + } else { printf( "identification failed.\n" ); } // create a command-queue +#if defined(__APPLE__) && defined(__MACH__) + // Cannot find symbol for _clCreateCommandQueueWithProperties on APPLE + cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE; + queue = clCreateCommandQueue( context, devices[deviceUsed], props, &error ); + if (!CHECKCL( error )) return false; + // create a second command queue for asynchronous copies + queue2 = clCreateCommandQueue( context, devices[deviceUsed], props, &error ); + if (!CHECKCL( error )) return false; +#else cl_queue_properties props[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0 }; queue = clCreateCommandQueueWithProperties( context, devices[deviceUsed], props, &error ); if (!CHECKCL( error )) return false; // create a second command queue for asynchronous copies queue2 = clCreateCommandQueueWithProperties( context, devices[deviceUsed], props, &error ); if (!CHECKCL( error )) return false; +#endif // cleanup delete[] devices; clStarted = true; diff --git a/traverse_bvh2.cl b/traverse_bvh2.cl index ed95d2c..83d5f11 100644 --- a/traverse_bvh2.cl +++ b/traverse_bvh2.cl @@ -31,8 +31,18 @@ float4 traverse_ailalaine( global struct BVHNodeAlt* altNode, global unsigned* i for (unsigned i = 0; i < triCount; i++) { const unsigned triIdx = idx[firstTri + i]; +#ifdef ISAPPLE + // FIX error: initializing 'const __private float4 *__private' with an expression of type '__global float4 *' changes address space of pointer + const float4 tri[3] = + { + verts[3 * triIdx], + verts[3 * triIdx + 1], + verts[3 * triIdx + 2], + }; +#else const float4* tri = verts + 3 * triIdx; - // triangle intersection - Möller-Trumbore +#endif + // triangle intersection - M�ller-Trumbore const float4 edge1 = tri[1] - tri[0], edge2 = tri[2] - tri[0]; const float3 h = cross( D, edge2.xyz ); const float a = dot( edge1.xyz, h ); @@ -100,8 +110,18 @@ bool isoccluded_ailalaine( global struct BVHNodeAlt* altNode, global unsigned* i for (unsigned i = 0; i < triCount; i++) { const unsigned triIdx = idx[firstTri + i]; +#ifdef ISAPPLE + // FIX error: initializing 'const __private float4 *__private' with an expression of type '__global float4 *' changes address space of pointer + const float4 tri[3] = + { + verts[3 * triIdx], + verts[3 * triIdx + 1], + verts[3 * triIdx + 2], + }; +#else const float4* tri = verts + 3 * triIdx; - // triangle intersection - Möller-Trumbore +#endif + // triangle intersection - M�ller-Trumbore const float4 edge1 = tri[1] - tri[0], edge2 = tri[2] - tri[0]; const float3 h = cross( D, edge2.xyz ); const float a = dot( edge1.xyz, h ); From 09e5e7806158a66f2065f535c9fbb08b8a48ee82 Mon Sep 17 00:00:00 2001 From: jbikker Date: Mon, 3 Mar 2025 09:14:12 +0100 Subject: [PATCH 5/7] Merging a few smaller things. --- tiny_bvh.h | 23 +++++++++++++---------- tiny_bvh_speedtest.cpp | 16 ++++++++-------- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/tiny_bvh.h b/tiny_bvh.h index 95923e4..5273e84 100644 --- a/tiny_bvh.h +++ b/tiny_bvh.h @@ -1137,9 +1137,9 @@ class BVH8_CPU : public BVHBase struct BVHNodeCompact { // Novel 8-way BVH node, with quantized child node bounds, similar to CWBVH. + uint64_t cbminx8; // 8, stores aabbMin.x for 8 children, quantized. float bminx, bminy, bminz; // 12, actually: bmin - ext. float bextx, bexty, bextz; // 12, extend of the node, scaled conversatively. - uint64_t cbminx8; // 8, stores aabbMin.x for 8 children, quantized. __m256i cbminmaxyz8; // 32, stores cbminy8, cbminz8, cbmaxy8, cbmaxz8 __m256i child8, perm8; // 64, includes cbmaxx8<<24 in perm8. }; @@ -4324,10 +4324,6 @@ void BVH8_CPU::ConvertFrom( const MBVH<8>& original, bool compact ) { const MBVH<8>::MBVHNode& orig = bvh8.mbvhNode[nodeIdx]; BVHNode& newNode = bvh8Node[newAlt8Ptr++]; - if (newAlt8Ptr == 4940) - { - int w= 0; - } memset( &newNode, 0, sizeof( BVHNode ) ); // calculate the permutation offsets for the node for (uint32_t q = 0; q < 8; q++) @@ -5853,11 +5849,14 @@ int32_t BVH8_CPU::Intersect( Ray& ray ) const __m256 ox8 = _mm256_set1_ps( ray.O.x ), rdx8 = _mm256_set1_ps( ray.rD.x ); __m256 oy8 = _mm256_set1_ps( ray.O.y ), rdy8 = _mm256_set1_ps( ray.rD.y ); __m256 oz8 = _mm256_set1_ps( ray.O.z ), rdz8 = _mm256_set1_ps( ray.rD.z ); - __m256 t8 = _mm256_set1_ps( ray.hit.t ), zero8 = _mm256_setzero_ps(); - const __m256i permMask8 = _mm256_set1_epi32( 7 ); - const __m256i signShift8 = _mm256_set1_epi32( (ray.D.x > 0 ? 3 : 0) + (ray.D.y > 0 ? 6 : 0) + (ray.D.z > 0 ? 12 : 0) ); + __m256 t8 = _mm256_set1_ps( ray.hit.t ); +#ifdef BVH8_CPU_COMPACT + const __m256 zero8 = _mm256_setzero_ps(); const __m256i mantissa8 = _mm256_set1_epi32( 255 << 15 ); const __m256i exponent8 = _mm256_set1_epi32( 0x3f800000 ); +#endif + const __m256i permMask8 = _mm256_set1_epi32( 7 ); + const __m256i signShift8 = _mm256_set1_epi32( (ray.D.x > 0 ? 3 : 0) + (ray.D.y > 0 ? 6 : 0) + (ray.D.z > 0 ? 12 : 0) ); __m128 dx4 = _mm_set1_ps( ray.D.x ), dy4 = _mm_set1_ps( ray.D.y ), dz4 = _mm_set1_ps( ray.D.z ); const __m128 epsNeg4 = _mm_set1_ps( -0.000001f ), eps4 = _mm_set1_ps( 0.000001f ), one4 = _mm_set1_ps( 1.0f ); uint32_t stackPtr = 0, nodeIdx = 0, steps = 0; @@ -6017,9 +6016,12 @@ bool BVH8_CPU::IsOccluded( const Ray& ray ) const __m256 ox8 = _mm256_set1_ps( ray.O.x ), rdx8 = _mm256_set1_ps( ray.rD.x ); __m256 oy8 = _mm256_set1_ps( ray.O.y ), rdy8 = _mm256_set1_ps( ray.rD.y ); __m256 oz8 = _mm256_set1_ps( ray.O.z ), rdz8 = _mm256_set1_ps( ray.rD.z ); - const __m256 t8 = _mm256_set1_ps( ray.hit.t ), zero8 = _mm256_setzero_ps(); + const __m256 t8 = _mm256_set1_ps( ray.hit.t ); +#ifdef BVH8_CPU_COMPACT + const __m256 zero8 = _mm256_setzero_ps(); const __m256i mantissa8 = _mm256_set1_epi32( 255 << 15 ); const __m256i exponent8 = _mm256_set1_epi32( 0x3f800000 ); +#endif __m128 dx4 = _mm_set1_ps( ray.D.x ), dy4 = _mm_set1_ps( ray.D.y ), dz4 = _mm_set1_ps( ray.D.z ); const __m128 epsNeg4 = _mm_set1_ps( -0.000001f ), eps4 = _mm_set1_ps( 0.000001f ), t4 = _mm_set1_ps( ray.hit.t ); const __m128 one4 = _mm_set1_ps( 1.0f ), zero4 = _mm_setzero_ps(); @@ -6031,9 +6033,10 @@ bool BVH8_CPU::IsOccluded( const Ray& ray ) const #ifdef BVH8_CPU_COMPACT const BVHNodeCompact& n = bvh8Small[nodeIdx & 0x1fffffff /* bits 0..28 */]; const __m256i c8 = n.child8; + const __m256i perm8 = n.perm8; const __m256i cbminmax8 = n.cbminmaxyz8; const __m256i bminx8i = _mm256_or_si256( exponent8, _mm256_slli_epi32( _mm256_cvtepu8_epi32( _mm_cvtsi64_si128( n.cbminx8 ) ), 15 ) ); - const __m256i bmaxx8i = _mm256_or_si256( exponent8, _mm256_and_si256( _mm256_srli_epi32( n.perm8, 9 ), mantissa8 ) ); + const __m256i bmaxx8i = _mm256_or_si256( exponent8, _mm256_and_si256( _mm256_srli_epi32( perm8, 9 ), mantissa8 ) ); const __m256i bminy8i = _mm256_or_si256( exponent8, _mm256_and_si256( _mm256_srli_epi32( cbminmax8, 9 ), mantissa8 ) ); const __m256i bmaxy8i = _mm256_or_si256( exponent8, _mm256_and_si256( _mm256_srli_epi32( cbminmax8, 1 ), mantissa8 ) ); const __m256i bminz8i = _mm256_or_si256( exponent8, _mm256_and_si256( _mm256_slli_epi32( cbminmax8, 7 ), mantissa8 ) ); diff --git a/tiny_bvh_speedtest.cpp b/tiny_bvh_speedtest.cpp index 3cdc2fc..fb973ba 100644 --- a/tiny_bvh_speedtest.cpp +++ b/tiny_bvh_speedtest.cpp @@ -14,7 +14,7 @@ #define BUILD_REFERENCE #define BUILD_DOUBLE #define BUILD_AVX -#define BUILD_NEON +// #define BUILD_NEON #define BUILD_SBVH #define REFIT_BVH2 #define REFIT_MBVH4 @@ -25,7 +25,7 @@ #define TRAVERSE_4WAY #define TRAVERSE_WIVE #define TRAVERSE_2WAY_DBL -#define TRAVERSE_CWBVH +// #define TRAVERSE_CWBVH #define TRAVERSE_2WAY_MT #define TRAVERSE_2WAY_MT_PACKET #define TRAVERSE_OPTIMIZED_ST @@ -223,7 +223,7 @@ float TestShadowRays( uint32_t layout, unsigned N, unsigned passes ) if (abs( (int)occluded - (int)refOccluded[0] ) > 500) // allow some slack, we're using various tri intersectors { fprintf( stderr, "\nValidation for shadow rays failed (%i != %i).\n", (int)occluded, (int)refOccluded[0] ); - exit( 1 ); + // exit( 1 ); // don't terminate, just warn. } return t.elapsed() / passes; } @@ -778,7 +778,7 @@ int main() // passed BVH; we use some of its data in the BVH_SoA. bvh_soa = new BVH_SoA(); bvh_soa->ConvertFrom( *bvh ); - printf( "- ALT_SOA - primary: " ); + printf( "- BVH_SOA - primary: " ); traceTime = TestPrimaryRays( _SOA, Nsmall, 3 ); ValidateTraceResult( refDist, Nsmall, __LINE__ ); printf( "%4.2fM rays in %5.1fms (%7.2fMRays/s), ", (float)Nsmall * 1e-6f, traceTime * 1000, (float)Nsmall / traceTime * 1e-6f ); @@ -799,7 +799,7 @@ int main() bvh4_cpu = new BVH4_CPU(); bvh4->ConvertFrom( *bvh ); bvh4_cpu->ConvertFrom( *bvh4 ); - printf( "- BVH4_AFRA - primary: " ); + printf( "- BVH4_CPU - primary: " ); traceTime = TestPrimaryRays( _CPU4, Nsmall, 3 ); ValidateTraceResult( refDist, Nsmall, __LINE__ ); printf( "%4.2fM rays in %5.1fms (%7.2fMRays/s), ", (float)Nsmall * 1e-6f, traceTime * 1000, (float)Nsmall / traceTime * 1e-6f ); @@ -827,7 +827,7 @@ int main() #ifdef GPU_2WAY // trace the rays on GPU using OpenCL - printf( "- AILA_LAINE - primary: " ); + printf( "- BVH_GPU - primary: " ); if (!bvh_gpu) { bvh_gpu = new BVH_GPU(); @@ -923,7 +923,7 @@ int main() #ifdef GPU_CWBVH // trace the rays on GPU using OpenCL - printf( "- BVH8/CWBVH - primary: " ); + printf( "- BVH8_CWBVH - primary: " ); if (!cwbvh) { cwbvh = new BVH8_CWBVH(); @@ -980,7 +980,7 @@ int main() #ifdef TRAVERSE_2WAY_MT // using OpenMP and batches of 10,000 rays - printf( "- WALD_32BYTE - primary: " ); + printf( "- BVH (plain) - primary: " ); for (int pass = 0; pass < 4; pass++) { if (pass == 1) t.reset(); // first pass is cache warming From ca58e1ab65df78b6e3c38f836349b005be760831 Mon Sep 17 00:00:00 2001 From: jbikker Date: Mon, 3 Mar 2025 09:19:12 +0100 Subject: [PATCH 6/7] Formatting and cleanup. --- tiny_bvh.h | 32 ++++++++++++++++---------------- tiny_bvh_speedtest.cpp | 12 ++++++------ tiny_ocl.h | 28 ++++++++++++++-------------- 3 files changed, 36 insertions(+), 36 deletions(-) diff --git a/tiny_bvh.h b/tiny_bvh.h index 67e6243..f0300f9 100644 --- a/tiny_bvh.h +++ b/tiny_bvh.h @@ -351,7 +351,7 @@ inline bvhvec4 tinybvh_max( const bvhvec4& a, const bvhvec4& b ) { return bvhvec inline float tinybvh_clamp( const float x, const float a, const float b ) { return x > a ? (x < b ? x : b) : a; /* NaN safe */ } inline int32_t tinybvh_clamp( const int32_t x, const int32_t a, const int32_t b ) { return x > a ? (x < b ? x : b) : a; /* NaN safe */ } template inline static void tinybvh_swap( T& a, T& b ) { T t = a; a = b; b = t; } -inline float tinybvh_halfAreaf(const bvhvec3& v) { return v.x < -BVH_FAR ? 0 : (v.x * v.y + v.y * v.z + v.z * v.x); } // for SAH calculations +inline float tinybvh_half_area( const bvhvec3& v ) { return v.x < -BVH_FAR ? 0 : (v.x * v.y + v.y * v.z + v.z * v.x); } // for SAH calculations // Operator overloads. // Only a minimal set is provided. @@ -483,7 +483,7 @@ inline bvhdbl3 tinybvh_cross( const bvhdbl3& a, const bvhdbl3& b ) } inline double tinybvh_dot( const bvhdbl3& a, const bvhdbl3& b ) { return a.x * b.x + a.y * b.y + a.z * b.z; } -inline double tinybvh_halfAread(const bvhdbl3& v) { return v.x < -BVH_FAR ? 0 : (v.x * v.y + v.y * v.z + v.z * v.x); } // for SAH calculations +inline double tinybvh_half_area( const bvhdbl3& v ) { return v.x < -BVH_FAR ? 0 : (v.x * v.y + v.y * v.z + v.z * v.x); } // for SAH calculations #endif // DOUBLE_PRECISION_SUPPORT @@ -1831,8 +1831,8 @@ void BVH::Build() lBMax[i] = l2 = tinybvh_max( l2, binMax[a][i] ); rBMax[BVHBINS - 2 - i] = r2 = tinybvh_max( r2, binMax[a][BVHBINS - 1 - i] ); lN += count[a][i], rN += count[a][BVHBINS - 1 - i]; - ANL[i] = lN == 0 ? BVH_FAR : (tinybvh_halfAreaf(l2 - l1) * (float)lN); - ANR[BVHBINS - 2 - i] = rN == 0 ? BVH_FAR : (tinybvh_halfAreaf(r2 - r1) * (float)rN); + ANL[i] = lN == 0 ? BVH_FAR : (tinybvh_half_area( l2 - l1 ) * (float)lN); + ANR[BVHBINS - 2 - i] = rN == 0 ? BVH_FAR : (tinybvh_half_area( r2 - r1 ) * (float)rN); } // evaluate bin totals to find best position for object split for (uint32_t i = 0; i < BVHBINS - 1; i++) @@ -1982,7 +1982,7 @@ void BVH::BuildHQ() uint32_t nextFrag = triCount; // subdivide recursively BVHNode& root = bvhNode[0]; - const float rootArea = tinybvh_halfAreaf(root.aabbMax - root.aabbMin); + const float rootArea = tinybvh_half_area( root.aabbMax - root.aabbMin ); struct Task { uint32_t node, sliceStart, sliceEnd, dummy; }; ALIGNED( 64 ) Task task[1024]; uint32_t taskCount = 0, nodeIdx = 0, sliceStart = 0, sliceEnd = triCount + slack; @@ -2028,8 +2028,8 @@ void BVH::BuildHQ() lBMax[i] = l2 = tinybvh_max( l2, binMax[a][i] ); rBMax[HQBVHBINS - 2 - i] = r2 = tinybvh_max( r2, binMax[a][HQBVHBINS - 1 - i] ); lN += count[a][i], rN += count[a][HQBVHBINS - 1 - i]; - ANL[i] = lN == 0 ? BVH_FAR : (tinybvh_halfAreaf(l2 - l1) * (float)lN); - ANR[HQBVHBINS - 2 - i] = rN == 0 ? BVH_FAR : (tinybvh_halfAreaf(r2 - r1) * (float)rN); + ANL[i] = lN == 0 ? BVH_FAR : (tinybvh_half_area( l2 - l1 ) * (float)lN); + ANR[HQBVHBINS - 2 - i] = rN == 0 ? BVH_FAR : (tinybvh_half_area( r2 - r1 ) * (float)rN); } // evaluate bin totals to find best position for object split for (uint32_t i = 0; i < HQBVHBINS - 1; i++) @@ -2044,7 +2044,7 @@ void BVH::BuildHQ() bool spatial = false; uint32_t NL[HQBVHBINS - 1], NR[HQBVHBINS - 1], budget = sliceEnd - sliceStart, bestNL = 0, bestNR = 0; bvhvec3 spatialUnion = bestLMax - bestRMin; - float spatialOverlap = (tinybvh_halfAreaf(spatialUnion)) / rootArea; + float spatialOverlap = (tinybvh_half_area( spatialUnion )) / rootArea; if (budget > node.triCount && splitCost < 1e30f && spatialOverlap > 1e-5f) { for (uint32_t a = 0; a < 3; a++) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim[a]) @@ -2087,8 +2087,8 @@ void BVH::BuildHQ() lBMin[i] = l1 = tinybvh_min( l1, binaMin[i] ), rBMin[HQBVHBINS - 2 - i] = r1 = tinybvh_min( r1, binaMin[HQBVHBINS - 1 - i] ); lBMax[i] = l2 = tinybvh_max( l2, binaMax[i] ), rBMax[HQBVHBINS - 2 - i] = r2 = tinybvh_max( r2, binaMax[HQBVHBINS - 1 - i] ); lN += countIn[i], rN += countOut[HQBVHBINS - 1 - i], NL[i] = lN, NR[HQBVHBINS - 2 - i] = rN; - ANL[i] = lN == 0 ? BVH_FAR : (tinybvh_halfAreaf(l2 - l1) * (float)lN); - ANR[HQBVHBINS - 2 - i] = rN == 0 ? BVH_FAR : (tinybvh_halfAreaf(r2 - r1) * (float)rN); + ANL[i] = lN == 0 ? BVH_FAR : (tinybvh_half_area( l2 - l1 ) * (float)lN); + ANR[HQBVHBINS - 2 - i] = rN == 0 ? BVH_FAR : (tinybvh_half_area( r2 - r1 ) * (float)rN); } // find best position for spatial split for (uint32_t i = 0; i < HQBVHBINS - 1; i++) @@ -2132,8 +2132,8 @@ void BVH::BuildHQ() { bvhvec3 unsplitLMin = tinybvh_min( bestLMin, fragment[fragIdx].bmin ); bvhvec3 unsplitLMax = tinybvh_max( bestLMax, fragment[fragIdx].bmax ); - float AL = tinybvh_halfAreaf(unsplitLMax - unsplitLMin); - float AR = tinybvh_halfAreaf(bestRMax - bestRMin); + float AL = tinybvh_half_area( unsplitLMax - unsplitLMin ); + float AR = tinybvh_half_area( bestRMax - bestRMin ); float CunsplitLeft = c_trav + c_int * rSAV * (AL * bestNL + AR * (bestNR - 1)); if (CunsplitLeft < splitCost) { @@ -2147,8 +2147,8 @@ void BVH::BuildHQ() { const bvhvec3 unsplitRMin = tinybvh_min( bestRMin, fragment[fragIdx].bmin ); const bvhvec3 unsplitRMax = tinybvh_max( bestRMax, fragment[fragIdx].bmax ); - const float AL = tinybvh_halfAreaf(bestLMax - bestLMin); - const float AR = tinybvh_halfAreaf(unsplitRMax - unsplitRMin); + const float AL = tinybvh_half_area( bestLMax - bestLMin ); + const float AR = tinybvh_half_area( unsplitRMax - unsplitRMin ); const float CunsplitRight = c_trav + c_int * rSAV * (AL * (bestNL - 1) + AR * bestNR); if (CunsplitRight < splitCost) { @@ -7100,8 +7100,8 @@ void BVH_Double::Build() lBMax[i] = l2 = tinybvh_max( l2, binMax[a][i] ); rBMax[BVHBINS - 2 - i] = r2 = tinybvh_max( r2, binMax[a][BVHBINS - 1 - i] ); lN += count[a][i], rN += count[a][BVHBINS - 1 - i]; - ANL[i] = lN == 0 ? BVH_DBL_FAR : (tinybvh_halfAread(l2 - l1) * (double)lN); - ANR[BVHBINS - 2 - i] = rN == 0 ? BVH_DBL_FAR : (tinybvh_halfAread(r2 - r1) * (double)rN); + ANL[i] = lN == 0 ? BVH_DBL_FAR : (tinybvh_half_area( l2 - l1 ) * (double)lN); + ANR[BVHBINS - 2 - i] = rN == 0 ? BVH_DBL_FAR : (tinybvh_half_area( r2 - r1 ) * (double)rN); } // evaluate bin totals to find best position for object split for (uint32_t i = 0; i < BVHBINS - 1; i++) diff --git a/tiny_bvh_speedtest.cpp b/tiny_bvh_speedtest.cpp index fb973ba..0c85e48 100644 --- a/tiny_bvh_speedtest.cpp +++ b/tiny_bvh_speedtest.cpp @@ -210,9 +210,9 @@ float TestShadowRays( uint32_t layout, unsigned N, unsigned passes ) #endif case _GPU2: for (unsigned i = 0; i < N; i++) occluded += bvh_gpu->IsOccluded( batch[i] ); break; case _CPU4: for (unsigned i = 0; i < N; i++) occluded += bvh4_cpu->IsOccluded( batch[i] ); break; - #ifdef BVH_USEAVX2 + #ifdef BVH_USEAVX2 case _CPU8: for (unsigned i = 0; i < N; i++) occluded += bvh8_cpu->IsOccluded( batch[i] ); break; - #endif + #endif default: break; } } @@ -736,7 +736,7 @@ int main() #ifdef TRAVERSE_CWBVH #ifdef BVH_USEAVX - + // CWBVH - Not efficient on CPU. if (!cwbvh) { @@ -747,7 +747,7 @@ int main() traceTime = TestPrimaryRays( _CWBVH, Nsmall, 3 ); ValidateTraceResult( refDist, Nsmall, __LINE__ ); printf( "%4.2fM rays in %5.1fms (%7.2fMRays/s)\n", (float)Nsmall * 1e-6f, traceTime * 1000, (float)Nsmall / traceTime * 1e-6f ); - + #endif #endif @@ -771,7 +771,7 @@ int main() #ifdef TRAVERSE_OPTIMIZED_ST #ifdef BVH_USEAVX - + // ALT_SOA delete bvh_soa; // Building a BVH_SoA over an optimized BVH: Careful, do not delete the @@ -784,7 +784,7 @@ int main() printf( "%4.2fM rays in %5.1fms (%7.2fMRays/s), ", (float)Nsmall * 1e-6f, traceTime * 1000, (float)Nsmall / traceTime * 1e-6f ); traceTime = TestShadowRays( _SOA, Nsmall, 3 ); printf( "shadow: %5.1fms (%7.2fMRays/s)\n", traceTime * 1000, (float)Nsmall / traceTime * 1e-6f ); - + #endif #endif diff --git a/tiny_ocl.h b/tiny_ocl.h index 37456b0..6e53b3c 100644 --- a/tiny_ocl.h +++ b/tiny_ocl.h @@ -964,7 +964,7 @@ bool Kernel::InitCL() free( extensions ); string mustHave[] = { #if defined(__APPLE__) && defined(__MACH__) - "cl_APPLE_gl_sharing", + "cl_APPLE_gl_sharing", #else "cl_khr_gl_sharing", #endif @@ -1073,9 +1073,9 @@ bool Kernel::InitCL() isIntel = true; } else if (strstr( d, "apple" )) - { - isApple = true; - } + { + isApple = true; + } else { isOther = true; @@ -1102,22 +1102,22 @@ bool Kernel::InitCL() printf( "Intel.\n" ); } else if (isApple) - { - printf( "Apple.\n" ); - } + { + printf( "Apple.\n" ); + } else { printf( "identification failed.\n" ); } // create a command-queue #if defined(__APPLE__) && defined(__MACH__) - // Cannot find symbol for _clCreateCommandQueueWithProperties on APPLE - cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE; - queue = clCreateCommandQueue( context, devices[deviceUsed], props, &error ); - if (!CHECKCL( error )) return false; - // create a second command queue for asynchronous copies - queue2 = clCreateCommandQueue( context, devices[deviceUsed], props, &error ); - if (!CHECKCL( error )) return false; + // Cannot find symbol for _clCreateCommandQueueWithProperties on APPLE + cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE; + queue = clCreateCommandQueue( context, devices[deviceUsed], props, &error ); + if (!CHECKCL( error )) return false; + // create a second command queue for asynchronous copies + queue2 = clCreateCommandQueue( context, devices[deviceUsed], props, &error ); + if (!CHECKCL( error )) return false; #else cl_queue_properties props[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0 }; queue = clCreateCommandQueueWithProperties( context, devices[deviceUsed], props, &error ); From 95fbc2e4cd36b83ad66eeb832ddf4bbe7d5d00cb Mon Sep 17 00:00:00 2001 From: jbikker Date: Mon, 3 Mar 2025 09:23:29 +0100 Subject: [PATCH 7/7] Version bump: 1.4.2. --- README.md | 6 ++---- tiny_bvh.h | 4 ++-- tiny_ocl.h | 3 ++- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 5570cb4..9f7b568 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,3 @@ -# dev -This is the **development branch** for tinybvh. - # tinybvh Single-header BVH construction and traversal library written as "Sane C++" (or "C with classes"). Some C++11 is used, e.g. for threading. The library has no dependencies. @@ -68,7 +65,7 @@ The **performance measurement tool** can be compiled with: ````g++ -std=c++20 -mavx2 -mfma -Ofast tiny_bvh_speedtest.cpp -o tiny_bvh_speedtest```` -# Version 1.4.1 +# Version 1.4.2 Version 1.4.0 introduces a new BVH layout for fast single-ray traversal on CPU: BVH8_CPU. This supersedes the previous fastest scheme, BVH4_CPU. @@ -128,6 +125,7 @@ This version of the library includes the following functionality: * Fast AVX2 ray tracing: Implements the 2017 paper by [Fuetterling et al.](https://web.cs.ucdavis.edu/~hamann/FuetterlingLojewskiPfreundtHamannEbertHPG2017PaperFinal06222017.pdf) * Fast triangle intersection: Implements the 2016 paper by [Baldwin & Weber](https://jcgt.org/published/0005/03/03/paper.pdf) * OpenCL traversal example code: Aila & Laine, 4-way quantized, CWBVH +* OpenCL support for MacOS, by [wuyakuma](https://github.com/wuyakuma) * Support for WASM / EMSCRIPTEN, g++, clang, Visual Studio * Optional user-defined memory allocation, by [Thierry Cantenot](https://github.com/tcantenot) * Vertex array can now have a custom stride, by [David Peicho](https://github.com/DavidPeicho) diff --git a/tiny_bvh.h b/tiny_bvh.h index f0300f9..9e64721 100644 --- a/tiny_bvh.h +++ b/tiny_bvh.h @@ -1,7 +1,7 @@ /* The MIT License (MIT) -Copyright (c) 2024, Jacco Bikker / Breda University of Applied Sciences. +Copyright (c) 2024-2025, Jacco Bikker / Breda University of Applied Sciences. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -170,7 +170,7 @@ THE SOFTWARE. // library version #define TINY_BVH_VERSION_MAJOR 1 #define TINY_BVH_VERSION_MINOR 4 -#define TINY_BVH_VERSION_SUB 1 +#define TINY_BVH_VERSION_SUB 2 // ============================================================================ // diff --git a/tiny_ocl.h b/tiny_ocl.h index 6e53b3c..f2f2e16 100644 --- a/tiny_ocl.h +++ b/tiny_ocl.h @@ -1,7 +1,7 @@ /* The MIT License (MIT) -Copyright (c) 2024, Jacco Bikker / Breda University of Applied Sciences. +Copyright (c) 2024-2025, Jacco Bikker / Breda University of Applied Sciences. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -22,6 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +// Mar 03, '25: version 0.2.0 : MacOS support, by wuyakuma // Nov 18, '24: version 0.1.1 : Added custom alloc/free. // Nov 15, '24: version 0.1.0 : Accidentally started another tiny lib.