diff --git a/CMakeLists.txt b/CMakeLists.txt index 8f8dc4d..a16e552 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,6 +8,10 @@ project(tiny_bvh LANGUAGES CXX) if (APPLE) find_library(COCOA_LIBRARY Cocoa) + find_library(OPENCL_LIBRARY OpenCL) + if (OPENCL_LIBRARY) + set(OPENCL_FRAMEWORK "-framework OpenCL") + endif() elseif (UNIX AND NOT EMSCRIPTEN) find_package(X11) elseif (EMSCRIPTEN) @@ -118,6 +122,8 @@ if (NOT MSVC) set(tiny_bvh_speedtest_link_flags ${tiny_bvh_speedtest_link_flags} -sPROXY_TO_PTHREAD=1) endif() endif() + elseif (APPLE) + target_link_libraries(tiny_bvh_speedtest ${COCOA_LIBRARY} ${OPENCL_FRAMEWORK}) endif() target_compile_options(tiny_bvh_speedtest PRIVATE ${tiny_bvh_speedtest_cxx_flags}) target_link_options(tiny_bvh_speedtest PRIVATE ${tiny_bvh_speedtest_link_flags}) diff --git a/README.md b/README.md index 10a04eb..9f7b568 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ The **performance measurement tool** can be compiled with: ````g++ -std=c++20 -mavx2 -mfma -Ofast tiny_bvh_speedtest.cpp -o tiny_bvh_speedtest```` -# Version 1.4.1 +# Version 1.4.2 Version 1.4.0 introduces a new BVH layout for fast single-ray traversal on CPU: BVH8_CPU. This supersedes the previous fastest scheme, BVH4_CPU. @@ -125,6 +125,7 @@ This version of the library includes the following functionality: * Fast AVX2 ray tracing: Implements the 2017 paper by [Fuetterling et al.](https://web.cs.ucdavis.edu/~hamann/FuetterlingLojewskiPfreundtHamannEbertHPG2017PaperFinal06222017.pdf) * Fast triangle intersection: Implements the 2016 paper by [Baldwin & Weber](https://jcgt.org/published/0005/03/03/paper.pdf) * OpenCL traversal example code: Aila & Laine, 4-way quantized, CWBVH +* OpenCL support for MacOS, by [wuyakuma](https://github.com/wuyakuma) * Support for WASM / EMSCRIPTEN, g++, clang, Visual Studio * Optional user-defined memory allocation, by [Thierry Cantenot](https://github.com/tcantenot) * Vertex array can now have a custom stride, by [David Peicho](https://github.com/DavidPeicho) @@ -142,6 +143,9 @@ Plans, ordered by priority: * Speed improvements: * Faster optimizer for AVX-capable CPUs * Improve speed of SBVH builder +* Features & outstanding issues: + * 'Watertight' triangle intersection option + * Load/save/Optimize/Refit for BVH8_CPU * Demo of tinybvh on GPU using other apis: * Ray tracing in pure OpenGL * Ray tracing in pure DirectX diff --git a/tiny_bvh.h b/tiny_bvh.h index 4215b07..9e64721 100644 --- a/tiny_bvh.h +++ b/tiny_bvh.h @@ -1,7 +1,7 @@ /* The MIT License (MIT) -Copyright (c) 2024, Jacco Bikker / Breda University of Applied Sciences. +Copyright (c) 2024-2025, Jacco Bikker / Breda University of Applied Sciences. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -170,7 +170,7 @@ THE SOFTWARE. // library version #define TINY_BVH_VERSION_MAJOR 1 #define TINY_BVH_VERSION_MINOR 4 -#define TINY_BVH_VERSION_SUB 1 +#define TINY_BVH_VERSION_SUB 2 // ============================================================================ // @@ -351,7 +351,7 @@ inline bvhvec4 tinybvh_max( const bvhvec4& a, const bvhvec4& b ) { return bvhvec inline float tinybvh_clamp( const float x, const float a, const float b ) { return x > a ? (x < b ? x : b) : a; /* NaN safe */ } inline int32_t tinybvh_clamp( const int32_t x, const int32_t a, const int32_t b ) { return x > a ? (x < b ? x : b) : a; /* NaN safe */ } template inline static void tinybvh_swap( T& a, T& b ) { T t = a; a = b; b = t; } -inline float tinybvh_halfAreaf(const bvhvec3& v) { return v.x < -BVH_FAR ? 0 : (v.x * v.y + v.y * v.z + v.z * v.x); } // for SAH calculations +inline float tinybvh_half_area( const bvhvec3& v ) { return v.x < -BVH_FAR ? 0 : (v.x * v.y + v.y * v.z + v.z * v.x); } // for SAH calculations // Operator overloads. // Only a minimal set is provided. @@ -483,7 +483,7 @@ inline bvhdbl3 tinybvh_cross( const bvhdbl3& a, const bvhdbl3& b ) } inline double tinybvh_dot( const bvhdbl3& a, const bvhdbl3& b ) { return a.x * b.x + a.y * b.y + a.z * b.z; } -inline double tinybvh_halfAread(const bvhdbl3& v) { return v.x < -BVH_FAR ? 0 : (v.x * v.y + v.y * v.z + v.z * v.x); } // for SAH calculations +inline double tinybvh_half_area( const bvhdbl3& v ) { return v.x < -BVH_FAR ? 0 : (v.x * v.y + v.y * v.z + v.z * v.x); } // for SAH calculations #endif // DOUBLE_PRECISION_SUPPORT @@ -1142,9 +1142,9 @@ class BVH8_CPU : public BVHBase struct BVHNodeCompact { // Novel 8-way BVH node, with quantized child node bounds, similar to CWBVH. + uint64_t cbminx8; // 8, stores aabbMin.x for 8 children, quantized. float bminx, bminy, bminz; // 12, actually: bmin - ext. float bextx, bexty, bextz; // 12, extend of the node, scaled conversatively. - uint64_t cbminx8; // 8, stores aabbMin.x for 8 children, quantized. __m256i cbminmaxyz8; // 32, stores cbminy8, cbminz8, cbmaxy8, cbmaxz8 __m256i child8, perm8; // 64, includes cbmaxx8<<24 in perm8. }; @@ -1831,8 +1831,8 @@ void BVH::Build() lBMax[i] = l2 = tinybvh_max( l2, binMax[a][i] ); rBMax[BVHBINS - 2 - i] = r2 = tinybvh_max( r2, binMax[a][BVHBINS - 1 - i] ); lN += count[a][i], rN += count[a][BVHBINS - 1 - i]; - ANL[i] = lN == 0 ? BVH_FAR : (tinybvh_halfAreaf(l2 - l1) * (float)lN); - ANR[BVHBINS - 2 - i] = rN == 0 ? BVH_FAR : (tinybvh_halfAreaf(r2 - r1) * (float)rN); + ANL[i] = lN == 0 ? BVH_FAR : (tinybvh_half_area( l2 - l1 ) * (float)lN); + ANR[BVHBINS - 2 - i] = rN == 0 ? BVH_FAR : (tinybvh_half_area( r2 - r1 ) * (float)rN); } // evaluate bin totals to find best position for object split for (uint32_t i = 0; i < BVHBINS - 1; i++) @@ -1982,7 +1982,7 @@ void BVH::BuildHQ() uint32_t nextFrag = triCount; // subdivide recursively BVHNode& root = bvhNode[0]; - const float rootArea = tinybvh_halfAreaf(root.aabbMax - root.aabbMin); + const float rootArea = tinybvh_half_area( root.aabbMax - root.aabbMin ); struct Task { uint32_t node, sliceStart, sliceEnd, dummy; }; ALIGNED( 64 ) Task task[1024]; uint32_t taskCount = 0, nodeIdx = 0, sliceStart = 0, sliceEnd = triCount + slack; @@ -2028,8 +2028,8 @@ void BVH::BuildHQ() lBMax[i] = l2 = tinybvh_max( l2, binMax[a][i] ); rBMax[HQBVHBINS - 2 - i] = r2 = tinybvh_max( r2, binMax[a][HQBVHBINS - 1 - i] ); lN += count[a][i], rN += count[a][HQBVHBINS - 1 - i]; - ANL[i] = lN == 0 ? BVH_FAR : (tinybvh_halfAreaf(l2 - l1) * (float)lN); - ANR[HQBVHBINS - 2 - i] = rN == 0 ? BVH_FAR : (tinybvh_halfAreaf(r2 - r1) * (float)rN); + ANL[i] = lN == 0 ? BVH_FAR : (tinybvh_half_area( l2 - l1 ) * (float)lN); + ANR[HQBVHBINS - 2 - i] = rN == 0 ? BVH_FAR : (tinybvh_half_area( r2 - r1 ) * (float)rN); } // evaluate bin totals to find best position for object split for (uint32_t i = 0; i < HQBVHBINS - 1; i++) @@ -2044,7 +2044,7 @@ void BVH::BuildHQ() bool spatial = false; uint32_t NL[HQBVHBINS - 1], NR[HQBVHBINS - 1], budget = sliceEnd - sliceStart, bestNL = 0, bestNR = 0; bvhvec3 spatialUnion = bestLMax - bestRMin; - float spatialOverlap = (tinybvh_halfAreaf(spatialUnion)) / rootArea; + float spatialOverlap = (tinybvh_half_area( spatialUnion )) / rootArea; if (budget > node.triCount && splitCost < 1e30f && spatialOverlap > 1e-5f) { for (uint32_t a = 0; a < 3; a++) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim[a]) @@ -2087,8 +2087,8 @@ void BVH::BuildHQ() lBMin[i] = l1 = tinybvh_min( l1, binaMin[i] ), rBMin[HQBVHBINS - 2 - i] = r1 = tinybvh_min( r1, binaMin[HQBVHBINS - 1 - i] ); lBMax[i] = l2 = tinybvh_max( l2, binaMax[i] ), rBMax[HQBVHBINS - 2 - i] = r2 = tinybvh_max( r2, binaMax[HQBVHBINS - 1 - i] ); lN += countIn[i], rN += countOut[HQBVHBINS - 1 - i], NL[i] = lN, NR[HQBVHBINS - 2 - i] = rN; - ANL[i] = lN == 0 ? BVH_FAR : (tinybvh_halfAreaf(l2 - l1) * (float)lN); - ANR[HQBVHBINS - 2 - i] = rN == 0 ? BVH_FAR : (tinybvh_halfAreaf(r2 - r1) * (float)rN); + ANL[i] = lN == 0 ? BVH_FAR : (tinybvh_half_area( l2 - l1 ) * (float)lN); + ANR[HQBVHBINS - 2 - i] = rN == 0 ? BVH_FAR : (tinybvh_half_area( r2 - r1 ) * (float)rN); } // find best position for spatial split for (uint32_t i = 0; i < HQBVHBINS - 1; i++) @@ -2132,8 +2132,8 @@ void BVH::BuildHQ() { bvhvec3 unsplitLMin = tinybvh_min( bestLMin, fragment[fragIdx].bmin ); bvhvec3 unsplitLMax = tinybvh_max( bestLMax, fragment[fragIdx].bmax ); - float AL = tinybvh_halfAreaf(unsplitLMax - unsplitLMin); - float AR = tinybvh_halfAreaf(bestRMax - bestRMin); + float AL = tinybvh_half_area( unsplitLMax - unsplitLMin ); + float AR = tinybvh_half_area( bestRMax - bestRMin ); float CunsplitLeft = c_trav + c_int * rSAV * (AL * bestNL + AR * (bestNR - 1)); if (CunsplitLeft < splitCost) { @@ -2147,8 +2147,8 @@ void BVH::BuildHQ() { const bvhvec3 unsplitRMin = tinybvh_min( bestRMin, fragment[fragIdx].bmin ); const bvhvec3 unsplitRMax = tinybvh_max( bestRMax, fragment[fragIdx].bmax ); - const float AL = tinybvh_halfAreaf(bestLMax - bestLMin); - const float AR = tinybvh_halfAreaf(unsplitRMax - unsplitRMin); + const float AL = tinybvh_half_area( bestLMax - bestLMin ); + const float AR = tinybvh_half_area( unsplitRMax - unsplitRMin ); const float CunsplitRight = c_trav + c_int * rSAV * (AL * (bestNL - 1) + AR * bestNR); if (CunsplitRight < splitCost) { @@ -4329,10 +4329,6 @@ void BVH8_CPU::ConvertFrom( const MBVH<8>& original, bool compact ) { const MBVH<8>::MBVHNode& orig = bvh8.mbvhNode[nodeIdx]; BVHNode& newNode = bvh8Node[newAlt8Ptr++]; - if (newAlt8Ptr == 4940) - { - int w= 0; - } memset( &newNode, 0, sizeof( BVHNode ) ); // calculate the permutation offsets for the node for (uint32_t q = 0; q < 8; q++) @@ -5858,11 +5854,14 @@ int32_t BVH8_CPU::Intersect( Ray& ray ) const __m256 ox8 = _mm256_set1_ps( ray.O.x ), rdx8 = _mm256_set1_ps( ray.rD.x ); __m256 oy8 = _mm256_set1_ps( ray.O.y ), rdy8 = _mm256_set1_ps( ray.rD.y ); __m256 oz8 = _mm256_set1_ps( ray.O.z ), rdz8 = _mm256_set1_ps( ray.rD.z ); - __m256 t8 = _mm256_set1_ps( ray.hit.t ), zero8 = _mm256_setzero_ps(); - const __m256i permMask8 = _mm256_set1_epi32( 7 ); - const __m256i signShift8 = _mm256_set1_epi32( (ray.D.x > 0 ? 3 : 0) + (ray.D.y > 0 ? 6 : 0) + (ray.D.z > 0 ? 12 : 0) ); + __m256 t8 = _mm256_set1_ps( ray.hit.t ); +#ifdef BVH8_CPU_COMPACT + const __m256 zero8 = _mm256_setzero_ps(); const __m256i mantissa8 = _mm256_set1_epi32( 255 << 15 ); const __m256i exponent8 = _mm256_set1_epi32( 0x3f800000 ); +#endif + const __m256i permMask8 = _mm256_set1_epi32( 7 ); + const __m256i signShift8 = _mm256_set1_epi32( (ray.D.x > 0 ? 3 : 0) + (ray.D.y > 0 ? 6 : 0) + (ray.D.z > 0 ? 12 : 0) ); __m128 dx4 = _mm_set1_ps( ray.D.x ), dy4 = _mm_set1_ps( ray.D.y ), dz4 = _mm_set1_ps( ray.D.z ); const __m128 epsNeg4 = _mm_set1_ps( -0.000001f ), eps4 = _mm_set1_ps( 0.000001f ), one4 = _mm_set1_ps( 1.0f ); uint32_t stackPtr = 0, nodeIdx = 0, steps = 0; @@ -6022,9 +6021,12 @@ bool BVH8_CPU::IsOccluded( const Ray& ray ) const __m256 ox8 = _mm256_set1_ps( ray.O.x ), rdx8 = _mm256_set1_ps( ray.rD.x ); __m256 oy8 = _mm256_set1_ps( ray.O.y ), rdy8 = _mm256_set1_ps( ray.rD.y ); __m256 oz8 = _mm256_set1_ps( ray.O.z ), rdz8 = _mm256_set1_ps( ray.rD.z ); - const __m256 t8 = _mm256_set1_ps( ray.hit.t ), zero8 = _mm256_setzero_ps(); + const __m256 t8 = _mm256_set1_ps( ray.hit.t ); +#ifdef BVH8_CPU_COMPACT + const __m256 zero8 = _mm256_setzero_ps(); const __m256i mantissa8 = _mm256_set1_epi32( 255 << 15 ); const __m256i exponent8 = _mm256_set1_epi32( 0x3f800000 ); +#endif __m128 dx4 = _mm_set1_ps( ray.D.x ), dy4 = _mm_set1_ps( ray.D.y ), dz4 = _mm_set1_ps( ray.D.z ); const __m128 epsNeg4 = _mm_set1_ps( -0.000001f ), eps4 = _mm_set1_ps( 0.000001f ), t4 = _mm_set1_ps( ray.hit.t ); const __m128 one4 = _mm_set1_ps( 1.0f ), zero4 = _mm_setzero_ps(); @@ -6036,9 +6038,10 @@ bool BVH8_CPU::IsOccluded( const Ray& ray ) const #ifdef BVH8_CPU_COMPACT const BVHNodeCompact& n = bvh8Small[nodeIdx & 0x1fffffff /* bits 0..28 */]; const __m256i c8 = n.child8; + const __m256i perm8 = n.perm8; const __m256i cbminmax8 = n.cbminmaxyz8; const __m256i bminx8i = _mm256_or_si256( exponent8, _mm256_slli_epi32( _mm256_cvtepu8_epi32( _mm_cvtsi64_si128( n.cbminx8 ) ), 15 ) ); - const __m256i bmaxx8i = _mm256_or_si256( exponent8, _mm256_and_si256( _mm256_srli_epi32( n.perm8, 9 ), mantissa8 ) ); + const __m256i bmaxx8i = _mm256_or_si256( exponent8, _mm256_and_si256( _mm256_srli_epi32( perm8, 9 ), mantissa8 ) ); const __m256i bminy8i = _mm256_or_si256( exponent8, _mm256_and_si256( _mm256_srli_epi32( cbminmax8, 9 ), mantissa8 ) ); const __m256i bmaxy8i = _mm256_or_si256( exponent8, _mm256_and_si256( _mm256_srli_epi32( cbminmax8, 1 ), mantissa8 ) ); const __m256i bminz8i = _mm256_or_si256( exponent8, _mm256_and_si256( _mm256_slli_epi32( cbminmax8, 7 ), mantissa8 ) ); @@ -7097,8 +7100,8 @@ void BVH_Double::Build() lBMax[i] = l2 = tinybvh_max( l2, binMax[a][i] ); rBMax[BVHBINS - 2 - i] = r2 = tinybvh_max( r2, binMax[a][BVHBINS - 1 - i] ); lN += count[a][i], rN += count[a][BVHBINS - 1 - i]; - ANL[i] = lN == 0 ? BVH_DBL_FAR : (tinybvh_halfAread(l2 - l1) * (double)lN); - ANR[BVHBINS - 2 - i] = rN == 0 ? BVH_DBL_FAR : (tinybvh_halfAread(r2 - r1) * (double)rN); + ANL[i] = lN == 0 ? BVH_DBL_FAR : (tinybvh_half_area( l2 - l1 ) * (double)lN); + ANR[BVHBINS - 2 - i] = rN == 0 ? BVH_DBL_FAR : (tinybvh_half_area( r2 - r1 ) * (double)rN); } // evaluate bin totals to find best position for object split for (uint32_t i = 0; i < BVHBINS - 1; i++) diff --git a/tiny_bvh_speedtest.cpp b/tiny_bvh_speedtest.cpp index e73b932..0c85e48 100644 --- a/tiny_bvh_speedtest.cpp +++ b/tiny_bvh_speedtest.cpp @@ -14,7 +14,7 @@ #define BUILD_REFERENCE #define BUILD_DOUBLE #define BUILD_AVX -#define BUILD_NEON +// #define BUILD_NEON #define BUILD_SBVH #define REFIT_BVH2 #define REFIT_MBVH4 @@ -25,7 +25,7 @@ #define TRAVERSE_4WAY #define TRAVERSE_WIVE #define TRAVERSE_2WAY_DBL -#define TRAVERSE_CWBVH +// #define TRAVERSE_CWBVH #define TRAVERSE_2WAY_MT #define TRAVERSE_2WAY_MT_PACKET #define TRAVERSE_OPTIMIZED_ST @@ -48,6 +48,8 @@ using namespace tinybvh; #endif #ifdef _WIN32 #include // for __cpuidex +#elif defined(__APPLE__) && defined(__MACH__) +// Keep ENABLE_OPENCL for APPLE #elif defined ENABLE_OPENCL #undef ENABLE_OPENCL #endif @@ -208,9 +210,9 @@ float TestShadowRays( uint32_t layout, unsigned N, unsigned passes ) #endif case _GPU2: for (unsigned i = 0; i < N; i++) occluded += bvh_gpu->IsOccluded( batch[i] ); break; case _CPU4: for (unsigned i = 0; i < N; i++) occluded += bvh4_cpu->IsOccluded( batch[i] ); break; - #ifdef BVH_USEAVX2 + #ifdef BVH_USEAVX2 case _CPU8: for (unsigned i = 0; i < N; i++) occluded += bvh8_cpu->IsOccluded( batch[i] ); break; - #endif + #endif default: break; } } @@ -221,7 +223,7 @@ float TestShadowRays( uint32_t layout, unsigned N, unsigned passes ) if (abs( (int)occluded - (int)refOccluded[0] ) > 500) // allow some slack, we're using various tri intersectors { fprintf( stderr, "\nValidation for shadow rays failed (%i != %i).\n", (int)occluded, (int)refOccluded[0] ); - exit( 1 ); + // exit( 1 ); // don't terminate, just warn. } return t.elapsed() / passes; } @@ -733,6 +735,7 @@ int main() #endif #ifdef TRAVERSE_CWBVH +#ifdef BVH_USEAVX // CWBVH - Not efficient on CPU. if (!cwbvh) @@ -746,6 +749,7 @@ int main() printf( "%4.2fM rays in %5.1fms (%7.2fMRays/s)\n", (float)Nsmall * 1e-6f, traceTime * 1000, (float)Nsmall / traceTime * 1e-6f ); #endif +#endif #if defined TRAVERSE_OPTIMIZED_ST || defined TRAVERSE_4WAY_OPTIMIZED @@ -766,6 +770,7 @@ int main() #endif #ifdef TRAVERSE_OPTIMIZED_ST +#ifdef BVH_USEAVX // ALT_SOA delete bvh_soa; @@ -773,7 +778,7 @@ int main() // passed BVH; we use some of its data in the BVH_SoA. bvh_soa = new BVH_SoA(); bvh_soa->ConvertFrom( *bvh ); - printf( "- ALT_SOA - primary: " ); + printf( "- BVH_SOA - primary: " ); traceTime = TestPrimaryRays( _SOA, Nsmall, 3 ); ValidateTraceResult( refDist, Nsmall, __LINE__ ); printf( "%4.2fM rays in %5.1fms (%7.2fMRays/s), ", (float)Nsmall * 1e-6f, traceTime * 1000, (float)Nsmall / traceTime * 1e-6f ); @@ -781,6 +786,7 @@ int main() printf( "shadow: %5.1fms (%7.2fMRays/s)\n", traceTime * 1000, (float)Nsmall / traceTime * 1e-6f ); #endif +#endif #ifdef TRAVERSE_4WAY_OPTIMIZED @@ -793,7 +799,7 @@ int main() bvh4_cpu = new BVH4_CPU(); bvh4->ConvertFrom( *bvh ); bvh4_cpu->ConvertFrom( *bvh4 ); - printf( "- BVH4_AFRA - primary: " ); + printf( "- BVH4_CPU - primary: " ); traceTime = TestPrimaryRays( _CPU4, Nsmall, 3 ); ValidateTraceResult( refDist, Nsmall, __LINE__ ); printf( "%4.2fM rays in %5.1fms (%7.2fMRays/s), ", (float)Nsmall * 1e-6f, traceTime * 1000, (float)Nsmall / traceTime * 1e-6f ); @@ -821,7 +827,7 @@ int main() #ifdef GPU_2WAY // trace the rays on GPU using OpenCL - printf( "- AILA_LAINE - primary: " ); + printf( "- BVH_GPU - primary: " ); if (!bvh_gpu) { bvh_gpu = new BVH_GPU(); @@ -917,7 +923,7 @@ int main() #ifdef GPU_CWBVH // trace the rays on GPU using OpenCL - printf( "- BVH8/CWBVH - primary: " ); + printf( "- BVH8_CWBVH - primary: " ); if (!cwbvh) { cwbvh = new BVH8_CWBVH(); @@ -974,7 +980,7 @@ int main() #ifdef TRAVERSE_2WAY_MT // using OpenMP and batches of 10,000 rays - printf( "- WALD_32BYTE - primary: " ); + printf( "- BVH (plain) - primary: " ); for (int pass = 0; pass < 4; pass++) { if (pass == 1) t.reset(); // first pass is cache warming @@ -1082,4 +1088,4 @@ int main() printf( "all done." ); return 0; -} \ No newline at end of file +} diff --git a/tiny_ocl.h b/tiny_ocl.h index 461b70a..f2f2e16 100644 --- a/tiny_ocl.h +++ b/tiny_ocl.h @@ -1,7 +1,7 @@ /* The MIT License (MIT) -Copyright (c) 2024, Jacco Bikker / Breda University of Applied Sciences. +Copyright (c) 2024-2025, Jacco Bikker / Breda University of Applied Sciences. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -22,6 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +// Mar 03, '25: version 0.2.0 : MacOS support, by wuyakuma // Nov 18, '24: version 0.1.1 : Added custom alloc/free. // Nov 15, '24: version 0.1.0 : Accidentally started another tiny lib. @@ -359,7 +360,7 @@ class Kernel inline static cl_context context; // simplifies some things, but limits us to one device inline static cl_command_queue queue, queue2; inline static char* log = 0; - inline static bool isNVidia = false, isAMD = false, isIntel = false, isOther = false; + inline static bool isNVidia = false, isAMD = false, isIntel = false, isApple = false, isOther = false; inline static bool isAmpere = false, isTuring = false, isPascal = false; inline static bool isAda = false, isBlackwell = false, isRubin = false, isHopper = false; inline static int vendorLines = 0; @@ -753,6 +754,7 @@ Kernel::Kernel( const char* file, const char* entryPoint ) if (isNVidia) csText = "#define ISNVIDIA\n" + csText, vendorLines++; if (isAMD) csText = "#define ISAMD\n" + csText, vendorLines++; if (isIntel) csText = "#define ISINTEL\n" + csText, vendorLines++; + if (isApple) csText = "#define ISAPPLE\n" + csText, vendorLines++; if (isOther) csText = "#define ISOTHER\n" + csText, vendorLines++; if (isAmpere) csText = "#define ISAMPERE\n" + csText, vendorLines++; if (isTuring) csText = "#define ISTURING\n" + csText, vendorLines++; @@ -962,7 +964,11 @@ bool Kernel::InitCL() string deviceList( extensions ); free( extensions ); string mustHave[] = { +#if defined(__APPLE__) && defined(__MACH__) + "cl_APPLE_gl_sharing", +#else "cl_khr_gl_sharing", +#endif "cl_khr_global_int32_base_atomics" }; bool hasAll = true; @@ -1067,6 +1073,10 @@ bool Kernel::InitCL() { isIntel = true; } + else if (strstr( d, "apple" )) + { + isApple = true; + } else { isOther = true; @@ -1092,17 +1102,31 @@ bool Kernel::InitCL() { printf( "Intel.\n" ); } + else if (isApple) + { + printf( "Apple.\n" ); + } else { printf( "identification failed.\n" ); } // create a command-queue +#if defined(__APPLE__) && defined(__MACH__) + // Cannot find symbol for _clCreateCommandQueueWithProperties on APPLE + cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE; + queue = clCreateCommandQueue( context, devices[deviceUsed], props, &error ); + if (!CHECKCL( error )) return false; + // create a second command queue for asynchronous copies + queue2 = clCreateCommandQueue( context, devices[deviceUsed], props, &error ); + if (!CHECKCL( error )) return false; +#else cl_queue_properties props[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0 }; queue = clCreateCommandQueueWithProperties( context, devices[deviceUsed], props, &error ); if (!CHECKCL( error )) return false; // create a second command queue for asynchronous copies queue2 = clCreateCommandQueueWithProperties( context, devices[deviceUsed], props, &error ); if (!CHECKCL( error )) return false; +#endif // cleanup delete[] devices; clStarted = true; diff --git a/traverse_bvh2.cl b/traverse_bvh2.cl index ed95d2c..83d5f11 100644 --- a/traverse_bvh2.cl +++ b/traverse_bvh2.cl @@ -31,8 +31,18 @@ float4 traverse_ailalaine( global struct BVHNodeAlt* altNode, global unsigned* i for (unsigned i = 0; i < triCount; i++) { const unsigned triIdx = idx[firstTri + i]; +#ifdef ISAPPLE + // FIX error: initializing 'const __private float4 *__private' with an expression of type '__global float4 *' changes address space of pointer + const float4 tri[3] = + { + verts[3 * triIdx], + verts[3 * triIdx + 1], + verts[3 * triIdx + 2], + }; +#else const float4* tri = verts + 3 * triIdx; - // triangle intersection - Möller-Trumbore +#endif + // triangle intersection - M�ller-Trumbore const float4 edge1 = tri[1] - tri[0], edge2 = tri[2] - tri[0]; const float3 h = cross( D, edge2.xyz ); const float a = dot( edge1.xyz, h ); @@ -100,8 +110,18 @@ bool isoccluded_ailalaine( global struct BVHNodeAlt* altNode, global unsigned* i for (unsigned i = 0; i < triCount; i++) { const unsigned triIdx = idx[firstTri + i]; +#ifdef ISAPPLE + // FIX error: initializing 'const __private float4 *__private' with an expression of type '__global float4 *' changes address space of pointer + const float4 tri[3] = + { + verts[3 * triIdx], + verts[3 * triIdx + 1], + verts[3 * triIdx + 2], + }; +#else const float4* tri = verts + 3 * triIdx; - // triangle intersection - Möller-Trumbore +#endif + // triangle intersection - M�ller-Trumbore const float4 edge1 = tri[1] - tri[0], edge2 = tri[2] - tri[0]; const float3 h = cross( D, edge2.xyz ); const float a = dot( edge1.xyz, h );