From 0402288795276856d35828bcfdf9535b97c267e2 Mon Sep 17 00:00:00 2001 From: jbikker Date: Sun, 3 Nov 2024 20:06:55 +0100 Subject: [PATCH] New speed test tool added. --- tiny_bvh.h | 4 + tiny_bvh_fenster.cpp | 12 +- tiny_bvh_speedtest.cpp | 178 ++++++++++++++++++++++++++++++ tiny_bvh_test.sln | 10 ++ vcproj/tiny_bvh_speedtest.vcxproj | 144 ++++++++++++++++++++++++ 5 files changed, 344 insertions(+), 4 deletions(-) create mode 100644 tiny_bvh_speedtest.cpp create mode 100644 vcproj/tiny_bvh_speedtest.vcxproj diff --git a/tiny_bvh.h b/tiny_bvh.h index 93ccd2a..320547f 100644 --- a/tiny_bvh.h +++ b/tiny_bvh.h @@ -70,6 +70,10 @@ THE SOFTWARE. #define BVH_USEAVX #endif +// library version +#define TINY_BVH_VERSION_MAJOR 0 +#define TINY_BVH_VERSION_MINOR 2 + // ============================================================================ // // P R E L I M I N A R I E S diff --git a/tiny_bvh_fenster.cpp b/tiny_bvh_fenster.cpp index 7cd69c1..8658831 100644 --- a/tiny_bvh_fenster.cpp +++ b/tiny_bvh_fenster.cpp @@ -42,11 +42,13 @@ void Init() void Tick( uint32_t* buf ) { - // setup camera + // setup view pyramid for a pinhole camera: + // eye, p1 (top-left), p2 (top-right) and p3 (bottom-left) bvhvec3 eye( -3.5f, -1.5f, -6.5f ), view = normalize( bvhvec3( 3, 1.5f, 5 ) ); bvhvec3 right = normalize( cross( bvhvec3( 0, 1, 0 ), view ) ); bvhvec3 up = 0.8f * cross( view, right ), C = eye + 2 * view; bvhvec3 p1 = C - right + up, p2 = C + right + up, p3 = C - right - up; + // generate primary rays in a buffer int N = 0; Ray* rays = new Ray[SCRWIDTH * SCRHEIGHT * 16]; @@ -60,8 +62,10 @@ void Tick( uint32_t* buf ) rays[N++] = Ray( eye, normalize( P - eye ) ); } } + // trace primary rays for (int i = 0; i < N; i++) bvh.Intersect( rays[i] ); + // visualize result for (int i = 0, y = 0; y < SCRHEIGHT; y++) for (int x = 0; x < SCRWIDTH; x++) { @@ -69,9 +73,9 @@ void Tick( uint32_t* buf ) for (int s = 0; s < 16; s++, i++) if (rays[i].hit.t < 1000) { int primIdx = rays[i].hit.prim; - 
bvhvec3 v0 = *(bvhvec3*)&triangles[primIdx * 3 + 0]; - bvhvec3 v1 = *(bvhvec3*)&triangles[primIdx * 3 + 1]; - bvhvec3 v2 = *(bvhvec3*)&triangles[primIdx * 3 + 2]; + bvhvec3 v0 = triangles[primIdx * 3 + 0]; + bvhvec3 v1 = triangles[primIdx * 3 + 1]; + bvhvec3 v2 = triangles[primIdx * 3 + 2]; bvhvec3 N = normalize( cross( v1 - v0, v2 - v0 ) ); avg += fabs( dot( N, normalize( bvhvec3( 1, 2, 3 ) ) ) ); } diff --git a/tiny_bvh_speedtest.cpp b/tiny_bvh_speedtest.cpp new file mode 100644 index 0000000..cd5e75c --- /dev/null +++ b/tiny_bvh_speedtest.cpp @@ -0,0 +1,178 @@ +#define TINYBVH_IMPLEMENTATION +#include "tiny_bvh.h" +#ifdef _MSC_VER +#include "stdio.h" // for printf +#include "stdlib.h" // for rand +#else +#include +#endif + +// 'screen resolution': see tiny_bvh_fenster.cpp; this program traces the +// same rays, but without visualization - just performance statistics. +#define SCRWIDTH 800 +#define SCRHEIGHT 600 + +using namespace tinybvh; + +bvhvec4 triangles[259 /* level 3 */ * 6 * 2 * 49 * 3]{}; +int verts = 0; +BVH bvh; + +float uniform_rand() { return (float)rand() / (float)RAND_MAX; } + +#include <chrono> +struct Timer +{ + Timer() { reset(); } + float elapsed() const + { + auto t2 = std::chrono::high_resolution_clock::now(); + return std::chrono::duration_cast<std::chrono::duration<float>>(t2 - start).count(); + } + void reset() { start = std::chrono::high_resolution_clock::now(); } + std::chrono::high_resolution_clock::time_point start; +}; + +void sphere_flake( float x, float y, float z, float s, int d = 0 ) +{ + // procedural tessellated sphere flake object: a sphere of radius s around (x,y,z), plus six half-size children (+/-X, +/-Y, +/-Z) per level while d < 3 +#define P(F,a,b,c) p[i+F*64]={(float)a ,(float)b,(float)c} + bvhvec3 p[384], pos( x, y, z ), ofs( 3.5 ); + for (int i = 0, u = 0; u < 8; u++) for (int v = 0; v < 8; v++, i++) + P( 0, u, v, 0 ), P( 1, u, 0, v ), P( 2, 0, u, v ), + P( 3, u, v, 7 ), P( 4, u, 7, v ), P( 5, 7, u, v ); + for (int i = 0; i < 384; i++) p[i] = normalize( p[i] - ofs ) * s + pos; + for (int i = 0, side = 0; side < 6; side++, i += 8) + for (int u = 0; u < 7; u++, 
i++) for (int v = 0; v < 7; v++, i++) + triangles[verts++] = p[i], triangles[verts++] = p[i + 8], + triangles[verts++] = p[i + 1], triangles[verts++] = p[i + 1], + triangles[verts++] = p[i + 9], triangles[verts++] = p[i + 8]; + if (d < 3) sphere_flake( x + s * 1.55f, y, z, s * 0.5f, d + 1 ); + if (d < 3) sphere_flake( x - s * 1.5f, y, z, s * 0.5f, d + 1 ); + if (d < 3) sphere_flake( x, y + s * 1.5f, z, s * 0.5f, d + 1 ); + if (d < 3) sphere_flake( x, y - s * 1.5f, z, s * 0.5f, d + 1 ); + if (d < 3) sphere_flake( x, y, z + s * 1.5f, s * 0.5f, d + 1 ); + if (d < 3) sphere_flake( x, y, z - s * 1.5f, s * 0.5f, d + 1 ); +} + +int main() +{ + // generate a sphere flake scene + sphere_flake( 0, 0, 0, 1.5f ); + + // setup view pyramid for a pinhole camera: + // eye, p1 (top-left), p2 (top-right) and p3 (bottom-left) + bvhvec3 eye( -3.5f, -1.5f, -6.5f ), view = normalize( bvhvec3( 3, 1.5f, 5 ) ); + bvhvec3 right = normalize( cross( bvhvec3( 0, 1, 0 ), view ) ); + bvhvec3 up = 0.8f * cross( view, right ), C = eye + 2 * view; + bvhvec3 p1 = C - right + up, p2 = C + right + up, p3 = C - right - up; + + // generate primary rays in a cacheline-aligned buffer + int N = 0; + Ray* rays = (Ray*)ALIGNED_MALLOC( SCRWIDTH * SCRHEIGHT * 16 * sizeof( Ray ) ); + for (int y = 0; y < SCRHEIGHT; y++) for (int x = 0; x < SCRWIDTH; x++) + { + for (int s = 0; s < 16; s++) // 16 samples per pixel + { + float u = (float)(x * 4 + (s & 3)) / (SCRWIDTH * 4); + float v = (float)(y * 4 + (s >> 2)) / (SCRHEIGHT * 4); + bvhvec3 P = p1 + u * (p2 - p1) + v * (p3 - p1); + rays[N++] = Ray( eye, normalize( P - eye ) ); + } + } + + // T I N Y _ B V H P E R F O R M A N C E M E A S U R E M E N T S + + int minor = TINY_BVH_VERSION_MINOR, major = TINY_BVH_VERSION_MAJOR; + printf( "tiny_bvh version %i.%i performance statistics\n", major, minor ); + printf( "----------------------------------------------------------------\n" ); + + Timer t; + + // measure single-core bvh construction time - warming caches + printf( 
"BVH construction speed\n" ); + printf( "warming caches...\n" ); + bvh.Build( (bvhvec4*)triangles, verts / 3 ); + + // measure single-core bvh construction time - reference builder + t.reset(); + printf( "- reference builder: " ); + for (int pass = 0; pass < 3; pass++) + bvh.Build( (bvhvec4*)triangles, verts / 3 ); + float buildTime = t.elapsed() / 3.0f; + printf( "%.2fms for %i triangles ", buildTime * 1000.0f, verts / 3 ); + printf( "- %i nodes, SAH=%.2f\n", bvh.newNodePtr, bvh.SAHCost() ); + +#ifdef BVH_USEAVX + // measure single-core bvh construction time - AVX builder + t.reset(); + printf( "- fast AVX builder: " ); + for (int pass = 0; pass < 3; pass++) bvh.BuildAVX( (bvhvec4*)triangles, verts / 3 ); + float buildTimeAVX = t.elapsed() / 3.0f; + printf( "%.2fms for %i triangles ", buildTimeAVX * 1000.0f, verts / 3 ); + printf( "- %i nodes, SAH=%.2f\n", bvh.newNodePtr, bvh.SAHCost() ); +#endif + + // trace all rays once to warm the caches + printf( "BVH traversal speed\n" ); + printf( "warming caches...\n" ); + for (int i = 0; i < N; i++) bvh.Intersect( rays[i] ); + + // trace all rays three times to estimate average performance + // - single core version + t.reset(); + printf( "- CPU, coherent, basic 2-way layout, ST: " ); + for (int pass = 0; pass < 3; pass++) + for (int i = 0; i < N; i++) bvh.Intersect( rays[i] ); + float traceTimeST = t.elapsed() / 3.0f; + float mrays = (float)N / traceTimeST; + printf( "%.2fms for %.2fM rays (%.2fMRays/s)\n", traceTimeST * 1000, (float)N * 1e-6f, mrays * 1e-6f ); + + // trace all rays three times to estimate average performance + // - multi-core version (using OpenMP and batches of 10,000 rays) + t.reset(); + printf( "- CPU, coherent, basic 2-way layout, MT: " ); + for (int j = 0; j < 3; j++) + { + const int batchCount = N / 10000; + #pragma omp parallel for schedule(dynamic) + for (int batch = 0; batch < batchCount; batch++) + { + const int batchStart = batch * 10000; + for (int i = 0; i < 10000; i++) bvh.Intersect( 
rays[batchStart + i] ); + } + } + float traceTimeMT = t.elapsed() / 3.0f; + mrays = (float)N / traceTimeMT; + printf( "%.2fms for %.2fM rays (%.2fMRays/s)\n", traceTimeMT * 1000, (float)N * 1e-6f, mrays * 1e-6f ); + + // shuffle rays for the next experiment (unsigned math: 17 * rand() can exceed INT_MAX when RAND_MAX is large) + for( int i = 0; i < N; i++ ) + { + int j = (int)(((unsigned)i + 17u * (unsigned)rand()) % (unsigned)N); + Ray tmp = rays[i]; + rays[i] = rays[j]; + rays[j] = tmp; + } + + // trace all rays three times to estimate average performance + // - divergent distribution, multi-core + t.reset(); + printf( "- CPU, incoherent, basic 2-way layout, MT: " ); + for (int j = 0; j < 3; j++) + { + const int batchCount = N / 10000; + #pragma omp parallel for schedule(dynamic) + for (int batch = 0; batch < batchCount; batch++) + { + const int batchStart = batch * 10000; + for (int i = 0; i < 10000; i++) bvh.Intersect( rays[batchStart + i] ); + } + } + float traceTimeMTI = t.elapsed() / 3.0f; + mrays = (float)N / traceTimeMTI; + printf( "%.2fms for %.2fM rays (%.2fMRays/s)\n", traceTimeMTI * 1000, (float)N * 1e-6f, mrays * 1e-6f ); + + // all done. 
+ return 0; +} \ No newline at end of file diff --git a/tiny_bvh_test.sln b/tiny_bvh_test.sln index 43a9548..162f405 100644 --- a/tiny_bvh_test.sln +++ b/tiny_bvh_test.sln @@ -9,6 +9,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "tiny_bvh_renderer", "vcproj EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "tiny_bvh_test", "vcproj\tiny_bvh_test.vcxproj", "{0B5C86B2-9438-49E3-BF1A-4E1593BB436D}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "tiny_bvh_speedtest", "vcproj\tiny_bvh_speedtest.vcxproj", "{547F1A98-C394-46FB-AF15-3DB009D758FA}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|x64 = Debug|x64 @@ -41,6 +43,14 @@ Global {0B5C86B2-9438-49E3-BF1A-4E1593BB436D}.Release|x64.Build.0 = Release|x64 {0B5C86B2-9438-49E3-BF1A-4E1593BB436D}.Release|x86.ActiveCfg = Release|Win32 {0B5C86B2-9438-49E3-BF1A-4E1593BB436D}.Release|x86.Build.0 = Release|Win32 + {547F1A98-C394-46FB-AF15-3DB009D758FA}.Debug|x64.ActiveCfg = Debug|x64 + {547F1A98-C394-46FB-AF15-3DB009D758FA}.Debug|x64.Build.0 = Debug|x64 + {547F1A98-C394-46FB-AF15-3DB009D758FA}.Debug|x86.ActiveCfg = Debug|Win32 + {547F1A98-C394-46FB-AF15-3DB009D758FA}.Debug|x86.Build.0 = Debug|Win32 + {547F1A98-C394-46FB-AF15-3DB009D758FA}.Release|x64.ActiveCfg = Release|x64 + {547F1A98-C394-46FB-AF15-3DB009D758FA}.Release|x64.Build.0 = Release|x64 + {547F1A98-C394-46FB-AF15-3DB009D758FA}.Release|x86.ActiveCfg = Release|Win32 + {547F1A98-C394-46FB-AF15-3DB009D758FA}.Release|x86.Build.0 = Release|Win32 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/vcproj/tiny_bvh_speedtest.vcxproj b/vcproj/tiny_bvh_speedtest.vcxproj new file mode 100644 index 0000000..b8eb08e --- /dev/null +++ b/vcproj/tiny_bvh_speedtest.vcxproj @@ -0,0 +1,144 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + + + + + + + 17.0 + Win32Proj + {547F1A98-C394-46FB-AF15-3DB009D758FA} + 
tinybvhspeedtest + 10.0 + + + + Application + true + v143 + Unicode + + + Application + false + v143 + true + Unicode + + + Application + true + v143 + Unicode + + + Application + false + v143 + true + Unicode + + + + + + + + + + + + + + + + + + + + + + Level3 + true + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + + + + + Level3 + true + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + true + true + + + + + Level3 + true + _DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + AdvancedVectorExtensions + Fast + true + + + Console + true + + + + + Level3 + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + AdvancedVectorExtensions + Fast + true + + + Console + true + true + true + + + + + + \ No newline at end of file