Aila & Laine node layout GPU traversal.

jbikker · Nov 18, 2024 · d7bda0b · d7bda0b
1 parent 0922c82
commit d7bda0b
Show file tree

Hide file tree

Showing 5 changed files with 157 additions and 40 deletions.
diff --git a/tiny_bvh.h b/tiny_bvh.h
@@ -22,6 +22,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 
+// Nov 18, '24: version 0.9.1 : Added custom alloc/free (tcantenot).
+// Mov 16, '24: version 0.9.0 : (external) OpenCL in speedtest.
 // Nov 15, '24: version 0.8.3 : Incremental update / bugfixes.
 // Nov 14, '24: version 0.8.0 : ARM/NEON support.
 // Nov 13, '24: version 0.7.5 : Support for WASM with EMSCRIPTEN. 
@@ -94,7 +96,7 @@ THE SOFTWARE.
 // library version
 #define TINY_BVH_VERSION_MAJOR	0
 #define TINY_BVH_VERSION_MINOR	9
-#define TINY_BVH_VERSION_SUB	1
+#define TINY_BVH_VERSION_SUB	2
 
 // ============================================================================
 //
@@ -351,7 +353,7 @@ struct Intersection
 	// squeezing this in the 'prim' field in some way.
 	// Using this data and the original triangle data, all other info for
 	// shading (such as normal, texture color etc.) can be reconstructed.
-	float t, u, v;		// distance along ray & barycentric coordinates of the intersection
+	float t, u, v;	// distance along ray & barycentric coordinates of the intersection
 	unsigned prim;	// primitive index
 };
 

diff --git a/tiny_bvh_speedtest.cpp b/tiny_bvh_speedtest.cpp
@@ -7,7 +7,7 @@
 #define SCRHEIGHT	600
 
 // scene selection
-#define LOADSPONZA
+// #define LOADSPONZA
 
 // GPU ray tracing
 #define ENABLE_OPENCL
@@ -56,7 +56,7 @@ using namespace tinybvh;
 bvhvec4* triangles = 0;
 #include <fstream>
 #else
-ALIGNED( 16 ) bvhvec4 triangles[259 /* level 3 */ * 6 * 2 * 49 * 3]{};
+ALIGNED( 64 ) bvhvec4 triangles[259 /* level 3 */ * 6 * 2 * 49 * 3]{};
 #endif
 int verts = 0;
 BVH bvh;
@@ -150,17 +150,20 @@ int main()
 #endif
 	printf( "----------------------------------------------------------------\n" );
 
+#ifdef ENABLE_OPENCL
+
+	// load and compile the OpenCL kernel code
+	// This also triggers OpenCL init and device identification.
+	tinyocl::Kernel kernel( "traverse.cl", "traverse" );
+	printf( "----------------------------------------------------------------\n" );
+
+#endif
+
 #ifdef LOADSPONZA
 	// load raw vertex data for Crytek's Sponza
-	std::string filename{ "../testdata/cryteksponza.bin" };
+	std::string filename{ "testdata/cryteksponza.bin" };
 	std::fstream s{ filename, s.binary | s.in };
-	if (!s.is_open())
-	{
-		// try again, look in .\testdata
-		filename = std::string{ "./testdata/cryteksponza.bin" };
-		s = std::fstream{ filename, s.binary | s.in };
-		assert( s.is_open() );
-	}
+	assert( s.is_open() );
 	s.seekp( 0 );
 	s.read( (char*)&verts, 4 );
 	printf( "Loading triangle data (%i tris).\n", verts );
@@ -329,24 +332,38 @@ int main()
 #ifdef GPU_2WAY
 
 	// trace the rays on GPU using OpenCL
-	printf( "- CPU, coherent,   alt 2-way layout,   ST: " );
+	printf( "- GPU, coherent,   alt 2-way layout,  ocl: " );
 	bvh.Convert( BVH::WALD_32BYTE, BVH::AILA_LAINE );
-	// load and compile the OpenCL kernel code
-	tinyocl::Kernel kernel( "traverse.cl", "traverse" );
-	// create an OpenCL buffer for the BVH nodes calculated by tiny_bvh.h
-	tinyocl::Buffer gpuNodes( bvh.usedBVHNodes * sizeof( BVH::BVHNodeAlt ) );
-	// copy the data to the host-side version of the buffer
-	memcpy( gpuNodes.GetHostPtr(), bvh.altNode, bvh.usedBVHNodes * sizeof( BVH::BVHNodeAlt ) );
-	// synchronize the host-side buffer to the gpu side
+	// create OpenCL buffers for the BVH data calculated by tiny_bvh.h
+	tinyocl::Buffer gpuNodes( bvh.usedBVHNodes * sizeof( BVH::BVHNodeAlt ), bvh.altNode );
+	tinyocl::Buffer idxData( bvh.idxCount * sizeof( unsigned ), bvh.triIdx );
+	tinyocl::Buffer triData( bvh.triCount * 3 * sizeof( tinybvh::bvhvec4 ), bvh.verts );
+	// synchronize the host-side data to the gpu side
 	gpuNodes.CopyToDevice();
+	idxData.CopyToDevice();
+	triData.CopyToDevice();
+	// create rays and send them to the gpu side
+	tinyocl::Buffer rayData( N * sizeof( tinybvh::Ray ), rays );
+	rayData.CopyToDevice();
+	// create an event to time the OpenCL kernel
+	cl_event event; 
+	cl_ulong startTime, endTime;
 	// start timer and start kernel on gpu
 	t.reset();
+	float traceTimeGPU = 0;
+	kernel.SetArguments( &gpuNodes, &idxData, &triData, &rayData );
 	for (int pass = 0; pass < 3; pass++)
 	{
-		kernel.SetArguments( &gpuNodes );
-		kernel.Run( 32 ); // for now, todo.
+		kernel.Run( N, 64, 0, &event ); // for now, todo.
+		clWaitForEvents(1, &event ); // OpenCL kernsl run asynchronously
+		clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_START, sizeof( cl_ulong ), &startTime, 0 ); 
+		clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_END, sizeof( cl_ulong ), &endTime, 0 ); 
+		traceTimeGPU += (endTime - startTime) * 1e-9f; // event timing is in nanoseconds
 	}
-	float traceTimeGPU = t.elapsed() / 3.0f;
+	// get results from GPU - this also syncs the queue.
+	rayData.CopyFromDevice();
+	// report on timing
+	traceTimeGPU /= 3.0f;
 	mrays = (float)N / traceTimeGPU;
 	printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeGPU * 1000, (float)N * 1e-6f, mrays * 1e-6f );
 
@@ -451,17 +468,17 @@ int main()
 		bvh.Build( triangles, verts / 3 ); // rebuild with full splitting.
 		bvh.Optimize( 1000000 );
 		bvh.MergeLeafs();
-		printf( "done (%.2fs). New: %i nodes, SAH=%.2f\n", t.elapsed(), bvh.NodeCount( BVH::WALD_32BYTE ), bvh.SAHCost() );
-		bvh.Convert( BVH::WALD_32BYTE, BVH::ALT_SOA );
-		for (int i = 0; i < N; i += 2) bvh.Intersect( rays[i], BVH::ALT_SOA ); // re-warm
-		printf( "- CPU, coherent,   2-way optimized,    ST: " );
-		t.reset();
-		for (int pass = 0; pass < 3; pass++)
-			for (int i = 0; i < N; i++) bvh.Intersect( rays[i], BVH::ALT_SOA );
-		float traceTimeOpt = t.elapsed() / 3.0f;
-		mrays = (float)N / traceTimeOpt;
-		printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeOpt * 1000, (float)N * 1e-6f, mrays * 1e-6f );
 	}
+	printf( "done (%.2fs). New: %i nodes, SAH=%.2f\n", t.elapsed(), bvh.NodeCount( BVH::WALD_32BYTE ), bvh.SAHCost() );
+	bvh.Convert( BVH::WALD_32BYTE, BVH::ALT_SOA );
+	for (int i = 0; i < N; i += 2) bvh.Intersect( rays[i], BVH::ALT_SOA ); // re-warm
+	printf( "- CPU, coherent,   2-way optimized,    ST: " );
+	t.reset();
+	for (int pass = 0; pass < 3; pass++)
+		for (int i = 0; i < N; i++) bvh.Intersect( rays[i], BVH::ALT_SOA );
+	float traceTimeOpt = t.elapsed() / 3.0f;
+	mrays = (float)N / traceTimeOpt;
+	printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeOpt * 1000, (float)N * 1e-6f, mrays * 1e-6f );
 
 #endif
 
@@ -470,7 +487,7 @@ int main()
 	// trace all rays three times to estimate average performance
 	// - coherent, Embree, single-threaded
 	printf( "- CPU, coherent,   Embree BVH,  Embree ST: " );
-	struct RTCRayHit* rayhits = (RTCRayHit*)default_malloc( SCRWIDTH * SCRHEIGHT * 16 * sizeof( RTCRayHit ) );
+	struct RTCRayHit* rayhits = (RTCRayHit*)tinybvh::malloc64( SCRWIDTH * SCRHEIGHT * 16 * sizeof( RTCRayHit ) );
 	// copy our rays to Embree format
 	for (int i = 0; i < N; i++)
 	{
@@ -494,6 +511,7 @@ int main()
 	}
 	mrays = (float)N / traceTimeEmbree;
 	printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeEmbree * 1000, (float)N * 1e-6f, mrays * 1e-6f );
+	tinybvh::free64( rayhits );
 
 #endif
 

diff --git a/tiny_ocl.h b/tiny_ocl.h
@@ -22,7 +22,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 
-// Nov 15, '24: version 0.1.0 : Accidentally started another tiny lib 
+// Nov 18, '24: version 0.1.1 : Added custom alloc/free.
+// Nov 15, '24: version 0.1.0 : Accidentally started another tiny lib.
 
 //
 // Use this in *one* .c or .cpp
@@ -306,7 +307,17 @@ class Kernel
 	template<class T> void SetArgument( int idx, T value )
 	{
 		CheckCLStarted();
-		clSetKernelArg( kernel, idx, sizeof( T ), &value );
+		if (sizeof( T ) == 12)
+		{
+			// probably int3 / float3; pad to 16 bytes
+			unsigned tmp[4] = {};
+			memcpy( tmp, &value, 12 );
+			clSetKernelArg( kernel, idx, 16, &value );
+		}
+		else
+		{
+			clSetKernelArg( kernel, idx, sizeof( T ), &value );
+		}
 	}
 	// other methods
 public:
@@ -357,6 +368,7 @@ using namespace tinyocl;
 #else
 #include <unistd.h>
 #endif
+#include <fstream>
 
 #define CHECKCL(r) CheckCL( r, __FILE__, __LINE__ )
 

diff --git a/traverse.cl b/traverse.cl
@@ -1,6 +1,91 @@
 // gpu-side code for ray traversal
 
-void kernel traverse( global float4* nodeData )
+// Note: We are taking in nodes and rays as collection of floa4's here.
+// You can use structs in OpenCL, which will be more convenient and
+// clear here. Be careful though: float3 / int3 struct members are padded 
+// to 16 bytes in OpenCL.
+
+void kernel traverse( global float4* bvhNode, global unsigned* idx, global float4* verts, global float4* rayData )
 {
-	// placeholder
+	// fetch ray
+	unsigned threadId = get_global_id( 0 );
+	float3 O = rayData[threadId * 4 + 0].xyz;
+	float3 D = rayData[threadId * 4 + 1].xyz;
+	float3 rD = rayData[threadId * 4 + 2].xyz;
+	float4 hit = rayData[threadId * 4 + 3];
+	hit.x = 1e30f;
+	// traverse BVH
+	unsigned node = 0, stack[64], stackPtr = 0;
+	while (1)
+	{
+		// fetch the node
+		const float4 lmin = bvhNode[node * 4 + 0];
+		const float4 lmax = bvhNode[node * 4 + 1];
+		const float4 rmin = bvhNode[node * 4 + 2];
+		const float4 rmax = bvhNode[node * 4 + 3];
+		const unsigned triCount = as_uint( rmin.w );
+		if (triCount > 0)
+		{
+			// process leaf node
+			const unsigned firstTri = as_uint( rmax.w );
+			for (unsigned i = 0; i < triCount; i++)
+			{
+				const unsigned triIdx = idx[firstTri + i];
+				const float4* tri = verts + 3 * triIdx;
+				// triangle intersection
+				const float4 edge1 = tri[1] - tri[0], edge2 = tri[2] - tri[0];
+				const float3 h = cross( D, edge2.xyz );
+				const float a = dot( edge1.xyz, h );
+				if (fabs( a ) >= 0.0000001f)
+				{
+					const float f = 1 / a;
+					const float3 s = O - tri[0].xyz;
+					const float u = f * dot( s, h );
+					if (u >= 0 && u <= 1)
+					{
+						const float3 q = cross( s, edge1.xyz );
+						const float v = f * dot( D, q );
+						if (v >= 0 && u + v <= 1)
+						{
+							const float d = f * dot( edge2.xyz, q );
+							if (d > 0.0f && d < hit.x /* i.e., ray.t */)
+								hit = (float4)(d, u, v, as_float( triIdx ));
+						}
+					}
+				}
+			}
+			if (stackPtr == 0) break;
+			node = stack[--stackPtr];
+			continue;
+		}
+		unsigned left = as_uint( lmin.w ), right = as_uint( lmax.w );
+		// child AABB intersection tests
+		const float3 t1a = (lmin.xyz - O) * rD, t2a = (lmax.xyz - O) * rD;
+		const float3 t1b = (rmin.xyz - O) * rD, t2b = (rmax.xyz - O) * rD;
+		const float3 minta = fmin( t1a, t2a ), maxta = fmax( t1a, t2a );
+		const float3 mintb = fmin( t1b, t2b ), maxtb = fmax( t1b, t2b );
+		const float tmina = fmax( fmax( fmax( minta.x, minta.y ), minta.z ), 0 );
+		const float tminb = fmax( fmax( fmax( mintb.x, mintb.y ), mintb.z ), 0 );
+		const float tmaxa = fmin( fmin( fmin( maxta.x, maxta.y ), maxta.z ), hit.x );
+		const float tmaxb = fmin( fmin( fmin( maxtb.x, maxtb.y ), maxtb.z ), hit.x );
+		float dist1 = tmina > tmaxa ? 1e30f : tmina;
+		float dist2 = tminb > tmaxb ? 1e30f : tminb;
+		// traverse nearest child first
+		if (dist1 > dist2)
+		{
+			float h = dist1; dist1 = dist2; dist2 = h;
+			unsigned t = left; left = right; right = t;
+		}
+		if (dist1 == 1e30f)
+		{
+			if (stackPtr == 0) break; else node = stack[--stackPtr];
+		}
+		else
+		{
+			node = left;
+			if (dist2 != 1e30f) stack[stackPtr++] = right;
+		}
+	}
+	// write back intersection result
+	rayData[threadId * 4 + 3] = hit;
 }
diff --git a/vcproj/tiny_bvh_speedtest.vcxproj b/vcproj/tiny_bvh_speedtest.vcxproj
@@ -117,7 +117,7 @@
       <EnableEnhancedInstructionSet>AdvancedVectorExtensions</EnableEnhancedInstructionSet>
       <FloatingPointModel>Fast</FloatingPointModel>
       <OpenMPSupport>true</OpenMPSupport>
-      <AdditionalIncludeDirectories>external/OpenCL/inc/;../external/OpenCL/inc/</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>../external/OpenCL/inc/;../external/embree/include</AdditionalIncludeDirectories>
       <LanguageStandard>stdcpp17</LanguageStandard>
     </ClCompile>
     <Link>
@@ -138,7 +138,7 @@
       <EnableEnhancedInstructionSet>AdvancedVectorExtensions</EnableEnhancedInstructionSet>
       <FloatingPointModel>Fast</FloatingPointModel>
       <OpenMPSupport>true</OpenMPSupport>
-      <AdditionalIncludeDirectories>external/OpenCL/inc/;../external/OpenCL/inc/</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>../external/OpenCL/inc/;../external/embree/include</AdditionalIncludeDirectories>
       <LanguageStandard>stdcpp17</LanguageStandard>
     </ClCompile>
     <Link>