Fast GPU-side traversal for al4gpu layout.

jbikker · Nov 19, 2024 · 4eb2ea7 · 4eb2ea7
1 parent 640c8c2
commit 4eb2ea7
Show file tree

Hide file tree

Showing 3 changed files with 150 additions and 8 deletions.
diff --git a/tiny_bvh.h b/tiny_bvh.h
@@ -96,7 +96,7 @@ THE SOFTWARE.
 // library version
 #define TINY_BVH_VERSION_MAJOR	0
 #define TINY_BVH_VERSION_MINOR	9
-#define TINY_BVH_VERSION_SUB	2
+#define TINY_BVH_VERSION_SUB	3
 
 // ============================================================================
 //

diff --git a/tiny_bvh_speedtest.cpp b/tiny_bvh_speedtest.cpp
@@ -154,7 +154,8 @@ int main()
 
 	// load and compile the OpenCL kernel code
 	// This also triggers OpenCL init and device identification.
-	tinyocl::Kernel kernel( "traverse.cl", "traverse" );
+	tinyocl::Kernel ailalaine_kernel( "traverse.cl", "traverse_ailalaine" );
+	tinyocl::Kernel gpu4way_kernel( "traverse.cl", "traverse_gpu4way" );
 	printf( "----------------------------------------------------------------\n" );
 
 #endif
@@ -335,7 +336,7 @@ int main()
 	printf( "- GPU, coherent,   alt 2-way layout,  ocl: " );
 	bvh.Convert( BVH::WALD_32BYTE, BVH::AILA_LAINE );
 	// create OpenCL buffers for the BVH data calculated by tiny_bvh.h
-	tinyocl::Buffer gpuNodes( bvh.usedBVHNodes * sizeof( BVH::BVHNodeAlt ), bvh.altNode );
+	tinyocl::Buffer gpuNodes( bvh.usedAltNodes * sizeof( BVH::BVHNodeAlt ), bvh.altNode );
 	tinyocl::Buffer idxData( bvh.idxCount * sizeof( unsigned ), bvh.triIdx );
 	tinyocl::Buffer triData( bvh.triCount * 3 * sizeof( tinybvh::bvhvec4 ), bvh.verts );
 	// synchronize the host-side data to the gpu side
@@ -351,10 +352,10 @@ int main()
 	// start timer and start kernel on gpu
 	t.reset();
 	float traceTimeGPU = 0;
-	kernel.SetArguments( &gpuNodes, &idxData, &triData, &rayData );
-	for (int pass = 0; pass < 3; pass++)
+	ailalaine_kernel.SetArguments( &gpuNodes, &idxData, &triData, &rayData );
+	for (int pass = 0; pass < 8; pass++)
 	{
-		kernel.Run( N, 64, 0, &event ); // for now, todo.
+		ailalaine_kernel.Run( N, 64, 0, &event ); // for now, todo.
 		clWaitForEvents(1, &event ); // OpenCL kernsl run asynchronously
 		clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_START, sizeof( cl_ulong ), &startTime, 0 ); 
 		clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_END, sizeof( cl_ulong ), &endTime, 0 ); 
@@ -363,12 +364,51 @@ int main()
 	// get results from GPU - this also syncs the queue.
 	rayData.CopyFromDevice();
 	// report on timing
-	traceTimeGPU /= 3.0f;
+	traceTimeGPU /= 8.0f;
 	mrays = (float)N / traceTimeGPU;
 	printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeGPU * 1000, (float)N * 1e-6f, mrays * 1e-6f );
 
 #endif
 
+#ifdef GPU_4WAY
+
+	// trace the rays on GPU using OpenCL
+	printf( "- GPU, coherent,   alt 4-way layout,  ocl: " );
+	bvh.Convert( BVH::WALD_32BYTE, BVH::BASIC_BVH4 );
+	bvh.Convert( BVH::BASIC_BVH4, BVH::BVH4_GPU );
+	// create OpenCL buffers for the BVH data calculated by tiny_bvh.h
+	tinyocl::Buffer gpu4Nodes( bvh.usedAlt4Blocks * sizeof( tinybvh::bvhvec4 ), bvh.bvh4Alt );
+	// synchronize the host-side data to the gpu side
+	gpu4Nodes.CopyToDevice();
+#ifndef GPU_2WAY // otherwise these already exist.
+	// create an event to time the OpenCL kernel
+	cl_event event; 
+	cl_ulong startTime, endTime;
+	// create rays and send them to the gpu side
+	tinyocl::Buffer rayData( N * sizeof( tinybvh::Ray ), rays );
+	rayData.CopyToDevice();
+#endif
+	// start timer and start kernel on gpu
+	t.reset();
+	float traceTimeGPU4 = 0;
+	gpu4way_kernel.SetArguments( &gpu4Nodes, &rayData );
+	for (int pass = 0; pass < 8; pass++)
+	{
+		gpu4way_kernel.Run( N, 64, 0, &event ); // for now, todo.
+		clWaitForEvents(1, &event ); // OpenCL kernsl run asynchronously
+		clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_START, sizeof( cl_ulong ), &startTime, 0 ); 
+		clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_END, sizeof( cl_ulong ), &endTime, 0 ); 
+		traceTimeGPU4 += (endTime - startTime) * 1e-9f; // event timing is in nanoseconds
+	}
+	// get results from GPU - this also syncs the queue.
+	rayData.CopyFromDevice();
+	// report on timing
+	traceTimeGPU4 /= 8.0f;
+	mrays = (float)N / traceTimeGPU4;
+	printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeGPU4 * 1000, (float)N * 1e-6f, mrays * 1e-6f );
+
+#endif
+
 #endif
 
 #ifdef TRAVERSE_SOA2WAY_ST

diff --git a/traverse.cl b/traverse.cl
@@ -16,7 +16,7 @@ struct BVHNodeAlt
 	float4 rmax; // unsigned firstTri in w
 };
 
-void kernel traverse( global struct BVHNodeAlt* altNode, global unsigned* idx, global float4* verts, global struct Ray* rayData )
+void kernel traverse_ailalaine( global struct BVHNodeAlt* altNode, global unsigned* idx, global float4* verts, global struct Ray* rayData )
 {
 	// fetch ray
 	const unsigned threadId = get_global_id( 0 );
@@ -90,4 +90,106 @@ void kernel traverse( global struct BVHNodeAlt* altNode, global unsigned* idx, g
 	}
 	// write back intersection result
 	rayData[threadId].hit = hit;
+}
+
+void kernel traverse_gpu4way( global float4* alt4Node, global struct Ray* rayData )
+{
+	// fetch ray
+	const unsigned threadId = get_global_id( 0 );
+	const float3 O = rayData[threadId].O.xyz;
+	const float3 D = rayData[threadId].D.xyz;
+	const float3 rD = rayData[threadId].rD.xyz;
+	float4 hit;
+	hit.x = 1e30f;
+	// some local memory for storing leaf information
+	local unsigned smem[64 * 4];
+	// traverse the BVH
+	const float4 zero4 = (float4)(0);
+	unsigned offset = 0, stack[64], stackPtr = 0;
+	const unsigned smBase = get_local_id( 0 ) * 4;
+	while (1)
+	{
+		// vectorized 4-wide quantized aabb intersection
+		const float4 data0 = alt4Node[offset];
+		const float4 data1 = alt4Node[offset + 1];
+		const float4 data2 = alt4Node[offset + 2];
+		const float4 cminx4 = convert_float4( as_uchar4( data0.w ) );
+		const float4 cmaxx4 = convert_float4( as_uchar4( data1.w ) );
+		const float4 cminy4 = convert_float4( as_uchar4( data2.x ) );
+		const float3 bminO = (O - data0.xyz) * rD, rDe = rD * data1.xyz;
+		const float4 cmaxy4 = convert_float4( as_uchar4( data2.y ) );
+		const float4 cminz4 = convert_float4( as_uchar4( data2.z ) );
+		const float4 cmaxz4 = convert_float4( as_uchar4( data2.w ) );
+		const float4 t1x4 = cminx4 * rDe.xxxx - bminO.xxxx, t2x4 = cmaxx4 * rDe.xxxx - bminO.xxxx;
+		const float4 t1y4 = cminy4 * rDe.yyyy - bminO.yyyy, t2y4 = cmaxy4 * rDe.yyyy - bminO.yyyy;
+		const float4 t1z4 = cminz4 * rDe.zzzz - bminO.zzzz, t2z4 = cmaxz4 * rDe.zzzz - bminO.zzzz;
+		uint4 data3 = as_uint4( alt4Node[offset + 3] );
+		const float4 mintx4 = fmin( t1x4, t2x4 ), maxtx4 = fmax( t1x4, t2x4 );
+		const float4 minty4 = fmin( t1y4, t2y4 ), maxty4 = fmax( t1y4, t2y4 );
+		const float4 mintz4 = fmin( t1z4, t2z4 ), maxtz4 = fmax( t1z4, t2z4 );
+		const float4 maxxy4 = select( mintx4, minty4, isless( mintx4, minty4 ) );
+		const float4 maxyz4 = select( maxxy4, mintz4, isless( maxxy4, mintz4 ) );
+		float4 dst4 = select( maxyz4, zero4, isless( maxyz4, zero4 ) );
+		const float4 minxy4 = select( maxtx4, maxty4, isgreater( maxtx4, maxty4 ) );
+		const float4 minyz4 = select( minxy4, maxtz4, isgreater( minxy4, maxtz4 ) );
+		const float4 tmax4 = select( minyz4, hit.xxxx, isgreater( minyz4, hit.xxxx ) );
+		dst4 = select( dst4, (float4)(1e30f), isgreater( dst4, tmax4 ) );
+		// sort intersection distances
+		if (dst4.x < dst4.z) dst4 = dst4.zyxw, data3 = data3.zyxw; // bertdobbelaere.github.io/sorting_networks.html
+		if (dst4.y < dst4.w) dst4 = dst4.xwzy, data3 = data3.xwzy;
+		if (dst4.x < dst4.y) dst4 = dst4.yxzw, data3 = data3.yxzw;
+		if (dst4.z < dst4.w) dst4 = dst4.xywz, data3 = data3.xywz;
+		if (dst4.y < dst4.z) dst4 = dst4.xzyw, data3 = data3.xzyw;
+		// process results, starting with farthest child, so nearest ends on top of stack
+		unsigned nextNode = 0, leafs = 0;
+		if (dst4.x < 1e30f) if (data3.x >> 31) smem[smBase + leafs++] = data3.x; else nextNode = data3.x;
+		if (dst4.y < 1e30f) if (data3.y >> 31) smem[smBase + leafs++] = data3.y; else
+		{
+			if (nextNode) stack[stackPtr++] = nextNode;
+			nextNode = data3.y;
+		}
+		if (dst4.z < 1e30f) if (data3.z >> 31) smem[smBase + leafs++] = data3.z; else
+		{
+			if (nextNode) stack[stackPtr++] = nextNode;
+			nextNode = data3.z;
+		}
+		if (dst4.w < 1e30f) if (data3.w >> 31) smem[smBase + leafs++] = data3.w; else
+		{
+			if (nextNode) stack[stackPtr++] = nextNode;
+			nextNode = data3.w;
+		}
+		// process encountered leaf primitives
+		int leaf = 0, prim = 0;
+		while (leaf < leafs)
+		{
+			const unsigned leafInfo = smem[smBase + leaf];
+			unsigned thisTri = (leafInfo & 0xffff) + offset + prim * 3;
+			const float4 v0 = alt4Node[thisTri];
+			const float4 v1 = alt4Node[thisTri + 1];
+			const float4 v2 = alt4Node[thisTri + 2];
+			const unsigned triCount = (leafInfo >> 16) & 0x7fff;
+			if (++prim == triCount) prim = 0, leaf++;
+			const float4 edge1 = v1 - v0, edge2 = v2 - v0;
+			const float3 h = cross( D, edge2.xyz );
+			const float a = dot( edge1.xyz, h );
+			if (fabs( a ) < 0.0000001f) continue;
+			const float f = native_recip( a );
+			const float3 s = O - v0.xyz;
+			const float u = f * dot( s, h );
+			if (u < 0 || u > 1) continue;
+			const float3 q = cross( s, edge1.xyz );
+			const float v = f * dot( D, q );
+			if (v < 0 || u + v > 1) continue;
+			const float d = f * dot( edge2.xyz, q );
+			if (d <= 0.0f || d > hit.x) continue;
+			hit = (float4)(d, u, v, v0.w);
+		}
+		// continue with nearest node or first node on the stack
+		if (nextNode) offset = nextNode; else
+		{
+			if (!stackPtr) break;
+			offset = stack[--stackPtr];
+		}
+	}
+	rayData[threadId].hit = hit;
 }