Skip to content

Commit

Permalink
Fast GPU-side traversal for al4gpu layout.
Browse files Browse the repository at this point in the history
  • Loading branch information
jbikker committed Nov 19, 2024
1 parent 640c8c2 commit 4eb2ea7
Show file tree
Hide file tree
Showing 3 changed files with 150 additions and 8 deletions.
2 changes: 1 addition & 1 deletion tiny_bvh.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ THE SOFTWARE.
// library version
#define TINY_BVH_VERSION_MAJOR 0
#define TINY_BVH_VERSION_MINOR 9
#define TINY_BVH_VERSION_SUB 2
#define TINY_BVH_VERSION_SUB 3

// ============================================================================
//
Expand Down
52 changes: 46 additions & 6 deletions tiny_bvh_speedtest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,8 @@ int main()

// load and compile the OpenCL kernel code
// This also triggers OpenCL init and device identification.
tinyocl::Kernel kernel( "traverse.cl", "traverse" );
tinyocl::Kernel ailalaine_kernel( "traverse.cl", "traverse_ailalaine" );
tinyocl::Kernel gpu4way_kernel( "traverse.cl", "traverse_gpu4way" );
printf( "----------------------------------------------------------------\n" );

#endif
Expand Down Expand Up @@ -335,7 +336,7 @@ int main()
printf( "- GPU, coherent, alt 2-way layout, ocl: " );
bvh.Convert( BVH::WALD_32BYTE, BVH::AILA_LAINE );
// create OpenCL buffers for the BVH data calculated by tiny_bvh.h
tinyocl::Buffer gpuNodes( bvh.usedBVHNodes * sizeof( BVH::BVHNodeAlt ), bvh.altNode );
tinyocl::Buffer gpuNodes( bvh.usedAltNodes * sizeof( BVH::BVHNodeAlt ), bvh.altNode );
tinyocl::Buffer idxData( bvh.idxCount * sizeof( unsigned ), bvh.triIdx );
tinyocl::Buffer triData( bvh.triCount * 3 * sizeof( tinybvh::bvhvec4 ), bvh.verts );
// synchronize the host-side data to the gpu side
Expand All @@ -351,10 +352,10 @@ int main()
// start timer and start kernel on gpu
t.reset();
float traceTimeGPU = 0;
kernel.SetArguments( &gpuNodes, &idxData, &triData, &rayData );
for (int pass = 0; pass < 3; pass++)
ailalaine_kernel.SetArguments( &gpuNodes, &idxData, &triData, &rayData );
for (int pass = 0; pass < 8; pass++)
{
kernel.Run( N, 64, 0, &event ); // for now, todo.
ailalaine_kernel.Run( N, 64, 0, &event ); // for now, todo.
clWaitForEvents(1, &event ); // OpenCL kernsl run asynchronously
clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_START, sizeof( cl_ulong ), &startTime, 0 );
clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_END, sizeof( cl_ulong ), &endTime, 0 );
Expand All @@ -363,12 +364,51 @@ int main()
// get results from GPU - this also syncs the queue.
rayData.CopyFromDevice();
// report on timing
traceTimeGPU /= 3.0f;
traceTimeGPU /= 8.0f;
mrays = (float)N / traceTimeGPU;
printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeGPU * 1000, (float)N * 1e-6f, mrays * 1e-6f );

#endif

#ifdef GPU_4WAY

// trace the rays on GPU using OpenCL
printf( "- GPU, coherent, alt 4-way layout, ocl: " );
bvh.Convert( BVH::WALD_32BYTE, BVH::BASIC_BVH4 );
bvh.Convert( BVH::BASIC_BVH4, BVH::BVH4_GPU );
// create OpenCL buffers for the BVH data calculated by tiny_bvh.h
tinyocl::Buffer gpu4Nodes( bvh.usedAlt4Blocks * sizeof( tinybvh::bvhvec4 ), bvh.bvh4Alt );
// synchronize the host-side data to the gpu side
gpu4Nodes.CopyToDevice();
#ifndef GPU_2WAY // otherwise these already exist.
// create an event to time the OpenCL kernel
cl_event event;
cl_ulong startTime, endTime;
// create rays and send them to the gpu side
tinyocl::Buffer rayData( N * sizeof( tinybvh::Ray ), rays );
rayData.CopyToDevice();
#endif
// start timer and start kernel on gpu
t.reset();
float traceTimeGPU4 = 0;
gpu4way_kernel.SetArguments( &gpu4Nodes, &rayData );
for (int pass = 0; pass < 8; pass++)
{
gpu4way_kernel.Run( N, 64, 0, &event ); // for now, todo.
clWaitForEvents(1, &event ); // OpenCL kernsl run asynchronously
clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_START, sizeof( cl_ulong ), &startTime, 0 );
clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_END, sizeof( cl_ulong ), &endTime, 0 );
traceTimeGPU4 += (endTime - startTime) * 1e-9f; // event timing is in nanoseconds
}
// get results from GPU - this also syncs the queue.
rayData.CopyFromDevice();
// report on timing
traceTimeGPU4 /= 8.0f;
mrays = (float)N / traceTimeGPU4;
printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeGPU4 * 1000, (float)N * 1e-6f, mrays * 1e-6f );

#endif

#endif

#ifdef TRAVERSE_SOA2WAY_ST
Expand Down
104 changes: 103 additions & 1 deletion traverse.cl
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ struct BVHNodeAlt
float4 rmax; // unsigned firstTri in w
};

void kernel traverse( global struct BVHNodeAlt* altNode, global unsigned* idx, global float4* verts, global struct Ray* rayData )
void kernel traverse_ailalaine( global struct BVHNodeAlt* altNode, global unsigned* idx, global float4* verts, global struct Ray* rayData )
{
// fetch ray
const unsigned threadId = get_global_id( 0 );
Expand Down Expand Up @@ -90,4 +90,106 @@ void kernel traverse( global struct BVHNodeAlt* altNode, global unsigned* idx, g
}
// write back intersection result
rayData[threadId].hit = hit;
}

void kernel traverse_gpu4way( global float4* alt4Node, global struct Ray* rayData )
{
// fetch ray
const unsigned threadId = get_global_id( 0 );
const float3 O = rayData[threadId].O.xyz;
const float3 D = rayData[threadId].D.xyz;
const float3 rD = rayData[threadId].rD.xyz;
float4 hit;
hit.x = 1e30f;
// some local memory for storing leaf information
local unsigned smem[64 * 4];
// traverse the BVH
const float4 zero4 = (float4)(0);
unsigned offset = 0, stack[64], stackPtr = 0;
const unsigned smBase = get_local_id( 0 ) * 4;
while (1)
{
// vectorized 4-wide quantized aabb intersection
const float4 data0 = alt4Node[offset];
const float4 data1 = alt4Node[offset + 1];
const float4 data2 = alt4Node[offset + 2];
const float4 cminx4 = convert_float4( as_uchar4( data0.w ) );
const float4 cmaxx4 = convert_float4( as_uchar4( data1.w ) );
const float4 cminy4 = convert_float4( as_uchar4( data2.x ) );
const float3 bminO = (O - data0.xyz) * rD, rDe = rD * data1.xyz;
const float4 cmaxy4 = convert_float4( as_uchar4( data2.y ) );
const float4 cminz4 = convert_float4( as_uchar4( data2.z ) );
const float4 cmaxz4 = convert_float4( as_uchar4( data2.w ) );
const float4 t1x4 = cminx4 * rDe.xxxx - bminO.xxxx, t2x4 = cmaxx4 * rDe.xxxx - bminO.xxxx;
const float4 t1y4 = cminy4 * rDe.yyyy - bminO.yyyy, t2y4 = cmaxy4 * rDe.yyyy - bminO.yyyy;
const float4 t1z4 = cminz4 * rDe.zzzz - bminO.zzzz, t2z4 = cmaxz4 * rDe.zzzz - bminO.zzzz;
uint4 data3 = as_uint4( alt4Node[offset + 3] );
const float4 mintx4 = fmin( t1x4, t2x4 ), maxtx4 = fmax( t1x4, t2x4 );
const float4 minty4 = fmin( t1y4, t2y4 ), maxty4 = fmax( t1y4, t2y4 );
const float4 mintz4 = fmin( t1z4, t2z4 ), maxtz4 = fmax( t1z4, t2z4 );
const float4 maxxy4 = select( mintx4, minty4, isless( mintx4, minty4 ) );
const float4 maxyz4 = select( maxxy4, mintz4, isless( maxxy4, mintz4 ) );
float4 dst4 = select( maxyz4, zero4, isless( maxyz4, zero4 ) );
const float4 minxy4 = select( maxtx4, maxty4, isgreater( maxtx4, maxty4 ) );
const float4 minyz4 = select( minxy4, maxtz4, isgreater( minxy4, maxtz4 ) );
const float4 tmax4 = select( minyz4, hit.xxxx, isgreater( minyz4, hit.xxxx ) );
dst4 = select( dst4, (float4)(1e30f), isgreater( dst4, tmax4 ) );
// sort intersection distances
if (dst4.x < dst4.z) dst4 = dst4.zyxw, data3 = data3.zyxw; // bertdobbelaere.github.io/sorting_networks.html
if (dst4.y < dst4.w) dst4 = dst4.xwzy, data3 = data3.xwzy;
if (dst4.x < dst4.y) dst4 = dst4.yxzw, data3 = data3.yxzw;
if (dst4.z < dst4.w) dst4 = dst4.xywz, data3 = data3.xywz;
if (dst4.y < dst4.z) dst4 = dst4.xzyw, data3 = data3.xzyw;
// process results, starting with farthest child, so nearest ends on top of stack
unsigned nextNode = 0, leafs = 0;
if (dst4.x < 1e30f) if (data3.x >> 31) smem[smBase + leafs++] = data3.x; else nextNode = data3.x;
if (dst4.y < 1e30f) if (data3.y >> 31) smem[smBase + leafs++] = data3.y; else
{
if (nextNode) stack[stackPtr++] = nextNode;
nextNode = data3.y;
}
if (dst4.z < 1e30f) if (data3.z >> 31) smem[smBase + leafs++] = data3.z; else
{
if (nextNode) stack[stackPtr++] = nextNode;
nextNode = data3.z;
}
if (dst4.w < 1e30f) if (data3.w >> 31) smem[smBase + leafs++] = data3.w; else
{
if (nextNode) stack[stackPtr++] = nextNode;
nextNode = data3.w;
}
// process encountered leaf primitives
int leaf = 0, prim = 0;
while (leaf < leafs)
{
const unsigned leafInfo = smem[smBase + leaf];
unsigned thisTri = (leafInfo & 0xffff) + offset + prim * 3;
const float4 v0 = alt4Node[thisTri];
const float4 v1 = alt4Node[thisTri + 1];
const float4 v2 = alt4Node[thisTri + 2];
const unsigned triCount = (leafInfo >> 16) & 0x7fff;
if (++prim == triCount) prim = 0, leaf++;
const float4 edge1 = v1 - v0, edge2 = v2 - v0;
const float3 h = cross( D, edge2.xyz );
const float a = dot( edge1.xyz, h );
if (fabs( a ) < 0.0000001f) continue;
const float f = native_recip( a );
const float3 s = O - v0.xyz;
const float u = f * dot( s, h );
if (u < 0 || u > 1) continue;
const float3 q = cross( s, edge1.xyz );
const float v = f * dot( D, q );
if (v < 0 || u + v > 1) continue;
const float d = f * dot( edge2.xyz, q );
if (d <= 0.0f || d > hit.x) continue;
hit = (float4)(d, u, v, v0.w);
}
// continue with nearest node or first node on the stack
if (nextNode) offset = nextNode; else
{
if (!stackPtr) break;
offset = stack[--stackPtr];
}
}
rayData[threadId].hit = hit;
}

0 comments on commit 4eb2ea7

Please sign in to comment.