Skip to content

Commit

Permalink
Fix Aila & Laine layout gpu traversal.
Browse files Browse the repository at this point in the history
  • Loading branch information
jbikker committed Nov 19, 2024
1 parent 8dd6944 commit 55c162a
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 19 deletions.
7 changes: 4 additions & 3 deletions tiny_bvh_fenster.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#include "external/fenster.h" // https://github.com/zserge/fenster

// #define USE_EMBREE // enable to verify correct implementation, win64 only for now.
// #define LOADSPONZA
#define LOADSPONZA

#define TINYBVH_IMPLEMENTATION
#include "tiny_bvh.h"
Expand Down Expand Up @@ -65,7 +65,7 @@ void Init()
s.seekp( 0 );
s.read( (char*)&verts, 4 );
printf( "Loading triangle data (%i tris).\n", verts );
verts *= 3, triangles = (bvhvec4*)default_malloc( verts * 16 );
verts *= 3, triangles = (bvhvec4*)malloc64( verts * 16 );
s.read( (char*)triangles, verts * 16 );
#else
// generate a sphere flake scene
Expand Down Expand Up @@ -95,6 +95,7 @@ void Init()
// build a BVH over the scene
#if defined(BVH_USEAVX)
bvh.BuildAVX( triangles, verts / 3 );
bvh.Convert( BVH::WALD_32BYTE, BVH::AILA_LAINE );
#elif defined(BVH_USENEON)
bvh.BuildNEON( triangles, verts / 3 );
#else
Expand Down Expand Up @@ -152,7 +153,7 @@ void Tick( uint32_t* buf )
rays[i].hit.prim = rayhit.hit.primID, rays[i].hit.t = rayhit.ray.tfar;
}
#else
for (int i = 0; i < N; i++) bvh.Intersect( rays[i] );
for (int i = 0; i < N; i++) bvh.Intersect( rays[i], BVH::AILA_LAINE );
#endif

// visualize result
Expand Down
36 changes: 26 additions & 10 deletions tiny_bvh_speedtest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
#define TRAVERSE_2WAY_MT
#define TRAVERSE_2WAY_MT_PACKET
#define TRAVERSE_2WAY_MT_DIVERGENT
// #define TRAVERSE_OPTIMIZED_ST
#define TRAVERSE_OPTIMIZED_ST
// #define EMBREE_BUILD // win64-only for now.
// #define EMBREE_TRAVERSE // win64-only for now.

Expand Down Expand Up @@ -90,6 +90,21 @@ struct Timer
std::chrono::high_resolution_clock::time_point start;
};

#if 0
// Debug helper: converts traced rays into an 8-bit greyscale image and writes it to
// "testimage.raw" (SCRWIDTH * SCRHEIGHT raw bytes, no header).
//
// rays: ray results, consumed sequentially, 16 consecutive rays per pixel. Pixels are
//       visited tile by tile: 4x4 pixel tiles, tiles left-to-right then top-to-bottom,
//       rows within a tile top-to-bottom. This presumably mirrors the order in which the
//       caller generated the rays - TODO confirm against the ray setup code.
//
// Per pixel, the hit distances of its 16 rays are summed (rays whose hit.t still equals
// 1e30f contribute 0), scaled by 0.01 and wrapped to 0..255.
void dump_bitmap( Ray* rays )
{
	// static storage: keeps a potentially large buffer off the stack and guarantees that
	// pixels not covered by a full 4x4 tile (dimensions not a multiple of 4) are zero
	// rather than uninitialized.
	static unsigned char pixel[SCRWIDTH * SCRHEIGHT];
	int i = 0; // running index into rays; advances by 16 for every pixel
	for (int ty = 0; ty < SCRHEIGHT / 4; ty++) for (int tx = 0; tx < SCRWIDTH / 4; tx++)
		for (int y = 0; y < 4; y++) for (int x = 0; x < 4; x++)
		{
			float sum = 0;
			for (int s = 0; s < 16; s++, i++) sum += rays[i].hit.t == 1e30f ? 0 : rays[i].hit.t;
			pixel[tx * 4 + x + (ty * 4 + y) * SCRWIDTH] = (unsigned char)((int)(sum * 0.01f) & 255);
		}
	FILE* f = fopen( "testimage.raw", "wb" );
	if (!f)
	{
		// passing a null stream to fwrite is undefined behavior; report and bail out instead.
		fprintf( stderr, "dump_bitmap: could not open testimage.raw for writing.\n" );
		return;
	}
	if (fwrite( pixel, 1, SCRWIDTH * SCRHEIGHT, f ) != (size_t)(SCRWIDTH * SCRHEIGHT))
		fprintf( stderr, "dump_bitmap: short write to testimage.raw.\n" );
	fclose( f ); // flush the data and release the handle right away
}
#endif

void sphere_flake( float x, float y, float z, float s, int d = 0 )
{
// procedural tessellated sphere flake object
Expand Down Expand Up @@ -306,7 +321,7 @@ int main()
printf( "- CPU, coherent, basic 2-way layout, ST: " );
t.reset();
for (int pass = 0; pass < 3; pass++)
for (int i = 0; i < N; i += 8 ) bvh.Intersect( rays[i] );
for (int i = 0; i < N; i += 8) bvh.Intersect( rays[i] );
float traceTimeST = t.elapsed() / 3.0f;
mrays = (float)(N / 8) / traceTimeST;
printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeST * 1000, (float)(N / 8) * 1e-6f, mrays * 1e-6f );
Expand Down Expand Up @@ -347,7 +362,7 @@ int main()
tinyocl::Buffer rayData( N * sizeof( tinybvh::Ray ), rays );
rayData.CopyToDevice();
// create an event to time the OpenCL kernel
cl_event event;
cl_event event;
cl_ulong startTime, endTime;
// start timer and start kernel on gpu
t.reset();
Expand All @@ -356,9 +371,9 @@ int main()
for (int pass = 0; pass < 8; pass++)
{
ailalaine_kernel.Run( N, 64, 0, &event ); // for now, todo.
clWaitForEvents(1, &event ); // OpenCL kernsl run asynchronously
clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_START, sizeof( cl_ulong ), &startTime, 0 );
clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_END, sizeof( cl_ulong ), &endTime, 0 );
clWaitForEvents( 1, &event ); // OpenCL kernels run asynchronously
clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_START, sizeof( cl_ulong ), &startTime, 0 );
clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_END, sizeof( cl_ulong ), &endTime, 0 );
traceTimeGPU += (endTime - startTime) * 1e-9f; // event timing is in nanoseconds
}
// get results from GPU - this also syncs the queue.
Expand All @@ -367,6 +382,7 @@ int main()
traceTimeGPU /= 8.0f;
mrays = (float)N / traceTimeGPU;
printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeGPU * 1000, (float)N * 1e-6f, mrays * 1e-6f );
dump_bitmap( rays );

#endif

Expand All @@ -382,7 +398,7 @@ int main()
gpu4Nodes.CopyToDevice();
#ifndef GPU_2WAY // otherwise these already exist.
// create an event to time the OpenCL kernel
cl_event event;
cl_event event;
cl_ulong startTime, endTime;
// create rays and send them to the gpu side
tinyocl::Buffer rayData( N * sizeof( tinybvh::Ray ), rays );
Expand All @@ -395,9 +411,9 @@ int main()
for (int pass = 0; pass < 8; pass++)
{
gpu4way_kernel.Run( N, 64, 0, &event ); // for now, todo.
clWaitForEvents(1, &event ); // OpenCL kernsl run asynchronously
clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_START, sizeof( cl_ulong ), &startTime, 0 );
clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_END, sizeof( cl_ulong ), &endTime, 0 );
clWaitForEvents( 1, &event ); // OpenCL kernels run asynchronously
clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_START, sizeof( cl_ulong ), &startTime, 0 );
clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_END, sizeof( cl_ulong ), &endTime, 0 );
traceTimeGPU4 += (endTime - startTime) * 1e-9f; // event timing is in nanoseconds
}
// get results from GPU - this also syncs the queue.
Expand Down
12 changes: 6 additions & 6 deletions traverse.cl
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ void kernel traverse_ailalaine( global struct BVHNodeAlt* altNode, global unsign
const float3 O = rayData[threadId].O.xyz;
const float3 D = rayData[threadId].D.xyz;
const float3 rD = rayData[threadId].rD.xyz;
float t = 1e30f; // ignoring value set in ray to spare one memory transaction.
float4 hit;
hit.x = 1e30f; // ignoring value set in ray to spare one memory transaction.
// traverse BVH
unsigned node = 0, stack[64], stackPtr = 0;
while (1)
Expand All @@ -49,12 +49,12 @@ void kernel traverse_ailalaine( global struct BVHNodeAlt* altNode, global unsign
const float f = 1 / a;
const float3 s = O - tri[0].xyz;
const float u = f * dot( s, h );
if (u < 0 && u > 1) continue;
if (u < 0 || u > 1) continue;
const float3 q = cross( s, edge1.xyz );
const float v = f * dot( D, q );
if (v < 0 && u + v > 1) continue;
if (v < 0 || u + v > 1) continue;
const float d = f * dot( edge2.xyz, q );
if (d > 0.0f && d < t) hit = (float4)(t = d, u, v, as_float( triIdx ));
if (d > 0.0f && d < hit.x) hit = (float4)(d, u, v, as_float( triIdx ));
}
if (stackPtr == 0) break;
node = stack[--stackPtr];
Expand All @@ -68,8 +68,8 @@ void kernel traverse_ailalaine( global struct BVHNodeAlt* altNode, global unsign
const float3 mintb = fmin( t1b, t2b ), maxtb = fmax( t1b, t2b );
const float tmina = fmax( fmax( fmax( minta.x, minta.y ), minta.z ), 0 );
const float tminb = fmax( fmax( fmax( mintb.x, mintb.y ), mintb.z ), 0 );
const float tmaxa = fmin( fmin( fmin( maxta.x, maxta.y ), maxta.z ), t );
const float tmaxb = fmin( fmin( fmin( maxtb.x, maxtb.y ), maxtb.z ), t );
const float tmaxa = fmin( fmin( fmin( maxta.x, maxta.y ), maxta.z ), hit.x );
const float tmaxb = fmin( fmin( fmin( maxtb.x, maxtb.y ), maxtb.z ), hit.x );
float dist1 = tmina > tmaxa ? 1e30f : tmina;
float dist2 = tminb > tmaxb ? 1e30f : tminb;
// traverse nearest child first
Expand Down

0 comments on commit 55c162a

Please sign in to comment.