Skip to content

Commit

Permalink
Aila & Laine node layout GPU traversal.
Browse files Browse the repository at this point in the history
  • Loading branch information
jbikker committed Nov 18, 2024
1 parent 0922c82 commit d7bda0b
Show file tree
Hide file tree
Showing 5 changed files with 157 additions and 40 deletions.
6 changes: 4 additions & 2 deletions tiny_bvh.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

// Nov 18, '24: version 0.9.1 : Added custom alloc/free (tcantenot).
// Mov 16, '24: version 0.9.0 : (external) OpenCL in speedtest.
// Nov 15, '24: version 0.8.3 : Incremental update / bugfixes.
// Nov 14, '24: version 0.8.0 : ARM/NEON support.
// Nov 13, '24: version 0.7.5 : Support for WASM with EMSCRIPTEN.
Expand Down Expand Up @@ -94,7 +96,7 @@ THE SOFTWARE.
// library version
#define TINY_BVH_VERSION_MAJOR 0
#define TINY_BVH_VERSION_MINOR 9
#define TINY_BVH_VERSION_SUB 1
#define TINY_BVH_VERSION_SUB 2

// ============================================================================
//
Expand Down Expand Up @@ -351,7 +353,7 @@ struct Intersection
// squeezing this in the 'prim' field in some way.
// Using this data and the original triangle data, all other info for
// shading (such as normal, texture color etc.) can be reconstructed.
float t, u, v; // distance along ray & barycentric coordinates of the intersection
float t, u, v; // distance along ray & barycentric coordinates of the intersection
unsigned prim; // primitive index
};

Expand Down
82 changes: 50 additions & 32 deletions tiny_bvh_speedtest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#define SCRHEIGHT 600

// scene selection
#define LOADSPONZA
// #define LOADSPONZA

// GPU ray tracing
#define ENABLE_OPENCL
Expand Down Expand Up @@ -56,7 +56,7 @@ using namespace tinybvh;
bvhvec4* triangles = 0;
#include <fstream>
#else
ALIGNED( 16 ) bvhvec4 triangles[259 /* level 3 */ * 6 * 2 * 49 * 3]{};
ALIGNED( 64 ) bvhvec4 triangles[259 /* level 3 */ * 6 * 2 * 49 * 3]{};
#endif
int verts = 0;
BVH bvh;
Expand Down Expand Up @@ -150,17 +150,20 @@ int main()
#endif
printf( "----------------------------------------------------------------\n" );

#ifdef ENABLE_OPENCL

// load and compile the OpenCL kernel code
// This also triggers OpenCL init and device identification.
tinyocl::Kernel kernel( "traverse.cl", "traverse" );
printf( "----------------------------------------------------------------\n" );

#endif

#ifdef LOADSPONZA
// load raw vertex data for Crytek's Sponza
std::string filename{ "../testdata/cryteksponza.bin" };
std::string filename{ "testdata/cryteksponza.bin" };
std::fstream s{ filename, s.binary | s.in };
if (!s.is_open())
{
// try again, look in .\testdata
filename = std::string{ "./testdata/cryteksponza.bin" };
s = std::fstream{ filename, s.binary | s.in };
assert( s.is_open() );
}
assert( s.is_open() );
s.seekp( 0 );
s.read( (char*)&verts, 4 );
printf( "Loading triangle data (%i tris).\n", verts );
Expand Down Expand Up @@ -329,24 +332,38 @@ int main()
#ifdef GPU_2WAY

// trace the rays on GPU using OpenCL
printf( "- CPU, coherent, alt 2-way layout, ST: " );
printf( "- GPU, coherent, alt 2-way layout, ocl: " );
bvh.Convert( BVH::WALD_32BYTE, BVH::AILA_LAINE );
// load and compile the OpenCL kernel code
tinyocl::Kernel kernel( "traverse.cl", "traverse" );
// create an OpenCL buffer for the BVH nodes calculated by tiny_bvh.h
tinyocl::Buffer gpuNodes( bvh.usedBVHNodes * sizeof( BVH::BVHNodeAlt ) );
// copy the data to the host-side version of the buffer
memcpy( gpuNodes.GetHostPtr(), bvh.altNode, bvh.usedBVHNodes * sizeof( BVH::BVHNodeAlt ) );
// synchronize the host-side buffer to the gpu side
// create OpenCL buffers for the BVH data calculated by tiny_bvh.h
tinyocl::Buffer gpuNodes( bvh.usedBVHNodes * sizeof( BVH::BVHNodeAlt ), bvh.altNode );
tinyocl::Buffer idxData( bvh.idxCount * sizeof( unsigned ), bvh.triIdx );
tinyocl::Buffer triData( bvh.triCount * 3 * sizeof( tinybvh::bvhvec4 ), bvh.verts );
// synchronize the host-side data to the gpu side
gpuNodes.CopyToDevice();
idxData.CopyToDevice();
triData.CopyToDevice();
// create rays and send them to the gpu side
tinyocl::Buffer rayData( N * sizeof( tinybvh::Ray ), rays );
rayData.CopyToDevice();
// create an event to time the OpenCL kernel
cl_event event;
cl_ulong startTime, endTime;
// start timer and start kernel on gpu
t.reset();
float traceTimeGPU = 0;
kernel.SetArguments( &gpuNodes, &idxData, &triData, &rayData );
for (int pass = 0; pass < 3; pass++)
{
kernel.SetArguments( &gpuNodes );
kernel.Run( 32 ); // for now, todo.
kernel.Run( N, 64, 0, &event ); // for now, todo.
clWaitForEvents(1, &event ); // OpenCL kernsl run asynchronously
clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_START, sizeof( cl_ulong ), &startTime, 0 );
clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_END, sizeof( cl_ulong ), &endTime, 0 );
traceTimeGPU += (endTime - startTime) * 1e-9f; // event timing is in nanoseconds
}
float traceTimeGPU = t.elapsed() / 3.0f;
// get results from GPU - this also syncs the queue.
rayData.CopyFromDevice();
// report on timing
traceTimeGPU /= 3.0f;
mrays = (float)N / traceTimeGPU;
printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeGPU * 1000, (float)N * 1e-6f, mrays * 1e-6f );

Expand Down Expand Up @@ -451,17 +468,17 @@ int main()
bvh.Build( triangles, verts / 3 ); // rebuild with full splitting.
bvh.Optimize( 1000000 );
bvh.MergeLeafs();
printf( "done (%.2fs). New: %i nodes, SAH=%.2f\n", t.elapsed(), bvh.NodeCount( BVH::WALD_32BYTE ), bvh.SAHCost() );
bvh.Convert( BVH::WALD_32BYTE, BVH::ALT_SOA );
for (int i = 0; i < N; i += 2) bvh.Intersect( rays[i], BVH::ALT_SOA ); // re-warm
printf( "- CPU, coherent, 2-way optimized, ST: " );
t.reset();
for (int pass = 0; pass < 3; pass++)
for (int i = 0; i < N; i++) bvh.Intersect( rays[i], BVH::ALT_SOA );
float traceTimeOpt = t.elapsed() / 3.0f;
mrays = (float)N / traceTimeOpt;
printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeOpt * 1000, (float)N * 1e-6f, mrays * 1e-6f );
}
printf( "done (%.2fs). New: %i nodes, SAH=%.2f\n", t.elapsed(), bvh.NodeCount( BVH::WALD_32BYTE ), bvh.SAHCost() );
bvh.Convert( BVH::WALD_32BYTE, BVH::ALT_SOA );
for (int i = 0; i < N; i += 2) bvh.Intersect( rays[i], BVH::ALT_SOA ); // re-warm
printf( "- CPU, coherent, 2-way optimized, ST: " );
t.reset();
for (int pass = 0; pass < 3; pass++)
for (int i = 0; i < N; i++) bvh.Intersect( rays[i], BVH::ALT_SOA );
float traceTimeOpt = t.elapsed() / 3.0f;
mrays = (float)N / traceTimeOpt;
printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeOpt * 1000, (float)N * 1e-6f, mrays * 1e-6f );

#endif

Expand All @@ -470,7 +487,7 @@ int main()
// trace all rays three times to estimate average performance
// - coherent, Embree, single-threaded
printf( "- CPU, coherent, Embree BVH, Embree ST: " );
struct RTCRayHit* rayhits = (RTCRayHit*)default_malloc( SCRWIDTH * SCRHEIGHT * 16 * sizeof( RTCRayHit ) );
struct RTCRayHit* rayhits = (RTCRayHit*)tinybvh::malloc64( SCRWIDTH * SCRHEIGHT * 16 * sizeof( RTCRayHit ) );
// copy our rays to Embree format
for (int i = 0; i < N; i++)
{
Expand All @@ -494,6 +511,7 @@ int main()
}
mrays = (float)N / traceTimeEmbree;
printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeEmbree * 1000, (float)N * 1e-6f, mrays * 1e-6f );
tinybvh::free64( rayhits );

#endif

Expand Down
16 changes: 14 additions & 2 deletions tiny_ocl.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

// Nov 15, '24: version 0.1.0 : Accidentally started another tiny lib
// Nov 18, '24: version 0.1.1 : Added custom alloc/free.
// Nov 15, '24: version 0.1.0 : Accidentally started another tiny lib.

//
// Use this in *one* .c or .cpp
Expand Down Expand Up @@ -306,7 +307,17 @@ class Kernel
template<class T> void SetArgument( int idx, T value )
{
CheckCLStarted();
clSetKernelArg( kernel, idx, sizeof( T ), &value );
if (sizeof( T ) == 12)
{
// probably int3 / float3; pad to 16 bytes
unsigned tmp[4] = {};
memcpy( tmp, &value, 12 );
clSetKernelArg( kernel, idx, 16, &value );
}
else
{
clSetKernelArg( kernel, idx, sizeof( T ), &value );
}
}
// other methods
public:
Expand Down Expand Up @@ -357,6 +368,7 @@ using namespace tinyocl;
#else
#include <unistd.h>
#endif
#include <fstream>

#define CHECKCL(r) CheckCL( r, __FILE__, __LINE__ )

Expand Down
89 changes: 87 additions & 2 deletions traverse.cl
Original file line number Diff line number Diff line change
@@ -1,6 +1,91 @@
// gpu-side code for ray traversal

void kernel traverse( global float4* nodeData )
// Note: We are taking in nodes and rays as collection of floa4's here.
// You can use structs in OpenCL, which will be more convenient and
// clear here. Be careful though: float3 / int3 struct members are padded
// to 16 bytes in OpenCL.

void kernel traverse( global float4* bvhNode, global unsigned* idx, global float4* verts, global float4* rayData )
{
// placeholder
// fetch ray
unsigned threadId = get_global_id( 0 );
float3 O = rayData[threadId * 4 + 0].xyz;
float3 D = rayData[threadId * 4 + 1].xyz;
float3 rD = rayData[threadId * 4 + 2].xyz;
float4 hit = rayData[threadId * 4 + 3];
hit.x = 1e30f;
// traverse BVH
unsigned node = 0, stack[64], stackPtr = 0;
while (1)
{
// fetch the node
const float4 lmin = bvhNode[node * 4 + 0];
const float4 lmax = bvhNode[node * 4 + 1];
const float4 rmin = bvhNode[node * 4 + 2];
const float4 rmax = bvhNode[node * 4 + 3];
const unsigned triCount = as_uint( rmin.w );
if (triCount > 0)
{
// process leaf node
const unsigned firstTri = as_uint( rmax.w );
for (unsigned i = 0; i < triCount; i++)
{
const unsigned triIdx = idx[firstTri + i];
const float4* tri = verts + 3 * triIdx;
// triangle intersection
const float4 edge1 = tri[1] - tri[0], edge2 = tri[2] - tri[0];
const float3 h = cross( D, edge2.xyz );
const float a = dot( edge1.xyz, h );
if (fabs( a ) >= 0.0000001f)
{
const float f = 1 / a;
const float3 s = O - tri[0].xyz;
const float u = f * dot( s, h );
if (u >= 0 && u <= 1)
{
const float3 q = cross( s, edge1.xyz );
const float v = f * dot( D, q );
if (v >= 0 && u + v <= 1)
{
const float d = f * dot( edge2.xyz, q );
if (d > 0.0f && d < hit.x /* i.e., ray.t */)
hit = (float4)(d, u, v, as_float( triIdx ));
}
}
}
}
if (stackPtr == 0) break;
node = stack[--stackPtr];
continue;
}
unsigned left = as_uint( lmin.w ), right = as_uint( lmax.w );
// child AABB intersection tests
const float3 t1a = (lmin.xyz - O) * rD, t2a = (lmax.xyz - O) * rD;
const float3 t1b = (rmin.xyz - O) * rD, t2b = (rmax.xyz - O) * rD;
const float3 minta = fmin( t1a, t2a ), maxta = fmax( t1a, t2a );
const float3 mintb = fmin( t1b, t2b ), maxtb = fmax( t1b, t2b );
const float tmina = fmax( fmax( fmax( minta.x, minta.y ), minta.z ), 0 );
const float tminb = fmax( fmax( fmax( mintb.x, mintb.y ), mintb.z ), 0 );
const float tmaxa = fmin( fmin( fmin( maxta.x, maxta.y ), maxta.z ), hit.x );
const float tmaxb = fmin( fmin( fmin( maxtb.x, maxtb.y ), maxtb.z ), hit.x );
float dist1 = tmina > tmaxa ? 1e30f : tmina;
float dist2 = tminb > tmaxb ? 1e30f : tminb;
// traverse nearest child first
if (dist1 > dist2)
{
float h = dist1; dist1 = dist2; dist2 = h;
unsigned t = left; left = right; right = t;
}
if (dist1 == 1e30f)
{
if (stackPtr == 0) break; else node = stack[--stackPtr];
}
else
{
node = left;
if (dist2 != 1e30f) stack[stackPtr++] = right;
}
}
// write back intersection result
rayData[threadId * 4 + 3] = hit;
}
4 changes: 2 additions & 2 deletions vcproj/tiny_bvh_speedtest.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@
<EnableEnhancedInstructionSet>AdvancedVectorExtensions</EnableEnhancedInstructionSet>
<FloatingPointModel>Fast</FloatingPointModel>
<OpenMPSupport>true</OpenMPSupport>
<AdditionalIncludeDirectories>external/OpenCL/inc/;../external/OpenCL/inc/</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>../external/OpenCL/inc/;../external/embree/include</AdditionalIncludeDirectories>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
<Link>
Expand All @@ -138,7 +138,7 @@
<EnableEnhancedInstructionSet>AdvancedVectorExtensions</EnableEnhancedInstructionSet>
<FloatingPointModel>Fast</FloatingPointModel>
<OpenMPSupport>true</OpenMPSupport>
<AdditionalIncludeDirectories>external/OpenCL/inc/;../external/OpenCL/inc/</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>../external/OpenCL/inc/;../external/embree/include</AdditionalIncludeDirectories>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
<Link>
Expand Down

0 comments on commit d7bda0b

Please sign in to comment.