Skip to content

Commit

Permalink
Use custom memman in tinyocl.
Browse files Browse the repository at this point in the history
  • Loading branch information
jbikker committed Nov 18, 2024
1 parent c56c673 commit c8c3d3c
Show file tree
Hide file tree
Showing 3 changed files with 117 additions and 67 deletions.
40 changes: 11 additions & 29 deletions tiny_bvh.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ THE SOFTWARE.
// include fast AVX BVH builder
#if defined(__x86_64__) || defined(_M_X64) || defined(__wasm_simd128__) || defined(__wasm_relaxed_simd__)
#define BVH_USEAVX
#include "immintrin.h" // for __m128 and __m256
#elif defined(__aarch64__) || defined(_M_ARM64)
#define BVH_USENEON
#include "arm_neon.h"
Expand Down Expand Up @@ -113,25 +114,21 @@ THE SOFTWARE.
#endif

// aligned memory allocation
// note: formally size needs to be a multiple of 'alignment'.
// see https://en.cppreference.com/w/c/memory/aligned_alloc
// note: formally size needs to be a multiple of 'alignment'. See:
// https://en.cppreference.com/w/c/memory/aligned_alloc
// EMSCRIPTEN enforces this.
namespace tinybvh
{
// Copy of the same construct in tinyocl, different namespace.
namespace tinybvh {
inline size_t make_multiple_64( size_t x ) { return (x + 63) & ~0x3f; }
}

#ifdef _MSC_VER // Visual Studio / C11
#define ALIGNED( x ) __declspec( align( x ) )
namespace tinybvh {
inline void* default_aligned_malloc( size_t size, void* = nullptr )
{
return size == 0 ? 0 : _aligned_malloc( make_multiple_64( size ), 64 );
}
inline void default_aligned_free( void* ptr, void* = nullptr )
{
_aligned_free( ptr );
}
inline void default_aligned_free( void* ptr, void* = nullptr ) { _aligned_free( ptr ); }
}
#elif defined(__EMSCRIPTEN__) // EMSCRIPTEN - needs to be before gcc and clang to avoid misdetection
#define ALIGNED( x ) __attribute__( ( aligned( x ) ) )
Expand All @@ -143,21 +140,15 @@ inline void* default_aligned_malloc( size_t size, void* = nullptr )
{
return size == 0 ? 0 : _mm_malloc( size, 64 );
}
inline void default_aligned_free( void* ptr, void* = nullptr )
{
_mm_free( ptr );
}
inline void default_aligned_free( void* ptr, void* = nullptr ) { _mm_free( ptr ); }
}
#else
namespace tinybvh {
inline void* default_aligned_malloc( size_t size, void* = nullptr )
{
return size == 0 ? 0 : aligned_alloc( 64, make_multiple_64( size ) );
}
inline void default_aligned_free( void* ptr, void* = nullptr )
{
free( ptr );
}
inline void default_aligned_free( void* ptr, void* = nullptr ) { free( ptr ); }
}
#endif
#else // gcc / clang
Expand All @@ -169,27 +160,18 @@ inline void* default_aligned_malloc( size_t size, void* = nullptr )
{
return size == 0 ? 0 : _mm_malloc( make_multiple_64( size ), 64 );
}
inline void default_aligned_free( void* ptr, void* = nullptr )
{
_mm_free( ptr );
}
inline void default_aligned_free( void* ptr, void* = nullptr ) { _mm_free( ptr ); }
}
#else
namespace tinybvh {
inline void* default_aligned_malloc( size_t size, void* = nullptr )
{
return size == 0 ? 0 : aligned_alloc( 64, make_multiple_64( size ) );
}
inline void default_aligned_free( void* ptr, void* = nullptr )
{
free( ptr );
}
inline void default_aligned_free( void* ptr, void* = nullptr ) { free( ptr ); }
}
#endif
#endif
#ifdef BVH_USEAVX
#include "immintrin.h" // for __m128 and __m256
#endif

namespace tinybvh {

Expand Down Expand Up @@ -357,7 +339,7 @@ typedef bvhvec4 SIMDVEC4;

// ============================================================================
//
// R A Y T R A C I N G S T R U C T S / C L A S S E S
// T I N Y _ B V H I N T E R F A C E
//
// ============================================================================

Expand Down
20 changes: 10 additions & 10 deletions tiny_bvh_speedtest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@
#define BUILD_REFERENCE
// #define BUILD_AVX
#define BUILD_NEON
#define BUILD_SBVH
#define TRAVERSE_2WAY_ST
#define TRAVERSE_ALT2WAY_ST
#define TRAVERSE_SOA2WAY_ST
#define TRAVERSE_2WAY_MT
#define TRAVERSE_2WAY_MT_PACKET
#define TRAVERSE_2WAY_MT_DIVERGENT
#define TRAVERSE_OPTIMIZED_ST
//#define BUILD_SBVH
//#define TRAVERSE_2WAY_ST
//#define TRAVERSE_ALT2WAY_ST
//#define TRAVERSE_SOA2WAY_ST
//#define TRAVERSE_2WAY_MT
//#define TRAVERSE_2WAY_MT_PACKET
//#define TRAVERSE_2WAY_MT_DIVERGENT
//#define TRAVERSE_OPTIMIZED_ST
// #define EMBREE_BUILD // win64-only for now.
// #define EMBREE_TRAVERSE // win64-only for now.

Expand Down Expand Up @@ -164,7 +164,7 @@ int main()
s.seekp( 0 );
s.read( (char*)&verts, 4 );
printf( "Loading triangle data (%i tris).\n", verts );
verts *= 3, triangles = (bvhvec4*)default_aligned_malloc( verts * 16 );
verts *= 3, triangles = (bvhvec4*)tinybvh::default_aligned_malloc( verts * 16 );
s.read( (char*)triangles, verts * 16 );
#else
// generate a sphere flake scene
Expand All @@ -186,7 +186,7 @@ int main()
// generate primary rays in a cacheline-aligned buffer - and, for data locality:
// organized in 4x4 pixel tiles, 16 samples per pixel, so 256 rays per tile.
int N = 0;
Ray* rays = (Ray*)default_aligned_malloc( SCRWIDTH * SCRHEIGHT * 16 * sizeof( Ray ) );
Ray* rays = (Ray*)tinybvh::default_aligned_malloc( SCRWIDTH * SCRHEIGHT * 16 * sizeof( Ray ) );
for (int ty = 0; ty < SCRHEIGHT / 4; ty++) for (int tx = 0; tx < SCRWIDTH / 4; tx++)
{
for (int y = 0; y < 4; y++) for (int x = 0; x < 4; x++)
Expand Down
124 changes: 96 additions & 28 deletions tiny_ocl.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,40 +38,62 @@ THE SOFTWARE.
#include <vector>

// aligned memory allocation
// note: formally size needs to be a multiple of 'alignment'.
// see https://en.cppreference.com/w/c/memory/aligned_alloc
// note: formally size needs to be a multiple of 'alignment'. See:
// https://en.cppreference.com/w/c/memory/aligned_alloc
// EMSCRIPTEN enforces this.
#define MAKE_MULIPLE_64( x ) ( ( ( x ) + 63 ) & ( ~0x3f ) )
// Copy of the same construct in tinybvh, different namespace.
namespace tinyocl {
inline size_t make_multiple_64( size_t x ) { return (x + 63) & ~0x3f; }
}
#ifdef _MSC_VER // Visual Studio / C11
#include <malloc.h>
#include <math.h> // for sqrtf, fabs
#include <string.h> // for memset
#define ALIGNED_MALLOC( x ) ( ( x ) == 0 ? 0 : _aligned_malloc( ( MAKE_MULIPLE_64( x ) ), 64 ) )
#define ALIGNED_FREE( x ) _aligned_free( x )
#define ALIGNED( x ) __declspec( align( x ) )
namespace tinyocl {
inline void* default_aligned_malloc( size_t size, void* = nullptr )
{
return size == 0 ? 0 : _aligned_malloc( make_multiple_64( size ), 64 );
}
inline void default_aligned_free( void* ptr, void* = nullptr ) { _aligned_free( ptr ); }
}
#elif defined(__EMSCRIPTEN__) // EMSCRIPTEN - needs to be before gcc and clang to avoid misdetection
#include <cstdlib>
#include <cmath>
#include <cstring>
#define ALIGNED( x ) __attribute__( ( aligned( x ) ) )
#if defined(__wasm_simd128__) || defined(__wasm_relaxed_simd__)
// https://emscripten.org/docs/porting/simd.html
#include <xmmintrin.h>
#define ALIGNED_MALLOC( x ) ( ( x ) == 0 ? 0 : _mm_malloc( ( x ), 64 ) )
#define ALIGNED_FREE( x ) _mm_free( x )
namespace tinyocl {
inline void* default_aligned_malloc( size_t size, void* = nullptr )
{
return size == 0 ? 0 : _mm_malloc( size, 64 );
}
inline void default_aligned_free( void* ptr, void* = nullptr ) { _mm_free( ptr ); }
}
#else
#define ALIGNED_MALLOC( x ) ( ( x ) == 0 ? 0 : aligned_alloc( 64, MAKE_MULIPLE_64( x ) ) )
#define ALIGNED_FREE( x ) free( x )
namespace tinyocl {
inline void* default_aligned_malloc( size_t size, void* = nullptr )
{
return size == 0 ? 0 : aligned_alloc( 64, make_multiple_64( size ) );
}
inline void default_aligned_free( void* ptr, void* = nullptr ) { free( ptr ); }
}
#endif
#else // gcc / clang
#include <cstdlib>
#include <cmath>
#include <cstring>
#define ALIGNED( x ) __attribute__( ( aligned( x ) ) )
#if defined(__x86_64__) || defined(_M_X64)
#include <xmmintrin.h>
#define ALIGNED_MALLOC( x ) ( ( x ) == 0 ? 0 : _mm_malloc( ( MAKE_MULIPLE_64( x ) ), 64 ) )
#define ALIGNED_FREE( x ) _mm_free( x )
namespace tinyocl {
inline void* default_aligned_malloc( size_t size, void* = nullptr )
{
return size == 0 ? 0 : _mm_malloc( make_multiple_64( size ), 64 );
}
inline void default_aligned_free( void* ptr, void* = nullptr ) { _mm_free( ptr ); }
}
#else
#define ALIGNED_MALLOC( x ) ( ( x ) == 0 ? 0 : aligned_alloc( 64, ( MAKE_MULIPLE_64( x ) ) ) )
#define ALIGNED_FREE( x ) free( x )
namespace tinyocl {
inline void* default_aligned_malloc( size_t size, void* = nullptr )
{
return size == 0 ? 0 : aligned_alloc( 64, make_multiple_64( size ) );
}
inline void default_aligned_free( void* ptr, void* = nullptr ) { free( ptr ); }
}
#endif
#endif

Expand All @@ -86,6 +108,34 @@ struct oclint2
};
struct oclvec3 { float x, y, z; };

// ============================================================================
//
// T I N Y _ O C L I N T E R F A C E
//
// ============================================================================

// OpenCL context
// You only need to explicitly instantiate this if you want to override the
// default memory allocator used by tinyocl::Buffer. In all other cases, the
// first use of tinyocl (creating a buffer, loading a kernel) will take care
// of this for you transparently.
struct OpenCLContext
{
void* (*malloc)(size_t size, void* userdata) = default_aligned_malloc;
void (*free)(void* ptr, void* userdata) = default_aligned_free;
void* userdata = nullptr;
};
class OpenCL
{
public:
OpenCL( OpenCLContext ctx = {} ) : context( ctx ) { ocl = this; }
OpenCLContext context;
void* AlignedAlloc( size_t size );
void AlignedFree( void* ptr );
static OpenCL* GetInstance() { return ocl; }
inline static OpenCL* ocl = 0;
};

// OpenCL buffer
class Buffer
{
Expand All @@ -104,7 +154,9 @@ class Buffer
void CopyFromDevice( const int offset, const int size, const bool blocking = true );
void CopyTo( Buffer* buffer );
void Clear();
private:
// data members
public:
unsigned int* hostBuffer;
cl_mem deviceBuffer = 0;
unsigned int type, size /* in bytes */, textureID;
Expand All @@ -127,6 +179,7 @@ class Kernel
static cl_command_queue& GetQueue2() { return queue2; }
static cl_context& GetContext() { return context; }
static cl_device_id& GetDevice() { return device; }
static OpenCL ocl;
// run methods
#if 1
void Run( cl_event* eventToWaitFor = 0, cl_event* eventToSet = 0 );
Expand Down Expand Up @@ -445,6 +498,18 @@ static cl_int getPlatformID( cl_platform_id* platform )
return CL_SUCCESS;
}

// memory management
// ----------------------------------------------------------------------------
void* OpenCL::AlignedAlloc( size_t size )
{
return OpenCL::context.malloc ? OpenCL::context.malloc( size, OpenCL::context.userdata ) : nullptr;
}
void OpenCL::AlignedFree( void* ptr )
{
if (OpenCL::context.free)
OpenCL::context.free( ptr, OpenCL::context.userdata );
}

// Buffer constructor
// ----------------------------------------------------------------------------
Buffer::Buffer( unsigned int N, void* ptr, unsigned int t )
Expand Down Expand Up @@ -483,7 +548,7 @@ Buffer::~Buffer()
{
if (ownData)
{
ALIGNED_FREE( hostBuffer );
OpenCL::GetInstance()->AlignedFree( hostBuffer );
hostBuffer = 0;
}
if ((type & (TEXTURE | TARGET)) == 0) clReleaseMemObject( deviceBuffer );
Expand All @@ -497,7 +562,7 @@ unsigned int* Buffer::GetHostPtr()
if (size == 0) return 0;
if (!hostBuffer)
{
hostBuffer = (unsigned*)ALIGNED_MALLOC( size );
hostBuffer = (unsigned*)OpenCL::GetInstance()->AlignedAlloc( size );
ownData = true;
aligned = true;
}
Expand All @@ -512,7 +577,7 @@ void Buffer::CopyToDevice( const bool blocking )
cl_int error;
if (!hostBuffer)
{
hostBuffer = (unsigned*)ALIGNED_MALLOC( size );
hostBuffer = (unsigned*)OpenCL::GetInstance()->AlignedAlloc( size );
ownData = true;
aligned = true;
}
Expand All @@ -524,7 +589,7 @@ void Buffer::CopyToDevice( const int offset, const int byteCount, const bool blo
cl_int error;
if (!hostBuffer)
{
hostBuffer = (unsigned*)ALIGNED_MALLOC( size );
hostBuffer = (unsigned*)OpenCL::GetInstance()->AlignedAlloc( size );
ownData = true;
aligned = true;
}
Expand All @@ -548,7 +613,7 @@ void Buffer::CopyFromDevice( const bool blocking )
cl_int error;
if (!hostBuffer)
{
hostBuffer = (unsigned*)ALIGNED_MALLOC( size );
hostBuffer = (unsigned*)OpenCL::GetInstance()->AlignedAlloc( size );
ownData = true;
aligned = true;
}
Expand All @@ -560,7 +625,7 @@ void Buffer::CopyFromDevice( const int offset, const int byteCount, const bool b
cl_int error;
if (!hostBuffer)
{
hostBuffer = (unsigned*)ALIGNED_MALLOC( size );
hostBuffer = (unsigned*)OpenCL::GetInstance()->AlignedAlloc( size );
ownData = true;
aligned = true;
}
Expand Down Expand Up @@ -816,6 +881,9 @@ Kernel::~Kernel()
// ----------------------------------------------------------------------------
bool Kernel::InitCL()
{
// prepare memory management
if (!OpenCL::ocl) OpenCL::ocl = new OpenCL(); // use the default memory allocation functions
// prepare OpenCL for first use
cl_platform_id platform;
cl_device_id* devices;
cl_uint devCount;
Expand Down

0 comments on commit c8c3d3c

Please sign in to comment.