Skip to content

Improvements to workgroup reduce + scan #876

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 27 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
09f16c2
minor fixes, example
keptsecret Apr 28, 2025
6f5f8b0
bug fixes and example
keptsecret Apr 28, 2025
1bac247
fix to data accessor indexing
keptsecret Apr 29, 2025
305ac7b
added template spec for vector dim 1
keptsecret Apr 29, 2025
c08063d
added inclusive scan
keptsecret Apr 29, 2025
b1d804f
exclusive scan working
keptsecret Apr 30, 2025
3cf98ab
removed outdated comment
keptsecret Apr 30, 2025
7b310e0
minor changes to config usage
keptsecret May 1, 2025
4b4e7e8
add 1 level scans
keptsecret May 1, 2025
2e5f29f
fixes to 1 level scans
keptsecret May 2, 2025
054b269
added handling >1 vectors on level 1 scan (untested)
keptsecret May 2, 2025
1b5282c
move load/store smem into scan funcs, setup config for 3 levels
keptsecret May 5, 2025
c6dc5bc
change to use coalesced indexing for 2-level scans
keptsecret May 6, 2025
aa0c36c
added 3-level scans
keptsecret May 6, 2025
74c359b
minor bug fixes
keptsecret May 6, 2025
ce244e2
changes to data accessor usage
keptsecret May 7, 2025
90b19d8
wg reduction uses reduce instead of scan
keptsecret May 8, 2025
d2a1663
fixes to calculating levels in config
keptsecret May 9, 2025
ea39d9e
fixes to 3-level scan
keptsecret May 12, 2025
2982e5e
Merge branch 'master' into improve-workgroup-scan-2
keptsecret May 13, 2025
1c0e72e
split config into new file
keptsecret May 14, 2025
59d02fe
merge master
keptsecret May 15, 2025
507904f
minor fixes
keptsecret May 15, 2025
542592f
soome changes to arithmetic config
keptsecret May 15, 2025
a9930a0
removed referencing workgroupID in scans
keptsecret May 15, 2025
55d89c5
no need to store locals in reduce
keptsecret May 16, 2025
4e4f26e
added workgroup accessor concepts, refactor accessor usage
keptsecret May 16, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_WORKGROUP_ARITHMETIC_INCLUDED_
#define _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_WORKGROUP_ARITHMETIC_INCLUDED_

#include "nbl/builtin/hlsl/concepts.hlsl"

namespace nbl
{
namespace hlsl
{
namespace workgroup2
{

// Concept: ArithmeticSharedMemoryAccessor<T>
// Declared through the NBL_CONCEPT_* macro protocol (the macros themselves come from
// concepts.hlsl and are not visible here, so the exact expansion is not described).
// As written, the concept lists three required expressions for a value `accessor` of type T
// and two uint32_t values `index` and `val`, each paired with `is_same_v` and `void`
// (presumably meaning the expression type must be void):
//   - accessor.template set<uint32_t>(index, val)
//   - accessor.template get<uint32_t>(index, val)
//   - accessor.workgroupExecutionAndMemoryBarrier()
#define NBL_CONCEPT_NAME ArithmeticSharedMemoryAccessor
#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)
#define NBL_CONCEPT_TPLT_PRM_NAMES (T)
// Placeholder values used inside the requirement expressions, as (name, type) pairs.
#define NBL_CONCEPT_PARAM_0 (accessor, T)
#define NBL_CONCEPT_PARAM_1 (index, uint32_t)
#define NBL_CONCEPT_PARAM_2 (val, uint32_t)
// The argument matches the number of NBL_CONCEPT_PARAM_* entries defined above.
NBL_CONCEPT_BEGIN(3)
// Short aliases so the requirement list below can name the placeholders directly.
#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0
#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1
#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2
NBL_CONCEPT_END(
((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set<uint32_t>(index, val)), is_same_v, void))
((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get<uint32_t>(index, val)), is_same_v, void))
((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.workgroupExecutionAndMemoryBarrier()), is_same_v, void))
);
// Remove the short aliases again so they do not leak into code that follows.
#undef val
#undef index
#undef accessor
// Closes the concept definition (presumably also resets the NBL_CONCEPT_* helper macros - confirm in __end.hlsl).
#include <nbl/builtin/hlsl/concepts/__end.hlsl>

// Concept: ArithmeticDataAccessor<T>
// Declared through the NBL_CONCEPT_* macro protocol (expansion not visible here).
// As written, the concept lists three required expressions for a value `accessor` of type T
// and two uint32_t values `index` and `val`, each paired with `is_same_v` and `void`
// (presumably meaning the expression type must be void):
//   - accessor.template set<uint32_t>(index, val)
//   - accessor.template get<uint32_t>(index, val)
//   - accessor.workgroupExecutionAndMemoryBarrier()
// NOTE(review): this requirement list is token-for-token the same as the one of
// ArithmeticSharedMemoryAccessor, including the barrier - confirm a data accessor is
// really meant to provide workgroupExecutionAndMemoryBarrier().
#define NBL_CONCEPT_NAME ArithmeticDataAccessor
#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)
#define NBL_CONCEPT_TPLT_PRM_NAMES (T)
// Placeholder values used inside the requirement expressions, as (name, type) pairs.
#define NBL_CONCEPT_PARAM_0 (accessor, T)
#define NBL_CONCEPT_PARAM_1 (index, uint32_t)
#define NBL_CONCEPT_PARAM_2 (val, uint32_t)
// The argument matches the number of NBL_CONCEPT_PARAM_* entries defined above.
NBL_CONCEPT_BEGIN(3)
// Short aliases so the requirement list below can name the placeholders directly.
#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0
#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1
#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2
NBL_CONCEPT_END(
((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set<uint32_t>(index, val)), is_same_v, void))
((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get<uint32_t>(index, val)), is_same_v, void))
((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.workgroupExecutionAndMemoryBarrier()), is_same_v, void))
);
// Remove the short aliases again so they do not leak into code that follows.
#undef val
#undef index
#undef accessor
// Closes the concept definition (presumably also resets the NBL_CONCEPT_* helper macros - confirm in __end.hlsl).
#include <nbl/builtin/hlsl/concepts/__end.hlsl>

}
}
}

#endif
14 changes: 14 additions & 0 deletions include/nbl/builtin/hlsl/subgroup2/ballot.hlsl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,20 @@ namespace hlsl
namespace subgroup2
{

// Returns the index of the last invocation of the subgroup.
//  - AssumeAllActive != 0 : the result is simply gl_SubgroupSize()-1, no ballot is issued.
//  - AssumeAllActive == 0 : the result is the most significant set bit of subgroupBallot(true),
//    i.e. of the ballot in which every participating invocation votes `true`.
template<int32_t AssumeAllActive=false>
uint32_t LastSubgroupInvocation()
{
    // general case first: ask the subgroup which invocations actually take part
    if (!AssumeAllActive)
        return glsl::subgroupBallotFindMSB(glsl::subgroupBallot(true));

    // caller promised a full subgroup, so the last invocation is known without a ballot
    return glsl::gl_SubgroupSize()-1;
}

// True only for the invocation whose gl_SubgroupInvocationID() equals the value returned by
// LastSubgroupInvocation() with its default template argument (AssumeAllActive=false).
bool ElectLast()
{
    const uint32_t lastInvocation = LastSubgroupInvocation();
    return lastInvocation==glsl::gl_SubgroupInvocationID();
}

template<uint32_t SubgroupSizeLog2>
struct Configuration
{
Expand Down
1 change: 1 addition & 0 deletions include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ struct vector_traits<vector<T, DIMENSION> >\
NBL_CONSTEXPR_STATIC_INLINE bool IsVector = true;\
};\

DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(1)
DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(2)
DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(3)
DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(4)
Expand Down
59 changes: 59 additions & 0 deletions include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O.
// This file is part of the "Nabla Engine".
// For conditions of distribution and use, see copyright notice in nabla.h
#ifndef _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_INCLUDED_
#define _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_INCLUDED_


#include "nbl/builtin/hlsl/functional.hlsl"
#include "nbl/builtin/hlsl/workgroup/ballot.hlsl"
#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl"
#include "nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl"
#include "nbl/builtin/hlsl/workgroup2/shared_scan.hlsl"


namespace nbl
{
namespace hlsl
{
namespace workgroup2
{

// Workgroup-wide reduction entry point.
// Template parameters:
//   Config              - supplies Config::LevelCount, which picks the impl::reduce variant
//   BinOp               - forwarded unchanged to impl::reduce
//   device_capabilities - forwarded unchanged to impl::reduce (defaults to void)
// __call is only available when DataAccessor satisfies ArithmeticDataAccessor and
// ScratchAccessor satisfies ArithmeticSharedMemoryAccessor. It default-constructs
// impl::reduce<Config,BinOp,Config::LevelCount,device_capabilities>, hands both accessors to
// its __call and returns void.
template<class Config, class BinOp, class device_capabilities=void>
struct reduction
{
template<class DataAccessor, class ScratchAccessor NBL_FUNC_REQUIRES(ArithmeticDataAccessor<DataAccessor> && ArithmeticSharedMemoryAccessor<ScratchAccessor>)
static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm thinking the reduction should return the reduction instead of void and not use the dataAccessor for setting at all (you can even call it a ReadOnlyAccessor)

{
impl::reduce<Config,BinOp,Config::LevelCount,device_capabilities> fn;
fn.template __call<DataAccessor,ScratchAccessor>(dataAccessor, scratchAccessor);
}
};

// Workgroup-wide inclusive scan entry point.
// Thin front-end over impl::scan: the third template argument is fixed to `false` here
// (exclusive_scan passes `true` in the same position) and the level count is taken from
// Config::LevelCount. BinOp and device_capabilities are passed through untouched.
template<class Config, class BinOp, class device_capabilities=void>
struct inclusive_scan
{
    // Requires DataAccessor to model ArithmeticDataAccessor and ScratchAccessor to model
    // ArithmeticSharedMemoryAccessor; both accessors are forwarded by reference.
    template<class DataAccessor, class ScratchAccessor NBL_FUNC_REQUIRES(ArithmeticDataAccessor<DataAccessor> && ArithmeticSharedMemoryAccessor<ScratchAccessor>)
    static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor)
    {
        typedef impl::scan<Config,BinOp,false,Config::LevelCount,device_capabilities> scan_impl_t;

        scan_impl_t scanImpl;
        scanImpl.template __call<DataAccessor,ScratchAccessor>(dataAccessor, scratchAccessor);
    }
};

// Workgroup-wide exclusive scan entry point.
// Thin front-end over impl::scan: the third template argument is fixed to `true` here
// (inclusive_scan passes `false` in the same position) and the level count is taken from
// Config::LevelCount. BinOp and device_capabilities are passed through untouched.
template<class Config, class BinOp, class device_capabilities=void>
struct exclusive_scan
{
    // Requires DataAccessor to model ArithmeticDataAccessor and ScratchAccessor to model
    // ArithmeticSharedMemoryAccessor; both accessors are forwarded by reference.
    template<class DataAccessor, class ScratchAccessor NBL_FUNC_REQUIRES(ArithmeticDataAccessor<DataAccessor> && ArithmeticSharedMemoryAccessor<ScratchAccessor>)
    static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor)
    {
        typedef impl::scan<Config,BinOp,true,Config::LevelCount,device_capabilities> scan_impl_t;

        scan_impl_t scanImpl;
        scanImpl.template __call<DataAccessor,ScratchAccessor>(dataAccessor, scratchAccessor);
    }
};

}
}
}

#endif
94 changes: 94 additions & 0 deletions include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O.
// This file is part of the "Nabla Engine".
// For conditions of distribution and use, see copyright notice in nabla.h
#ifndef _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_CONFIG_INCLUDED_
#define _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_CONFIG_INCLUDED_

#include "nbl/builtin/hlsl/cpp_compat.hlsl"

namespace nbl
{
namespace hlsl
{
namespace workgroup2
{

namespace impl
{
// Compile-time helper deriving two constants from the log2 workgroup and subgroup sizes:
//   levels - 1 when WorkgroupSizeLog2 is not greater than SubgroupSizeLog2; otherwise
//            3 when WorkgroupSizeLog2 > 2*SubgroupSizeLog2+2, else 2
//            (assuming conditional_value<C,T,A,B>::value yields A when C holds - confirm).
//   value  - max(WorkgroupSizeLog2-SubgroupSizeLog2, SubgroupSizeLog2) + SubgroupSizeLog2,
//            consumed by ArithmeticConfiguration as the log2 of VirtualWorkgroupSize.
// Explicit specializations further down in this namespace replace these formulas for
// selected (WorkgroupSizeLog2, SubgroupSizeLog2) pairs.
template<uint16_t WorkgroupSizeLog2, uint16_t SubgroupSizeLog2>
struct virtual_wg_size_log2
{
static_assert(WorkgroupSizeLog2>=SubgroupSizeLog2, "WorkgroupSize cannot be smaller than SubgroupSize");
// NOTE(review): this caps WorkgroupSizeLog2 at SubgroupSizeLog2+4, while the 3-level branch
// below needs WorkgroupSizeLog2 > 2*SubgroupSizeLog2+2. For SubgroupSizeLog2>=2 both cannot
// hold at once, so in this primary template `levels` can never become 3 - confirm the
// intended upper bound.
static_assert(WorkgroupSizeLog2<=SubgroupSizeLog2+4, "WorkgroupSize cannot be larger than SubgroupSize*16");
// outer condition separates 1 level from several; inner condition separates 3 levels from 2
NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2),uint16_t,conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2*2+2),uint16_t,3,2>::value,1>::value;
// the max is evaluated as uint32_t and the sum is then stored in a uint16_t
NBL_CONSTEXPR_STATIC_INLINE uint16_t value = mpl::max_v<uint32_t, WorkgroupSizeLog2-SubgroupSizeLog2, SubgroupSizeLog2>+SubgroupSizeLog2;
};

// Compile-time helper computing three per-level "items per invocation" constants.
// Template parameters:
//   VirtualWorkgroup       - only VirtualWorkgroup::levels is read
//   BaseItemsPerInvocation - copied verbatim into value0
//   WorkgroupSizeLog2, SubgroupSizeLog2 - log2 sizes used for the remaining constants
template<class VirtualWorkgroup, uint16_t BaseItemsPerInvocation, uint16_t WorkgroupSizeLog2, uint16_t SubgroupSizeLog2>
struct items_per_invocation
{
// max(WorkgroupSizeLog2 - SubgroupSizeLog2*levels, 0); the max is taken as int16_t so that a
// negative difference is meant to clamp to 0 (presumably relies on the subtraction being
// evaluated in a signed type - confirm for the HLSL target).
NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocationProductLog2 = mpl::max_v<int16_t,WorkgroupSizeLog2-SubgroupSizeLog2*VirtualWorkgroup::levels,0>;
// first constant: exactly what the user passed in
NBL_CONSTEXPR_STATIC_INLINE uint16_t value0 = BaseItemsPerInvocation;
// second constant: 1 << ItemsPerInvocationProductLog2, except that with 3 levels the shift
// amount is limited to at most 2 (i.e. the constant is at most 4)
NBL_CONSTEXPR_STATIC_INLINE uint16_t value1 = uint16_t(0x1u) << conditional_value<VirtualWorkgroup::levels==3, uint16_t,mpl::min_v<uint16_t,ItemsPerInvocationProductLog2,2>, ItemsPerInvocationProductLog2>::value;
// third constant: 1 << max(ItemsPerInvocationProductLog2-2, 0)
NBL_CONSTEXPR_STATIC_INLINE uint16_t value2 = uint16_t(0x1u) << mpl::max_v<int16_t,ItemsPerInvocationProductLog2-2,0>;
};

// Explicit specializations for (WorkgroupSizeLog2, SubgroupSizeLog2) pairs that the formulas
// of the primary virtual_wg_size_log2 template do not fit.
// SPECIALIZE_VIRTUAL_WG_SIZE_CASE(WGLOG2, SGLOG2, LEVELS, VALUE) emits a full specialization
// virtual_wg_size_log2<WGLOG2, SGLOG2> with hard-coded `levels` = LEVELS and `value` = VALUE.
// The specializations carry no static_asserts, so the checks of the primary template do not
// apply to them.
#define SPECIALIZE_VIRTUAL_WG_SIZE_CASE(WGLOG2, SGLOG2, LEVELS, VALUE) template<>\
struct virtual_wg_size_log2<WGLOG2, SGLOG2>\
{\
NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = LEVELS;\
NBL_CONSTEXPR_STATIC_INLINE uint16_t value = VALUE;\
};\

// workgroup log2 11 with subgroup log2 4: 3 levels, virtual workgroup log2 12
SPECIALIZE_VIRTUAL_WG_SIZE_CASE(11,4,3,12);
// workgroup exactly one subgroup wide (log2 sizes 7 down to 2): 1 level, value equals that size
SPECIALIZE_VIRTUAL_WG_SIZE_CASE(7,7,1,7);
SPECIALIZE_VIRTUAL_WG_SIZE_CASE(6,6,1,6);
SPECIALIZE_VIRTUAL_WG_SIZE_CASE(5,5,1,5);
SPECIALIZE_VIRTUAL_WG_SIZE_CASE(4,4,1,4);
SPECIALIZE_VIRTUAL_WG_SIZE_CASE(3,3,1,3);
SPECIALIZE_VIRTUAL_WG_SIZE_CASE(2,2,1,2);

// the helper macro is local to this header
#undef SPECIALIZE_VIRTUAL_WG_SIZE_CASE
}

// Compile-time configuration bundle for the workgroup2 arithmetic (reduce / scan) code.
// Template parameters:
//   _WorkgroupSizeLog2  - log2 of the workgroup size
//   _SubgroupSizeLog2   - log2 of the subgroup size
//   _ItemsPerInvocation - passed to impl::items_per_invocation as its base item count
// Every member is a compile-time constant or a type alias; the struct holds no data.
template<uint16_t _WorkgroupSizeLog2, uint16_t _SubgroupSizeLog2, uint16_t _ItemsPerInvocation>
struct ArithmeticConfiguration
{
// sizes as given, plus their power-of-two expansions
NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSizeLog2 = _WorkgroupSizeLog2;
NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << WorkgroupSizeLog2;
NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = _SubgroupSizeLog2;
NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2;

// must have at least enough level 0 outputs to feed a single subgroup
NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupsPerVirtualWorkgroupLog2 = mpl::max_v<uint16_t, WorkgroupSizeLog2-SubgroupSizeLog2, SubgroupSizeLog2>;
NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupsPerVirtualWorkgroup = uint16_t(0x1u) << SubgroupsPerVirtualWorkgroupLog2;

// level count and virtual workgroup size come from impl::virtual_wg_size_log2 (or one of its specializations)
using virtual_wg_t = impl::virtual_wg_size_log2<WorkgroupSizeLog2, SubgroupSizeLog2>;
NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = virtual_wg_t::levels;
NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualWorkgroupSize = uint16_t(0x1u) << virtual_wg_t::value;
// the three ItemsPerInvocation_N constants mirror items_per_invoc_t::value0..value2
using items_per_invoc_t = impl::items_per_invocation<virtual_wg_t, _ItemsPerInvocation, WorkgroupSizeLog2, SubgroupSizeLog2>;
// NBL_CONSTEXPR_STATIC_INLINE uint32_t2 ItemsPerInvocation; TODO? doesn't allow inline definitions for uint32_t2 for some reason, uint32_t[2] as well ; declaring out of line results in not constant expression
NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_0 = items_per_invoc_t::value0;
NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_1 = items_per_invoc_t::value1;
NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_2 = items_per_invoc_t::value2;
// rejects any configuration whose second items-per-invocation constant exceeds 4
static_assert(ItemsPerInvocation_1<=4, "3 level scan would have been needed with this config!");

// 0 when LevelCount==1; otherwise SubgroupSize*ItemsPerInvocation_1, plus
// SubgroupSize*ItemsPerInvocation_2 when LevelCount==3
// (presumably the number of scratch elements the scan levels need - confirm against the users of ElementCount)
NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementCount = conditional_value<LevelCount==1,uint16_t,0,conditional_value<LevelCount==3,uint16_t,SubgroupSize*ItemsPerInvocation_2,0>::value + SubgroupSize*ItemsPerInvocation_1>::value;
};

// Type trait telling whether a type is an instantiation of ArithmeticConfiguration.
// Primary template: any type is "not a configuration" unless the specialization below matches.
template<typename T>
struct is_configuration : bool_constant<false> {};

// Matches every ArithmeticConfiguration<...>, whatever its three uint16_t arguments are.
template<uint16_t WorkgroupSizeLog2, uint16_t SubgroupSizeLog2, uint16_t ItemsPerInvocation>
struct is_configuration<ArithmeticConfiguration<WorkgroupSizeLog2,SubgroupSizeLog2,ItemsPerInvocation> > : bool_constant<true> {};

// Variable-template shorthand for is_configuration<T>::value.
template<class T>
NBL_CONSTEXPR bool is_configuration_v = is_configuration<T>::value;


}
}
}

#endif
Loading