-
Notifications
You must be signed in to change notification settings - Fork 65
Improvements to workgroup reduce + scan #876
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
keptsecret
wants to merge
56
commits into
master
Choose a base branch
from
improve-workgroup-scan-2
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from 21 commits
Commits
Show all changes
56 commits
Select commit
Hold shift + click to select a range
09f16c2
minor fixes, example
keptsecret 6f5f8b0
bug fixes and example
keptsecret 1bac247
fix to data accessor indexing
keptsecret 305ac7b
added template spec for vector dim 1
keptsecret c08063d
added inclusive scan
keptsecret b1d804f
exclusive scan working
keptsecret 3cf98ab
removed outdated comment
keptsecret 7b310e0
minor changes to config usage
keptsecret 4b4e7e8
add 1 level scans
keptsecret 2e5f29f
fixes to 1 level scans
keptsecret 054b269
added handling >1 vectors on level 1 scan (untested)
keptsecret 1b5282c
move load/store smem into scan funcs, setup config for 3 levels
keptsecret c6dc5bc
change to use coalesced indexing for 2-level scans
keptsecret aa0c36c
added 3-level scans
keptsecret 74c359b
minor bug fixes
keptsecret ce244e2
changes to data accessor usage
keptsecret 90b19d8
wg reduction uses reduce instead of scan
keptsecret d2a1663
fixes to calculating levels in config
keptsecret ea39d9e
fixes to 3-level scan
keptsecret 2982e5e
Merge branch 'master' into improve-workgroup-scan-2
keptsecret 1c0e72e
split config into new file
keptsecret 59d02fe
merge master
keptsecret 507904f
minor fixes
keptsecret 542592f
some changes to arithmetic config
keptsecret a9930a0
removed referencing workgroupID in scans
keptsecret 55d89c5
no need to store locals in reduce
keptsecret 4e4f26e
added workgroup accessor concepts, refactor accessor usage
keptsecret 56f013e
Merge branch 'master' into improve-workgroup-scan-2
keptsecret 004c95a
fixed minor bug
keptsecret ccacddb
store temporaries with data accessor
keptsecret 9c59677
minor fixes
keptsecret eb44262
moved indexing functionality to config struct
keptsecret 573ce44
reduction returns value instead of saving directly to storage
keptsecret 49ca655
fixes to 2-level scan indexing
keptsecret a639145
fixes to 3-level scan and minor stuff
keptsecret 7751359
some minor fixes
keptsecret fd6f527
latest example
keptsecret 27d84c8
merge master, fix conflicts
keptsecret 350c6a3
more util funcs in config, fix some calculations
keptsecret 14e5d15
added generic data/shared mem accessors
keptsecret f07329e
fix include guard
keptsecret 48a7d16
changes to arithmetic accessor concepts
keptsecret 20a54be
concept macro for checking types
keptsecret d83ac5c
revert concept macro addition
keptsecret 00787bf
added generic read/write accessors
keptsecret c0dfc1e
more refactor for accessor concept changes
keptsecret 55840a3
don't pass scalar_t as index type
keptsecret d758ff7
refactor accessor to match accessor template
keptsecret b062ede
simplified indexing functions
keptsecret 472aa0b
more fixes to indexing
keptsecret c483941
share level 0 scan between 2-level and 3-level scans (and reduce)
keptsecret 951ff99
reduce duplicate vars in config
keptsecret 127c6d9
some fixes to indexing
keptsecret 90d3579
fix scans for level 1+
keptsecret 203c03a
some indexing fixes for 3-level reduce/scan
keptsecret 0b16307
fix 3-level scan downsweep step
keptsecret File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
Submodule examples_tests
updated
21 files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. | ||
// This file is part of the "Nabla Engine". | ||
// For conditions of distribution and use, see copyright notice in nabla.h | ||
#ifndef _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_INCLUDED_ | ||
#define _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_INCLUDED_ | ||
|
||
|
||
#include "nbl/builtin/hlsl/functional.hlsl" | ||
#include "nbl/builtin/hlsl/workgroup/ballot.hlsl" | ||
#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" | ||
#include "nbl/builtin/hlsl/workgroup2/shared_scan.hlsl" | ||
|
||
|
||
namespace nbl | ||
{ | ||
namespace hlsl | ||
{ | ||
namespace workgroup2 | ||
{ | ||
|
||
// Public entry point for a workgroup-wide reduction with binary operator `BinOp`. | |
// `__call` instantiates `impl::reduce` with `Config::LevelCount` as the level count and forwards both accessors to it unchanged. | |
// Both accessors are taken through NBL_REF_ARG and nothing is returned from `__call`. | |
// NOTE(review): `impl::reduce` is not defined in this file; presumably it comes from workgroup2/shared_scan.hlsl - confirm. | |
template<class Config, class BinOp, class device_capabilities=void> | |
struct reduction | |
{ | |
template<class DataAccessor, class ScratchAccessor> | |
static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) | |
devshgraphicsprogramming marked this conversation as resolved.
Show resolved
Hide resolved
|
||
{ | |
impl::reduce<Config,BinOp,Config::LevelCount,device_capabilities> fn; | |
fn.template __call<DataAccessor,ScratchAccessor>(dataAccessor, scratchAccessor); | |
} | |
}; | |
|
||
// Public entry point for a workgroup-wide inclusive scan with binary operator `BinOp`. | |
// `__call` instantiates `impl::scan` with `false` as its third template argument and `Config::LevelCount` as the level count, | |
// then forwards both accessors to it unchanged. | |
// NOTE(review): the third argument is presumably an "exclusive" flag (exclusive_scan passes `true`) - confirm against impl::scan. | |
template<class Config, class BinOp, class device_capabilities=void> | |
struct inclusive_scan | |
{ | |
template<class DataAccessor, class ScratchAccessor> | |
static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) | |
{ | |
impl::scan<Config,BinOp,false,Config::LevelCount,device_capabilities> fn; | |
fn.template __call<DataAccessor,ScratchAccessor>(dataAccessor, scratchAccessor); | |
} | |
}; | |
|
||
// Public entry point for a workgroup-wide exclusive scan with binary operator `BinOp`. | |
// `__call` instantiates `impl::scan` with `true` as its third template argument and `Config::LevelCount` as the level count, | |
// then forwards both accessors to it unchanged. | |
// NOTE(review): the third argument is presumably an "exclusive" flag (inclusive_scan passes `false`) - confirm against impl::scan. | |
template<class Config, class BinOp, class device_capabilities=void> | |
struct exclusive_scan | |
{ | |
template<class DataAccessor, class ScratchAccessor> | |
static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) | |
{ | |
impl::scan<Config,BinOp,true,Config::LevelCount,device_capabilities> fn; | |
fn.template __call<DataAccessor,ScratchAccessor>(dataAccessor, scratchAccessor); | |
} | |
}; | |
devshgraphicsprogramming marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
} | ||
} | ||
} | ||
|
||
#endif |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. | ||
// This file is part of the "Nabla Engine". | ||
// For conditions of distribution and use, see copyright notice in nabla.h | ||
#ifndef _NBL_BUILTIN_HLSL_WORKGROUP2_CONFIG_INCLUDED_ | ||
#define _NBL_BUILTIN_HLSL_WORKGROUP2_CONFIG_INCLUDED_ | ||
|
||
#include "nbl/builtin/hlsl/cpp_compat.hlsl" | ||
|
||
namespace nbl | ||
{ | ||
namespace hlsl | ||
{ | ||
namespace workgroup2 | ||
{ | ||
|
||
namespace impl | ||
{ | ||
// Compile-time helper deriving the number of scan levels and the log2 of the virtual workgroup size. | |
// levels: 1 when WorkgroupSizeLog2 <= SubgroupSizeLog2, | |
//         3 when WorkgroupSizeLog2 >  SubgroupSizeLog2*2+2, | |
//         2 otherwise. | |
// value:  max(WorkgroupSizeLog2-SubgroupSizeLog2, SubgroupSizeLog2) + SubgroupSizeLog2. | |
// NOTE(review): the max is instantiated with uint32_t while the result is stored as uint16_t, and the subtraction is | |
// negative when WorkgroupSizeLog2 < SubgroupSizeLog2; Configuration static_asserts against that case before using this. | |
template<uint16_t WorkgroupSizeLog2, uint16_t SubgroupSizeLog2> | |
struct virtual_wg_size_log2 | |
{ | |
NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2),uint16_t,conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2*2+2),uint16_t,3,2>::value,1>::value; | |
NBL_CONSTEXPR_STATIC_INLINE uint16_t value = mpl::max_v<uint32_t, WorkgroupSizeLog2-SubgroupSizeLog2, SubgroupSizeLog2>+SubgroupSizeLog2; | |
}; | |
|
||
// Compile-time helper computing the items-per-invocation count for each of up to three levels. | |
// ItemsPerInvocationProductLog2 = max(WorkgroupSizeLog2 - SubgroupSizeLog2*VirtualWorkgroup::levels, 0). | |
// value0: BaseItemsPerInvocation, passed through untouched. | |
// value1: 1 << min(ProductLog2, 2) when there are 3 levels (so at most 4), otherwise 1 << ProductLog2. | |
// value2: 1 << max(ProductLog2-2, 0); it is computed regardless of the level count. | |
// With 3 levels the two exponents sum to ProductLog2, i.e. value1*value2 == 1 << ProductLog2. | |
template<class VirtualWorkgroup, uint16_t BaseItemsPerInvocation, uint16_t WorkgroupSizeLog2, uint16_t SubgroupSizeLog2> | |
struct items_per_invocation | |
{ | |
NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocationProductLog2 = mpl::max_v<int16_t,WorkgroupSizeLog2-SubgroupSizeLog2*VirtualWorkgroup::levels,0>; | |
NBL_CONSTEXPR_STATIC_INLINE uint16_t value0 = BaseItemsPerInvocation; | |
NBL_CONSTEXPR_STATIC_INLINE uint16_t value1 = uint16_t(0x1u) << conditional_value<VirtualWorkgroup::levels==3, uint16_t,mpl::min_v<uint16_t,ItemsPerInvocationProductLog2,2>, ItemsPerInvocationProductLog2>::value; | |
NBL_CONSTEXPR_STATIC_INLINE uint16_t value2 = uint16_t(0x1u) << mpl::max_v<int16_t,ItemsPerInvocationProductLog2-2,0>; | |
}; | |
} | ||
|
||
// Compile-time configuration consumed by the workgroup2 reduce/scan entry points (they read Config::LevelCount). | |
// Template parameters: | |
//   _WorkgroupSizeLog2   - log2 of the workgroup size | |
//   _SubgroupSizeLog2    - log2 of the subgroup size, must not exceed _WorkgroupSizeLog2 (static_assert below) | |
//   _ItemsPerInvocation  - items per invocation on level 0, exposed as ItemsPerInvocation_0 | |
// All members are constants derived from these three parameters. | |
template<uint16_t _WorkgroupSizeLog2, uint16_t _SubgroupSizeLog2, uint16_t _ItemsPerInvocation> | |
struct Configuration | |
devshgraphicsprogramming marked this conversation as resolved.
Show resolved
Hide resolved
|
||
{ | |
NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSizeLog2 = _WorkgroupSizeLog2; | |
NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << WorkgroupSizeLog2; | |
NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = _SubgroupSizeLog2; | |
NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; | |
static_assert(WorkgroupSizeLog2>=_SubgroupSizeLog2, "WorkgroupSize cannot be smaller than SubgroupSize"); | |
|
||
// must have at least enough level 0 outputs to feed a single subgroup | |
NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupsPerVirtualWorkgroupLog2 = mpl::max_v<uint16_t, WorkgroupSizeLog2-SubgroupSizeLog2, SubgroupSizeLog2>; | |
NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupsPerVirtualWorkgroup = uint16_t(0x1u) << SubgroupsPerVirtualWorkgroupLog2; | |
|
||
// level count and virtual workgroup size (stored as a log2 in virtual_wg_t::value) come from impl::virtual_wg_size_log2 | |
using virtual_wg_t = impl::virtual_wg_size_log2<WorkgroupSizeLog2, SubgroupSizeLog2>; | |
NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = virtual_wg_t::levels; | |
NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualWorkgroupSize = uint16_t(0x1u) << virtual_wg_t::value; | |
// per-level item counts come from impl::items_per_invocation | |
using items_per_invoc_t = impl::items_per_invocation<virtual_wg_t, _ItemsPerInvocation, WorkgroupSizeLog2, SubgroupSizeLog2>; | |
// NBL_CONSTEXPR_STATIC_INLINE uint32_t2 ItemsPerInvocation; TODO? doesn't allow inline definitions for uint32_t2 for some reason, uint32_t[2] as well ; declaring out of line results in not constant expression | |
NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_0 = items_per_invoc_t::value0; | |
NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_1 = items_per_invoc_t::value1; | |
NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_2 = items_per_invoc_t::value2; | |
// rejects configurations whose level 1 would need more than 4 items per invocation | |
static_assert(ItemsPerInvocation_1<=4, "3 level scan would have been needed with this config!"); | |
|
||
// SubgroupsPerVirtualWorkgroup*ItemsPerInvocation_1, plus SubgroupSize*ItemsPerInvocation_2 only when LevelCount==3 | |
// NOTE(review): presumably counted in scratch elements rather than bytes - confirm against the scratch accessor usage | |
NBL_CONSTEXPR_STATIC_INLINE uint16_t SharedMemSize = conditional_value<LevelCount==3,uint16_t,SubgroupSize*ItemsPerInvocation_2,0>::value + SubgroupsPerVirtualWorkgroup*ItemsPerInvocation_1; | |
}; | |
|
||
// special case when workgroup size 2048 and subgroup size 16 needs 3 levels and virtual workgroup size 4096 to get a full subgroup scan each on level 1 and 2 16x16x16=4096 | ||
// specializing with macros because of DXC bug: https://github.com/microsoft/DirectXShaderCompiler/issues/7007 | |
// Emits an explicit specialization Configuration<11, 4, ITEMS_PER_INVOC> (workgroup size 2048, subgroup size 16) | |
// with every member hard-coded instead of derived: 3 levels over a virtual workgroup of 4096 == 1 << 12 items. | |
// ITEMS_PER_INVOC becomes ItemsPerInvocation_0; levels 1 and 2 each use 1 item per invocation. | |
// VirtualWorkgroupSize must shift by the log2 (12), not by the size itself: shifting a 16-bit one by 4096 is out of range. | |
// WorkgroupSizeLog2 is provided so the specialization exposes the same member as the primary template. | |
// NOTE(review): unlike the primary template, no SharedMemSize member is defined here - confirm whether callers need it. | |
#define SPECIALIZE_CONFIG_CASE_2048_16(ITEMS_PER_INVOC) template<>\ | |
struct Configuration<11, 4, ITEMS_PER_INVOC>\ | |
{\ | |
NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSizeLog2 = uint16_t(11u);\ | |
NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << 11u;\ | |
NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = uint16_t(4u);\ | |
NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2;\ | |
NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupsPerVirtualWorkgroupLog2 = 7u;\ | |
NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupsPerVirtualWorkgroup = 128u;\ | |
NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = 3u;\ | |
NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualWorkgroupSize = uint16_t(0x1u) << 12u;\ | |
NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_0 = ITEMS_PER_INVOC;\ | |
NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_1 = 1u;\ | |
NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_2 = 1u;\ | |
};\ | |
|
||
// Stamp out the Configuration<11, 4, N> specialization for N = 1, 2 and 4 items per invocation on level 0. | |
SPECIALIZE_CONFIG_CASE_2048_16(1) | |
SPECIALIZE_CONFIG_CASE_2048_16(2) | |
SPECIALIZE_CONFIG_CASE_2048_16(4) | |
|
||
} | ||
} | ||
} | ||
|
||
#undef SPECIALIZE_CONFIG_CASE_2048_16 | ||
|
||
#endif |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.