diff --git a/examples_tests b/examples_tests index 825c73d5d8..e828dc49ef 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 825c73d5d8307efef2488f0b6ce82b69c32855ea +Subproject commit e828dc49ef0a223dcbb8b4af8d722974747f29ee diff --git a/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl b/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl new file mode 100644 index 0000000000..de5e5a3c35 --- /dev/null +++ b/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl @@ -0,0 +1,57 @@ +#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_WORKGROUP_ARITHMETIC_INCLUDED_ +#define _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_WORKGROUP_ARITHMETIC_INCLUDED_ + +#include "nbl/builtin/hlsl/concepts.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace workgroup2 +{ + +#define NBL_CONCEPT_NAME ArithmeticSharedMemoryAccessor +#define NBL_CONCEPT_TPLT_PRM_KINDS (typename) +#define NBL_CONCEPT_TPLT_PRM_NAMES (T) +#define NBL_CONCEPT_PARAM_0 (accessor, T) +#define NBL_CONCEPT_PARAM_1 (index, uint32_t) +#define NBL_CONCEPT_PARAM_2 (val, uint32_t) +NBL_CONCEPT_BEGIN(3) +#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 +#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 +#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 +NBL_CONCEPT_END( + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set(index, val)), is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.workgroupExecutionAndMemoryBarrier()), is_same_v, void)) +); +#undef val +#undef index +#undef accessor +#include + +#define NBL_CONCEPT_NAME ArithmeticDataAccessor +#define NBL_CONCEPT_TPLT_PRM_KINDS (typename) +#define NBL_CONCEPT_TPLT_PRM_NAMES (T) +#define NBL_CONCEPT_PARAM_0 (accessor, T) +#define NBL_CONCEPT_PARAM_1 (index, uint32_t) +#define NBL_CONCEPT_PARAM_2 (val, uint32_t) +NBL_CONCEPT_BEGIN(3) +#define accessor NBL_CONCEPT_PARAM_T 
NBL_CONCEPT_PARAM_0 +#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 +#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 +NBL_CONCEPT_END( + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set(index, val)), is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.workgroupExecutionAndMemoryBarrier()), is_same_v, void)) +); +#undef val +#undef index +#undef accessor +#include + +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl b/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl index 724887b995..52ae6de2d9 100644 --- a/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl +++ b/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl @@ -11,6 +11,20 @@ namespace hlsl namespace subgroup2 { +template +uint32_t LastSubgroupInvocation() +{ + if (AssumeAllActive) + return glsl::gl_SubgroupSize()-1; + else + return glsl::subgroupBallotFindMSB(glsl::subgroupBallot(true)); +} + +bool ElectLast() +{ + return glsl::gl_SubgroupInvocationID()==LastSubgroupInvocation(); +} + template struct Configuration { diff --git a/include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl b/include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl index 9aefc3b3d8..652cabd7c7 100644 --- a/include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl +++ b/include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl @@ -28,6 +28,7 @@ struct vector_traits >\ NBL_CONSTEXPR_STATIC_INLINE bool IsVector = true;\ };\ +DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(1) DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(2) DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(3) DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(4) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl new file mode 100644 index 0000000000..d0a26cdf94 --- /dev/null +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl @@ -0,0 +1,59 @@ +// Copyright (C) 2025 - DevSH 
Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_INCLUDED_ +#define _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_INCLUDED_ + + +#include "nbl/builtin/hlsl/functional.hlsl" +#include "nbl/builtin/hlsl/workgroup/ballot.hlsl" +#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" +#include "nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl" +#include "nbl/builtin/hlsl/workgroup2/shared_scan.hlsl" + + +namespace nbl +{ +namespace hlsl +{ +namespace workgroup2 +{ + +template +struct reduction +{ + template && ArithmeticSharedMemoryAccessor) + static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + impl::reduce fn; + fn.template __call(dataAccessor, scratchAccessor); + } +}; + +template +struct inclusive_scan +{ + template && ArithmeticSharedMemoryAccessor) + static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + impl::scan fn; + fn.template __call(dataAccessor, scratchAccessor); + } +}; + +template +struct exclusive_scan +{ + template && ArithmeticSharedMemoryAccessor) + static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + impl::scan fn; + fn.template __call(dataAccessor, scratchAccessor); + } +}; + +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl new file mode 100644 index 0000000000..88ff328e05 --- /dev/null +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -0,0 +1,94 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_CONFIG_INCLUDED_ +#define _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_CONFIG_INCLUDED_ + +#include "nbl/builtin/hlsl/cpp_compat.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace workgroup2 +{ + +namespace impl +{ +template +struct virtual_wg_size_log2 +{ + static_assert(WorkgroupSizeLog2>=SubgroupSizeLog2, "WorkgroupSize cannot be smaller than SubgroupSize"); + static_assert(WorkgroupSizeLog2<=SubgroupSizeLog2+4, "WorkgroupSize cannot be larger than SubgroupSize*16"); + NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2),uint16_t,conditional_value<(WorkgroupSizeLog2>SubgroupSizeLog2*2+2),uint16_t,3,2>::value,1>::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t value = mpl::max_v+SubgroupSizeLog2; +}; + +template +struct items_per_invocation +{ + NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocationProductLog2 = mpl::max_v; + NBL_CONSTEXPR_STATIC_INLINE uint16_t value0 = BaseItemsPerInvocation; + NBL_CONSTEXPR_STATIC_INLINE uint16_t value1 = uint16_t(0x1u) << conditional_value, ItemsPerInvocationProductLog2>::value; + NBL_CONSTEXPR_STATIC_INLINE uint16_t value2 = uint16_t(0x1u) << mpl::max_v; +}; + +// explicit specializations for cases that don't fit +#define SPECIALIZE_VIRTUAL_WG_SIZE_CASE(WGLOG2, SGLOG2, LEVELS, VALUE) template<>\ +struct virtual_wg_size_log2\ +{\ + NBL_CONSTEXPR_STATIC_INLINE uint16_t levels = LEVELS;\ + NBL_CONSTEXPR_STATIC_INLINE uint16_t value = VALUE;\ +};\ + +SPECIALIZE_VIRTUAL_WG_SIZE_CASE(11,4,3,12); +SPECIALIZE_VIRTUAL_WG_SIZE_CASE(7,7,1,7); +SPECIALIZE_VIRTUAL_WG_SIZE_CASE(6,6,1,6); +SPECIALIZE_VIRTUAL_WG_SIZE_CASE(5,5,1,5); +SPECIALIZE_VIRTUAL_WG_SIZE_CASE(4,4,1,4); +SPECIALIZE_VIRTUAL_WG_SIZE_CASE(3,3,1,3); +SPECIALIZE_VIRTUAL_WG_SIZE_CASE(2,2,1,2); + +#undef SPECIALIZE_VIRTUAL_WG_SIZE_CASE +} + +template +struct ArithmeticConfiguration +{ + 
NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSizeLog2 = _WorkgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << WorkgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = _SubgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2; + + // must have at least enough level 0 outputs to feed a single subgroup + NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupsPerVirtualWorkgroupLog2 = mpl::max_v; + NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupsPerVirtualWorkgroup = uint16_t(0x1u) << SubgroupsPerVirtualWorkgroupLog2; + + using virtual_wg_t = impl::virtual_wg_size_log2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t LevelCount = virtual_wg_t::levels; + NBL_CONSTEXPR_STATIC_INLINE uint16_t VirtualWorkgroupSize = uint16_t(0x1u) << virtual_wg_t::value; + using items_per_invoc_t = impl::items_per_invocation; + // NBL_CONSTEXPR_STATIC_INLINE uint32_t2 ItemsPerInvocation; TODO? doesn't allow inline definitions for uint32_t2 for some reason, uint32_t[2] as well ; declaring out of line results in not constant expression + NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_0 = items_per_invoc_t::value0; + NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_1 = items_per_invoc_t::value1; + NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation_2 = items_per_invoc_t::value2; + static_assert(ItemsPerInvocation_1<=4, "3 level scan would have been needed with this config!"); + + NBL_CONSTEXPR_STATIC_INLINE uint16_t ElementCount = conditional_value::value + SubgroupSize*ItemsPerInvocation_1>::value; +}; + +template +struct is_configuration : bool_constant {}; + +template +struct is_configuration > : bool_constant {}; + +template +NBL_CONSTEXPR bool is_configuration_v = is_configuration::value; + + +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl new file mode 100644 index 0000000000..d53bfd6000 
--- /dev/null +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -0,0 +1,398 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_WORKGROUP2_SHARED_SCAN_INCLUDED_ +#define _NBL_BUILTIN_HLSL_WORKGROUP2_SHARED_SCAN_INCLUDED_ + +#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" +#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" +#include "nbl/builtin/hlsl/subgroup2/ballot.hlsl" +#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" +#include "nbl/builtin/hlsl/mpl.hlsl" +#include "nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace workgroup2 +{ + +namespace impl +{ + +template +struct reduce; + +template +struct scan; + +// 1-level scans +template +struct reduce +{ + using scalar_t = typename BinOp::type_t; + using vector_t = vector; // data accessor needs to be this type + // doesn't use scratch smem, need as param? + + template + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_t = subgroup2::ArithmeticParams; + + subgroup2::reduction reduction; + vector_t value; + dataAccessor.template get(workgroup::SubgroupContiguousIndex(), value); + value = reduction(value); + dataAccessor.template set(workgroup::SubgroupContiguousIndex(), value); + } +}; + +template +struct scan +{ + using scalar_t = typename BinOp::type_t; + using vector_t = vector; // data accessor needs to be this type + // doesn't use scratch smem, need as param? 
+ + template + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_t = subgroup2::ArithmeticParams; + + vector_t value; + dataAccessor.template get(workgroup::SubgroupContiguousIndex(), value); + if (Exclusive) + { + subgroup2::exclusive_scan excl_scan; + value = excl_scan(value); + } + else + { + subgroup2::inclusive_scan incl_scan; + value = incl_scan(value); + } + dataAccessor.template set(workgroup::SubgroupContiguousIndex(), value); // can be safely merged with above lines? + } +}; + +// 2-level scans +template +struct reduce +{ + using scalar_t = typename BinOp::type_t; + using vector_lv0_t = vector; // data accessor needs to be this type + using vector_lv1_t = vector; + + template + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_lv0_t = subgroup2::ArithmeticParams; + using params_lv1_t = subgroup2::ArithmeticParams; + BinOp binop; + + const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); + // level 0 scan + subgroup2::reduction reduction0; + [unroll] + for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + vector_lv0_t scan_local; + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); + scan_local = reduction0(scan_local); + if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) + { + const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); + const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); + scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup 
scan (reduction) to level 1 scan + } + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // level 1 scan + subgroup2::reduction reduction1; + if (glsl::gl_SubgroupID() == 0) + { + vector_lv1_t lv1_val; + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); + lv1_val = reduction1(lv1_val); + scratchAccessor.template set(invocationIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // set as last element in scan (reduction) + [unroll] + for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + scalar_t reduce_val; + scratchAccessor.template get(glsl::gl_SubgroupInvocationID(),reduce_val); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, hlsl::promote(reduce_val)); + } + } +}; + +template +struct scan +{ + using scalar_t = typename BinOp::type_t; + using vector_lv0_t = vector; // data accessor needs to be this type + using vector_lv1_t = vector; + + template + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_lv0_t = subgroup2::ArithmeticParams; + using params_lv1_t = subgroup2::ArithmeticParams; + BinOp binop; + + vector_lv0_t scan_local[Config::VirtualWorkgroupSize / Config::WorkgroupSize]; + const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); + subgroup2::inclusive_scan inclusiveScan0; + // level 0 scan + [unroll] + for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + scan_local[idx] = inclusiveScan0(scan_local[idx]); + if 
(glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) + { + const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); + const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); + scratchAccessor.template set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + } + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // level 1 scan + subgroup2::inclusive_scan inclusiveScan1; + if (glsl::gl_SubgroupID() == 0) + { + vector_lv1_t lv1_val; + const uint32_t prevIndex = invocationIndex-1; + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template get(i*Config::SubgroupsPerVirtualWorkgroup+prevIndex,lv1_val[i]); + vector_lv1_t shiftedInput = hlsl::mix(hlsl::promote(BinOp::identity), lv1_val, bool(invocationIndex)); + shiftedInput = inclusiveScan1(shiftedInput); + scratchAccessor.template set(invocationIndex, shiftedInput[Config::ItemsPerInvocation_1-1]); + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // combine with level 0 + [unroll] + for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); + scalar_t left; + scratchAccessor.template get(virtualSubgroupID,left); + if (Exclusive) + { + scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(scan_local[idx][Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) + scan_local[idx][Config::ItemsPerInvocation_0-i-1] = binop(left, hlsl::mix(scan_local[idx][Config::ItemsPerInvocation_0-i-2], 
left_last_elem, (Config::ItemsPerInvocation_0-i-1==0))); + } + else + { + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) + scan_local[idx][i] = binop(left, scan_local[idx][i]); + } + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + } + } +}; + +// 3-level scans +template +struct reduce +{ + using scalar_t = typename BinOp::type_t; + using vector_lv0_t = vector; // data accessor needs to be this type + using vector_lv1_t = vector; + using vector_lv2_t = vector; + + template + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_lv0_t = subgroup2::ArithmeticParams; + using params_lv1_t = subgroup2::ArithmeticParams; + using params_lv2_t = subgroup2::ArithmeticParams; + BinOp binop; + + const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); + // level 0 scan + subgroup2::reduction reduction0; + [unroll] + for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + vector_lv0_t scan_local; + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); + scan_local = reduction0(scan_local); + if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) + { + const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); + const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); + scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + } + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // level 1 scan + subgroup2::reduction reduction1; + if (glsl::gl_SubgroupID() < 
Config::SubgroupSizeLog2*Config::ItemsPerInvocation_1) + { + vector_lv1_t lv1_val; + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); + lv1_val = reduction1(lv1_val); + if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) + { + const uint32_t bankedIndex = (invocationIndex & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupsPerVirtualWorkgroup + (invocationIndex/Config::ItemsPerInvocation_2); + scratchAccessor.template set(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + } + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // level 2 scan + subgroup2::reduction reduction2; + if (glsl::gl_SubgroupID() == 0) + { + vector_lv2_t lv2_val; + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) + scratchAccessor.template get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv2_val[i]); + lv2_val = reduction2(lv2_val); + scratchAccessor.template set(invocationIndex, lv2_val[Config::ItemsPerInvocation_2-1]); + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // set as last element in scan (reduction) + [unroll] + for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + scalar_t reduce_val; + scratchAccessor.template get(glsl::gl_SubgroupInvocationID(),reduce_val); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, reduce_val); + } + } +}; + +template +struct scan +{ + using scalar_t = typename BinOp::type_t; + using vector_lv0_t = vector; // data accessor needs to be this type + using vector_lv1_t = vector; + using vector_lv2_t = vector; + + template + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_lv0_t = subgroup2::ArithmeticParams; + using params_lv1_t = 
subgroup2::ArithmeticParams; + using params_lv2_t = subgroup2::ArithmeticParams; + BinOp binop; + + vector_lv0_t scan_local[Config::VirtualWorkgroupSize / Config::WorkgroupSize]; + const uint32_t invocationIndex = workgroup::SubgroupContiguousIndex(); + subgroup2::inclusive_scan inclusiveScan0; + // level 0 scan + [unroll] + for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + scan_local[idx] = inclusiveScan0(scan_local[idx]); + if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) + { + const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); + const uint32_t bankedIndex = (virtualSubgroupID & (Config::ItemsPerInvocation_1-1)) * Config::SubgroupsPerVirtualWorkgroup + (virtualSubgroupID/Config::ItemsPerInvocation_1); + scratchAccessor.template set(bankedIndex, scan_local[idx][Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + } + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // level 1 scan + const uint32_t lv1_smem_size = Config::SubgroupsPerVirtualWorkgroup*Config::ItemsPerInvocation_1; + subgroup2::inclusive_scan inclusiveScan1; + if (glsl::gl_SubgroupID() < lv1_smem_size) + { + vector_lv1_t lv1_val; + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template get(i*Config::SubgroupsPerVirtualWorkgroup+invocationIndex,lv1_val[i]); + lv1_val = inclusiveScan1(lv1_val); + if (glsl::gl_SubgroupInvocationID()==Config::SubgroupSize-1) + { + const uint32_t bankedIndex = (glsl::gl_SubgroupID() & (Config::ItemsPerInvocation_2-1)) * Config::SubgroupSize + (glsl::gl_SubgroupID()/Config::ItemsPerInvocation_2); + scratchAccessor.template set(lv1_smem_size+bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + } + } + 
scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // level 2 scan + subgroup2::inclusive_scan inclusiveScan2; + if (glsl::gl_SubgroupID() == 0) + { + vector_lv2_t lv2_val; + const uint32_t prevIndex = invocationIndex-1; + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_2; i++) + scratchAccessor.template get(lv1_smem_size+i*Config::SubgroupSize+prevIndex,lv2_val[i]); + vector_lv2_t shiftedInput = hlsl::mix(hlsl::promote(BinOp::identity), lv2_val, bool(invocationIndex)); + shiftedInput = inclusiveScan2(shiftedInput); + + // combine with level 1, only last element of each + [unroll] + for (uint32_t i = 0; i < Config::SubgroupsPerVirtualWorkgroup; i++) + { + scalar_t last_val; + scratchAccessor.template get((Config::ItemsPerInvocation_1-1)*Config::SubgroupsPerVirtualWorkgroup+(Config::SubgroupsPerVirtualWorkgroup-1-i),last_val); + scalar_t val = hlsl::mix(hlsl::promote(BinOp::identity), lv2_val, bool(i)); + val = binop(last_val, shiftedInput[Config::ItemsPerInvocation_2-1]); + scratchAccessor.template set((Config::ItemsPerInvocation_1-1)*Config::SubgroupsPerVirtualWorkgroup+(Config::SubgroupsPerVirtualWorkgroup-1-i), val); + } + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // combine with level 0 + [unroll] + for (uint32_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + const uint32_t virtualSubgroupID = idx * (Config::WorkgroupSize >> Config::SubgroupSizeLog2) + glsl::gl_SubgroupID(); + scalar_t left; + scratchAccessor.template get(virtualSubgroupID, left); + if (Exclusive) + { + scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(scan_local[idx][Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) + scan_local[idx][Config::ItemsPerInvocation_0-i-1] = binop(left, hlsl::mix(scan_local[idx][Config::ItemsPerInvocation_0-i-2], 
left_last_elem, (Config::ItemsPerInvocation_0-i-1==0))); + } + else + { + [unroll] + for (uint32_t i = 0; i < Config::ItemsPerInvocation_0; i++) + scan_local[idx][i] = binop(left, scan_local[idx][i]); + } + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local[idx]); + } + } +}; + +} + +} +} +} + +#endif diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index 9333a0d3b4..a6405a3c99 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -330,6 +330,10 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup/basic.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup/arithmetic_portability.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup/arithmetic_portability_impl.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup/fft.hlsl") +#subgroup2 +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup2/ballot.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup2/arithmetic_portability.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup2/arithmetic_portability_impl.hlsl") #shared header between C++ and HLSL LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/surface_transform.h") #workgroup @@ -341,6 +345,10 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/fft.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/scratch_size.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/shared_scan.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/shuffle.hlsl") +#workgroup2 +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/arithmetic_config.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/arithmetic.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/shared_scan.hlsl") #Extensions LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ext/FullScreenTriangle/SVertexAttributes.hlsl") 
LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ext/FullScreenTriangle/default.vert.hlsl") @@ -362,6 +370,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/loadable_i LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/mip_mapped.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/storable_image.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/fft.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/workgroup_arithmetic.hlsl") #tgmath LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tgmath.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tgmath/impl.hlsl")